From 08b2987787615845ea0e721546295b744463c3b1 Mon Sep 17 00:00:00 2001 From: Per Held Date: Fri, 24 Apr 2026 14:31:27 +0200 Subject: [PATCH 01/58] Arm backend: Generate random op test inputs lazily Several Arm operator tests were creating random inputs at module import time. The Arm test seed is applied later by an autouse pytest fixture, so those tensors were not actually controlled by ARM_TEST_SEED. That made tests nondeterministic across fresh pytest processes and could expose different quantization behavior from run to run. Generate the affected inputs lazily inside each test case so the existing seed fixture makes them reproducible and ARM_TEST_SEED=RANDOM can rerandomize the intended data. Signed-off-by: Per Held Change-Id: Ic4414da5e84b7fb19275e04399634289b10a0a19 --- backends/arm/test/ops/test_addmm.py | 62 ++++--- backends/arm/test/ops/test_atan.py | 30 ++-- backends/arm/test/ops/test_atanh.py | 29 ++-- backends/arm/test/ops/test_bitwise_not.py | 36 ++-- .../arm/test/ops/test_conv_constant_pad_nd.py | 42 +++-- backends/arm/test/ops/test_cos.py | 32 ++-- backends/arm/test/ops/test_cosh.py | 44 ++--- backends/arm/test/ops/test_detach_copy.py | 12 +- backends/arm/test/ops/test_erfinv.py | 36 ++-- backends/arm/test/ops/test_expm1.py | 30 ++-- backends/arm/test/ops/test_glu.py | 33 ++-- backends/arm/test/ops/test_group_norm.py | 35 ++-- .../arm/test/ops/test_linalg_vector_norm.py | 20 +-- backends/arm/test/ops/test_logit.py | 32 ++-- backends/arm/test/ops/test_sign.py | 30 ++-- backends/arm/test/ops/test_sin.py | 28 ++-- backends/arm/test/ops/test_sinh.py | 34 ++-- backends/arm/test/ops/test_tan.py | 42 ++--- .../arm/test/ops/test_upsample_bilinear2d.py | 155 ++++++++++++------ 19 files changed, 433 insertions(+), 329 deletions(-) diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py index 799b770e863..da44992ee28 100644 --- a/backends/arm/test/ops/test_addmm.py +++ b/backends/arm/test/ops/test_addmm.py @@ -28,73 +28,91 @@ test_data_suite = { - "basic": [ + "basic": lambda: [ torch.tensor([[1.0, 2.0], [3.0, 4.0]]), torch.tensor([[1.0, 0.0], [0.0, 1.0]]), torch.tensor([[1.0, 2.0], [3.0, 4.0]]), 1.0, 1.0, ], - "zeros": [torch.zeros(2, 2), torch.zeros(2, 3), torch.zeros(3, 2), 1.0, 1.0], - "beta_only": [ + "zeros": lambda: [ + torch.zeros(2, 2), + torch.zeros(2, 3), + torch.zeros(3, 2), + 1.0, + 1.0, + ], + "beta_only": lambda: [ torch.tensor([[10.0, 20.0], [30.0, 40.0]]), torch.randn(2, 3), torch.randn(3, 2), 0.0, 1.0, ], - "alpha_only": [ + "alpha_only": lambda: [ torch.tensor([[10.0, 20.0], [30.0, 40.0]]), torch.randn(2, 3), torch.randn(3, 2), 1.0, 0.0, ], - "scaled": [ + "scaled": lambda: [ torch.ones(2, 2), torch.tensor([[1.0, 2.0], [3.0, 4.0]]), torch.tensor([[5.0, 6.0], [7.0, 8.0]]), 0.5, 2.0, ], - "negative_scalars": [ + "negative_scalars": lambda: [ torch.tensor([[1.0, -1.0], [-1.0, 1.0]]), torch.tensor([[2.0, 0.0], [0.0, 2.0]]), torch.tensor([[1.0, 1.0], [1.0, 1.0]]), -1.0, -1.0, ], - "non_square": [torch.ones(3, 4), torch.rand(3, 2), torch.rand(2, 4), 1.0, 1.0], - "large_values": [ + "non_square": lambda: [ + torch.ones(3, 4), + torch.rand(3, 2), + torch.rand(2, 4), + 1.0, + 1.0, + ], + "large_values": lambda: [ torch.full((2, 2), 1e6), torch.full((2, 3), 1e3), torch.full((3, 2), 1e3), 1.0, 1.0, ], - "small_values": [ + "small_values": lambda: [ torch.full((2, 2), 1e-6), torch.full((2, 3), 1e-3), torch.full((3, 2), 1e-3), 1.0, 1.0, ], - "random": [torch.randn(4, 5), torch.randn(4, 3), torch.randn(3, 5), 1.0, 1.0], - "broadcast_bias_row": [ + 
"random": lambda: [ + torch.randn(4, 5), + torch.randn(4, 3), + torch.randn(3, 5), + 1.0, + 1.0, + ], + "broadcast_bias_row": lambda: [ torch.randn(1, 2), torch.randn(3, 4), torch.randn(4, 2), 1.0, 1.0, ], - "row_bias": [ + "row_bias": lambda: [ torch.randn(3, 1), torch.randn(3, 4), torch.randn(4, 4), 1.0, 1.0, ], - "scalar_bias": [ + "scalar_bias": lambda: [ torch.tensor(2.0), torch.randn(5, 3), torch.randn(3, 6), @@ -120,7 +138,7 @@ def forward( def test_addmm_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -131,7 +149,7 @@ def test_addmm_tosa_FP(test_data: Tuple): def test_addmm_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=exir_op, ) @@ -143,7 +161,7 @@ def test_addmm_tosa_INT(test_data: Tuple): def test_addmm_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -155,7 +173,7 @@ def test_addmm_u55_INT(test_data: Tuple): def test_addmm_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -167,7 +185,7 @@ def test_addmm_u85_INT(test_data: Tuple): def test_addmm_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=False, @@ -180,7 +198,7 @@ def test_addmm_vgf_no_quant(test_data: input_t1): def test_addmm_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=exir_op, quantize=True, @@ -197,7 +215,7 @@ def test_addmm_16a8w_tosa_INT(test_data: input_t1): pipeline = TosaPipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=[], per_channel_quantization=per_channel_quantization, @@ -223,7 +241,7 @@ def test_addmm_16a8w_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=[], per_channel_quantization=per_channel_quantization, @@ -245,7 +263,7 @@ def test_addmm_16a8w_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=[], per_channel_quantization=per_channel_quantization, diff --git a/backends/arm/test/ops/test_atan.py b/backends/arm/test/ops/test_atan.py index 4e103dcaa82..5ceae6fa189 100644 --- a/backends/arm/test/ops/test_atan.py +++ b/backends/arm/test/ops/test_atan.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -22,14 +22,14 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "zeros_alt_shape": torch.zeros(1, 10, 3, 5), - "ones": torch.ones(10, 10, 10), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(1, 10, 3, 5) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(1, 10, 3, 5), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(1, 10, 3, 5) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } @@ -43,7 +43,7 @@ def forward(self, x: torch.Tensor): def test_atan_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -54,7 +54,7 @@ def test_atan_tosa_FP(test_data: Tuple): def test_atan_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -66,7 +66,7 @@ def test_atan_tosa_INT(test_data: Tuple): def test_atan_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -78,7 +78,7 @@ def test_atan_u55_INT(test_data: Tuple): def test_atan_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -90,7 +90,7 @@ def test_atan_u85_INT(test_data: Tuple): def test_atan_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -103,7 +103,7 @@ def test_atan_vgf_no_quant(test_data: Tuple): def test_atan_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_atanh.py b/backends/arm/test/ops/test_atanh.py index 2eae5fcade2..2f621d8a02c 100644 --- a/backends/arm/test/ops/test_atanh.py +++ b/backends/arm/test/ops/test_atanh.py @@ -24,13 +24,13 @@ test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "zeros_alt_shape": torch.zeros(1, 10, 3, 5), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(1, 10, 3, 5) - 0.5, - "ramp": torch.arange(-1, 1, 0.2), - "near_bounds": torch.tensor([-0.99, -0.9, 0.9, 0.99]), - "on_bounds": torch.tensor([-1.0, 1.0]), + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(1, 10, 3, 5), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(1, 10, 3, 5) - 0.5, + "ramp": lambda: torch.arange(-1, 1, 0.2), + "near_bounds": lambda: torch.tensor([-0.99, -0.9, 0.9, 0.99]), + "on_bounds": lambda: torch.tensor([-1.0, 1.0]), } @@ -43,7 +43,7 @@ def forward(self, x: torch.Tensor): def test_atanh_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -52,13 +52,14 @@ def test_atanh_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_atanh_tosa_INT(test_data: Tuple): + input_data = test_data() pipeline = TosaPipelineINT[input_t1]( Atanh(), - (test_data,), + (input_data,), aten_op=aten_op, exir_op=exir_op, ) - if torch.any(test_data >= 1) or 
torch.any(test_data <= -1): + if torch.any(input_data >= 1) or torch.any(input_data <= -1): # The quantized model will saturate to max/min values while the # original model will return inf/-inf, so comparison wont be valid here. pipeline.pop_stage("run_method_and_compare_outputs.original_model") @@ -70,7 +71,7 @@ def test_atanh_tosa_INT(test_data: Tuple): def test_atanh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -82,7 +83,7 @@ def test_atanh_u55_INT(test_data: Tuple): def test_atanh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -94,7 +95,7 @@ def test_atanh_u85_INT(test_data: Tuple): def test_atanh_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=False, @@ -107,7 +108,7 @@ def test_atanh_vgf_no_quant(test_data: input_t1): def test_atanh_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_bitwise_not.py b/backends/arm/test/ops/test_bitwise_not.py index a4304476183..4f390fcca15 100644 --- a/backends/arm/test/ops/test_bitwise_not.py +++ b/backends/arm/test/ops/test_bitwise_not.py @@ -22,20 +22,20 @@ input_t1 = Tuple[torch.Tensor] test_data_suite_non_bool = { - "zeros": torch.zeros(1, 10, 10, 10, dtype=torch.int32), - "ones": torch.ones(10, 2, 3, dtype=torch.int8), - "pattern1_int8": 0xAA * torch.ones(1, 2, 2, 2, dtype=torch.int8), - "pattern1_int16": 0xAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int16), - "pattern1_int32": 0xAAAAAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int32), - "pattern2_int8": 0xCC * torch.ones(1, 2, 2, 2, dtype=torch.int8), - "pattern2_int16": 0xCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int16), - "pattern2_int32": 0xCCCCCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int32), - "rand_rank2": torch.randint(-128, 127, (10, 10), dtype=torch.int8), - "rand_rank4": torch.randint(-128, 127, (1, 10, 10, 10), dtype=torch.int8), + "zeros": lambda: torch.zeros(1, 10, 10, 10, dtype=torch.int32), + "ones": lambda: torch.ones(10, 2, 3, dtype=torch.int8), + "pattern1_int8": lambda: 0xAA * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern1_int16": lambda: 0xAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern1_int32": lambda: 0xAAAAAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "pattern2_int8": lambda: 0xCC * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern2_int16": lambda: 0xCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern2_int32": lambda: 0xCCCCCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "rand_rank2": lambda: torch.randint(-128, 127, (10, 10), dtype=torch.int8), + "rand_rank4": lambda: torch.randint(-128, 127, (1, 10, 10, 10), dtype=torch.int8), } test_data_suite_bool = { - "pattern_bool": torch.tensor([True, False, True], dtype=torch.bool), + "pattern_bool": lambda: torch.tensor([True, False, True], dtype=torch.bool), } test_data_suite = {**test_data_suite_non_bool, **test_data_suite_bool} @@ -52,7 +52,7 @@ def test_bitwise_not_tosa_FP(test_data: Tuple): # We don't delegate bitwise_not since it is not supported on the FP profile. 
pipeline = OpNotSupportedPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), {exir_op: 1}, quantize=False, ) @@ -63,7 +63,7 @@ def test_bitwise_not_tosa_FP(test_data: Tuple): def test_bitwise_not_tosa_FP_bool(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op, "executorch_exir_dialects_edge__ops_aten_logical_not_default", atol=0, @@ -77,7 +77,7 @@ def test_bitwise_not_tosa_FP_bool(test_data: Tuple): def test_bitwise_not_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -89,7 +89,7 @@ def test_bitwise_not_u55_INT(test_data: Tuple): # We don't delegate bitwise_not since it is not supported on U55. pipeline = OpNotSupportedPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), {exir_op: 1}, quantize=True, u55_subset=True, @@ -102,7 +102,7 @@ def test_bitwise_not_u55_INT(test_data: Tuple): def test_bitwise_not_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -114,7 +114,7 @@ def test_bitwise_not_u85_INT(test_data: Tuple): def test_bitwise_not_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -127,7 +127,7 @@ def test_bitwise_not_vgf_no_quant(test_data: Tuple): def test_bitwise_not_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_conv_constant_pad_nd.py b/backends/arm/test/ops/test_conv_constant_pad_nd.py index d26a1f2d90d..b732ffaf972 100644 --- a/backends/arm/test/ops/test_conv_constant_pad_nd.py +++ b/backends/arm/test/ops/test_conv_constant_pad_nd.py @@ -25,15 +25,31 @@ input_t1 = Tuple[torch.Tensor] # Input x test_data_suite = { - "4dim_last1dim": (torch.rand(1, 1, 16, 16), (1, 1, 0, 0, 0, 0, 0, 0), 1), - "4dim_last2dim": (torch.rand(1, 1, 16, 16), (1, 0, 1, 0, 0, 0, 0, 0), 2), - "4dim_last3dim": (torch.rand(1, 1, 16, 16), (1, 1, 0, 2, 0, 2, 0, 0), 3), - "4dim_last4dim": (torch.rand(1, 1, 16, 16), (1, 0, 1, 1, 0, 2, 0, 2), 4), - "3dim_last1dim": (torch.rand(1, 1, 16), (1, 1, 0, 0, 0, 0), 1), - "3dim_last2dim": (torch.rand(1, 1, 16), (1, 0, 1, 1, 0, 0), 2), - "3dim_last3dim": (torch.rand(1, 1, 16), (1, 0, 1, 0, 1, 1), 3), - "2dim_last1dim": (torch.rand(1, 1, 16), (1, 1, 0, 0), 1), - "2dim_last2dim": (torch.rand(1, 1, 16), (1, 0, 1, 1), 2), + "4dim_last1dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 1, 0, 0, 0, 0, 0, 0), + 1, + ), + "4dim_last2dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 0, 1, 0, 0, 0, 0, 0), + 2, + ), + "4dim_last3dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 1, 0, 2, 0, 2, 0, 0), + 3, + ), + "4dim_last4dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 0, 1, 1, 0, 2, 0, 2), + 4, + ), + "3dim_last1dim": lambda: (torch.rand(1, 1, 16), (1, 1, 0, 0, 0, 0), 1), + "3dim_last2dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 1, 0, 0), 2), + "3dim_last3dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 0, 1, 1), 3), + "2dim_last1dim": lambda: (torch.rand(1, 1, 16), (1, 1, 0, 0), 1), + "2dim_last2dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 1), 2), } """Tests conv + pad.""" @@ -91,7 +107,7 @@ def forward(self, x: torch.Tensor): @common.parametrize("test_data", test_data_suite) def test_constant_pad_nd_tosa_FP(test_data: Tuple): - test_data, 
padding, value = test_data + test_data, padding, value = test_data() pipeline = TosaPipelineFP[input_t1]( ConstantPadND(padding, value), (test_data,), @@ -103,7 +119,7 @@ def test_constant_pad_nd_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_constant_pad_nd_tosa_INT(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = TosaPipelineINT[input_t1]( ConstantPadND(padding, value), (test_data,), @@ -118,7 +134,7 @@ def test_constant_pad_nd_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter def test_constant_pad_nd_vgf_no_quant(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = VgfPipeline[input_t1]( ConstantPadND(padding, value), (test_data,), @@ -132,7 +148,7 @@ def test_constant_pad_nd_vgf_no_quant(test_data: Tuple): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter def test_constant_pad_nd_vgf_quant(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = VgfPipeline[input_t1]( ConstantPadND(padding, value), (test_data,), diff --git a/backends/arm/test/ops/test_cos.py b/backends/arm/test/ops/test_cos.py index da5d1470028..e020c3de971 100644 --- a/backends/arm/test/ops/test_cos.py +++ b/backends/arm/test/ops/test_cos.py @@ -23,21 +23,21 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10, 10), - "ones": torch.ones(10, 10, 10), - "rand": torch.rand(10, 10) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "zeros": lambda: torch.zeros(10, 10, 10, 10), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } test_data_suite_bf16 = { - "rand_bf16": torch.rand(4, 4, dtype=torch.bfloat16) - 0.5, - "ramp_bf16": torch.arange(-8, 8, 0.5, dtype=torch.bfloat16), + "rand_bf16": lambda: torch.rand(4, 4, dtype=torch.bfloat16) - 0.5, + "ramp_bf16": lambda: torch.arange(-8, 8, 0.5, dtype=torch.bfloat16), } test_data_suite_fp16 = { - "rand_fp16": torch.rand(4, 4, dtype=torch.float16) - 0.5, - "ramp_fp16": torch.arange(-8, 8, 0.5, dtype=torch.float16), + "rand_fp16": lambda: torch.rand(4, 4, dtype=torch.float16) - 0.5, + "ramp_fp16": lambda: torch.arange(-8, 8, 0.5, dtype=torch.float16), } @@ -54,7 +54,7 @@ def forward(self, x: torch.Tensor): def test_cos_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_op=[], tosa_extensions=["bf16"], @@ -67,7 +67,7 @@ def test_cos_tosa_FP(test_data: Tuple): def test_cos_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_op=[], ) @@ -79,7 +79,7 @@ def test_cos_tosa_INT(test_data: Tuple): def test_cos_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -91,7 +91,7 @@ def test_cos_u55_INT(test_data: Tuple): def test_cos_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -103,7 +103,7 @@ def test_cos_u85_INT(test_data: Tuple): def test_cos_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cos(), - 
(test_data,), + (test_data(),), aten_op, exir_op=[], quantize=False, @@ -116,7 +116,7 @@ def test_cos_vgf_no_quant(test_data: Tuple): def test_cos_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_op=[], quantize=True, diff --git a/backends/arm/test/ops/test_cosh.py b/backends/arm/test/ops/test_cosh.py index f07a87d5e2c..cc319b4087f 100644 --- a/backends/arm/test/ops/test_cosh.py +++ b/backends/arm/test/ops/test_cosh.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -21,21 +21,21 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10), - "zeros_4D": torch.zeros(1, 10, 32, 7), - "zeros_alt_shape": torch.zeros(10, 3, 5), - "ones": torch.ones(15, 10, 7), - "ones_4D": torch.ones(1, 3, 32, 16), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(10, 3, 5) - 0.5, - "rand_4D": torch.rand(1, 6, 5, 7) - 0.5, - "randn_pos": torch.randn(10) + 3, - "randn_neg": torch.randn(10) - 3, - "ramp": torch.arange(-16, 16, 0.2), - "large": 100 * torch.ones(1, 1), - "small": 0.000001 * torch.ones(1, 1), - "small_rand": torch.rand(100) * 0.01, - "biggest": torch.tensor([700.0, 710.0, 750.0]), + "zeros": lambda: torch.zeros(10, 10, 10), + "zeros_4D": lambda: torch.zeros(1, 10, 32, 7), + "zeros_alt_shape": lambda: torch.zeros(10, 3, 5), + "ones": lambda: torch.ones(15, 10, 7), + "ones_4D": lambda: torch.ones(1, 3, 32, 16), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(10, 3, 5) - 0.5, + "rand_4D": lambda: torch.rand(1, 6, 5, 7) - 0.5, + "randn_pos": lambda: torch.randn(10) + 3, + "randn_neg": lambda: torch.randn(10) - 3, + "ramp": lambda: torch.arange(-16, 16, 0.2), + "large": lambda: 100 * torch.ones(1, 1), + "small": lambda: 0.000001 * torch.ones(1, 1), + "small_rand": lambda: torch.rand(100) * 0.01, + "biggest": lambda: torch.tensor([700.0, 710.0, 750.0]), } @@ -48,7 +48,7 @@ def forward(self, x: torch.Tensor): def test_cosh_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Cosh(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -58,7 +58,7 @@ def test_cosh_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_cosh_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( - Cosh(), (test_data,), aten_op=aten_op, exir_op=exir_op + Cosh(), (test_data(),), aten_op=aten_op, exir_op=exir_op ) pipeline.run() @@ -67,7 +67,7 @@ def test_cosh_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_cosh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( - Cosh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Cosh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -80,7 +80,7 @@ def test_cosh_u55_INT(test_data: Tuple): ) def test_cosh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( - Cosh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Cosh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -90,7 +90,7 @@ def test_cosh_u85_INT(test_data: Tuple): def test_cosh_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cosh(), - (test_data,), + (test_data(),), [], [], quantize=False, @@ -103,7 +103,7 @@ def test_cosh_vgf_no_quant(test_data: Tuple): def 
test_cosh_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cosh(), - (test_data,), + (test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_detach_copy.py b/backends/arm/test/ops/test_detach_copy.py index c8715ca847a..9fba6c44ca6 100644 --- a/backends/arm/test/ops/test_detach_copy.py +++ b/backends/arm/test/ops/test_detach_copy.py @@ -19,10 +19,10 @@ exir_op = "executorch_exir_dialects_edge__ops_aten__detach_copy_default" test_data_suite = { - "zeros_2d": torch.zeros(3, 5), - "ones_3d": torch.ones(2, 3, 4), - "rand_2d": torch.rand(10, 10) - 0.5, - "ramp_1d": torch.arange(-8.0, 8.0, 0.5), + "zeros_2d": lambda: torch.zeros(3, 5), + "ones_3d": lambda: torch.ones(2, 3, 4), + "rand_2d": lambda: torch.rand(10, 10) - 0.5, + "ramp_1d": lambda: torch.arange(-8.0, 8.0, 0.5), } @@ -38,7 +38,7 @@ def forward(self, x: torch.Tensor): def test_detach_tosa_FP(test_data: torch.Tensor): pipeline = TosaPipelineFP[input_t1]( DetachCopy(), - (test_data,), + (test_data(),), aten_op=DetachCopy.aten_op, exir_op=DetachCopy.exir_op, ) @@ -49,7 +49,7 @@ def test_detach_tosa_FP(test_data: torch.Tensor): def test_detach_tosa_INT(test_data: torch.Tensor): pipeline = TosaPipelineINT[input_t1]( DetachCopy(), - (test_data,), + (test_data(),), aten_op=DetachCopy.aten_op, exir_op=DetachCopy.exir_op, ) diff --git a/backends/arm/test/ops/test_erfinv.py b/backends/arm/test/ops/test_erfinv.py index 204a4c50455..efb6efb0028 100644 --- a/backends/arm/test/ops/test_erfinv.py +++ b/backends/arm/test/ops/test_erfinv.py @@ -22,26 +22,26 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "small": torch.randn(100) * 0.01, - "mid": torch.rand(10, 10) * 1.8 - 0.9, - "near_pos_bound": torch.full((32,), 0.99), - "near_neg_bound": torch.full((32,), -0.99), - "pos_one": torch.full((32,), 1.0), - "neg_one": torch.full((32,), -1.0), - "ramp": torch.arange(-0.99, 0.99, 0.02), + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "small": lambda: torch.randn(100) * 0.01, + "mid": lambda: torch.rand(10, 10) * 1.8 - 0.9, + "near_pos_bound": lambda: torch.full((32,), 0.99), + "near_neg_bound": lambda: torch.full((32,), -0.99), + "pos_one": lambda: torch.full((32,), 1.0), + "neg_one": lambda: torch.full((32,), -1.0), + "ramp": lambda: torch.arange(-0.99, 0.99, 0.02), } test_data_nan_outputs = { - "pos_two": torch.full((32,), 2.0), - "neg_two": torch.full((32,), -2.0), + "pos_two": lambda: torch.full((32,), 2.0), + "neg_two": lambda: torch.full((32,), -2.0), } test_data_fp16 = { - "rand_fp16": (torch.rand(8, 8, dtype=torch.float16) * 1.8 - 0.9), - "ramp_fp16": torch.arange(-0.9, 0.9, 0.1, dtype=torch.float16), + "rand_fp16": lambda: (torch.rand(8, 8, dtype=torch.float16) * 1.8 - 0.9), + "ramp_fp16": lambda: torch.arange(-0.9, 0.9, 0.1, dtype=torch.float16), } @@ -56,7 +56,7 @@ def forward(self, x: torch.Tensor): def test_erfinv_tosa_FP(test_data: torch.Tensor): pipeline = TosaPipelineFP[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -65,7 +65,7 @@ def test_erfinv_tosa_FP(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) def test_erfinv_tosa_INT(test_data: torch.Tensor): - pipeline = TosaPipelineINT[input_t1](Erfinv(), (test_data,), aten_op, exir_op) + pipeline = TosaPipelineINT[input_t1](Erfinv(), (test_data(),), aten_op, exir_op) pipeline.run() @@ -74,7 +74,7 @@ def test_erfinv_tosa_INT(test_data: torch.Tensor): def test_erfinv_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( Erfinv(), - 
(test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -86,7 +86,7 @@ def test_erfinv_u55_INT(test_data: torch.Tensor): def test_erfinv_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -100,7 +100,7 @@ def test_erfinv_u85_INT(test_data: torch.Tensor): def test_erfinv_vgf_no_quant(test_data: torch.Tensor): pipeline = VgfPipeline[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -113,7 +113,7 @@ def test_erfinv_vgf_no_quant(test_data: torch.Tensor): def test_erfinv_vgf_quant(test_data: torch.Tensor): pipeline = VgfPipeline[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_expm1.py b/backends/arm/test/ops/test_expm1.py index 7556d1e45a8..2fb4f11d7ef 100644 --- a/backends/arm/test/ops/test_expm1.py +++ b/backends/arm/test/ops/test_expm1.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -22,16 +22,16 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeroes": torch.zeros(1, 10, 10, 10), - "ones": torch.ones(10, 2, 3), - "rand": torch.rand(10, 10) - 0.5, - "near_zero": torch.randn(100) * 0.01, - "taylor_small": torch.empty(5).uniform_( + "zeroes": lambda: torch.zeros(1, 10, 10, 10), + "ones": lambda: torch.ones(10, 2, 3), + "rand": lambda: torch.rand(10, 10) - 0.5, + "near_zero": lambda: torch.randn(100) * 0.01, + "taylor_small": lambda: torch.empty(5).uniform_( -0.35, 0.35 ), # test cases for taylor series expansion - "randn_large_pos": torch.randn(10) + 10, - "randn_large_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "randn_large_pos": lambda: torch.randn(10) + 10, + "randn_large_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } @@ -45,7 +45,7 @@ def forward(self, x: torch.Tensor): def test_expm1_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -56,7 +56,7 @@ def test_expm1_tosa_FP(test_data: Tuple): def test_expm1_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -68,7 +68,7 @@ def test_expm1_tosa_INT(test_data: Tuple): def test_expm1_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -80,7 +80,7 @@ def test_expm1_u55_INT(test_data: Tuple): def test_expm1_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -92,7 +92,7 @@ def test_expm1_u85_INT(test_data: Tuple): def test_expm1_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -105,7 +105,7 @@ def test_expm1_vgf_no_quant(test_data: Tuple): def test_expm1_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_glu.py b/backends/arm/test/ops/test_glu.py index 0ad4e35d4ca..d8f3ed89e5c 100644 --- 
a/backends/arm/test/ops/test_glu.py +++ b/backends/arm/test/ops/test_glu.py @@ -23,14 +23,14 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": [torch.zeros(10, 10, 2), -1], - "ones": [torch.ones(10, 10, 2), -1], - "rand": [torch.rand(10, 10, 2) - 0.5, -1], - "randn_pos": [torch.randn(10, 2) + 10, -1], - "randn_neg": [torch.randn(10, 2) - 10, -1], - "ramp": [torch.linspace(-16, 15.8, 160).reshape(-1, 2), -1], - "zeros_custom_dim": [torch.zeros(7, 10, 5), 1], - "rand_custom_dim": [torch.rand(10, 3, 3) - 0.5, 0], + "zeros": lambda: [torch.zeros(10, 10, 2), -1], + "ones": lambda: [torch.ones(10, 10, 2), -1], + "rand": lambda: [torch.rand(10, 10, 2) - 0.5, -1], + "randn_pos": lambda: [torch.randn(10, 2) + 10, -1], + "randn_neg": lambda: [torch.randn(10, 2) - 10, -1], + "ramp": lambda: [torch.linspace(-16, 15.8, 160).reshape(-1, 2), -1], + "zeros_custom_dim": lambda: [torch.zeros(7, 10, 5), 1], + "rand_custom_dim": lambda: [torch.rand(10, 3, 3) - 0.5, 0], } @@ -47,7 +47,7 @@ def forward(self, a: torch.Tensor, dim: int) -> torch.Tensor: def test_glu_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Glu(), - (*test_data,), + (*test_data(),), aten_op, exir_op, ) @@ -59,14 +59,15 @@ def test_glu_tosa_FP(test_data: Tuple): test_data_suite, ) def test_glu_tosa_INT(test_data: Tuple): + input_data = test_data() pipeline = TosaPipelineINT[input_t1]( Glu(), - (*test_data,), + (*input_data,), aten_op=[], exir_op=exir_op, # These tests don't make sense when output is ~= 0 - frobenius_threshold=1.0 if (test_data[0].max() < 5) else 0.1, - cosine_threshold=0.0 if (test_data[0].max() < 5) else 0.9, + frobenius_threshold=1.0 if (input_data[0].max() < 5) else 0.1, + cosine_threshold=0.0 if (input_data[0].max() < 5) else 0.9, ) pipeline.run() @@ -79,7 +80,7 @@ def test_glu_tosa_INT(test_data: Tuple): def test_glu_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Glu(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -94,7 +95,7 @@ def test_glu_u55_INT(test_data: Tuple): def test_glu_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Glu(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -109,7 +110,7 @@ def test_glu_u85_INT(test_data: Tuple): def test_glu_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Glu(), - (*test_data,), + (*test_data(),), [], [], quantize=False, @@ -125,7 +126,7 @@ def test_glu_vgf_no_quant(test_data: input_t1): def test_glu_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Glu(), - (*test_data,), + (*test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_group_norm.py b/backends/arm/test/ops/test_group_norm.py index 32e6babf101..f31698093e9 100644 --- a/backends/arm/test/ops/test_group_norm.py +++ b/backends/arm/test/ops/test_group_norm.py @@ -40,29 +40,39 @@ def forward( input_t = tuple[torch.Tensor] test_data_suite = { - "rand_4_6_groups_1": ((torch.rand(4, 6),), GroupNorm(1, 6)), - "rand_4_6_groups_2": ((torch.rand(4, 6),), GroupNorm(2, 6)), - "rand_4_6_groups_6": ((torch.rand(4, 6),), GroupNorm(6, 6)), - "rand_4_6_8_groups_2_eps_no_affine": ( + "rand_4_6_groups_1": lambda: ((torch.rand(4, 6),), GroupNorm(1, 6)), + "rand_4_6_groups_2": lambda: ((torch.rand(4, 6),), GroupNorm(2, 6)), + "rand_4_6_groups_6": lambda: ((torch.rand(4, 6),), GroupNorm(6, 6)), + "rand_4_6_8_groups_2_eps_no_affine": lambda: ( (torch.rand(4, 6, 8),), GroupNorm(2, 6, eps=1e-3, affine=False), ), - "randn_1_12_8_6_groups_6_eps": ( + 
"randn_1_12_8_6_groups_6_eps": lambda: ( (torch.randn(1, 12, 8, 6),), GroupNorm(6, 12, eps=1e-2), ), - "randn_1_12_8_6_groups_12": ((torch.randn(1, 12, 8, 6),), GroupNorm(12, 12)), - "rand_6_8_10_12_groups_1": ((torch.rand(6, 8, 10, 12),), GroupNorm(1, 8)), - "rand_6_8_10_12_groups_4_no_affine": ( + "randn_1_12_8_6_groups_12": lambda: ( + (torch.randn(1, 12, 8, 6),), + GroupNorm(12, 12), + ), + "rand_6_8_10_12_groups_1": lambda: ( + (torch.rand(6, 8, 10, 12),), + GroupNorm(1, 8), + ), + "rand_6_8_10_12_groups_4_no_affine": lambda: ( (torch.rand(6, 8, 10, 12),), GroupNorm(4, 8, affine=False), ), - "rand_6_8_10_12_groups_8": ((torch.rand(6, 8, 10, 12),), GroupNorm(8, 8)), + "rand_6_8_10_12_groups_8": lambda: ( + (torch.rand(6, 8, 10, 12),), + GroupNorm(8, 8), + ), } @common.parametrize("test_data", test_data_suite) def test_native_group_norm_tosa_FP(test_data): + test_data = test_data() aten_op = "torch.ops.aten.group_norm.default" exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" pipeline = TosaPipelineFP[input_t]( @@ -79,6 +89,7 @@ def test_native_group_norm_tosa_FP(test_data): test_data_suite, ) def test_native_group_norm_tosa_INT(test_data): + test_data = test_data() aten_op = "torch.ops.aten.sub.Tensor" # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" pipeline = TosaPipelineINT[input_t]( @@ -97,6 +108,7 @@ def test_native_group_norm_tosa_INT(test_data): ) @common.XfailIfNoCorstone300 def test_native_group_norm_u55_INT(test_data): + test_data = test_data() pipeline = EthosU55PipelineINT[input_t]( test_data[1], test_data[0], @@ -113,6 +125,7 @@ def test_native_group_norm_u55_INT(test_data): ) @common.XfailIfNoCorstone320 def test_native_group_norm_u85_INT(test_data): + test_data = test_data() pipeline = EthosU85PipelineINT[input_t]( test_data[1], test_data[0], @@ -131,7 +144,7 @@ def test_native_group_norm_u85_INT(test_data): def test_native_group_norm_vgf_no_quant(test_data): aten_op = "torch.ops.aten.group_norm.default" exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" - model, inp = test_data + model, inp = test_data() pipeline = VgfPipeline[input_t]( inp, model, @@ -150,7 +163,7 @@ def test_native_group_norm_vgf_no_quant(test_data): def test_native_group_norm_vgf_quant(test_data): aten_op = "torch.ops.aten.sub.Tensor" exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" - model, inp = test_data + model, inp = test_data() pipeline = VgfPipeline[input_t]( inp, model, diff --git a/backends/arm/test/ops/test_linalg_vector_norm.py b/backends/arm/test/ops/test_linalg_vector_norm.py index 1b2fc169fce..fa3290eb5d1 100644 --- a/backends/arm/test/ops/test_linalg_vector_norm.py +++ b/backends/arm/test/ops/test_linalg_vector_norm.py @@ -53,17 +53,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: test_modules = { - "default": (VectorNormModel(dim=1), (torch.rand(10, 4),)), - "ord1": (VectorNormModel(ord=1, dim=1), (torch.rand(10, 4),)), - "ord2": (VectorNormModel(ord=2, dim=1), (torch.rand(10, 20),)), + "default": lambda: (VectorNormModel(dim=1), (torch.rand(10, 4),)), + "ord1": lambda: (VectorNormModel(ord=1, dim=1), (torch.rand(10, 4),)), + "ord2": lambda: (VectorNormModel(ord=2, dim=1), (torch.rand(10, 20),)), # Norm computed along a specific dimension of a 3D tensor - "dim_3d": (VectorNormModel(dim=2), (torch.rand(4, 5, 6),)), + "dim_3d": lambda: (VectorNormModel(dim=2), (torch.rand(4, 5, 6),)), } 
@common.parametrize("test_module", test_modules) def test_vector_norm_tosa_FP(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # We decompose LinalgVectorNorm before quantize stage to have annotations # with q/dq nodes. In case of FP, this operator will be decomposed @@ -79,7 +79,7 @@ def test_vector_norm_tosa_FP(test_module): @common.parametrize("test_module", test_modules) def test_vector_norm_tosa_INT(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # Should not found this op exir_op = "executorch_exir_dialects_edge__ops_aten_linalg_vector_norm_default" @@ -97,7 +97,7 @@ def test_vector_norm_tosa_INT(test_module): @common.parametrize("test_module", test_modules) @common.XfailIfNoCorstone300 def test_vector_norm_u55_INT_fvp(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() pipeline = EthosU55PipelineINT[input_t]( model, @@ -113,7 +113,7 @@ def test_vector_norm_u55_INT_fvp(test_module): @common.parametrize("test_module", test_modules) @common.XfailIfNoCorstone320 def test_vector_norm_u85_INT_fvp(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # The should be decomposed and annotated in DecomposeLinalgVectorNorm pass. pipeline = EthosU85PipelineINT[input_t]( @@ -130,7 +130,7 @@ def test_vector_norm_u85_INT_fvp(test_module): @common.parametrize("test_module", test_modules) @common.SkipIfNoModelConverter def test_vector_norm_vgf_no_quant(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # FP VGF aten_op = "torch.ops.aten.linalg_vector_norm.default" exir_op = "executorch_exir_dialects_edge__ops_aten_linalg_vector_norm_default" @@ -147,7 +147,7 @@ def test_vector_norm_vgf_no_quant(test_module): @common.parametrize("test_module", test_modules) @common.SkipIfNoModelConverter def test_vector_norm_vgf_quant(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # Should not found this op exir_op = "executorch_exir_dialects_edge__ops_aten_linalg_vector_norm_default" diff --git a/backends/arm/test/ops/test_logit.py b/backends/arm/test/ops/test_logit.py index cf5b4f7f07e..b0ce9aad1f5 100644 --- a/backends/arm/test/ops/test_logit.py +++ b/backends/arm/test/ops/test_logit.py @@ -22,16 +22,16 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": [torch.zeros((10, 10, 10)), None], - "ones": [torch.ones((10, 10, 10)), None], - "uniform_valid": [torch.rand((10, 10, 10)), None], - "near_zero": [torch.full((10, 10), 1e-8), None], - "near_one": [torch.full((10, 10), 1 - 1e-8), None], - "mixed": [torch.tensor([0.0, 1e-5, 0.5, 1 - 1e-5, 1.0]), None], - "multi_dim": [torch.rand((2, 3, 4)), None], - "eps": [torch.zeros((10, 10, 10)), 1e-6], - "invalid_neg": [torch.full((5,), -0.1), 1e-6], - "invalid_gt1": [torch.full((5,), 1.1), 1e-6], + "zeros": lambda: (torch.zeros((10, 10, 10)), None), + "ones": lambda: (torch.ones((10, 10, 10)), None), + "uniform_valid": lambda: (torch.rand((10, 10, 10)), None), + "near_zero": lambda: (torch.full((10, 10), 1e-8), None), + "near_one": lambda: (torch.full((10, 10), 1 - 1e-8), None), + "mixed": lambda: (torch.tensor([0.0, 1e-5, 0.5, 1 - 1e-5, 1.0]), None), + "multi_dim": lambda: (torch.rand((2, 3, 4)), None), + "eps": lambda: (torch.zeros((10, 10, 10)), 1e-6), + "invalid_neg": lambda: (torch.full((5,), -0.1), 1e-6), + "invalid_gt1": lambda: (torch.full((5,), 1.1), 1e-6), } @@ -45,7 +45,7 @@ def forward(self, x: torch.Tensor, eps: torch.float32): 
def test_logit_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -56,7 +56,7 @@ def test_logit_tosa_FP(test_data: Tuple): def test_logit_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=exir_op, # Quantization issues when logit(x) -> inf @@ -71,7 +71,7 @@ def test_logit_tosa_INT(test_data: Tuple): def test_logit_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -83,7 +83,7 @@ def test_logit_u55_INT(test_data: Tuple): def test_logit_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -98,7 +98,7 @@ def test_logit_u85_INT(test_data: Tuple): def test_logit_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Logit(), - (*test_data,), + (*test_data(),), [], [], quantize=False, @@ -114,7 +114,7 @@ def test_logit_vgf_no_quant(test_data: input_t1): def test_logit_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Logit(), - (*test_data,), + (*test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_sign.py b/backends/arm/test/ops/test_sign.py index 62c1a0db63b..5e5a88011e9 100644 --- a/backends/arm/test/ops/test_sign.py +++ b/backends/arm/test/ops/test_sign.py @@ -22,17 +22,17 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": torch.zeros(3, 5), - "ones": torch.ones(4, 4), - "neg_ones": -torch.ones(4, 4), - "mixed_signs": torch.tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]]), - "positive_ramp": torch.arange(0.1, 1.1, 0.2), - "negative_ramp": torch.arange(-1.0, -0.1, 0.2), - "small_values": torch.tensor( + "zeros": lambda: torch.zeros(3, 5), + "ones": lambda: torch.ones(4, 4), + "neg_ones": lambda: -torch.ones(4, 4), + "mixed_signs": lambda: torch.tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]]), + "positive_ramp": lambda: torch.arange(0.1, 1.1, 0.2), + "negative_ramp": lambda: torch.arange(-1.0, -0.1, 0.2), + "small_values": lambda: torch.tensor( [-1e-3, 0.0, 1e-3] ), # Only values > observer's .eps are of interest. 
- "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(10, 3, 5) - 0.5, + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(10, 3, 5) - 0.5, } @@ -45,7 +45,7 @@ def forward(self, x: torch.Tensor): def test_sign_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -55,7 +55,7 @@ def test_sign_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_sign_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( - Sign(), (test_data,), aten_op=[], exir_op=exir_op, frobenius_threshold=None + Sign(), (test_data(),), aten_op=[], exir_op=exir_op, frobenius_threshold=None ) pipeline.run() @@ -66,7 +66,7 @@ def test_sign_tosa_INT(test_data: Tuple): def test_sign_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -78,7 +78,7 @@ def test_sign_u55_INT(test_data: Tuple): def test_sign_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -90,7 +90,7 @@ def test_sign_u85_INT(test_data: Tuple): def test_sign_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=False, @@ -103,7 +103,7 @@ def test_sign_vgf_no_quant(test_data: Tuple): def test_sign_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_op=[], exir_op=exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_sin.py b/backends/arm/test/ops/test_sin.py index 3073c44be85..f9b3c2585e6 100644 --- a/backends/arm/test/ops/test_sin.py +++ b/backends/arm/test/ops/test_sin.py @@ -22,20 +22,20 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10, 10), - "ones": torch.ones(10, 10, 10), - "rand": torch.rand(10, 10) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "zeros": lambda: torch.zeros(10, 10, 10, 10), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } test_data_suite_fp16 = { - "rand_fp16": torch.rand(10, 10, dtype=torch.float16), + "rand_fp16": lambda: torch.rand(10, 10, dtype=torch.float16), } test_data_suite_bf16 = { - "rand_bf16": torch.rand(3, 3, dtype=torch.bfloat16), + "rand_bf16": lambda: torch.rand(3, 3, dtype=torch.bfloat16), } @@ -51,7 +51,7 @@ def forward(self, x: torch.Tensor): def test_sin_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_op=[], tosa_extensions=["bf16"], @@ -63,7 +63,7 @@ def test_sin_tosa_FP(test_data: Tuple): def test_sin_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_op=[], ) @@ -75,7 +75,7 @@ def test_sin_tosa_INT(test_data: Tuple): def test_sin_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -87,7 +87,7 @@ def test_sin_u55_INT(test_data: Tuple): def test_sin_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -99,7 +99,7 
@@ def test_sin_u85_INT(test_data: Tuple): def test_sin_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, quantize=False, ) @@ -111,7 +111,7 @@ def test_sin_vgf_no_quant(test_data: Tuple): def test_sin_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, quantize=True, ) diff --git a/backends/arm/test/ops/test_sinh.py b/backends/arm/test/ops/test_sinh.py index 703d3e52011..911d9da077b 100644 --- a/backends/arm/test/ops/test_sinh.py +++ b/backends/arm/test/ops/test_sinh.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -23,16 +23,16 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10), - "zeros_alt_shape": torch.zeros(10, 3, 5), - "ones": torch.ones(10, 10, 10), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(10, 3, 5) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), - "large": 100 * torch.ones(1, 1), - "small": 0.000001 * torch.ones(1, 1), + "zeros": lambda: torch.zeros(10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(10, 3, 5), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(10, 3, 5) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), + "large": lambda: 100 * torch.ones(1, 1), + "small": lambda: 0.000001 * torch.ones(1, 1), } @@ -46,7 +46,7 @@ def forward(self, x: torch.Tensor): def test_sinh_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Sinh(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -56,7 +56,7 @@ def test_sinh_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_sinh_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( - Sinh(), (test_data,), aten_op=aten_op, exir_op=exir_op + Sinh(), (test_data(),), aten_op=aten_op, exir_op=exir_op ) pipeline.run() @@ -65,7 +65,7 @@ def test_sinh_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_sinh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( - Sinh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Sinh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -74,7 +74,7 @@ def test_sinh_u55_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_sinh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( - Sinh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Sinh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -84,7 +84,7 @@ def test_sinh_u85_INT(test_data: Tuple): def test_sinh_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sinh(), - (test_data,), + (test_data(),), aten_op, quantize=False, ) @@ -96,7 +96,7 @@ def test_sinh_vgf_no_quant(test_data: Tuple): def test_sinh_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sinh(), - (test_data,), + (test_data(),), aten_op, quantize=True, ) diff --git a/backends/arm/test/ops/test_tan.py b/backends/arm/test/ops/test_tan.py index 62b8d1cd09a..31bd2ca85f7 100644 --- a/backends/arm/test/ops/test_tan.py +++ 
b/backends/arm/test/ops/test_tan.py @@ -25,16 +25,16 @@ tiny32 = torch.finfo(torch.float32).tiny test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "zeros_alt_shape": torch.zeros(1, 10, 3, 5), - "ones": torch.ones(10, 15, 25), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(1, 10, 3, 5) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), - "pi_multiples": (torch.arange(-5, 6, dtype=torch.float32) * math.pi), - "common_angles": torch.tensor( + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(1, 10, 3, 5), + "ones": lambda: torch.ones(10, 15, 25), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(1, 10, 3, 5) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), + "pi_multiples": lambda: (torch.arange(-5, 6, dtype=torch.float32) * math.pi), + "common_angles": lambda: torch.tensor( [ -math.pi, -2 * math.pi / 3, @@ -52,7 +52,7 @@ ], dtype=torch.float32, ), - "near_asymptote_pos": torch.tensor( + "near_asymptote_pos": lambda: torch.tensor( [ math.pi / 2 - 1e-7, math.pi / 2 - 1e-6, @@ -63,12 +63,12 @@ ], dtype=torch.float32, ), - "high_rank": torch.randn(1, 3, 7, 4, 5), - "very_small": torch.tensor( + "high_rank": lambda: torch.randn(1, 3, 7, 4, 5), + "very_small": lambda: torch.tensor( [-tiny32, -eps32, -1e-10, 0.0, 1e-10, eps32, tiny32], dtype=torch.float32 ), - "large_values": torch.linspace(-1e6, 1e6, steps=257, dtype=torch.float32), - "undefined": torch.tensor([math.pi / 2, -math.pi / 2, 3 * math.pi / 2]), + "large_values": lambda: torch.linspace(-1e6, 1e6, steps=257, dtype=torch.float32), + "undefined": lambda: torch.tensor([math.pi / 2, -math.pi / 2, 3 * math.pi / 2]), } @@ -82,7 +82,7 @@ def forward(self, x: torch.Tensor): def test_tan_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t]( Tan(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -93,7 +93,7 @@ def test_tan_tosa_FP(test_data: Tuple): def test_tan_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t]( Tan(), - (test_data,), + (test_data(),), aten_op, exir_op, frobenius_threshold=None, @@ -107,7 +107,7 @@ def test_tan_tosa_INT(test_data: Tuple): def test_tan_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t]( Tan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -119,7 +119,7 @@ def test_tan_u55_INT(test_data: Tuple): def test_tan_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t]( Tan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -129,7 +129,7 @@ def test_tan_u85_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter def test_tan_vgf_no_quant(test_data: Tuple): - pipeline = VgfPipeline[input_t](Tan(), (test_data,), [], [], quantize=False) + pipeline = VgfPipeline[input_t](Tan(), (test_data(),), [], [], quantize=False) pipeline.run() @@ -138,7 +138,7 @@ def test_tan_vgf_no_quant(test_data: Tuple): def test_tan_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t]( Tan(), - (test_data,), + (test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_upsample_bilinear2d.py b/backends/arm/test/ops/test_upsample_bilinear2d.py index d06d1688ffe..f084e0ebe14 100644 --- a/backends/arm/test/ops/test_upsample_bilinear2d.py +++ b/backends/arm/test/ops/test_upsample_bilinear2d.py @@ -24,44 +24,89 
@@ test_data_suite_tosa = { # (test_name, test_data, size, scale_factor, compare_outputs) - "rand_double_scale": (torch.rand(2, 4, 8, 3), None, 2.0, True), - "rand_double_scale_one_dim": (torch.rand(2, 4, 8, 3), None, (1.0, 2.0), True), - "rand_double_size": (torch.rand(2, 4, 8, 3), (16, 6), None, True), - "rand_one_double_scale": (torch.rand(2, 4, 1, 1), None, 2.0, True), - "rand_one_double_size": (torch.rand(2, 4, 1, 1), (2, 2), None, True), - "rand_one_same_scale": (torch.rand(2, 4, 1, 1), None, 1.0, True), - "rand_one_same_size": (torch.rand(2, 4, 1, 1), (1, 1), None, True), + "rand_double_scale": lambda: (torch.rand(2, 4, 8, 3), None, 2.0, True), + "rand_double_scale_one_dim": lambda: ( + torch.rand(2, 4, 8, 3), + None, + (1.0, 2.0), + True, + ), + "rand_double_size": lambda: (torch.rand(2, 4, 8, 3), (16, 6), None, True), + "rand_one_double_scale": lambda: (torch.rand(2, 4, 1, 1), None, 2.0, True), + "rand_one_double_size": lambda: (torch.rand(2, 4, 1, 1), (2, 2), None, True), + "rand_one_same_scale": lambda: (torch.rand(2, 4, 1, 1), None, 1.0, True), + "rand_one_same_size": lambda: (torch.rand(2, 4, 1, 1), (1, 1), None, True), # Can't compare outputs as the rounding when selecting the nearest pixel is # different between PyTorch and TOSA. Just check the legalization went well. # TODO Improve the test infrastructure to support more in depth verification # of the TOSA legalization results. - "rand_half_scale": (torch.rand(2, 4, 8, 6), None, 0.5, False), - "rand_half_size": (torch.rand(2, 4, 8, 6), (4, 3), None, False), - "rand_one_and_half_scale": (torch.rand(2, 4, 8, 3), None, 1.5, False), - "rand_one_and_half_size": (torch.rand(2, 4, 8, 3), (12, 4), None, False), + "rand_half_scale": lambda: (torch.rand(2, 4, 8, 6), None, 0.5, False), + "rand_half_size": lambda: (torch.rand(2, 4, 8, 6), (4, 3), None, False), + "rand_one_and_half_scale": lambda: ( + torch.rand(2, 4, 8, 3), + None, + 1.5, + False, + ), + "rand_one_and_half_size": lambda: ( + torch.rand(2, 4, 8, 3), + (12, 4), + None, + False, + ), # Use randn for a bunch of tests to get random numbers from the # normal distribution where negative is also a possibilty - "randn_double_scale_negative": (torch.randn(2, 4, 8, 3), None, 2.0, True), - "randn_double_scale_one_dim_negative": ( + "randn_double_scale_negative": lambda: ( + torch.randn(2, 4, 8, 3), + None, + 2.0, + True, + ), + "randn_double_scale_one_dim_negative": lambda: ( torch.randn(2, 4, 8, 3), None, (1.0, 2.0), True, ), - "randn_double_size_negative": (torch.randn(2, 4, 8, 3), (16, 6), None, True), - "randn_one_double_scale_negative": (torch.randn(2, 4, 1, 1), None, 2.0, True), - "randn_one_double_size_negative": (torch.randn(2, 4, 1, 1), (2, 2), None, True), - "randn_one_same_scale_negative": (torch.randn(2, 4, 1, 1), None, 1.0, True), - "randn_one_same_size_negative": (torch.randn(2, 4, 1, 1), (1, 1), None, True), + "randn_double_size_negative": lambda: ( + torch.randn(2, 4, 8, 3), + (16, 6), + None, + True, + ), + "randn_one_double_scale_negative": lambda: ( + torch.randn(2, 4, 1, 1), + None, + 2.0, + True, + ), + "randn_one_double_size_negative": lambda: ( + torch.randn(2, 4, 1, 1), + (2, 2), + None, + True, + ), + "randn_one_same_scale_negative": lambda: ( + torch.randn(2, 4, 1, 1), + None, + 1.0, + True, + ), + "randn_one_same_size_negative": lambda: ( + torch.randn(2, 4, 1, 1), + (1, 1), + None, + True, + ), } test_data_suite_tosa_bf16 = { - "randn_double_scale_bf16": ( + "randn_double_scale_bf16": lambda: ( torch.randn(1, 2, 2, 2, dtype=torch.bfloat16), None, 2.0, 
True, ), - "randn_double_size_bf16": ( + "randn_double_size_bf16": lambda: ( torch.randn(1, 1, 3, 2, dtype=torch.bfloat16), (6, 4), None, @@ -69,13 +114,13 @@ ), } test_data_suite_tosa_fp16 = { - "randn_double_scale_fp16": ( + "randn_double_scale_fp16": lambda: ( torch.randn(1, 2, 2, 2, dtype=torch.float16), None, 2.0, True, ), - "randn_double_size_fp16": ( + "randn_double_size_fp16": lambda: ( torch.randn(1, 1, 3, 2, dtype=torch.float16), (6, 4), None, @@ -84,14 +129,24 @@ } test_data_suite_Uxx = { - "rand_half_scale": (torch.rand(2, 4, 8, 6), None, 0.5, False), - "rand_half_size": (torch.rand(2, 4, 8, 6), (4, 3), None, False), - "rand_one_and_half_scale": (torch.rand(2, 4, 8, 3), None, 1.5, False), - "rand_one_and_half_size": (torch.rand(2, 4, 8, 3), (12, 4), None, False), + "rand_half_scale": lambda: (torch.rand(2, 4, 8, 6), None, 0.5, False), + "rand_half_size": lambda: (torch.rand(2, 4, 8, 6), (4, 3), None, False), + "rand_one_and_half_scale": lambda: ( + torch.rand(2, 4, 8, 3), + None, + 1.5, + False, + ), + "rand_one_and_half_size": lambda: ( + torch.rand(2, 4, 8, 3), + (12, 4), + None, + False, + ), } test_data_u55 = { - "rand_double_size": (torch.rand(2, 4, 8, 3), (16, 6), None, True), + "rand_double_size": lambda: (torch.rand(2, 4, 8, 3), (16, 6), None, True), } @@ -166,7 +221,7 @@ def forward(self, x): def test_upsample_bilinear2d_vec_tosa_FP_UpsamplingBilinear2d( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() match test_data.dtype: case torch.bfloat16: atol = 1e-2 @@ -196,7 +251,7 @@ def test_upsample_bilinear2d_vec_tosa_FP_UpsamplingBilinear2d( def test_upsample_bilinear2d_vec_tosa_FP_Upsample( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() match test_data.dtype: case torch.bfloat16: atol = 1e-2 @@ -227,7 +282,7 @@ def test_upsample_bilinear2d_vec_tosa_FP_Upsample( def test_upsample_bilinear2d_vec_tosa_FP_Interpolate( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() match test_data.dtype: case torch.bfloat16: atol = 1e-2 @@ -265,7 +320,7 @@ def test_upsample_bilinear2d_vec_tosa_does_not_delegate_exact_one_sixteenth_down def test_upsample_bilinear2d_vec_tosa_INT_intropolate( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = TosaPipelineINT[input_t1]( UpsamplingBilinear2d(size, scale_factor), @@ -282,7 +337,7 @@ def test_upsample_bilinear2d_vec_tosa_INT_intropolate( def test_upsample_bilinear2d_vec_tosa_INT_Upsample( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = TosaPipelineINT[input_t1]( Upsample(size, scale_factor), @@ -302,7 +357,7 @@ def test_upsample_bilinear2d_vec_tosa_INT_a16w8( """Test upsample_bilinear2d vector op with int16 I/O quantization for TOSA INT. 
""" - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = TosaPipelineINT[input_t1]( Upsample(size, scale_factor), (test_data,), @@ -320,7 +375,7 @@ def test_upsample_bilinear2d_vec_tosa_INT_a16w8( def test_upsample_bilinear2d_vec_u55_INT_Upsample_not_delegated( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = OpNotSupportedPipeline[input_t1]( Upsample(size, scale_factor), (test_data,), @@ -338,7 +393,7 @@ def test_upsample_bilinear2d_vec_u55_INT_Upsample_not_delegated( def test_upsample_bilinear2d_vec_u55_INT_Interpolate_not_delegated( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = OpNotSupportedPipeline[input_t1]( Interpolate(size, scale_factor), (test_data,), @@ -356,7 +411,7 @@ def test_upsample_bilinear2d_vec_u55_INT_Interpolate_not_delegated( def test_upsample_bilinear2d_vec_u55_INT_UpsamplingBilinear2d_not_delegated( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = OpNotSupportedPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (test_data,), @@ -372,7 +427,7 @@ def test_upsample_bilinear2d_vec_u55_INT_UpsamplingBilinear2d_not_delegated( @common.parametrize("test_data", test_data_suite_Uxx) @common.XfailIfNoCorstone320 def test_upsample_bilinear2d_vec_u85_INT_Upsample(test_data: input_t1): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( Upsample(size, scale_factor), @@ -391,7 +446,7 @@ def test_upsample_bilinear2d_vec_u85_INT_Upsample(test_data: input_t1): def test_upsample_bilinear2d_vec_u85_INT_Interpolate( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( Interpolate(size, scale_factor), @@ -410,7 +465,7 @@ def test_upsample_bilinear2d_vec_u85_INT_Interpolate( def test_upsample_bilinear2d_vec_u85_INT_UpsamplingBilinear2d( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( UpsamplingBilinear2d(size, scale_factor), @@ -432,7 +487,7 @@ def test_upsample_bilinear2d_vec_u85_INT_a16w8( """Test upsample_bilinear2d vec op with 16A8W quantization on U85 (16-bit activations, 8-bit weights) """ - data, size, scale_factor, compare_outputs = test_data + data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( UpsamplingBilinear2d(size, scale_factor), @@ -452,7 +507,7 @@ def test_upsample_bilinear2d_vec_u85_INT_a16w8( def test_upsample_bilinear2d_vec_vgf_no_quant_UpsamplingBilinear2d( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (data,), @@ -470,8 +525,8 @@ def test_upsample_bilinear2d_vec_vgf_no_quant_UpsamplingBilinear2d( @common.parametrize("test_data", test_data_suite_tosa | test_data_suite_tosa_fp16) @common.SkipIfNoModelConverter def 
test_upsample_bilinear2d_vec_vgf_no_quant_Upsample(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data - match test_data[0].dtype: + data, size, scale_factor, compare = test_data() + match data.dtype: case torch.float16: atol = 1e-2 rtol = 1e-2 @@ -495,7 +550,7 @@ def test_upsample_bilinear2d_vec_vgf_no_quant_Upsample(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite_tosa | test_data_suite_tosa_fp16) @common.SkipIfNoModelConverter def test_upsample_bilinear2d_vec_vgf_no_quant_Interpolate(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Interpolate(size, scale_factor), (data,), @@ -513,7 +568,7 @@ def test_upsample_bilinear2d_vec_vgf_no_quant_Interpolate(test_data: torch.Tenso def test_upsample_bilinear2d_vec_vgf_quant_UpsamplingBilinear2d( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (data,), @@ -529,7 +584,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_UpsamplingBilinear2d( @common.parametrize("test_data", test_data_suite_tosa) @common.SkipIfNoModelConverter def test_upsample_bilinear2d_vec_vgf_quant_Upsample(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Upsample(size, scale_factor), (data,), @@ -545,7 +600,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_Upsample(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite_tosa) @common.SkipIfNoModelConverter def test_upsample_bilinear2d_vec_vgf_quant_Interpolate(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Interpolate(size, scale_factor), (data,), @@ -563,7 +618,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_Interpolate(test_data: torch.Tensor): def test_upsample_bilinear2d_vec_vgf_quant_a16w8_UpsamplingBilinear2d( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (data,), @@ -583,7 +638,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_a16w8_UpsamplingBilinear2d( def test_upsample_bilinear2d_vec_vgf_quant_a16w8_Upsample( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Upsample(size, scale_factor), (data,), @@ -603,7 +658,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_a16w8_Upsample( def test_upsample_bilinear2d_vec_vgf_quant_a16w8_Interpolate( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Interpolate(size, scale_factor), (data,), From acffcb02928e8b6407a8f16b234f720d59526820 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 5 May 2026 10:28:45 -0700 Subject: [PATCH 02/58] CI: skip default-packages on every macos_job.yml callsite (#19297) ### Summary pytorch/test-infra's setup-miniconda action pre-installs cmake=3.22 ninja=1.10 pkg-config=0.29 wheel=0.37 from the anaconda defaults channel into the conda env it sets up for macOS jobs. 
Our own setup-conda.sh then installs cmake=3.31.2 and friends from conda-forge into the same env, and reconciling the two channels' transitive deps (e.g. zlib=1.2.13 vs libzlib>=1.3.1, rhash=1.4.3 vs rhash>=1.4.5) has been intermittently failing the libmamba solver. The companion test-infra PR exposes a default-packages input on macos_job.yml. Pass an empty string from every macos_job.yml callsite in this repo so the conda env created by setup-miniconda no longer pre-pollutes the env with defaults-channel packages we don't use, and our subsequent conda-forge install resolves cleanly. This change has no effect until the [test-infra PR](https://github.com/pytorch/test-infra/pull/8033) lands. Once it's merged on test-infra@main, the workflows here pick it up automatically because executorch tracks @main for all test-infra references. Authored with Claude Code. ### Test plan CI --- .github/workflows/_test_backend.yml | 1 + .github/workflows/_unittest.yml | 1 + .github/workflows/apple.yml | 3 +++ .github/workflows/build-presets.yml | 1 + .github/workflows/metal.yml | 5 +++++ .github/workflows/mlx.yml | 9 +++++++++ .github/workflows/pull.yml | 1 + .github/workflows/trunk.yml | 11 +++++++++++ 8 files changed, 32 insertions(+) diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml index bfa874a440d..6323c007f4a 100644 --- a/.github/workflows/_test_backend.yml +++ b/.github/workflows/_test_backend.yml @@ -129,6 +129,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" ref: ${{ inputs.ref }} runner: macos-m1-stable python-version: "3.12" diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 457480099c3..15c87bd79e4 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -44,6 +44,7 @@ jobs: macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 0d8995a8259..20c7352e40d 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -88,6 +88,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' @@ -175,6 +176,7 @@ jobs: needs: set-version uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' @@ -315,6 +317,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' diff --git a/.github/workflows/build-presets.yml b/.github/workflows/build-presets.yml index 7c5a37e0f6c..37854aed174 100644 --- a/.github/workflows/build-presets.yml +++ b/.github/workflows/build-presets.yml @@ -20,6 +20,7 @@ jobs: matrix: preset: [macos, ios, ios-simulator, pybind, profiling, llm] with: + default-packages: "" job-name: build ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} runner: macos-14-xlarge diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml index 2ab1f1e0e22..de6507e035a 100644 --- a/.github/workflows/metal.yml +++ b/.github/workflows/metal.yml @@ -25,6 +25,7 @@ jobs: name: test-executorch-metal-build uses: 
pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -41,6 +42,7 @@ jobs: name: test-metal-backend-modules uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -65,6 +67,7 @@ jobs: name: test-metal-qwen35-moe-tiny uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -187,6 +190,7 @@ jobs: name: "Voxtral-Mini-4B-Realtime-2602" quant: "non-quantized" with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -252,6 +256,7 @@ jobs: name: "Voxtral-Mini-4B-Realtime-2602" quant: "non-quantized" with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index a40198ea36f..65b8543bfd4 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -28,6 +28,7 @@ jobs: test-mlx: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx runner: macos-14-xlarge python-version: "3.12" @@ -77,6 +78,7 @@ jobs: test-mlx-qwen35-moe: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-qwen35-moe runner: macos-14-xlarge python-version: "3.12" @@ -132,6 +134,7 @@ jobs: suite: [models, operators] uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-backend-${{ matrix.suite }} runner: macos-14-xlarge python-version: "3.12" @@ -173,6 +176,7 @@ jobs: test-mlx-parakeet: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-parakeet runner: macos-14-xlarge python-version: "3.12" @@ -231,6 +235,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-voxtral runner: macos-14-xlarge python-version: "3.12" @@ -291,6 +296,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-voxtral-realtime runner: macos-14-xlarge python-version: "3.12" @@ -366,6 +372,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-whisper runner: macos-14-xlarge python-version: "3.12" @@ -413,6 +420,7 @@ jobs: test-mlx-stories110m: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-stories110m runner: macos-14-xlarge python-version: "3.12" @@ -492,6 +500,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }} runner: macos-14-xlarge python-version: "3.12" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 6a4439b4254..97633965652 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1521,6 +1521,7 @@ jobs: runner: [macos-m1-stable, macos-m2-stable] fail-fast: false with: + default-packages: "" runner: ${{ matrix.runner }} submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 68c2e68436e..670517f836b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -40,6 +40,7 @@ jobs: backend: portable fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -275,6 +276,7 @@ jobs: - build-tool: cmake fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -297,6 +299,7 @@ jobs: - build-tool: cmake fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -457,6 +460,7 @@ jobs: name: test-coreml-delegate uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' @@ -475,6 +479,7 @@ jobs: name: test-static-llama-ane uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -497,6 +502,7 @@ jobs: name: test-llama-torchao-lowbit uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -581,6 +587,7 @@ jobs: mode: [mps, coreml, xnnpack+custom+quantize_kv] fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -667,6 +674,7 @@ jobs: matrix: model: ["gemma3-4b"] # llava gives segfault so not covering. with: + default-packages: "" secrets-env: EXECUTORCH_HF_TOKEN runner: macos-15-xlarge python-version: '3.11' @@ -754,6 +762,7 @@ jobs: model: [dl3, edsr, efficient_sam, emformer_join, emformer_transcribe, ic3, ic4, mobilebert, mv2, mv3, resnet50, vit, w2l] fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -793,6 +802,7 @@ jobs: strategy: fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -932,6 +942,7 @@ jobs: ] fail-fast: false with: + default-packages: "" secrets-env: EXECUTORCH_HF_TOKEN runner: macos-15-xlarge python-version: '3.11' From 83ac75c8d9da324bf6185403ae02286b6ea4fa74 Mon Sep 17 00:00:00 2001 From: Ishan Godawatta <32465586+IshanG97@users.noreply.github.com> Date: Tue, 5 May 2026 19:26:58 +0100 Subject: [PATCH 03/58] feat(mlx): add handler for aten.roll (#19038) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Adds an MLX delegate handler for `aten.roll`, mapping `torch.roll` onto `mlx::core::roll` via a new `RollNode` in the schema. Replaces the default decomposition (`index_select + arange + cat`) with a single native kernel — needed by Swin Transformer's shift-window attention. Flat roll (`dims=[]`) raises `NotImplementedError` for now; no known consumer needs it yet. Generated files (`MLXLoader.*`, `schema_generated.h`, `mlx_graph_schema.py`, `_generated_serializers.py`, `_generated_inspector.py`, `_generated/`) are regenerated from `schema.fbs` by `backends/mlx/CMakeLists.txt` at build time and are deliberately not committed. Fixes #18919. ## Test plan - `python backends/mlx/serialization/generate.py` — regenerates cleanly with `RollNode` in all expected outputs. - `lintrunner --skip MYPY --paths-cmd 'git diff --name-only upstream/main'` — no issues. 
- End-to-end `run_all_tests -k roll` not run locally (no executorch build on this machine); relying on CI. Happy to push fixes if it finds anything. cc @metascroy Co-authored-by: Ishan Godawatta --- backends/mlx/ops.py | 40 ++++++++++++++++++++ backends/mlx/runtime/MLXInterpreter.h | 10 +++++ backends/mlx/serialization/schema.fbs | 13 ++++++- backends/mlx/test/test_ops.py | 53 +++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 1 deletion(-) diff --git a/backends/mlx/ops.py b/backends/mlx/ops.py index 27d214e0ae9..9651d4a7b58 100644 --- a/backends/mlx/ops.py +++ b/backends/mlx/ops.py @@ -117,6 +117,7 @@ RepeatNode, ReshapeNode, RMSNormNode, + RollNode, RopeNode, RoundNode, RsqrtNode, @@ -1678,6 +1679,45 @@ def _repeat_handler(P: MLXProgramBuilder, n: Node) -> Slot: return out +@REGISTRY.register(target=[torch.ops.aten.roll.default]) +def _roll_handler(P: MLXProgramBuilder, n: Node) -> Slot: + args = P.args(n) + require_args(args, 2, 3, "aten.roll") + require_kwargs(P.kwargs(n), set(), "aten.roll") + x = args[0] + shifts_arg = args[1] + dims_arg = args[2] if len(args) > 2 else [] + + shifts = [shifts_arg] if isinstance(shifts_arg, int) else list(shifts_arg) + dims: List[int] = [dims_arg] if isinstance(dims_arg, int) else list(dims_arg) + + # Flat roll (torch.roll with dims=[]) would require reshape + roll + + # reshape at the graph level. Not yet supported; Swin-style usage always + # passes explicit dims. + if not dims: + raise NotImplementedError( + "aten.roll without dims (flat roll) is not supported by the MLX " + "delegate yet." + ) + if len(shifts) != len(dims): + raise ValueError( + f"aten.roll: shifts and dims must have the same length, got " + f"shifts={shifts} (len={len(shifts)}) dims={dims} (len={len(dims)})" + ) + require_static_ints(dims, "dims", "aten.roll") + + out = P.make_or_get_slot(n) + P.emit( + RollNode( + x=P.slot_to_tid(x), + out=P.slot_to_tid(out), + shift=[P.to_int_or_vid(s) for s in shifts], + axes=dims, + ) + ) + return out + + @REGISTRY.register(target=[torch.ops.aten.index.Tensor]) def _index_handler(P: MLXProgramBuilder, n: Node) -> Slot: args = P.args(n) diff --git a/backends/mlx/runtime/MLXInterpreter.h b/backends/mlx/runtime/MLXInterpreter.h index 304fdfe9805..57f24993499 100644 --- a/backends/mlx/runtime/MLXInterpreter.h +++ b/backends/mlx/runtime/MLXInterpreter.h @@ -1733,6 +1733,13 @@ inline void exec_all(const AllNode& n, ExecutionState& st, StreamOrDevice s) { } } +inline void exec_roll(const RollNode& n, ExecutionState& st, StreamOrDevice s) { + const auto& x = st.const_tensor_ref(n.x); + auto shifts = to_shape(n.shift, st); + std::vector axes(n.axes.begin(), n.axes.end()); + st.set_tensor(n.out, roll(x, shifts, axes, s)); +} + inline void exec_repeat(const RepeatNode& n, ExecutionState& st, StreamOrDevice s) { const auto& x = st.const_tensor_ref(n.x); @@ -2210,6 +2217,9 @@ class Interpreter { case OpCode::REPEAT: ops::exec_repeat(std::get(instr.node), st, s); break; + case OpCode::ROLL: + ops::exec_roll(std::get(instr.node), st, s); + break; case OpCode::SORT: ops::exec_sort(std::get(instr.node), st, s); break; diff --git a/backends/mlx/serialization/schema.fbs b/backends/mlx/serialization/schema.fbs index 67b4636f0be..42c5754f4f7 100644 --- a/backends/mlx/serialization/schema.fbs +++ b/backends/mlx/serialization/schema.fbs @@ -673,6 +673,16 @@ table ArgPartitionNode { axis: int32; } +// Shift tensor elements along specified axes with wrap-around. +// Maps to mlx::core::roll(a, shifts, axes). 
+// Flat roll (torch.roll with dims=None) is not yet supported. +table RollNode { + x: Tid (required); + out: Tid (required); + shift: [IntOrVid] (required); // Shift amount per axis (can be dynamic) + axes: [int32] (required); // Axes to roll along; len(shift) == len(axes) +} + // ============================================================================= // Math ops - Unary element-wise @@ -1119,7 +1129,8 @@ union OpNode { GatherQmmNode, ScanNode, MetalKernelNode, - BitwiseInvertNode + BitwiseInvertNode, + RollNode // BC: Add new op nodes here (append only) } diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index 459d5aa1e73..a44ed83da4c 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -855,6 +855,59 @@ def create_inputs(self) -> Tuple[torch.Tensor, ...]: return (x,) +class RollModel(nn.Module): + """Model that rolls a tensor along specified dimensions.""" + + def __init__(self, shifts: Tuple[int, ...], dims: Tuple[int, ...]): + super().__init__() + self.shifts = shifts + self.dims = dims + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.roll(x, shifts=self.shifts, dims=self.dims) + + +@register_test +class RollTest(OpTestCase): + """Test case for torch.roll().""" + + name = "roll" + rtol = 1e-5 + atol = 1e-5 + + def __init__( + self, + input_shape: Tuple[int, ...] = (4, 5), + shifts: Tuple[int, ...] = (1,), + dims: Tuple[int, ...] = (0,), + ): + self.input_shape = input_shape + self.shifts = shifts + self.dims = dims + shift_str = ",".join(str(s) for s in shifts) + dim_str = ",".join(str(d) for d in dims) + self.name = f"roll_shift({shift_str})_dim({dim_str})" + + @classmethod + def get_test_configs(cls) -> List["RollTest"]: + return [ + cls(input_shape=(8,), shifts=(2,), dims=(0,)), + cls(input_shape=(4, 5), shifts=(1,), dims=(0,)), + cls(input_shape=(4, 5), shifts=(-2,), dims=(1,)), + cls(input_shape=(3, 4, 5), shifts=(3,), dims=(2,)), + cls(input_shape=(3, 4, 5), shifts=(1, 2), dims=(0, 2)), + cls(input_shape=(3, 4, 5), shifts=(-1, -2, -3), dims=(0, 1, 2)), + cls(input_shape=(3, 4, 5), shifts=(2,), dims=(-1,)), + ] + + def create_model(self) -> nn.Module: + return RollModel(self.shifts, self.dims) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + x = torch.randn(self.input_shape) + return (x,) + + class CatNModel(nn.Module): """Model that concatenates N tensors along a dimension.""" From 9915faff34dd32fe5244244396152581c217f23e Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 11:38:25 -0700 Subject: [PATCH 04/58] Lora fix (#19304) number of think tokens is a little flakey and I dont think its super material for now so relaxing --- .ci/scripts/test_lora.sh | 20 +++++++++++++++++++- .ci/scripts/test_lora_multimethod.sh | 20 +++++++++++++++++++- .ci/scripts/test_model_e2e.sh | 2 +- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/.ci/scripts/test_lora.sh b/.ci/scripts/test_lora.sh index e323aed114d..79561f54622 100644 --- a/.ci/scripts/test_lora.sh +++ b/.ci/scripts/test_lora.sh @@ -33,6 +33,24 @@ cleanup_files() { rm result*.txt } +matches_base_response_prefix() { + local output_file="$1" + python - "$output_file" <<'PY' +import pathlib +import re +import sys + +text = pathlib.Path(sys.argv[1]).read_text() +pattern = re.compile( + r"^<\|im_start\|>user Calculate 15% of 80\?<\|im_end\|><\|im_start\|>assistant:\n" + r"(?:\n)+" + r"Okay, so I need to calculate 15% of 80\.", + re.MULTILINE, +) +sys.exit(0 if pattern.match(text) else 1) +PY +} + # Hosting lora adapter 
in personal repo for now. python -m pip install -q huggingface_hub HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math" @@ -186,7 +204,7 @@ cmake-out/examples/models/llama/llama_main --model_path=qwen_q.pte --data_paths= NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" RESULT=$(cat result.txt) -if [[ "${RESULT}" == "${EXPECTED_QUANT_PREFIX}"* ]]; then +if matches_base_response_prefix result.txt; then echo "Expected result prefix: ${EXPECTED_QUANT_PREFIX}" echo "Actual result: ${RESULT}" echo "Test 3: Success" diff --git a/.ci/scripts/test_lora_multimethod.sh b/.ci/scripts/test_lora_multimethod.sh index 8f4ae1a4f68..7c468eb226b 100755 --- a/.ci/scripts/test_lora_multimethod.sh +++ b/.ci/scripts/test_lora_multimethod.sh @@ -33,6 +33,24 @@ cleanup_files() { rm -f result*.txt } +matches_base_response_prefix() { + local output_file="$1" + python - "$output_file" <<'PY' +import pathlib +import re +import sys + +text = pathlib.Path(sys.argv[1]).read_text() +pattern = re.compile( + r"^<\|im_start\|>user Calculate 15% of 80\?<\|im_end\|><\|im_start\|>assistant:\n" + r"(?:\n)+" + r"Okay, so I need to calculate 15% of 80\.", + re.MULTILINE, +) +sys.exit(0 if pattern.match(text) else 1) +PY +} + # Download LoRA adapter. python -m pip install -q huggingface_hub HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math" @@ -107,7 +125,7 @@ NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" RESULT=$(cat result_base.txt) -if [[ "${RESULT}" == "${EXPECTED_BASE_PREFIX}"* ]]; then +if matches_base_response_prefix result_base.txt; then echo "Test 2 (base_forward): Success" else echo "Test 2 (base_forward): Failure" diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 7205bb6d49c..188cb514b2f 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -258,7 +258,7 @@ fi if [ "$AUDIO_URL" != "" ]; then curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then - conda install -y -c conda-forge "ffmpeg<8" + conda install -y -c conda-forge ffmpeg pip install datasets soundfile # We pushd'd into EXECUTORCH_ROOT above, so torch_pin is importable here. TORCHCODEC_PKG=$(python -c "from torch_pin import torchcodec_spec; print(torchcodec_spec())") From ff25a2f6867511da20fafdad42ca38d3fdd76ff5 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 5 May 2026 11:45:23 -0700 Subject: [PATCH 05/58] QNN SDK download: validate archive and retry on all errors (#19233) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary The QNN backend test workflows have been flaking because the download from softwarecenter.qualcomm.com aborts mid-stream with `curl: (92) HTTP/2 stream 0 was not closed cleanly: INTERNAL_ERROR`, or returns a short error body that curl treats as a successful 200 — letting unzip choke on the not-a-zip with exit 9. The previous `curl --retry 3` only covered a narrow set of transient errors and never validated the archive, so neither failure was retried. Wrap the download in a five-attempt loop using `curl --fail --retry-all-errors` and validate each attempt with `unzip -t` before proceeding, with the on-disk file size logged on failure so a tiny error body is unambiguous in the log. Authored with Claude Code. 
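For reference, the retry-and-validate shape of the change, as a standalone sketch (the `URL`/`ARCHIVE` names and the attempt count below are placeholders, not the script's actual variables):

```bash
#!/usr/bin/env bash
set -euo pipefail

URL="https://example.com/sdk.zip"   # placeholder download URL
ARCHIVE="/tmp/sdk.zip"              # placeholder output path
MAX_ATTEMPTS=5

for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do
  rm -f "${ARCHIVE}"
  # --fail turns HTTP error responses into non-zero exits;
  # --retry-all-errors also retries transport failures such as an
  # HTTP/2 stream reset mid-download; unzip -t rejects short error
  # bodies that curl would otherwise report as a successful download.
  if curl --fail --retry 3 --retry-delay 5 --retry-all-errors \
       -Lo "${ARCHIVE}" "${URL}" \
     && unzip -tq "${ARCHIVE}"; then
    break
  fi
  ls -l "${ARCHIVE}" || true        # log whatever landed on disk
  if [ "${attempt}" -eq "${MAX_ATTEMPTS}" ]; then
    echo "ERROR: download failed after ${attempt} attempts" >&2
    exit 1
  fi
  sleep $((attempt * 10))           # back off a bit more each round
done
```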
### Test plan CI --- backends/qualcomm/scripts/install_qnn_sdk.sh | 22 +++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/backends/qualcomm/scripts/install_qnn_sdk.sh b/backends/qualcomm/scripts/install_qnn_sdk.sh index 5bc0f7eeb1d..7921b48da2f 100644 --- a/backends/qualcomm/scripts/install_qnn_sdk.sh +++ b/backends/qualcomm/scripts/install_qnn_sdk.sh @@ -64,7 +64,27 @@ install_qnn() { mkdir -p "${QNN_INSTALLATION_DIR}" QNN_ZIP_FILE="v${QNN_VERSION}.zip" - curl --retry 3 -Lo "/tmp/${QNN_ZIP_FILE}" "${QNN_ZIP_URL}" + # softwarecenter.qualcomm.com intermittently aborts the download with + # HTTP/2 INTERNAL_ERROR mid-stream, and occasionally returns a tiny + # error body that curl treats as success — both cases get caught here: + # --fail rejects HTTP errors, --retry-all-errors retries transport + # errors, and `unzip -t` validates the archive before we proceed. + QNN_DOWNLOAD_MAX_ATTEMPTS=5 + for attempt in $(seq 1 ${QNN_DOWNLOAD_MAX_ATTEMPTS}); do + rm -f "/tmp/${QNN_ZIP_FILE}" + if curl --fail --retry 3 --retry-delay 5 --retry-connrefused --retry-all-errors \ + -Lo "/tmp/${QNN_ZIP_FILE}" "${QNN_ZIP_URL}" \ + && unzip -tq "/tmp/${QNN_ZIP_FILE}"; then + break + fi + ls -l "/tmp/${QNN_ZIP_FILE}" 2>&1 || true + if [ "${attempt}" = "${QNN_DOWNLOAD_MAX_ATTEMPTS}" ]; then + echo "ERROR: QNN SDK download failed after ${attempt} attempts" >&2 + exit 1 + fi + echo "QNN SDK download attempt ${attempt} failed; retrying in $((attempt * 10))s..." + sleep $((attempt * 10)) + done echo "Finishing downloading qnn sdk." unzip -qo "/tmp/${QNN_ZIP_FILE}" -d /tmp echo "Finishing unzip qnn sdk." From a0d6e9bc4bb8bb2a33b6373e041eaf82f14829c2 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Tue, 5 May 2026 12:33:46 -0700 Subject: [PATCH 06/58] switch correctness checks to SNR-based assertion for cuda quant int4_matmul (#19300) Replace torch.allclose(atol/rtol) with an SNR (signal-to-noise ratio) assertion across all int4_matmul / int4_matvec / dequant-vs-fused tests. Why: - test_prefill_short was flaking on CI (A10G) with max_abs_err=1.0000. Root cause: bf16 GEMM with K=2048 reduction produces output magnitudes up to ~200; at that scale, the bf16 ULP gap is 0.5-1.0. Triton fused kernel and cuBLAS reduce in different orders (and Triton autotune picks different tile configs on different hardware), so 1-ULP element-wise differences are unavoidable. atol/rtol false-fails on these outliers; SNR averages them out. - atol/rtol thresholds also depend on size: a value tuned for K=2048 is too loose for K=64 and too tight for K=4096. SNR is size-invariant (||signal|| and ||noise|| both scale with sqrt(N) and sqrt(K), canceling in the ratio). What: - Add _assert_snr(test_case, actual, expected, label) helper that asserts 20*log10(||expected|| / ||actual-expected||) >= 50 dB. - Replace 4 call sites: TestInt4Matmul, TestInt4Matvec (x2), TestDequantThenMatmul. - 50 dB ~ 0.3% RMS error: well below observed clean noise (80-90 dB) and well above any real functional bug (<20 dB SNR for wrong stride / flipped nibble / off-by-one group_idx / missing mask). 
Test plan: python -m pytest backends/cuda/tests/test_int4_matmul.py -v -> 35/35 passed --- backends/cuda/tests/test_int4_matmul.py | 64 +++++++++++++++---------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/backends/cuda/tests/test_int4_matmul.py b/backends/cuda/tests/test_int4_matmul.py index 2f33f888ac1..ed0ca47f3f6 100644 --- a/backends/cuda/tests/test_int4_matmul.py +++ b/backends/cuda/tests/test_int4_matmul.py @@ -19,7 +19,6 @@ import unittest import torch - from executorch.backends.cuda.triton.kernels.int4_matmul import ( dequant_w4_to_bf16, int4_matmul, @@ -28,6 +27,41 @@ ATOL = 0.01 DEVICE = "cuda" +SNR_THRESHOLD_DB = 50.0 + + +def _assert_snr(test_case, actual, expected, label, threshold_db=SNR_THRESHOLD_DB): + """Assert signal-to-noise ratio (in dB) of `actual` vs `expected` >= threshold. + + SNR = 20*log10(||expected||_2 / ||actual - expected||_2) + + Why SNR rather than torch.allclose(atol/rtol): + * Size-invariant: ||signal|| and ||noise|| both scale with sqrt(N) and + with sqrt(K) (CLT + random-walk rounding), so the ratio is independent + of tensor size and reduction depth. The same threshold works for + K=64 and K=4096, M=1 and M=1024. + * Robust to bf16 ULP outliers: with K=2048 and output magnitudes ~200, + a single element can differ by ~1.0 just from differing reduction + orders (Triton fused vs cuBLAS). atol/rtol false-fails on these; + SNR averages them out. + * Sensitive to real bugs: wrong stride, flipped nibble, off-by-one + group_idx, or a missing mask all collapse SNR to <20 dB. The 50 dB + threshold (≈0.3% RMS error) sits comfortably between observed clean + noise floor (~80-90 dB) and any genuine functional break. + """ + a = actual.float() + b = expected.float() + diff = a - b + signal = b.norm() + noise = diff.norm() + snr_db = (20.0 * torch.log10(signal / noise.clamp(min=1e-9))).item() + test_case.assertGreater( + snr_db, + threshold_db, + f"{label}: SNR={snr_db:.1f} dB (threshold {threshold_db:.1f} dB), " + f"max_abs_err={diff.abs().max().item():.4f}, " + f"signal_norm={signal.item():.2f}, noise_norm={noise.item():.4f}", + ) def _quantize_simple(w_bf16, group_size): @@ -118,12 +152,7 @@ def _run_matmul(self, M, N, K, group_size): self.assertEqual(out.shape, (M, N)) self.assertEqual(out.dtype, torch.bfloat16) - self.assertTrue( - torch.allclose(out.float(), ref.float(), atol=ATOL, rtol=0.01), - f"int4_matmul M={M} [{N}x{K}] gs={group_size}: " - f"max_abs_err={(out.float() - ref.float()).abs().max().item():.4f}, " - f"max_rel_err={((out.float() - ref.float()).abs() / ref.float().abs().clamp(min=1e-6)).max().item():.4f}", - ) + _assert_snr(self, out, ref, f"int4_matmul M={M} [{N}x{K}] gs={group_size}") # --- Decode (M=1) --- def test_decode_square(self): @@ -189,13 +218,7 @@ def _run_matvec(self, N, K, group_size): self.assertEqual(out.shape, (1, N)) self.assertEqual(out.dtype, torch.bfloat16) - # atol=1.0 for large accumulation across K, rtol=0.01 for relative - self.assertTrue( - torch.allclose(out.float(), ref.float(), atol=1.0, rtol=0.01), - f"int4_matvec [{N}x{K}] gs={group_size}: " - f"max_err={(out.float() - ref.float()).abs().max().item():.4f}, " - f"max_rel={((out.float()-ref.float()).abs()/(ref.float().abs().clamp(min=0.1))).max().item():.4f}", - ) + _assert_snr(self, out, ref, f"int4_matvec [{N}x{K}] gs={group_size}") def test_qkv_proj(self): self._run_matvec(2048, 2048, 128) @@ -226,10 +249,7 @@ def test_matches_int4_matmul(self): out_mv = int4_matvec(x, packed, scale, gs) out_mm = int4_matmul(x, packed, scale, gs) - 
self.assertTrue( - torch.allclose(out_mv.float(), out_mm.float(), atol=1.0, rtol=0.01), - f"matvec vs matmul: max_err={(out_mv.float() - out_mm.float()).abs().max().item():.4f}", - ) + _assert_snr(self, out_mv, out_mm, "matvec vs matmul") class TestDequantThenMatmul(unittest.TestCase): @@ -248,13 +268,7 @@ def _run(self, M, N, K, group_size): w_bf16 = dequant_w4_to_bf16(packed, scale, group_size) out_dequant = torch.nn.functional.linear(x, w_bf16) - self.assertTrue( - torch.allclose( - out_fused.float(), out_dequant.float(), atol=ATOL, rtol=0.01 - ), - f"fused vs dequant M={M} [{N}x{K}]: " - f"max_abs_err={(out_fused.float() - out_dequant.float()).abs().max().item():.4f}", - ) + _assert_snr(self, out_fused, out_dequant, f"fused vs dequant M={M} [{N}x{K}]") def test_decode(self): self._run(1, 2048, 2048, 128) From ca7d5cfe6bd57eeea007fcce38ffccba89b66aa2 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 5 May 2026 13:58:37 -0700 Subject: [PATCH 07/58] Add missing program_builder dep to arm test targets (#19266) Summary: D102325968 added an import of `executorch.backends.test.program_builder` in `test_fuse_constant_ops_pass.py` but only updated the xplat `targets.bzl` with the corresponding Buck dependency. The fbcode `targets.bzl` was missing this dependency, causing test listing failures for `fbcode//executorch/backends/arm/test:fuse_constant_ops_pass`. Add `//executorch/backends/test:program_builder` to the deps list. Differential Revision: D103456950 cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- backends/arm/test/targets.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 52d1b651b75..6e2539cf2dc 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -98,6 +98,7 @@ def define_arm_tests(): "//executorch/backends/arm/tosa:partitioner", "//executorch/backends/arm:vgf", "//executorch/backends/test:graph_builder", + "//executorch/backends/test:program_builder", "//executorch/exir:lib", "fbsource//third-party/pypi/pytest:pytest", "fbsource//third-party/pypi/parameterized:parameterized", From 15da1d1a0065664c51ecd343100fd11843c8cb2e Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 14:28:45 -0700 Subject: [PATCH 08/58] change ffmpeg install path away from conda on linux (#19306) --- .ci/scripts/test_model_e2e.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 188cb514b2f..97711de2713 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -258,7 +258,19 @@ fi if [ "$AUDIO_URL" != "" ]; then curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then - conda install -y -c conda-forge ffmpeg + if ! command -v ffmpeg >/dev/null; then + if [ "$(uname -s)" = "Linux" ] && command -v apt-get >/dev/null; then + if [ "$(id -u)" -eq 0 ]; then + apt-get update + apt-get install -y --no-install-recommends ffmpeg + else + sudo apt-get update + sudo apt-get install -y --no-install-recommends ffmpeg + fi + else + conda install -y -c conda-forge ffmpeg + fi + fi pip install datasets soundfile # We pushd'd into EXECUTORCH_ROOT above, so torch_pin is importable here. 
TORCHCODEC_PKG=$(python -c "from torch_pin import torchcodec_spec; print(torchcodec_spec())") From 10a0c91261d2cca177539881fcfda72ada342bc1 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 15:07:04 -0700 Subject: [PATCH 09/58] Improve Android Emulator Robustness (#19310) Disabling animations sometimes cant happen right after a slow boot. Moving disabling animations to the script where we can retry. Did some general cleaning of other possible sources of flakiness as well. Authored with codex --- .github/workflows/_android.yml | 7 +++++-- scripts/run_android_emulator.sh | 32 +++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index 54622a33cd8..6f152607d7c 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -109,6 +109,9 @@ jobs: ram-size: 16384M heap-size: 12288M force-avd-creation: false - disable-animations: true - emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none + # The action's built-in animation disabling runs immediately after + # boot and is not retried. Software-emulated boots can briefly drop + # adb there, so scripts/run_android_emulator.sh handles it instead. + disable-animations: false + emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none -no-metrics emulator-boot-timeout: 900 diff --git a/scripts/run_android_emulator.sh b/scripts/run_android_emulator.sh index 29c2425cd0e..041c2f17b94 100755 --- a/scripts/run_android_emulator.sh +++ b/scripts/run_android_emulator.sh @@ -10,21 +10,43 @@ set -ex # This script is originally adopted from https://github.com/pytorch/pytorch/blob/main/android/run_tests.sh ADB_PATH=$ANDROID_HOME/platform-tools/adb +adb_shell_with_retries() { + local attempts="$1" + shift + + for ((i = 1; i <= attempts; i++)); do + if "$ADB_PATH" shell "$@"; then + return 0 + fi + sleep 5 + "$ADB_PATH" wait-for-device + done + + return 1 +} + echo "Waiting for emulator boot to complete" # shellcheck disable=SC2016 $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; do sleep 5; done;' +$ADB_PATH wait-for-device + +echo "Unlock emulator and disable animations" +adb_shell_with_retries 5 input keyevent 82 || true +adb_shell_with_retries 5 settings put global window_animation_scale 0.0 || true +adb_shell_with_retries 5 settings put global transition_animation_scale 0.0 || true +adb_shell_with_retries 5 settings put global animator_duration_scale 0.0 || true # The device will be created by ReactiveCircus/android-emulator-runner GHA echo "List all running emulators" $ADB_PATH devices -adb uninstall org.pytorch.executorch.test || true -adb install -t android-test-debug-androidTest.apk +"$ADB_PATH" uninstall org.pytorch.executorch.test || true +"$ADB_PATH" install -t android-test-debug-androidTest.apk -adb logcat -c -adb shell am instrument -w -r \ +"$ADB_PATH" logcat -c +"$ADB_PATH" shell am instrument -w -r \ org.pytorch.executorch.test/androidx.test.runner.AndroidJUnitRunner >result.txt 2>&1 -adb logcat -d > logcat.txt +"$ADB_PATH" logcat -d > logcat.txt cat logcat.txt grep -q FAILURES result.txt && cat result.txt grep -q FAILURES result.txt && exit -1 From fe2ce0655ca034bb2c82ba9a74d462efe15ef9f6 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 15:30:03 -0700 Subject: [PATCH 10/58] Improve huggingface robustness (#19311) Really should retry the web 
requests in optimum instead of just retrying the whole export here. --- .ci/scripts/test_huggingface_optimum_model.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 59e5fa3d03a..04ad1f5e792 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -2,8 +2,10 @@ import gc import logging import math +import shutil import subprocess import tempfile +import time from pathlib import Path from typing import List @@ -25,6 +27,17 @@ ) +EXPORT_RETRIES = 3 + + +def _clear_export_dir(model_dir): + for path in Path(model_dir).iterdir(): + if path.is_dir() and not path.is_symlink(): + shutil.rmtree(path) + else: + path.unlink() + + def cli_export(command, model_dir): p = Path(model_dir) if p.exists(): @@ -34,11 +47,19 @@ def cli_export(command, model_dir): raise Exception( f"Existing directory {model_dir} is non-empty. Please remove it first." ) - try: - subprocess.run(command, check=True) - print("Export completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Export failed with error: {e}") + + for attempt in range(1, EXPORT_RETRIES + 1): + try: + subprocess.run(command, check=True) + print("Export completed successfully.") + return + except subprocess.CalledProcessError as e: + print(f"Export attempt {attempt}/{EXPORT_RETRIES} failed with error: {e}") + if attempt == EXPORT_RETRIES: + raise + if p.exists(): + _clear_export_dir(model_dir) + time.sleep(attempt * 10) def check_causal_lm_output_quality( From e0cc4688145ea24d17d4540889237502a23bdb98 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 5 May 2026 15:31:00 -0700 Subject: [PATCH 11/58] arm: validate archive and retry on all errors for FVP and toolchain downloads (#19309) ### Summary The Test ARM Backend workflow has been intermittently failing with `curl: (92) HTTP/2 stream 0 was not closed cleanly: INTERNAL_ERROR` during the FVP corstone download from developer.arm.com's CDN. The toolchain download in the same setup uses the same bare-curl pattern and fails the same way when the CDN flakes. In both cases the previous flow was a single `curl --output ...` followed by a fatal `verify_md5`, so neither a transient HTTP/2 reset nor a short error body that curl treats as a successful 200 was retried. Factor out a `download_with_retry` helper in utils.sh that wraps the download in a five-attempt outer loop using `curl --fail --retry-all-errors` and validates each attempt against the published MD5 before proceeding, with the on-disk file size logged on failure for diagnosis. Switch verify_md5's mismatch path from `exit 2` to `return 2` so the helper can treat a bad checksum as a retryable failure; existing callers (`verify_md5 ... || exit 1`) keep the same fatal-on-mismatch behavior since the function still returns non-zero on a bad checksum. Use the helper from both fvp_utils.sh and toolchain_utils.sh in place of the bare `curl` + `verify_md5` pair. Authored with Claude Code. 
### Test plan CI cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- backends/arm/scripts/fvp_utils.sh | 3 +- backends/arm/scripts/toolchain_utils.sh | 3 +- backends/arm/scripts/utils.sh | 48 +++++++++++++++++++++++-- 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/backends/arm/scripts/fvp_utils.sh b/backends/arm/scripts/fvp_utils.sh index 978ec363848..73f67112efd 100644 --- a/backends/arm/scripts/fvp_utils.sh +++ b/backends/arm/scripts/fvp_utils.sh @@ -67,10 +67,9 @@ function install_fvp() { log_step "fvp" "Downloading FVP ${fvp}" url_variable=${fvp}_url fvp_url=${!url_variable} - curl --output "FVP_${fvp}.tgz" "${fvp_url}" md5_variable=${fvp}_md5_checksum fvp_md5_checksum=${!md5_variable} - verify_md5 ${fvp_md5_checksum} FVP_${fvp}.tgz || exit 1 + download_with_retry "fvp" "${fvp_url}" "FVP_${fvp}.tgz" "${fvp_md5_checksum}" || exit 1 fi log_step "fvp" "Installing FVP ${fvp}" diff --git a/backends/arm/scripts/toolchain_utils.sh b/backends/arm/scripts/toolchain_utils.sh index 3d5f12e556b..5b37bcee7b4 100644 --- a/backends/arm/scripts/toolchain_utils.sh +++ b/backends/arm/scripts/toolchain_utils.sh @@ -107,8 +107,7 @@ function setup_toolchain() { if [[ ! -e "${toolchain_archive}" ]]; then log_step "toolchain" "Downloading ${toolchain_dir} toolchain" - curl --output "${toolchain_archive}" -L "${toolchain_url}" - verify_md5 ${toolchain_md5_checksum} "${toolchain_archive}" || exit 1 + download_with_retry "toolchain" "${toolchain_url}" "${toolchain_archive}" "${toolchain_md5_checksum}" || exit 1 fi log_step "toolchain" "Installing ${toolchain_dir} toolchain" diff --git a/backends/arm/scripts/utils.sh b/backends/arm/scripts/utils.sh index 2253311a19f..a7f151140f2 100644 --- a/backends/arm/scripts/utils.sh +++ b/backends/arm/scripts/utils.sh @@ -47,7 +47,10 @@ function verify_md5() { # Arg 1: Expected checksum for file # Arg 2: Path to file # Exits with return code 1 if the number of arguments is incorrect. - # Exits with return code 2 if the calculated mf5 does not match the given. + # Returns 2 if the calculated md5 does not match the given. Returning + # rather than exiting lets callers like download_with_retry treat a bad + # checksum as a retryable failure (e.g. truncated download) instead of + # tearing down the whole script. [[ $# -ne 2 ]] \ && { echo "[${FUNCNAME[0]}] Invalid number of args, expecting 2, but got $#"; exit 1; } @@ -60,11 +63,50 @@ function verify_md5() { local file_checksum="$(md5sum $file | awk '{print $1}')" fi if [[ ${ref_checksum} != ${file_checksum} ]]; then - echo "Mismatched MD5 checksum for file: ${file}. Expecting ${ref_checksum} but got ${file_checksum}. Exiting." - exit 2 + echo "Mismatched MD5 checksum for file: ${file}. Expecting ${ref_checksum} but got ${file_checksum}." + return 2 fi } +function download_with_retry() { + # Download a URL to a path and validate its MD5, retrying on transport + # or checksum errors. developer.arm.com's CDN intermittently aborts the + # download mid-stream with HTTP/2 INTERNAL_ERROR (curl exit 92), and + # rare cases return a short error body that curl treats as success; + # both are caught here. --fail rejects HTTP errors, + # --retry-all-errors handles transport errors, and verify_md5 catches + # truncation / wrong-content via the published archive checksum. 
+ + # Arg 1: log context (passed to log_step) + # Arg 2: URL to download + # Arg 3: Output path + # Arg 4: Expected MD5 checksum + + [[ $# -ne 4 ]] \ + && { echo "[${FUNCNAME[0]}] Invalid number of args, expecting 4, but got $#"; exit 1; } + local context="${1}" + local url="${2}" + local output="${3}" + local expected_md5="${4}" + + local max_attempts=5 + for attempt in $(seq 1 ${max_attempts}); do + rm -f "${output}" + if curl --fail --retry 3 --retry-delay 5 --retry-connrefused --retry-all-errors \ + -L --output "${output}" "${url}" \ + && verify_md5 "${expected_md5}" "${output}"; then + return 0 + fi + ls -l "${output}" 2>&1 || true + if [[ "${attempt}" = "${max_attempts}" ]]; then + log_step "${context}" "ERROR: download of ${url} failed after ${attempt} attempts" + return 1 + fi + log_step "${context}" "download attempt ${attempt} failed; retrying in $((attempt * 10))s..." + sleep $((attempt * 10)) + done +} + function patch_repo() { # Patch git repo found in $repo_dir, starting from patch $base_rev and applying patches found in $patch_dir/$name. From 9b95dd21bba52ea8ab5cb72da115f3e9497027f3 Mon Sep 17 00:00:00 2001 From: billmguo Date: Tue, 5 May 2026 15:53:09 -0700 Subject: [PATCH 12/58] runner fix to mitigate the numerical issue (#19286) Differential Revision: D103690468 Pull Request resolved: https://github.com/pytorch/executorch/pull/19286 --- .../models/llama/runner/static_attention_io_manager.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h index c4e851f0b0c..6f631df3ff0 100644 --- a/examples/models/llama/runner/static_attention_io_manager.h +++ b/examples/models/llama/runner/static_attention_io_manager.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -53,8 +54,8 @@ class StaticKVCache { style_(style), input_ptrs_(n_caches_), output_ptrs_(n_caches_) { - size_t total_cache_len = - std::accumulate(cache_lengths_.begin(), cache_lengths_.end(), 0); + size_t total_cache_len = std::accumulate( + cache_lengths_.begin(), cache_lengths_.end(), size_t(0)); cache_data_size_ = total_cache_len * n_heads_per_cache_ * head_dim_; update_data_size_ = n_caches_ * n_heads_per_cache_ * max_input_len_ * head_dim_; @@ -867,6 +868,12 @@ class StaticAttentionIOManager { void set_input(executorch::runtime::Method& method, size_t idx, T* data) { auto methodMeta = method.method_meta(); auto inputMeta = methodMeta.input_tensor_meta(idx); + ET_CHECK_MSG( + sizeof(T) == executorch::runtime::elementSize(inputMeta->scalar_type()), + "set_input: sizeof(T)=%zu but model expects element size %zu for input %zu", + sizeof(T), + executorch::runtime::elementSize(inputMeta->scalar_type()), + idx); auto impl = ::executorch::runtime::etensor::TensorImpl( inputMeta->scalar_type(), inputMeta->sizes().size(), From 165ac2e1537765f81dbf8d8d9b74622e98758e9d Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 15:58:09 -0700 Subject: [PATCH 13/58] Relax lora string test (#19312) Seeing small numerical flakeyness causing diverging output. The output is roughly equivalent though. Current thought is different cpu architectures causing different xnnpack kernels to trigger causing minor difference in output. 
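In sketch form, the relaxed check accepts any of a small set of known-good prefixes rather than one exact string (the prefixes below are illustrative stand-ins; the actual change in the diff keeps the two-prefix `[[ ... ]] || [[ ... ]]` form):

```bash
#!/usr/bin/env bash
# Illustrative prefixes only -- the real script defines the full
# multi-line expected transcripts.
ACCEPTED_PREFIXES=(
  "To calculate 15% of 80, we can multiply 80 by 15/100."
  "To calculate 15% of 80, we multiply 80 by 0.15."
)
RESULT=$(cat result.txt)

matched=false
for prefix in "${ACCEPTED_PREFIXES[@]}"; do
  # Prefix match: numerically flaky runs may diverge later in the
  # transcript while still starting from an equivalent answer.
  if [[ "${RESULT}" == "${prefix}"* ]]; then
    matched=true
    break
  fi
done

if [[ "${matched}" == true ]]; then
  echo "Test: Success"
else
  echo "Actual result: ${RESULT}"
  echo "Test: Failure; results not the same"
  exit 1
fi
```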
--- .ci/scripts/test_lora.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_lora.sh b/.ci/scripts/test_lora.sh index 79561f54622..b20d456a673 100644 --- a/.ci/scripts/test_lora.sh +++ b/.ci/scripts/test_lora.sh @@ -160,6 +160,13 @@ To calculate 15% of 80, we can multiply 80 by 15/100. So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12. #### 12 The answer is: 12<|im_end|>" +EXPECTED_QUANT_LORA_ALTERNATE_PREFIX=" +<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant +To calculate 15% of 80, we can multiply 80 by 15/100. +80 * 15/100 = 12. +So, 15% of 80 is 12. +#### 12 +The answer is: 12<|im_end|>" # Export Quantized PTE, PTD file, no LoRA. # override base.lora_config=null to avoid creating a lora model @@ -225,12 +232,13 @@ NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" RESULT=$(cat result.txt) -if [[ "${RESULT}" == "${EXPECTED_QUANT_LORA_PREFIX}"* ]]; then +if [[ "${RESULT}" == "${EXPECTED_QUANT_LORA_PREFIX}"* ]] || [[ "${RESULT}" == "${EXPECTED_QUANT_LORA_ALTERNATE_PREFIX}"* ]]; then echo "Expected result prefix: ${EXPECTED_QUANT_LORA_PREFIX}" echo "Actual result: ${RESULT}" echo "Test 4: Success" else echo "Expected result prefix: ${EXPECTED_QUANT_LORA_PREFIX}" + echo "Alternate expected result prefix: ${EXPECTED_QUANT_LORA_ALTERNATE_PREFIX}" echo "Actual result: ${RESULT}" echo "Test 4: Failure; results not the same" cleanup_files From 5d07ce0ee5c77dc6dd0304013fd56789d4081a2b Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 16:12:51 -0700 Subject: [PATCH 14/58] Cadence tests should retry (#19313) With these giant 500+ op tests we often will get a flakey 1/500 failure. Just adding retries to make this a little less noisy. Failure is something like FAILED backends/cadence/aot/tests/test_replace_ops_passes.py::TestReplaceOpsPasses::test_replace_conv2d_with_linear - AssertionError: Pass validation failed for pass ReplaceTrivialConvWithLinear. Output tensor 0 differs by max 1.525879e-05. Expected rtol=2e-05, atol=1e-06. Original output: tensor([[[[ 6.5604]], --- .github/workflows/_test_cadence.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_test_cadence.yml b/.github/workflows/_test_cadence.yml index 5b81ddc3f82..2e98d21db1c 100644 --- a/.github/workflows/_test_cadence.yml +++ b/.github/workflows/_test_cadence.yml @@ -45,9 +45,9 @@ jobs: ./install_requirements.sh > /dev/null pip install -e . --no-build-isolation > /dev/null - pip install beartype later pyre_extensions pytest-xdist + pip install beartype later pyre_extensions pytest-rerunfailures==15.1 pytest-xdist - python -m pytest backends/cadence/aot/tests/ -v -n auto + python -m pytest backends/cadence/aot/tests/ -v -n auto --reruns 2 --reruns-delay 1 test-ops: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -66,7 +66,7 @@ jobs: ./install_requirements.sh > /dev/null pip install -e . 
--no-build-isolation > /dev/null - pip install beartype later pyre_extensions pytest-xdist + pip install beartype later pyre_extensions pytest-rerunfailures==15.1 pytest-xdist # Use the pre-built runner from the build job mkdir -p cmake-out/backends/cadence @@ -74,4 +74,4 @@ jobs: chmod +x cmake-out/backends/cadence/cadence_runner export PYTHONPATH="${PYTHONPATH:-}:$(pwd)/backends/cadence/utils/FACTO" - python -m pytest examples/cadence/operators/ -v -n auto + python -m pytest examples/cadence/operators/ -v -n auto --reruns 2 --reruns-delay 1 From 5dcf0ed18f20d840407339eb1be6e76b62e38546 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 5 May 2026 17:01:54 -0700 Subject: [PATCH 15/58] llama/rope: gate fp64 hf_precompute_freqs_cis on cos/sin scaling (#19308) ### Summary a79521b904 ("Add LongRoPE support and fp64 RoPE precompute for Phi-3 / Phi-4 family") unconditionally moved hf_precompute_freqs_cis to fp64 cos/sin precompute with a final cast to fp32. That works for the Phi-4 device validation that motivated the commit, but it broke test_static_attention.py::test_within_transformer on the Linux unittest runners (pull, pull-editable, trunk-release have been 100% red since the commit landed). The test compares mha_transformer (built with use_hf_rope=False, taking the pure-fp32 precompute_freqs_cis path) against static_transformer (built with use_hf_rope=True, taking hf_precompute_freqs_cis) at rtol=1e-3, with shared weights. Before a79521b904, both paths produced bit-identical fp32 cos/sin tables (verified empirically: 0/192 entries differed). After the commit, HF cos/sin diverge from non-HF by ~1 ULP in 38/192 entries; that drift compounds across 4 transformer layers and tips past rtol=1e-3 on the CI runners (Python 3.10, source-built torch). Local Python 3.12 stayed just barely within tolerance, which is why review missed it. Gate the fp64 precompute on the property the original commit was actually protecting: a non-trivial cos/sin scale being applied. That is either LongRoPE active (Phi-3 / Phi-4 set short_factor and long_factor via config) or an explicit attention_factor != 1.0 passed through. Both cases preserve fp64; vanilla HF RoPE (Llama family, the test config) goes back to fp32 throughout and re-establishes bit-identical agreement with the non-HF path. Authored with Claude Code. ### Test plan CI --- examples/models/llama/rope.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index 7fc111e2c34..5b3a2d03874 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -154,21 +154,32 @@ def hf_precompute_freqs_cis( # Partial rotary embeddings. dim = int(dim * partial_rotary_factor) - # Compute the RoPE table in fp64 to minimize ULP-level drift; cast to fp32 - # once at the end. Phi-4 Mini's narrow decode-time logit margins make the - # exported model sensitive to 1-ULP differences in freqs_cos / freqs_sin - # under sampling, especially on the Vulkan delegate. + # fp64 precompute is required whenever cos/sin will be scaled by a + # non-trivial attention_factor (LongRoPE on Phi-3 / Phi-4 family). There, + # fp32 ULP-level rounding in the table is load-bearing on Vulkan under + # sampling -- a fp32-only regression manifests as decode-time n-gram + # looping, not a unit-test red. For vanilla HF RoPE, fp32 throughout + # produces cos/sin tables bit-identical to the non-HF precompute_freqs_cis + # path, which the static-attention vs MHA parity tests rely on. 
+ # + # If you add a new model that needs cos/sin scaling but does not set + # short_factor / long_factor / attention_factor, extend the gate below. + longrope_active = (short_factor is not None) or (long_factor is not None) + needs_fp64 = longrope_active or ( + attention_factor is not None and attention_factor != 1.0 + ) + compute_dtype = torch.float64 if needs_fp64 else torch.float32 + inv_freq = 1.0 / ( theta ** ( - torch.arange(0, dim, 2, device=device, dtype=torch.int64).to(torch.float64) + torch.arange(0, dim, 2, device=device, dtype=torch.int64).to(compute_dtype) / dim ) ) # LongRoPE: divide inv_freq element-wise by short_factor or long_factor. # Selection mirrors HF: long_factor when seq_len > original_max_position_embeddings. - longrope_active = (short_factor is not None) or (long_factor is not None) if longrope_active: chosen = ( long_factor @@ -178,7 +189,7 @@ def hf_precompute_freqs_cis( if chosen is None: # Fall back to whichever factor was provided. chosen = short_factor if long_factor is None else long_factor - ext_factors = torch.tensor(chosen, dtype=torch.float64, device=device) + ext_factors = torch.tensor(chosen, dtype=compute_dtype, device=device) assert ext_factors.numel() == inv_freq.numel(), ( f"LongRoPE factor length {ext_factors.numel()} must equal dim/2 " f"({inv_freq.numel()})" @@ -200,8 +211,8 @@ def hf_precompute_freqs_cis( ) # pyre-ignore Undefined attribute [16]: `float` has no attribute `device`. - t = torch.arange(end, device=inv_freq.device, dtype=torch.int64).to(torch.float64) - freqs = torch.outer(t, inv_freq).to(torch.float64) # pyre-ignore + t = torch.arange(end, device=inv_freq.device, dtype=torch.int64).to(compute_dtype) + freqs = torch.outer(t, inv_freq).to(compute_dtype) # pyre-ignore emb = torch.cat((freqs, freqs), dim=-1) cos_tab = torch.cos(emb) sin_tab = torch.sin(emb) From 0f9de6a93d2ee1c6ebe42357dd9ab348e41bfea7 Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Tue, 5 May 2026 17:06:32 -0700 Subject: [PATCH 16/58] Move CUDAGuard/CUDAStreamGuard static_assert tests out of CUDA fixtures (#19314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The 6 type-trait checks below were defined as TEST_F(CUDAGuardTest, ...) and TEST_F(CUDAStreamGuardTest, ...). Both fixtures' SetUp() calls GTEST_SKIP() when no CUDA device is available, so on every test host without an attached GPU these tests skip instead of running: CUDAGuardTest.CopyConstructorDeleted CUDAGuardTest.CopyAssignmentDeleted CUDAGuardTest.MoveAssignmentDeleted CUDAStreamGuardTest.CopyConstructorDeleted CUDAStreamGuardTest.CopyAssignmentDeleted CUDAStreamGuardTest.MoveAssignmentDeleted Because they never produced a successful run (Passes: 0 across 173 / 23 runs, all skips), TestX auto-disabled them and they show up as DISABLED on the executorch dashboard. These are pure compile-time static_assert checks. They do not need a CUDA device or any runtime state — if the file compiles, they pass. Move them into a separate non-fixture test suite (CUDAGuardCompileTimeTest / CUDAStreamGuardCompileTimeTest) so they run unconditionally. The remaining 15 fixture-based tests still need a real CUDA device and will be addressed separately (fixing the gpu-remote-execution platform deps so cudaGetDeviceCount returns a non-zero value). 
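
One way to sanity-check the move locally on a host with no CUDA device (illustrative only; `test_cuda_guard` stands in for however the test binary is named in a local build):

# List the new non-fixture suite; it should appear alongside the
# fixture-based CUDAGuardTest suite.
./test_cuda_guard --gtest_list_tests | grep -A3 CUDAGuardCompileTimeTest

# Run only the compile-time suite. Because it no longer goes through
# CUDAGuardTest::SetUp(), the three checks should now report as passed
# rather than skipped.
./test_cuda_guard --gtest_filter='CUDAGuardCompileTimeTest.*'
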
Reviewed By: Gasoonjia Differential Revision: D103937761 --- backends/aoti/slim/cuda/test/test_cuda_guard.cpp | 9 ++++++--- backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/backends/aoti/slim/cuda/test/test_cuda_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_guard.cpp index c9938bf5cd8..70da3108aba 100644 --- a/backends/aoti/slim/cuda/test/test_cuda_guard.cpp +++ b/backends/aoti/slim/cuda/test/test_cuda_guard.cpp @@ -94,19 +94,22 @@ TEST_F(CUDAGuardTest, NegativeDeviceIndex) { EXPECT_FALSE(guard_result.ok()); } -TEST_F(CUDAGuardTest, CopyConstructorDeleted) { +// Compile-time type-trait checks. These do not need a CUDA device, so they +// live outside the CUDAGuardTest fixture (whose SetUp() calls GTEST_SKIP +// when no CUDA device is available). +TEST(CUDAGuardCompileTimeTest, CopyConstructorDeleted) { static_assert( !std::is_copy_constructible_v, "CUDAGuard should not be copy constructible"); } -TEST_F(CUDAGuardTest, CopyAssignmentDeleted) { +TEST(CUDAGuardCompileTimeTest, CopyAssignmentDeleted) { static_assert( !std::is_copy_assignable_v, "CUDAGuard should not be copy assignable"); } -TEST_F(CUDAGuardTest, MoveAssignmentDeleted) { +TEST(CUDAGuardCompileTimeTest, MoveAssignmentDeleted) { static_assert( !std::is_move_assignable_v, "CUDAGuard should not be move assignable"); diff --git a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp index 613bc6ffe19..1f1acdac5db 100644 --- a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp +++ b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp @@ -234,19 +234,22 @@ TEST_F(CUDAStreamGuardTest, NegativeDeviceIndex) { EXPECT_FALSE(guard_result.ok()); } -TEST_F(CUDAStreamGuardTest, CopyConstructorDeleted) { +// Compile-time type-trait checks. These do not need a CUDA device, so they +// live outside the CUDAStreamGuardTest fixture (whose SetUp() calls +// GTEST_SKIP when no CUDA device is available). +TEST(CUDAStreamGuardCompileTimeTest, CopyConstructorDeleted) { static_assert( !std::is_copy_constructible_v, "CUDAStreamGuard should not be copy constructible"); } -TEST_F(CUDAStreamGuardTest, CopyAssignmentDeleted) { +TEST(CUDAStreamGuardCompileTimeTest, CopyAssignmentDeleted) { static_assert( !std::is_copy_assignable_v, "CUDAStreamGuard should not be copy assignable"); } -TEST_F(CUDAStreamGuardTest, MoveAssignmentDeleted) { +TEST(CUDAStreamGuardCompileTimeTest, MoveAssignmentDeleted) { static_assert( !std::is_move_assignable_v, "CUDAStreamGuard should not be move assignable"); From 5faf36eb26ae35529ed78f3b746539d8bfdc1a9e Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Tue, 5 May 2026 18:36:33 -0700 Subject: [PATCH 17/58] Restrict XOR python export targets to fbcode (#19316) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: `xplat/executorch/extension/training/examples/XOR/BUCK` invokes `define_common_targets()` for both fbcode (`fbcode_target`) and xplat (`non_fbcode_target`). The python targets in this example (`model`, `export_model_lib`, `export_model`) depend on `//caffe2:torch` and `//executorch/exir:lib`, neither of which is defined as an xplat target — `xplat/executorch/exir/BUCK` only declares the `:lib` target via `fbcode_target(...)`. 
As a result the xplat configuration of `fbsource//xplat/executorch/extension/training/examples/XOR:export_model` fails analysis with: Unknown target `lib` from package `fbsource//xplat/executorch/exir`. Did you mean one of the 0 targets in fbsource//xplat/executorch/exir:BUCK? This produced 218/218 BUILD_RULE failures on the `fbsource//xplat/executorch/extension/training/examples/XOR:export_model` target with no successful run on record (linked to T168807700). Wrap the three python rules with `if not is_xplat():` so they only register when called from fbcode, matching the established precedent in `xplat/executorch/kernels/portable/test/targets.bzl`. The `train_xor` C++ binary continues to be defined for both cells since its dependencies are xplat-compatible. Differential Revision: D103951555 --- extension/training/examples/XOR/targets.bzl | 56 +++++++++++---------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/extension/training/examples/XOR/targets.bzl b/extension/training/examples/XOR/targets.bzl index 4a85c34c1bb..f332dc49b3f 100644 --- a/extension/training/examples/XOR/targets.bzl +++ b/extension/training/examples/XOR/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "is_xplat", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -23,30 +23,34 @@ def define_common_targets(): define_static_target = True, ) - runtime.python_library( - name = "model", - srcs = ["model.py"], - visibility = [], # Private - deps = [ - "//caffe2:torch", - ], - ) + # The Python export targets depend on `//caffe2:torch` and + # `//executorch/exir:lib`, neither of which exist as xplat (fbsource) + # targets. Restrict these to fbcode only. + if not is_xplat(): + runtime.python_library( + name = "model", + srcs = ["model.py"], + visibility = [], # Private + deps = [ + "//caffe2:torch", + ], + ) - runtime.python_library( - name = "export_model_lib", - srcs = ["export_model.py"], - visibility = ["//executorch/extension/training/examples/XOR/..."], - deps = [ - ":model", - "//caffe2:torch", - "//executorch/exir:lib", - ], - ) + runtime.python_library( + name = "export_model_lib", + srcs = ["export_model.py"], + visibility = ["//executorch/extension/training/examples/XOR/..."], + deps = [ + ":model", + "//caffe2:torch", + "//executorch/exir:lib", + ], + ) - runtime.python_binary( - name = "export_model", - main_module = "executorch.extension.training.examples.XOR.export_model", - deps = [ - ":export_model_lib", - ], - ) + runtime.python_binary( + name = "export_model", + main_module = "executorch.extension.training.examples.XOR.export_model", + deps = [ + ":export_model_lib", + ], + ) From 6a8d3419717a168a003cfbadee12d8f84d3658dd Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 19:21:28 -0700 Subject: [PATCH 18/58] Retry op numeric tests Arm (#19321) Similar to the cadence retries. The numerics tests are a little too strict likely and a little flakey. 
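
For context, a rough bash sketch of the retry wiring the diff below introduces, assuming the pytest-rerunfailures plugin is installed (the cadence change earlier in this series pins pytest-rerunfailures==15.1); the test path is a placeholder:

PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1)

# Array expansion keeps each flag as its own argument. Note that under
# `set -u`, expanding an empty array errors out on the older bash shipped
# with macOS, which is presumably what the later "Fix retry variable on
# mac" change in this series guards against with a length check.
pytest "${PYTEST_RETRY_ARGS[@]}" --numprocesses=auto backends/arm/test/
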
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- .ci/scripts/test_backend.sh | 4 +++- backends/arm/test/test_arm_baremetal.sh | 22 ++++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh index e035b44cb62..431607e899d 100755 --- a/.ci/scripts/test_backend.sh +++ b/.ci/scripts/test_backend.sh @@ -35,6 +35,7 @@ export PYTHON_EXECUTABLE=python # CMake options to use, in addition to the defaults. EXTRA_BUILD_ARGS="" +PYTEST_RETRY_ARGS=() if [[ "$FLOW" == *qnn* ]]; then # Setup QNN sdk and deps - note that this is a bit hacky due to the nature of the @@ -57,6 +58,7 @@ if [[ "$FLOW" == *vulkan* ]]; then fi if [[ "$FLOW" == *arm* ]]; then + PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1) # Setup ARM deps. if [[ "$FLOW" == *vgf* ]]; then @@ -95,6 +97,6 @@ GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts" export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}" EXIT_CODE=0 -${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$? +${CONDA_RUN_CMD} pytest -c /dev/null -n auto "${PYTEST_RETRY_ARGS[@]}" backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$? # Generate markdown summary. ${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 382ad9f633d..ad8cd8b7d3a 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -48,6 +48,7 @@ fi TEST_SUITE_NAME="$(basename "$0") ${TEST_SUITE}" EXCLUDE_TARGET_EXPR="(not u55) and (not u85) and (not tosa) and (not _vgf_)" +PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1) all() { # Run all tests # This will list all lines in this file that is starting with test_ remove () { and add this script name in @@ -80,7 +81,7 @@ test_pytest_ops_no_target() { echo "${TEST_SUITE_NAME}: Run pytest ops for target-less tests" # Run arm baremetal pytest tests without target - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" echo "${TEST_SUITE_NAME}: PASS" } @@ -91,7 +92,7 @@ test_pytest_models_no_target() { source backends/arm/scripts/install_models_for_test.sh # Run arm baremetal pytest tests without FVP - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" echo "${TEST_SUITE_NAME}: PASS" } @@ -101,7 +102,7 @@ test_pytest_models_no_target() { test_pytest_ops_tosa() { echo "${TEST_SUITE_NAME}: Run pytest ops for TOSA" - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k tosa + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k tosa echo "${TEST_SUITE_NAME}: PASS" } @@ -111,7 +112,7 @@ test_pytest_models_tosa() { # Install model 
dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k tosa + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k tosa echo "${TEST_SUITE_NAME}: PASS" } @@ -134,7 +135,7 @@ test_pytest_ops_ethos_u55() { backends/arm/scripts/build_executorch.sh backends/arm/test/setup_testing.sh - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u55 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u55 echo "${TEST_SUITE_NAME}: PASS" } @@ -147,7 +148,7 @@ test_pytest_models_ethos_u55() { # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u55 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u55 echo "${TEST_SUITE_NAME}: PASS" } @@ -188,7 +189,7 @@ test_pytest_ops_ethos_u85() { backends/arm/test/setup_testing.sh # Run arm baremetal pytest tests with FVP - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u85 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u85 echo "${TEST_SUITE_NAME}: PASS" } @@ -201,7 +202,7 @@ test_pytest_models_ethos_u85() { # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u85 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u85 echo "${TEST_SUITE_NAME}: PASS" } @@ -235,7 +236,7 @@ test_pytest_ops_vkml() { source backends/arm/test/setup_testing_vkml.sh - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ \ + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ \ --ignore=backends/arm/test/models -k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } @@ -248,7 +249,7 @@ test_pytest_models_vkml() { # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k _vgf_ + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } @@ -295,6 +296,7 @@ test_smaller_stories_llama() { # Get path to source directory pytest \ -c /dev/null \ + "${PYTEST_RETRY_ARGS[@]}" \ --verbose \ --color=yes \ --numprocesses=auto \ From 5b337e96262ee25b97f5303f595c23356f801329 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 May 2026 19:29:42 -0700 Subject: [PATCH 19/58] Declare pip as explicit dep (#19322) Job is a little flakey as sometimes the runner doesn't contain pip. This adds it as an explicit dep. 
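
A minimal sketch of the pattern, with a hypothetical environment name: request pip (and packaging) when the environment is created, then always drive pip through that environment's interpreter so the upgrade cannot land in whichever `pip` happens to be first on PATH.

# Hypothetical environment name for illustration only.
conda create -y -n bc_demo_env python=3.10 pip packaging
CONDA_RUN="conda run --no-capture-output -n bc_demo_env"
${CONDA_RUN} python -m pip install --upgrade pip
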
--- .ci/scripts/test_coreml_bc.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/scripts/test_coreml_bc.sh b/.ci/scripts/test_coreml_bc.sh index ac379481ea5..b077726832e 100644 --- a/.ci/scripts/test_coreml_bc.sh +++ b/.ci/scripts/test_coreml_bc.sh @@ -23,7 +23,7 @@ source "${REPO_ROOT}/.ci/scripts/utils.sh" # Create a conda environment with Python 3.10 for compatibility with old ET versions # ET 1.0.0 only supports Python >=3.10,<3.13 CONDA_ENV_NAME="coreml_bc_test_env" -conda create -y -n "${CONDA_ENV_NAME}" python=3.10 +conda create -y -n "${CONDA_ENV_NAME}" python=3.10 pip packaging # Use conda run to execute commands in the new environment CONDA_RUN="conda run --no-capture-output -n ${CONDA_ENV_NAME}" @@ -69,7 +69,7 @@ git submodule sync --recursive git submodule update --init --recursive # Install executorch -${CONDA_RUN} pip install --upgrade pip +${CONDA_RUN} python -m pip install --upgrade pip ${CONDA_RUN} python install_executorch.py # Step 3: Export model @@ -129,7 +129,7 @@ git submodule update --init --recursive # Step 5: Install current version echo "=== Step 5: Installing current ET version ===" -${CONDA_RUN} pip install --upgrade pip +${CONDA_RUN} python -m pip install --upgrade pip ${CONDA_RUN} python install_executorch.py # Step 6: Run the old pte file From 8ae05c2bc72fe2645859abcd245d6e8ac7ea1d2b Mon Sep 17 00:00:00 2001 From: Abdurrahman Akkas Date: Tue, 5 May 2026 20:02:50 -0700 Subject: [PATCH 20/58] Fix FuseMMWithAdd returning False after graph mutation Differential Revision: D103935830 Pull Request resolved: https://github.com/pytorch/executorch/pull/19318 --- backends/cadence/aot/fuse_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/cadence/aot/fuse_ops.py b/backends/cadence/aot/fuse_ops.py index aaf13562388..42be54d48b9 100644 --- a/backends/cadence/aot/fuse_ops.py +++ b/backends/cadence/aot/fuse_ops.py @@ -190,7 +190,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: # is True) # 2. The single successor of addmm is not a view op. if len(addmm_node.users) == 0: - return False + return True addmm_user = list(addmm_node.users.keys())[0] if intermediate_view and not self._is_view_node(addmm_user): From 1debeb6d38f5ce71a03a0cdc11a187d929047b5d Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Tue, 5 May 2026 21:46:19 -0700 Subject: [PATCH 21/58] Re-apply U55 reject split for bool permute test (#19320) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Forward fix on top of D103817917 (`Arm backend: Cleanup dim-order and permute handling` — DiffTrain import of upstream PR #19278). D103817917 reverted an internally-applied test split: the bool permute case (`rank2_bool`) is U55-rejected and was already moved out of the U55-delegating test on master into a separate `test_data_suite_u55_reject` set with a dedicated `test_permute_u55_INT_not_delegated` test using `OpNotSupportedPipeline`. Upstream PR #19278 doesn't include that split, so the DiffTrain import wipes it out and brings back the combined `test_data_suite` + special-cased bool branch in `test_permute_u55_INT`. That regresses CI: `test_permute_u55_INT[rank2_bool]` is reported as a critical LAND_BLOCKING failure on D103817917. Re-apply the split so trunk returns to the clean form after D103817917 lands: - Add `OpNotSupportedPipeline` import. - Move `rank2_bool` out of `test_data_suite` into a new `test_data_suite_u55_reject` dict. 
- Drop the dead `if test_data[0].dtype == torch.bool: ...` workaround block from `test_permute_u55_INT` (no bool flows through this test anymore). - Add `test_permute_u55_INT_not_delegated` parametrized over `test_data_suite_u55_reject`, exercising `OpNotSupportedPipeline` with `u55_subset=True`. The `test_data_suite_u55` dict introduced by D103817917 (large permutes that only U55 needs to exercise) is preserved unchanged. Differential Revision: D103963260 cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- backends/arm/test/ops/test_permute.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 98fb034e311..8864324dbd5 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -17,6 +17,7 @@ from executorch.backends.arm.test.tester.test_pipeline import ( EthosU55PipelineINT, EthosU85PipelineINT, + OpNotSupportedPipeline, TosaPipelineFP, TosaPipelineINT, VgfPipeline, @@ -39,6 +40,9 @@ "rank_3_large": lambda: (torch.rand(16, 64, 65), [1, 2, 0]), "reshape_large_1": lambda: (torch.rand(1, 1, 65537), [0, 2, 1]), "reshape_large_2": lambda: (torch.rand(65537, 1, 1), [1, 2, 0]), +} + +test_data_suite_u55_reject = { "rank2_bool": lambda: (torch.randint(0, 2, (5, 5), dtype=torch.bool), [1, 0]), } @@ -111,10 +115,19 @@ def test_permute_u55_INT(test_data): aten_op, exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", ) - if test_data[0].dtype == torch.bool: - pipeline.tester.use_portable_ops = True - pipeline.pop_stage("check_count.exir") - pipeline.pop_stage("check_not.exir") + pipeline.run() + + +@common.parametrize("test_data", test_data_suite_u55_reject) +def test_permute_u55_INT_not_delegated(test_data: torch.Tensor): + test_data, dims = test_data() + pipeline = OpNotSupportedPipeline[input_t1]( + SimplePermute(dims=dims), + (test_data,), + non_delegated_ops={exir_op: 1}, + quantize=True, + u55_subset=True, + ) pipeline.run() From c7e8628c05b661089c1772e048c6a15d66427798 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 6 May 2026 09:40:37 -0700 Subject: [PATCH 22/58] Fix retry variable on mac (#19333) --- .ci/scripts/test_backend.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh index 431607e899d..8bf2236e4e6 100755 --- a/.ci/scripts/test_backend.sh +++ b/.ci/scripts/test_backend.sh @@ -97,6 +97,11 @@ GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts" export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}" EXIT_CODE=0 -${CONDA_RUN_CMD} pytest -c /dev/null -n auto "${PYTEST_RETRY_ARGS[@]}" backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$? +PYTEST_ARGS=(-c /dev/null -n auto) +if [[ ${#PYTEST_RETRY_ARGS[@]} -gt 0 ]]; then + PYTEST_ARGS+=("${PYTEST_RETRY_ARGS[@]}") +fi +PYTEST_ARGS+=("backends/test/suite/$SUITE/" -m "flow_$FLOW" --json-report --json-report-file="$REPORT_FILE") +${CONDA_RUN_CMD} pytest "${PYTEST_ARGS[@]}" || EXIT_CODE=$? # Generate markdown summary. 
${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE From 3ffaf27f2d6eeab42b410d519e5f47fea6d21098 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 6 May 2026 10:36:40 -0700 Subject: [PATCH 23/58] Revert torch-family pin centralization (#19334) Several macos jobs have been timing out really bad since this change blocking viablestrict. We could potentially just increase the runtime but until I actually get a viable strict bump I dont want to be sat waiting 1-2hrs for these jobs to run so reverting to get back to the 30-40m runtimes. --- .ci/docker/build.sh | 17 ---- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- .ci/docker/common/install_pytorch.sh | 29 +----- .ci/docker/ubuntu/Dockerfile | 5 - .ci/scripts/test_model_e2e.sh | 5 +- .ci/scripts/test_wheel_package_qnn.sh | 27 +++-- .ci/scripts/tests/test_torch_pin.py | 54 ---------- .ci/scripts/utils.sh | 54 +++------- .github/scripts/update_pytorch_pin.py | 98 +++++++++++-------- .github/workflows/weekly-pytorch-pin-bump.yml | 35 ++----- .../models/moshi/mimi/install_requirements.sh | 10 +- install_executorch.py | 19 ++-- install_requirements.py | 64 +++++------- torch_pin.py | 59 +---------- 14 files changed, 130 insertions(+), 348 deletions(-) delete mode 100644 .ci/scripts/tests/test_torch_pin.py diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index e6122c498ce..7c4a80044e4 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -92,18 +92,6 @@ esac TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) BUILD_DOCS=1 -# Pull channel + spec/url helpers out of torch_pin.py so install_pytorch.sh -# (which runs inside the docker build, where torch_pin.py isn't available) -# can decide between wheel install (test/release) and source build (nightly). -# Self-hosted runners often have python3 but not the unversioned python alias. -PYTHON_BIN=$(command -v python3 || command -v python) -TORCH_PIN_HELPERS=$(cd ../.. 
&& "$PYTHON_BIN" -c "from torch_pin import CHANNEL, torch_spec, torchaudio_spec, torchvision_spec, torch_index_url_base; print(CHANNEL); print(torch_spec()); print(torchaudio_spec()); print(torchvision_spec()); print(torch_index_url_base())") -TORCH_CHANNEL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '1p') -TORCH_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '2p') -TORCHAUDIO_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '3p') -TORCHVISION_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '4p') -TORCH_INDEX_URL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '5p') - # Copy requirements-lintrunner.txt from root to here cp ../../requirements-lintrunner.txt ./ @@ -116,11 +104,6 @@ docker build \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ --build-arg "TORCH_VERSION=${TORCH_VERSION}" \ - --build-arg "TORCH_CHANNEL=${TORCH_CHANNEL}" \ - --build-arg "TORCH_SPEC=${TORCH_SPEC}" \ - --build-arg "TORCHAUDIO_SPEC=${TORCHAUDIO_SPEC}" \ - --build-arg "TORCHVISION_SPEC=${TORCHVISION_SPEC}" \ - --build-arg "TORCH_INDEX_URL=${TORCH_INDEX_URL}" \ --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \ --build-arg "LINTRUNNER=${LINTRUNNER:-}" \ --build-arg "BUILD_DOCS=${BUILD_DOCS}" \ diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 0932a9ef6b8..f6e39a63b92 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -release/2.11 +release/2.11 \ No newline at end of file diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh index ddf2f21baa9..548a24f885d 100755 --- a/.ci/docker/common/install_pytorch.sh +++ b/.ci/docker/common/install_pytorch.sh @@ -17,24 +17,6 @@ install_domains() { } install_pytorch_and_domains() { - if [ "${TORCH_CHANNEL}" != "nightly" ]; then - # Test/release: install the published wheels directly. The specs and URL - # are passed in as docker build args (computed from torch_pin.py by - # .ci/docker/build.sh). RC wheels at /whl/test/ get re-uploaded under the - # same version, so use --no-cache-dir there to avoid stale cache hits. - local cache_flag="" - if [ "${TORCH_CHANNEL}" = "test" ]; then - cache_flag="--no-cache-dir" - fi - pip_install --force-reinstall ${cache_flag} \ - "${TORCH_SPEC}" "${TORCHVISION_SPEC}" "${TORCHAUDIO_SPEC}" \ - --index-url "${TORCH_INDEX_URL}/cpu" - return - fi - - # Nightly: build pytorch from source against the pinned SHA in pytorch.txt - # so we catch upstream regressions, then install audio/vision from the - # commits that pytorch itself pins. git clone https://github.com/pytorch/pytorch.git # Fetch the target commit @@ -45,19 +27,14 @@ install_pytorch_and_domains() { chown -R ci-user . export _GLIBCXX_USE_CXX11_ABI=1 - # PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe - # doesn't pass — gcc-11 in this image is too old to accept the combined - # NEON/SVE/bfloat16 intrinsics the probe exercises. Executorch's aarch64 - # runtime targets (phones, embedded) don't use SVE, so bypass the check. - export BUILD_IGNORE_SVE_UNAVAILABLE=1 # Then build and install PyTorch conda_run python setup.py bdist_wheel pip_install "$(echo dist/*.whl)" - # Defer to PyTorch's own pinned audio/vision commits. 
- TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) + # Grab the pinned audio and vision commits from PyTorch + TORCHAUDIO_VERSION=release/2.11 export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) + TORCHVISION_VERSION=release/0.26 export TORCHVISION_VERSION install_domains diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 98268d49675..0e2d7e48eb9 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -64,11 +64,6 @@ ENV SCCACHE_S3_KEY_PREFIX executorch ENV SCCACHE_REGION us-east-1 ARG TORCH_VERSION -ARG TORCH_CHANNEL -ARG TORCH_SPEC -ARG TORCHAUDIO_SPEC -ARG TORCHVISION_SPEC -ARG TORCH_INDEX_URL ARG SKIP_PYTORCH COPY ./common/install_pytorch.sh install_pytorch.sh COPY ./common/utils.sh utils.sh diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 97711de2713..1678b0a4fbb 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -272,10 +272,7 @@ elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; fi fi pip install datasets soundfile - # We pushd'd into EXECUTORCH_ROOT above, so torch_pin is importable here. - TORCHCODEC_PKG=$(python -c "from torch_pin import torchcodec_spec; print(torchcodec_spec())") - TORCHCODEC_INDEX=$(python -c "from torch_pin import torch_index_url_base; print(torch_index_url_base())") - pip install "$TORCHCODEC_PKG" --extra-index-url "${TORCHCODEC_INDEX}/cpu" + pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])" fi diff --git a/.ci/scripts/test_wheel_package_qnn.sh b/.ci/scripts/test_wheel_package_qnn.sh index f44fafadb58..763bd8733c1 100644 --- a/.ci/scripts/test_wheel_package_qnn.sh +++ b/.ci/scripts/test_wheel_package_qnn.sh @@ -150,26 +150,25 @@ run_core_tests () { echo "=== [$LABEL] Installing wheel & deps ===" "$PIPBIN" install --upgrade pip "$PIPBIN" install "$WHEEL_FILE" - # runpy.run_path uses a relative path, so the caller must run this script - # from the executorch repo root (where torch_pin.py lives). - TORCH_SPEC=$( + TORCH_VERSION=$( "$PYBIN" - <<'PY' import runpy module_vars = runpy.run_path("torch_pin.py") -print(module_vars["torch_spec"]()) +print(module_vars["TORCH_VERSION"]) PY ) - TORCH_INDEX=$( - "$PYBIN" - <<'PY' -import runpy -module_vars = runpy.run_path("torch_pin.py") -print(module_vars["torch_index_url_base"]()) -PY -) - echo "=== [$LABEL] Install $TORCH_SPEC from ${TORCH_INDEX}/cpu ===" - # Install torch based on the pinned PyTorch version from the channel index. 
- "$PIPBIN" install "$TORCH_SPEC" --index-url "${TORCH_INDEX}/cpu" +# NIGHTLY_VERSION=$( +# "$PYBIN" - <<'PY' +# import runpy +# module_vars = runpy.run_path("torch_pin.py") +# print(module_vars["NIGHTLY_VERSION"]) +# PY +# ) + echo "=== [$LABEL] Install torch==${TORCH_VERSION} ===" + + # Install torch based on the pinned PyTorch version, preferring the PyTorch test index + "$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test" "$PIPBIN" install wheel # Install torchao based on the pinned commit from third-party/ao submodule diff --git a/.ci/scripts/tests/test_torch_pin.py b/.ci/scripts/tests/test_torch_pin.py deleted file mode 100644 index 6c475aeaa05..00000000000 --- a/.ci/scripts/tests/test_torch_pin.py +++ /dev/null @@ -1,54 +0,0 @@ -import importlib - -import pytest - - -@pytest.fixture -def pin(): - """Yield a fresh import of torch_pin so tests can mutate CHANNEL safely.""" - import torch_pin - - yield torch_pin - importlib.reload(torch_pin) - - -@pytest.mark.parametrize( - "channel, expected_torch, expected_url", - [ - ( - "nightly", - "torch=={TORCH_VERSION}.{NIGHTLY_VERSION}", - "https://download.pytorch.org/whl/nightly", - ), - ("test", "torch=={TORCH_VERSION}", "https://download.pytorch.org/whl/test"), - ("release", "torch=={TORCH_VERSION}", "https://download.pytorch.org/whl"), - ], -) -def test_channel_resolution(pin, channel, expected_torch, expected_url): - pin.CHANNEL = channel - expected = expected_torch.format( - TORCH_VERSION=pin.TORCH_VERSION, NIGHTLY_VERSION=pin.NIGHTLY_VERSION - ) - assert pin.torch_spec() == expected - assert pin.torch_index_url_base() == expected_url - - -def test_all_specs_share_nightly_suffix(pin): - pin.CHANNEL = "nightly" - suffix = f".{pin.NIGHTLY_VERSION}" - assert pin.torch_spec().endswith(suffix) - assert pin.torchaudio_spec().endswith(suffix) - assert pin.torchcodec_spec().endswith(suffix) - assert pin.torchvision_spec().endswith(suffix) - - -def test_specs_drop_suffix_off_nightly(pin): - pin.CHANNEL = "test" - assert pin.torch_spec() == f"torch=={pin.TORCH_VERSION}" - assert pin.torchaudio_spec() == f"torchaudio=={pin.TORCHAUDIO_VERSION}" - assert pin.torchcodec_spec() == f"torchcodec=={pin.TORCHCODEC_VERSION}" - assert pin.torchvision_spec() == f"torchvision=={pin.TORCHVISION_VERSION}" - - -def test_torch_branch_derived_from_version(pin): - assert pin.torch_branch() == f"release/{pin.TORCH_VERSION.rsplit('.', 1)[0]}" diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index 12e7f3d2067..86e54b478ef 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -53,7 +53,7 @@ dedupe_macos_loader_path_rpaths() { pushd .. torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib popd - + if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then return fi @@ -89,30 +89,6 @@ install_domains() { } install_pytorch_and_domains() { - # CWD is the executorch repo root, where torch_pin.py lives. - TORCH_CHANNEL=$(python -c "from torch_pin import CHANNEL; print(CHANNEL)") - - if [ "${TORCH_CHANNEL}" != "nightly" ]; then - # Test/release: install the published wheels directly from torch_pin.py's - # channel index, skipping the source-build path entirely. RC wheels at - # /whl/test/ get re-uploaded under the same version, so use --no-cache-dir - # there to avoid stale cache hits. 
- local torch_spec=$(python -c "from torch_pin import torch_spec; print(torch_spec())") - local torchvision_spec=$(python -c "from torch_pin import torchvision_spec; print(torchvision_spec())") - local torchaudio_spec=$(python -c "from torch_pin import torchaudio_spec; print(torchaudio_spec())") - local torch_index_url=$(python -c "from torch_pin import torch_index_url_base; print(torch_index_url_base())") - local cache_flag="" - if [ "${TORCH_CHANNEL}" = "test" ]; then - cache_flag="--no-cache-dir" - fi - pip install --force-reinstall ${cache_flag} \ - "${torch_spec}" "${torchvision_spec}" "${torchaudio_spec}" \ - --index-url "${torch_index_url}/cpu" - return - fi - - # Nightly: source-build pytorch from the pinned SHA so CI catches upstream - # regressions; pytorch's own audio/vision pins drive those installs. pushd .ci/docker || return TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) popd || return @@ -164,10 +140,10 @@ install_pytorch_and_domains() { fi dedupe_macos_loader_path_rpaths - # We're on the nightly path here; defer to PyTorch's own pinned commits. - TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) + # Grab the pinned audio and vision commits from PyTorch + TORCHAUDIO_VERSION=release/2.11 export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) + TORCHVISION_VERSION=release/0.26 export TORCHVISION_VERSION install_domains @@ -242,21 +218,17 @@ download_stories_model_artifacts() { } do_not_use_nightly_on_ci() { - # Sanity check that prevents accidentally landing a PR that pins to PyTorch - # nightly without exercising the source-build path (see #6564). - # - # For CHANNEL=nightly, CI source-builds pytorch from the SHA in pytorch.txt, - # so the installed torch shows up as e.g. 2.13.0a0+gitc8a648d — assert that. - # For CHANNEL=test/release, we install published wheels by design (e.g. - # 2.11.0), so the +git assertion doesn't apply. - TORCH_CHANNEL=$(python -c "from torch_pin import CHANNEL; print(CHANNEL)") - if [ "${TORCH_CHANNEL}" != "nightly" ]; then - return 0 - fi - + # An assert to make sure that we are not using PyTorch nightly on CI to prevent + # regression as documented in https://github.com/pytorch/executorch/pull/6564 TORCH_VERSION=$(pip list | grep -w 'torch ' | awk -F ' ' {'print $2'} | tr -d '\n') + + # The version of PyTorch building from source looks like 2.6.0a0+gitc8a648d that + # includes the commit while nightly (2.6.0.dev20241019+cpu) or release (2.6.0) + # won't have that. Note that we couldn't check for the exact commit from the pin + # ci_commit_pins/pytorch.txt here because the value will be different when running + # this on PyTorch CI if [[ "${TORCH_VERSION}" != *"+git"* ]]; then - echo "Unexpected torch version. Expected binary built from source for CHANNEL=nightly, got ${TORCH_VERSION}" + echo "Unexpected torch version. Expected binary built from source, got ${TORCH_VERSION}" exit 1 fi } diff --git a/.github/scripts/update_pytorch_pin.py b/.github/scripts/update_pytorch_pin.py index 9f2698917b2..dbc48552d9b 100644 --- a/.github/scripts/update_pytorch_pin.py +++ b/.github/scripts/update_pytorch_pin.py @@ -8,12 +8,6 @@ import urllib.request from pathlib import Path -# torch_pin.py lives at the repo root. Locate it relative to this script so -# the import works regardless of where the script is invoked from. 
-_REPO_ROOT = Path(__file__).resolve().parents[2] -sys.path.insert(0, str(_REPO_ROOT)) -from torch_pin import CHANNEL, NIGHTLY_VERSION, torch_branch - def parse_nightly_version(nightly_version): """ @@ -33,6 +27,23 @@ def parse_nightly_version(nightly_version): return f"{year}-{month}-{day}" +def get_torch_nightly_version(): + """ + Read NIGHTLY_VERSION from torch_pin.py. + + Returns: + NIGHTLY_VERSION string + """ + with open("torch_pin.py", "r") as f: + content = f.read() + + match = re.search(r'NIGHTLY_VERSION\s*=\s*["\']([^"\']+)["\']', content) + if not match: + raise ValueError("Could not find NIGHTLY_VERSION in torch_pin.py") + + return match.group(1) + + def get_commit_hash_for_nightly(date_str): """ Fetch commit hash from PyTorch nightly branch for a given date. @@ -80,16 +91,17 @@ def extract_hash_from_title(title): return match.group(1) -def update_pytorch_pin(ref): +def update_pytorch_pin(commit_hash): """ - Update .ci/docker/ci_commit_pins/pytorch.txt with the new ref. + Update .ci/docker/ci_commit_pins/pytorch.txt with the new commit hash. Args: - ref: Either a commit SHA (nightly) or a branch name (test/release). + commit_hash: Commit hash to write """ - pin_file = _REPO_ROOT / ".ci/docker/ci_commit_pins/pytorch.txt" - pin_file.write_text(f"{ref}\n") - print(f"Updated {pin_file} with ref: {ref}") + pin_file = ".ci/docker/ci_commit_pins/pytorch.txt" + with open(pin_file, "w") as f: + f.write(f"{commit_hash}\n") + print(f"Updated {pin_file} with commit hash: {commit_hash}") def should_skip_file(filename): @@ -106,20 +118,18 @@ def should_skip_file(filename): return filename in skip_files -def fetch_file_content(ref, file_path): +def fetch_file_content(commit_hash, file_path): """ Fetch file content from GitHub API. Args: - ref: Commit SHA or branch name to fetch from + commit_hash: Commit hash to fetch from file_path: File path in the repository Returns: File content as bytes """ - api_url = ( - f"https://api.github.com/repos/pytorch/pytorch/contents/{file_path}?ref={ref}" - ) + api_url = f"https://api.github.com/repos/pytorch/pytorch/contents/{file_path}?ref={commit_hash}" req = urllib.request.Request(api_url) req.add_header("Accept", "application/vnd.github.v3+json") @@ -136,7 +146,7 @@ def fetch_file_content(ref, file_path): raise -def sync_directory(et_dir, pt_path, ref): +def sync_directory(et_dir, pt_path, commit_hash): """ Sync files from PyTorch to ExecuTorch using GitHub API. Only syncs files that already exist in ExecuTorch - does not add new files. 
@@ -144,7 +154,7 @@ def sync_directory(et_dir, pt_path, ref): Args: et_dir: ExecuTorch directory path pt_path: PyTorch directory path in the repository (e.g., "c10") - ref: Commit SHA or branch name to fetch from + commit_hash: Commit hash to fetch from Returns: Number of files grafted @@ -171,12 +181,12 @@ def sync_directory(et_dir, pt_path, ref): # Fetch content from PyTorch and compare try: - pt_content = fetch_file_content(ref, pt_file_path) + pt_content = fetch_file_content(commit_hash, pt_file_path) et_content = et_file.read_bytes() if pt_content != et_content: print(f"⚠️ Difference detected in {rel_path}") - print(f"📋 Grafting from PyTorch ref {ref}...") + print(f"📋 Grafting from PyTorch commit {commit_hash}...") et_file.write_bytes(pt_content) print(f"✅ Grafted {et_file}") @@ -191,34 +201,37 @@ def sync_directory(et_dir, pt_path, ref): return files_grafted -def sync_c10_directories(ref): +def sync_c10_directories(commit_hash): """ Sync c10 and torch/headeronly directories from PyTorch to ExecuTorch using GitHub API. Args: - ref: PyTorch commit SHA or branch name to sync from + commit_hash: PyTorch commit hash to sync from Returns: Total number of files grafted """ print("\n🔄 Syncing c10 directories from PyTorch via GitHub API...") + # Get repository root + repo_root = Path.cwd() + # Define directory pairs to sync (from check_c10_sync.sh) # Format: (executorch_dir, pytorch_path_in_repo) dir_pairs = [ ( - _REPO_ROOT / "runtime/core/portable_type/c10/c10", + repo_root / "runtime/core/portable_type/c10/c10", "c10", ), ( - _REPO_ROOT / "runtime/core/portable_type/c10/torch/headeronly", + repo_root / "runtime/core/portable_type/c10/torch/headeronly", "torch/headeronly", ), ] total_grafted = 0 for et_dir, pt_path in dir_pairs: - files_grafted = sync_directory(et_dir, pt_path, ref) + files_grafted = sync_directory(et_dir, pt_path, commit_hash) total_grafted += files_grafted if total_grafted > 0: @@ -231,26 +244,27 @@ def sync_c10_directories(ref): def main(): try: - print(f"CHANNEL: {CHANNEL}") - if CHANNEL == "nightly": - # Nightly pins to an immutable SHA looked up by date. - print(f"Found NIGHTLY_VERSION: {NIGHTLY_VERSION}") - date_str = parse_nightly_version(NIGHTLY_VERSION) - print(f"Parsed date: {date_str}") - pin_ref = get_commit_hash_for_nightly(date_str) - else: - # For test/release, pin to the branch name so CI picks up - # cherry-picks / security patches as they land on the branch. - pin_ref = torch_branch() - print(f"Pin ref: {pin_ref}") + # Read NIGHTLY_VERSION from torch_pin.py + nightly_version = get_torch_nightly_version() + print(f"Found NIGHTLY_VERSION: {nightly_version}") + + # Parse to date string + date_str = parse_nightly_version(nightly_version) + print(f"Parsed date: {date_str}") + + # Fetch commit hash from PyTorch nightly branch + commit_hash = get_commit_hash_for_nightly(date_str) + print(f"Found commit hash: {commit_hash}") # Update the pin file - update_pytorch_pin(pin_ref) + update_pytorch_pin(commit_hash) - # Sync c10 directories from PyTorch (ref param accepts branches too) - sync_c10_directories(pin_ref) + # Sync c10 directories from PyTorch + sync_c10_directories(commit_hash) - print("\n✅ Successfully updated PyTorch pin and synced c10 directories!") + print( + "\n✅ Successfully updated PyTorch commit pin and synced c10 directories!" 
+ ) except Exception as e: print(f"Error: {e}", file=sys.stderr) diff --git a/.github/workflows/weekly-pytorch-pin-bump.yml b/.github/workflows/weekly-pytorch-pin-bump.yml index ba8f48505d5..30579c77701 100644 --- a/.github/workflows/weekly-pytorch-pin-bump.yml +++ b/.github/workflows/weekly-pytorch-pin-bump.yml @@ -22,46 +22,29 @@ jobs: with: python-version: '3.11' - - name: Check torch_pin channel - id: channel - run: | - CHANNEL=$(python -c "from torch_pin import CHANNEL; print(CHANNEL)") - echo "channel=${CHANNEL}" >> "$GITHUB_OUTPUT" - if [ "${CHANNEL}" != "nightly" ]; then - echo "torch_pin.py CHANNEL is '${CHANNEL}'; weekly nightly bump only runs when CHANNEL == 'nightly'." - fi - - name: Determine nightly version - if: steps.channel.outputs.channel == 'nightly' id: nightly run: | NIGHTLY_DATE=$(date -u -d 'yesterday' '+%Y%m%d') NIGHTLY_VERSION="dev${NIGHTLY_DATE}" echo "version=${NIGHTLY_VERSION}" >> "$GITHUB_OUTPUT" + - name: Read current TORCH_VERSION + id: torch + run: | + TORCH_VERSION=$(python -c "exec(open('torch_pin.py').read()); print(TORCH_VERSION)") + echo "version=${TORCH_VERSION}" >> "$GITHUB_OUTPUT" + - name: Update torch_pin.py with new NIGHTLY_VERSION - if: steps.channel.outputs.channel == 'nightly' - env: - NIGHTLY_VERSION: ${{ steps.nightly.outputs.version }} run: | - python - <<'PY' - import os, pathlib, re - p = pathlib.Path('torch_pin.py') - p.write_text(re.sub( - r'^NIGHTLY_VERSION\s*=\s*".*"$', - f'NIGHTLY_VERSION = "{os.environ["NIGHTLY_VERSION"]}"', - p.read_text(), - count=1, - flags=re.MULTILINE, - )) - PY + printf 'TORCH_VERSION = "%s"\nNIGHTLY_VERSION = "%s"\n' \ + "${{ steps.torch.outputs.version }}" \ + "${{ steps.nightly.outputs.version }}" > torch_pin.py - name: Run pin bump script - if: steps.channel.outputs.channel == 'nightly' run: python .github/scripts/update_pytorch_pin.py - name: Create branch and PR - if: steps.channel.outputs.channel == 'nightly' env: GH_TOKEN: ${{ secrets.UPDATEBOT_TOKEN }} run: | diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh index eb8fe96ed05..9fc12f64bc9 100755 --- a/examples/models/moshi/mimi/install_requirements.sh +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -7,16 +7,10 @@ set -x -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -# torch_pin lives at the executorch repo root. -cd "$SCRIPT_DIR/../../../.." - -TORCHCODEC_PKG=$(python -c "from torch_pin import torchcodec_spec; print(torchcodec_spec())") -TORCHCODEC_INDEX=$(python -c "from torch_pin import torch_index_url_base; print(torch_index_url_base())") - sudo apt install ffmpeg -y -pip install "$TORCHCODEC_PKG" --extra-index-url "${TORCHCODEC_INDEX}/cpu" +pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu pip install moshi==0.2.11 pip install bitsandbytes soundfile einops # Run llama2/install requirements for torchao deps +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) bash "$SCRIPT_DIR"/../../llama/install_requirements.sh diff --git a/install_executorch.py b/install_executorch.py index d305a06bd28..140a1163020 100644 --- a/install_executorch.py +++ b/install_executorch.py @@ -174,11 +174,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument( "--use-pt-pinned-commit", action="store_true", - help="install plain `torch` (whatever pip resolves by default; CI " - "uses this when torch is already built from source against the " - "pinned ref in pytorch.txt). 
Without this flag, install the specific " - "pinned version from the channel selected in torch_pin.py " - "(nightly / test / release).", + help="build from the pinned PyTorch commit instead of nightly", ) parser.add_argument( "--editable", @@ -221,14 +217,13 @@ def main(args): return check_and_update_submodules() - # By default install the specific pinned version from the channel selected - # in torch_pin.py. With --use-pt-pinned-commit, install plain `torch` (pip's - # default resolution); CI uses this when torch is already built from source - # against the pinned ref in pytorch.txt. - install_pinned_version = not args.use_pt_pinned_commit + # This option is used in CI to make sure that PyTorch build from the pinned commit + # is used instead of nightly. CI jobs wouldn't be able to catch regression from the + # latest PT commit otherwise + use_pytorch_nightly = not args.use_pt_pinned_commit # Step 1: Install core dependencies first - install_requirements(install_pinned_version) + install_requirements(use_pytorch_nightly) # Step 2: Install core package package_spec = "." @@ -253,7 +248,7 @@ def main(args): # Step 3: Extra (optional) packages that is only useful for running examples. if not args.minimal: - install_optional_example_requirements(install_pinned_version) + install_optional_example_requirements(use_pytorch_nightly) if __name__ == "__main__": diff --git a/install_requirements.py b/install_requirements.py index 1e8ab5c2d6f..b30068cbdb8 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -12,18 +12,9 @@ from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible -from torch_pin import ( - CHANNEL, - torch_index_url_base, - torch_spec, - torchaudio_spec, - torchvision_spec, -) - -# Only RC wheels at /whl/test/ get re-uploaded under the same version, so -# pip's local cache can serve stale content. Nightly and release wheels are -# immutable per their identifier. -_NO_CACHE_DIR_FLAG = ["--no-cache-dir"] if CHANNEL == "test" else [] +# The pip repository that hosts nightly torch packages. +# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. +TORCH_URL_BASE = "https://download.pytorch.org/whl/test" # Since ExecuTorch often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. @@ -32,18 +23,17 @@ # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. # -# NOTE: If you change torch_pin.py, the pre-commit hook runs -# .github/scripts/update_pytorch_pin.py to refresh -# .ci/docker/ci_commit_pins/pytorch.txt and the c10 grafted headers. -# If you bypass the hook, run that script manually. +# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt +# by picking the hash from the same date in +# https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ @lint-ignore # # NOTE: If you're changing, make the corresponding supported CUDA versions in # SUPPORTED_CUDA_VERSIONS in install_utils.py if needed. -def install_requirements(install_pinned_version): - # No prebuilt wheels are available for Intel macOS, regardless of channel. - if install_pinned_version and is_intel_mac_os(): +def install_requirements(use_pytorch_nightly): + # Skip pip install on Intel macOS if using nightly. 
+ if use_pytorch_nightly and is_intel_mac_os(): print( "ERROR: Prebuilt PyTorch wheels are no longer available for Intel-based macOS.\n" "Please build from source by following https://docs.pytorch.org/executorch/main/using-executorch-building-from-source.html", @@ -52,26 +42,25 @@ def install_requirements(install_pinned_version): sys.exit(1) # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = determine_torch_url(torch_index_url_base()) + torch_url = determine_torch_url(TORCH_URL_BASE) # pip packages needed by exir. TORCH_PACKAGE = [ - # Default: install the specific pinned version from the channel selected - # in torch_pin.py. With --use-pt-pinned-commit, pass plain "torch" and - # let pip resolve its default (CI's source-build is already installed). - (torch_spec() if install_pinned_version else "torch"), + # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note + # that we don't need to set any version number there because they have already + # been installed on CI before this step, so pip won't reinstall them + ("torch==2.11.0" if use_pytorch_nightly else "torch"), ] # Install the requirements for core ExecuTorch package. - # `--extra-index-url` tells pip to look for package versions on the - # provided URL if they aren't available on the default URL. + # `--extra-index-url` tells pip to look for package + # versions on the provided URL if they aren't available on the default URL. subprocess.run( [ sys.executable, "-m", "pip", "install", - *_NO_CACHE_DIR_FLAG, "-r", "requirements-dev.txt", *TORCH_PACKAGE, @@ -117,14 +106,14 @@ def install_requirements(install_pinned_version): ) -def install_optional_example_requirements(install_pinned_version): +def install_optional_example_requirements(use_pytorch_nightly): # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = determine_torch_url(torch_index_url_base()) + torch_url = determine_torch_url(TORCH_URL_BASE) print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ - (torchvision_spec() if install_pinned_version else "torchvision"), - (torchaudio_spec() if install_pinned_version else "torchaudio"), + ("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"), + ("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"), ] # Then install domain libraries subprocess.run( @@ -133,7 +122,6 @@ def install_optional_example_requirements(install_pinned_version): "-m", "pip", "install", - *_NO_CACHE_DIR_FLAG, *DOMAIN_LIBRARIES, "--extra-index-url", torch_url, @@ -164,11 +152,7 @@ def main(args): parser.add_argument( "--use-pt-pinned-commit", action="store_true", - help="install plain `torch` (whatever pip resolves by default; CI " - "uses this when torch is already built from source against the " - "pinned ref in pytorch.txt). 
Without this flag, install the specific " - "pinned version from the channel selected in torch_pin.py " - "(nightly / test / release).", + help="build from the pinned PyTorch commit instead of nightly", ) parser.add_argument( "--example", @@ -176,10 +160,10 @@ def main(args): help="Also installs required packages for running example scripts.", ) args = parser.parse_args(args) - install_pinned_version = not bool(args.use_pt_pinned_commit) - install_requirements(install_pinned_version) + use_pytorch_nightly = not bool(args.use_pt_pinned_commit) + install_requirements(use_pytorch_nightly) if args.example: - install_optional_example_requirements(install_pinned_version) + install_optional_example_requirements(use_pytorch_nightly) if __name__ == "__main__": diff --git a/torch_pin.py b/torch_pin.py index 856a67c1990..3575d9a376d 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,59 +1,2 @@ -# CHANNEL selects the wheel source for torch and its domain libraries. -# "nightly" — dev builds from /whl/nightly. NIGHTLY_VERSION is appended to -# every package spec, and CI source-builds pytorch from the -# pinned SHA in pytorch.txt to catch upstream regressions. -# "test" — release candidates from /whl/test. -# "release" — stable releases from /whl. -# For "test" and "release", NIGHTLY_VERSION is ignored and CI installs the -# published wheels directly (no source build). -# -# Example — pinning to a 2.12 release candidate when nightly is broken: -# 1. Set CHANNEL = "test". -# 2. Set the four version constants to the RC's major.minor.patch -# (look up matching versions on https://download.pytorch.org/whl/test/). -# 3. Re-run install_requirements.sh; commit. The pre-commit hook calls -# .github/scripts/update_pytorch_pin.py, which writes torch_branch() -# (e.g. "release/2.12") into .ci/docker/ci_commit_pins/pytorch.txt and -# re-syncs grafted c10 headers. -CHANNEL = "test" - TORCH_VERSION = "2.11.0" -TORCHAUDIO_VERSION = "2.11.0" -TORCHCODEC_VERSION = "0.11.0" -TORCHVISION_VERSION = "0.26.0" - -NIGHTLY_VERSION = "dev20260318" - - -def _spec(name: str, version: str) -> str: - if CHANNEL == "nightly": - return f"{name}=={version}.{NIGHTLY_VERSION}" - return f"{name}=={version}" - - -def torch_spec() -> str: - return _spec("torch", TORCH_VERSION) - - -def torchaudio_spec() -> str: - return _spec("torchaudio", TORCHAUDIO_VERSION) - - -def torchcodec_spec() -> str: - return _spec("torchcodec", TORCHCODEC_VERSION) - - -def torchvision_spec() -> str: - return _spec("torchvision", TORCHVISION_VERSION) - - -def torch_index_url_base() -> str: - if CHANNEL == "release": - return "https://download.pytorch.org/whl" - return f"https://download.pytorch.org/whl/{CHANNEL}" - - -def torch_branch() -> str: - # PyTorch uses "release/M.N" branches; derive from the pinned version. - # Used by update_pytorch_pin.py to write into pytorch.txt for test/release. - return f"release/{TORCH_VERSION.rsplit('.', 1)[0]}" +# NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. 
Revert https://github.com/pytorch/executorch/pull/18287 From cdcc9156f974d1d89f54d89ce5918af54a98af31 Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Wed, 6 May 2026 10:46:00 -0700 Subject: [PATCH 24/58] Fix C++ -Werror regressions in llama runner (#19326) (#19326) Summary: Fixes 3 `-Werror` diagnostics that broke the qualcomm llama runner build on `cfg:android-arm64-clang19-no-san` and disabled the following test infra targets: - `xplat/executorch/examples/qualcomm/oss_scripts/llama:runner_lib` - `xplat/executorch/examples/qualcomm/oss_scripts/llama:runner_lib_static` - `xplat/executorch/examples/qualcomm/oss_scripts/llama:qnn_llama_runner` - `xplat/executorch/examples/qualcomm/oss_scripts/llama:qnn_llama_runner_static` Three diagnostics fixed: 1. `-Wreorder-ctor` in `runner.cpp`: `attention_sink_rope_module_` is declared as the 2nd field of `Runner` (right after `module_`) but the constructor initializer list appended it last, after `tokenizer_`. Moved it to the correct position in the init list to match declaration order. Recent regression introduced in the attention-sink diff (#16574). 2. `-Woverloaded-virtual` in `lhd_token_generator.h` and `multimodal_lhd_token_generator.h`: the derived classes define a `prepare_io(std::vector, std::vector)` overload that hides the base class virtual `prepare_io(uint64_t, int64_t)`. Added a `using TokenGenerator::prepare_io;` (and equivalent for the multimodal hierarchy) declaration so the base virtual stays in scope and the warning is silenced without changing behavior. Latent bug surfaced by the clang19 toolchain bump. 3. `-Wdelete-non-abstract-non-virtual-dtor` in `prompt_processor.h`: `PromptProcessor` has virtual member functions but no virtual destructor, so deleting via `std::unique_ptr>` in `Runner` was undefined behavior under strict warnings. Added `virtual ~PromptProcessor() = default;` mirroring the pattern already used in `TokenGenerator` (`token_generator.h`). Also transitively fixes `MultimodalPromptProcessor`. Reviewed By: rascani Differential Revision: D103991803 --- .../qualcomm/oss_scripts/llama/runner/lhd_token_generator.h | 3 +++ .../runner/multimodal_runner/multimodal_lhd_token_generator.h | 3 +++ examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h | 2 ++ examples/qualcomm/oss_scripts/llama/runner/runner.cpp | 4 ++-- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h index e97f64b7c1d..796dde88014 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h @@ -102,6 +102,9 @@ class LhdTokenGenerator : public TokenGenerator { AttentionSinkRopeRunner* attention_sink_rope_runner) override; private: + // Bring base class's virtual prepare_io into scope so the overload below + // does not hide it (-Woverloaded-virtual). + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. 
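As a quick illustration of the three diagnostics described above, a minimal standalone sketch follows. The class and member names here (TokenGeneratorBase, LookaheadTokenGenerator, ExampleRunner) are hypothetical and are not the Qualcomm runner types; the snippet only shows the warning-free shape of each pattern and should compile cleanly with -Wall -Wextra -Werror on recent clang/gcc.

#include <cstdint>
#include <memory>
#include <vector>

class TokenGeneratorBase {
 public:
  // Virtual destructor so deleting through a base pointer (e.g. from a
  // std::unique_ptr<TokenGeneratorBase>) is well defined
  // (-Wdelete-non-abstract-non-virtual-dtor).
  virtual ~TokenGeneratorBase() = default;
  virtual void prepare_io(uint64_t cur_token, int64_t pos) = 0;
};

class LookaheadTokenGenerator : public TokenGeneratorBase {
 public:
  // Keep the base class virtual visible so the vector overload below does
  // not hide it (-Woverloaded-virtual).
  using TokenGeneratorBase::prepare_io;

  void prepare_io(uint64_t cur_token, int64_t pos) override {
    last_token_ = cur_token;
    last_pos_ = pos;
  }

  // Extra overload; without the using-declaration above it would shadow the
  // single-token virtual inherited from TokenGeneratorBase.
  void prepare_io(
      const std::vector<uint64_t>& tokens,
      const std::vector<int64_t>& positions) {
    for (size_t i = 0; i < tokens.size() && i < positions.size(); ++i) {
      prepare_io(tokens[i], positions[i]);
    }
  }

 private:
  uint64_t last_token_ = 0;
  int64_t last_pos_ = 0;
};

class ExampleRunner {
 public:
  // Initializer list follows the member declaration order below, so
  // -Wreorder-ctor has nothing to flag.
  ExampleRunner(std::unique_ptr<TokenGeneratorBase> generator, int window)
      : generator_(std::move(generator)), window_(window) {}

 private:
  std::unique_ptr<TokenGeneratorBase> generator_; // declared and initialized first
  int window_; // declared and initialized second
};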
diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h index 83da9e7a6ba..7494afec6da 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -108,6 +108,9 @@ class MultimodalLhdTokenGenerator AttentionSinkRopeRunner* attention_sink_rope_runner) override; private: + // Bring base class's virtual prepare_io into scope so the overload below + // does not hide it (-Woverloaded-virtual). + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h index 0790985d231..599f7050d83 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h @@ -40,6 +40,8 @@ class PromptProcessor { const std::string& method_name, Metadata metadata); + virtual ~PromptProcessor() = default; + /** * @brief Initialize I/O tensor and allocate I/O data buffer. * @param buffer_manager Pointer to IMemAlloc instance; by default, it uses a diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0e9b7860dbd..0a4a8b9abb5 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -102,6 +102,7 @@ Runner::Runner( std::unique_ptr tokenizer, std::unique_ptr attention_sink_rope_module) : module_(std::move(module)), + attention_sink_rope_module_(std::move(attention_sink_rope_module)), ngram_(ngram), window_(window), gcap_(gcap), @@ -111,8 +112,7 @@ Runner::Runner( temperature_(temperature), eval_mode_(static_cast(eval_mode)), shared_buffer_(shared_buffer), - tokenizer_(std::move(tokenizer)), - attention_sink_rope_module_(std::move(attention_sink_rope_module)) { + tokenizer_(std::move(tokenizer)) { stats_.reset(); if (decoder_model_version == "llama2") { From 3a4c3a15abcba85cfc4a82bdcf5b5b89949d4d5c Mon Sep 17 00:00:00 2001 From: Johnson Wong Date: Wed, 6 May 2026 13:21:26 -0700 Subject: [PATCH 25/58] =?UTF-8?q?Fix=20ExecutorTorch=20=E2=86=92=20ExecuTo?= =?UTF-8?q?rch=20in=20comments=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Differential Revision: D104074211 Pull Request resolved: https://github.com/pytorch/executorch/pull/19337 --- backends/mlx/llm/cache.py | 2 +- backends/mlx/llm/et_attention.py | 2 +- backends/mlx/test/test_ops.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/mlx/llm/cache.py b/backends/mlx/llm/cache.py index 9709980689b..6b281818610 100644 --- a/backends/mlx/llm/cache.py +++ b/backends/mlx/llm/cache.py @@ -23,7 +23,7 @@ class KVCache(nn.Module): """ - MLX-optimized KV cache with ExecutorTorch llama KVCache interface. + MLX-optimized KV cache with ExecuTorch llama KVCache interface. 
This class follows the same interface as examples/models/llama/attention.py KVCache, making it a drop-in replacement, but uses the mlx::kv_cache_update op internally diff --git a/backends/mlx/llm/et_attention.py b/backends/mlx/llm/et_attention.py index 10c758f94fe..ea39133db3a 100644 --- a/backends/mlx/llm/et_attention.py +++ b/backends/mlx/llm/et_attention.py @@ -7,7 +7,7 @@ # LICENSE file in the root directory of this source tree. """ -MLX-optimized attention for ExecutorTorch's Llama attention registry. +MLX-optimized attention for ExecuTorch's Llama attention registry. Registers an "mlx" attention type that uses mlx::kv_cache_update and mlx::custom_sdpa for efficient execution on Apple Silicon. diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index a44ed83da4c..9624d49b05e 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -1810,7 +1810,7 @@ class KVCacheModel(nn.Module): """ Test model wrapping KVCache from cache.py. - This tests the ExecutorTorch llama KVCache-compatible interface that uses + This tests the ExecuTorch llama KVCache-compatible interface that uses the mlx::kv_cache_update op internally. """ @@ -1845,7 +1845,7 @@ def forward( @register_test class KVCacheTest(OpTestCase): """ - Test case for MLX KVCache with ExecutorTorch llama KVCache interface. + Test case for MLX KVCache with ExecuTorch llama KVCache interface. This verifies that KVCache: 1. Accepts the ET llama KVCache update interface From 3c4ec8ff013f2bb89bc21bebf858723e74c53102 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 6 May 2026 13:27:50 -0700 Subject: [PATCH 26/58] Limit ARM retries to operator tests (#19343) Jobs been timing out since the first attempt --- .ci/scripts/test_backend.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh index 8bf2236e4e6..a7f89f820b2 100755 --- a/.ci/scripts/test_backend.sh +++ b/.ci/scripts/test_backend.sh @@ -58,7 +58,9 @@ if [[ "$FLOW" == *vulkan* ]]; then fi if [[ "$FLOW" == *arm* ]]; then - PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1) + if [[ "$SUITE" == "operators" ]]; then + PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1) + fi # Setup ARM deps. if [[ "$FLOW" == *vgf* ]]; then From 851cffb4994d07fa226a9d9868ee1c692025c658 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 6 May 2026 13:28:14 -0700 Subject: [PATCH 27/58] Fix missing check (#19340) Missing dimension check which was breaking test. --- kernels/optimized/cpu/op_grid_sampler_2d.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernels/optimized/cpu/op_grid_sampler_2d.cpp b/kernels/optimized/cpu/op_grid_sampler_2d.cpp index aebfd292bab..a3f59df109d 100644 --- a/kernels/optimized/cpu/op_grid_sampler_2d.cpp +++ b/kernels/optimized/cpu/op_grid_sampler_2d.cpp @@ -338,10 +338,10 @@ Tensor& opt_grid_sampler_2d_out( // The NEON paths index input/grid/out directly assuming a contiguous NCHW // default-dim-order layout — no use of .strides() or .dim_order(). Fall // back to portable for anything else. 
- const bool fast_eligible = tensor_is_default_dim_order(input) && - tensor_is_default_dim_order(grid) && tensor_is_default_dim_order(out) && - tensor_is_contiguous(input) && tensor_is_contiguous(grid) && - tensor_is_contiguous(out); + const bool fast_eligible = input.dim() == 4 && grid.dim() == 4 && + tensor_is_default_dim_order(input) && tensor_is_default_dim_order(grid) && + tensor_is_default_dim_order(out) && tensor_is_contiguous(input) && + tensor_is_contiguous(grid) && tensor_is_contiguous(out); // The fast paths read input/grid and write out as a single dtype: float for // the fp32 NEON path, fp16 for both the fp16 HW path (which raw-casts the From af901308ff744858bca36de7a91dc5d207e461b0 Mon Sep 17 00:00:00 2001 From: Eli Amesefe Date: Wed, 6 May 2026 16:41:12 -0700 Subject: [PATCH 28/58] route EthosU input/output memcpy through overridable hook (#19264) Differential Revision: D103455766 Pull Request resolved: https://github.com/pytorch/executorch/pull/19264 --- backends/arm/CMakeLists.txt | 6 ++++-- backends/arm/runtime/EthosUBackend.cpp | 14 +++++++++++--- .../arm/runtime/EthosUBackend_Cortex_M.cpp | 9 ++++++++- .../arm/runtime/EthosUBackend_IoMemcpy.cpp | 19 +++++++++++++++++++ backends/arm/runtime/targets.bzl | 1 + 5 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 backends/arm/runtime/EthosUBackend_IoMemcpy.cpp diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 12c46107104..0c8b241522c 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -54,8 +54,10 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") - set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp - backends/arm/runtime/VelaBinStream.cpp + set(_arm_backend_sources + backends/arm/runtime/EthosUBackend.cpp + backends/arm/runtime/EthosUBackend_IoMemcpy.cpp + backends/arm/runtime/VelaBinStream.cpp ) list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/") diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index 2b17cf2c43d..4b78f9a7e28 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -26,6 +26,12 @@ #include #include +// Overridable memcpy used by the EthosU backend for input/output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. +extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + using namespace std; using executorch::aten::ScalarType; @@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { if (both_char || both_int || both_short || both_bool) { EXECUTORCH_PROF_SCOPE( event_tracer, "+EthosUBackend::execute()handles.input.memcpy()"); - // Sizes match and elt size matches so memcpy - memcpy( + // Sizes match and elt size matches so memcpy. + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy( scratch_addr, tensor_in.mutable_data_ptr(), tensor_in.nbytes()); @@ -389,7 +396,8 @@ Error copy_with_layout_adjustment( } const char* src_bytes = src; for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { - memcpy(dest, src_bytes, chunk_size); + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. 
+ arm_ethos_io_memcpy(dest, src_bytes, chunk_size); src_bytes += vela_chunk_size; dest += chunk_size; } diff --git a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp index 7962ef846df..96398762302 100644 --- a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp +++ b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp @@ -42,6 +42,12 @@ extern "C" __attribute__((weak)) struct ethosu_driver* ethosu_reserve_driver_ex( return ethosu_reserve_driver(); } +// Overridable memcpy used by the EthosU backend for output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. +extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + namespace executorch { namespace backends { namespace arm { @@ -136,7 +142,8 @@ Error platform_execute( } io_bytes_total += tensor_bytes; } else { - memcpy( + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy( tensor_out.mutable_data_ptr(), static_cast(output_addr), tensor_bytes); diff --git a/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp new file mode 100644 index 00000000000..1ef5b747b81 --- /dev/null +++ b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp @@ -0,0 +1,19 @@ +/* + * Copyright 2026 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +// Weak default for arm_ethos_io_memcpy. Firmware targets can provide a +// strong-symbol override (e.g. routing through DMA on Cortex-M55) without +// touching the upstream EthosUBackend code. Lives in its own translation +// unit so the compiler in the call-site TUs cannot inline this body and +// bypass the link-time override (same trick as bolt_arm_memcpy_external). +extern "C" __attribute__((weak)) void +arm_ethos_io_memcpy(void* dst, const void* src, size_t size) { + std::memcpy(dst, src, size); +} diff --git a/backends/arm/runtime/targets.bzl b/backends/arm/runtime/targets.bzl index 42df03fb58b..51c0bf93f55 100644 --- a/backends/arm/runtime/targets.bzl +++ b/backends/arm/runtime/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): srcs = [ "EthosUBackend.cpp", "EthosUBackend_Cortex_M.cpp", + "EthosUBackend_IoMemcpy.cpp", ], headers = ["EthosUBackend_Internal.h"], compatible_with = ["ovr_config//cpu:arm32-embedded", "ovr_config//cpu:arm32-embedded-fpu"], From 1414bc10fe3065ca7614b7ff7b2214d9a0c247eb Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 7 May 2026 10:27:34 -0500 Subject: [PATCH 29/58] Disable HF Xet storage to fix CI export timeouts (#19358) HuggingFace's Xet storage backend stalls mid-download on CI runners, causing the 90-minute job timeout to fire before model weights finish downloading. Force standard HTTP downloads instead. 
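For reference, the weak-default / strong-override linking pattern used for arm_ethos_io_memcpy in the EthosU change above can be reduced to a small standalone sketch. The symbol and file names below (example_io_memcpy, dma_copy_blocking) are hypothetical, and the three sections stand for three separate translation units; the point is only that the weak fallback lives in its own file so call sites cannot inline it and defeat the link-time override (GCC/Clang ELF-style weak symbols assumed).

// io_hook.h -- declaration seen by all call sites
#include <cstddef>
extern "C" void example_io_memcpy(void* dst, const void* src, size_t size);

// io_hook_default.cpp -- weak fallback, kept in its own translation unit
#include <cstring>
extern "C" __attribute__((weak)) void
example_io_memcpy(void* dst, const void* src, size_t size) {
  std::memcpy(dst, src, size);
}

// firmware_io_hook.cpp -- optional strong override supplied by the firmware
#include <cstring>
extern "C" void example_io_memcpy(void* dst, const void* src, size_t size) {
  // A real override might hand large copies to a DMA engine here; the call
  // below is a hypothetical placeholder for that platform routine.
  // dma_copy_blocking(dst, src, size);
  std::memcpy(dst, src, size); // fallback path so the sketch still links
}

Linking only the first two units gives the plain memcpy behavior; also linking the third makes the linker select the strong definition, with no change to the backend sources.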
(from debug logs in #19352) --- .ci/scripts/export_model_artifact.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 78053c33e7a..b7edc1765ea 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -67,6 +67,8 @@ if [ -z "${1:-}" ]; then exit 1 fi +export HF_HUB_DISABLE_XET=1 + set -eux DEVICE="$1" From 0ee31fc1e92834cb8afc9c335cb680ea65ceb97f Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Thu, 7 May 2026 09:14:24 -0700 Subject: [PATCH 30/58] Bump iOS XCTest timeout for ExecuTorchLLMTests (#19354) Summary: The 13 XCTestCase methods in `xplat/executorch/extension/llm/apple:ExecuTorchLLMTests` (testLLaMA, testPhi4, testGemma, testLLaVA, testVoxtral and their reset variants) regularly hit the 1800-second per-test ceiling enforced by `fbobjc/Tools/xctest_runner` for the `long_running` label. LLM inference on iOS-sim CPU (1B-class models, 128-768 token sequences, each test calls `generate()` twice) routinely exceeds 30 minutes per test method, producing spurious "Test timed out after 1800 seconds" flakes on the test-issues dashboard for owner `ai_infra_mobile_platform`. Per the runner formula `TEST_CASE_TIMEOUT(60s) * label_multiplier * 3`: | label | multiplier | per-XCTestCase budget | |----------------|-----------:|----------------------:| | long_running | x10 | 1800s | | glacial (here) | x30 | 5400s | Switching to `glacial` (the highest tier supported by the runner) gives each test 90 minutes. Adding `test_test_rule_timeout_ms = 14400000` sets the bundle-level wall-clock budget to 4h, which is comfortable headroom for ~5 testcases at 90 min each plus xctest setup/teardown. Note: this diff is unrelated to T269848646. T269848646 tracks a separate cluster of 446 iOS-sim test-run *cancellations* (`duration: 0.00`, "test execution was cancelled because the test run was cancelled") that is owned by testinfra and is not addressed here. Reviewed By: shoumikhin Differential Revision: D104147313 --- extension/llm/apple/BUCK | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/extension/llm/apple/BUCK b/extension/llm/apple/BUCK index 36da3c77935..667082e27d1 100644 --- a/extension/llm/apple/BUCK +++ b/extension/llm/apple/BUCK @@ -16,7 +16,17 @@ non_fbcode_target(_kind = fb_apple_library, ], sdks = IOS, visibility = EXECUTORCH_CLIENTS, - test_labels = ["long_running"], + # `glacial` raises the per-XCTestCase timeout from 1800s -> 5400s (90 min) + # via fbobjc/Tools/xctest_runner: TEST_CASE_TIMEOUT(60s) * 30 * 3. + # Required because LLM inference (LLaMA, Phi4, Gemma, LLaVA, Voxtral) + # on iOS-sim CPU regularly exceeds 30 minutes for a full forward pass. + test_labels = ["glacial"], + # Rule-level wall-clock for the whole auto-generated test bundle: + # ExecuTorchLLMTests currently contains 13 XCTestCase methods, and + # individual methods can exceed 30 minutes on iOS-sim CPU. This 4h + # budget is intended as the total bundle/shard wall-clock, including + # xctest setup/teardown overhead; it is not based on "5 testcases". + test_test_rule_timeout_ms = 14400000, test_deps = [ ":ExecuTorchLLMTestResource", "//xplat/executorch/backends/xnnpack:xnnpack_backendApple", From dd4397f8d2f8c328582a52261586cb4c8d938fc7 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 7 May 2026 09:55:09 -0700 Subject: [PATCH 31/58] Fix optimized grid sampler validation (#19373) First attempt was not sufficient. 
This one passes the full optimized kernel suite now. --- kernels/optimized/cpu/op_grid_sampler_2d.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/kernels/optimized/cpu/op_grid_sampler_2d.cpp b/kernels/optimized/cpu/op_grid_sampler_2d.cpp index a3f59df109d..7ec45860985 100644 --- a/kernels/optimized/cpu/op_grid_sampler_2d.cpp +++ b/kernels/optimized/cpu/op_grid_sampler_2d.cpp @@ -339,6 +339,7 @@ Tensor& opt_grid_sampler_2d_out( // default-dim-order layout — no use of .strides() or .dim_order(). Fall // back to portable for anything else. const bool fast_eligible = input.dim() == 4 && grid.dim() == 4 && + grid.size(3) == 2 && input.size(0) == grid.size(0) && tensor_is_default_dim_order(input) && tensor_is_default_dim_order(grid) && tensor_is_default_dim_order(out) && tensor_is_contiguous(input) && tensor_is_contiguous(grid) && tensor_is_contiguous(out); From a8ce9cebc331ac6d027bf16f388e8315802d1653 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Thu, 7 May 2026 10:01:13 -0700 Subject: [PATCH 32/58] Convert unconditional GTEST_SKIP tests to DISABLED_ prefix (#19355) Summary: A number of kernel unit tests are skipped unconditionally, with the very first statement is `GTEST_SKIP() << "Dynamic shape not supported";`. These tests therefore emit a `SKIPPED` result on every invocation while the feature is unimplemented. The googletest recommended idiom for this case is the `DISABLED_` name prefix: - The test is still **compiled**, so the documenting body cannot rot. - The test is **not executed** by gtest at all (no result is emitted). This commit applies that conversion mechanically to every unconditionally-skipped GTest in the ExecuTorch tree. Disabled tests can still be opted into on demand via gtest's `--gtest_also_run_disabled_tests` flag. Differential Revision: D104126199 --- kernels/test/op_add_test.cpp | 4 ++-- kernels/test/op_addmm_test.cpp | 4 ++-- kernels/test/op_bitwise_not_test.cpp | 4 ++-- kernels/test/op_bmm_test.cpp | 4 ++-- kernels/test/op_clamp_test.cpp | 8 ++++---- kernels/test/op_clone_test.cpp | 4 ++-- kernels/test/op_cumsum_test.cpp | 4 ++-- kernels/test/op_detach_copy_test.cpp | 4 ++-- kernels/test/op_div_test.cpp | 4 ++-- kernels/test/op_floor_divide_test.cpp | 28 +++++++++++++-------------- kernels/test/op_full_like_test.cpp | 4 ++-- kernels/test/op_gelu_test.cpp | 8 ++++---- kernels/test/op_glu_test.cpp | 8 ++++---- kernels/test/op_linear_test.cpp | 4 ++-- kernels/test/op_log_softmax_test.cpp | 4 ++-- kernels/test/op_logit_test.cpp | 4 ++-- kernels/test/op_masked_fill_test.cpp | 4 ++-- kernels/test/op_mean_test.cpp | 4 ++-- kernels/test/op_mm_test.cpp | 4 ++-- kernels/test/op_mul_test.cpp | 4 ++-- kernels/test/op_relu_test.cpp | 4 ++-- kernels/test/op_round_test.cpp | 4 ++-- kernels/test/op_softmax_test.cpp | 4 ++-- kernels/test/op_split_copy_test.cpp | 10 ++++++---- kernels/test/op_sub_test.cpp | 4 ++-- kernels/test/op_unbind_copy_test.cpp | 10 ++++++---- kernels/test/op_var_mean_test.cpp | 4 ++-- kernels/test/op_var_test.cpp | 4 ++-- 28 files changed, 82 insertions(+), 78 deletions(-) diff --git a/kernels/test/op_add_test.cpp b/kernels/test/op_add_test.cpp index 5561ad67b66..60faa4efb47 100644 --- a/kernels/test/op_add_test.cpp +++ b/kernels/test/op_add_test.cpp @@ -816,8 +816,8 @@ TEST_F(OpAddOutKernelTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpAddOutKernelTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpAddOutKernelTest, 
DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_addmm_test.cpp b/kernels/test/op_addmm_test.cpp index a2251784c17..ff02d9c0a79 100644 --- a/kernels/test/op_addmm_test.cpp +++ b/kernels/test/op_addmm_test.cpp @@ -529,8 +529,8 @@ TEST_F(OpAddmmOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpAddmmOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpAddmmOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_bitwise_not_test.cpp b/kernels/test/op_bitwise_not_test.cpp index 1b73574f9ff..702486f0d2a 100644 --- a/kernels/test/op_bitwise_not_test.cpp +++ b/kernels/test/op_bitwise_not_test.cpp @@ -155,8 +155,8 @@ TEST_F(OpBitwiseNotOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_EQ(out, expected); } -TEST_F(OpBitwiseNotOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpBitwiseNotOutTest, DISABLED_DynamicShapeUnbound) { /* %python out_args = "{1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND" %rewrite(unary_op) */ diff --git a/kernels/test/op_bmm_test.cpp b/kernels/test/op_bmm_test.cpp index edf2703e393..c870c412035 100644 --- a/kernels/test/op_bmm_test.cpp +++ b/kernels/test/op_bmm_test.cpp @@ -407,8 +407,8 @@ TEST_F(OpBmmOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpBmmOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpBmmOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; auto x = tf.make( diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index 81138fc8a55..aeb44f1d7ab 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -457,8 +457,8 @@ TEST_F(OpClampOutTest, DynamicShapeUpperBoundSameAsExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpClampOutTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpClampOutTest, DISABLED_DynamicShapeUpperBoundLargerThanExpected) { TensorFactory tf; auto x = tf.make( @@ -480,8 +480,8 @@ TEST_F(OpClampOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpClampOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpClampOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; auto x = tf.make( diff --git a/kernels/test/op_clone_test.cpp b/kernels/test/op_clone_test.cpp index 43e4576548a..57a8aed2d6c 100644 --- a/kernels/test/op_clone_test.cpp +++ b/kernels/test/op_clone_test.cpp @@ -209,8 +209,8 @@ TEST_F(OpCloneTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpCloneTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpCloneTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_cumsum_test.cpp b/kernels/test/op_cumsum_test.cpp index 3e0ec164d04..720f7bd98e9 100644 --- a/kernels/test/op_cumsum_test.cpp +++ 
b/kernels/test/op_cumsum_test.cpp @@ -260,8 +260,8 @@ TEST_F(OpCumSumOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpCumSumOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpCumSumOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_detach_copy_test.cpp b/kernels/test/op_detach_copy_test.cpp index d5c558afd9d..fba497c75ab 100644 --- a/kernels/test/op_detach_copy_test.cpp +++ b/kernels/test/op_detach_copy_test.cpp @@ -190,8 +190,8 @@ TEST_F(OpDetachCopyOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpDetachCopyOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpDetachCopyOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_div_test.cpp b/kernels/test/op_div_test.cpp index 94f26d1b301..84d33fa2757 100644 --- a/kernels/test/op_div_test.cpp +++ b/kernels/test/op_div_test.cpp @@ -526,8 +526,8 @@ TEST_F(OpDivOutTest, BroadcastNDTest) { test_broadcast_3D(); } -TEST_F(OpDivOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpDivOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_floor_divide_test.cpp b/kernels/test/op_floor_divide_test.cpp index 8be1168eee1..166f7fdd4f9 100644 --- a/kernels/test/op_floor_divide_test.cpp +++ b/kernels/test/op_floor_divide_test.cpp @@ -175,8 +175,8 @@ TEST_F(OpFloorDivideTest, MismatchedOutputShapesDies) { ET_EXPECT_KERNEL_FAILURE(context_, op_floor_divide_out(a, b, out)); } -TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneAB) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeIsOneAB) { TensorFactory tf; Tensor x = tf.make( @@ -195,8 +195,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneAB) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingAB) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeMissingAB) { TensorFactory tf; Tensor x = tf.make( @@ -215,8 +215,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingAB) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneBA) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeIsOneBA) { TensorFactory tf; Tensor x = tf.make({1, 2}, {0.522396445274353, 0.6753279566764832}); @@ -235,8 +235,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneBA) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingBA) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeMissingBA) { TensorFactory tf; Tensor x = tf.make({1, 2}, {0.522396445274353, 0.6753279566764832}); @@ -255,8 +255,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingBA) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, DynamicShapeUpperBoundSameAsExpected) { - GTEST_SKIP() << "Dynamic shape not 
supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_DynamicShapeUpperBoundSameAsExpected) { TensorFactory tf; Tensor x = tf.make( @@ -283,8 +283,8 @@ TEST_F(OpFloorDivideTest, DynamicShapeUpperBoundSameAsExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_DynamicShapeUpperBoundLargerThanExpected) { TensorFactory tf; Tensor x = tf.make( @@ -311,8 +311,8 @@ TEST_F(OpFloorDivideTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_full_like_test.cpp b/kernels/test/op_full_like_test.cpp index 6e7692f5347..23ac4e685f9 100644 --- a/kernels/test/op_full_like_test.cpp +++ b/kernels/test/op_full_like_test.cpp @@ -181,8 +181,8 @@ TEST_F(OpFullLikeTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFullLikeTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpFullLikeTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_gelu_test.cpp b/kernels/test/op_gelu_test.cpp index 8fae399fb18..9303b034ca2 100644 --- a/kernels/test/op_gelu_test.cpp +++ b/kernels/test/op_gelu_test.cpp @@ -213,8 +213,8 @@ TEST_F(OpGeluTest, DynamicShapeUpperBoundSameAsExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpGeluTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpGeluTest, DISABLED_DynamicShapeUpperBoundLargerThanExpected) { TensorFactory tf; Tensor x = tf.make( @@ -240,8 +240,8 @@ TEST_F(OpGeluTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpGeluTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpGeluTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_glu_test.cpp b/kernels/test/op_glu_test.cpp index ac931302f98..9bee3a6a5a2 100644 --- a/kernels/test/op_glu_test.cpp +++ b/kernels/test/op_glu_test.cpp @@ -200,8 +200,8 @@ TEST_F(OpGluOutTest, AllNonFloatOutputDTypeDies) { #undef TEST_ENTRY } -TEST_F(OpGluOutTest, DynamicShapeUpperBoundSameAsExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpGluOutTest, DISABLED_DynamicShapeUpperBoundSameAsExpected) { TensorFactory tf; Tensor x = tf.make( @@ -253,8 +253,8 @@ TEST_F(OpGluOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpGluOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpGluOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_linear_test.cpp b/kernels/test/op_linear_test.cpp index 0ad5790a550..9b0ba782271 100644 --- a/kernels/test/op_linear_test.cpp +++ 
b/kernels/test/op_linear_test.cpp @@ -338,8 +338,8 @@ TEST_F(OpLinearOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpLinearOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpLinearOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp index 3bcbee96a1c..84255b8a29c 100644 --- a/kernels/test/op_log_softmax_test.cpp +++ b/kernels/test/op_log_softmax_test.cpp @@ -421,8 +421,8 @@ TEST_F(OpLogSoftmaxOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpLogSoftmaxOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpLogSoftmaxOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_logit_test.cpp b/kernels/test/op_logit_test.cpp index 1bb0a43a37d..0056e984bb7 100644 --- a/kernels/test/op_logit_test.cpp +++ b/kernels/test/op_logit_test.cpp @@ -259,8 +259,8 @@ TEST_F(OpLogitOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpLogitOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpLogitOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_masked_fill_test.cpp b/kernels/test/op_masked_fill_test.cpp index 41962ba5ed8..b36b54c2b81 100644 --- a/kernels/test/op_masked_fill_test.cpp +++ b/kernels/test/op_masked_fill_test.cpp @@ -377,8 +377,8 @@ TEST_F(OpMaskedFillTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpMaskedFillTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpMaskedFillTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; TensorFactory bool_tf; diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp index 65d21b45518..23f4b675d68 100644 --- a/kernels/test/op_mean_test.cpp +++ b/kernels/test/op_mean_test.cpp @@ -465,8 +465,8 @@ TEST_F(OpMeanOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpMeanOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpMeanOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_mm_test.cpp b/kernels/test/op_mm_test.cpp index 63d06143b5d..62d5ed29e26 100644 --- a/kernels/test/op_mm_test.cpp +++ b/kernels/test/op_mm_test.cpp @@ -255,8 +255,8 @@ TEST_F(OpMmOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpMmOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpMmOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 28baa0cbd16..4553f8a53b6 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -711,8 +711,8 @@ TEST_F(OpMulOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, 
expected_result); } -TEST_F(OpMulOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpMulOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_relu_test.cpp b/kernels/test/op_relu_test.cpp index 7d3cfc696b2..128c4388615 100644 --- a/kernels/test/op_relu_test.cpp +++ b/kernels/test/op_relu_test.cpp @@ -288,8 +288,8 @@ TEST_F(OpReluTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpReluTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Unbound dynamic shape not supported"; +// DISABLED: Unbound dynamic shape not supported +TEST_F(OpReluTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_round_test.cpp b/kernels/test/op_round_test.cpp index e05f3a68d40..cbf9b6515d9 100644 --- a/kernels/test/op_round_test.cpp +++ b/kernels/test/op_round_test.cpp @@ -230,8 +230,8 @@ TEST_F(OpRoundTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_EQ(out, expected); } -TEST_F(OpRoundTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpRoundTest, DISABLED_DynamicShapeUnbound) { /* %python out_args = "{1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND" %rewrite(unary_op) */ diff --git a/kernels/test/op_softmax_test.cpp b/kernels/test/op_softmax_test.cpp index a5d26d0a4f9..3c61acb7d29 100644 --- a/kernels/test/op_softmax_test.cpp +++ b/kernels/test/op_softmax_test.cpp @@ -302,8 +302,8 @@ TEST_F(OpSoftmaxOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpSoftmaxOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpSoftmaxOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_split_copy_test.cpp b/kernels/test/op_split_copy_test.cpp index 76b29fa30bb..2dd112b1ace 100644 --- a/kernels/test/op_split_copy_test.cpp +++ b/kernels/test/op_split_copy_test.cpp @@ -563,14 +563,16 @@ TEST_F(OpSplitCopyTensorOutTest, DynamicShapeUpperBoundSameAsExpected) { {2, 3}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpSplitCopyTensorOutTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F( + OpSplitCopyTensorOutTest, + DISABLED_DynamicShapeUpperBoundLargerThanExpected) { test_dynamic_shape( {10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpSplitCopyTensorOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpSplitCopyTensorOutTest, DISABLED_DynamicShapeUnbound) { test_dynamic_shape( {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); } diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp index c8e7c69c443..41ebc2f2733 100644 --- a/kernels/test/op_sub_test.cpp +++ b/kernels/test/op_sub_test.cpp @@ -637,8 +637,8 @@ TEST_F(OpSubOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpSubOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpSubOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = 
tf.make( diff --git a/kernels/test/op_unbind_copy_test.cpp b/kernels/test/op_unbind_copy_test.cpp index 70825537490..c98edc5e1f7 100644 --- a/kernels/test/op_unbind_copy_test.cpp +++ b/kernels/test/op_unbind_copy_test.cpp @@ -363,14 +363,16 @@ TEST_F(OpUnbindCopyIntOutTest, DynamicShapeUpperBoundSameAsExpected) { {2, 4}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpUnbindCopyIntOutTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F( + OpUnbindCopyIntOutTest, + DISABLED_DynamicShapeUpperBoundLargerThanExpected) { test_dynamic_shape( {10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpUnbindCopyIntOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpUnbindCopyIntOutTest, DISABLED_DynamicShapeUnbound) { test_dynamic_shape( {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); } diff --git a/kernels/test/op_var_mean_test.cpp b/kernels/test/op_var_mean_test.cpp index 7049c21d65b..05a0281a090 100644 --- a/kernels/test/op_var_mean_test.cpp +++ b/kernels/test/op_var_mean_test.cpp @@ -635,8 +635,8 @@ TEST_F(OpVarMeanCorrectionOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(mean_out, expected_mean); } -TEST_F(OpVarMeanCorrectionOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpVarMeanCorrectionOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make({3, 2}, {0.49, 0.40, 0.56, 0.38, 0.49, 0.56}); diff --git a/kernels/test/op_var_test.cpp b/kernels/test/op_var_test.cpp index bfa73bfe15c..63e7e94f982 100644 --- a/kernels/test/op_var_test.cpp +++ b/kernels/test/op_var_test.cpp @@ -449,8 +449,8 @@ TEST_F(OpVarOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpVarOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpVarOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make({3, 2}, {0.49, 0.40, 0.56, 0.38, 0.49, 0.56}); From 563e237ccbb0697482528f71c527fc7acc248292 Mon Sep 17 00:00:00 2001 From: Nikhil Viswanath Sivakumar <68182521+nil-is-all@users.noreply.github.com> Date: Thu, 7 May 2026 12:19:41 -0500 Subject: [PATCH 33/58] Docathon: Add workflow to assign user on comment (#19294) This workflow assigns a user to an issue when a comment containing '/assigntome' is made, provided the issue has the 'docathon-2026' label. 
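To make the GTEST_SKIP to DISABLED_ trade-off above concrete, here is a minimal self-contained sketch; the suite and test names are hypothetical and it only needs to link against gtest_main.

#include <gtest/gtest.h>

// Before: the test builds, runs, and reports SKIPPED on every invocation.
TEST(OpExampleOutTest, DynamicShapeUnbound_SkipStyle) {
  GTEST_SKIP() << "Dynamic shape unbound not supported";
}

// After: the body still compiles (so it cannot rot), but gtest does not run
// it and emits no result unless it is explicitly opted in.
TEST(OpExampleOutTest, DISABLED_DynamicShapeUnbound) {
  EXPECT_TRUE(true); // placeholder body kept for documentation
}

Disabled tests can still be exercised on demand with something like:
./op_example_test --gtest_also_run_disabled_tests --gtest_filter='OpExampleOutTest.*'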
cc @mergennachin @AlannaBurke --- .github/workflows/assigntome-docathon.yml | 60 +++++++++++++++++++++++ .github/workflows/docathon-sync-label.yml | 31 ++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 .github/workflows/assigntome-docathon.yml create mode 100644 .github/workflows/docathon-sync-label.yml diff --git a/.github/workflows/assigntome-docathon.yml b/.github/workflows/assigntome-docathon.yml new file mode 100644 index 00000000000..92dec519296 --- /dev/null +++ b/.github/workflows/assigntome-docathon.yml @@ -0,0 +1,60 @@ +name: Assign User on Comment + +on: + workflow_dispatch: + issue_comment: + types: [created] + +jobs: + assign: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Check for "/assigntome" in comment + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + script: | + const issueComment = context.payload.comment.body; + const assignRegex = /\/assigntome/i; + if (assignRegex.test(issueComment)) { + const assignee = context.payload.comment.user.login; + const issueNumber = context.payload.issue.number; + try { + const { data: issue } = await github.rest.issues.get({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber + }); + const hasLabel = issue.labels.some(label => label.name === 'docathon-2026'); + if (hasLabel) { + if (issue.assignee !== null) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-2026 label](https://github.com/pytorch/executorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-2026)" + }); + } else { + await github.rest.issues.addAssignees({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + assignees: [assignee] + }); + } + } else { + const commmentMessage = "This issue does not have the correct label. 
Please pick an opened and unnasigned issue with the [docathon-2026 label](https://github.com/pytorch/executorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-2026)"; + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: commmentMessage + }); + } + } catch (error) { + console.error(error); + } + } diff --git a/.github/workflows/docathon-sync-label.yml b/.github/workflows/docathon-sync-label.yml new file mode 100644 index 00000000000..bf8197f8d64 --- /dev/null +++ b/.github/workflows/docathon-sync-label.yml @@ -0,0 +1,31 @@ +name: Docathon Labels Sync + +on: + pull_request_target: + types: [opened, synchronize, edited] + branches: [main] + +jobs: + check-labels: + if: github.repository_owner == 'pytorch' + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - name: Check out the repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 1 + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: 3.x + - name: Install dependencies + run: | + pip install requests==2.32.3 + pip install PyGithub==2.3.0 + - name: Run Python script + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: python ./.github/scripts/docathon-label-sync.py ${{ github.event.pull_request.number }} From 76d941e80464fe8b0218bb56143f23472c45186c Mon Sep 17 00:00:00 2001 From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com> Date: Thu, 7 May 2026 10:32:03 -0700 Subject: [PATCH 34/58] More generic slice propagation before unary ops which works for non-contiguous slices (#19345) Differential Revision: D103752840 Pull Request resolved: https://github.com/pytorch/executorch/pull/19345 --- backends/cadence/aot/reorder_ops.py | 105 ++++++++++- .../aot/tests/test_reorder_ops_passes.py | 166 ++++++++++++++++++ 2 files changed, 270 insertions(+), 1 deletion(-) diff --git a/backends/cadence/aot/reorder_ops.py b/backends/cadence/aot/reorder_ops.py index a8eda5cc457..5a9b76b473a 100644 --- a/backends/cadence/aot/reorder_ops.py +++ b/backends/cadence/aot/reorder_ops.py @@ -11,7 +11,7 @@ from collections import defaultdict from math import prod -from typing import cast, DefaultDict, List, Tuple +from typing import Callable, cast, DefaultDict, List, Tuple import torch import torch.fx @@ -719,6 +719,109 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: return True +@register_cadence_pass(CadencePassAttribute(opt_level=1)) +class PropagateSlice(RemoveOrReplacePassInterface): + """Propagate slice_copy before unary element-wise ops when the cost + model indicates it reduces total data movement. + + Supported ops (extensible via dispatch table): + - quantize_per_tensor: element-wise, slice passes through unchanged + - dequantize_per_tensor: element-wise, slice passes through unchanged + + Handles any slice dim and any step size. Runs in the iterative pass + loop — chains are handled by repeated application. 
+ """ + + def __init__(self) -> None: + super().__init__() + elementwise_targets = [ + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.cadence.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.cadence.dequantize_per_tensor.default, + ] + self._dispatch: dict[ + EdgeOpOverload, + tuple[ + Callable[[torch.fx.Node, torch.fx.Node], bool], + Callable[[torch.fx.Node, torch.fx.Node], bool], + ], + ] = { + t: (self._should_swap_elementwise, self._swap_elementwise_slice) + for t in elementwise_targets + } + + @property + def targets(self) -> list[EdgeOpOverload]: + return [exir_ops.edge.aten.slice_copy.Tensor] + + def _should_swap_elementwise( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + full_size = prod(op_node.meta["val"].shape) + sliced_size = prod(slice_node.meta["val"].shape) + return sliced_size < full_size + + def _swap_elementwise_slice( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + op_input = op_node.args[0] + assert isinstance(op_input, torch.fx.Node) + graph = slice_node.graph + + slice_args = slice_node.args[1:] + + with graph.inserting_before(op_node): + new_slice = graph.call_function( + exir_ops.edge.aten.slice_copy.Tensor, + args=(op_input, *slice_args), + ) + new_slice.meta["val"] = exir_ops.edge.aten.slice_copy.Tensor( + op_input.meta["val"], *slice_args + ) + + new_args = list(op_node.args) + new_args[0] = new_slice + target = cast(EdgeOpOverload, op_node.target) + new_op = graph.call_function( + target, + args=tuple(new_args), + kwargs=op_node.kwargs, + ) + new_op.meta["val"] = target( + new_slice.meta["val"], + *[ + a.meta["val"] if isinstance(a, torch.fx.Node) else a + for a in new_args[1:] + ], + **{ + k: v.meta["val"] if isinstance(v, torch.fx.Node) else v + for k, v in op_node.kwargs.items() + }, + ) + + slice_node.replace_all_uses_with(new_op) + graph.erase_node(slice_node) + graph.erase_node(op_node) + return True + + def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: + parent = node.args[0] + if not isinstance(parent, torch.fx.Node): + return False + if len(parent.users) != 1: + return False + if not isinstance(parent.target, EdgeOpOverload): + return False + + entry = self._dispatch.get(parent.target) + if entry is None: + return False + + should_swap, do_swap = entry + return should_swap(parent, node) and do_swap(parent, node) + + # The following class consolidates functions to reoder ops (i.e., either hoist # or sink some ops in the graph). 
class CadenceReorderOpsInGraph: diff --git a/backends/cadence/aot/tests/test_reorder_ops_passes.py b/backends/cadence/aot/tests/test_reorder_ops_passes.py index ba9089a652e..cf3a6840179 100644 --- a/backends/cadence/aot/tests/test_reorder_ops_passes.py +++ b/backends/cadence/aot/tests/test_reorder_ops_passes.py @@ -26,6 +26,7 @@ MoveSliceBeforePermutePass, PostponeDequantizeOpBelowUseChainPass, PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView, + PropagateSlice, SinkOpsCloserToUsePass, ) from executorch.backends.test.graph_builder import GraphBuilder @@ -761,3 +762,168 @@ def test_non_dim0_slice_always_moved(self) -> None: MoveSliceBeforePermutePass(), ) self.assertTrue(result.modified) + + +class TestPropagateSlice(unittest.TestCase): + def test_swap_quantize_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + slice_node = slice_nodes[0] + self.assertEqual(slice_node.args[0].name, "x") + self.assertEqual(list(slice_node.meta["val"].shape), [2, 60, 1, 1]) + + quant_nodes = gm.graph.find_nodes( + op="call_function", + target=exir_ops.edge.cadence.quantize_per_tensor.default, + ) + self.assertEqual(len(quant_nodes), 1) + self.assertEqual(quant_nodes[0].args[0], slice_node) + self.assertEqual(list(quant_nodes[0].meta["val"].shape), [2, 60, 1, 1]) + + def test_swap_dequantize_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder( + "x", torch.randint(0, 255, (4, 60, 4, 4), dtype=torch.uint8) + ) + dequant = builder.call_operator( + exir_ops.edge.cadence.dequantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(dequant, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "x") + + def test_step_2_through_quantize(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[4], 2) + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [2, 60, 1, 1]) + + def test_non_batch_dim_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 4, 4)) + quant = builder.call_operator( + 
exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 1, 0, 30, 1), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [4, 30, 4, 4]) + + def test_no_swap_when_multi_user(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 2), + ) + builder.output([sliced, quant]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertFalse(result.modified) + + def test_no_swap_noop_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 1), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertFalse(result.modified) + + def test_unsupported_parent_not_swapped(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + relu = builder.call_operator( + exir_ops.edge.aten.relu.default, + args=(x,), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(relu, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertFalse(result.modified) From 74c7c9123426d6d0827ae454d017d9a271179ade Mon Sep 17 00:00:00 2001 From: Nikhil Viswanath Sivakumar <68182521+nil-is-all@users.noreply.github.com> Date: Thu, 7 May 2026 12:56:13 -0500 Subject: [PATCH 35/58] Docathon automation: Add script to sync labels from issue to PR (#19374) Adds python script for workflow: .github/workflows/docathon-sync-label.yml cc @mergennachin @AlannaBurke --- .github/scripts/docathon-label-sync.py | 54 ++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/scripts/docathon-label-sync.py diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py new file mode 100644 index 00000000000..4f00067905d --- /dev/null +++ b/.github/scripts/docathon-label-sync.py @@ -0,0 +1,54 @@ +import os +import re +import sys + +from github import Github + + +def main() -> None: + token = os.environ.get("GITHUB_TOKEN") + + repo_owner = "pytorch" + repo_name = "pytorch" + pull_request_number = int(sys.argv[1]) + + g = Github(token) + repo = g.get_repo(f"{repo_owner}/{repo_name}") + pull_request = repo.get_pull(pull_request_number) + pull_request_body = pull_request.body + # PR without description + if pull_request_body is None: + return + + # get issue number from the PR body + if not re.search(r"#\d{1,6}", pull_request_body): + print("The pull request does not mention an issue.") + return + issue_number = int(re.findall(r"#(\d{1,6})", pull_request_body)[0]) + issue = repo.get_issue(issue_number) + 
issue_labels = issue.labels + docathon_label_present = any( + label.name == "docathon-2026" for label in issue_labels + ) + + # if the issue has a docathon label, add all labels from the issue to the PR. + if not docathon_label_present: + print("The 'docathon-2026' label is not present in the issue.") + return + pull_request_labels = pull_request.get_labels() + pull_request_label_names = [label.name for label in pull_request_labels] + issue_label_names = [label.name for label in issue_labels] + labels_to_add = [ + label + for label in issue_label_names + if label not in pull_request_label_names and label != "actionable" + ] + if not labels_to_add: + print("The pull request already has the same labels.") + return + pull_request.add_to_labels(*labels_to_add) + print("Labels added to the pull request!") + + +if __name__ == "__main__": + main() From 1643611197ab5279d5a5fd7fe39793d169a092c2 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 7 May 2026 13:01:43 -0500 Subject: [PATCH 36/58] Disable HF Xet storage across all CI scripts (#19371) HuggingFace's Xet storage backend stalls mid-download on CI runners, causing 90-minute job timeouts. Set HF_HUB_DISABLE_XET=1 in every CI script and workflow that downloads from HuggingFace to force standard HTTP downloads instead. --- .ci/scripts/download_hf_hub.sh | 3 +++ .ci/scripts/export_model_artifact.sh | 1 + .ci/scripts/test_huggingface_optimum_model.py | 4 ++++ .ci/scripts/test_lora.sh | 2 ++ .ci/scripts/test_lora_multimethod.sh | 2 ++ .ci/scripts/test_phi_3_mini.sh | 2 ++ .github/workflows/mlx.yml | 6 ++++++ 7 files changed, 20 insertions(+) diff --git a/.ci/scripts/download_hf_hub.sh b/.ci/scripts/download_hf_hub.sh index b47fc5dd215..c0487e687c7 100644 --- a/.ci/scripts/download_hf_hub.sh +++ b/.ci/scripts/download_hf_hub.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 + # Function to download files from the Hugging Face Hub # Arguments: # 1. model_id: The Hugging Face repository ID (e.g., "organization/model_name") diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index b7edc1765ea..1f75d850e84 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -67,6 +67,7 @@ if [ -z "${1:-}" ]; then exit 1 fi +# Disable HF Xet storage to avoid stalled downloads on CI runners export HF_HUB_DISABLE_XET=1 set -eux diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 04ad1f5e792..7b0e69ff9b4 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -2,6 +2,7 @@ import gc import logging import math +import os import shutil import subprocess import tempfile @@ -9,6 +10,9 @@ from pathlib import Path from typing import List +# Disable HF Xet storage to avoid stalled downloads on CI runners +os.environ.setdefault("HF_HUB_DISABLE_XET", "1") + import torch from datasets import load_dataset diff --git a/.ci/scripts/test_lora.sh b/.ci/scripts/test_lora.sh index b20d456a673..45e4c68745a 100644 --- a/.ci/scripts/test_lora.sh +++ b/.ci/scripts/test_lora.sh @@ -6,6 +6,8 @@ # LICENSE file in the root directory of this source tree. 
set -exu +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" diff --git a/.ci/scripts/test_lora_multimethod.sh b/.ci/scripts/test_lora_multimethod.sh index 7c468eb226b..f0b30bd4be1 100755 --- a/.ci/scripts/test_lora_multimethod.sh +++ b/.ci/scripts/test_lora_multimethod.sh @@ -6,6 +6,8 @@ # LICENSE file in the root directory of this source tree. set -exu +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh index 086822bbad4..76ebb2ff3d1 100644 --- a/.ci/scripts/test_phi_3_mini.sh +++ b/.ci/scripts/test_phi_3_mini.sh @@ -6,6 +6,8 @@ # LICENSE file in the root directory of this source tree. set -exu +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 BUILD_TYPE=${1:-Debug} BUILD_DIR=${3:-cmake-out} diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index 65b8543bfd4..cdc9cd8a3d0 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -306,6 +306,8 @@ jobs: timeout: 90 script: | set -eux + # Disable HF Xet storage to avoid stalled downloads on CI runners + export HF_HUB_DISABLE_XET=1 echo "::group::Install ExecuTorch" ${CONDA_RUN} python install_executorch.py > /dev/null @@ -382,6 +384,8 @@ jobs: timeout: 90 script: | set -eux + # Disable HF Xet storage to avoid stalled downloads on CI runners + export HF_HUB_DISABLE_XET=1 echo "::group::Install ExecuTorch and configure MLX build" ${CONDA_RUN} python install_executorch.py > /dev/null @@ -510,6 +514,8 @@ jobs: timeout: 90 script: | set -eux + # Disable HF Xet storage to avoid stalled downloads on CI runners + export HF_HUB_DISABLE_XET=1 MODEL_ID="${{ matrix.model.id }}" MODEL_NAME="${{ matrix.model.name }}" From 226c1c54ab661278b0c4a8033e6b1dcc5f93ab3f Mon Sep 17 00:00:00 2001 From: Rohit Yelukati Mahendra <34777717+ymrohit@users.noreply.github.com> Date: Thu, 7 May 2026 19:53:55 +0100 Subject: [PATCH 37/58] [DOC] Add redirects for moved ExecuTorch pages (#19338) Fixes #14797 ### Summary - Add redirects for moved top-level backend, LLM, and tutorial pages. - Point existing Core ML, MPS, and Arm Ethos-U redirects at their current canonical pages. - Update stale internal doc links, including the Tools Model Visualization entry. - Keep Samsung Exynos in Android navigation through the Android wrapper while preserving the Backends section order so Cadence points next to Samsung. ### Test plan - `git diff --check` - `python -m py_compile docs/source/conf.py` - Redirect-target script: checked 23 redirects, no missing targets - Parsed redirect table: confirmed `llm/llama-demo-android` resolves to `docs/source/using-executorch-android.md` - `PYTHONPATH=/tmp/executorch-doc-stub:.. 
../.venv/bin/sphinx-build -D plot_gallery=0 -b html docs/source /tmp/executorch-docs-14797-final-escalated` succeeded with existing repository warnings before the follow-up redirect entry - Verified rendered `backends-cadence.html` next page is Samsung Exynos - Verified generated redirects for `backends-arm-ethos-u.html`, `backends-coreml.html`, `backends-mps.html`, and `visualization.html` cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- docs/source/android-backends.md | 5 +++-- .../backends/arm-ethos-u/arm-ethos-u-overview.md | 4 ++-- docs/source/conf.py | 16 +++++++++++++--- docs/source/platforms-desktop.md | 6 +++--- docs/source/platforms-embedded.md | 6 +++--- docs/source/tools-section.md | 4 ++-- docs/source/using-executorch-export.md | 2 +- 7 files changed, 27 insertions(+), 16 deletions(-) diff --git a/docs/source/android-backends.md b/docs/source/android-backends.md index d4da0966ed9..bbbbb1418e0 100644 --- a/docs/source/android-backends.md +++ b/docs/source/android-backends.md @@ -16,7 +16,7 @@ Available hardware acceleration backends for Android deployment. - {doc}`android-qualcomm` — Qualcomm AI Engine (NPU) - {doc}`android-mediatek` — MediaTek NPU acceleration - {doc}`android-arm-vgf` — ARM VGF Backend -- {doc}`backends/samsung/samsung-overview` — Samsung Exynos NPU +- {doc}`android-samsung-exynos` — Samsung Exynos NPU ```{toctree} :hidden: @@ -25,4 +25,5 @@ android-vulkan android-qualcomm android-mediatek android-arm-vgf -backends/samsung/samsung-overview +android-samsung-exynos +``` diff --git a/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md b/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md index 1a2cd1b44be..faffedece35 100644 --- a/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md +++ b/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md @@ -4,7 +4,7 @@ The Arm® Ethos™-U backend targets Edge/IoT-type AI use-cases by enabli [Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm® Ethos™-U65 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and [Arm® Ethos™-U85 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85), leveraging [TOSA](https://www.mlplatform.org/tosa/) and the [ethos-u-vela](https://pypi.org/project/ethos-u-vela/) graph compiler. This document is a technical reference for using the Ethos-U backend, for a top level view with code examples -please refer to the [Arm Ethos-U Backend Tutorial](https://docs.pytorch.org/executorch/stable/tutorial-arm-ethos-u.html). +please refer to the [Arm Ethos-U Backend Tutorial](tutorials/ethos-u-getting-started.md). ## Features @@ -111,7 +111,7 @@ For more information on quantization, see [Quantization](arm-ethos-u-quantizatio ## Runtime Integration -An example runtime application is available in [examples/arm/executor_runner](https://github.com/pytorch/executorch/blob/main/examples/arm/executor_runner/), and the steps requried for building and deploying it on a FVP it is explained in the previously mentioned [Arm Ethos-U Backend Tutorial](https://docs.pytorch.org/executorch/stable/tutorial-arm-ethos-u.html). +An example runtime application is available in [examples/arm/executor_runner](https://github.com/pytorch/executorch/blob/main/examples/arm/executor_runner/), and the steps requried for building and deploying it on a FVP it is explained in the previously mentioned [Arm Ethos-U Backend Tutorial](tutorials/ethos-u-getting-started.md). 
The example application is recommended to use for testing basic functionality of your lowered models, as well as a starting point for developing runtime integrations for your own targets. For an in-depth explanation of the architecture of the executor_runner and the steps required for doing such an integration, please refer to [Ethos-U porting guide](https://github.com/pytorch/executorch/blob/main/examples/arm/ethos-u-porting-guide.md). diff --git a/docs/source/conf.py b/docs/source/conf.py index 1a2ef3e5e5f..75757b7da27 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -283,13 +283,23 @@ "tutorials/export-to-executorch-tutorial": "../using-executorch-export.html", "build-run-vulkan": "backends/vulkan/vulkan-overview.html", "backends-vulkan": "backends/vulkan/vulkan-overview.html", - "executorch-arm-delegate-tutorial": "backends-arm-ethos-u.html", - "build-run-coreml": "backends-coreml.html", + "executorch-arm-delegate-tutorial": "backends/arm-ethos-u/tutorials/ethos-u-getting-started.html", + "build-run-coreml": "backends/coreml/coreml-overview.html", "build-run-mediatek-backend": "backends-mediatek.html", - "build-run-mps": "backends-mps.html", + "build-run-mps": "backends/mps/mps-overview.html", "build-run-qualcomm-ai-engine-direct-backend": "backends-qualcomm.html", "build-run-xtensa": "backends-cadence.html", "apple-runtime": "using-executorch-ios.html", + "backends-arm-ethos-u": "backends/arm-ethos-u/arm-ethos-u-overview.html", + "backends-arm-vgf": "backends/arm-vgf/arm-vgf-overview.html", + "backends-coreml": "backends/coreml/coreml-overview.html", + "backends-mps": "backends/mps/mps-overview.html", + "backends-xnnpack": "backends/xnnpack/xnnpack-overview.html", + "backend-delegates-xnnpack-reference": "backends/xnnpack/xnnpack-arch-internals.html", + "llm/llama-demo-android": "../using-executorch-android.html", + "tutorial-arm-ethos-u": "backends/arm-ethos-u/tutorials/ethos-u-getting-started.html", + "tutorial-arm-vgf": "backends/arm-vgf/tutorials/vgf-getting-started.html", + "visualization": "visualize.html", } # Custom directives defintions to create cards on main landing page diff --git a/docs/source/platforms-desktop.md b/docs/source/platforms-desktop.md index ba22786576f..b004d47c2ee 100644 --- a/docs/source/platforms-desktop.md +++ b/docs/source/platforms-desktop.md @@ -11,11 +11,11 @@ ExecuTorch supports desktop and laptop deployment across Linux, macOS, and Windo ### Linux - [XNNPACK (CPU)](backends/xnnpack/xnnpack-overview.md) - [OpenVINO (Intel)](build-run-openvino) -- [ARM Ethos-U (ARM64)](backends-arm-ethos-u) +- [ARM Ethos-U (ARM64)](backends/arm-ethos-u/arm-ethos-u-overview.md) ### macOS -- [CoreML (recommended)](backends-coreml) -- [MPS (Apple Silicon)](backends-mps) +- [Core ML (recommended)](backends/coreml/coreml-overview.md) +- [MPS (Apple Silicon)](backends/mps/mps-overview.md) - [XNNPACK (CPU)](backends/xnnpack/xnnpack-overview.md) ### Windows diff --git a/docs/source/platforms-embedded.md b/docs/source/platforms-embedded.md index 5ea248fc0d9..f766eddda82 100644 --- a/docs/source/platforms-embedded.md +++ b/docs/source/platforms-embedded.md @@ -10,10 +10,10 @@ ExecuTorch supports embedded devices from microcontrollers to edge devices. 
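Each value added to the `redirects` table above appears to be an HTML path relative to the page being redirected (note the `../` entries), so a quick existence check over the built output can catch stale targets. A rough sketch of such a check, assuming a hypothetical `_build/html` output directory and two sample entries; this is not the redirect-target script referenced in the test plan above:

```python
import os

# Hypothetical spot check: resolve each redirect target relative to its
# source page and confirm the file exists in the built docs.
redirects = {
    "backends-coreml": "backends/coreml/coreml-overview.html",
    "tutorials/export-to-executorch-tutorial": "../using-executorch-export.html",
}
build_dir = "_build/html"  # placeholder for the sphinx-build output directory

for source, target in redirects.items():
    resolved = os.path.normpath(os.path.join(build_dir, os.path.dirname(source), target))
    if not os.path.exists(resolved):
        print(f"missing redirect target: {source} -> {resolved}")
```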
### Microcontrollers - [Cadence Xtensa Backend](backends-cadence) -- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) +- [ARM Ethos-U NPU Backend](backends/arm-ethos-u/arm-ethos-u-overview.md) - [Custom Backend Development](backend-delegates-integration) ### Edge Devices -- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) -- [NXP eIQ Neutron Backend](backend-nxp) +- [ARM Ethos-U NPU Backend](backends/arm-ethos-u/arm-ethos-u-overview.md) +- [NXP eIQ Neutron Backend](backends/nxp/nxp-overview.md) - [Custom Hardware Integration](backend-delegates-integration) diff --git a/docs/source/tools-section.md b/docs/source/tools-section.md index c54b4933c44..6d8061dd33a 100644 --- a/docs/source/tools-section.md +++ b/docs/source/tools-section.md @@ -13,7 +13,7 @@ In this section, explore ExecuTorch's comprehensive developer tools for profilin - {doc}`model-inspector` — Model Inspector - {doc}`memory-planning-inspection` — Memory Planning Inspection - {doc}`devtools-tutorial` — Development Utilities -- {doc}`visualization` — Model Visualization +- {doc}`visualize` — Model Visualization ```{toctree} :hidden: @@ -29,4 +29,4 @@ model-debugging model-inspector memory-planning-inspection devtools-tutorial -visualization +visualize diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md index c8814b74eda..d37dfae2ef7 100644 --- a/docs/source/using-executorch-export.md +++ b/docs/source/using-executorch-export.md @@ -38,7 +38,7 @@ Commonly used hardware backends are listed below. For mobile, consider using XNN - [Vulkan (Android GPU)](backends/vulkan/vulkan-overview.md) - [Qualcomm NPU](backends-qualcomm.md) - [MediaTek NPU](backends-mediatek.md) -- [Arm Ethos-U NPU](backends-arm-ethos-u.md) +- [Arm Ethos-U NPU](backends/arm-ethos-u/arm-ethos-u-overview.md) - [Cadence DSP](backends-cadence.md) ## Model Preparation From bf8abb668737788ccf73ae3be46472252ad0c526 Mon Sep 17 00:00:00 2001 From: Aheli Poddar Date: Fri, 8 May 2026 00:45:19 +0530 Subject: [PATCH 38/58] [DOC] Fix outdated version-pinned doc URLs (#19325) ### Summary Replace version-pinned GitHub blob/tree URLs (specific commit hashes and release/0.4, release/0.6, release/1.0, release/1.2 branches) with `/main/` references across 10 doc files. Update `docs.pytorch.org/executorch/0.4/` URL to use `/stable/`. Update all line number anchors to match the current `main` branch. 
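One way to sanity-check link rewrites like these is to confirm every updated URL still resolves; a hedged sketch only, with `updated_urls.txt` as a stand-in for whatever list of rewritten links is being verified (the commands actually used are summarized in the test plan below):

```python
import urllib.error
import urllib.request

# Hypothetical spot check, not the verification run for this PR: read a list
# of rewritten URLs and confirm each one returns HTTP 200.
with open("updated_urls.txt") as f:  # placeholder file, one URL per line
    for url in (line.strip() for line in f if line.strip()):
        try:
            status = urllib.request.urlopen(url, timeout=30).status
        except urllib.error.HTTPError as err:
            status = err.code
        if status != 200:
            print(f"broken link ({status}): {url}")
```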
Fixes #19257 ### Test plan - All 29 updated URLs verified to return HTTP 200 via `curl` - All file paths confirmed to exist on `main` locally - All 21 line number references verified to point to the correct code (class/function definitions) - `lintrunner -a` passes with no lint issues cc @mergennachin @AlannaBurke @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --------- Co-authored-by: Nikhil Viswanath Sivakumar <68182521+nil-is-all@users.noreply.github.com> --- docs/source/api-life-cycle.md | 6 ++--- docs/source/backends-qualcomm.md | 4 ++-- .../arm-ethos-u-troubleshooting.md | 2 +- docs/source/backends/nxp/nxp-overview.md | 2 +- docs/source/backends/nxp/nxp-partitioner.rst | 4 ++-- .../backends/xnnpack/xnnpack-partitioner.rst | 6 ++--- docs/source/bundled-io.md | 10 ++++----- .../source/compiler-custom-compiler-passes.md | 22 +++++++++---------- docs/source/compiler-memory-planning.md | 2 +- docs/source/using-executorch-android.md | 2 +- 10 files changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/source/api-life-cycle.md b/docs/source/api-life-cycle.md index 0327f23a985..3ccaa4eddb1 100644 --- a/docs/source/api-life-cycle.md +++ b/docs/source/api-life-cycle.md @@ -104,7 +104,7 @@ decorator. Use .. warning:: in the docstrings of deprecated and experimental APIs. See -example +example usage. @@ -115,7 +115,7 @@ usage. -Use the ET_DEPRECATED annotation macro. See example usage. +Use the ET_DEPRECATED annotation macro. See example usage.

@@ -125,7 +125,7 @@ Use the ET_EXPERIMENTAL annotation macro. Start Doxygen comments with DEPRECATED: See -example +example usage.

diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index 6feddcc803c..c4465c8290d 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -608,7 +608,7 @@ Supports: For details, see: backends/qualcomm/quantizer/quantizer.py ### Operator Support -[The full operator support matrix](https://github.com/pytorch/executorch/tree/f32cdc3de6f7176d70a80228f1a60bcd45d93437/backends/qualcomm/builders#operator-support-status) is tracked and frequently updated in the ExecuTorch repository. +[The full operator support matrix](https://github.com/pytorch/executorch/tree/main/backends/qualcomm/builders#operator-support-status) is tracked and frequently updated in the ExecuTorch repository. It lists: - Supported PyTorch ops (aten.*, custom ops) @@ -633,4 +633,4 @@ If you encounter any issues while reproducing the tutorial, please file a github [issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag ### Debugging tips - - Before trying any complicated models, try out [a simple model example](https://github.com/pytorch/executorch/tree/f32cdc3de6f7176d70a80228f1a60bcd45d93437/examples/qualcomm#simple-examples-to-verify-the-backend-is-working) and see it if works one device. + - Before trying any complicated models, try out [a simple model example](https://github.com/pytorch/executorch/tree/main/examples/qualcomm#simple-examples-to-verify-the-backend-is-working) and see if it works on your device. diff --git a/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md b/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md index e6d35c0646e..9fe485e9f04 100644 --- a/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md +++ b/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md @@ -24,7 +24,7 @@ You can see how this coupling between the memory mode and runtime application i ## Using Bundled.io and ETdump -The arm_executor_runner supports [bundled-io](https://docs.pytorch.org/executorch/0.4/bundled-io.html) and [ETdump](https://docs.pytorch.org/executorch/stable/etdump.html) debugging tools. +The arm_executor_runner supports [bundled-io](https://docs.pytorch.org/executorch/stable/bundled-io.html) and [ETdump](https://docs.pytorch.org/executorch/stable/etdump.html) debugging tools. To enable bundled-io, set `-DEXECUTORCH_BUILD_DEVTOOLS=ON` when building Executorch and `-DET_BUNDLE_IO=ON` when building the executor_runner. To enable ETdump, set `-DEXECUTORCH_BUILD_ARM_ETDUMP=ON` when building Executorch and `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` when building the executor_runner. diff --git a/docs/source/backends/nxp/nxp-overview.md b/docs/source/backends/nxp/nxp-overview.md index 2bf66e28e5c..6070f86e458 100644 --- a/docs/source/backends/nxp/nxp-overview.md +++ b/docs/source/backends/nxp/nxp-overview.md @@ -39,7 +39,7 @@ $ ./examples/nxp/setup.sh To test the eIQ Neutron Backend, both AoT flow for model preparation and Runtime for execution, refer to the [Getting started with eIQ Neutron NPU ExecuTorch backend](tutorials/nxp-basic-tutorial.md) -For a quick overview how to convert a custom PyTorch model, take a look at our [example python script](https://github.com/pytorch/executorch/tree/release/1.0/examples/nxp/aot_neutron_compile.py). +For a quick overview how to convert a custom PyTorch model, take a look at our [example python script](https://github.com/pytorch/executorch/tree/main/examples/nxp/aot_neutron_compile.py). 
## Runtime Integration diff --git a/docs/source/backends/nxp/nxp-partitioner.rst b/docs/source/backends/nxp/nxp-partitioner.rst index 9aa65b1d0d9..c568959883f 100644 --- a/docs/source/backends/nxp/nxp-partitioner.rst +++ b/docs/source/backends/nxp/nxp-partitioner.rst @@ -28,7 +28,7 @@ Following fields can be set: Custom Delegation Options ------------------------- By default the Neutron backend is defensive, what means it does not delegate operators which cannot be decided statically during partitioning. But as the model author you typically have insight into the model and so you can allow opportunistic delegation for some cases. For list of options, see -`CustomDelegationOptions `_ +`CustomDelegationOptions `_ ================ Operator Support @@ -37,7 +37,7 @@ Operator Support Operators are the building blocks of the ML model. See `IRs `_ for more information on the PyTorch operator set. This section lists the Edge operators supported by the Neutron backend. -For detailed constraints of the operators see the conditions in the ``is_supported_*`` functions in the `Node converters `_ +For detailed constraints of the operators see the ``is_supported`` / ``_is_supported_in_IR`` / ``_is_supported_on_target`` checks in the `Node converters `_ .. csv-table:: Operator Support diff --git a/docs/source/backends/xnnpack/xnnpack-partitioner.rst b/docs/source/backends/xnnpack/xnnpack-partitioner.rst index a0881aa3a6a..85dc3bf9c61 100644 --- a/docs/source/backends/xnnpack/xnnpack-partitioner.rst +++ b/docs/source/backends/xnnpack/xnnpack-partitioner.rst @@ -2,10 +2,10 @@ Partitioner API =============== -The XNNPACK partitioner API allows for configuration of the model delegation to XNNPACK. Passing an ``XnnpackPartitioner`` instance with no additional parameters will run as much of the model as possible on the XNNPACK backend. This is the most common use-case. For advanced use cases, the partitioner exposes the following options via the `constructor `_: +The XNNPACK partitioner API allows for configuration of the model delegation to XNNPACK. Passing an ``XnnpackPartitioner`` instance with no additional parameters will run as much of the model as possible on the XNNPACK backend. This is the most common use-case. For advanced use cases, the partitioner exposes the following options via the `constructor `_: -- ``configs``: Control which operators are delegated to XNNPACK. By default, all available operators all delegated. See `../config/__init__.py `_ for an exhaustive list of available operator configs. -- ``config_precisions``: Filter operators by data type. By default, delegate all precisions. One or more of ``ConfigPrecisionType.FP32``, ``ConfigPrecisionType.STATIC_QUANT``, or ``ConfigPrecisionType.DYNAMIC_QUANT``. See `ConfigPrecisionType `_. +- ``configs``: Control which operators are delegated to XNNPACK. By default, all available operators are delegated. See `../config/__init__.py `_ for an exhaustive list of available operator configs. +- ``config_precisions``: Filter operators by data type. By default, delegate all precisions. One or more of ``ConfigPrecisionType.FP32``, ``ConfigPrecisionType.STATIC_QUANT``, or ``ConfigPrecisionType.DYNAMIC_QUANT``. See `ConfigPrecisionType `_. - ``per_op_mode``: If true, emit individual delegate calls for every operator. This is an advanced option intended to reduce memory overhead in some contexts at the cost of a small amount of runtime overhead. Defaults to false. - ``verbose``: If true, print additional information during lowering. 
diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md index d901710bfb7..2597b991920 100644 --- a/docs/source/bundled-io.md +++ b/docs/source/bundled-io.md @@ -199,17 +199,17 @@ This stage mainly focuses on executing the model with the bundled inputs and com ### Get ExecuTorch Program Pointer from `BundledProgram` Buffer We need the pointer to ExecuTorch program to do the execution. To unify the process of loading and executing `BundledProgram` and Program flatbuffer, we create an API for this -`executorch::bundled_program::get_program_data`. Check out an [example usage](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp#L128-L137) of this API. +`executorch::bundled_program::get_program_data`. Check out an [example usage](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp#L128-L137) of this API. ### Load Bundled Input to Method -To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `executorch::bundled_program::load_bundled_input`. Check out an [example usage](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp#L253-L259) of this API. +To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `executorch::bundled_program::load_bundled_input`. Check out an [example usage](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp#L253-L259) of this API. ### Verify the Method's Output. -We call `executorch::bundled_program::verify_method_outputs` to verify the method's output with bundled expected outputs. Check out an [example usage](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp#L301-L307) of this API. +We call `executorch::bundled_program::verify_method_outputs` to verify the method's output with bundled expected outputs. Check out an [example usage](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp#L301-L307) of this API. ### Runtime Example -Please checkout our [example runner](https://github.com/pytorch/executorch/blob/release/0.6/examples/devtools/README.md#bundledprogram) for a bundled program. You could run these commands to test with the BundledProgram binary (`.bpte`) file you generated in the previous step: +Please check out our [example runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/README.md#bundledprogram) for a bundled program. You could run these commands to test with the BundledProgram binary (`.bpte`) file you generated in the previous step: ```bash cd executorch @@ -218,7 +218,7 @@ cd executorch ``` It is expected to see no output from running the above mentioned snippet. -For a detailed example of how the runner should be like, please refer to our [example runner](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp). +For a detailed example of how the runner should be like, please refer to our [example runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp). 
### Try the Complete Workflow diff --git a/docs/source/compiler-custom-compiler-passes.md b/docs/source/compiler-custom-compiler-passes.md index ff0013a1929..aaba70b02af 100644 --- a/docs/source/compiler-custom-compiler-passes.md +++ b/docs/source/compiler-custom-compiler-passes.md @@ -25,7 +25,7 @@ Our projection on the frequency of these use cases are: For level 1 uses cases (creating one-to-X mappings, performing forwards iterations, and looking at local node information), we can utilize a helper class called -[`ExportPass`](https://github.com/pytorch/executorch/blob/d9eef24bb720804aa7b400b05241487510ae0dc2/exir/pass_base.py#L44). +[`ExportPass`](https://github.com/pytorch/executorch/blob/main/exir/pass_base.py#L655). This is an [interpreter-based](https://pytorch.org/docs/stable/fx.html#the-interpreter-pattern) way where we execute each node and recreate the graph except with @@ -35,7 +35,7 @@ metadata such as stack trace, FakeTensor values, and torch.nn.Module hierarchy are preserved and updated depending on the transformations made. To implement this pass, we can create a subclass of -[`ExportPass`](https://github.com/pytorch/executorch/blob/d9eef24bb720804aa7b400b05241487510ae0dc2/exir/pass_base.py#L44) +[`ExportPass`](https://github.com/pytorch/executorch/blob/main/exir/pass_base.py#L655) and implement the exposed functions. When called with a graph module, it will run the graph module and create a new graph containing the changes specified by the pass. This means that the graph module passed in must be runnable on CPU, @@ -171,7 +171,7 @@ class ScalarToTensorPass(ExportPass): ### Level 2 For creating many-to-one mappings, we can utilize FX's [subgraph -rewriter](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/subgraph_rewriter.py#L77). +rewriter](https://github.com/pytorch/pytorch/blob/main/torch/fx/subgraph_rewriter.py#L96). Given a `pattern`, it creates a subgraph of operators matching to the pattern, and then replaces each matched subgraph with the `replacement`. @@ -229,7 +229,7 @@ class ReplacedPatterns: ### Level 3 For the third way of creating a pass, we can utilize the most basic -[`PassBase`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/infra/pass_base.py#L22). +[`PassBase`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/infra/pass_base.py#L28). To create a pass, we can subclass this and implement the function `call` with the pass contents. Additionally, we can implement the functions `requires` and `ensures` which will be called before and after the function `call`. Note that @@ -315,7 +315,7 @@ with IR Spec, so be careful when using them. For finding subgraphs within a graph that match a specific pattern, we can utilize FX's -[`SubgraphMatcher`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/utils/matcher_utils.py#L51). +[`SubgraphMatcher`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/utils/matcher_utils.py#L63). Class Attributes: @@ -382,7 +382,7 @@ class InternalMatch(): To find the largest subgraphs of nodes that support a specific invariant, we can utilize FX's -[`CapabilityBasedPartitioner`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/infra/partitioner.py#L34C1-L34C1). +[`CapabilityBasedPartitioner`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/infra/partitioner.py#L65). 
Class Attributes @@ -399,14 +399,14 @@ Class Attributes that are allowed to be in a single node partition. The -[`OperatorSupportBase`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/operator_support.py#L28) +[`OperatorSupportBase`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/operator_support.py#L37) class is used by the partitioner to determine if a specific node in the graph belongs in the partition. This is done by overriding the `is_node_supported` function. You can -chain multiple `OperatorSuppportBase` by using -[`chain`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/operator_support.py#L150)(which +chain multiple `OperatorSupportBase` by using +[`chain`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/operator_support.py#L159)(which returns False if any of the OperatorSupportBase return False) and -[`any_chain`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/operator_support.py#L164) +[`any_chain`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/operator_support.py#L172) (which returns True if any of the OperatorSupportBase returns True). Consider the following example: @@ -440,7 +440,7 @@ not allow `call_module` nodes. ### Combined We also provide a combined helper function: -[`generate_pattern_op_partitions`](https://github.com/pytorch/executorch/blob/d9eef24bb720804aa7b400b05241487510ae0dc2/exir/backend/canonical_partitioners/pattern_op_partitioner.py#L59) +[`generate_pattern_op_partitions`](https://github.com/pytorch/executorch/blob/main/exir/backend/canonical_partitioners/pattern_op_partitioner.py#L107) Args: * `graph_module (fx.GraphModule)`: Module that we want to partition diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 5c30defada7..5a34634beec 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -82,7 +82,7 @@ program = edge_program.to_executorch( ) ``` -Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12). +Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/main/exir/memory_planning.py#L801). ## Debugging Tool diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md index 443015b47be..ef55ade68aa 100644 --- a/docs/source/using-executorch-android.md +++ b/docs/source/using-executorch-android.md @@ -82,7 +82,7 @@ Starting from 2025-04-12, you can download nightly `main` branch snapshots: * `executorch.aar`: `https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-{YYYYMMDD}/executorch.aar` * `executorch.aar.sha256sums`: `https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-{YYYYMMDD}/executorch.aar.sha256sums` * Replace `YYYYMMDD` with the actual date you want to use. -* AAR file is generated by [this workflow](https://github.com/pytorch/executorch/blob/c66b37d010c88a113560693b14dc6bd112593c11/.github/workflows/android-release-artifacts.yml#L14-L15). +* AAR file is generated by [this workflow](https://github.com/pytorch/executorch/blob/main/.github/workflows/android-release-artifacts.yml). 
For example: From d5ba603a391218f1ee1704811601d5177e525a8f Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Thu, 7 May 2026 12:54:35 -0700 Subject: [PATCH 39/58] Restore VGF skip guards and preload_deps shape (#19375) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Restores two pieces of test plumbing in backends/arm/test/ that were inadvertently removed from pytorch/executorch on GitHub and are still required for stable CI: 1. _VGF_ENABLED skip-guards in test_rewrite_conv_pass.py — without them, three VGF tests crash (rather than skip) on environments where LAVAPIPE_LIB_PATH is unset. 2. preload_deps shape in targets.bzl — the prior refactor silently dropped //executorch/kernels/quantized:custom_ops_generated_lib from every non-VGF arm test whenever runtime.is_oss or not _ENABLE_VGF. The guards are no-ops on environments that have lavapipe configured, so this is a strict safety improvement for OSS CI and a divergence fix for fbsource. This supersedes the long-stuck fix-up diff D100742931 (which will be abandoned). Differential Revision: D104267179 cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- backends/arm/test/passes/test_rewrite_conv_pass.py | 7 +++++++ backends/arm/test/targets.bzl | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/backends/arm/test/passes/test_rewrite_conv_pass.py b/backends/arm/test/passes/test_rewrite_conv_pass.py index 09176f26f28..fc8478afee5 100644 --- a/backends/arm/test/passes/test_rewrite_conv_pass.py +++ b/backends/arm/test/passes/test_rewrite_conv_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import os + import pytest import torch import torch.nn as nn @@ -34,6 +36,8 @@ from torch.export import Dim, export from torch.export.exported_program import _get_shape_env +_VGF_ENABLED = "LAVAPIPE_LIB_PATH" in os.environ + class TinyConvReluCat(nn.Module): def __init__(self, conv1_bias: bool = True) -> None: @@ -214,6 +218,7 @@ def test_rewrite_conv_tosa_FP(): pipeline.run() +@pytest.mark.skipif(not _VGF_ENABLED, reason="VGF not enabled") def test_fold_and_annotate_q_params_vgf_quant_preserves_output_qparams_on_non_fuseable_clamp() -> ( None ): @@ -228,6 +233,7 @@ def test_fold_and_annotate_q_params_vgf_quant_preserves_output_qparams_on_non_fu assert clamp.meta["output_qparams"] +@pytest.mark.skipif(not _VGF_ENABLED, reason="VGF not enabled") def test_rewrite_conv_vgf_quant_handles_non_fuseable_conv_clamp_cat_branch() -> None: exported_program = _export_quantized(TinyConvReluCat()) compile_spec = _compile_spec() @@ -239,6 +245,7 @@ def test_rewrite_conv_vgf_quant_handles_non_fuseable_conv_clamp_cat_branch() -> ) +@pytest.mark.skipif(not _VGF_ENABLED, reason="VGF not enabled") def test_rewrite_conv_vgf_quant_infers_quantized_bias_dtype_from_inputs() -> None: exported_program = _export_quantized(TinyConvReluCat(conv1_bias=False)) edge_program = to_edge( diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 6e2539cf2dc..15547b7d115 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -84,11 +84,12 @@ def define_arm_tests(): "EMULATION_LAYER_TENSOR_JSON": "$(location fbsource//third-party/arm-ml-emulation-layer/v0.9.0/src:VkLayer_Tensor_json)", "EMULATION_LAYER_GRAPH_JSON": "$(location fbsource//third-party/arm-ml-emulation-layer/v0.9.0/src:VkLayer_Graph_json)", } if _ENABLE_VGF else {}), - preload_deps = [] if runtime.is_oss or not _ENABLE_VGF else [ + preload_deps = [ "//executorch/kernels/quantized:custom_ops_generated_lib", + ] + ([] if runtime.is_oss or not _ENABLE_VGF else [ "fbsource//third-party/khronos:vulkan", "//executorch/backends/arm/runtime:vgf_backend", - ], + ]), deps = [ "//executorch/backends/arm/test:arm_tester" if runtime.is_oss else "//executorch/backends/arm/test/tester/fb:arm_tester_fb", "//executorch/backends/arm/test:conftest", From d858cd99ffe04de55822b111b02f9bc2cfb11bfc Mon Sep 17 00:00:00 2001 From: khazaei Date: Thu, 7 May 2026 13:00:50 -0700 Subject: [PATCH 40/58] Add optional offset arg to quantized_conv1d_nlc and precompute it AOT (#19344) Differential Revision: D103893688 Pull Request resolved: https://github.com/pytorch/executorch/pull/19344 --- backends/cadence/aot/functions.yaml | 2 +- backends/cadence/aot/functions_hifi.yaml | 2 +- backends/cadence/aot/ops_registrations.py | 5 +++-- .../cadence/generic/operators/op_quantized_conv1d_nlc.cpp | 1 + backends/cadence/generic/operators/op_quantized_conv1d_nlc.h | 1 + .../generic/operators/op_quantized_depthwise_conv1d_nlc.cpp | 1 + backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp | 4 ++++ .../hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp | 1 + 8 files changed, 13 insertions(+), 4 deletions(-) diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 60fda2853a3..754b781cb7b 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -399,7 +399,7 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv1d_ncl_per_tensor_out -- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] 
dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::generic::quantized_conv1d_nlc_per_tensor_out diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 3b1932d01ec..bf9ef2976a9 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -574,7 +574,7 @@ - arg_meta: null kernel_name: impl::HiFi::quantized_conv1d_ncl_per_tensor_out -- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_conv1d_nlc_per_tensor_out diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a1d3ab871e1..f3e73028169 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -263,10 +263,10 @@ def register_fake( "quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( "quantized_depthwise_conv1d_ncl.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" @@ -1305,6 +1305,7 @@ def quantized_conv1d_nlc_per_tensor_meta( output_zero_point: int, out_multiplier: int, out_shift: int, + offset: Optional[torch.Tensor] = None, ) -> torch.Tensor: torch._check(bias.dtype == torch.int32, lambda: "expected int32") # NLC format: input is [N, L, C], weight is [OC, K, IC/groups] diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp index b4e253ef366..6f42543cfc1 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp @@ -256,6 +256,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, + __ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { (void)ctx; quantized_conv1d_nlc( diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h index 7713121cf97..4f4d2877b27 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h @@ -54,6 +54,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, + const ::executorch::aten::optional& offset, Tensor& out); } // namespace native diff --git a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp index 2ae06a651d2..a8f98a76ffc 100644 --- a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp +++ b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp @@ -57,6 +57,7 @@ ::executorch::aten::Tensor& quantized_depthwise_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + ::executorch::aten::optional(), out); } diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp index d4631752495..5171c2908bc 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp @@ -238,6 +238,7 @@ void quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, + __ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { // HiFi nnlib kernels only support dilation=1. // Fall back to generic implementation for dilation > 1. 
@@ -258,6 +259,7 @@ void quantized_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + offset, out); return; } @@ -284,6 +286,7 @@ void quantized_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + offset, out); } else { xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( @@ -320,6 +323,7 @@ void quantized_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + offset, out); } else { xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( diff --git a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp index 9e7e13477ca..4299990b52a 100644 --- a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp +++ b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp @@ -206,6 +206,7 @@ void quantized_depthwise_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + ::executorch::aten::optional(), out); return; } From 180edd3da89852f6a8b0ccd90f8eb6f75e8cdec8 Mon Sep 17 00:00:00 2001 From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com> Date: Thu, 7 May 2026 13:57:10 -0700 Subject: [PATCH 41/58] Reorder slice before binary broadcast ops (#19346) Differential Revision: D103563327 Pull Request resolved: https://github.com/pytorch/executorch/pull/19346 --- backends/cadence/aot/reorder_ops.py | 109 +++++++++++++++--- .../aot/tests/test_reorder_ops_passes.py | 97 ++++++++++++++++ 2 files changed, 188 insertions(+), 18 deletions(-) diff --git a/backends/cadence/aot/reorder_ops.py b/backends/cadence/aot/reorder_ops.py index 5a9b76b473a..2ca766316f3 100644 --- a/backends/cadence/aot/reorder_ops.py +++ b/backends/cadence/aot/reorder_ops.py @@ -721,15 +721,16 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: @register_cadence_pass(CadencePassAttribute(opt_level=1)) class PropagateSlice(RemoveOrReplacePassInterface): - """Propagate slice_copy before unary element-wise ops when the cost - model indicates it reduces total data movement. + """Propagate slice_copy before element-wise ops when the cost model + indicates it reduces total data movement. Supported ops (extensible via dispatch table): - - quantize_per_tensor: element-wise, slice passes through unchanged - - dequantize_per_tensor: element-wise, slice passes through unchanged + - quantize_per_tensor: unary element-wise + - dequantize_per_tensor: unary element-wise + - add.Tensor: binary with broadcast — slices non-broadcasting inputs + - mul.Tensor: binary with broadcast — slices non-broadcasting inputs - Handles any slice dim and any step size. Runs in the iterative pass - loop — chains are handled by repeated application. + Handles any slice dim and any step size. 
""" def __init__(self) -> None: @@ -740,16 +741,28 @@ def __init__(self) -> None: exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, exir_ops.edge.cadence.dequantize_per_tensor.default, ] + binary_targets = [ + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.mul.Tensor, + ] self._dispatch: dict[ EdgeOpOverload, tuple[ Callable[[torch.fx.Node, torch.fx.Node], bool], Callable[[torch.fx.Node, torch.fx.Node], bool], ], - ] = { - t: (self._should_swap_elementwise, self._swap_elementwise_slice) - for t in elementwise_targets - } + ] = {} + for t in elementwise_targets: + self._dispatch[t] = ( + self._should_swap_elementwise, + self._swap_elementwise_slice, + ) + + for t in binary_targets: + self._dispatch[t] = ( + self._should_swap_binary_elementwise, + self._swap_binary_elementwise_slice, + ) @property def targets(self) -> list[EdgeOpOverload]: @@ -765,19 +778,21 @@ def _should_swap_elementwise( def _swap_elementwise_slice( self, op_node: torch.fx.Node, slice_node: torch.fx.Node ) -> bool: - op_input = op_node.args[0] - assert isinstance(op_input, torch.fx.Node) + op_input = get_arg(op_node, "input", torch.fx.Node) graph = slice_node.graph - slice_args = slice_node.args[1:] + slice_dim = get_arg(slice_node, "dim", int) + slice_start = get_arg(slice_node, "start") + slice_end = get_arg(slice_node, "end") + slice_step = get_arg(slice_node, "step", int) with graph.inserting_before(op_node): new_slice = graph.call_function( exir_ops.edge.aten.slice_copy.Tensor, - args=(op_input, *slice_args), + args=(op_input, slice_dim, slice_start, slice_end, slice_step), ) new_slice.meta["val"] = exir_ops.edge.aten.slice_copy.Tensor( - op_input.meta["val"], *slice_args + op_input.meta["val"], slice_dim, slice_start, slice_end, slice_step ) new_args = list(op_node.args) @@ -805,10 +820,68 @@ def _swap_elementwise_slice( graph.erase_node(op_node) return True - def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: - parent = node.args[0] - if not isinstance(parent, torch.fx.Node): + def _should_swap_binary_elementwise( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + lhs, rhs = op_node.args[0], op_node.args[1] + assert isinstance(lhs, torch.fx.Node) and isinstance(rhs, torch.fx.Node) + if lhs.meta["val"].shape == rhs.meta["val"].shape: return False + full_size = prod(op_node.meta["val"].shape) + sliced_size = prod(slice_node.meta["val"].shape) + return sliced_size < full_size + + def _swap_binary_elementwise_slice( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + lhs, rhs = op_node.args[0], op_node.args[1] + assert isinstance(lhs, torch.fx.Node) and isinstance(rhs, torch.fx.Node) + graph = slice_node.graph + + slice_dim = get_arg(slice_node, "dim", int) + slice_start = get_arg(slice_node, "start") + slice_end = get_arg(slice_node, "end") + slice_step = get_arg(slice_node, "step", int) + + output_shape = op_node.meta["val"].shape + + new_args = list(op_node.args) + with graph.inserting_before(op_node): + for i, inp in enumerate([lhs, rhs]): + if inp.meta["val"].shape[slice_dim] == output_shape[slice_dim]: + new_slice = graph.call_function( + exir_ops.edge.aten.slice_copy.Tensor, + args=(inp, slice_dim, slice_start, slice_end, slice_step), + ) + new_slice.meta["val"] = exir_ops.edge.aten.slice_copy.Tensor( + inp.meta["val"], slice_dim, slice_start, slice_end, slice_step + ) + new_args[i] = new_slice + + target = cast(EdgeOpOverload, op_node.target) + new_op = graph.call_function( + target, + args=tuple(new_args), + kwargs=op_node.kwargs, + ) 
+ new_op.meta["val"] = target( + *[ + a.meta["val"] if isinstance(a, torch.fx.Node) else a + for a in new_args + ], + **{ + k: v.meta["val"] if isinstance(v, torch.fx.Node) else v + for k, v in op_node.kwargs.items() + }, + ) + + slice_node.replace_all_uses_with(new_op) + graph.erase_node(slice_node) + graph.erase_node(op_node) + return True + + def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: + parent = get_arg(node, "input", torch.fx.Node) if len(parent.users) != 1: return False if not isinstance(parent.target, EdgeOpOverload): diff --git a/backends/cadence/aot/tests/test_reorder_ops_passes.py b/backends/cadence/aot/tests/test_reorder_ops_passes.py index cf3a6840179..ea8943df8e8 100644 --- a/backends/cadence/aot/tests/test_reorder_ops_passes.py +++ b/backends/cadence/aot/tests/test_reorder_ops_passes.py @@ -927,3 +927,100 @@ def test_unsupported_parent_not_swapped(self) -> None: result = PropagateSlice().call(gm) self.assertFalse(result.modified) + + def test_swap_broadcast_mul_slice_on_broadcast_dim(self) -> None: + """[1,60,1,1] * [4,1,1,1] → [4,60,1,1] → slice(dim=0, step=2) + Only the [4,1,1,1] input should be sliced.""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(1, 60, 1, 1)) + b = builder.placeholder("b", torch.randn(4, 1, 1, 1)) + mul = builder.call_operator(exir_ops.edge.aten.mul.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(mul, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "b") + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [2, 1, 1, 1]) + + mul_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.mul.Tensor + ) + self.assertEqual(len(mul_nodes), 1) + self.assertEqual(list(mul_nodes[0].meta["val"].shape), [2, 60, 1, 1]) + + def test_swap_broadcast_add_lhs_broadcasts(self) -> None: + """[1,60,4,4] + [4,60,4,4] → [4,60,4,4] → slice(dim=0, step=2) + Only the [4,60,4,4] (rhs) should be sliced.""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(1, 60, 4, 4)) + b = builder.placeholder("b", torch.randn(4, 60, 4, 4)) + add = builder.call_operator(exir_ops.edge.aten.add.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(add, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "b") + + def test_swap_broadcast_mul_slice_on_non_broadcast_dim(self) -> None: + """[4,60,1,1] * [4,1,1,1] → [4,60,1,1] → slice(dim=1, start=0, end=30) + Only the [4,60,1,1] (lhs) should be sliced since rhs has dim1=1.""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(4, 60, 1, 1)) + b = builder.placeholder("b", torch.randn(4, 1, 1, 1)) + mul = builder.call_operator(exir_ops.edge.aten.mul.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(mul, 1, 0, 30, 1), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = 
PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "a") + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [4, 30, 1, 1]) + + def test_no_swap_binary_same_shape(self) -> None: + """Same-shape binary ops are not swapped (no broadcast).""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(4, 60, 4, 4)) + b = builder.placeholder("b", torch.randn(4, 60, 4, 4)) + add = builder.call_operator(exir_ops.edge.aten.add.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(add, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertFalse(result.modified) From fa857bd30eaaf6bdfe7744ffb29125ad04dbc4dc Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Thu, 7 May 2026 14:56:20 -0700 Subject: [PATCH 42/58] ci: fix macOS PyTorch wheel cache key for branch-ref pins (#19350) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary `install_pytorch_and_domains` constructs the cached-wheel URL using `${TORCH_VERSION:0:7}`, which gives "release" when the pin is a branch ref like `release/2.11`. The upload code uses the basename of `dist/*.whl`, which is whatever PyTorch's setup.py wrote — always the resolved commit hash (e.g. `+git70d99e9`). The two never match, so every macOS run misses the cache and does a ~30-minute source build even though the wheel for the current pin's HEAD is already in S3. Resolve the hash via `git rev-parse --short=7 HEAD` after `git checkout`, so download and upload agree. Commit-hash pins are unchanged (the first 7 chars already equaled the resolved hash). Authored with Claude Code. ### Test plan CI --- .ci/scripts/utils.sh | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index 86e54b478ef..0184ba285dd 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -105,7 +105,11 @@ install_pytorch_and_domains() { fi local python_version=$(python -c 'import platform; v=platform.python_version_tuple(); print(f"{v[0]}{v[1]}")') local torch_release=$(cat version.txt) - local torch_short_hash=${TORCH_VERSION:0:7} + # Download key must match the upload key below (basename of dist/*.whl, + # which always carries setup.py's resolved +gitHASH). Branch-ref pins + # like `release/2.11` would otherwise produce `+gitrelease` here and + # never hit the cache. + local torch_short_hash=$(git rev-parse --short=7 HEAD) local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}" local torch_wheel_name="torch-${torch_release}%2Bgit${torch_short_hash}-cp${python_version}-cp${python_version}-${platform:-}.whl" @@ -127,6 +131,30 @@ install_pytorch_and_domains() { USE_DISTRIBUTED=1 python setup.py bdist_wheel pip install "$(echo dist/*.whl)" + # Invariant: the basename setup.py just produced must match the cache + # URL we'd reconstruct on the next run. If they diverge (someone edits + # torch_wheel_name above, or PyTorch renames its wheels), the cache + # will silently miss and every macOS run will fall back to a ~30-min + # source build. Fail loudly so the regression is caught immediately. 
+ shopt -s nullglob + local built_wheels=(dist/*.whl) + shopt -u nullglob + if [[ ${#built_wheels[@]} -ne 1 ]]; then + echo "ERROR: expected exactly 1 wheel in dist/, found ${#built_wheels[@]}" >&2 + exit 1 + fi + local built_wheel_name + built_wheel_name=$(basename "${built_wheels[0]}") + local expected_wheel_name="${torch_wheel_name//\%2B/+}" + if [[ "${built_wheel_name}" != "${expected_wheel_name}" ]]; then + echo "ERROR: built torch wheel name does not match cache URL key:" >&2 + echo " built: ${built_wheel_name}" >&2 + echo " expected: ${expected_wheel_name}" >&2 + echo "Fix torch_wheel_name construction in install_pytorch_and_domains" >&2 + echo "in .ci/scripts/utils.sh" >&2 + exit 1 + fi + # Only AWS runners have access to S3 if command -v aws && [[ -z "${GITHUB_RUNNER:-}" ]]; then for wheel_path in dist/*.whl; do From 91aef5732edd2eca416a00f75b38e8e25e6fce52 Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Thu, 7 May 2026 16:30:35 -0700 Subject: [PATCH 43/58] Update target.bzl to remove a comment (#19380) This is an interim fix, as the follow up diffs will enable this flag to sync internal states cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- backends/arm/test/targets.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 15547b7d115..771de9575a5 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -3,7 +3,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") load("@bazel_skylib//lib:paths.bzl", "paths") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -_ENABLE_VGF = False # Disabled: memfd_create blocked by seccomp on Sandcastle causes segfaults before Python pre-flight check can run +_ENABLE_VGF = False def define_arm_tests(): # TODO [fbonly] Add more tests From ada8e3580b0a2d8efb6865e07977bfa35dd901cd Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Fri, 8 May 2026 03:46:09 -0700 Subject: [PATCH 44/58] Enable VGF tests and add Vulkan format compatibility shim (#19383) Differential Revision: D102431804 Pull Request resolved: https://github.com/pytorch/executorch/pull/19383 --- backends/arm/test/targets.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 771de9575a5..7baf37e6047 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -3,7 +3,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") load("@bazel_skylib//lib:paths.bzl", "paths") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -_ENABLE_VGF = False +_ENABLE_VGF = True def define_arm_tests(): # TODO [fbonly] Add more tests From 3185f029b7b4668ee3aef8781688158653d33ff9 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Fri, 8 May 2026 10:17:36 -0700 Subject: [PATCH 45/58] ci: install pinned torch before requirements-ci.txt on macOS (#19342) ### Summary `setup-macos.sh` runs `install_pip_dependencies` before `install_pytorch_and_domains`. That order lets torchsr's transitive torch dep get pulled from PyPI before the pinned source-built or S3-cached torch lands; `install_pytorch_and_domains` then overwrites the wrong-source torch. The overwrite is small in the current state, but the same race forced --no-cache-dir`. 
That cascaded reinstalls of every torch transitive dep, pushed the macOS unittest past its 60-minute timeout, until #19334 reverted it. Reorder so `install_pip_dependencies` runs after `install_pytorch_and_domains`. With torch already at the pinned version, torchsr's dep is satisfied and pip skips re-downloading. Removes the structural footgun so any future re-land of centralized torch install does not need `--force-reinstall`. Also rewrite two adjacent comments: - Add a comment above `install_pytorch_and_domains` recording the ordering rationale. - Drop the now-misleading "We build PyTorch from source here" comment that drifted above `install_executorch`; replace with one explaining why `install_executorch --use-pt-pinned-commit` is correct (torch is already installed). Authored with Claude Code. ### Test plan CI --- .ci/scripts/setup-macos.sh | 12 +++++++++--- .ci/scripts/utils.sh | 4 ++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 4b43a730710..6bd26e0b171 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -116,7 +116,6 @@ setup_macos_env_variables # buck2 atm install_buck brew install libomp -install_pip_dependencies # TODO(huydhn): Unlike our self-hosted runner, GitHub runner doesn't have access # to our infra, so compiler caching needs to be setup differently using GitHub @@ -125,10 +124,17 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then install_sccache fi +# Install pinned torch before requirements-ci.txt so torchsr's transitive +# torch dep is satisfied by the existing install and pip does not pull a +# separate copy from PyPI. sccache is initialized above so source-build +# cache misses still hit the cache. print_cmake_info install_pytorch_and_domains -# We build PyTorch from source here instead of using nightly. This allows CI to test against -# the pinned commit from PyTorch + +install_pip_dependencies + +# install_executorch's --use-pt-pinned-commit skips re-installing torch since +# install_pytorch_and_domains already installed the pinned build above. if [[ "$EDITABLE" == "true" ]]; then install_executorch --use-pt-pinned-commit --editable else diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index 0184ba285dd..b291374d667 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -127,6 +127,10 @@ install_pytorch_and_domains() { if [[ "${torch_wheel_not_found}" == "1" ]]; then echo "No cached wheel found, continue with building PyTorch at ${TORCH_VERSION}" + # Install PyTorch's own build-time deps so the source build does not + # silently inherit them from whatever else happens to be in the env + # (e.g. executorch's requirements-ci.txt). 
+ pip install -r requirements-build.txt git submodule update --init --recursive USE_DISTRIBUTED=1 python setup.py bdist_wheel pip install "$(echo dist/*.whl)" From 6df43e18ce9be711ddc554ff8b8c67afd100e3bf Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Fri, 8 May 2026 10:47:23 -0700 Subject: [PATCH 46/58] Skip test_mimi in internal CI Differential Revision: D104304516 Pull Request resolved: https://github.com/pytorch/executorch/pull/19379 --- examples/models/moshi/mimi/BUCK | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/models/moshi/mimi/BUCK b/examples/models/moshi/mimi/BUCK index 9774655f951..1d52649166f 100644 --- a/examples/models/moshi/mimi/BUCK +++ b/examples/models/moshi/mimi/BUCK @@ -1,5 +1,4 @@ load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") -load("@fbsource//tools/target_determinator/macros:fbcode_ci_helpers.bzl", "fbcode_ci") load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") @@ -8,7 +7,9 @@ oncall("executorch") fbcode_target(_kind = runtime.python_test, name = "test_mimi", srcs = ["test_mimi.py"], - labels = ci.labels(fbcode_ci.use_opt_instead_of_dev()), + # Skipped in fbcode CI: setUpClass downloads the model from HuggingFace, + # which requires network access unavailable in this CI environment. Still runs in OSS CI. + labels = [ci.skip_target()], deps = [ "fbsource//third-party/pypi/huggingface-hub:huggingface-hub", "fbsource//third-party/pypi/moshi:moshi", From 1284d542cd6c439ad4d35bdd05ac7271f985a81f Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Fri, 8 May 2026 10:51:59 -0700 Subject: [PATCH 47/58] test_passes: wire QNN SDK into runtime env Differential Revision: D104309789 Pull Request resolved: https://github.com/pytorch/executorch/pull/19381 --- backends/qualcomm/tests/BUCK | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backends/qualcomm/tests/BUCK b/backends/qualcomm/tests/BUCK index 25834c7e616..c73a8f89536 100644 --- a/backends/qualcomm/tests/BUCK +++ b/backends/qualcomm/tests/BUCK @@ -64,6 +64,10 @@ fbcode_target(_kind = runtime.python_test, srcs = [ "test_passes.py", ], + env = {} if runtime.is_oss else { + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), + "QNN_SDK_ROOT": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:__dir__)".format(get_qnn_library_version()), + }, deps = [ ":models", "fbsource//third-party/pypi/expecttest:expecttest", # @manual @@ -77,6 +81,7 @@ fbcode_target(_kind = runtime.python_test, "//executorch/backends/qualcomm/builders:builders", ] + ([] if runtime.is_oss else [ # These deps fail in OSS: keep_gpu_sections kwarg breaks TARGETS evaluation + "//executorch/devtools:lib", "//executorch/examples/models/llama:transformer_modules", "//executorch/examples/qualcomm/oss_scripts/llama:masking_utils", "//executorch/examples/qualcomm/oss_scripts/llama:static_llama", From c56493676fedad34dd25965c72a41a6d023e87f5 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Fri, 8 May 2026 10:59:48 -0700 Subject: [PATCH 48/58] Use gpu_cpp_unittest for slim CUDA guard tests Differential Revision: D104297130 Pull Request resolved: https://github.com/pytorch/executorch/pull/19378 --- backends/aoti/slim/cuda/test/targets.bzl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/aoti/slim/cuda/test/targets.bzl b/backends/aoti/slim/cuda/test/targets.bzl index 
bf38b599637..079f769a509 100644 --- a/backends/aoti/slim/cuda/test/targets.bzl +++ b/backends/aoti/slim/cuda/test/targets.bzl @@ -1,8 +1,8 @@ -load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load("@fbcode_macros//build_defs:gpu_cpp_unittest.bzl", "gpu_cpp_unittest") load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") def cuda_slim_cpp_unittest(name): - cpp_unittest( + gpu_cpp_unittest( name = "test_" + name, srcs = [ "test_" + name + ".cpp", @@ -16,6 +16,7 @@ def cuda_slim_cpp_unittest(name): external_deps = [ ("cuda", None, "cuda-lazy"), ], + hip_compatible = False, keep_gpu_sections = True, remote_execution = re_test_utils.remote_execution( platform = "gpu-remote-execution", From b57ac0364e01bc26ce5857b71ac73a9c095c6fe6 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Fri, 8 May 2026 12:01:00 -0700 Subject: [PATCH 49/58] Re-land XNNPACK update (#19237) ### Summary It should work now that https://github.com/google/pthreadpool/pull/92 is merged. --- .ci/scripts/test_lora.sh | 3 ++- backends/xnnpack/CMakeLists.txt | 4 ++++ backends/xnnpack/third-party/XNNPACK | 2 +- backends/xnnpack/third-party/cpuinfo | 2 +- backends/xnnpack/third-party/pthreadpool | 2 +- backends/xnnpack/third-party/xnnpack.buck.bzl | 14 +++++++++++++- 6 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.ci/scripts/test_lora.sh b/.ci/scripts/test_lora.sh index 45e4c68745a..102347a08fd 100644 --- a/.ci/scripts/test_lora.sh +++ b/.ci/scripts/test_lora.sh @@ -159,7 +159,8 @@ Okay, so I need to calculate 15% of 80." EXPECTED_QUANT_LORA_PREFIX=" <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant To calculate 15% of 80, we can multiply 80 by 15/100. -So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12. +80 * 15/100 = 12. +So, 15% of 80 is 12. 
#### 12 The answer is: 12<|im_end|>" EXPECTED_QUANT_LORA_ALTERNATE_PREFIX=" diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 625e3d2523f..1b46c993b17 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -154,12 +154,16 @@ install( xnnpack-normalization xnnpack-operators xnnpack-operator-run + xnnpack-operator-delete xnnpack-operator-utils xnnpack-pack-lh xnnpack-packing xnnpack-sanitizers xnnpack-subgraph xnnpack-datatype + xnnpack-fingerprint-id + xnnpack-fingerprint-cache + xnnpack-fingerprint-check xnnpack-reference-ukernels xnnpack-logging EXPORT ExecuTorchTargets diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 3131afead79..1adaa7c709d 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 3131afead790c5c69a9aa12273dfc40399789ad7 +Subproject commit 1adaa7c709d4839d29e1f219cb962b01c9e6a905 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index 8a9210069b5..f9a03241f8c 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit 8a9210069b5a37dd89ed118a783945502a30a4ae +Subproject commit f9a03241f8c3d4ed0c9728f5d70bff873d43d4e0 diff --git a/backends/xnnpack/third-party/pthreadpool b/backends/xnnpack/third-party/pthreadpool index c2ba5c50bb5..a56dcd79c69 160000 --- a/backends/xnnpack/third-party/pthreadpool +++ b/backends/xnnpack/third-party/pthreadpool @@ -1 +1 @@ -Subproject commit c2ba5c50bb58d1397b693740cf75fad836a0d1bf +Subproject commit a56dcd79c699366e7ac6466792c3025883ff7704 diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index 14520b07664..ac861435af8 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -41,7 +41,10 @@ def define_xnnpack(): "XNNPACK/src/memory.c", "XNNPACK/src/mutex.c", "XNNPACK/src/normalization.c", + "XNNPACK/src/operator-delete.c", "XNNPACK/src/operator-utils.c", + "XNNPACK/src/operators/fingerprint_cache.c", + "XNNPACK/src/operators/fingerprint_id.c", "XNNPACK/src/reference/packing.cc", ], headers = get_xnnpack_headers(), @@ -1039,7 +1042,7 @@ def define_xnnpack(): native.cxx_library( name = "ukernels_avx512vnnigfni", srcs = select({ - "DEFAULT": prod_srcs_for_arch_wrapper("avx512vnnifgni"), + "DEFAULT": prod_srcs_for_arch_wrapper("avx512vnnigfni"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), @@ -1068,6 +1071,7 @@ def define_xnnpack(): "-mavxvnni", "-mf16c", "-mfma", + "-mgfni", ] # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
@@ -1172,6 +1176,14 @@ def define_xnnpack(): # "-DXNN_ENABLE_DWCONV_MULTIPLASS=0", "-DXNN_ENABLE_ARM_I8MM=1", "-DXNN_ENABLE_ARM_FP16_VECTOR=1", + "-DXNN_ENABLE_SSE=1", + "-DXNN_ENABLE_SSE2=1", + "-DXNN_ENABLE_SSSE3=1", + "-DXNN_ENABLE_SSE41=1", + "-DXNN_ENABLE_AVX=1", + "-DXNN_ENABLE_F16C=1", + "-DXNN_ENABLE_FMA3=1", + "-DXNN_ENABLE_AVX2=1", "-DXNN_ENABLE_AVX512F=1", "-DXNN_ENABLE_AVX512SKX=1", "-DXNN_ENABLE_AVX512VNNI=1", From e969a980c7cac9033fcbc7d90c8fb21cb0059765 Mon Sep 17 00:00:00 2001 From: Lidang Jiang <119769478+Lidang-Jiang@users.noreply.github.com> Date: Sat, 9 May 2026 03:28:43 +0800 Subject: [PATCH 50/58] Fix torch.split fails in to_edge with alias annotations (#18700) Fixes #11723 ## Summary `torch.split` fails with `RuntimeError: Found a custom (non-ATen) operator whose output has alias annotations` when used with `to_edge_transform_and_lower` and a partitioner that requests op preservation. **Root cause**: `_remove_invalid_ops_for_not_decompose` relies on `torchgen`'s `aliased_return_names()` to detect ops with aliased returns. However, for ops returning lists of aliased tensors (e.g., `split.Tensor` returns `Tensor(a)[]`), `aliased_return_names()` returns `[None]`, failing to detect the alias annotation. This lets `split.Tensor` pass through into the `EDGE_DO_NOT_DECOMP` namespace, where functionalization fails. **Fix**: Add a fallback check using `op._schema.returns` directly, which correctly reports `alias_info` on list return types. This also fixes the same latent issue for `chunk.default` and `tensor_split.sections`. ## Test plan - Added `test_remove_invalid_ops_filters_aliased_list_returns` regression test - Run: `pytest exir/tests/test_passes.py::TestPasses::test_remove_invalid_ops_filters_aliased_list_returns -xvs` - Verified existing split-related test still passes: `test_to_out_variant_singleon_tensor_list` - Verified existing broken ops test still passes: `test_compile_fix_broken_ops`

Before fix ``` ==================== BEFORE FIX ==================== RESULT: FAILED RuntimeError: Found a custom (non-ATen) operator whose output has alias annotations: EDGE_DO_NOT_DECOMP::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]. We only support functionalizing operators whose outputs do not have alias annotations (e.g. 'Tensor(a)' is a Tensor with an alias annotation whereas 'Tensor' is a Tensor without. The '(a)' is the alias annotation). The alias annotation specifies that the output Tensor shares storage with an input that has the same annotation. Please check if (1) the output needs to be an output (if not, don't return it), (2) if the output doesn't share storage with any inputs, then delete the alias annotation. (3) if the output indeed shares storage with an input, then add a .clone() before returning it to prevent storage sharing and then delete the alias annotation. Otherwise, please file an issue on GitHub. While executing %split : [num_users=3] = call_function[target=torch.ops.EDGE_DO_NOT_DECOMP.split.Tensor](args = (%x, 2), kwargs = {}) Original traceback: None Use tlparse to see full graph. (https://github.com/pytorch/tlparse?tab=readme-ov-file#tlparse-parse-structured-pt2-logs) ```
After fix ``` ==================== AFTER FIX ==================== WARNING:root:Op aten.split.Tensor was requested for preservation by partitioner. This request is ignored because it aliases output. Test 1: to_edge (no partitioner) RESULT: SUCCESS - outputs match Test 2: to_edge_transform_and_lower with split.Tensor preservation RESULT: SUCCESS - split.Tensor correctly filtered from EDGE_DO_NOT_DECOMP (AttributeError from dummy partitioner partition(), not from split bug) Test 3: _remove_invalid_ops_for_not_decompose filter check aten::split.Tensor -> FILTERED (correct) aten::chunk -> FILTERED (correct) aten::tensor_split.sections -> FILTERED (correct) ```
Unit test output ``` $ pytest exir/tests/test_passes.py::TestPasses::test_remove_invalid_ops_filters_aliased_list_returns -xvs ============================= test session starts ============================== platform linux -- Python 3.12.12, pytest-8.4.2 collected 1 item exir/tests/test_passes.py::TestPasses::test_remove_invalid_ops_filters_aliased_list_returns PASSED ============================== 1 passed in 6.83s =============================== $ pytest exir/tests/test_passes.py::TestPasses::test_to_out_variant_singleon_tensor_list -xvs PASSED $ pytest exir/tests/test_passes.py::TestPasses::test_compile_fix_broken_ops -xvs PASSED ```
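For readers who want to see the schema-level difference the fallback relies on without running the full test suite, here is a minimal standalone sketch (not part of the patch; it assumes only a stock PyTorch install) that inspects `op._schema.returns` the same way the new check does:

```python
import torch

# split.Tensor returns Tensor(a)[] (a list of aliased views), while
# linear.default returns a plain Tensor with no alias annotation.
for op in (torch.ops.aten.split.Tensor, torch.ops.aten.linear.default):
    schema = op._schema
    has_aliased_return = any(ret.alias_info is not None for ret in schema.returns)
    print(f"{schema.name}: aliased return -> {has_aliased_return}")
```

Per the fix above, any op for which this reports an aliased return is dropped from the preserve list (with a warning) instead of being promoted into `EDGE_DO_NOT_DECOMP`.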
This PR was authored with the assistance of Claude. --------- Signed-off-by: Lidang-Jiang --- exir/program/_program.py | 12 +++++++++++- exir/tests/test_passes.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/exir/program/_program.py b/exir/program/_program.py index c68d0eed945..950e203c86c 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1081,7 +1081,7 @@ def _sanity_check_graph_for_non_decomp_ops( logging.warning(warning_str) -def _remove_invalid_ops_for_not_decompose( +def _remove_invalid_ops_for_not_decompose( # noqa: C901 preserve_ops: List[torch._ops.OpOverload], ) -> List[torch._ops.OpOverload]: _logged_warnings = set() @@ -1124,6 +1124,16 @@ def keep(op): ) return False + # Fallback: torchgen does not detect alias annotations on ops + # returning lists of aliased tensors (e.g. split.Tensor returns + # Tensor(a)[]). Check op._schema.returns directly. + for ret in schema.returns: + if ret.alias_info is not None: + log_warning( + f"Op {op} was requested for preservation by partitioner. This request is ignored because it aliases output." + ) + return False + # Explicit block list of ops that don't work if asked for # preservation if op in [ diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index f683384f8f9..8a084ba491a 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -940,6 +940,43 @@ def body(i, h, h_accum): torch.allclose(prog.exported_program().module()(inp), model(inp)) ) + def test_remove_invalid_ops_filters_aliased_list_returns(self) -> None: + """Verify _remove_invalid_ops_for_not_decompose filters ops that return + aliased tensor lists (e.g. split, chunk) even when torchgen's + aliased_return_names() fails to detect them. Regression test for + https://github.com/pytorch/executorch/issues/11723 + """ + from executorch.exir.program._program import ( + _remove_invalid_ops_for_not_decompose, + ) + + # These ops return Tensor(a)[] — a list of aliased views. + # torchgen's aliased_return_names() misses the alias annotation on + # list returns, so the fallback check on op._schema.returns is needed. + aliased_list_ops = [ + torch.ops.aten.split.Tensor, + torch.ops.aten.chunk.default, + torch.ops.aten.tensor_split.sections, + torch.ops.aten.split_with_sizes.default, + ] + for op in aliased_list_ops: + result = _remove_invalid_ops_for_not_decompose([op]) + self.assertNotIn( + op, + result, + f"{op} should be filtered out because it returns aliased tensors", + ) + + # Non-aliased ops should be preserved. 
+ preserved_ops = [torch.ops.aten.linear.default] + for op in preserved_ops: + result = _remove_invalid_ops_for_not_decompose([op]) + self.assertIn( + op, + result, + f"{op} should be preserved because it has no aliased returns", + ) + def test_convert_symb_ops(self) -> None: class Foo(torch.nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: From 7e1643325f21a634ba51854eb6c3214703ecfcdb Mon Sep 17 00:00:00 2001 From: Nitin Jain Date: Fri, 8 May 2026 13:24:34 -0700 Subject: [PATCH 51/58] Add a16w8 reduce_sum FVP coverage for Ethos-U85 (#19319) Differential Revision: D103667823 Pull Request resolved: https://github.com/pytorch/executorch/pull/19319 --- backends/arm/test/ops/test_sum.py | 72 ++++++++++++++++++++++++++++++- backends/arm/test/targets.bzl | 1 + 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 1075055c4f0..d727eb0408a 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -5,6 +5,8 @@ from typing import Callable, Tuple +import pytest + import torch from executorch.backends.arm.test import common @@ -96,7 +98,16 @@ def test_sum_dim_intlist_tosa_INT(test_data: input_t1): pipeline.run() -@common.parametrize("test_data", Sum.test_parameters) +# dim=None cases skipped: executorch.devtools.bundled_program.config rejects +# None as a model input (cannot be serialized into the bundled program). +_DIM_NONE_SKIP_REASON = "bundled_program cannot serialize None as a model input" +_dim_none_skips = { + "dim_None": _DIM_NONE_SKIP_REASON, + "dim_None_4d_tensor": _DIM_NONE_SKIP_REASON, +} + + +@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips) @common.XfailIfNoCorstone300 def test_sum_u55_INT_1_0(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -108,7 +119,7 @@ def test_sum_u55_INT_1_0(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", Sum.test_parameters) +@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips) @common.XfailIfNoCorstone320 def test_sum_u85_INT_1_0(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( @@ -220,3 +231,60 @@ def test_sum_tosa_FP(test_data: Callable[[], input_t2]): def test_sum_tosa_INT(test_data: Callable[[], input_t2]): pipeline = TosaPipelineINT[input_t1](SumDefault(), test_data(), SumDefault.aten_op) pipeline.run() + + +# a16w8 (int16 IO + int8 weights) coverage for sum.dim_IntList. Surfaces the +# Ethos-U85 int16 ReduceSum silent-zero issue tracked upstream at +# https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/issues/23. + + +class SumLastDim(torch.nn.Module): + """Reduce the last dim with keepdim=True.""" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.sum(dim=-1, keepdim=True) + + +a16w8_sum_test_parameters = { + "rank1_16": lambda: (torch.rand(16),), + "rank3_8x1x16": lambda: (torch.rand(8, 1, 16),), + "rank3_4x4x16": lambda: (torch.rand(4, 4, 16),), +} + + +@common.parametrize("test_data", a16w8_sum_test_parameters) +@common.XfailIfNoCorstone300 +def test_sum_dim_intlist_a16w8_u55_INT(test_data: Callable[[], input_t1]): + pipeline = EthosU55PipelineINT[input_t1]( + SumLastDim(), + test_data(), + aten_op, + exir_ops=[], + a16w8_quantization=True, + symmetric_io_quantization=True, + qtol=128, + epsilon=2**-16, + ) + pipeline.run() + + +# All cases hit upstream Vela issue #23 (linked above). 
strict=False so the +# test target stays green both on stock Vela 5.0 (cases XFAIL) and once the +# Vela fix is in tree (cases XPASS). +@common.parametrize("test_data", a16w8_sum_test_parameters) +@common.XfailIfNoCorstone320 +@pytest.mark.xfail( + reason="Ethos-U85 int16 ReduceSum returns zero (vela#23)", strict=False +) +def test_sum_dim_intlist_a16w8_u85_INT(test_data: Callable[[], input_t1]): + pipeline = EthosU85PipelineINT[input_t1]( + SumLastDim(), + test_data(), + aten_op, + exir_ops=[], + a16w8_quantization=True, + symmetric_io_quantization=True, + qtol=128, + epsilon=2**-16, + ) + pipeline.run() diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 7baf37e6047..6a39d1fe5c1 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -30,6 +30,7 @@ def define_arm_tests(): "ops/test_slice.py", "ops/test_sigmoid.py", "ops/test_sub.py", + "ops/test_sum.py", "ops/test_tanh.py", "ops/test_view.py", "ops/test_cos.py", From b3baac5e30d8c5dad4e684ecc33f9d27aa58beb0 Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Fri, 8 May 2026 15:06:19 -0700 Subject: [PATCH 52/58] Replace external_deps with deps for prettytable (#19401) << DO NOT EDIT BELOW THIS LINE >> @diff-train-skip-merge --- profiler/BUCK | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/profiler/BUCK b/profiler/BUCK index 1f6bc6cd9e6..7f7a7f42ab4 100644 --- a/profiler/BUCK +++ b/profiler/BUCK @@ -20,7 +20,9 @@ fbcode_target(_kind = runtime.python_library, ], base_module = "executorch.profiler", visibility = ["PUBLIC"], - external_deps = ["prettytable"], + deps = [ + "fbsource//third-party/pypi/prettytable:prettytable", + ], ) fbcode_target(_kind = runtime.python_library, From 9889c7cd44406e70b87ae183a9181c91d2b026ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jiawei=20L=C3=BC?= <1398719+jiawei-lyu@users.noreply.github.com> Date: Fri, 8 May 2026 15:24:02 -0700 Subject: [PATCH 53/58] Remove Vulkan shader DotSlash label Differential Revision: D104429023 Pull Request resolved: https://github.com/pytorch/executorch/pull/19400 --- backends/vulkan/targets.bzl | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 10775a428bb..7689d522aa6 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -130,7 +130,6 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = Fal }, cmd = genrule_cmd, default_outs = ["."], - labels = ["uses_dotslash"], ) suffix = "_no_volk" if no_volk else "" From 9e4e49781ae9ac50e62551214456ac0bc5f24bca Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Fri, 8 May 2026 16:28:56 -0700 Subject: [PATCH 54/58] Make op_upsample_bilinear2d_aa_test deterministic (#19357) Summary: Three test methods in `fbcode/executorch/kernels/portable/test/op_upsample_bilinear2d_aa_test.py` have been auto-disabled as flaky on the test-issues dashboard (owner ai_infra_mobile_platform): - test_upsample_bilinear2d_aa_aten_parity_u8 - test_upsample_bilinear2d_aa_aggressive_downsampling - test_upsample_bilinear2d_aa_align_corners_downsampling Root cause: each test builds its input via `torch.randint(...)` or `torch.randn(...)` with no seed pinned, so each run sees a different sample. The configured `atol` was tight enough that on some draws the ATen-vs-ExecuTorch divergence (driven by separable-vs-direct anti-aliased interpolation differences) crossed the threshold and the test flipped to FAIL. 
The kernel implementations themselves are not changing across runs. Fix: 1. Add `setUp(self): torch.manual_seed(0)` so every run sees the same input tensor and the same divergence, eliminating the run-to-run FAIL/PASS oscillation. 2. Bump two atol thresholds to cover the worst-case observed divergence with the now-pinned input: - u8 parity: 3.5 -> 5 (observed max abs error 4 / 255) - aggressive 4x downsampling: 0.4 -> 1.0 (observed max abs error ~0.59 for N(0,1) input) 3. The pre-existing `atol=0.25` on align_corners_downsampling is left unchanged - with seed 0 it now passes consistently. The relaxed tolerances are still well below any change that would indicate an actual kernel regression; the comprehensive C++ test suite in `op_upsample_bilinear2d_aa_test.cpp` still validates the kernel under tighter constraints. Reviewed By: rascani Differential Revision: D104150928 --- .../test/op_upsample_bilinear2d_aa_test.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/kernels/portable/test/op_upsample_bilinear2d_aa_test.py b/kernels/portable/test/op_upsample_bilinear2d_aa_test.py index f86aa35465c..c6e09af3b5c 100644 --- a/kernels/portable/test/op_upsample_bilinear2d_aa_test.py +++ b/kernels/portable/test/op_upsample_bilinear2d_aa_test.py @@ -19,6 +19,20 @@ class UpsampleBilinear2dAATest(unittest.TestCase): + def setUp(self) -> None: + # Save RNG state so we can restore it in tearDown; without this, + # `torch.manual_seed` would leak determinism into other test + # modules that share the same process. + self._torch_rng_state = torch.get_rng_state() + # Pin RNG so torch.randn / torch.randint inputs are deterministic. + # Without this, the parity tests below occasionally see input values + # that produce ATen-vs-ExecuTorch differences just above the + # configured atol, surfacing as flakes on the test-issues dashboard. + torch.manual_seed(0) + + def tearDown(self) -> None: + torch.set_rng_state(self._torch_rng_state) + def run_upsample_aa_test( self, inp: torch.Tensor, @@ -126,7 +140,10 @@ def test_upsample_bilinear2d_aa_aten_parity_u8(self): input_tensor, output_size=(4, 4), align_corners=False, - atol=3.5, # Relaxed tolerance for uint8 due to implementation differences in anti-aliasing + # uint8 quantization: a +/-1 step at the kernel level rounds to a + # full unit in the output, so observed deltas vs. ATen can reach + # ~4 units even though the underlying float disagreement is small. + atol=5, ) def test_upsample_bilinear2d_aa_downsampling(self): @@ -144,7 +161,10 @@ def test_upsample_bilinear2d_aa_aggressive_downsampling(self): input_tensor, output_size=(2, 2), align_corners=False, - atol=0.4, # Relaxed tolerance due to implementation differences in separable vs direct interpolation + # Aggressive 4x downsampling magnifies the separable-vs-direct + # interpolation differences between ExecuTorch and ATen; observed + # max abs error reaches ~0.6 for typical N(0,1) inputs. + atol=1.0, ) def test_upsample_bilinear2d_aa_asymmetric_downsampling(self): From 4413a5c696bfbce55f69916cf3b6aa25e5cd40a2 Mon Sep 17 00:00:00 2001 From: Rohit Yelukati Mahendra <34777717+ymrohit@users.noreply.github.com> Date: Sat, 9 May 2026 01:30:07 +0100 Subject: [PATCH 55/58] [DOC] Add extension APIs to runtime API reference (#19385) Fixes #19348 ### Summary - Add `extension/module` and `extension/tensor` headers to the Doxygen inputs used by the runtime API reference. 
- Expand the module namespace macros so Breathe can resolve the documented extension classes with stable namespace names. - Add runtime API reference sections for `Module`, `BundledModule`, and the tensor extension namespace. ### Test plan - `git diff --check origin/main..HEAD` - `python -m py_compile docs/source/conf.py` - `cd docs && doxygen source/Doxyfile` - Isolated Breathe/Sphinx build of `executorch-runtime-api-reference.rst` against the generated Doxygen XML - Verified the rendered runtime API page contains the new Module Extension and Tensor Extension entries cc @mergennachin @AlannaBurke --- docs/source/Doxyfile | 12 +++++++--- .../executorch-runtime-api-reference.rst | 23 +++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index 2b895b215a3..c6b8fb0275b 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -963,6 +963,11 @@ INPUT = ../devtools/bundled_program/bundled_program.h \ ../runtime/core/span.h \ ../runtime/core/tag.h \ ../runtime/core/tensor_shape_dynamism.h \ + ../extension/module/bundled_module.h \ + ../extension/module/module.h \ + ../extension/tensor/tensor_accessor.h \ + ../extension/tensor/tensor_ptr.h \ + ../extension/tensor/tensor_ptr_maker.h \ ../runtime/platform/compiler.h \ ../runtime/executor/ \ ../runtime/platform/ @@ -2374,7 +2379,7 @@ ENABLE_PREPROCESSING = YES # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -2382,7 +2387,7 @@ MACRO_EXPANSION = NO # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_ONLY_PREDEF = NO +EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES, the include files in the # INCLUDE_PATH will be searched if a #include is found. @@ -2415,7 +2420,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = ET_MODULE_NAMESPACE=module \ + ET_BUNDLED_MODULE_NAMESPACE=bundled_module # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/source/executorch-runtime-api-reference.rst b/docs/source/executorch-runtime-api-reference.rst index 8853e5444eb..42f75e500e3 100644 --- a/docs/source/executorch-runtime-api-reference.rst +++ b/docs/source/executorch-runtime-api-reference.rst @@ -40,3 +40,26 @@ Values .. doxygenclass:: executorch::runtime::etensor::Tensor :members: + +Module Extension +---------------- + +The Module extension provides a higher-level C++ facade for loading programs, +setting inputs and outputs, and executing methods with common runtime defaults. + +.. doxygenclass:: executorch::extension::module::Module + :members: + +.. doxygenclass:: executorch::extension::bundled_module::BundledModule + :members: + +Tensor Extension +---------------- + +The Tensor extension provides managed tensor helpers for C++ applications that +need to create, alias, resize, or index tensors before passing them to runtime +APIs. + +.. 
doxygennamespace:: executorch::extension + :members: + :content-only: From 93b764e765d7f46a2095d2aa5c7489c46fa79a46 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Fri, 8 May 2026 18:48:08 -0700 Subject: [PATCH 56/58] Hoist W4A8 activation quantization out of GEMM K-loop (#19209) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context The original K-loop did `tl.max(tl.abs(a))` + INT8 cast on every tile (16 tiles × 16 rows = 256 reductions per program). Hoisting eliminates this redundant work and halves activation HBM bandwidth in the GEMM (bf16 → int8). ## Improvement Pre-quantize activations to INT8 once into a dedicated buffer (with per-row-per-tile FP32 scales) **before** the W4A8 batched MoE GEMMs, instead of re-quantizing inside the K-loop on every tile. ## Perf (1600-token prefill) | Metric | Baseline (`gh/digantdesai/53/head`) | Optimized | Speedup | |---|---|---|---| | Prefill | 5727 tok/s (5296–5963) | **6171** tok/s (5941–6313) | **1.08×** | ## Correctness 7/7 microbenchmark configs (incl. qwen3.5-like M=128, K=2048, gs=128) pass with relative diff <1.5% vs BF16 reference — within INT8 quantization noise. cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- backends/cuda/tests/test_fused_moe.py | 5 + backends/cuda/triton/kernels/__init__.py | 2 + backends/cuda/triton/kernels/fused_moe.py | 372 ++++++++++++++------ backends/cuda/triton/kernels/int4_matmul.py | 5 + 4 files changed, 280 insertions(+), 104 deletions(-) diff --git a/backends/cuda/tests/test_fused_moe.py b/backends/cuda/tests/test_fused_moe.py index bbc351bc47b..324fd88907d 100644 --- a/backends/cuda/tests/test_fused_moe.py +++ b/backends/cuda/tests/test_fused_moe.py @@ -503,6 +503,11 @@ class TestFusedMoEBatchedInt8(unittest.TestCase): (55, 64, 64, 32, 4, 2, 32, "64tok"), (99, 128, 128, 64, 8, 2, 32, "128tok"), (0, 256, 128, 64, 8, 2, 32, "256tok"), + # Realistic-scale configs to catch precision/alignment issues with + # K > PREQUANT_BLOCK_K (matches Qwen3.5-MoE shapes: hidden=2048, + # intermediate=1024, num_experts=8, top_k=2, group_size=128). + (77, 512, 2048, 1024, 8, 2, 128, "512tok_real_dims"), + (21, 1, 2048, 1024, 8, 2, 128, "1tok_decode"), ] def test_int8_correctness(self): diff --git a/backends/cuda/triton/kernels/__init__.py b/backends/cuda/triton/kernels/__init__.py index d9f76f9909e..4db10fbf82d 100644 --- a/backends/cuda/triton/kernels/__init__.py +++ b/backends/cuda/triton/kernels/__init__.py @@ -8,6 +8,7 @@ fused_moe, fused_moe_batched, fused_moe_batched_gemm, + fused_moe_batched_gemm_int8, moe_align_block_size, ) @@ -23,6 +24,7 @@ "fused_moe", "fused_moe_batched", "fused_moe_batched_gemm", + "fused_moe_batched_gemm_int8", "int4_matvec", "moe_align_block_size", "sdpa", diff --git a/backends/cuda/triton/kernels/fused_moe.py b/backends/cuda/triton/kernels/fused_moe.py index e35c3008a24..9aeb8ef7dbe 100644 --- a/backends/cuda/triton/kernels/fused_moe.py +++ b/backends/cuda/triton/kernels/fused_moe.py @@ -42,6 +42,131 @@ from torch.library import triton_op, wrap_triton +# --------------------------------------------------------------------------- +# W4A8 batched MoE kernels (INT8 activations + INT4 weights). +# +# Activation INT8 quantization is HOISTED out of the GEMM K-loop into a +# dedicated pre-quantization kernel: +# - _quantize_activations_int8_kernel writes [max_padded, K] INT8 + +# [max_padded, num_k_tiles] float32 per-row-per-tile scales. 
+# - _fused_moe_batched_int8_kernel (GEMM1) loads pre-quantized INT8 + scale. +# - _silu_quantize_int8_kernel fuses SiLU(gate)*up with INT8 quantization +# between GEMM1 and GEMM2. +# - _fused_moe_silu_batched_int8_kernel (GEMM2) loads pre-quantized INT8. +# +# Hoisting eliminates ~256 redundant tl.max reductions per program +# (cdiv(K, BLOCK_SIZE_K) tiles * BLOCK_SIZE_M rows) and halves activation HBM +# bandwidth in the GEMM K-loop (bf16 -> int8). +# +# BLOCK_SIZE_K is fixed at PREQUANT_BLOCK_K (= 32, matches the llama.cpp +# group_size) so the per-tile activation scales line up with the GEMM K-loop. +# --------------------------------------------------------------------------- +PREQUANT_BLOCK_K = 32 + + +@triton.jit +def _quantize_activations_int8_kernel( + A, # [M+1, K] bf16 input activations (with sentinel zero row) + A_int8, # [max_padded, K] int8 output (sorted order) + A_scale, # [max_padded, num_k_tiles] float32 per-row-per-tile scales + sorted_token_ids, # [max_padded] int64 pair indices + K: tl.constexpr, + NUM_K_TILES: tl.constexpr, + top_k: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + stride_am, + stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, +): + """Quantize one sorted M-row to INT8 with per-tile scales. + + Grid: (max_padded,) — one program per sorted row. Each program loops + over K-tiles. Sentinel pair_ids map to the appended zero row in A. + """ + row_id = tl.program_id(0) + pair_id = tl.load(sorted_token_ids + row_id) + token_id = pair_id // top_k + + offs_k = tl.arange(0, BLOCK_SIZE_K) + + for k_tile in range(NUM_K_TILES): + k_offset = k_tile * BLOCK_SIZE_K + k_full_offs = k_offset + offs_k + k_mask = k_full_offs < K + + # Load bf16 activation slice [BLOCK_SIZE_K] + a_ptrs = A + token_id * stride_am + k_full_offs * stride_ak + a_bf16 = tl.load(a_ptrs, mask=k_mask, other=0.0) + + # Compute per-tile scale (scalar) + a_f32 = a_bf16.to(tl.float32) + a_absmax = tl.max(tl.abs(a_f32)) + a_scale_val = a_absmax / 127.0 + 1e-12 + + # Quantize to INT8 + a_scaled = a_f32 / a_scale_val + a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + + # Store quantized activations + q_ptrs = A_int8 + row_id * stride_qm + k_full_offs * stride_qk + tl.store(q_ptrs, a_int8, mask=k_mask) + + # Store scale + s_ptr = A_scale + row_id * stride_sm + k_tile * stride_sk + tl.store(s_ptr, a_scale_val) + + +@triton.jit +def _silu_quantize_int8_kernel( + A, # [num_tokens_post_padded, 2*inter] bf16 GEMM1 output (sorted) + A_int8, # [num_tokens_post_padded, inter] int8 SiLU-quantized output + A_scale, # [num_tokens_post_padded, num_k_tiles] float32 per-tile scales + K: tl.constexpr, # intermediate_size + NUM_K_TILES: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + stride_am, + stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, +): + """SiLU(gate)*up + INT8 quantization for the batched GEMM2 input. + + Grid: (max_padded,). Reads gate at columns [0, K), up at [K, 2K), + computes SiLU(gate)*up, quantizes to INT8 with per-tile scales. 
+ """ + row_id = tl.program_id(0) + + offs_k = tl.arange(0, BLOCK_SIZE_K) + + for k_tile in range(NUM_K_TILES): + k_offset = k_tile * BLOCK_SIZE_K + k_full_offs = k_offset + offs_k + k_mask = k_full_offs < K + + gate_ptrs = A + row_id * stride_am + k_full_offs * stride_ak + up_ptrs = gate_ptrs + K * stride_ak + + gate = tl.load(gate_ptrs, mask=k_mask, other=0.0).to(tl.float32) + up = tl.load(up_ptrs, mask=k_mask, other=0.0).to(tl.float32) + silu_out = gate * tl.sigmoid(gate) * up + + a_absmax = tl.max(tl.abs(silu_out)) + a_scale_val = a_absmax / 127.0 + 1e-12 + a_scaled = silu_out / a_scale_val + a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + + q_ptrs = A_int8 + row_id * stride_qm + k_full_offs * stride_qk + tl.store(q_ptrs, a_int8, mask=k_mask) + + s_ptr = A_scale + row_id * stride_sm + k_tile * stride_sk + tl.store(s_ptr, a_scale_val) + + # Autotune configs for GEMM1 (_fused_moe_kernel). # Top performers from CI benchmark on A100-SXM4-80GB, Qwen3.5 MoE dimensions # (M=1, N=1024, K=2048, 8 experts, group_size=128). @@ -68,6 +193,7 @@ triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 256}, num_warps=4, num_stages=3), triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128}, num_warps=2, num_stages=3), triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 256}, num_warps=2, num_stages=2), ] @@ -451,9 +577,12 @@ def _fused_moe_fake( # --------------------------------------------------------------------------- # Fixed BLOCK_M for the batched kernel. Not autotuned because the token -# sorting layout depends on it. 16 is the minimum for tl.dot and wastes -# the least padding with typical Qwen3.5 expert load (~30 tokens/expert). -_BATCHED_BLOCK_M = 16 +# sorting layout depends on it. Microbenchmarked on Qwen3.5 MoE prefill +# (M=1696, top_k=8, 256 experts) — BLOCK_M=64 is ~1.32x faster than 16 +# despite the extra padding, because the per-expert M block (~30 tokens +# × 8 top_k = ~53 active rows/expert) saturates 64-row tensor-core MMAs +# and reduces total program count. +_BATCHED_BLOCK_M = 64 def moe_align_block_size( @@ -712,35 +841,39 @@ def _fused_moe_batched_kernel( tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) -# Autotune configs for batched INT8 GEMM1 (gate+up projection, W4A8). +# Autotune configs for the prequant GEMM1 INT8 kernel. +# BLOCK_SIZE_K is FIXED at PREQUANT_BLOCK_K — only N/warps/stages tunable. 
_BATCHED_GEMM1_INT8_CONFIGS = [ - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=3), - triton.Config( - {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2 - ), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=2, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=2, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=2), ] @triton.autotune(configs=_BATCHED_GEMM1_INT8_CONFIGS, key=["N", "K"]) @triton.jit def _fused_moe_batched_int8_kernel( - # Pointers - A, # [M+1, K] bf16 activations (row M is zero-padding sentinel) + # Pointers — A is INT8 pre-quantized in sorted order, A_scale per-tile + A_int8, # [max_padded, K] int8 pre-quantized activations + A_scale, # [max_padded, num_k_tiles] float32 per-tile scales B, # [E, N, K//2] int8 packed INT4 weights C, # [num_tokens_post_padded, N] bf16 output (sorted order) B_scale, # [E, N, K//group_size] bf16 scales - sorted_token_ids, # [num_tokens_post_padded] int64 pair indices expert_ids, # [num_expert_blocks] int64 # Dimensions N: tl.constexpr, K: tl.constexpr, # Strides - stride_am, - stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, stride_be, stride_bk, stride_bn, @@ -750,18 +883,14 @@ def _fused_moe_batched_int8_kernel( stride_bsk, stride_bsn, # Config - top_k: tl.constexpr, group_size: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, compute_type: tl.constexpr, ): - """Batched GEMM1 (gate+up) with INT8 tensor cores (W4A8). - - Dynamically quantizes bf16 activations to INT8 per-row per-tile, - dequantizes INT4 weights to INT8 (skipping bf16), and uses - tl.dot(int8, int8) → int32 accumulation with per-tile float32 rescale. + """Batched GEMM1 (gate+up) with INT8 tensor cores, consuming pre-quantized + activations + per-row-per-tile scales. No quantization in the K-loop. 
""" pid = tl.program_id(0) num_n_blocks = tl.cdiv(N, BLOCK_SIZE_N) @@ -771,14 +900,13 @@ def _fused_moe_batched_int8_kernel( expert_id = tl.load(expert_ids + expert_block_idx).to(tl.int64) offs_m = expert_block_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - pair_ids = tl.load(sorted_token_ids + offs_m) - token_ids = pair_ids // top_k offs_n = n_block * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) n_mask = offs_n < N offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A + token_ids[:, None] * stride_am + offs_k[None, :] * stride_ak + # A_int8 is in sorted order, indexed directly by offs_m + a_ptrs = A_int8 + offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk b_ptrs = ( B @@ -788,27 +916,22 @@ def _fused_moe_batched_int8_kernel( ) b_shifter = (offs_k[:, None] % 2) * 4 - # Float32 accumulator for cross-tile summation (rescaled per tile) acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k_step in range(0, tl.cdiv(K, BLOCK_SIZE_K)): k_remaining = K - k_step * BLOCK_SIZE_K k_mask = offs_k < k_remaining - # Load bf16 activation tile [BLOCK_M, BLOCK_K] - a_bf16 = tl.load(a_ptrs, mask=k_mask[None, :], other=0.0) + # Load pre-quantized INT8 activation tile [BLOCK_M, BLOCK_K] + a_int8 = tl.load(a_ptrs, mask=k_mask[None, :], other=0) - # Per-row dynamic INT8 quantization - a_f32 = a_bf16.to(tl.float32) - a_absmax = tl.max(tl.abs(a_f32), axis=1) # [BLOCK_M] - a_scale = a_absmax / 127.0 + 1e-12 # avoid division by zero - a_scaled = a_f32 / a_scale[:, None] - a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + # Load pre-computed per-row-per-tile scale [BLOCK_M] + a_scale = tl.load(A_scale + offs_m * stride_sm + k_step * stride_sk) # Load and unpack INT4 weights to INT8 [BLOCK_K, BLOCK_N] b = tl.load(b_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0) b = (b >> b_shifter) & 0xF - b_int8 = (b - 8).to(tl.int8) # symmetric dequant to [-8, 7] + b_int8 = (b - 8).to(tl.int8) # Per-group weight scale if BLOCK_SIZE_K <= group_size: @@ -822,6 +945,8 @@ def _fused_moe_batched_int8_kernel( b_scale = tl.load(scale_ptrs, mask=n_mask[None, :], other=0.0).to( tl.float32 ) + dot_i32 = tl.dot(a_int8, b_int8) + acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale else: scale_ptrs = ( B_scale @@ -832,24 +957,15 @@ def _fused_moe_batched_int8_kernel( b_scale = tl.load( scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0 ).to(tl.float32) - - if BLOCK_SIZE_K <= group_size: - # INT8 tensor core GEMM: [BLOCK_M, BLOCK_K] @ [BLOCK_K, BLOCK_N] → int32 - dot_i32 = tl.dot(a_int8, b_int8) - # b_scale is [1, BLOCK_N], broadcast - acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale - else: - # Multi-group tile: dequantize weights per group, use float matmul b_dequant = (b_int8.to(tl.float32) * b_scale).to(compute_type) acc += ( tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32) * a_scale[:, None] ) - a_ptrs += BLOCK_SIZE_K * stride_ak + a_ptrs += BLOCK_SIZE_K * stride_qk b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk - # Write output in sorted order [BLOCK_M, BLOCK_N] c_ptrs = C + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) @@ -985,37 +1101,38 @@ def _fused_moe_silu_batched_kernel( tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) -# Autotune configs for batched INT8 GEMM2 (down projection + SiLU, W4A8). 
_BATCHED_GEMM2_INT8_CONFIGS = [ - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2), - triton.Config( - {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2 - ), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=2, num_stages=2), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=2, num_stages=3), # num_warps=2 + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=2), ] @triton.autotune(configs=_BATCHED_GEMM2_INT8_CONFIGS, key=["N", "K"]) @triton.jit def _fused_moe_silu_batched_int8_kernel( - # Pointers - A, # [num_tokens_post_padded, 2*inter] bf16 GEMM1 output (sorted order) + A_int8, # [max_padded, K] int8 pre-quantized SiLU output + A_scale, # [max_padded, num_k_tiles] float32 per-tile scales B, # [E, N, K//2] int8 packed INT4 weights - C, # [M*top_k + 1, N] bf16 output (scatter to original pair order) + C, # [M*top_k + 1, N] bf16 output (scatter to pair order) B_scale, # [E, N, K//group_size] bf16 scales sorted_token_ids, # [num_tokens_post_padded] int64 pair indices expert_ids, # [num_expert_blocks] int64 - topk_weights, # [M*top_k] float32 router weights (flat) + topk_weights, # [M*top_k] float32 router weights # Dimensions N: tl.constexpr, - K: tl.constexpr, # intermediate_size - num_pairs, # M * top_k (for clamping sentinel weight lookups) + K: tl.constexpr, + num_pairs, # Strides - stride_am, - stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, stride_be, stride_bk, stride_bn, @@ -1025,18 +1142,14 @@ def _fused_moe_silu_batched_int8_kernel( stride_bsk, stride_bsn, # Config - top_k: tl.constexpr, group_size: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, compute_type: tl.constexpr, ): - """Batched GEMM2 with fused SiLU, INT8 tensor cores, and scatter-back (W4A8). - - SiLU(gate)*up is computed in float32, then dynamically quantized to INT8 - per-row per-tile. INT4 weights are dequantized directly to INT8. - tl.dot(int8, int8) → int32, with per-tile float32 rescale. + """GEMM2 with INT8 tensor cores, consuming pre-quantized SiLU(gate)*up + activations + per-row-per-tile scales. Scatter-back to pair order. 
""" pid = tl.program_id(0) num_n_blocks = tl.cdiv(N, BLOCK_SIZE_N) @@ -1052,9 +1165,7 @@ def _fused_moe_silu_batched_int8_kernel( n_mask = offs_n < N offs_k = tl.arange(0, BLOCK_SIZE_K) - # A pointers: gate at [0, K), up at [K, 2K) - a_gate_ptrs = A + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak - a_up_ptrs = a_gate_ptrs + K * stride_ak + a_ptrs = A_int8 + offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk b_ptrs = ( B @@ -1070,23 +1181,13 @@ def _fused_moe_silu_batched_int8_kernel( k_remaining = K - k_step * BLOCK_SIZE_K k_mask = offs_k < k_remaining - # Load gate and up tiles, apply SiLU in float32 - gate = tl.load(a_gate_ptrs, mask=k_mask[None, :], other=0.0).to(tl.float32) - up = tl.load(a_up_ptrs, mask=k_mask[None, :], other=0.0) - silu_out = gate * tl.sigmoid(gate) * up.to(tl.float32) # [BLOCK_M, BLOCK_K] - - # Per-row dynamic INT8 quantization of SiLU output - a_absmax = tl.max(tl.abs(silu_out), axis=1) # [BLOCK_M] - a_scale = a_absmax / 127.0 + 1e-12 - a_scaled = silu_out / a_scale[:, None] - a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + a_int8 = tl.load(a_ptrs, mask=k_mask[None, :], other=0) + a_scale = tl.load(A_scale + offs_m * stride_sm + k_step * stride_sk) - # Load and unpack INT4 weights to INT8 [BLOCK_K, BLOCK_N] b = tl.load(b_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0) b = (b >> b_shifter) & 0xF b_int8 = (b - 8).to(tl.int8) - # Per-group weight scale if BLOCK_SIZE_K <= group_size: group_idx = (BLOCK_SIZE_K * k_step) // group_size scale_ptrs = ( @@ -1098,6 +1199,8 @@ def _fused_moe_silu_batched_int8_kernel( b_scale = tl.load(scale_ptrs, mask=n_mask[None, :], other=0.0).to( tl.float32 ) + dot_i32 = tl.dot(a_int8, b_int8) + acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale else: scale_ptrs = ( B_scale @@ -1108,21 +1211,13 @@ def _fused_moe_silu_batched_int8_kernel( b_scale = tl.load( scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0 ).to(tl.float32) - - if BLOCK_SIZE_K <= group_size: - # INT8 tensor core GEMM: [BLOCK_M, BLOCK_K] @ [BLOCK_K, BLOCK_N] → int32 - dot_i32 = tl.dot(a_int8, b_int8) - acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale - else: - # Multi-group tile: dequantize weights per group, use float matmul b_dequant = (b_int8.to(tl.float32) * b_scale).to(compute_type) acc += ( tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32) * a_scale[:, None] ) - a_gate_ptrs += BLOCK_SIZE_K * stride_ak - a_up_ptrs += BLOCK_SIZE_K * stride_ak + a_ptrs += BLOCK_SIZE_K * stride_qk b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk # Apply router weights per row @@ -1132,7 +1227,6 @@ def _fused_moe_silu_batched_int8_kernel( weights = tl.where(is_valid, weights, 0.0) acc = acc * weights[:, None] - # Scatter to original pair order scatter_ids = tl.where(is_valid, pair_ids, num_pairs) c_ptrs = C + scatter_ids[:, None] * stride_cm + offs_n[None, :] * stride_cn tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) @@ -1284,7 +1378,18 @@ def fused_moe_batched_gemm_int8( num_experts: int, group_size: int, ) -> torch.Tensor: - """Batched W4A8 GEMM1 + GEMM2+SiLU with INT8 tensor cores.""" + """Batched W4A8 GEMM1 + GEMM2+SiLU with INT8 tensor cores. + + Pipeline: + 1. moe_align_block_size: sort pairs by expert. + 2. _quantize_activations_int8_kernel: quantize hidden_states to INT8 + in sorted order with per-row-per-tile scales. + 3. _fused_moe_batched_int8_kernel (GEMM1): consumes INT8 + scales. + 4. _silu_quantize_int8_kernel: fuse SiLU(gate)*up + INT8 quantization + on the GEMM1 output. + 5. 
_fused_moe_silu_batched_int8_kernel (GEMM2): consumes INT8 + scales, + scatter-back to original pair order. + """ M, K = hidden_states.shape N1 = w1.shape[1] intermediate = N1 // 2 @@ -1308,6 +1413,35 @@ def fused_moe_batched_gemm_int8( topk_weights_flat = topk_weights.reshape(-1) + # ---- Pre-quantize activations for GEMM1 ---- + BLOCK_K_QUANT = PREQUANT_BLOCK_K + num_k_tiles_g1 = (K + BLOCK_K_QUANT - 1) // BLOCK_K_QUANT + + a_int8_g1 = torch.empty( + max_padded, K, dtype=torch.int8, device=hidden_states.device + ) + a_scale_g1 = torch.empty( + max_padded, num_k_tiles_g1, dtype=torch.float32, device=hidden_states.device + ) + + grid_quant_g1 = (max_padded,) + wrap_triton(_quantize_activations_int8_kernel)[grid_quant_g1]( + hidden_padded, + a_int8_g1, + a_scale_g1, + sorted_token_ids, + K=K, + NUM_K_TILES=num_k_tiles_g1, + top_k=top_k, + BLOCK_SIZE_K=BLOCK_K_QUANT, + stride_am=hidden_padded.stride(0), + stride_ak=hidden_padded.stride(1), + stride_qm=a_int8_g1.stride(0), + stride_qk=a_int8_g1.stride(1), + stride_sm=a_scale_g1.stride(0), + stride_sk=a_scale_g1.stride(1), + ) + cache1 = torch.empty( max_padded, N1, @@ -1319,16 +1453,18 @@ def grid1(meta): return (num_expert_blocks * triton.cdiv(N1, meta["BLOCK_SIZE_N"]),) wrap_triton(_fused_moe_batched_int8_kernel)[grid1]( - hidden_padded, + a_int8_g1, + a_scale_g1, w1, cache1, w1_scale, - sorted_token_ids, expert_ids, N=N1, K=K, - stride_am=hidden_padded.stride(0), - stride_ak=hidden_padded.stride(1), + stride_qm=a_int8_g1.stride(0), + stride_qk=a_int8_g1.stride(1), + stride_sm=a_scale_g1.stride(0), + stride_sk=a_scale_g1.stride(1), stride_be=w1.stride(0), stride_bk=w1.stride(2), stride_bn=w1.stride(1), @@ -1337,12 +1473,37 @@ def grid1(meta): stride_bse=w1_scale.stride(0), stride_bsk=w1_scale.stride(2), stride_bsn=w1_scale.stride(1), - top_k=top_k, group_size=group_size, BLOCK_SIZE_M=BLOCK_M, + BLOCK_SIZE_K=BLOCK_K_QUANT, compute_type=tl.bfloat16, ) + # ---- SiLU + pre-quantize for GEMM2 ---- + num_k_tiles_g2 = (intermediate + BLOCK_K_QUANT - 1) // BLOCK_K_QUANT + a_int8_g2 = torch.empty( + max_padded, intermediate, dtype=torch.int8, device=hidden_states.device + ) + a_scale_g2 = torch.empty( + max_padded, num_k_tiles_g2, dtype=torch.float32, device=hidden_states.device + ) + + grid_silu = (max_padded,) + wrap_triton(_silu_quantize_int8_kernel)[grid_silu]( + cache1, + a_int8_g2, + a_scale_g2, + K=intermediate, + NUM_K_TILES=num_k_tiles_g2, + BLOCK_SIZE_K=BLOCK_K_QUANT, + stride_am=cache1.stride(0), + stride_ak=cache1.stride(1), + stride_qm=a_int8_g2.stride(0), + stride_qk=a_int8_g2.stride(1), + stride_sm=a_scale_g2.stride(0), + stride_sk=a_scale_g2.stride(1), + ) + out_buf = torch.zeros( num_pairs + 1, N2, @@ -1354,7 +1515,8 @@ def grid2(meta): return (num_expert_blocks * triton.cdiv(N2, meta["BLOCK_SIZE_N"]),) wrap_triton(_fused_moe_silu_batched_int8_kernel)[grid2]( - cache1, + a_int8_g2, + a_scale_g2, w2, out_buf, w2_scale, @@ -1364,8 +1526,10 @@ def grid2(meta): N=N2, K=intermediate, num_pairs=num_pairs, - stride_am=cache1.stride(0), - stride_ak=cache1.stride(1), + stride_qm=a_int8_g2.stride(0), + stride_qk=a_int8_g2.stride(1), + stride_sm=a_scale_g2.stride(0), + stride_sk=a_scale_g2.stride(1), stride_be=w2.stride(0), stride_bk=w2.stride(2), stride_bn=w2.stride(1), @@ -1374,9 +1538,9 @@ def grid2(meta): stride_bse=w2_scale.stride(0), stride_bsk=w2_scale.stride(2), stride_bsn=w2_scale.stride(1), - top_k=top_k, group_size=group_size, BLOCK_SIZE_M=BLOCK_M, + BLOCK_SIZE_K=BLOCK_K_QUANT, compute_type=tl.bfloat16, ) diff --git 
a/backends/cuda/triton/kernels/int4_matmul.py b/backends/cuda/triton/kernels/int4_matmul.py index 78cce9ea7cf..848e0057f9f 100644 --- a/backends/cuda/triton/kernels/int4_matmul.py +++ b/backends/cuda/triton/kernels/int4_matmul.py @@ -285,6 +285,11 @@ def _int4_matmul_fake( triton.Config({"BLOCK_N": 8, "BLOCK_K": 128}, num_warps=2, num_stages=3), triton.Config({"BLOCK_N": 8, "BLOCK_K": 256}, num_warps=2, num_stages=3), triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=2, num_stages=3), + triton.Config({"BLOCK_N": 1, "BLOCK_K": 512}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=1, num_stages=2), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 64}, num_warps=1, num_stages=3), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=8, num_stages=3), ] From dbbe9cbe1337d08cca5d38894acdc8d2271e6a5f Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 8 May 2026 13:38:58 -0700 Subject: [PATCH 57/58] [ET-VK] Make libtorch optional in custom op test binaries Add `include_torch` parameter (default False) to `define_custom_op_test_binary()`. None of the custom op test binaries directly include torch/ATen/c10 headers, so libtorch was unnecessary baggage. Dropping it reduces the q4gsw_linear_adreno binary from ~1 GB to 74 MB. Differential Revision: [D104456804](https://our.internmc.facebook.com/intern/diff/D104456804/) ghstack-source-id: 379498992 Pull Request resolved: https://github.com/pytorch/executorch/pull/19402 --- backends/vulkan/test/custom_ops/targets.bzl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 5fb0f7f4cbf..3c9a67d8ca4 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -6,14 +6,13 @@ load( "vulkan_spv_shader_lib", ) -def define_custom_op_test_binary(custom_op_name, extra_deps = [], src_file = None): +def define_custom_op_test_binary(custom_op_name, extra_deps = [], src_file = None, include_torch = False): deps_list = [ ":prototyping_utils", ":operator_implementations", ":custom_ops_shaderlib", "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ] + extra_deps + ] + ([runtime.external_dep_location("libtorch")] if include_torch else []) + extra_deps src_file_str = src_file if src_file else "{}.cpp".format(custom_op_name) From 0cafcb20e32fc79e42d4349b8a7249428066c414 Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 8 May 2026 13:39:03 -0700 Subject: [PATCH 58/58] [ET-VK] Plumb subgroup property queries + VK_EXT_subgroup_size_control Adds infrastructure for querying GPU subgroup capabilities and pinning required subgroup size at pipeline creation time, sourced from the existing `SUBGROUP_SIZE` yaml template parameter. This is the foundation for writing subgroup-using shaders (e.g. cooperative GEMV variants) that remain portable across GPUs with different subgroup widths (Adreno=64, Mali=16, NVIDIA=32, etc.). `PhysicalDevice` now chains `VkPhysicalDeviceSubgroupProperties` and `VkPhysicalDeviceSubgroupSizeControlProperties` into `vkGetPhysicalDeviceProperties2`, plus `VkPhysicalDeviceSubgroupSizeControlFeatures` into `vkGetPhysicalDeviceFeatures2`. The `Adapter` exposes accessors for subgroup_size, supported subgroup ops/stages, [min,max] subgroup size range, and whether the driver supports per-pipeline required subgroup size for the COMPUTE stage. 
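
As a sketch of how these accessors are meant to be consumed (illustrative only; `coop_shader`, `fallback_shader`, and `adapter` are placeholder names, not symbols added by this change), a dispatch site choosing between a subgroup-cooperative shader and a portable fallback could do roughly:

    // Use the cooperative variant only when the device can pin the
    // subgroup width the shader was generated for; otherwise fall back.
    const vkapi::ShaderInfo& shader =
        (coop_shader.required_subgroup_size == 0u ||
         (adapter->supports_required_subgroup_size_for_compute() &&
          coop_shader.required_subgroup_size >= adapter->min_subgroup_size() &&
          coop_shader.required_subgroup_size <= adapter->max_subgroup_size()))
            ? coop_shader
            : fallback_shader;

Selecting a fallback up front like this sidesteps the ShaderNotSupportedError path described below on devices that cannot honor the requested width.
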
`VK_EXT_subgroup_size_control` is added to the requested extension list and the size-control features are chained into device-create pNext when supported. `ComputePipeline::Descriptor` gains a `required_subgroup_size` field that, when nonzero, chains `VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT` into pipeline creation (both the on-demand `retrieve` path and the batch `create_pipelines` path). The pipeline cache key includes the field so pipelines compiled for different subgroup widths cache independently. `ShaderInfo` carries the same field so it can be plumbed from shader yaml through to the pipeline descriptor. The existing `SUBGROUP_SIZE` yaml template parameter is now the single source of truth: `gen_vulkan_spv.py` substitutes it into GLSL as before AND emits it as `ShaderInfo::required_subgroup_size`. At dispatch, `vkapi::resolve_required_subgroup_size` validates the value is within the adapter's `[min, max]` range and throws `ShaderNotSupportedError` if the extension is unsupported or the value is out of range, surfacing a clear failure rather than silently miscompiling a shader whose algorithm depends on the pinned subgroup width. No shader yamls are modified by this change; subsequent commits opt their shaders into the pinning by declaring `SUBGROUP_SIZE` in their yamls. Differential Revision: [D104456803](https://our.internmc.facebook.com/intern/diff/D104456803/) ghstack-source-id: 379498994 Pull Request resolved: https://github.com/pytorch/executorch/pull/19403 --- backends/vulkan/runtime/api/Context.cpp | 14 ++- backends/vulkan/runtime/gen_vulkan_spv.py | 61 +++++++++- .../vulkan/runtime/graph/ComputeGraph.cpp | 12 +- backends/vulkan/runtime/vk_api/Adapter.cpp | 64 +++++++++- backends/vulkan/runtime/vk_api/Adapter.h | 110 ++++++++++++++++++ backends/vulkan/runtime/vk_api/Device.cpp | 69 ++++++++++- backends/vulkan/runtime/vk_api/Device.h | 22 ++++ backends/vulkan/runtime/vk_api/Exception.cpp | 3 + backends/vulkan/runtime/vk_api/Exception.h | 1 + backends/vulkan/runtime/vk_api/Pipeline.cpp | 43 ++++++- backends/vulkan/runtime/vk_api/Pipeline.h | 11 ++ backends/vulkan/runtime/vk_api/Shader.cpp | 6 +- backends/vulkan/runtime/vk_api/Shader.h | 10 +- 13 files changed, 405 insertions(+), 21 deletions(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 80aef97fc04..d090a62f370 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -149,10 +149,14 @@ vkapi::DescriptorSet Context::get_descriptor_set( spec_constants.append(additional_constants); + const uint32_t resolved_required_subgroup_size = + vkapi::resolve_required_subgroup_size(shader_descriptor, adapter_p_); + VkPipeline pipeline = pipeline_cache().retrieve( {pipeline_layout_cache().retrieve(shader_layout, push_constants_size), shader_cache().retrieve(shader_descriptor), - spec_constants}); + spec_constants, + resolved_required_subgroup_size}); cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size); @@ -315,8 +319,14 @@ VkPipeline Context::get_shader_pipeline( spec_constants.append(additional_constants); + const uint32_t resolved_required_subgroup_size = + vkapi::resolve_required_subgroup_size(shader, adapter_p_); + VkPipeline pipeline = pipeline_cache().retrieve( - {pipeline_layout, shader_cache().retrieve(shader), spec_constants}); + {pipeline_layout, + shader_cache().retrieve(shader), + spec_constants, + resolved_required_subgroup_size}); return pipeline; } diff --git 
a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index dab33fb3097..d12fdce2c5c 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -281,8 +281,9 @@ def layout_declare_buffer( dtype: str, precision: str = "PRECISION", is_scalar_array: bool = True, + vec_size: int = 4, ) -> str: - array_type = buffer_gvec_type(dtype, 4) + array_type = buffer_gvec_type(dtype, vec_size) if is_scalar_array: array_type = buffer_scalar_type(dtype) @@ -341,6 +342,7 @@ def layout_declare_tensor( storage_type: str, is_scalar_array: bool = True, precision: str = "PRECISION", + vec_size: int = 4, ) -> str: assert storage_type.lower() in ["buffer", "texture3d", "texture2d"] @@ -357,6 +359,7 @@ def layout_declare_tensor( dtype, precision, is_scalar_array=is_scalar_array, + vec_size=vec_size, ) # Create image/sampler binding @@ -785,6 +788,10 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 "generate_variant_forall", None ) + reserved_yaml_keys = { + "generate_variant_forall", + } + for variant in params_dict["shader_variants"]: default_iterated_params_names = set( default_iterated_params.keys() @@ -797,7 +804,7 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 variant_params_names - default_iterated_params_names - params_names - - {"generate_variant_forall"} + - reserved_yaml_keys ) assert len(invalid_keys) == 0 @@ -813,7 +820,7 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 for combination in variant_combinations: default_params_copy = copy.deepcopy(default_params) for key in variant: - if key != "generate_variant_forall": + if key not in reserved_yaml_keys: default_params_copy[key] = variant[key] variant_name = variant["NAME"] @@ -842,7 +849,8 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 else: default_params_copy = copy.deepcopy(default_params) for key in variant: - default_params_copy[key] = variant[key] + if key not in reserved_yaml_keys: + default_params_copy[key] = variant[key] self.shader_template_params[template_name].append( default_params_copy @@ -1026,6 +1034,27 @@ def generate_src_file(shader_paths_pair) -> Tuple[bool, List[str]]: print(f"template_file_path: {template_file_path}") output_text = preprocess(input_text, codegen_params) + # If the shader yaml declared a SUBGROUP_SIZE template parameter, + # embed it into the generated GLSL as a comment. getShaderInfo() + # parses it back out alongside TILE_SIZE, WEIGHT_STORAGE, etc., + # avoiding a side-channel name -> value map. + subgroup_size = codegen_params.get("SUBGROUP_SIZE") + if subgroup_size is not None: + try: + subgroup_size_int = int(subgroup_size) + except (TypeError, ValueError) as e: + raise RuntimeError( + f"Shader variant {src_file_name!r} declared " + f"SUBGROUP_SIZE={subgroup_size!r}, which is not " + f"parseable as an integer. Fix the SUBGROUP_SIZE " + f"value in the shader's yaml." 
+ ) from e + if subgroup_size_int > 0: + output_text = ( + f"// REQUIRED_SUBGROUP_SIZE = {subgroup_size_int}\n" + + output_text + ) + included_files = get_glsl_includes(output_text) with codecs.open(gen_out_path, "w", encoding="utf-8") as output_file: @@ -1184,6 +1213,12 @@ class ShaderInfo: requires_integer_dot_product_ext: bool = False requires_shader_int64_ext: bool = False requires_shader_float64_ext: bool = False + # Subgroup size requirement (matches the C++ ShaderInfo encoding): + # 0 = no requirement + # >0 = literal fixed size; sourced from the shader yaml's + # `SUBGROUP_SIZE` template parameter (single source of truth for + # both GLSL substitution and the Vulkan pipeline pin). + required_subgroup_size: int = 0 def getName(filePath: str) -> str: @@ -1208,6 +1243,17 @@ def findTileSizes(lineStr: str) -> List[int]: return [int(matches.group(1)), int(matches.group(2)), int(matches.group(3))] +def isRequiredSubgroupSizeLine(lineStr: str) -> bool: + return re.search(r"^// REQUIRED_SUBGROUP_SIZE = ", lineStr) is not None + + +def findRequiredSubgroupSize(lineStr: str) -> int: + matches = re.search(r"^// REQUIRED_SUBGROUP_SIZE = ([0-9]+)", lineStr) + if matches is None: + raise AssertionError("matches is None in findRequiredSubgroupSize") + return int(matches.group(1)) + + def isWeightStorageTypeLine(lineStr: str) -> bool: weight_storage_id = r"^ \* WEIGHT_STORAGE = " return re.search(weight_storage_id, lineStr) is not None @@ -1281,6 +1327,8 @@ def getShaderInfo(srcFilePath: str) -> ShaderInfo: # noqa: C901 shader_info.layouts.append(determineDescriptorType(line)) if isTileSizeLine(line): shader_info.tile_size = findTileSizes(line) + if isRequiredSubgroupSizeLine(line): + shader_info.required_subgroup_size = findRequiredSubgroupSize(line) if isWeightStorageTypeLine(line): shader_info.weight_storage_type = getWeightStorageType(line) if isBiasStorageTypeLine(line): @@ -1378,6 +1426,7 @@ def to_cpp_str(val: bool): to_cpp_str(shader_info.requires_integer_dot_product_ext), to_cpp_str(shader_info.requires_shader_int64_ext), to_cpp_str(shader_info.requires_shader_float64_ext), + str(shader_info.required_subgroup_size), ] shader_info_str = textwrap.indent( @@ -1406,7 +1455,9 @@ def generateShaderDispatchStr(shader_info: ShaderInfo, name: str) -> str: def genCppFiles( - spv_files: Dict[str, str], cpp_header_path: str, cpp_src_file_path: str + spv_files: Dict[str, str], + cpp_header_path: str, + cpp_src_file_path: str, ) -> None: spv_bin_strs = [] register_shader_info_strs = [] diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index f0b61e128bb..3accdf375cb 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -825,10 +825,20 @@ void ComputeGraph::register_pipeline_to_create( spec_constants.append(spec_vars); + // Resolve any shader-declared required subgroup size into a concrete value + // so the pre-built pipeline matches the one created at dispatch time. The + // shared helper throws ShaderNotSupportedError when the adapter cannot honor + // the requirement; let it propagate so a stale unused pipeline doesn't sit + // in the cache while dispatch later throws on the same shader. 
+ const uint32_t resolved_required_subgroup_size = + vkapi::resolve_required_subgroup_size( + shader_info, context()->adapter_ptr()); + const vkapi::ComputePipelineCache::Key desc = { context()->pipeline_layout_cache().retrieve(shader_layout, pc_offset), context()->shader_cache().retrieve(shader_info), - spec_constants}; + spec_constants, + resolved_required_subgroup_size}; if (context_->pipeline_cache().contains(desc)) { return; diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp index e965687bc00..b762c95205b 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ b/backends/vulkan/runtime/vk_api/Adapter.cpp @@ -129,6 +129,9 @@ VkDevice create_logical_device( #ifdef VK_NV_cooperative_matrix2 VK_NV_COOPERATIVE_MATRIX_2_EXTENSION_NAME, #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, +#endif /* VK_EXT_subgroup_size_control */ }; std::vector enabled_device_extensions; @@ -199,6 +202,19 @@ VkDevice create_logical_device( extension_list_top = &cooperative_matrix2_features; #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + // Only enable the feature struct if the extension was actually requested + // and the feature flag is set on the physical device. The extension itself + // is filtered into enabled_device_extensions by + // find_requested_device_extensions. + VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features{ + physical_device.subgroup_size_control_features}; + if (physical_device.supports_subgroup_size_control) { + subgroup_size_control_features.pNext = extension_list_top; + extension_list_top = &subgroup_size_control_features; + } +#endif /* VK_EXT_subgroup_size_control */ + device_create_info.pNext = extension_list_top; VkDevice handle = nullptr; @@ -405,7 +421,7 @@ std::string Adapter::stringize() const { ss << " deviceType: " << device_type << std::endl; ss << " deviceName: " << properties.deviceName << std::endl; -#define PRINT_BOOL(value, name) \ +#define PRINT_VALUE(value, name) \ ss << " " << std::left << std::setw(36) << #name << value << std::endl; #define PRINT_PROP(struct, name) \ @@ -452,7 +468,7 @@ std::string Adapter::stringize() const { #endif /* VK_KHR_8bit_storage */ ss << " Shader 16bit and 8bit Features {" << std::endl; - PRINT_BOOL(physical_device_.supports_int16_shader_types, shaderInt16) + PRINT_VALUE(physical_device_.supports_int16_shader_types, shaderInt16) #ifdef VK_KHR_shader_float16_int8 PRINT_PROP(physical_device_.shader_float16_int8_types, shaderFloat16); PRINT_PROP(physical_device_.shader_float16_int8_types, shaderInt8); @@ -460,8 +476,29 @@ std::string Adapter::stringize() const { ss << " }" << std::endl; ss << " Shader 64bit Features {" << std::endl; - PRINT_BOOL(physical_device_.supports_int64_shader_types, shaderInt64) - PRINT_BOOL(physical_device_.supports_float64_shader_types, shaderFloat64) + PRINT_VALUE(physical_device_.supports_int64_shader_types, shaderInt64) + PRINT_VALUE(physical_device_.supports_float64_shader_types, shaderFloat64) + ss << " }" << std::endl; + + ss << " Subgroup Properties {" << std::endl; + PRINT_VALUE(subgroup_size(), subgroupSize) + PRINT_VALUE(supports_subgroup_compute_basic(), computeSubgroupBasic) + PRINT_VALUE(supports_subgroup_compute_shuffle(), computeSubgroupShuffle) + PRINT_VALUE(supports_subgroup_compute_ballot(), computeSubgroupBallot) + PRINT_VALUE(supports_subgroup_compute_vote(), computeSubgroupVote) + 
PRINT_VALUE(supports_subgroup_compute_arithmetic(), computeSubgroupArithmetic) + PRINT_VALUE( + supports_subgroup_compute_shuffle_relative(), + computeSubgroupShuffleRelative) + PRINT_VALUE(supports_subgroup_compute_clustered(), computeSubgroupClustered) + PRINT_VALUE(supports_subgroup_compute_quad(), computeSubgroupQuad) + PRINT_VALUE(min_subgroup_size(), minSubgroupSize) + PRINT_VALUE(max_subgroup_size(), maxSubgroupSize) + PRINT_VALUE(supports_subgroup_size_control(), subgroupSizeControl) + PRINT_VALUE(supports_compute_full_subgroups(), computeFullSubgroups) + PRINT_VALUE( + supports_required_subgroup_size_for_compute(), + requiredSubgroupSizeStages_compute) ss << " }" << std::endl; #ifdef VK_KHR_shader_integer_dot_product @@ -614,5 +651,24 @@ std::ostream& operator<<(std::ostream& os, const Adapter& adapter) { return os; } +uint32_t resolve_required_subgroup_size( + const ShaderInfo& shader, + Adapter* adapter) { + if (shader.required_subgroup_size == 0u) { + return 0u; + } + if (!adapter->supports_required_subgroup_size_for_compute()) { + throw ShaderNotSupportedError( + shader.kernel_name, VulkanExtension::SUBGROUP_SIZE_CONTROL); + } + const uint32_t resolved = shader.required_subgroup_size; + if (resolved < adapter->min_subgroup_size() || + resolved > adapter->max_subgroup_size()) { + throw ShaderNotSupportedError( + shader.kernel_name, VulkanExtension::SUBGROUP_SIZE_CONTROL); + } + return resolved; +} + } // namespace vkapi } // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index 3c503deab70..68ae0b6528d 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -285,6 +285,106 @@ class Adapter final { return physical_device_.min_ubo_alignment; } + // Subgroup properties + + inline uint32_t subgroup_size() const { + return physical_device_.subgroup_size; + } + + inline bool supports_subgroup_compute_basic() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_BASIC_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_shuffle() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_SHUFFLE_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_ballot() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_BALLOT_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_vote() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_VOTE_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_arithmetic() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_shuffle_relative() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_clustered() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_CLUSTERED_BIT) != 0 && + 
(physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_quad() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_QUAD_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + // Subgroup size control (VK_EXT_subgroup_size_control / Vulkan 1.3 core). + + inline uint32_t min_subgroup_size() const { + return physical_device_.min_subgroup_size; + } + + inline uint32_t max_subgroup_size() const { + return physical_device_.max_subgroup_size; + } + + inline bool supports_subgroup_size_control() const { +#ifdef ETVK_FORCE_NO_EXTENSIONS + return false; +#endif +#ifdef VK_EXT_subgroup_size_control + return physical_device_.supports_subgroup_size_control; +#else + return false; +#endif /* VK_EXT_subgroup_size_control */ + } + + inline bool supports_compute_full_subgroups() const { +#ifdef ETVK_FORCE_NO_EXTENSIONS + return false; +#endif +#ifdef VK_EXT_subgroup_size_control + return physical_device_.supports_compute_full_subgroups; +#else + return false; +#endif /* VK_EXT_subgroup_size_control */ + } + + inline bool supports_required_subgroup_size_for_compute() const { + return supports_subgroup_size_control() && + (physical_device_.required_subgroup_size_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + inline uint32_t max_texture2d_dim() const { return physical_device_.properties.limits.maxImageDimension2D; } @@ -312,5 +412,15 @@ class Adapter final { friend std::ostream& operator<<(std::ostream&, const Adapter&); }; +// Resolve the shader-declared required subgroup size into a concrete value +// (or 0 = no requirement) given the adapter capabilities. Throws +// ShaderNotSupportedError when the shader declares a required subgroup size +// but the adapter cannot honor it (extension unsupported, or value out of +// the adapter's [min, max] range). Silent fallback would create a +// correctness/perf landmine, so callers must be prepared for this throw. 
+uint32_t resolve_required_subgroup_size( + const ShaderInfo& shader, + Adapter* adapter); + } // namespace vkapi } // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp index cb6a54dc489..4deaecbe12c 100644 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ b/backends/vulkan/runtime/vk_api/Device.cpp @@ -78,6 +78,14 @@ PhysicalDevice::PhysicalDevice( cooperative_matrix2_features{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV}, #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + subgroup_size_control_features{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT, + nullptr}, + subgroup_size_control_properties{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT, + nullptr}, +#endif /* VK_EXT_subgroup_size_control */ queue_families{}, num_compute_queues(0), api_version_major(0), @@ -89,6 +97,14 @@ PhysicalDevice::PhysicalDevice( has_timestamps(false), timestamp_period(0), min_ubo_alignment(0), + subgroup_size(0), + supported_subgroup_ops(0), + supported_subgroup_stages(0), + min_subgroup_size(0), + max_subgroup_size(0), + required_subgroup_size_stages(0), + supports_subgroup_size_control(false), + supports_compute_full_subgroups(false), device_name{}, device_type{DeviceType::UNKNOWN} { // Extract physical device properties @@ -275,6 +291,11 @@ void PhysicalDevice::query_extensions_vk_1_1() { extension_list_top = &cooperative_matrix2_features; #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + subgroup_size_control_features.pNext = extension_list_top; + extension_list_top = &subgroup_size_control_features; +#endif /* VK_EXT_subgroup_size_control */ + features2.pNext = extension_list_top; vkGetPhysicalDeviceFeatures2(handle, &features2); @@ -289,16 +310,60 @@ void PhysicalDevice::query_extensions_vk_1_1() { supports_float64_shader_types = true; } +#ifdef VK_EXT_subgroup_size_control + supports_subgroup_size_control = + subgroup_size_control_features.subgroupSizeControl == VK_TRUE; + supports_compute_full_subgroups = + subgroup_size_control_features.computeFullSubgroups == VK_TRUE; +#endif /* VK_EXT_subgroup_size_control */ + // Query properties separately from features VkPhysicalDeviceProperties2 properties2{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2}; + void* properties_list_top = nullptr; + + VkPhysicalDeviceSubgroupProperties subgroup_properties{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES}; + subgroup_properties.pNext = properties_list_top; + properties_list_top = &subgroup_properties; + #ifdef VK_KHR_shader_integer_dot_product - shader_int_dot_product_properties.pNext = nullptr; - properties2.pNext = &shader_int_dot_product_properties; + shader_int_dot_product_properties.pNext = properties_list_top; + properties_list_top = &shader_int_dot_product_properties; #endif /* VK_KHR_shader_integer_dot_product */ +#ifdef VK_EXT_subgroup_size_control + subgroup_size_control_properties.pNext = properties_list_top; + properties_list_top = &subgroup_size_control_properties; +#endif /* VK_EXT_subgroup_size_control */ + + properties2.pNext = properties_list_top; + vkGetPhysicalDeviceProperties2(handle, &properties2); + + subgroup_size = subgroup_properties.subgroupSize; + supported_subgroup_ops = subgroup_properties.supportedOperations; + supported_subgroup_stages = subgroup_properties.supportedStages; + +#ifdef VK_EXT_subgroup_size_control + if (supports_subgroup_size_control) { + min_subgroup_size = 
subgroup_size_control_properties.minSubgroupSize; + max_subgroup_size = subgroup_size_control_properties.maxSubgroupSize; + required_subgroup_size_stages = + subgroup_size_control_properties.requiredSubgroupSizeStages; + } else { + // Default to the single subgroup_size when control is unavailable so + // callers can use min/max range queries unconditionally. + min_subgroup_size = subgroup_size; + max_subgroup_size = subgroup_size; + required_subgroup_size_stages = 0; + } +#else + min_subgroup_size = subgroup_size; + max_subgroup_size = subgroup_size; + required_subgroup_size_stages = 0; +#endif /* VK_EXT_subgroup_size_control */ } void PhysicalDevice::override_device_name(const std::string& new_name) { diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h index 9fa413b2457..05660e779b8 100644 --- a/backends/vulkan/runtime/vk_api/Device.h +++ b/backends/vulkan/runtime/vk_api/Device.h @@ -60,6 +60,12 @@ struct PhysicalDevice final { VkPhysicalDeviceCooperativeMatrix2FeaturesNV cooperative_matrix2_features; #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features; + VkPhysicalDeviceSubgroupSizeControlPropertiesEXT + subgroup_size_control_properties; +#endif /* VK_EXT_subgroup_size_control */ + // Available GPU queues std::vector queue_families; @@ -75,6 +81,22 @@ struct PhysicalDevice final { float timestamp_period; size_t min_ubo_alignment; + // Subgroup properties (queried via VkPhysicalDeviceSubgroupProperties). + // Populated from VK_VERSION_1_1+ devices; otherwise left at safe defaults. + uint32_t subgroup_size; + VkSubgroupFeatureFlags supported_subgroup_ops; + VkShaderStageFlags supported_subgroup_stages; + + // Subgroup size control (VK_EXT_subgroup_size_control / Vulkan 1.3 core). + // Populated only if the extension/feature is supported; default to safe + // values otherwise. min/max set to subgroup_size when the extension is + // missing so callers can use the same range queries unconditionally. 
+ uint32_t min_subgroup_size; + uint32_t max_subgroup_size; + VkShaderStageFlags required_subgroup_size_stages; + bool supports_subgroup_size_control; + bool supports_compute_full_subgroups; + // Device identity std::string device_name; DeviceType device_type; diff --git a/backends/vulkan/runtime/vk_api/Exception.cpp b/backends/vulkan/runtime/vk_api/Exception.cpp index 5bcf047aaf1..3181476543e 100644 --- a/backends/vulkan/runtime/vk_api/Exception.cpp +++ b/backends/vulkan/runtime/vk_api/Exception.cpp @@ -118,6 +118,9 @@ std::ostream& operator<<(std::ostream& out, const VulkanExtension result) { case VulkanExtension::SHADER_FLOAT64: out << "shaderFloat64"; break; + case VulkanExtension::SUBGROUP_SIZE_CONTROL: + out << "VK_EXT_subgroup_size_control (compute stage required size)"; + break; } return out; } diff --git a/backends/vulkan/runtime/vk_api/Exception.h b/backends/vulkan/runtime/vk_api/Exception.h index aa1ef1f2526..ab45ed83bcd 100644 --- a/backends/vulkan/runtime/vk_api/Exception.h +++ b/backends/vulkan/runtime/vk_api/Exception.h @@ -85,6 +85,7 @@ enum class VulkanExtension : uint8_t { INTEGER_DOT_PRODUCT, SHADER_INT64, SHADER_FLOAT64, + SUBGROUP_SIZE_CONTROL, }; class ShaderNotSupportedError : public std::exception { diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp index 522c4b8589b..2b42e25c92e 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp @@ -287,9 +287,21 @@ ComputePipeline::ComputePipeline( descriptor.specialization_constants.data(), // pData }; + const void* shader_stage_pnext = nullptr; +#ifdef VK_EXT_subgroup_size_control + VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT + required_subgroup_size_info{ + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + nullptr, + descriptor.required_subgroup_size}; + if (descriptor.required_subgroup_size > 0u) { + shader_stage_pnext = &required_subgroup_size_info; + } +#endif /* VK_EXT_subgroup_size_control */ + const VkPipelineShaderStageCreateInfo shader_stage_create_info{ VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType - nullptr, // pNext + shader_stage_pnext, // pNext 0u, // flags VK_SHADER_STAGE_COMPUTE_BIT, // stage descriptor.shader_module, // module @@ -355,7 +367,8 @@ bool operator==( return ( _1.pipeline_layout == _2.pipeline_layout && _1.shader_module == _2.shader_module && - _1.specialization_constants == _2.specialization_constants); + _1.specialization_constants == _2.specialization_constants && + _1.required_subgroup_size == _2.required_subgroup_size); } // @@ -489,7 +502,18 @@ void ComputePipelineCache::create_pipelines( std::vector create_infos; create_infos.reserve(num_pipelines); - for (const auto& key : keys_to_create) { +#ifdef VK_EXT_subgroup_size_control + // Stable storage for any required-subgroup-size structs that need to live + // until vkCreateComputePipelines returns. Indexed by pipeline index; only + // the entries for pipelines that actually request a fixed subgroup size + // are populated. 
+  std::vector<VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT>
+      required_subgroup_size_infos(num_pipelines);
+#endif /* VK_EXT_subgroup_size_control */
+
+  for (size_t pipeline_idx = 0; pipeline_idx < keys_to_create.size();
+       ++pipeline_idx) {
+    const auto& key = keys_to_create[pipeline_idx];
     map_entries.push_back(key.specialization_constants.generate_map_entries());
 
     specialization_infos.push_back(VkSpecializationInfo{
@@ -499,9 +523,20 @@
         key.specialization_constants.data(), // pData
     });
 
+    const void* shader_stage_pnext = nullptr;
+#ifdef VK_EXT_subgroup_size_control
+    if (key.required_subgroup_size > 0u) {
+      required_subgroup_size_infos[pipeline_idx] = {
+          VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT,
+          nullptr,
+          key.required_subgroup_size};
+      shader_stage_pnext = &required_subgroup_size_infos[pipeline_idx];
+    }
+#endif /* VK_EXT_subgroup_size_control */
+
     shader_stage_create_infos.push_back(VkPipelineShaderStageCreateInfo{
         VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType
-        nullptr, // pNext
+        shader_stage_pnext, // pNext
         0u, // flags
         VK_SHADER_STAGE_COMPUTE_BIT, // stage
         key.shader_module, // module
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h
index 67dfaebe75b..5e286ad77d7 100644
--- a/backends/vulkan/runtime/vk_api/Pipeline.h
+++ b/backends/vulkan/runtime/vk_api/Pipeline.h
@@ -157,6 +157,14 @@ class ComputePipeline final {
     VkPipelineLayout pipeline_layout;
     VkShaderModule shader_module;
     SpecVarList specialization_constants;
+    // Optional: when nonzero, the pipeline is created with
+    // VkPipelineShaderStageRequiredSubgroupSizeCreateInfo chained into the
+    // shader stage pNext, locking the subgroup size to this value. Must be a
+    // power of two within [adapter.min_subgroup_size(),
+    // adapter.max_subgroup_size()] and the adapter must support
+    // VK_EXT_subgroup_size_control with VK_SHADER_STAGE_COMPUTE_BIT in the
+    // required_subgroup_size_stages mask. 0 = no requirement.
+    uint32_t required_subgroup_size = 0u;
   };
 
   explicit ComputePipeline(VkDevice device, VkPipeline handle);
@@ -281,6 +289,9 @@ class ComputePipelineCache final {
         seed = utils::hash_combine(seed, new_seed);
       }
 
+      seed = utils::hash_combine(
+          seed, std::hash<uint32_t>()(descriptor.required_subgroup_size));
+
       return seed;
     }
   };
diff --git a/backends/vulkan/runtime/vk_api/Shader.cpp b/backends/vulkan/runtime/vk_api/Shader.cpp
index c932d0a264b..f55cc88e5c3 100644
--- a/backends/vulkan/runtime/vk_api/Shader.cpp
+++ b/backends/vulkan/runtime/vk_api/Shader.cpp
@@ -34,7 +34,8 @@ ShaderInfo::ShaderInfo(
     const bool requires_8bit_storage_ext,
     const bool requires_integer_dot_product_ext,
     const bool requires_shader_int64_ext,
-    const bool requires_shader_float64_ext)
+    const bool requires_shader_float64_ext,
+    const uint32_t required_subgroup_size_arg)
     : src_code{
           spirv_bin,
           size,
@@ -47,7 +48,8 @@ ShaderInfo::ShaderInfo(
       requires_8bit_storage(requires_8bit_storage_ext),
       requires_integer_dot_product(requires_integer_dot_product_ext),
       requires_shader_int64(requires_shader_int64_ext),
-      requires_shader_float64(requires_shader_float64_ext) {
+      requires_shader_float64(requires_shader_float64_ext),
+      required_subgroup_size(required_subgroup_size_arg) {
 }
 
 bool operator==(const ShaderInfo& _1, const ShaderInfo& _2) {
diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h
index 6311710f02b..6cef4d923e9 100644
--- a/backends/vulkan/runtime/vk_api/Shader.h
+++ b/backends/vulkan/runtime/vk_api/Shader.h
@@ -69,6 +69,13 @@ struct ShaderInfo final {
   bool requires_shader_int64 = false;
   bool requires_shader_float64 = false;
 
+  // Subgroup size requirement declared in the shader's yaml.
+  // 0 = no requirement (default)
+  // >0 = literal fixed size; pipeline is pinned to this subgroup size.
+  // Sourced from the yaml's `SUBGROUP_SIZE` template parameter — single
+  // source of truth shared with GLSL ${SUBGROUP_SIZE} substitution.
+  uint32_t required_subgroup_size = 0u;
+
   explicit ShaderInfo();
 
   explicit ShaderInfo(
@@ -82,7 +89,8 @@
     const bool requires_8bit_storage_ext,
     const bool requires_integer_dot_product_ext,
     const bool requires_shader_int64_ext,
-    const bool requires_shader_float64_ext);
+    const bool requires_shader_float64_ext,
+    const uint32_t required_subgroup_size_arg = 0u);
 
   operator bool() const {
     return src_code.bin != nullptr;