pytorch
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/pull.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/trunk.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 9 additions & 6 deletions b/‎Makefile‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/remove_permutes_around_elementwise_tosa_ops.py‎
Lines changed: 12 additions & 5 deletions b/‎backends/arm/_passes/remove_permutes_around_elementwise_tosa_ops.py‎
Lines changed: 12 additions & 5 deletions
diff --git a/‎backends/arm/ao_ext/ops/mxfp_conv2d_op.py‎
Lines changed: 16 additions & 1 deletion b/‎backends/arm/ao_ext/ops/mxfp_conv2d_op.py‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎backends/arm/test/misc/test_mxfp_conv2d_ao.py‎
Lines changed: 82 additions & 0 deletions b/‎backends/arm/test/misc/test_mxfp_conv2d_ao.py‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎backends/arm/test/misc/test_transpose_counts.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/test/misc/test_transpose_counts.py‎
Lines changed: 1 addition & 1 deletion
@@ -852,7 +852,8 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
+        # TODO(T12345): re-enable qnn_16a16w once OOM on linux.2xlarge is resolved
+        pt2e_quantize: [qnn_8a8w]
         mode: [qnn]
       fail-fast: false
     with:
 
@@ -951,7 +951,8 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
+        # TODO(T12345): re-enable qnn_16a16w once OOM on linux.2xlarge is resolved
+        pt2e_quantize: [qnn_8a8w]
         mode: [qnn]
       fail-fast: false
     with:
 
@@ -127,8 +127,8 @@ help:
 	@echo "  llava-cpu           - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda         - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
-	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
-	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
+	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner and worker with CUDA backend"
+	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner and worker with MLX backend"
 	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  qwen3_5_moe-mlx     - Build Qwen3.5 MoE runner with MLX backend"
@@ -444,20 +444,23 @@ qwen3_5_moe-cuda:
 gemma4_31b-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
-	@echo "==> Building Gemma 4 31B runner with CUDA..."
+	@echo "==> Building Gemma 4 31B runner, worker, and no-bleed test with CUDA..."
 	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-cuda
 	@echo ""
 	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
+	@echo "  Test:   cmake-out/examples/models/gemma4_31b/test_gemma4_31b_nobleed"
 
 gemma4_31b-mlx:
 	@echo "==> Building and installing ExecuTorch with MLX..."
 	cmake --workflow --preset mlx-release
-	@echo "==> Building Gemma 4 31B runner with MLX..."
+	@echo "==> Building Gemma 4 31B runner and worker with MLX..."
 	cd examples/models/gemma4_31b && cmake --workflow --preset gemma4-31b-mlx
 	@echo ""
 	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Runner: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
+	@echo "  Worker: cmake-out/examples/models/gemma4_31b/gemma4_31b_worker"
 
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."
 
@@ -618,7 +618,7 @@ def _tosa_pipeline(
                 RewriteMatmulPass(),
                 RewritePadPass(),
                 FuseViewCopyTransformPass(),
-                RemovePermutesAroundElementwiseTosaOps(),
+                RemovePermutesAroundElementwiseTosaOps(exported_program),
                 CanonicalizeViewCopyPermutePass(),
                 FuseCascadedTransposeOrPermuteOps(),
                 RewriteHighRankSingletonPermutePass(),
 
@@ -3,15 +3,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import torch
+
+from executorch.backends.arm._passes.arm_pass_utils import is_param_node
 from executorch.backends.arm._passes.insert_table_ops import TableOps
 from executorch.backends.transforms.remove_permutes_around_elementwise_ops import (
     RemovePermutesAroundElementwiseOps,
 )
+from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 
 
 class RemovePermutesAroundElementwiseTosaOps(RemovePermutesAroundElementwiseOps):
-    def __init__(self) -> None:
+    def __init__(self, exported_program: ExportedProgram) -> None:
         super().__init__(
             extra_permutable_ops={
                 *TableOps.unary_table_ops.keys(),
@@ -20,16 +24,19 @@ def __init__(self) -> None:
                 exir_ops.backend.tosa.TABLE.default,
             }
         )
+        self.exported_program = exported_program
+
+    def _is_constant(self, node: torch.fx.Node) -> bool:
+        # Override fragile string match check with exported program check
+        return super()._is_constant(node) or is_param_node(self.exported_program, node)
 
     def permute_subgraph(self, subgraph) -> bool:
-        # Original function will always permute constant nodes which is wrong for table ops
-        # Remove constant tosa.TABLE edges before running full function
+        # TABLE lookup inputs are already tied to the table layout.
         new_constant_edges_in = set()
         for const_node, user_node in subgraph.constant_edges_in:
             if user_node.target == exir_ops.backend.tosa.TABLE.default:
                 continue
-            else:
-                new_constant_edges_in.add((const_node, user_node))
+            new_constant_edges_in.add((const_node, user_node))
 
         subgraph.constant_edges_in = new_constant_edges_in
         return super().permute_subgraph(subgraph)
@@ -32,6 +32,12 @@
 )
 
 
+_SUPPORTED_OUTPUT_DTYPES: set[torch.dtype] = {
+    torch.float32,
+    torch.bfloat16,
+}
+
+
 def _get_mx_elem_dtype(
     weight_qdata: torch.Tensor,
     weight_payload_dtype: str = "",
@@ -208,10 +214,12 @@ def __init__(
         groups: int,
         weight_dtype: MXFPDType,
         block_size: int,
+        output_dtype: torch.dtype = torch.float32,
     ) -> None:
         super().__init__()
         self.weight_dtype = mxfp_dtype_to_str(weight_dtype)
         self.block_size = block_size
+        self.output_dtype = output_dtype
 
         self.register_buffer("weight_qdata", weight_qdata, persistent=True)
         self.register_buffer("weight_scale", weight_scale, persistent=True)
@@ -233,7 +241,7 @@ def __init__(
         self.groups = groups
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return torch.ops.tosa_mxfp.conv2d.default(
+        output = torch.ops.tosa_mxfp.conv2d.default(
             x,
             self.weight_qdata,
             self.weight_scale,
@@ -245,6 +253,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self.block_size,
             self.weight_dtype,
         )
+        if self.output_dtype != torch.float32:
+            output = output.to(self.output_dtype)
+        return output
 
 
 def transform_conv2d_to_mxfp(
@@ -276,6 +287,9 @@ def transform_conv2d_to_mxfp(
     )
 
     bias = module.bias.detach().to(torch.float32) if module.bias is not None else None
+    output_dtype = weight_ohwi.dtype
+    if output_dtype not in _SUPPORTED_OUTPUT_DTYPES:
+        raise ValueError(f"Unsupported output_dtype: {output_dtype}")
     return MXFPConv2dOp(
         weight_qdata,
         weight_scale,
@@ -286,4 +300,5 @@ def transform_conv2d_to_mxfp(
         module.groups,
         config.weight_dtype,
         config.block_size,
+        output_dtype,
     )
@@ -159,6 +159,61 @@ def test_mxfp_conv2d_quantize_supports_fp4_weights() -> None:
     )
 
 
+def test_mxfp_conv2d_preserves_bfloat16_output_dtype() -> None:
+    model = Conv2dModule().eval().to(torch.bfloat16)
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
+    )
+
+    output = model(torch.randn(1, IN_CHANNELS, 8, 8, dtype=torch.bfloat16))
+
+    assert isinstance(model.conv, MXFPConv2dOp)
+    assert model.conv.output_dtype == torch.bfloat16
+    assert output.dtype == torch.bfloat16
+
+
+def test_mxfp_conv2d_op_output_dtype_constructor_arg() -> None:
+    model = Conv2dModule().eval()
+    config = MXFPOpConfig(weight_dtype=torch.float8_e4m3fn)
+    to_mxfp(
+        model,
+        config,
+    )
+    assert isinstance(model.conv, MXFPConv2dOp)
+
+    fp32_conv = MXFPConv2dOp(
+        model.conv.weight_qdata,
+        model.conv.weight_scale,
+        model.conv.bias,
+        model.conv.stride,
+        model.conv.padding,
+        model.conv.dilation,
+        model.conv.groups,
+        config.weight_dtype,
+        config.block_size,
+    )
+    bf16_conv = MXFPConv2dOp(
+        model.conv.weight_qdata,
+        model.conv.weight_scale,
+        model.conv.bias,
+        model.conv.stride,
+        model.conv.padding,
+        model.conv.dilation,
+        model.conv.groups,
+        config.weight_dtype,
+        config.block_size,
+        output_dtype=torch.bfloat16,
+    )
+
+    test_input = torch.randn(1, IN_CHANNELS, 8, 8)
+
+    assert fp32_conv.output_dtype == torch.float32
+    assert fp32_conv(test_input).dtype == torch.float32
+    assert bf16_conv.output_dtype == torch.bfloat16
+    assert bf16_conv(test_input).dtype == torch.bfloat16
+
+
 def _test_mxfp_conv2d_export_preserves_custom_op(config: MXFPOpConfig) -> None:
     model = Conv2dModule().eval()
     to_mxfp(model, config)
@@ -198,6 +253,33 @@ def test_mxfp6_e3m2_conv2d_export_preserves_custom_op() -> None:
     )
 
 
+def test_mxfp_conv2d_export_preserves_inferred_bfloat16_output_dtype() -> None:
+    model = Conv2dModule().eval().to(torch.bfloat16)
+    to_mxfp(
+        model,
+        MXFPOpConfig(weight_dtype=torch.float8_e4m3fn),
+    )
+
+    exported = export(
+        model,
+        (torch.randn(1, IN_CHANNELS, 8, 8, dtype=torch.bfloat16),),
+        strict=False,
+    )
+
+    cast_nodes = [
+        node
+        for node in exported.graph_module.graph.nodes
+        if node.op == "call_function" and node.target == torch.ops.aten.to.dtype
+    ]
+
+    assert len(cast_nodes) == 1
+    assert cast_nodes[0].args[1] == torch.bfloat16
+    assert cast_nodes[0].meta["val"].dtype == torch.bfloat16
+    cast_input = cast_nodes[0].args[0]
+    assert isinstance(cast_input, torch.fx.Node)
+    assert cast_input.target == torch.ops.tosa_mxfp.conv2d.default
+
+
 def test_mxfp_conv2d_cpu_impl_matches_ref() -> None:
     ref_model = Conv2dModule().eval()
     test_model = Conv2dModule().eval()
 
@@ -453,7 +453,7 @@ def forward(self, x: torch.Tensor):
         Model4ConvLstmLinearLayerNorm(), (torch.randn(2, 8, 32),), 3
     ),
     "model_5_dwconv_gelu_layernorm_avgpool": TransposeCountCase(
-        Model5DwConvGeluLayerNormAvgPool(), (torch.randn(1, 8, 16, 16),), 4
+        Model5DwConvGeluLayerNormAvgPool(), (torch.randn(1, 8, 16, 16),), 2
     ),
     "model_6_gru_linear": TransposeCountCase(
         Model6GruLinear(), (torch.randn(2, 16, 8),), 2