Quantize moveaxis/movedim so they delegate to Ethos-U (#20314)

apullin · facebook-github-bot · commit c82224726170 · 2026-06-18T08:15:20.000-07:00
Summary:

The ARM PT2 quantizer's pass-through shared-qspec set in quantization_annotator.py
(_one_to_one_shared_input_qspec) covers permute/permute_copy/transpose/view/squeeze
etc., but omits aten.moveaxis/aten.movedim. A model that uses torch.moveaxis
therefore leaves those ops unquantized: the quantizer brackets each one with
dequantize -&gt; moveaxis(float) -&gt; quantize.

On lowering, moveaxis decomposes to a float permute_copy. The Ethos-U55
operator-support check (operator_support/ethos_u55_support.py) only delegates
permute_copy for int8/int16/int32, so it rejects the float one. Each rejected
permute is stranded on the host, splitting the model into many delegated
partitions (one NPU island per permute), which bloats the .pte with per-partition
delegate overhead and host round-trips.

Add aten.moveaxis.int / aten.movedim.int to _one_to_one_shared_input_qspec
(guarded with getattr for torch-build variance, mirroring the existing
transpose.Dimname handling) so they share the input quantization spec exactly like
transpose/permute. They then stay int8, decompose to int8 permute_copy, and
delegate to the NPU -- eliminating the host float islands.

Impact: a quantized example ensemble (ConvNeXt-style blocks that
use torch.moveaxis) that previously lowered into 9 Ethos-U55 partitions now lowers
into a single delegate, with zero host permutes and ~24% smaller .pte, with no
model changes. Generalizes to any moveaxis/movedim-using model on the Ethos-U
backend.

Differential Revision: D108478011
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
@@ -619,6 +619,10 @@ def _get_fixed_qparams_qspec(
     # dequant -> neg -> requant chain.
     torch.ops.aten.neg.default,
     torch.ops.aten.detach_copy.default,
+    torch.ops.aten.moveaxis.int,
+    torch.ops.aten.moveaxis.intlist,
+    torch.ops.aten.movedim.int,
+    torch.ops.aten.movedim.intlist,
 }
 
 # Dimname has been removed from upstream PyTorch, but there may be a window
@@ -630,6 +634,7 @@ def _get_fixed_qparams_qspec(
 if _transpose_dimname is not None:
     _one_to_one_shared_input_qspec.add(_transpose_dimname)
 
+
 _one_to_one_shared_input_or_input_act_qspec: set[OpOverload] = {
     torch.ops.aten.alias.default,
     torch.ops.aten.clone.default,
diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
@@ -9,6 +9,7 @@
 from typing import Tuple
 
 import torch
+from executorch.backends.arm.quantizer import quantization_annotator
 from executorch.backends.arm.quantizer.arm_quantizer import (
     get_symmetric_a16w8_quantization_config,
 )
@@ -78,6 +79,12 @@ def forward(self, x):
         return torch.permute(x, self.dims)
 
 
+class SimpleMoveAxis(torch.nn.Module):
+
+    def forward(self, x):
+        return torch.moveaxis(x, 1, -1)
+
+
 @common.parametrize(
     "test_data", test_data_suite | test_data_suite_fp16 | test_data_suite_bf16
 )
@@ -118,6 +125,29 @@ def test_permute_u55_INT(test_data):
     pipeline.run()
 
 
+def test_moveaxis_movedim_shared_qspec_annotations():
+    expected_ops = {
+        torch.ops.aten.moveaxis.int,
+        torch.ops.aten.moveaxis.intlist,
+        torch.ops.aten.movedim.int,
+        torch.ops.aten.movedim.intlist,
+    }
+
+    assert expected_ops <= quantization_annotator._one_to_one_shared_input_qspec
+
+
+@common.XfailIfNoCorstone300
+def test_moveaxis_u55_INT():
+    pipeline = EthosU55PipelineINT[input_t1](
+        SimpleMoveAxis(),
+        (torch.rand(1, 4, 5, 6),),
+        "torch.ops.aten.moveaxis.int",
+        exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default",
+        run_on_fvp=False,
+    )
+    pipeline.run()
+
+
 @common.parametrize("test_data", test_data_suite_u55_reject)
 def test_permute_u55_INT_not_delegated(test_data: torch.Tensor):
     test_data, dims = test_data()