Qualcomm AI Engine Direct - Adding QNN backend support for the addmm core ATen op (#20355)

qti-horodnic · web-flow · commit 66884b4c2c5e · 2026-06-18T09:04:12.000-07:00
### Summary
Added full support for the `aten.addmm` core ATen op via a two-pass
decomposition strategy:

1. `AddmmToLinearTransform` (ExecuTorch shared pass): Converts the
common `nn.Linear` decomposition pattern (`addmm(bias, input,
weight.T)`) back to `aten.linear`, mapping to QNN's fused
`FullyConnected` op for optimal performance.

2. `DecomposeAddmm` (new pass): Handles remaining standalone `addmm`
nodes by decomposing them into `mm + add`. Supports non-unit
`alpha`/`beta` scalars via additional `mul` nodes.

`AddmmToLinearTransform` alone is not sufficient because it only handles
the subset of `addmm` nodes that match the `nn.Linear` decomposition
pattern, specifically where `args[2]` is a transposed weight (`t_copy`
or `permute_copy`).
Standalone `addmm(bias, A, B)` calls where `B` is not transposed are
explicitly skipped by that pass. `DecomposeAddmm` serves as the fallback
for these cases.


Also made some small improvements to the `new_op_development` skill
based on recent learnings.

### Test plan
```
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedOperator.test_qnn_backend_addmm --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android

python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNFloatingPointOperator.test_qnn_backend_addmm --model SM8750 --host aisw-vm15-labsd --device 545ee4aa --build_folder build-android
```
diff --git a/.claude/skills/qualcomm/new_op_development.md b/.claude/skills/qualcomm/new_op_development.md
@@ -217,8 +217,17 @@ class DecomposeMyOp(ExportPass):
 
 ### Registration (all decompose passes)
 1. `_passes/__init__.py` — import + `__all__`
-2. `_passes/qnn_pass_manager.py` — import + `transform_for_annotation_pipeline` + `transform_for_export_pipeline` + `get_capture_program_passes`
-3. `_passes/utils.py` — add to `get_passes_dependency_for_capture_program()` with `[RemoveRedundancy]` dependency
+2. `_passes/qnn_pass_manager.py` — The pass manager uses classmethods for pipeline definitions:
+   - **Import** — add to the import block at top of file
+   - **`get_annotation_passes()`** — add pass class to the returned list (runs before quantizer, ATen IR)
+   - **`get_export_passes()`** — add pass class if needed for float-only path (runs after quantization, before to-edge)
+   - **`get_default_pass_activations()`** — add `(PassClass, True)` ONLY if the pass also needs to run in the to-edge pipeline
+   - **`get_passes_dependency_for_capture_program()`** — add `PassClass: [RemoveRedundancy]` dependency ONLY if also in `get_default_pass_activations`
+
+**When to add to which pipeline:**
+- **Annotation only** (most common for decompose passes): `get_annotation_passes()` — pass decomposes the op before the quantizer sees it
+- **Export pipeline** too: if the float-only test fails without it (op doesn't get handled by PyTorch's built-in decomposition during to-edge)
+- **Capture program** (to-edge) too: if the op can appear in edge dialect and needs decomposition there (e.g., `DecomposeVar`, `DecomposeCDist`, `DecomposeDiagonal`)
 
 ---
 
@@ -255,4 +264,4 @@ class DecomposeMyOp(ExportPass):
 
 **Native QNN Op:** `qnn_constants.py` → `op_my_op.py` → `builders/__init__.py` → `htp_rules.py` → `lpai_rules.py` → `layout_transform.py` → `tests/models.py` → `test_qnn_delegate.py` → `partition/utils.py` (skip decomp) → `common_defs.py` (remove to_be_implemented) → `builders/README.md`
 
-**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (annotation + export + capture) → `_passes/utils.py` (dependency) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md`
+**Decompose Pass:** `_passes/decompose_my_op.py` → `_passes/__init__.py` → `qnn_pass_manager.py` (`get_annotation_passes` + optionally `get_export_passes`; if also needed in to-edge: `get_default_pass_activations` + `get_passes_dependency_for_capture_program`) → `tests/models.py` → `test_qnn_delegate.py` → `common_defs.py` → `builders/README.md`
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
@@ -14,6 +14,7 @@
 from .convert_mha_to_sha import ConvertMhaToSha
 from .convert_square_to_pow import ConvertSquareToPow
 from .decompose_acos import DecomposeAcos
+from .decompose_addmm import DecomposeAddmm
 from .decompose_any import DecomposeAny
 from .decompose_atan2 import DecomposeAtan2
 from .decompose_binary_alpha import DecomposeBinaryAlpha
@@ -76,6 +77,7 @@
     ConvertMhaToSha,
     ConvertSquareToPow,
     DecomposeAcos,
+    DecomposeAddmm,
     DecomposeAny,
     DecomposeAtan2,
     DecomposeBinaryAlpha,
diff --git a/backends/qualcomm/_passes/decompose_addmm.py b/backends/qualcomm/_passes/decompose_addmm.py
@@ -0,0 +1,115 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_meta, get_const_node
+
+
+class DecomposeAddmm(ExportPass):
+    """
+    Decompose addmm into mm + add (with optional mul for non-unit alpha/beta).
+        addmm(bias, input, mat2, beta=1, alpha=1) = beta * bias + alpha * (input @ mat2)
+
+    For the common case (alpha=1, beta=1): addmm(bias, input, mat2) = mm(input, mat2) + bias
+
+    Note: This pass serves as a fallback for standalone addmm nodes that are NOT
+    handled by the ExecuTorch-provided pass AddmmToLinearTransform.
+    Any remaining addmm nodes (e.g., with non-transposed mat2) are decomposed here into mm + add.
+
+    Both the ATen and the edge-dialect addmm overloads are matched; replacement
+    nodes are created in the same dialect as the node being decomposed.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Targets rewritten by this pass (ATen IR and edge dialect).
+        self.addmm_targets = {
+            torch.ops.aten.addmm.default,
+            exir_ops.edge.aten.addmm.default,
+        }
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        """
+        Replace every addmm node in ``graph_module`` with mm (+ mul) + add nodes.
+
+        Args:
+            graph_module: module whose graph is rewritten in place.
+
+        Returns:
+            PassResult wrapping the recompiled ``graph_module``; its flag is always True.
+        """
+        graph = graph_module.graph
+
+        for node in list(graph.nodes):
+            if node.op != "call_function" or node.target not in self.addmm_targets:
+                continue
+
+            # Emit the replacement ops in the dialect of the matched node.
+            is_edge = isinstance(node.target, EdgeOpOverload)
+            aten = exir_ops.edge.aten if is_edge else torch.ops.aten
+            mm_op, add_op, mul_op = aten.mm.default, aten.add.Tensor, aten.mul.Tensor
+
+            bias_node, input_node, mat2_node = node.args[:3]
+            # beta and alpha are read from kwargs and default to 1.
+            beta = node.kwargs.get("beta", 1)
+            alpha = node.kwargs.get("alpha", 1)
+            meta = node.meta
+
+            with graph.inserting_before(node):
+                # mm_result = alpha * (input @ mat2)
+                mm_result = graph.create_node(
+                    "call_function", mm_op, (input_node, mat2_node)
+                )
+                mm_result.meta = copy_meta(meta)
+                if alpha != 1:
+                    alpha_node = get_const_node(
+                        graph,
+                        graph_module,
+                        f"{node.name}_alpha",
+                        alpha,
+                        mm_result,
+                    )
+                    mm_result = graph.create_node(
+                        "call_function", mul_op, (mm_result, alpha_node)
+                    )
+                    mm_result.meta = copy_meta(meta)
+
+                # bias_result = beta * bias
+                bias_result = bias_node
+                if beta != 1:
+                    beta_const = get_const_node(
+                        graph,
+                        graph_module,
+                        f"{node.name}_beta",
+                        beta,
+                        bias_node,
+                    )
+                    bias_result = graph.create_node(
+                        "call_function", mul_op, (bias_node, beta_const)
+                    )
+                    bias_result.meta = copy_meta(meta)
+                    # beta * bias keeps the bias' own shape, which may only be
+                    # broadcastable to (not equal to) the addmm output shape, so
+                    # the output's "val" must not be reused for this node.
+                    if "val" in bias_node.meta:
+                        bias_result.meta["val"] = bias_node.meta["val"]
+
+                # result = mm_result + bias_result
+                add_node = graph.create_node(
+                    "call_function", add_op, (mm_result, bias_result)
+                )
+                add_node.meta = copy_meta(meta)
+
+            node.replace_all_uses_with(add_node)
+
+        # The decomposed addmm nodes are now unused; drop them from the graph.
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -20,6 +20,7 @@
     ConvertMhaToSha,
     ConvertSquareToPow,
     DecomposeAcos,
+    DecomposeAddmm,
     DecomposeAny,
     DecomposeAtan2,
     DecomposeBinaryAlpha,
@@ -122,6 +123,7 @@ def get_default_pass_activations(cls):
             (AnnotateUnbind, True),
             (ConvertBmmToMatmul, False),
             (DecomposeAcos, True),
+            (DecomposeAddmm, True),
             (DecomposeAny, True),
             (DecomposeAtan2, True),
             (DecomposeColIm, True),
@@ -160,6 +162,7 @@ def get_annotation_passes(cls):
             RecomposeRmsNorm,
             ReplaceArangeArgs,
             DecomposeAcos,
+            DecomposeAddmm,
             DecomposeAtan2,
             DecomposeBinaryAlpha,
             DecomposeCDist,
@@ -275,6 +278,7 @@ def get_passes_dependency_for_capture_program(cls):
             AnnotateUnbind: [RemoveRedundancy],
             ConvertBmmToMatmul: [RecomposePixelUnshuffle],
             DecomposeAcos: [RemoveRedundancy],
+            DecomposeAddmm: [RemoveRedundancy],
             DecomposeAny: [RemoveRedundancy],
             DecomposeAtan2: [RemoveRedundancy],
             DecomposeColIm: [FoldQDQ],
diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md
@@ -498,6 +498,7 @@ The following PyTorch operators are supported through decomposition or annotatio
 | PyTorch Op | Decomposition Pass |
 |---|---|
 | `aten.acos` | `DecomposeAcos` |
+| `aten.addmm` | `DecomposeAddmm` |
 | `aten.adaptive_avg_pool1d`, `aten.avg_pool1d` | `AnnotateAvgPool1D` |
 | `aten.any` | `DecomposeAny` |
 | `aten.atan2.default`, `aten.atan2.out` | `DecomposeAtan2` |
diff --git a/backends/qualcomm/quantizer/annotators/htp_rules.py b/backends/qualcomm/quantizer/annotators/htp_rules.py
@@ -1077,7 +1077,11 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
 
 
 @register_annotator(
-    [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default],
+    [
+        torch.ops.aten.bmm.default,
+        torch.ops.aten.matmul.default,
+        torch.ops.aten.mm.default,
+    ],
     QnnConstants.OpMatMul.op_name,
 )
 class MatMul(GeneralOpDef):
diff --git a/backends/qualcomm/quantizer/annotators/lpai_rules.py b/backends/qualcomm/quantizer/annotators/lpai_rules.py
@@ -601,7 +601,11 @@ def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
 
 
 @register_annotator(
-    [torch.ops.aten.bmm.default, torch.ops.aten.matmul.default],
+    [
+        torch.ops.aten.bmm.default,
+        torch.ops.aten.matmul.default,
+        torch.ops.aten.mm.default,
+    ],
     QnnConstants.OpMatMul.op_name,
 )
 class MatMul(GeneralOpDef):
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -144,6 +144,16 @@ def forward(self, x):
         return 10 + x
 
 
+# Test model: forwards its three inputs to torch.addmm with fixed alpha/beta.
+class AddMM(torch.nn.Module):
+    def __init__(self, alpha=1, beta=1):
+        super().__init__()
+        self.alpha, self.beta = alpha, beta
+
+    def forward(self, bias, input, mat2):
+        return torch.addmm(bias, input, mat2, alpha=self.alpha, beta=self.beta)
+
+
 class Any(torch.nn.Module):
     def __init__(self, dim=None, keepdim=False):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -190,6 +190,30 @@ def test_qnn_backend_adaptive_max_pool2d(self):
             with self.subTest(i=i):
                 self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_addmm(self):
+        """Lower AddMM with default and with non-unit alpha/beta scalars."""
+        # bias (8,) is added to the (4, 8) product of input (4, 3) and mat2 (3, 8).
+        test_comb = [
+            {
+                QCOM_MODULE: [module],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
+                ],
+            }
+            for module in (AddMM(), AddMM(alpha=2, beta=3))  # noqa: F405
+        ]
+
+        # Flatten to (module, sample_input) pairs; subtests are numbered in order.
+        cases = (
+            (module, sample_input)
+            for comb in test_comb
+            for module in comb[QCOM_MODULE]
+            for sample_input in comb[QCOM_SAMPLE_INPUTS]
+        )
+        for index, (module, sample_input) in enumerate(cases):
+            with self.subTest(i=index):
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_alias(self):
         module = Alias()  # noqa: F405
         sample_input = (torch.randn(1, 10),)
@@ -2969,6 +2993,31 @@ def test_qnn_backend_adaptive_max_pool2d(self):
                 module_one = self.get_qdq_module(module, sample_input)
                 self.lower_module_and_test_output(module_one, sample_input)
 
+    def test_qnn_backend_addmm(self):
+        """Lower AddMM after get_qdq_module (default and non-unit alpha/beta)."""
+        # bias (8,) is added to the (4, 8) product of input (4, 3) and mat2 (3, 8).
+        test_comb = [
+            {
+                QCOM_MODULE: [module],
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(8), torch.randn(4, 3), torch.randn(3, 8)),
+                ],
+            }
+            for module in (AddMM(), AddMM(alpha=2, beta=3))  # noqa: F405
+        ]
+
+        # Flatten to (module, sample_input) pairs; subtests are numbered in order.
+        cases = (
+            (module, sample_input)
+            for comb in test_comb
+            for module in comb[QCOM_MODULE]
+            for sample_input in comb[QCOM_SAMPLE_INPUTS]
+        )
+        for index, (module, sample_input) in enumerate(cases):
+            with self.subTest(i=index):
+                qdq_module = self.get_qdq_module(module, sample_input)
+                self.lower_module_and_test_output(qdq_module, sample_input)
+
     def test_qnn_backend_alias(self):
         module = Alias()  # noqa: F405
         sample_input = (torch.randn(1, 10),)