Arm backend: fix composable quantizer quantizing elementwise constants as weights and leaving IO-boundary shared clusters in float

rascani · facebook-github-bot · commit 89d57c445bea · 2026-06-02T17:01:09.000-07:00
Summary:
Two correctness fixes for the composable TOSA quantizer (enabled via `use_composable_quantizer=True`).

1. Parameter operands of ops other than conv/linear were quantized as weights.

In `annotate_match`, the first parameter input of any matched op was assigned the weight qspec. Only conv and linear ops have true weight/bias operands; for any other op (for example an elementwise `add`/`sub`), a constant/parameter operand is an ordinary activation operand. Quantizing it with the weight dtype while the other operand and the output use the activation dtype produces a graph that cannot be lowered for ops whose TOSA implementation requires both operands to share a dtype (`add`/`sub`), and silently demotes constants in 16A8W (int16-activation) configurations to int8. Weight/bias classification is now restricted to the ops that actually have them; parameter inputs of all other ops receive the input-activation qspec.

2. Shared-op clusters on the quantized IO boundary were left in float.

`SharedQspecQuantizer` only propagates a qspec from an already-quantized neighbor. A cluster of shared/no-arithmetic ops (for example `cat` and view/reshape ops) whose only quantized neighbors are a uint8 IO input (deliberately skipped so uint8 stays confined to the IO boundary) and/or an input placeholder carrying an empty annotation has no qspec to propagate, so the cluster was rejected and remained in float, falling off the integer delegate onto the CPU. Such clusters now initiate quantization from the global config's input-activation qspec when they sit on the quantized IO boundary, while still keeping uint8 confined to the IO boundary.

This change was developed with assistance from Claude.

Differential Revision: D107320847
diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py
@@ -1220,6 +1220,10 @@ def set_global(
             quantization_config, node_finder, self.pattern_matcher
         )
         self.global_config = quantization_config
+        # Let the shared-qspec pass initiate quantization from the global config
+        # for shared-op clusters that sit on the quantized I/O boundary but have
+        # no neighbor qspec to propagate (see SharedQspecQuantizer).
+        self.shared_qspec_quantizer.global_config = quantization_config
         return self
 
     def set_node_target(
diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py
@@ -336,14 +336,22 @@ def annotate_match(
                         f"{node.name} is not expected to not have parameter tensors but found {[n.name for n in params]}, which may cause unexpected quantization annotations."
                     )
 
+            # Only ops in ``parameter_targets`` (conv/linear) have true weight and
+            # bias operands. For any other op, a parameter input is a constant
+            # activation operand and must be quantized with the input-activation
+            # qspec, not the weight qspec. Treating it as a weight breaks ops whose
+            # TOSA implementation requires both operands to share a dtype (e.g.
+            # add/sub), and silently demotes constants in int16 (16a8w) modules to
+            # int8 -- the weight qspec ``dtype`` differs from the activation dtype.
+            has_weight_and_bias = node.target in parameter_targets
             for input_node in node.all_input_nodes:
                 if not has_float_output(input_node):
                     continue
-                if self.is_weight(input_node, params, model):
+                if has_weight_and_bias and self.is_weight(input_node, params, model):
                     input_qspec_map[input_node] = (
                         config.get_weight_qspec(node) if config else None
                     )
-                elif self.is_bias(input_node, params, model):
+                elif has_weight_and_bias and self.is_bias(input_node, params, model):
                     input_qspec_map[input_node] = (
                         config.get_bias_qspec(node) if config else None  # type: ignore[assignment]
                     )
@@ -481,6 +489,10 @@ class SharedQspecQuantizer(Quantizer, QuantizerReporterUser):
     def __init__(self, targets: Optional[list[Callable[..., object]]] = None) -> None:
         super().__init__()
         QuantizerReporterUser.__init__(self)
+        # Optional global config used to *initiate* quantization for shared-op
+        # clusters that sit on the quantized I/O boundary but have no neighbor
+        # qspec to propagate. Set by the composable quantizer's set_global().
+        self.global_config: Optional[QuantizationConfig] = None
         if targets is None:
             self.targets = self.SHARED_QSPEC_OPS_DEFAULT
             self.support_config_path = (
@@ -552,10 +564,24 @@ def _append_input_qspec(
             return
         adjacent_qspecs.append(input_qspec)
 
-    def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]:
+    def _is_quantized_io_boundary(self, node: Node) -> bool:
+        """Return True if ``node`` is a model input/output the quantizer annotated.
+
+        Such a node sits on the quantized interface, but its qspec is often
+        filtered out of shared-cluster propagation -- a uint8 IO qspec is skipped
+        by ``_skip_shared_qspec_from_io``, and an input-state placeholder may
+        carry an annotation with no ``output_qspec``. Its presence still signals
+        that the cluster is on the quantized data path.
+        """
+        return node.op in ("placeholder", "output") and self._is_annotated(node)
+
+    def _get_shared_clique(
+        self, root_node: Node
+    ) -> tuple[set[Node], list[Any], bool]:
         shared_nodes = set()
         bfs_queue = [root_node]
         adjacent_qspecs: list[Any] = []
+        touches_quantized_io = False
 
         while bfs_queue:
             node = bfs_queue.pop(0)
@@ -564,12 +590,14 @@ def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]:
             for input_node in node.all_input_nodes:
                 self._maybe_enqueue_shared_node(input_node, shared_nodes, bfs_queue)
                 self._append_output_qspec(input_node, adjacent_qspecs)
+                touches_quantized_io |= self._is_quantized_io_boundary(input_node)
 
             for output_node in node.users.keys():
                 self._maybe_enqueue_shared_node(output_node, shared_nodes, bfs_queue)
                 self._append_input_qspec(output_node, node, adjacent_qspecs)
+                touches_quantized_io |= self._is_quantized_io_boundary(output_node)
 
-        return shared_nodes, adjacent_qspecs
+        return shared_nodes, adjacent_qspecs, touches_quantized_io
 
     def _annotate_shared_cluster(self, root_node: Node) -> None:
         if (
@@ -588,7 +616,27 @@ def _annotate_shared_cluster(self, root_node: Node) -> None:
             )
             return
 
-        shared_nodes, adjacent_qspecs = self._get_shared_clique(root_node)
+        shared_nodes, adjacent_qspecs, touches_quantized_io = self._get_shared_clique(
+            root_node
+        )
+
+        # If there is no neighbor qspec to propagate but the cluster sits on the
+        # quantized I/O boundary (e.g. a state-passthrough ``cat`` whose only
+        # neighbors are a uint8 model input -- skipped by
+        # _skip_shared_qspec_from_io -- and an input-state placeholder), initiate
+        # quantization from the global config rather than leaving the cluster in
+        # float. Otherwise such clusters fall off the integer delegate onto the
+        # CPU. Initiating from the global (internal) config keeps uint8 confined
+        # to the IO boundary.
+        if (
+            len(adjacent_qspecs) == 0
+            and touches_quantized_io
+            and self.global_config is not None
+        ):
+            global_input_qspec = self.global_config.get_input_act_qspec()
+            if global_input_qspec is not None:
+                adjacent_qspecs = [global_input_qspec]
+
         node_order = {node: index for index, node in enumerate(root_node.graph.nodes)}
         ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0))