Skip to content

Commit 89d57c4

Browse files
rascani and facebook-github-bot
authored and committed
Arm backend: fix composable quantizer leaving int16 elementwise constants and IO-boundary shared clusters unquantized
Summary: Two correctness fixes for the composable TOSA quantizer (enabled via `use_composable_quantizer=True`). 1. Parameter operands of non-conv/linear ops were quantized as weights. In `annotate_match`, the first parameter input of any matched op was assigned the weight qspec. Only conv and linear ops have true weight/bias operands; for any other op (for example an elementwise `add`/`sub`), a constant/parameter operand is an ordinary activation operand. Quantizing it with the weight dtype while the other operand and the output use the activation dtype produces a graph that cannot be lowered for ops whose TOSA implementation requires both operands to share a dtype (`add`/`sub`), and silently demotes constants in 16A8W (int16-activation) configurations to int8. Weight/bias classification is now restricted to the ops that actually have them; parameter inputs of all other ops receive the input-activation qspec. 2. Shared-op clusters on the quantized IO boundary were left in float. `SharedQspecQuantizer` only propagates a qspec from an already-quantized neighbor. A cluster of shared/no-arithmetic ops (for example `cat` and view/reshape ops) whose only quantized neighbors are a uint8 IO input (deliberately skipped so uint8 stays confined to the IO boundary) and/or an input placeholder carrying an empty annotation has no qspec to propagate, so the cluster was rejected and remained in float, falling off the integer delegate onto the CPU. Such clusters now initiate quantization from the global config's input-activation qspec when they sit on the quantized IO boundary, while still keeping uint8 confined to the IO boundary. This change was developed with assistance from Claude. Differential Revision: D107320847
1 parent 79cbc45 commit 89d57c4

2 files changed

Lines changed: 57 additions & 5 deletions

File tree

backends/arm/quantizer/arm_quantizer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,6 +1220,10 @@ def set_global(
12201220
quantization_config, node_finder, self.pattern_matcher
12211221
)
12221222
self.global_config = quantization_config
1223+
# Let the shared-qspec pass initiate quantization from the global config
1224+
# for shared-op clusters that sit on the quantized I/O boundary but have
1225+
# no neighbor qspec to propagate (see SharedQspecQuantizer).
1226+
self.shared_qspec_quantizer.global_config = quantization_config
12231227
return self
12241228

12251229
def set_node_target(

backends/arm/quantizer/arm_quantizer_utils.py

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -336,14 +336,22 @@ def annotate_match(
336336
f"{node.name} is not expected to not have parameter tensors but found {[n.name for n in params]}, which may cause unexpected quantization annotations."
337337
)
338338

339+
# Only ops in ``parameter_targets`` (conv/linear) have true weight and
340+
# bias operands. For any other op, a parameter input is a constant
341+
# activation operand and must be quantized with the input-activation
342+
# qspec, not the weight qspec. Treating it as a weight breaks ops whose
343+
# TOSA implementation requires both operands to share a dtype (e.g.
344+
# add/sub), and silently demotes constants in int16 (16a8w) modules to
345+
# int8 -- the weight qspec ``dtype`` differs from the activation dtype.
346+
has_weight_and_bias = node.target in parameter_targets
339347
for input_node in node.all_input_nodes:
340348
if not has_float_output(input_node):
341349
continue
342-
if self.is_weight(input_node, params, model):
350+
if has_weight_and_bias and self.is_weight(input_node, params, model):
343351
input_qspec_map[input_node] = (
344352
config.get_weight_qspec(node) if config else None
345353
)
346-
elif self.is_bias(input_node, params, model):
354+
elif has_weight_and_bias and self.is_bias(input_node, params, model):
347355
input_qspec_map[input_node] = (
348356
config.get_bias_qspec(node) if config else None # type: ignore[assignment]
349357
)
@@ -481,6 +489,10 @@ class SharedQspecQuantizer(Quantizer, QuantizerReporterUser):
481489
def __init__(self, targets: Optional[list[Callable[..., object]]] = None) -> None:
482490
super().__init__()
483491
QuantizerReporterUser.__init__(self)
492+
# Optional global config used to *initiate* quantization for shared-op
493+
# clusters that sit on the quantized I/O boundary but have no neighbor
494+
# qspec to propagate. Set by the composable quantizer's set_global().
495+
self.global_config: Optional[QuantizationConfig] = None
484496
if targets is None:
485497
self.targets = self.SHARED_QSPEC_OPS_DEFAULT
486498
self.support_config_path = (
@@ -552,10 +564,24 @@ def _append_input_qspec(
552564
return
553565
adjacent_qspecs.append(input_qspec)
554566

555-
def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]:
567+
def _is_quantized_io_boundary(self, node: Node) -> bool:
568+
"""Return True if ``node`` is a model input/output the quantizer annotated.
569+
570+
Such a node sits on the quantized interface, but its qspec is often
571+
filtered out of shared-cluster propagation -- a uint8 IO qspec is skipped
572+
by ``_skip_shared_qspec_from_io``, and an input-state placeholder may
573+
carry an annotation with no ``output_qspec``. Its presence still signals
574+
that the cluster is on the quantized data path.
575+
"""
576+
return node.op in ("placeholder", "output") and self._is_annotated(node)
577+
578+
def _get_shared_clique(
579+
self, root_node: Node
580+
) -> tuple[set[Node], list[Any], bool]:
556581
shared_nodes = set()
557582
bfs_queue = [root_node]
558583
adjacent_qspecs: list[Any] = []
584+
touches_quantized_io = False
559585

560586
while bfs_queue:
561587
node = bfs_queue.pop(0)
@@ -564,12 +590,14 @@ def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]:
564590
for input_node in node.all_input_nodes:
565591
self._maybe_enqueue_shared_node(input_node, shared_nodes, bfs_queue)
566592
self._append_output_qspec(input_node, adjacent_qspecs)
593+
touches_quantized_io |= self._is_quantized_io_boundary(input_node)
567594

568595
for output_node in node.users.keys():
569596
self._maybe_enqueue_shared_node(output_node, shared_nodes, bfs_queue)
570597
self._append_input_qspec(output_node, node, adjacent_qspecs)
598+
touches_quantized_io |= self._is_quantized_io_boundary(output_node)
571599

572-
return shared_nodes, adjacent_qspecs
600+
return shared_nodes, adjacent_qspecs, touches_quantized_io
573601

574602
def _annotate_shared_cluster(self, root_node: Node) -> None:
575603
if (
@@ -588,7 +616,27 @@ def _annotate_shared_cluster(self, root_node: Node) -> None:
588616
)
589617
return
590618

591-
shared_nodes, adjacent_qspecs = self._get_shared_clique(root_node)
619+
shared_nodes, adjacent_qspecs, touches_quantized_io = self._get_shared_clique(
620+
root_node
621+
)
622+
623+
# If there is no neighbor qspec to propagate but the cluster sits on the
624+
# quantized I/O boundary (e.g. a state-passthrough ``cat`` whose only
625+
# neighbors are a uint8 model input -- skipped by
626+
# _skip_shared_qspec_from_io -- and an input-state placeholder), initiate
627+
# quantization from the global config rather than leaving the cluster in
628+
# float. Otherwise such clusters fall off the integer delegate onto the
629+
# CPU. Initiating from the global (internal) config keeps uint8 confined
630+
# to the IO boundary.
631+
if (
632+
len(adjacent_qspecs) == 0
633+
and touches_quantized_io
634+
and self.global_config is not None
635+
):
636+
global_input_qspec = self.global_config.get_input_act_qspec()
637+
if global_input_qspec is not None:
638+
adjacent_qspecs = [global_input_qspec]
639+
592640
node_order = {node: index for index, node in enumerate(root_node.graph.nodes)}
593641
ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0))
594642

0 commit comments

Comments
 (0)