Cortex-M backend: address review feedback on quantized_activation

rascani · claude · rascani · commit 8cc5dcf18ad6 · 2026-05-29T14:08:49.000-07:00
Adrian's three review comments on #19792, plus SIMD acceleration of the LUT lookup (his comment asked for vector intrinsics and loop unrolling): * Drop the target -> string indirection in the activation lowering. `passes_utils._ACTIVATION_FNS` now keys directly on the edge op target (`exir_ops.edge.aten.{sigmoid,tanh,silu}.default`), and `ConvertToCortexMPass._get_activation_replacement` passes `node.target` straight into `build_activation_lut` -- no `_ACTIVATION_KINDS` dict and no string round-trip. * Replace the scalar LUT-lookup loop with three compile-gated paths: - M55/M85 (MVE): 16 lanes per iteration -- `vldrbq_u8` load, `vaddq_n_u8` to bias by 128, `vldrbq_gather_offset_s8` to gather the LUT result, `vstrbq_s8` to store. - M4/M7 (DSP, no MVE): 4 bytes per iteration -- fold four byte-loads into one word-load, batch the +128 bias with `__uadd8`, four LUT lookups (no M-class gather instruction exists), fold four byte-stores into one word-store. Uses `<arm_acle.h>` and local memcpy helpers rather than pulling in the heavyweight `arm_nnsupportfunctions.h`. - All other cores (M0+/M3): a 4x-unrolled scalar tail, which also handles the sub-vector remainder of the two SIMD paths. * Switch the source header to Meta's standard copyright block to match the other cortex_m op files. The three paths were cross-compiled for cortex-m0plus / m4 / m7 / m55; the M4 build emits `uadd8` and the M55 build emits the MVE gather. Runtime correctness on M4/M7 hardware/FVP is not yet exercised by CI -- the host unit tests cover the scalar path only. Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/backends/cortex_m/ops/op_quantized_activation.cpp b/backends/cortex_m/ops/op_quantized_activation.cpp
@@ -1,15 +1,46 @@
 /*
- * Copyright 2026 Arm Limited and/or its affiliates.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
 
 #include "cortex_m_ops_common.h"
 
+#include <cstring>
+
+#if defined(__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE & 1)
+#include <arm_mve.h>
+#define HAS_HELIUM_SIMD 1
+#endif
+
+#if defined(ARM_MATH_DSP) && !defined(HAS_HELIUM_SIMD)
+#include <arm_acle.h>
+#define HAS_DSP_PACKED_LUT 1
+#endif
+
 namespace cortex_m {
 namespace native {
 
+#if defined(HAS_DSP_PACKED_LUT)
+// Local 4-byte load/store helpers; `_ia` = increment-after (each call also
+// advances the caller's pointer by 4). We deliberately skip the public
+// `arm_nnsupportfunctions.h` and its `arm_nn_read_s8x4_ia` /
+// `arm_nn_write_s8x4_ia`: ~1500 lines of CMSIS-NN for two memcpy wrappers.
+static inline uint32_t read_u8x4_ia(const int8_t** in) {
+  uint32_t val;
+  std::memcpy(&val, *in, 4);
+  *in += 4;
+  return val;
+}
+
+static inline void write_u8x4_ia(int8_t** out, uint32_t val) {
+  std::memcpy(*out, &val, 4);
+  *out += 4;
+}
+#endif
+
 // cppcheck-suppress unusedFunction
 Tensor& quantized_activation_out(
     KernelRuntimeContext& context,
@@ -37,12 +68,59 @@ Tensor& quantized_activation_out(
   const int8_t* lut_data = lut.const_data_ptr<int8_t>();
   int8_t* out_data = out.mutable_data_ptr<int8_t>();
 
-  // Bias the signed int8 input by 128 to use it as an unsigned table index;
-  // the LUT entries are precomputed AoT from the input/output qparams and the
+  // The LUT is precomputed AoT from the input/output qparams and the
   // activation function (sigmoid / tanh / silu / ...), so the kernel does not
-  // need to know which activation it is implementing.
+  // need to know which activation it is implementing. The signed int8 input
+  // is biased by 128 to use it as an unsigned [0, 255] table index.
   const int64_t n = input.numel();
-  for (int64_t i = 0; i < n; ++i) {
+  int64_t i = 0;
+
+#if defined(HAS_HELIUM_SIMD)
+  // M55/M85: 16 lanes per iteration. Reinterpret the int8 input as uint8
+  // (bit-identical load), add 128 mod 256 to produce a uint8 LUT index, then
+  // gather-load the int8 result from the LUT.
+  for (; i + 15 < n; i += 16) {
+    uint8x16_t in_u8 =
+        vldrbq_u8(reinterpret_cast<const uint8_t*>(in_data + i));
+    uint8x16_t idx = vaddq_n_u8(in_u8, 128);
+    int8x16_t result = vldrbq_gather_offset_s8(lut_data, idx);
+    vstrbq_s8(out_data + i, result);
+  }
+#elif defined(HAS_DSP_PACKED_LUT)
+  // M4/M7 (DSP, no MVE): process 4 bytes per iteration. The DSP win comes from
+  // (a) folding 4 byte-loads into one word-load, (b) batching the +128 bias
+  // with `__uadd8`, and (c) folding 4 byte-stores into one word-store. The
+  // LUT lookups themselves still hit memory four times per word -- no DSP
+  // gather instruction exists on M-class.
+  const int8_t* in_ptr = in_data;
+  int8_t* out_ptr = out_data;
+  const int64_t word_iters = n >> 2;
+  for (int64_t w = 0; w < word_iters; ++w) {
+    const uint32_t in_word = read_u8x4_ia(&in_ptr);
+    const uint32_t idx_word = __uadd8(in_word, 0x80808080u);
+    const uint32_t out_word =
+        static_cast<uint32_t>(static_cast<uint8_t>(lut_data[idx_word & 0xFFu])) |
+        (static_cast<uint32_t>(static_cast<uint8_t>(lut_data[(idx_word >> 8) & 0xFFu]))
+         << 8) |
+        (static_cast<uint32_t>(static_cast<uint8_t>(lut_data[(idx_word >> 16) & 0xFFu]))
+         << 16) |
+        (static_cast<uint32_t>(static_cast<uint8_t>(lut_data[(idx_word >> 24) & 0xFFu]))
+         << 24);
+    write_u8x4_ia(&out_ptr, out_word);
+  }
+  i = word_iters << 2;
+#endif
+
+  // 4x-unrolled scalar loop: with no MVE/DSP the unroll lets the compiler
+  // issue independent LUT loads; after the MVE path it covers the < 16
+  // leftovers; after the DSP path < 4 remain, so only the loop below runs.
+  for (; i + 3 < n; i += 4) {
+    out_data[i + 0] = lut_data[static_cast<uint8_t>(in_data[i + 0] + 128)];
+    out_data[i + 1] = lut_data[static_cast<uint8_t>(in_data[i + 1] + 128)];
+    out_data[i + 2] = lut_data[static_cast<uint8_t>(in_data[i + 2] + 128)];
+    out_data[i + 3] = lut_data[static_cast<uint8_t>(in_data[i + 3] + 128)];
+  }
+  for (; i < n; ++i) {
     out_data[i] = lut_data[static_cast<uint8_t>(in_data[i] + 128)];
   }
 
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -486,12 +486,6 @@ def _get_bmm_replacement(self, node):
         )
         return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args
 
-    _ACTIVATION_KINDS = {
-        exir_ops.edge.aten.sigmoid.default: "sigmoid",
-        exir_ops.edge.aten.tanh.default: "tanh",
-        exir_ops.edge.aten.silu.default: "silu",
-    }
-
     def _get_activation_replacement(self, node):
         """Lower a standalone quantized sigmoid / tanh / silu to a single
         cortex_m.quantized_activation call backed by an AoT-built 256-entry
@@ -500,9 +494,8 @@ def _get_activation_replacement(self, node):
         """
         input_qparams = node.meta["input_qparams"][0]
         output_qparams = node.meta["output_qparams"][0]
-        kind = self._ACTIVATION_KINDS[node.target]
         lut_tensor = build_activation_lut(
-            kind,
+            node.target,
             float(input_qparams.scale),
             int(input_qparams.zp),
             float(output_qparams.scale),
diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py
@@ -205,9 +205,9 @@ def _stable_silu(x: float) -> float:
 
 
 _ACTIVATION_FNS = {
-    "sigmoid": _stable_sigmoid,
-    "tanh": math.tanh,
-    "silu": _stable_silu,
+    exir_ops.edge.aten.sigmoid.default: _stable_sigmoid,
+    exir_ops.edge.aten.tanh.default: math.tanh,
+    exir_ops.edge.aten.silu.default: _stable_silu,
 }
 
 
@@ -220,25 +220,28 @@ def _round_half_away_from_zero(x: float) -> int:
 
 
 def build_activation_lut(
-    kind: str,
+    target,
     input_scale: float,
     input_zp: int,
     output_scale: float,
     output_zp: int,
 ) -> torch.Tensor:
     """AoT-compute a 256-entry int8 lookup table for a quantized activation.
 
+    `target` is the edge-dialect op being lowered (e.g.
+    `exir_ops.edge.aten.sigmoid.default`).
+
     The LUT is indexed by the input byte value biased by 128: for any int8
     input `q_in`, the kernel reads `lut[q_in + 128]` to get the int8 output.
     Because the LUT is computed in float and quantized once per entry, the
     runtime kernel is a single memory-lookup with no requantization math.
     """
-    if kind not in _ACTIVATION_FNS:
+    if target not in _ACTIVATION_FNS:
         raise ValueError(
-            f"build_activation_lut: unknown activation '{kind}' "
-            f"(supported: {sorted(_ACTIVATION_FNS)})"
+            f"build_activation_lut: unsupported activation target {target!r} "
+            f"(supported: {sorted(t.__name__ for t in _ACTIVATION_FNS)})"
         )
-    f = _ACTIVATION_FNS[kind]
+    f = _ACTIVATION_FNS[target]
     lut = torch.empty(256, dtype=torch.int8)
     for q in range(-128, 128):
         x = (q - input_zp) * input_scale