Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/cortex_m/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ set(_cortex_m_kernels__srcs
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_minimum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_pad.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_activation.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_avg_pool2d.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_batch_matmul.cpp
Expand Down
131 changes: 131 additions & 0 deletions backends/cortex_m/ops/op_quantized_activation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "cortex_m_ops_common.h"

#include <cstring>

#if defined(__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE & 1)
#include <arm_mve.h>
#define HAS_HELIUM_SIMD 1
#endif

#if defined(ARM_MATH_DSP) && !defined(HAS_HELIUM_SIMD)
#include <arm_acle.h>
#define HAS_DSP_PACKED_LUT 1
#endif

namespace cortex_m {
namespace native {

#if defined(HAS_DSP_PACKED_LUT)
// Local 4-byte read/write helpers. We deliberately don't include
// `arm_nnsupportfunctions.h` for the equivalent CMSIS-NN `arm_nn_read_s8x4_ia`
// / `arm_nn_write_s8x4_ia` -- the header is public but pulls in the entire
// CMSIS-NN support surface (~1500 lines) just for two memcpy wrappers.
// Loads four consecutive bytes from `*in` into one 32-bit word (in memory
// order) and advances `*in` past them. `memcpy` is used so the source does
// not have to be 4-byte aligned.
static inline uint32_t read_u8x4_ia(const int8_t** in) {
  const int8_t* const src = *in;
  uint32_t word = 0;
  std::memcpy(&word, src, sizeof(word));
  *in = src + sizeof(word);
  return word;
}

// Stores the four bytes of `val` (in memory order) at `*out` and advances
// `*out` past them. `memcpy` is used so the destination does not have to be
// 4-byte aligned.
static inline void write_u8x4_ia(int8_t** out, uint32_t val) {
  int8_t* const dst = *out;
  std::memcpy(dst, &val, sizeof(val));
  *out = dst + sizeof(val);
}
#endif

// Applies a quantized elementwise activation through a 256-entry int8 lookup
// table: out[i] = lut[uint8(input[i] + 128)].
//
// context: kernel runtime context; not used by this kernel.
// input:   int8 tensor of any shape.
// lut:     int8 tensor with exactly 256 elements.
// out:     int8 tensor with the same number of elements as `input`.
//
// Returns `out`. Any violated precondition trips ET_CHECK_MSG.
// cppcheck-suppress unusedFunction
Tensor& quantized_activation_out(
    // `context` is never modified here, so cppcheck suggests a const
    // reference. The parameter type is kept as-is on the assumption that it
    // must match the signature expected by the kernel registration.
    // cppcheck-suppress constParameterReference
    KernelRuntimeContext& context,
    const Tensor& input,
    const Tensor& lut,
    Tensor& out) {
  (void)context;

  ET_CHECK_MSG(
      input.scalar_type() == ScalarType::Char,
      "quantized_activation: input must be int8");
  ET_CHECK_MSG(
      out.scalar_type() == ScalarType::Char,
      "quantized_activation: output must be int8");
  ET_CHECK_MSG(
      lut.scalar_type() == ScalarType::Char,
      "quantized_activation: lut must be int8");
  ET_CHECK_MSG(
      lut.numel() == 256,
      "quantized_activation: lut must have 256 entries, got %" PRId64,
      static_cast<int64_t>(lut.numel()));
  ET_CHECK_MSG(
      input.numel() == out.numel(),
      "quantized_activation: input and output must have the same numel");

  const int8_t* in_data = input.const_data_ptr<int8_t>();
  const int8_t* lut_data = lut.const_data_ptr<int8_t>();
  int8_t* out_data = out.mutable_data_ptr<int8_t>();

  // The LUT is precomputed AoT from the input/output qparams and the
  // activation function (sigmoid / tanh / silu / ...), so the kernel does not
  // need to know which activation it is implementing. The signed int8 input
  // is biased by 128 to use it as an unsigned [0, 255] table index.
  const int64_t n = input.numel();
  int64_t i = 0;

#if defined(HAS_HELIUM_SIMD)
  // M55/M85: 16 lanes per iteration. Reinterpret the int8 input as uint8
  // (bit-identical load), add 128 mod 256 to produce a uint8 LUT index, then
  // gather-load the int8 result from the LUT.
  for (; i + 15 < n; i += 16) {
    uint8x16_t in_u8 =
        vldrbq_u8(reinterpret_cast<const uint8_t*>(in_data + i));
    uint8x16_t idx = vaddq_n_u8(in_u8, 128);
    int8x16_t result = vldrbq_gather_offset_s8(lut_data, idx);
    vstrbq_s8(out_data + i, result);
  }
#elif defined(HAS_DSP_PACKED_LUT)
  // M4/M7 (DSP, no MVE): process 4 bytes per iteration. The DSP win comes from
  // (a) folding 4 byte-loads into one word-load, (b) batching the +128 bias
  // with `__uadd8`, and (c) folding 4 byte-stores into one word-store. The
  // LUT lookups themselves still hit memory four times per word -- no DSP
  // gather instruction exists on M-class.
  const int8_t* in_ptr = in_data;
  int8_t* out_ptr = out_data;
  const int64_t word_iters = n >> 2;
  for (int64_t w = 0; w < word_iters; ++w) {
    const uint32_t in_word = read_u8x4_ia(&in_ptr);
    // Per-byte modular add: each of the four lanes becomes (byte + 128) & 0xFF.
    const uint32_t idx_word = __uadd8(in_word, 0x80808080u);
    // Go through uint8_t before widening so negative LUT entries are not
    // sign-extended into the neighbouring byte lanes.
    const uint32_t out_word =
        static_cast<uint32_t>(
            static_cast<uint8_t>(lut_data[idx_word & 0xFFu])) |
        (static_cast<uint32_t>(
             static_cast<uint8_t>(lut_data[(idx_word >> 8) & 0xFFu]))
         << 8) |
        (static_cast<uint32_t>(
             static_cast<uint8_t>(lut_data[(idx_word >> 16) & 0xFFu]))
         << 16) |
        (static_cast<uint32_t>(
             static_cast<uint8_t>(lut_data[(idx_word >> 24) & 0xFFu]))
         << 24);
    write_u8x4_ia(&out_ptr, out_word);
  }
  // Resume the scalar tail right after the last full word.
  i = word_iters << 2;
#endif

  // 4x-unrolled scalar tail. On M-class cores without MVE or DSP the unroll
  // lets the compiler issue independent LUT loads; on the MVE / DSP paths
  // above this only runs for the < 16- (or < 4-) element remainder.
  for (; i + 3 < n; i += 4) {
    out_data[i + 0] = lut_data[static_cast<uint8_t>(in_data[i + 0] + 128)];
    out_data[i + 1] = lut_data[static_cast<uint8_t>(in_data[i + 1] + 128)];
    out_data[i + 2] = lut_data[static_cast<uint8_t>(in_data[i + 2] + 128)];
    out_data[i + 3] = lut_data[static_cast<uint8_t>(in_data[i + 3] + 128)];
  }
  // Final 0-3 leftover elements.
  for (; i < n; ++i) {
    out_data[i] = lut_data[static_cast<uint8_t>(in_data[i] + 128)];
  }

  return out;
}

} // namespace native
} // namespace cortex_m
29 changes: 29 additions & 0 deletions backends/cortex_m/ops/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,35 @@ def quantized_mul_impl(
return result


# ===================================================================
# QUANTIZED ACTIVATION (LUT) OPERATION DEFINITION
# ===================================================================
# Generic table-lookup activation. The 256-entry int8 LUT is precomputed AoT
# from the input/output qparams and the activation function (sigmoid, tanh,
# silu, ...), so the kernel is identical regardless of which activation it
# evaluates: out[i] = lut[input[i] + 128].
# Functional variant: takes the int8 input and the LUT, returns a new tensor.
lib.define("quantized_activation(Tensor input, Tensor lut) -> Tensor")
# Out variant: same inputs plus a keyword-only `out` tensor that the schema
# marks as mutated (`Tensor(a!)`) and aliased by the return value.
lib.define(
"quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!)"
)


@register_fake("cortex_m::quantized_activation")  # type: ignore[misc]
def quantized_activation_meta(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
    """Fake (shape/dtype-only) implementation of `quantized_activation`.

    Asserts that `input` is int8 and that `lut` is an int8 tensor with 256
    elements, then returns an uninitialized tensor shaped like `input`.
    """
    assert input.dtype == torch.int8, "quantized_activation input must be int8"
    lut_is_int8 = lut.dtype == torch.int8
    lut_has_256_entries = lut.numel() == 256
    assert lut_is_int8 and lut_has_256_entries, (
        "quantized_activation lut must be int8 with 256 entries; "
        f"got dtype={lut.dtype}, numel={lut.numel()}"
    )
    result = torch.empty_like(input)
    return result


@impl(lib, "quantized_activation", "CompositeExplicitAutograd")  # type: ignore[misc]
def quantized_activation_impl(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
    """Reference implementation: `out[i] = lut[input[i] + 128]`.

    Args:
        input: int8 tensor of any shape.
        lut: int8 lookup table with 256 elements.

    Returns:
        An int8 tensor with the same shape as `input`.
    """
    # The fake impl and the C++ kernel only require `lut.numel() == 256`, so
    # flatten the table here; indexing a non-1-D LUT along dim 0 would fail.
    flat_lut = lut.reshape(-1)
    # Widen before adding the bias so that int8 values >= 0 do not overflow.
    indices = input.to(torch.int64) + 128
    return flat_lut[indices].to(torch.int8)


# ===================================================================
# QUANTIZED BATCH MATMUL OPERATION DEFINITION
# ===================================================================
Expand Down
6 changes: 6 additions & 0 deletions backends/cortex_m/ops/operators.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@
- arg_meta: null
kernel_name: cortex_m::quantized_mul_out

- func: cortex_m::quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: cortex_m::quantized_activation_out

- func: cortex_m::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
Expand Down
43 changes: 42 additions & 1 deletion backends/cortex_m/passes/convert_to_cortex_m_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor

from executorch.backends.cortex_m.passes.cortex_m_pass import CortexMPass
from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
from executorch.backends.cortex_m.passes.passes_utils import (
build_activation_lut,
quantize_multiplier_aot,
)
from executorch.backends.cortex_m.passes.scratch_buffer_sizes import (
required_cmsis_nn_buffer_sizes,
)
Expand Down Expand Up @@ -483,6 +486,38 @@ def _get_bmm_replacement(self, node):
)
return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args

def _get_activation_replacement(self, node):
    """Build the replacement for a standalone quantized sigmoid / tanh / silu.

    Reads the node's first input and output qparams, builds the 256-entry
    int8 LUT for `node.target` ahead of time, and registers it in the graph
    as a constant parameter placeholder named `<node.name>_lut`.

    Returns:
        A `(op, args)` pair: the `cortex_m.quantized_activation` edge op and
        the argument tuple `(original input, LUT placeholder node)`.
    """
    in_qparams = node.meta["input_qparams"][0]
    out_qparams = node.meta["output_qparams"][0]
    lut_tensor = build_activation_lut(
        node.target,
        float(in_qparams.scale),
        int(in_qparams.zp),
        float(out_qparams.scale),
        int(out_qparams.zp),
    )

    # Constant placeholders must appear before user-input placeholders, so
    # the LUT is inserted ahead of the first placeholder already in the graph.
    graph = node.graph
    anchor = next(
        candidate for candidate in graph.nodes if candidate.op == "placeholder"
    )
    with graph.inserting_before(anchor):
        lut_placeholder = create_constant_placeholder(
            self.exported_program,
            graph,
            node.name + "_lut",
            InputKind.PARAMETER,
            lut_tensor,
        )

    replacement_args = (node.args[0], lut_placeholder)
    return exir_ops.edge.cortex_m.quantized_activation.default, replacement_args

def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
modified = False
for node in graph_module.graph.nodes:
Expand All @@ -506,6 +541,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
op, args = self._get_convolution_replacement(node)
case exir_ops.edge.aten.bmm.default:
op, args = self._get_bmm_replacement(node)
case (
exir_ops.edge.aten.sigmoid.default
| exir_ops.edge.aten.tanh.default
| exir_ops.edge.aten.silu.default
):
op, args = self._get_activation_replacement(node)
case _:
continue

Expand Down
61 changes: 61 additions & 0 deletions backends/cortex_m/passes/passes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,67 @@ def is_qualified_int8_node(args) -> bool:
return False


def _stable_sigmoid(x: float) -> float:
# Always exponentiate the non-positive value so `math.exp` never overflows
# for unusually large `|x|` (e.g. wide-range input qparams). Algebraically
# identical to `1 / (1 + exp(-x))`.
if x >= 0:
return 1.0 / (1.0 + math.exp(-x))
e = math.exp(x)
return e / (1.0 + e)


def _stable_silu(x: float) -> float:
    """SiLU: `x` scaled by its own overflow-safe sigmoid."""
    gate = _stable_sigmoid(x)
    return x * gate


# Maps each supported edge-dialect activation op to the scalar float function
# that `build_activation_lut` evaluates once per LUT entry.
_ACTIVATION_FNS = {
    exir_ops.edge.aten.sigmoid.default: _stable_sigmoid,
    exir_ops.edge.aten.tanh.default: math.tanh,
    exir_ops.edge.aten.silu.default: _stable_silu,
}


def _round_half_away_from_zero(x: float) -> int:
# Matches the rounding convention `requantize_cmsis` (above) applies after
# the right-shift step: ties on positive values round toward +∞, ties on
# negative values round toward -∞. Python's built-in `round` would use
# banker's rounding instead and disagree at exact half-integers.
return int(math.copysign(math.floor(abs(x) + 0.5), x)) if x != 0 else 0


def build_activation_lut(
    target,
    input_scale: float,
    input_zp: int,
    output_scale: float,
    output_zp: int,
) -> torch.Tensor:
    """Precompute the 256-entry int8 lookup table for a quantized activation.

    Entry `q_in + 128` holds the quantized output for int8 input `q_in`: the
    input is dequantized with `(q_in - input_zp) * input_scale`, passed
    through the float activation, requantized with
    `y / output_scale + output_zp`, rounded half away from zero and clamped
    to [-128, 127]. At runtime the kernel therefore only needs one table
    lookup per element and no requantization math.

    Args:
        target: the edge-dialect op being lowered (e.g.
            `exir_ops.edge.aten.sigmoid.default`); must be a key of
            `_ACTIVATION_FNS`.
        input_scale: scale of the input quantization.
        input_zp: zero point of the input quantization.
        output_scale: scale of the output quantization.
        output_zp: zero point of the output quantization.

    Returns:
        A 1-D int8 tensor with 256 entries.

    Raises:
        ValueError: if `target` is not a supported activation.
    """
    activation = _ACTIVATION_FNS.get(target)
    if activation is None:
        raise ValueError(
            f"build_activation_lut: unsupported activation target {target!r} "
            f"(supported: {sorted(t.__name__ for t in _ACTIVATION_FNS)})"
        )

    entries = []
    # Ascending int8 order, so list position k corresponds to q_in = k - 128.
    for q_in in range(-128, 128):
        real_in = (q_in - input_zp) * input_scale
        real_out = activation(real_in)
        q_out = _round_half_away_from_zero(real_out / output_scale + output_zp)
        entries.append(min(127, max(-128, q_out)))
    return torch.tensor(entries, dtype=torch.int8)


def quantize_multiplier_aot(scale: float) -> tuple[int, int]:
if scale == 0.0:
return 0, 0
Expand Down
19 changes: 19 additions & 0 deletions backends/cortex_m/quantizer/pattern_checkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,25 @@ def check_quantization_config(
return is_int8


class CortexMActivationCheck(PatternCheck):
    """Pattern check for standalone elementwise activations (sigmoid / tanh /
    silu) lowered to the LUT-based cortex_m.quantized_activation op.

    The kernel is shape-agnostic and its LUT is derived AoT from per-tensor
    qparams, so the only requirement is int8 per-tensor quantization of the
    input and output activations.
    """

    @classmethod
    def check_quantization_config(
        cls, pattern: list[Node], quantization_config: QuantizationConfig
    ) -> bool:
        """Return whether `quantization_config` is int8 with per-tensor input
        and output activation qspecs. `pattern` is not inspected.
        """
        int8_ok = cls.is_int8_activations(quantization_config)

        # The output qspec is only examined when the input one is per-tensor.
        per_tensor_ok = cls.is_per_tensor(quantization_config.get_input_act_qspec())
        if per_tensor_ok:
            per_tensor_ok = cls.is_per_tensor(
                quantization_config.get_output_act_qspec()
            )

        if not int8_ok:
            return int8_ok
        return per_tensor_ok


class CortexMSoftmaxCheck(PatternCheck):

@classmethod
Expand Down
8 changes: 8 additions & 0 deletions backends/cortex_m/quantizer/quantizer_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import torch
from executorch.backends.cortex_m.quantizer.pattern_checkers import (
CortexMActivationCheck,
CortexMAddMulCheck,
CortexMAvgPool2DCheck,
CortexMBmmCheck,
Expand Down Expand Up @@ -119,6 +120,12 @@
(torch.ops.aten.softmax.int,): CortexMSoftmaxCheck,
}

# Single-op patterns for the activations handled by the LUT-based
# quantized_activation op; all of them share the same checker.
ACTIVATION_OP_PATTERNS = {
    (activation_op,): CortexMActivationCheck
    for activation_op in (
        torch.ops.aten.sigmoid.default,
        torch.ops.aten.tanh.default,
        torch.ops.aten.silu.default,
    )
}

POOL_OP_PATTERNS = {
(torch.ops.aten.avg_pool2d.default,): CortexMAvgPool2DCheck,
(torch.ops.aten.max_pool2d.default,): CortexMMaxPool2DCheck,
Expand Down Expand Up @@ -161,4 +168,5 @@
| CONV_TRANSPOSE_OP_PATTERNS
| POOL_OP_PATTERNS
| BMM_OP_PATTERNS
| ACTIVATION_OP_PATTERNS
)
Loading
Loading