diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml index 429e9c202..a04229afe 100644 --- a/.github/workflows/ci-deeploy.yml +++ b/.github/workflows/ci-deeploy.yml @@ -61,9 +61,9 @@ jobs: run: | cd DeeployTest python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=75000 --memAllocStrategy=MiniMalloc - python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=60000 --memAllocStrategy=MiniMalloc --shouldFail + python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=50000 --memAllocStrategy=MiniMalloc --shouldFail python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=90000 --memAllocStrategy=TetrisRandom - python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=75000 --memAllocStrategy=TetrisRandom --shouldFail + python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=69000 --memAllocStrategy=TetrisRandom --shouldFail deeploy-state-serialization: needs: select-env diff --git a/CHANGELOG.md b/CHANGELOG.md index a567305e2..6b6ee83f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -177,9 +177,9 @@ This release containing major architectural changes, new platform support, enhan ### Added -- BatchNorm kernel -- ConvTranspose kernel -- MaxPool1D kernel +- BatchNorm kernel +- ConvTranspose kernel +- MaxPool1D kernel - Template for 1D Convolution - Support for float32 data type in the previous kernels - Float binding for Pad1D kernel @@ -318,7 +318,7 @@ This release containing major architectural changes, new platform support, enhan ### Changed - FloatConvTemplate file -- Platform.py file +- Platform.py file - Bump the CMake version to 3.24 as required for the chimera-sdk - Bump GVSoC's version and add chimera simulation target - Rename the generic source util to utils to avoid name collision with chimera-sdk diff --git a/Deeploy/AbstractDataTypes.py b/Deeploy/AbstractDataTypes.py index feeebe939..0e8d4a071 100644 --- a/Deeploy/AbstractDataTypes.py +++ b/Deeploy/AbstractDataTypes.py @@ -206,12 +206,20 @@ def checkValue(cls, value: Union[int, Iterable[int], np.ndarray], ctxt: Optional if isinstance(value, int): _max, _min = (value, value) + elif isinstance(value, np.number): + value = value.item() + if isinstance(value, float): + assert value.is_integer(), f"Floating-point value {value} is not an integer." + value = int(value) + _max, _min = (value, value) elif isinstance(value, np.ndarray): _max = value.max() _min = value.min() elif isinstance(value, Iterable): _max = max(value) _min = min(value) + else: + raise ValueError(f"Unsupported value of type {type(value)} with value {value}") if _max > cls.typeMax: return False diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index 4f6dba382..c05ea3b9d 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -87,11 +87,11 @@ class float64_t(FloatImmediate): SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (int8_t, int16_t, int32_t, int64_t) UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t) -IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (sorted(( - *SignedIntegerDataTypes, - *UnsignedIntegerDataTypes, -), - key = lambda _type: _type.typeWidth)) +IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] 
= tuple( + sorted(( + *SignedIntegerDataTypes, + *UnsignedIntegerDataTypes, + ), key = lambda _type: _type.typeWidth)) FloatDataTypes: Tuple[Type[FloatImmediate], ...] = (bfloat16_t, float16_t, float32_t, float64_t) diff --git a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py index f07fe57c9..476128b3d 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py @@ -6,7 +6,7 @@ import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkContext, NetworkDeployer, ONNXLayer, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkDeployer, ONNXLayer, _NoVerbosity class NetworkDeployerWrapper(NetworkDeployer): @@ -48,8 +48,8 @@ def prepared(self): """ # SignPropDeployer augment - def _createIOBindings(self, ctxt: NetworkContext, graph: gs.Graph): - return self._innerObject._createIOBindings(ctxt, graph) + def parse(self, default_channels_first: bool = True) -> bool: + return self._innerObject.parse(default_channels_first) # MemoryAwareDeployer, TilerAwareDeployer, and PULPDeployer augments def bind(self) -> bool: diff --git a/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py b/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py index 7a9fbea1a..6f3498db6 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py @@ -6,8 +6,10 @@ import onnx_graphsurgeon as gs -from Deeploy.AbstractDataTypes import Pointer -from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer +from Deeploy.AbstractDataTypes import IntegerImmediate, Pointer +from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NetworkDeployer, OperatorDescriptor, \ + TopologyOptimizer, VariableBuffer from Deeploy.Logging import DEFAULT_LOGGER as log @@ -18,12 +20,13 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda x: x, name: str = 'DeeployNetwork', default_channels_first: bool = True, deeployStateDir: str = "DeeployState", inputOffsets: Dict[str, int] = {}): - super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, operatorDescriptors, scheduler, name, default_channels_first, deeployStateDir) if inputOffsets == {}: @@ -32,17 +35,6 @@ def __init__(self, self.inputOffsets = inputOffsets - def _createIOBindings(self, ctxt, graph): - ctxt = super()._createIOBindings(ctxt, graph) - for node in graph.inputs: - data_name = node.name - nb = ctxt.lookup(data_name) - data_type = self.inputTypes[data_name] - nb._signed = (self.inputOffsets[data_name] == 0) - nb.nLevels = (2**data_type.referencedType.typeWidth) - - return ctxt - def _printInputOutputSummary(self): log.info('Input:') for buf in self.inputs(): @@ -55,3 +47,39 @@ def _printInputOutputSummary(self): log.info( f" - '{buf.name}': Type: {buf._type.referencedType.typeName}, nLevels: {buf.nLevels}, Signed: {buf._signed}" ) + + def parse(self, default_channels_first: bool = True) -> bool: + parsable = super().parse(default_channels_first) + if not 
parsable: + return False + + # Annotate global buffers + for obj in self.ctxt.globalObjects.values(): + assert isinstance(obj, VariableBuffer) + refTy = obj._type.referencedType + if isinstance(obj, ConstantBuffer): + assert refTy.checkPromotion(obj.values), f"Can't cast {obj} to {refTy}" + if issubclass(refTy, IntegerImmediate): + obj.nLevels = obj.values.max() - obj.values.min() + obj._signed = refTy.typeMin < 0 + elif obj.name in self.inputOffsets: + obj._signed = (self.inputOffsets[obj.name] == 0) + obj.nLevels = (2**refTy.typeWidth) + + # Annotate rest + for layer in self.layerBinding.values(): + node = layer.node + opRepr = layer.mapper.parser.operatorRepresentation + typeChecker = layer.mapper.binder.typeChecker + outTy = self.ctxt.lookup(node.outputs[0].name)._type.referencedType + if issubclass(outTy, IntegerImmediate) and isinstance(typeChecker, SignPropTypeChecker): + inputs = [self.ctxt.lookup(t.name) for t in node.inputs] + outputNLevels = typeChecker._inferNumLevels(inputs, opRepr) + outputSigned = typeChecker._inferSignedness(inputs, opRepr) + + outputs = [self.ctxt.lookup(t.name) for t in node.outputs] + for buffer, nLevels, signed in zip(outputs, outputNLevels, outputSigned): + buffer.nLevels = nLevels + buffer._signed = signed + + return True diff --git a/Deeploy/CommonExtensions/NodeTemplate.py b/Deeploy/CommonExtensions/NodeTemplate.py new file mode 100644 index 000000000..a94619f7a --- /dev/null +++ b/Deeploy/CommonExtensions/NodeTemplate.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Sequence, Tuple + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import NodeTemplate + + +class ElementwiseTemplate(NodeTemplate): + + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + assert len(node.outputs) == 1, f"Expected only one output. Received {len(node.outputs)}" + shape = tuple(np.broadcast_shapes(*[t.shape for t in node.inputs])) + return [shape] * len(node.inputs), [shape] + + +class ElementwiseScalarTemplate(NodeTemplate): + + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + assert len(node.inputs) == 2, f"Expected only two inputs. Received {len(node.inputs)}" + assert len(node.outputs) == 1, f"Expected only one output. Received {len(node.outputs)}" + shape = tuple(node.inputs[0].shape) + return [shape, (1,)], [shape] + + +class RequantShiftTemplate(NodeTemplate): + + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + inShapes, outShapes = [t.shape for t in node.inputs], [t.shape for t in node.outputs] + batch, ch = inShapes[0][:2] + # TODO: Copied from old computeShape. 
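The alignShapes overrides in this new file return minimal per-input shapes; actual tensor shapes only need to be broadcastable to them (this is checked later by _tensorShapesBroadcastable in DeeployTypes). A small numpy sketch of that contract, with purely illustrative shapes:

import numpy as np

# Elementwise op: every input is aligned to the common broadcast shape.
inShapes = [(1, 8, 16, 16), (8, 1, 1)]
assert np.broadcast_shapes(*inShapes) == (1, 8, 16, 16)

# Requant-shift style op: per-channel parameters get a minimal (batch, ch, 1) shape
# and the real tensors only have to be broadcastable to it, not equal.
minShape = (1, 8, 1)
assert np.broadcast_shapes((8, 1), minShape) == (1, 8, 1)   # per-channel parameter
assert np.broadcast_shapes((1,), minShape) == (1, 8, 1)     # per-layer parameter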
Should probably be investigated + inShapes[1] = (batch, ch, *inShapes[1][1:]) + inShapes[2] = (batch, ch, *inShapes[2][1:]) + return inShapes, outShapes + + +class ConvTemplate(NodeTemplate): + + @staticmethod + def minPerChannelTensorShape(node: gs.Node, channels: int) -> Tuple[int, ...]: + spatialDims = len(node.attrs["kernel_shape"]) + if node.attrs["channels_first"]: + return (channels,) + (1,) * (spatialDims) + else: + return (channels,) + + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + inShapes, outShapes = [t.shape for t in node.inputs], [t.shape for t in node.outputs] + if len(node.inputs) == 3: + minBiasShape = self.minPerChannelTensorShape(node, inShapes[1][0]) + inShapes[2] = minBiasShape + return inShapes, outShapes + + +class RequantizedConvTemplate(ConvTemplate): + + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + inShapes, outShapes = [t.shape for t in node.inputs[:2]], [t.shape for t in node.outputs] + minRqsShape = self.minPerChannelTensorShape(node, inShapes[1][0]) + rqsShapes = [minRqsShape] * len(node.inputs[2:]) + return inShapes + rqsShapes, outShapes + + +class GemmTemplate(NodeTemplate): + + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + biasShape = node.outputs[0].shape[-2:] + return [node.inputs[0].shape, node.inputs[1].shape, biasShape], [node.outputs[0].shape] + + +class RequantizedGemmTemplate(NodeTemplate): + + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + inShapes, outShapes = [t.shape for t in node.inputs[:2]], [t.shape for t in node.outputs] + if node.attrs["transB"]: + N = inShapes[1][-2] + else: + N = inShapes[1][-1] + rqsShapes = [(N,)] * len(node.inputs[2:]) + return inShapes + rqsShapes, outShapes diff --git a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py index c70628729..2e7eafa66 100644 --- a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py +++ b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py @@ -2,7 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from abc import ABC, abstractmethod +from typing import List import onnx_graphsurgeon as gs @@ -11,27 +12,30 @@ from Deeploy.Logging import DEFAULT_LOGGER as log -class SignPropTypeChecker(NodeTypeChecker): +class SignPropTypeChecker(NodeTypeChecker, ABC): + @abstractmethod def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - return None + operatorRepresentation: OperatorRepresentation) -> List[int]: + pass + @abstractmethod def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> Optional[List[int]]: - return None + operatorRepresentation: OperatorRepresentation) -> List[bool]: + pass def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: ctxt = super().typeInferGlobalCtxt(ctxt, node) - for inputNode, _type in zip(node.inputs, self.input_types): - if isinstance(ctxt.lookup(inputNode.name), ConstantBuffer): - reference = ctxt.lookup(inputNode.name) - if not _type.referencedType.checkPromotion(reference.values): - raise Exception(f"Can't cast {reference} to {_type}!") - - reference.nLevels = reference.values.max() - reference.values.min() - reference._signed = _type.referencedType.typeMin < 0 + for tensor, _type in zip(node.inputs, 
self.input_types): + buffer = ctxt.lookup(tensor.name) + if isinstance(buffer, ConstantBuffer): + refTy = _type.referencedType + assert issubclass(refTy, IntegerImmediate) + if not refTy.checkPromotion(buffer.values): + raise ValueError(f"Can't cast {buffer} to {refTy}!") + buffer.nLevels = buffer.values.max() - buffer.values.min() + buffer._signed = refTy.typeMin < 0 return ctxt @@ -42,21 +46,16 @@ def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, inputs = [ctxt.lookup(inputNode.name) for inputNode in node.inputs] outputs = [ctxt.lookup(outputNode.name) for outputNode in node.outputs] - signProp = all([hasattr(_input, "_signed") and hasattr(_input, "nLevels") for _input in inputs]) - - if signProp: - nLevels = self._inferNumLevels(inputs, operatorRepresentation) - signedness = self._inferSignedness(inputs, operatorRepresentation) - - if nLevels is None or signedness is None: - return ctxt - for obj, nLevel, sign in zip(outputs, nLevels, signedness): - obj.nLevels = nLevel - obj._signed = sign - - if issubclass(obj._type.referencedType, IntegerImmediate) and not obj._type.fitsNumLevels(nLevel): - log.warning( - f"{obj.name} has {nLevel} levels, but {obj._type.referencedType.typeName} only supports {obj._type.referencedType.nLevels} levels." - ) + nLevels = self._inferNumLevels(inputs, operatorRepresentation) + signedness = self._inferSignedness(inputs, operatorRepresentation) + + for obj, nLevels, sign in zip(outputs, nLevels, signedness): + assert isinstance(obj, VariableBuffer) + obj.nLevels = nLevels + obj._signed = sign + refTy = obj._type.referencedType + if issubclass(refTy, IntegerImmediate) and not refTy.fitsNumLevels(nLevels): + log.warning( + f"{obj.name} has {nLevels} levels, but {refTy.typeName} only supports {refTy.nLevels} levels.") return ctxt diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 8c2f5d248..576a29970 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -5,6 +5,7 @@ from __future__ import annotations import copy +import itertools import math import os import pickle @@ -104,6 +105,27 @@ def __init__(self, templateStr: str): Tuple[NetworkContext, OperatorRepresentation]]]] = {} self.subTemplateGenerators = {} + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + return [t.shape for t in node.inputs], [t.shape for t in node.outputs] + + def _alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + _in, out = self.alignShapes(node) + for tensor, shape in zip(node.inputs + node.outputs, _in + out): + assert shape is not None, f"Aligned shape for tensor {tensor.name} is None" + return _in, out + + def _tensorShapesBroadcastable(self, node: gs.Node) -> bool: + minShapesIn, minShapesOut = self._alignShapes(node) + for tensor, minShape in zip(node.inputs, minShapesIn, strict = True): + try: + np.broadcast_shapes(tensor.shape, minShape) + except ValueError: + return False + for tensor, minShape in zip(node.outputs, minShapesOut, strict = True): + if not all(dim == other for dim, other in zip(tensor.shape, minShape)): + return False + return True + def internalSize(self) -> int: """Return the byte size of internal memory buffers used by this template @@ -251,8 +273,8 @@ def __init__(self, name: str = '', shape = [1], aliases: Optional[List[str]] = N self._live: bool = False #: bool: DO NOT OVERRIDE - this variable is true if a previous Memory allocation pass has allocated the buffer, and false if this buffer has been deallocated or has not been allocated 
yet. self._deploy: bool = True #: bool: MAY OVERRIDE - this variable is a global switch to deactivate the buffer for all purposes without deleting it outright. - self._signed = None - self.nLevels = None + self._signed: bool = None + self.nLevels: int = None self.is_input: bool = False self.is_output: bool = False @@ -1009,9 +1031,10 @@ def annotateType(self, name: str, _type: Type[Pointer]): VariableBuffer with """ - obj = self.lookup(name) - obj._type = _type - obj._instance = _type(name, ctxt = self) + buffer = self.lookup(name) + assert isinstance(buffer, VariableBuffer) + buffer._type = _type + buffer._instance = _type(name, ctxt = self) def copy(self) -> NetworkContext: """Return a shallow copy of this NetworkContext @@ -1020,6 +1043,153 @@ def copy(self) -> NetworkContext: return copy.copy(self) +class IoDesc: + + def __init__(self, required: Union[str, List[str]], optional: Optional[Union[str, List[str]]] = None) -> None: + if isinstance(required, str): + required = [required] + self.required = required + optional = optional if optional is not None else [] + if isinstance(optional, str): + optional = [optional] + self.optional = optional + + def symbolicName(self, idx: int) -> str: + return (self.required + self.optional)[idx] + + def checkTensors(self, tensors: Sequence[gs.Tensor]) -> bool: + return len(tensors) >= len(self.required) and \ + len(tensors) <= len(self.required) + len(self.optional) + + +class VariadicIoDesc(IoDesc): + + def __init__(self, baseName: str, minNumTensors: int = 0) -> None: + self.baseName = baseName + self.minNumTensors = minNumTensors + + def symbolicName(self, idx: int) -> str: + return f"{self.baseName}_{idx}" + + def checkTensors(self, tensors: Sequence[gs.Tensor]) -> bool: + return len(tensors) >= self.minNumTensors + + +@dataclass +class AttrDesc: + name: str + unpacker: Callable[[Any], Any] + default: Optional[Union[Any, Callable[[gs.Node], Any]]] = None + + @staticmethod + def _constUnpack(value: Any) -> Any: + if isinstance(value, gs.Constant): + return value.values.tolist() + elif isinstance(value, np.ndarray): + return value.tolist() + # LMACAN: hacky way to detect a 0-dim numpy array + elif hasattr(value, "ndim") and value.ndim == 0 and hasattr(value, "item"): + return value.item() + else: + return value + + def unpack(self, value: Any) -> Union[int, float, List[int], List[float]]: + return self.unpacker(self._constUnpack(value)) + + def getDefault(self, node: gs.Node) -> Any: + if callable(self.default): + return self.default(node) + else: + return self.default + + +@dataclass +class OperatorDescriptor: + inputDescriptor: IoDesc + outputDescriptor: IoDesc + attrDescriptors: List[AttrDesc] + + def check(self, node: gs.Node) -> bool: + """This method checks whether the node is valid. 
+ + Parameters + ---------- + node : gs.Node + Graphsurgeon node to be validated + + Returns + ------- + bool : node validity + + """ + valid = True + + if not self.inputDescriptor.checkTensors(node.inputs): + log.error(f"[OP {node.op}] Invalid input tensors: {[t.name for t in node.inputs]}") + valid = False + + if not self.outputDescriptor.checkTensors(node.outputs): + log.error(f"[OP {node.op}] Invalid output tensors: {[t.name for t in node.outputs]}") + valid = False + + for attrDesc in self.attrDescriptors: + if attrDesc.default is None and not attrDesc.name in node.attrs: + log.error(f"[OP {node.op}] Missing attribute {attrDesc.name}") + valid = False + + return valid + + def canonicalize(self, node: gs.Node, opset: int) -> bool: + _ = opset + for desc in self.attrDescriptors: + if desc.default is None: + value = node.attrs[desc.name] + else: + value = node.attrs.get(desc.name, desc.getDefault(node)) + try: + node.attrs[desc.name] = desc.unpack(value) + except Exception as e: + raise ValueError(f"[OP {node.op}] Error unpacking the attribute {desc.name}. {e}") from e + return True + + def parseTensors(self, ctxt: NetworkContext, tensors: Sequence[gs.Tensor], + ioDesc: IoDesc) -> OperatorRepresentation: + opRepr = {} + for i, tensor in enumerate(tensors): + symName = ioDesc.symbolicName(i) + buffer = ctxt.lookup(tensor.name) + assert isinstance(buffer, VariableBuffer) + opRepr[symName] = buffer.name + opRepr[f"{symName}_shape"] = buffer.shape + opRepr[f"{symName}_size"] = math.prod(buffer.shape) + opRepr[f"{symName}_type"] = buffer._type + return opRepr + + def parseAttrs(self, node: gs.Node) -> OperatorRepresentation: + return node.attrs.copy() + + def parse(self, ctxt: NetworkContext, node: gs.Node) -> OperatorRepresentation: + opReprs = { + "input tensors": self.parseTensors(ctxt, node.inputs, self.inputDescriptor), + "output tensors": self.parseTensors(ctxt, node.outputs, self.outputDescriptor), + "attributes": self.parseAttrs(node), + } + + for (firstName, firstOpRepr), (secondName, secondOpRepr) in itertools.combinations(opReprs.items(), 2): + firstKeySet = set(firstOpRepr.keys()) + secondKeySet = set(secondOpRepr.keys()) + assert firstKeySet.isdisjoint(secondKeySet), \ + f"[OP {node.op}] Encourntered error while parsing node {node.name}. " \ + f"Keys from parsing {firstName} clash with the keys from parsing {secondName}. "\ + f"Overlapping keys: {firstKeySet ^ secondKeySet}" + + resultOpRepr = {} + for opRepr in opReprs.values(): + resultOpRepr.update(opRepr) + + return resultOpRepr + + class NodeParser(): """Deeploy's core Parser class. Analyzes network nodes and evaluates whether they can be mapped by it. @@ -1143,7 +1313,9 @@ def _unpack_const(attr) -> Union[int, float]: The attributes can either be a numpy scalar value or a Constant tensor. This expects the numpy value to be of size 1. 
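Attribute canonicalization has to cope with ONNX attributes arriving as plain Python scalars, numpy scalars/arrays, or gs.Constant tensors. A small sketch of the unpacking behaviour, assuming the AttrDesc and IntUnpack definitions introduced in this patch:

import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.DeeployTypes import AttrDesc
from Deeploy.OperatorDescriptor import IntUnpack

axisDesc = AttrDesc("axis", IntUnpack, default = -1)

# All three attribute encodings unpack to the same plain Python int.
assert axisDesc.unpack(3) == 3
assert axisDesc.unpack(np.int64(3)) == 3                        # 0-dim numpy scalar
assert axisDesc.unpack(gs.Constant("axis", np.array([3]))) == 3  # single-element Constant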
""" - if isinstance(attr, gs.Constant): + if isinstance(attr, (int, float, bool, str)): + return attr + elif isinstance(attr, gs.Constant): value = attr.values elif isinstance(attr, np.ndarray): value = attr @@ -1312,14 +1484,12 @@ def typeCheckNodeInputs(self, ctxt: NetworkContext, node: gs.Node) -> bool: return retCheck def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: - for inputNode, _type in zip(node.inputs, self.input_types): - if isinstance(ctxt.lookup(inputNode.name), ConstantBuffer): - reference = ctxt.lookup(inputNode.name) - if not _type.referencedType.checkPromotion(reference.values): - raise Exception(f"Can't cast {reference} to {_type}!") - - ctxt.annotateType(inputNode.name, _type) - + for tensor, ty in zip(node.inputs, self.input_types): + buffer = ctxt.lookup(tensor.name) + if isinstance(buffer, ConstantBuffer): + if not ty.referencedType.checkPromotion(buffer.values): + raise Exception(f"Can't cast {buffer} to {ty}!") + ctxt.annotateType(tensor.name, ty) return ctxt def annotateDict(self, ctxt: NetworkContext, node: gs.Node, operatorRepresentation: OperatorRepresentation): @@ -2429,6 +2599,7 @@ def __init__(self, graph: gs.Graph, platform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), name: str = 'DeeployNetwork', deeployStateDir: str = "DeeployState"): @@ -2453,6 +2624,7 @@ def __init__(self, """ self.graph = graph + self.operatorDescriptors = operatorDescriptors self.scheduler = scheduler self.layerBinding: 'OrderedDict[str, ONNXLayer]' = OrderedDict() self.parsed = False @@ -2500,6 +2672,30 @@ def _createIOBindings(self, ctxt: NetworkContext, graph: gs.Graph): return ctxt + def hoistGraphTensors(self, typeMap: Dict[str, Type[Pointer]]): + for name, tensor in self.graph.tensors().items(): + if isinstance(tensor, gs.Constant): + buffer = self.ctxt.ConstantBuffer(name, tensor.shape, tensor.values) + self.ctxt.add(buffer, "global") + else: + buffer = self.ctxt.VariableBuffer(name, tensor.shape) + if tensor in self.graph.inputs: + buffer.is_input = True + self.ctxt.add(buffer, "global") + elif tensor in self.graph.outputs: + buffer.is_output = True + self.ctxt.add(buffer, "global") + else: + self.ctxt.add(buffer, "local") + self.ctxt.annotateType(name, typeMap[name]) + + # Users have to be annotated in order of the schedule + for layer in self.layerBinding.values(): + for tensor in layer.node.inputs: + buffer = self.ctxt.lookup(tensor.name) + isinstance(buffer, VariableBuffer) + buffer._users.append(layer.node.name) + def inputs(self) -> List[VariableBuffer]: """Return a list of all VariableBuffers that are also global inputs of the network @@ -2563,10 +2759,17 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): self.transformed = True def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: - for engine in self.Platform.engines: - if node.op in engine.Mapping: - return engine.Mapping[node.op](node) - raise RuntimeError(f"No mapping found for node {node.name} with op type {node.op}") + engine = None + if "engine" in node.attrs: + engineName = node.attrs["engine"] + engine = [engine for engine in self.Platform.engines if engine.name == engineName][0] + else: + for candidateEngine in self.Platform.engines: + if node.op in candidateEngine.Mapping: + engine = candidateEngine + break + assert engine is not None, f"No mapping found for node {node.name} with op type {node.op}" + 
return engine.Mapping[node.op](node) def _bindLayers(self): # Create schedule, binding, then parse resulting program for correctness @@ -2582,6 +2785,16 @@ def _bindLayers(self): flatSchedule += subGraph for node in flatSchedule: + assert node.op in self.operatorDescriptors, \ + f"[ERROR] Error parsing node {node.name}. There is no descriptor for operator {node.op}." + desc = self.operatorDescriptors[node.op] + try: + desc.canonicalize(node, self.graph.opset) + except BaseException as e: + raise ValueError(f"[ERROR] Node {node.name} of op {node.op} could not be canonicalized.") from e + assert desc.check(node), \ + f"[ERROR] Node {node.name} is not a valid instance of {node.op} operator" + layer = self._mapNode(node) if isinstance(layer, ONNXLayer): log.debug(f" {SUCCESS_MARK} Bind {node.name} to layer {layer.__class__.__name__}") @@ -2604,6 +2817,114 @@ def _typeCheckNode(self, node: ONNXLayer, ctxt: NetworkContext) -> Tuple[Network return newCtxt, True + def typeCheckInputs(self, types: Sequence[Optional[Type[Pointer]]], supportedTypes: Sequence[Type[Pointer]], + tensors: Sequence[gs.Tensor]) -> bool: + assert len(types) == len(tensors) + + valid = True + for ty, tensor, suppTy in zip(types, tensors, supportedTypes): + if isinstance(tensor, gs.Constant): + if not suppTy.referencedType.checkValue(tensor.values): + # TODO: Log + valid = False + elif isinstance(tensor, gs.Variable): + if ty is None: + # TODO: Log + valid = False + continue + if tensor in self.graph.inputs: + # TODO: Why do we do this for graph inputs?? + refTy = ty.referencedType + suppRefTy = suppTy.referencedType + if not suppRefTy.partialOrderUpcast(refTy): + # TODO: Log + valid = False + else: + if ty != suppTy: + # TODO: Log + valid = False + else: + raise ValueError(f"Unsupported tensor type {type(tensor)}") + + return valid + + def selectTemplate( + self, schedule: Sequence[ONNXLayer], + candidates: Dict[str, List[NodeBinding]]) -> Tuple[Dict[str, NodeBinding], Dict[str, Type[Pointer]]]: + selection: Dict[str, Optional[NodeBinding]] = dict.fromkeys(candidates.keys()) + discard: Dict[str, List[NodeBinding]] = {k: [] for k in candidates.keys()} + typeMap: Dict[str, Optional[Type[Pointer]]] = dict.fromkeys(self.graph.tensors().keys()) + + typeMap.update(self.inputTypes) + + idx: int = 0 + deepestIdx = 0 + + while (idx < len(schedule)): + layer = schedule[idx] + node = layer.node + deepestIdx = max(idx, deepestIdx) + + log.debug(31 * "-" + f" TRYING NODE {node.name} OP {node.op} AT IDX {idx} " + 31 * "-") + + inputTypes = [typeMap[t.name] for t in node.inputs] + + viable = [] + for binding in candidates[node.name]: + if binding in discard[node.name]: + # TODO: Log + continue + if not self.typeCheckInputs(inputTypes, binding.typeChecker.input_types, node.inputs): + # TODO: Log + continue + viable.append(binding) + + if len(viable) > 0: + selectedBinding = viable[0] + # Update inputs types because we might have casted constant tensors + typeMap.update(zip([t.name for t in node.inputs], selectedBinding.typeChecker.input_types)) + # Update output types + typeMap.update(zip([t.name for t in node.outputs], selectedBinding.typeChecker.output_types)) + selection[node.name] = selectedBinding + idx += 1 + elif idx == 0: + # SCHEREMO: If we can't find a mapping for the root, we must exit + layer = schedule[deepestIdx] + node = layer.node + log.debug("-" * 80) + log.error("💥 PARSING FAILED - Backtracking exhausted at root!") + log.error("=" * 80) + log.error(f"🔍 Diagnosis:") + log.error(f" - Deepest successful exploration: Layer 
{deepestIdx} '{node.name}'") + log.error(f" - Candidates: {[type(binding).__name__ for binding in candidates[node.name]]}") + log.error("=" * 80) + raise RuntimeError( + f'Did not find adequate mapping for graph! Explored until layer {layer} of node {node.name} ' + f'Candidates: {[type(binding).__name__ for binding in candidates[node.name]]}. Exhausted backtracking.' + ) + else: + # SCHEREMO: Rollback one step + prev = schedule[idx - 1] + node = prev.node + prevSelection = selection[node.name] + assert prevSelection is not None, f"Previous node doesn't have a selection" + discard[node.name].append(prevSelection) + selection[node.name] = None + idx = idx - 1 + log.debug(31 * "-" + f" ROLLBACK TO IDX {idx} " + 31 * "-") + + finalSelection: Dict[str, NodeBinding] = {} + for name, binding in selection.items(): + assert binding is not None + finalSelection[name] = binding + + finalTypeMap: Dict[str, Type[Pointer]] = {} + for name, ty in typeMap.items(): + assert ty is not None + finalTypeMap[name] = ty + + return finalSelection, finalTypeMap + # Don't override this def parse(self, default_channels_first: bool = True) -> bool: """Parses the full network by iteratively exploring mapping and binding options with backtracking @@ -2631,97 +2952,141 @@ def parse(self, default_channels_first: bool = True) -> bool: constantBuffer = self.Platform.ConstantBuffer, structBuffer = self.Platform.StructBuffer, transientBuffer = self.Platform.TransientBuffer) + # Create schedule, binding, then parse resulting program for correctness + schedule = self.scheduler(self.graph) + flatSchedule = [] - log.debug(" - Create IO Bindings") - self.ctxt = self._createIOBindings(self.ctxt, self.graph) - - log.debug(" - Bind Nodes to Layers") - self._bindLayers() - - ctxt = self.ctxt.copy() + for subGraph in schedule: + if isinstance(subGraph, gs.Node): + flatSchedule.append(subGraph) + else: + flatSchedule += subGraph - ctxtStack = deque() - scheduledLayerList = list(self.layerBinding.values()) - idx: int = 0 + self.layerBinding: 'OrderedDict[str, ONNXLayer]' = OrderedDict() + templateCandidates: Dict[str, List[NodeBinding]] = {} + for node in flatSchedule: + assert node.op in self.operatorDescriptors, \ + f"[ERROR] Error parsing node {node.name}. There is no descriptor for operator {node.op}." + desc = self.operatorDescriptors[node.op] + desc.canonicalize(node, self.graph.opset) + assert desc.check(node), \ + f"[ERROR] Node {node.name} is not a valid instance of {node.op} operator" - deepestIdx = 0 + layer = self._mapNode(node) + if isinstance(layer, ONNXLayer): + self.layerBinding[node.name] = layer + + candidates = [] + discardedMaps = [] + discardedBindings = [] + for map in layer.maps: + if not map.parser.parseNode(node): + discardedMaps.append(map) + continue + + # NOTE: We count a map to be _true_ SignProp if all the integer bindings support only signed output + outRefTys = [binding.typeChecker.output_types[0].referencedType for binding in map.bindings] + intRefTys = [ty for ty in outRefTys if issubclass(ty, IntegerImmediate)] + trueSignProp = all(ty.signed for ty in intRefTys) + + for binding in map.bindings: + if not binding.template._tensorShapesBroadcastable(node): + discardedBindings.append((binding, "Shapes are not broadcastable")) + continue + # NOTE: will this even be needed once I can infer the outtype from a template + # immediately and not by looking at a bunch of bindings? 
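selectTemplate implements plain chronological backtracking over per-node binding candidates: advance while a type-compatible binding exists, otherwise discard the previous node's choice and roll back one step. A self-contained sketch of that scheme (backtrackSelect and the viable() predicate are illustrative stand-ins, not Deeploy API):

from typing import Callable, Dict, List, Optional

def backtrackSelect(schedule: List[str], candidates: Dict[str, List[str]],
                    viable: Callable[[str, str, Dict[str, Optional[str]]], bool]) -> Dict[str, Optional[str]]:
    # selection[n] is the candidate currently chosen for node n (None = undecided);
    # discard[n] collects candidates ruled out under the current upstream choices.
    selection: Dict[str, Optional[str]] = dict.fromkeys(schedule)
    discard: Dict[str, List[str]] = {n: [] for n in schedule}
    idx = 0
    while idx < len(schedule):
        node = schedule[idx]
        options = [c for c in candidates[node]
                   if c not in discard[node] and viable(node, c, selection)]
        if options:
            selection[node] = options[0]
            idx += 1
        elif idx == 0:
            raise RuntimeError("Exhausted backtracking at the root node")
        else:
            # Roll back one step: rule out the previous node's current choice and
            # reopen this node's options, since its upstream context will change.
            prev = schedule[idx - 1]
            discard[prev].append(selection[prev])
            selection[prev] = None
            discard[node].clear()
            idx -= 1
    return selection

# Example: "a" first tries "fp32", "b" only accepts "int8" inputs, so one rollback occurs.
assert backtrackSelect(["a", "b"], {"a": ["fp32", "int8"], "b": ["int8"]},
                       viable = lambda n, c, sel: n != "b" or sel["a"] == "int8") \
       == {"a": "int8", "b": "int8"}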
This only makes sense here now + # because we have to sift through the bindings, but if we can deduce the out type straight + # from the input types + node attrs, we don't need that. + if not trueSignProp and "signed" in node.attrs or "rqsOut_signed" in node.attrs: + signed = node.attrs["signed"] if "signed" in node.attrs else node.attrs["rqsOut_signed"] + assert len(binding.typeChecker.output_types) == 1, f"Assume 1 output" + refTy = binding.typeChecker.output_types[0].referencedType + if issubclass(refTy, IntegerImmediate) and signed != refTy.signed: + discardedBindings.append( + (binding, f"Out type is not {'signed' if signed else 'unsigned'}")) + continue + candidates.append(binding) + assert len(candidates) > 0, ( + f"Node {node.name} of op {node.op} has no template candidate.\n" \ + f"Tried these maps: {discardedMaps}\n" \ + f"Tried these bindings:\n" + + "\n".join(f" - Binding {binding}: {msg}" for binding, msg in discardedBindings) + ) + templateCandidates[node.name] = candidates - log.debug(" - Parse and Type Check Network") + log.debug(" - Template selection") start_time = time.perf_counter() + selection, typeMap = self.selectTemplate(list(self.layerBinding.values()), templateCandidates) + end_time = time.perf_counter() + log.info( + f" {SUCCESS_MARK} Template selection succeded with {len(self.layerBinding)} layers in {(end_time-start_time)*1E3:.3f} ms" + ) - iteration_main = 0 - iteration_sub = 0 - iteration_tot = 0 - while (idx < len(scheduledLayerList)): - currentLayer = scheduledLayerList[idx] - - # Log current exploration state - if idx == 0: - iteration_main += 1 - iteration_tot += 1 - iteration_sub = 0 - log.debug(31 * "-" + f" MAIN ITERATION {iteration_main:<2} " + 31 * "-") - - log.debug(f"[Layer {idx}] Trying '{currentLayer.node.name}' (op: {currentLayer.node.op})") - - stCtxt = copy.deepcopy(ctxt) + # TODO: Remove after refactor + # Fixup the choice to old way + for layer in self.layerBinding.values(): + binding = selection[layer.node.name] + + # Find map + selectedMap = None + for map in layer.maps: + if binding in map.bindings: + selectedMap = map + break + assert selectedMap is not None, f"Cannot find binding {binding} in any map" + + selectedMap.binder = binding + selectedMap.bound = True + layer.mapper = selectedMap + + # Align shapes + for layer in self.layerBinding.values(): + node = layer.node + newInputShapes, _ = layer.mapper.binder.template._alignShapes(node) + for tensor, shape in zip(node.inputs, newInputShapes): + # TODO: This needs to be investigated because it assumes that if the shape is + # broadcastable, it is also executable, but that might not be the case. + # E.g., just because a kernel can implement a requant shift with per-channel + # rqs params, doesn't mean it can do it for per-layer params. + # There needs to be a mechanism for the kernel (template) to say which + # shapes it can execute, and which shapes it can execute if they get broadcasted. + # Current vision is 2 functions `checkShapes` and `negotiateBroadcasts`, but + # it's a wip. + shape = np.broadcast_shapes(tensor.shape, shape) + if isinstance(tensor, gs.Variable): + if tensor in self.graph.inputs: + tensor.shape = shape + elif any(dim != other for dim, other in zip(tensor.shape, shape)): + raise RuntimeError( + "Non-graph-input shape change is forbidden for now until someone adds automatic Expand node insertion." 
+ f"Node {node.name}'s alignShape tried to change tensor {tensor.name}'s shape {tensor.shape} to {shape}" + ) + elif isinstance(tensor, gs.Constant): + if math.prod(tensor.shape) == math.prod(shape): + tensor.values = tensor.values.reshape(shape) + else: + tensor.values = np.broadcast_to(tensor.values, shape) - newCtxt, parseSuccess = self._parseNode(currentLayer, ctxt, default_channels_first) + self.hoistGraphTensors(typeMap) - typeCheckSuccess = False - if parseSuccess: - newCtxt, typeCheckSuccess = self._typeCheckNode(currentLayer, newCtxt) + for layer in self.layerBinding.values(): + node = layer.node + parser = layer.mapper.parser - if parseSuccess and typeCheckSuccess: - # SCHEREMO: Continue depth-first exploration - ctxtStack.append(stCtxt) - ctxt = newCtxt - idx = idx + 1 - if idx > deepestIdx: - deepestIdx = max(idx, deepestIdx) - deepestCtxt = stCtxt + parser.parseNode(node) + parser.parseNodeCtxt(self.ctxt, node, default_channels_first) - else: - # SCHEREMO: If we can't find a mapping for the root, we must exit - if idx == 0: - deepestLayer = scheduledLayerList[deepestIdx] - deepestNodeName = deepestLayer.node.name - log.debug("-" * 80) - log.error("💥 PARSING FAILED - Backtracking exhausted at root!") - log.error("=" * 80) - log.error(f"🔍 Diagnosis:") - log.error(f" - Deepest successful exploration: Layer {deepestIdx} '{deepestNodeName}'") - log.error( - f" - Deepest layer available mappers: {[type(x.parser).__name__ for x in deepestLayer.maps]}") - log.error("=" * 80) - raise RuntimeError( - f'Did not find adequate mapping for graph! Explored until layer {deepestLayer.__class__.__name__} of node {deepestNodeName}' - f'Candidates: {[type(x.parser).__name__ for x in deepestLayer.maps]}. Exhausted backtracking.') - - previousLayer = scheduledLayerList[idx - 1] - ctxt = ctxtStack.pop() - - # Keep options of current layer open - the upstream mapping will change, so we don't know which options are feasible here - currentLayer.resetDiscardedMappers() - - # Update the previous layer, by discarding the current mapper or binder - if previousLayer.mapper.bindingsExhausted(): - previousLayer.discardCurrentMapper() - else: - previousLayer.mapper.discardCurrentBinder() + opRepr = parser.operatorRepresentation + opRepr["nodeName"] = node.name + opRepr["nodeOp"] = node.op + opRepr["channels_first"] = node.attrs.get("channels_first", default_channels_first) - # SCHEREMO: Rollback one step - idx = idx - 1 - if idx != 0: - iteration_sub += 1 - iteration_tot += 1 - log.debug(31 * "-" + f" SUB ITERATION {iteration_main}.{iteration_sub:<2} " + 31 * "-") + for tensor in node.inputs + node.outputs: + for key, value in opRepr.items(): + if isinstance(value, str) and value == tensor.name: + opRepr[f"{key}_type"] = typeMap[value] + break - end_time = time.perf_counter() - log.info( - f" {SUCCESS_MARK} Parsed network with {len(self.layerBinding)} layers after {iteration_tot} iterations in {(end_time-start_time)*1E3:.3f} ms" - ) - self.ctxt = ctxt self.parsed = True return True @@ -3181,6 +3546,7 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), name: str = 'DeeployNetwork', default_channels_first: bool = True, @@ -3213,7 +3579,13 @@ def __init__(self, """ - super().__init__(graph, deploymentPlatform, inputTypes, scheduler, name, deeployStateDir = deeployStateDir) + super().__init__(graph, + 
deploymentPlatform, + inputTypes, + operatorDescriptors, + scheduler, + name, + deeployStateDir = deeployStateDir) self.loweringOptimizer = loweringOptimizer self.default_channels_first = default_channels_first @@ -3317,8 +3689,10 @@ def _duplicateConstants(self, graph: gs.Graph) -> None: graph.cleanup().toposort() def _foldConstants(self, graph: gs.Graph): + graph.toposort() # fold_constants requires the graph to be topologically sorted graph.fold_constants() - graph.cleanup().toposort() + graph.cleanup() # fold_constants doesn't remove dangling Constant nodes so we need a cleanup + graph.toposort() # toposort for good measure def _sanitizeGraphNames(self, graph: gs.Graph): @@ -3377,6 +3751,10 @@ def _assertTensorsHaveShape(self) -> None: assert len(missingShapes) == 0, \ f"Shape inference is not supported.\nFound tensors with missing shape annotation: {missingShapes}" + def _annotateChannelsFirst(self, graph: gs.Graph, default: bool) -> None: + for node in graph.nodes: + node.attrs["channels_first"] = node.attrs.get("channels_first", default) + def frontEnd(self): """API hook to prepare the graph to be deployed and build the initial NetworkContext @@ -3427,6 +3805,9 @@ def frontEnd(self): log.info(" - Assert all tensors have a shape annotation") self._assertTensorsHaveShape() + log.info("- Annotate node's with channel layout info") + self._annotateChannelsFirst(self.graph, self.default_channels_first) + log.info("- Perform Graph Parsing") try: self.parse(self.default_channels_first) # This reparses the lowered graph diff --git a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py index 4b05ab5be..eb7175f61 100644 --- a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py +++ b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py @@ -8,7 +8,8 @@ from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper -from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, Schedule, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, OperatorDescriptor, Schedule, \ + TopologyOptimizer from Deeploy.EngineExtension.OptimizationPasses.TopologyOptimizationPasses.EngineColoringPasses import \ EngineColoringPass, EngineMapper @@ -20,12 +21,13 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), name: str = 'DeeployNetwork', default_channels_first: bool = True, deeployStateDir: str = "DeeployState", engineMapperCls: Type[EngineMapper] = EngineMapper): - super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, operatorDescriptors, scheduler, name, default_channels_first, deeployStateDir) self._initEngineColoringDeployer(engineMapperCls) diff --git a/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py b/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py index 2599f9e81..d75b28433 100644 --- a/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py +++ b/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py @@ -11,8 +11,8 @@ from 
Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, DeploymentEngine, DeploymentPlatform, \ - NetworkContext, NetworkDeployer, NetworkOptimizationPass, NetworkOptimizer, Schedule, StructBuffer, \ - TopologyOptimizer, TransientBuffer, VariableBuffer, _NoVerbosity + NetworkContext, NetworkDeployer, NetworkOptimizationPass, NetworkOptimizer, OperatorDescriptor, Schedule, \ + StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer, _NoVerbosity from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel @@ -112,12 +112,13 @@ def __init__(self, deploymentPlatform: Union[MemoryPlatform, MemoryPlatformWrapper], inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes), name: str = 'DeeployNetwork', default_channels_first: bool = True, deeployStateDir: str = "DeeployState", memoryLevelAnnotationPasses: List[NetworkOptimizationPass] = []): - super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, operatorDescriptors, scheduler, name, default_channels_first, deeployStateDir) if len(memoryLevelAnnotationPasses) == 0: memoryLevelAnnotationPasses.append(AnnotateDefaultMemoryLevel(self.Platform.memoryHierarchy)) @@ -155,13 +156,14 @@ def __init__(self, deploymentPlatform: Union[MemoryPlatform, MemoryPlatformWrapper], inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda x: x, name: str = 'DeeployNetwork', default_channels_first: bool = True, deeployStateDir: str = "DeeployState", inputOffsets: Dict[str, int] = {}, memoryLevelAnnotationPasses: List[NetworkOptimizationPass] = []): - super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, operatorDescriptors, scheduler, name, default_channels_first, deeployStateDir, inputOffsets) if len(memoryLevelAnnotationPasses) == 0: memoryLevelAnnotationPasses.append(AnnotateDefaultMemoryLevel(self.Platform.memoryHierarchy)) diff --git a/Deeploy/OperatorDescriptor.py b/Deeploy/OperatorDescriptor.py new file mode 100644 index 000000000..288a9de50 --- /dev/null +++ b/Deeploy/OperatorDescriptor.py @@ -0,0 +1,833 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from enum import Enum, IntEnum +from typing import Any, Dict, Tuple, Union + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import AttrDesc, IoDesc, OperatorDescriptor, VariadicIoDesc +from Deeploy.Logging import DEFAULT_LOGGER as log + + +def IntUnpack(value: Any) -> int: + if isinstance(value, (list, tuple)) and len(value) == 1: + value = value[0] + + if isinstance(value, int): + return value + elif isinstance(value, float): + assert value.is_integer(), f"Received a non-integer value {value}" + return int(value) + raise ValueError(f"Unsupported value type 
{type(value)}") + + +def BoolUnpack(value: Any) -> bool: + value = IntUnpack(value) + assert value in [0, 1], f"Casting to bool only supported from 0, 1. Received {value}" + return bool(value) + + +def FloatUnpack(value: Any) -> float: + if isinstance(value, (list, tuple)) and len(value) == 1: + value = value[0] + + assert isinstance(value, (int, float)), f"Unsupported value type {type(value)}" + return float(value) + + +def IntTupleUnpack(value: Any) -> Tuple[int, ...]: + try: + return tuple(IntUnpack(item) for item in value) + except TypeError: + return (IntUnpack(value),) + + +def FloatTupleUnpack(value: Any) -> Tuple[float, ...]: + try: + return tuple(FloatUnpack(item) for item in value) + except TypeError: + return (FloatUnpack(value),) + + +def IntTupleIfNotSingleItemUnpack(value: Any) -> Union[int, Tuple[int, ...]]: + try: + return IntUnpack(value) + except ValueError: + return IntTupleUnpack(value) + + +def attrToInputTensor(node: gs.Node, attr: str) -> None: + values = node.attrs[attr] + if isinstance(values, (int, float)): + values = np.array([values]) + elif isinstance(values, (list, tuple)): + values = np.array(values) + assert isinstance(values, np.ndarray), f"Unsupported values type {type(values)}" + tensor = gs.Constant(f"{node.name}_{attr}", values) + node.inputs.append(tensor) + node.attrs.pop(attr) + + +def inputTensorToAttr(node: gs.Node, tensorIdx: int, attr: str) -> None: + tensor = node.inputs[tensorIdx] + assert isinstance(tensor, gs.Constant), \ + f"Can convert only constant tensors to attributes. Received tensor of type {tensor}" + node.attrs[attr] = tensor.values + tensor.outputs.clear() + + +concatDesc = OperatorDescriptor( + inputDescriptor = VariadicIoDesc("data_in", minNumTensors = 2), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [], +) + +iRMSNormDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "weight"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("D", IntUnpack), + AttrDesc("n_levels", IntUnpack), + ], +) + + +class SliceDescriptor(OperatorDescriptor): + + def canonicalize(self, node: gs.Node, opset: int) -> bool: + if opset < 10: + attrToInputTensor(node, "starts") + attrToInputTensor(node, "ends") + if "axes" in node.attrs: + attrToInputTensor(node, "axes") + + return super().canonicalize(node, opset) + + +# Opset: 13 +sliceDesc = SliceDescriptor( + inputDescriptor = IoDesc(["data_in", "starts", "ends"], ["axes", "steps"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [], +) + +# Opset: 1 +sliceDescOld = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("axes", IntTupleUnpack, lambda n: range(len(n.attrs["starts"]))), + AttrDesc("ends", IntTupleUnpack), + AttrDesc("starts", IntTupleUnpack), + ], +) + +transposeDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("perm", IntTupleUnpack)], +) + + +class CeilMode(IntEnum): + floor = 0 + ceil = 1 + + +maxPoolDesc = OperatorDescriptor(inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("ceil_mode", unpacker = CeilMode, default = CeilMode.floor), + AttrDesc("kernel_shape", IntTupleUnpack), + AttrDesc("pads", IntTupleUnpack), + AttrDesc("strides", IntTupleUnpack), + ]) + + +class PadMode(str, Enum): + constant = "constant" + reflect = "reflect" + edge = "edge" + wrap = "wrap" + + +# Opset 24 +padDesc = 
OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "pads"], ["constant_value", "axes"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc('mode', unpacker = PadMode, default = PadMode.constant), + ], +) + + +class PadModeOld(str, Enum): + constant = "constant" + reflect = "reflect" + edge = "edge" + + +padDescOld = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("mode", unpacker = PadModeOld, default = PadModeOld.constant), + AttrDesc("pads", IntTupleUnpack), + AttrDesc("value", FloatUnpack), + ], +) + +addDesc = OperatorDescriptor( + inputDescriptor = VariadicIoDesc("data_in", minNumTensors = 2), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [], +) + + +class ReduceMeanDescriptor(OperatorDescriptor): + + def canonicalize(self, node: gs.Node, opset: int) -> bool: + if opset < 18: + if "axes" in node.attrs: + attrToInputTensor(node, "axes") + return super().canonicalize(node, opset) + + +# Opset 18 +reduceMeanDesc = ReduceMeanDescriptor( + inputDescriptor = IoDesc("data_in", optional = "axes"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("keepdims", unpacker = BoolUnpack, default = True), + AttrDesc("noop_with_empty_axes", unpacker = BoolUnpack, default = False), + ], +) + +reduceSumDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in", optional = "axes"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("keepdims", unpacker = BoolUnpack, default = True), + AttrDesc("noop_with_empty_axes", unpacker = BoolUnpack, default = False), + ], +) + +softmaxDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("axis", IntUnpack, default = -1)], +) + +softmaxGradDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["upstream_grad", "softmax_output"]), + outputDescriptor = IoDesc("softmax_grad"), + attrDescriptors = [AttrDesc("axis", IntUnpack, default = -1)], +) + +iSoftmaxDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("axis", IntUnpack, default = -1), + AttrDesc("coeffA", IntUnpack), + AttrDesc("coeffB", IntUnpack), + AttrDesc("coeffC", IntUnpack), + AttrDesc("log2", IntUnpack), + AttrDesc("n_levels", IntUnpack), + ], +) + +itaMaxDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("axis", IntUnpack, default = -1), + AttrDesc("n_levels", IntUnpack), + ], +) + +itaPartialMaxDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("axis", IntUnpack, default = -1), + AttrDesc("n_levels", IntUnpack), + AttrDesc("group_width", IntUnpack), + ], +) + + +class GeluApprox(str, Enum): + tanh = "tanh" + none = "none" + + +geluDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("approximate", GeluApprox, default = GeluApprox.none), + ], +) + +iGeluDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("b", IntUnpack), + AttrDesc("one", IntUnpack), + ], +) + +requantizedIGeluDesc = OperatorDescriptor(inputDescriptor = IoDesc(["data_in", "mul", "add", "shift"]), + outputDescriptor = IoDesc("data_out"), + 
attrDescriptors = [ + AttrDesc("b", IntUnpack), + AttrDesc("one", IntUnpack), + ]) + +iHardswishDesc = OperatorDescriptor(inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("one_over_six", IntUnpack), + AttrDesc("six", IntUnpack), + AttrDesc("three", IntUnpack), + ]) + +requantizedIHardswishDesc = OperatorDescriptor(inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("one_over_six", IntUnpack), + AttrDesc("six", IntUnpack), + AttrDesc("three", IntUnpack), + AttrDesc("mul", IntUnpack), + AttrDesc("add", IntUnpack), + AttrDesc("shift", IntUnpack), + ]) + +iNoNormDesc = OperatorDescriptor(inputDescriptor = IoDesc(["data_in", "weights", "bias"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("D", IntUnpack), + AttrDesc("mul", IntUnpack), + AttrDesc("n_levels", IntUnpack), + ]) + +quantDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("scale", FloatUnpack), + AttrDesc("zero_point", FloatUnpack), + AttrDesc("bit_width", IntUnpack), + AttrDesc("signed", BoolUnpack, default = True), + AttrDesc("min_val", + IntUnpack, + default = lambda node: -(2**(node.attrs["bit_width"] - 1)) if node.attrs["signed"] else 0), + AttrDesc("max_val", + IntUnpack, + default = lambda node: 2**(node.attrs["bit_width"] - 1) - 1 + if node.attrs["signed"] else 2**node.attrs["bit_width"] - 1), + ], +) + + +class AutoPad(str, Enum): + NOTSET = "NOTSET" + SAME_UPPER = "SAME_UPPER" + SAME_LOWER = "SAME_LOWER" + VALID = "VALID" + + +def _dilationsDefault(node: gs.Node) -> Tuple[int, ...]: + # Remove 2 dims for input and output channels + nSpatialDims = len(node.inputs[1].shape) - 2 + return tuple([1] * nSpatialDims) + + +def _kernelShapeDefault(node: gs.Node) -> Tuple[int, ...]: + # Remove 2 dims for input and output channels + nSpatialDims = len(node.inputs[1].shape) - 2 + return node.inputs[1].shape[-nSpatialDims:] + + +def _stridesDefault(node: gs.Node) -> Tuple[int, ...]: + # Remove 2 dims for input and output channels + nSpatialDims = len(node.inputs[1].shape) - 2 + return tuple([1] * nSpatialDims) + + +def _padsDefault(node: gs.Node) -> Tuple[int, ...]: + # Remove 2 dims for input and output channels + nSpatialDims = len(node.inputs[1].shape) - 2 + # Two 0's per dimension for begin and end + return tuple([0] * (2 * nSpatialDims)) + + +convDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "weight"], optional = "bias"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("auto_pad", AutoPad, default = AutoPad.NOTSET), + AttrDesc("dilations", IntTupleUnpack, default = _dilationsDefault), + AttrDesc("group", IntUnpack, default = 1), + AttrDesc("kernel_shape", IntTupleUnpack, default = _kernelShapeDefault), + AttrDesc("pads", IntTupleUnpack, default = _padsDefault), + AttrDesc("strides", IntTupleUnpack, default = _stridesDefault), + ], +) + +convTransposeDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "weight"], optional = "bias"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("auto_pad", AutoPad, default = AutoPad.NOTSET), + AttrDesc("dilations", IntTupleUnpack, default = _dilationsDefault), + AttrDesc("group", IntUnpack, default = 1), + AttrDesc("kernel_shape", IntTupleUnpack, default = _kernelShapeDefault), + # TODO: Add output_shape and output_padding default functions. 
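The *Default helpers above derive the missing Conv-style attributes from the weight tensor's rank. A short sketch of convDesc filling them in on a bare Conv node (tensor names and shapes are illustrative):

import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.OperatorDescriptor import convDesc

# Illustrative 2D convolution: 8 input channels, 4 output channels, 3x3 kernel.
x = gs.Variable("x", dtype = np.float32, shape = (1, 8, 16, 16))
w = gs.Constant("w", np.zeros((4, 8, 3, 3), dtype = np.float32))
y = gs.Variable("y", dtype = np.float32, shape = (1, 4, 16, 16))
conv = gs.Node(op = "Conv", name = "conv0", attrs = {}, inputs = [x, w], outputs = [y])

convDesc.canonicalize(conv, opset = 13)
assert convDesc.check(conv)
# The weight has two spatial dims, so the derived defaults are:
# kernel_shape = (3, 3), strides = (1, 1), dilations = (1, 1), pads = (0, 0, 0, 0), group = 1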
+ # Docs: + # - ONNX: https://onnx.ai/onnx/operators/onnx__ConvTranspose.html + # - PyTorch: https://docs.pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html + # AttrDesc("output_shape", IntTupleUnpack, default = _outputShapeDefault), + # AttrDesc("output_padding", IntTupleUnpack, default = _outputPaddingDefault), + AttrDesc("pads", IntTupleUnpack, default = _padsDefault), + AttrDesc("strides", IntTupleUnpack, default = _stridesDefault), + ], +) + + +class RequantizedOperatorDescriptor(OperatorDescriptor): + + def canonicalize(self, node: gs.Node, opset: int) -> bool: + if "n_levels_out" in node.attrs and "n_levels" in node.attrs: + log.warning("Requantized operator cannot have n_levels_out and n_levels in its attributes") + return False + + if "n_levels_out" in node.attrs: + node.attrs["n_levels"] = node.attrs["n_levels_out"] + node.attrs.pop("n_levels_out") + + return super().canonicalize(node, opset) + + +requantizedConvDesc = RequantizedOperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "weight", "mul", "add"], optional = ["shift"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + # Conv attrs + AttrDesc("auto_pad", AutoPad, default = AutoPad.NOTSET), + AttrDesc("dilations", IntTupleUnpack, default = _dilationsDefault), + AttrDesc("group", IntUnpack, default = 1), + AttrDesc("kernel_shape", IntTupleUnpack, default = _kernelShapeDefault), + AttrDesc("pads", IntTupleUnpack, default = _padsDefault), + AttrDesc("strides", IntTupleUnpack, default = _stridesDefault), + # RequantizedShift attrs + AttrDesc("n_levels", IntUnpack), + AttrDesc("signed", BoolUnpack), + AttrDesc("div", IntUnpack), + ], +) + +dequantDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("scale", FloatUnpack), + AttrDesc("zero_point", FloatUnpack), + AttrDesc("bit_width", IntUnpack), + AttrDesc("signed", BoolUnpack), + ], +) + +divDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["input1", "input2"]), + outputDescriptor = IoDesc("output"), + attrDescriptors = [], +) + +integerDivDescriptor = OperatorDescriptor( + inputDescriptor = IoDesc(["A", "B"]), + outputDescriptor = IoDesc("C"), + attrDescriptors = [ + AttrDesc("Delta", IntUnpack), + AttrDesc("eps", IntUnpack), + AttrDesc("eta", IntUnpack), + ], +) + +requantizedIntegerDivDescriptor = RequantizedOperatorDescriptor( + inputDescriptor = IoDesc(["A", "B", "requant_mul", "requant_add", "requant_div"]), + outputDescriptor = IoDesc("C"), + attrDescriptors = [ + # IntegerDiv attrs + AttrDesc("Delta", IntUnpack), + AttrDesc("eps", IntUnpack), + AttrDesc("eta", IntUnpack), + # RequantizedShift attrs + AttrDesc("n_levels", IntUnpack), + AttrDesc("signed", BoolUnpack), + AttrDesc("div", IntUnpack), + ]) + +debugPrintDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [], +) + +layerNormalizationDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "weight", "bias"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("epsilon", FloatUnpack)], +) + +iLayerNormDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "weight", "bias"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("D", IntUnpack), AttrDesc("n_levels", IntUnpack)], +) + +flattenDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("axis", IntUnpack, default = 1)], 
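+    # axis = 1 is the ONNX default: everything after the batch dimension is flattened.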
+) + +gatherDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "indices"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("axis", IntUnpack, default = 0)], +) + + +class SqueezeDescriptor(OperatorDescriptor): + + def canonicalize(self, node: gs.Node, opset: int) -> bool: + if len(node.inputs) == 2: + inputTensorToAttr(node, tensorIdx = 1, attr = "axes") + + if opset >= 13 and len(node.inputs) != 2: + log.warning( + "Squeeze operation expects 2 inputs for opset >= 13. " + f"Received node {node.name} with {len(node.inputs)} input{'s' if len(node.inputs) > 1 else ''} and opset {opset}" + ) + elif opset < 13 and len(node.inputs) != 1: + log.warning( + "Squeeze operation expects 1 input for opset < 13. " + f"Received node {node.name} with {len(node.inputs)} input{'s' if len(node.inputs) > 1 else ''} and opset {opset}" + ) + + return super().canonicalize(node, opset) + + +# Opset <= 11 +unsqueezeDesc = SqueezeDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("axes", IntTupleUnpack)], +) + +# Opset <= 11 +squeezeDesc = SqueezeDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [AttrDesc("axes", IntTupleUnpack)], +) + +mulDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["A", "B"]), + outputDescriptor = IoDesc("C"), + attrDescriptors = [], +) + +matMulDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["A", "B"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [], +) + +rqMatMulDesc = RequantizedOperatorDescriptor( + inputDescriptor = IoDesc(["A", "B", "add", "mul"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + # RequantizedShift attrs + AttrDesc("n_levels", IntUnpack), + AttrDesc("signed", BoolUnpack), + AttrDesc("div", IntUnpack), + ], +) + +gemmDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["A", "B"], optional = ["C"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("alpha", FloatUnpack, default = 1.0), + AttrDesc("beta", FloatUnpack, default = 1.0), + AttrDesc("transA", BoolUnpack, default = False), + AttrDesc("transB", BoolUnpack, default = False), + ], +) + +rqGemmDesc = RequantizedOperatorDescriptor( + inputDescriptor = IoDesc(["A", "B", "C", "add", "mul"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("alpha", FloatUnpack, default = 1.0), + AttrDesc("beta", FloatUnpack, default = 1.0), + AttrDesc("transA", BoolUnpack, default = False), + AttrDesc("transB", BoolUnpack, default = False), + # RequantizedShift attrs + AttrDesc("n_levels", IntUnpack), + AttrDesc("signed", BoolUnpack), + AttrDesc("div", IntUnpack), + ]) + +requantizedGemmDesc = RequantizedOperatorDescriptor( + inputDescriptor = IoDesc(["A", "B", "add", "mul"]), # Important diff to RQGemm + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("alpha", FloatUnpack, default = 1.0), + AttrDesc("beta", FloatUnpack, default = 1.0), + AttrDesc("transA", BoolUnpack, default = False), + AttrDesc("transB", BoolUnpack, default = False), + # RequantizedShift attrs + AttrDesc("n_levels", IntUnpack), + AttrDesc("signed", BoolUnpack), + AttrDesc("div", IntUnpack), + ]) + +linearAttentionDesc = OperatorDescriptor( + inputDescriptor = IoDesc( + ["q", "k", "v", "wq_weight", "wq_bias", "wk_weight", "wk_bias", "wv_weight", "wv_bias", "wo_weight", + "wo_bias"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("preattn_requant_mul", 
IntTupleUnpack), + AttrDesc("preattn_requant_div", IntTupleUnpack), + AttrDesc("normalizer_requant_mul", IntTupleUnpack), + AttrDesc("normalizer_requant_shift", IntTupleUnpack), + AttrDesc("normalizer_requant_div", IntTupleUnpack), + AttrDesc("postattn_requant_mul", IntTupleUnpack), + AttrDesc("postattn_requant_shift", IntTupleUnpack), + AttrDesc("postattn_requant_div", IntTupleUnpack), + AttrDesc("wo_requant_mul", IntTupleUnpack), + AttrDesc("wo_requant_shift", IntTupleUnpack), + AttrDesc("wo_requant_div", IntTupleUnpack), + AttrDesc("wq_requant_mul", IntTupleUnpack), + AttrDesc("wq_requant_shift", IntTupleUnpack), + AttrDesc("wq_requant_div", IntTupleUnpack), + AttrDesc("wk_requant_mul", IntTupleUnpack), + AttrDesc("wk_requant_shift", IntTupleUnpack), + AttrDesc("wk_requant_div", IntTupleUnpack), + AttrDesc("wv_requant_mul", IntTupleUnpack), + AttrDesc("wv_requant_shift", IntTupleUnpack), + AttrDesc("wv_requant_div", IntTupleUnpack), + AttrDesc("Delta", IntUnpack), + AttrDesc("eps", IntUnpack), + AttrDesc("act_type", IntUnpack), + AttrDesc("n_levels", IntUnpack), + AttrDesc("dim", IntUnpack), + AttrDesc("dim_head", IntUnpack), + AttrDesc("heads", IntUnpack), + ], +) + +clcaDesc = OperatorDescriptor( + inputDescriptor = IoDesc([ + "q", "k", "wq_weight", "wq_bias", "wk_weight", "wk_bias", "wo_weight", "wo_bias", "wq_requant_mul", + "wq_requant_add", "wq_requant_div", "wk_requant_mul", "wk_requant_add", "wk_requant_div", "wv_requant_mul", + "wv_requant_add", "wv_requant_div", "kdiv_requant_mul", "kdiv_requant_add", "kdiv_requant_div", + "preattn_requant_mul", "preattn_requant_add", "preattn_requant_div", "postattn_requant_mul", + "postattn_requant_add", "postattn_requant_div", "wo_requant_mul", "wo_requant_add", "wo_requant_div" + ]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("Delta", IntUnpack), + AttrDesc("eps", IntUnpack), + AttrDesc("eta", IntUnpack), + AttrDesc("act_type", IntUnpack), + AttrDesc("n_levels", IntUnpack), + AttrDesc("dim", IntUnpack), + AttrDesc("dim_head", IntUnpack), + AttrDesc("out_dim", IntUnpack), + AttrDesc("heads", IntUnpack), + ], +) + +mhsaDesc = OperatorDescriptor( + inputDescriptor = IoDesc( + ["q", "k", "v", "wq_weight", "wq_bias", "wk_weight", "wk_bias", "wv_weight", "wv_bias", "wo_weight", + "wo_bias"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("preattn_requant_mul", IntTupleIfNotSingleItemUnpack), + AttrDesc("preattn_requant_div", IntTupleIfNotSingleItemUnpack), + AttrDesc("postattn_requant_mul", IntTupleIfNotSingleItemUnpack), + AttrDesc("postattn_requant_div", IntTupleIfNotSingleItemUnpack), + AttrDesc("wo_requant_mul", IntTupleIfNotSingleItemUnpack), + AttrDesc("wo_requant_div", IntTupleIfNotSingleItemUnpack), + AttrDesc("wq_requant_mul", IntTupleIfNotSingleItemUnpack), + AttrDesc("wq_requant_div", IntTupleIfNotSingleItemUnpack), + AttrDesc("wk_requant_mul", IntTupleIfNotSingleItemUnpack), + AttrDesc("wk_requant_div", IntTupleIfNotSingleItemUnpack), + AttrDesc("wv_requant_mul", IntTupleIfNotSingleItemUnpack), + AttrDesc("wv_requant_div", IntTupleIfNotSingleItemUnpack), + AttrDesc("n_levels", IntUnpack), + AttrDesc("dim", IntUnpack), + AttrDesc("dim_head", IntUnpack), + AttrDesc("heads", IntUnpack), + AttrDesc("signed", BoolUnpack), + ], +) + +reluDesc = OperatorDescriptor( + inputDescriptor = IoDesc("data_in"), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [], +) + +reshapeDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "shape"]), + outputDescriptor = 
IoDesc("data_out"), + attrDescriptors = [], +) + +requantShiftDesc = RequantizedOperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "mul", "add"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("n_levels", IntUnpack), + AttrDesc("signed", BoolUnpack), + AttrDesc("div", IntUnpack), + ], +) + + +class RequantizedAddDescriptor(OperatorDescriptor): + + def canonicalize(self, node: gs.Node, opset: int) -> bool: + for tensor in ["rqs1", "rqs2", "rqsOut"]: + n_levels = f"{tensor}_n_levels" + n_levels_out = f"{tensor}_n_levels_out" + if n_levels_out in node.attrs and n_levels in node.attrs: + log.warning( + f"RequantizedAdd tensor {tensor} cannot have {n_levels_out} and {n_levels} in its attributes") + return False + + if n_levels_out in node.attrs: + node.attrs[n_levels] = node.attrs[n_levels_out] + node.attrs.pop(n_levels_out) + + return super().canonicalize(node, opset) + + +requantizedAddDesc = RequantizedAddDescriptor( + inputDescriptor = IoDesc(["data_in_0", "data_in_1"]), + outputDescriptor = IoDesc("data_out"), + attrDescriptors = [ + AttrDesc("rqs1_mul", IntUnpack), + AttrDesc("rqs1_add", IntUnpack), + AttrDesc("rqs1_div", IntUnpack), + AttrDesc("rqs1_signed", BoolUnpack), + AttrDesc("rqs1_n_levels", IntUnpack), + AttrDesc("rqs2_mul", IntUnpack), + AttrDesc("rqs2_add", IntUnpack), + AttrDesc("rqs2_div", IntUnpack), + AttrDesc("rqs2_signed", BoolUnpack), + AttrDesc("rqs2_n_levels", IntUnpack), + AttrDesc("rqsOut_mul", IntUnpack), + AttrDesc("rqsOut_add", IntUnpack), + AttrDesc("rqsOut_div", IntUnpack), + AttrDesc("rqsOut_signed", BoolUnpack), + AttrDesc("rqsOut_n_levels", IntUnpack), + ], +) + +sgdDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["weight", "grad"]), + outputDescriptor = IoDesc("weight_updated"), + attrDescriptors = [AttrDesc("lr", FloatUnpack)], +) + +softmaxCrossEntropyLossDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["logits", "labels"]), + outputDescriptor = IoDesc("log_prob"), + attrDescriptors = [], +) + +softmaxCrossEntropyLossGradDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["log_prob", "labels"]), + outputDescriptor = IoDesc("grad"), + attrDescriptors = [], +) + +batchNormalizationDesc = OperatorDescriptor( + inputDescriptor = IoDesc(["data_in", "scale", "bias", "mean", "variance"]), + outputDescriptor = IoDesc(["data_out"], optional = ["running_mean", "running_var"]), + attrDescriptors = [ + AttrDesc("epsilon", FloatUnpack, default = 1e-5), + AttrDesc("momentum", FloatUnpack, default = 0.9), + AttrDesc("training_mode", BoolUnpack, default = False), + ], +) + +defaultOperatorDescriptors: Dict[str, OperatorDescriptor] = { + "Add": addDesc, + "BatchNormalization": batchNormalizationDesc, + "CLCA": clcaDesc, + "Concat": concatDesc, + "Conv": convDesc, + "ConvTranspose": convTransposeDesc, + "DebugPrint": debugPrintDesc, + "Dequant": dequantDesc, + "Div": divDesc, + "Flatten": flattenDesc, + "Gather": gatherDesc, + "Gelu": geluDesc, + "Gemm": gemmDesc, + "ITAMax": itaMaxDesc, + "ITAPartialMax": itaPartialMaxDesc, + "IntegerDiv": integerDivDescriptor, + "IntegerMean": reduceMeanDesc, + "LayerNormalization": layerNormalizationDesc, + "LinearAttention": linearAttentionDesc, + "MHSA": mhsaDesc, + "MatMul": matMulDesc, + "MatMulInteger": matMulDesc, + "MaxPool": maxPoolDesc, + "Mul": mulDesc, + "Pad": padDescOld, + "Quant": quantDesc, + "RQGemm": rqGemmDesc, + "RQIntegerDiv": requantizedIntegerDivDescriptor, + "RQMatMul": rqMatMulDesc, + "ReduceMean": reduceMeanDesc, + "ReduceSum": reduceSumDesc, + "Relu": reluDesc, 
+ "RequantizedAdd": requantizedAddDesc, + "RequantizedConv": requantizedConvDesc, + "RequantizedGemm": requantizedGemmDesc, + "RequantizediGELU": requantizedIGeluDesc, + "RequantizediHardswish": requantizedIHardswishDesc, + "RequantShift": requantShiftDesc, + "Reshape": reshapeDesc, + "SGD": sgdDesc, + "Slice": sliceDesc, + "Softmax": softmaxDesc, + "SoftmaxCrossEntropyLoss": softmaxCrossEntropyLossDesc, + "SoftmaxCrossEntropyLossGrad": softmaxCrossEntropyLossGradDesc, + "SoftmaxGrad": softmaxGradDesc, + "Squeeze": squeezeDesc, + "Transpose": transposeDesc, + "Unsqueeze": unsqueezeDesc, + "iGELU": iGeluDesc, + "iHardswish": iHardswishDesc, + "iLayerNorm": iLayerNormDesc, + "iNoNorm": iNoNormDesc, + "iRMSNorm": iRMSNormDesc, + "iSoftmax": iSoftmaxDesc, +} diff --git a/Deeploy/Targets/Chimera/Deployer.py b/Deeploy/Targets/Chimera/Deployer.py index ba28279b6..85b0496e3 100644 --- a/Deeploy/Targets/Chimera/Deployer.py +++ b/Deeploy/Targets/Chimera/Deployer.py @@ -8,7 +8,7 @@ from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer -from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentPlatform, OperatorDescriptor, TopologyOptimizer class ChimeraDeployer(SignPropDeployer): @@ -18,6 +18,7 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda x: x, name: str = 'DeeployNetwork', default_channels_first = False, @@ -27,6 +28,7 @@ def __init__(self, deploymentPlatform, inputTypes, loweringOptimizer, + operatorDescriptors, scheduler, name, default_channels_first = default_channels_first, diff --git a/Deeploy/Targets/CortexM/Deployer.py b/Deeploy/Targets/CortexM/Deployer.py index bef8fdcf3..9a4f27b06 100644 --- a/Deeploy/Targets/CortexM/Deployer.py +++ b/Deeploy/Targets/CortexM/Deployer.py @@ -11,7 +11,7 @@ from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ NCHWtoNHWCPass, TransposeMatmulInputsPass -from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentPlatform, OperatorDescriptor, TopologyOptimizer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass @@ -22,6 +22,7 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda x: x, name: str = 'DeeployNetwork', default_channels_first = False, @@ -32,6 +33,7 @@ def __init__(self, deploymentPlatform, inputTypes, loweringOptimizer, + operatorDescriptors, scheduler, name, default_channels_first = default_channels_first, diff --git a/Deeploy/Targets/CortexM/Platform.py b/Deeploy/Targets/CortexM/Platform.py index 25caeed60..abcddee64 100644 --- a/Deeploy/Targets/CortexM/Platform.py +++ b/Deeploy/Targets/CortexM/Platform.py @@ -14,17 +14,17 @@ LinearAttentionAlignmentPass, MatMulRequantMergePass, MHSAAlignmentPass from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicDebugPrintBindings, BasicDivBindings, \ BasicGatherBindings, BasicGELUBindings, BasicLayerNormBindings, BasicMatMulBindings, 
BasicMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, \ - BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, \ - BasicTransposeBindings, DummyBinding + BasicMulScalarBindings, BasicPad1DBindings, BasicPad2DBindings, BasicReduceMeanBindings, BasicReduceSumBindings, \ + BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, \ + BasicSoftmaxBindings, BasicTransposeBindings, DummyBinding from Deeploy.Targets.Generic.Layers import AddLayer, CLCALayer, DebugPrintLayer, DivLayer, GatherLayer, GELULayer, \ LayerNormLayer, LinearAttentionLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, ReduceMeanLayer, \ ReduceSumLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, SoftmaxLayer, \ TransposeLayer from Deeploy.Targets.Generic.Parsers import AddParser, DebugParser, DummyParser, FlattenParser, GatherParser, \ - GELUParser, IntegerDivParser, MatMulParser, MulParser, Pad1DParser, Pad2DParser, ReduceMeanParser, \ - ReduceSumParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, \ - TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + GELUParser, IntegerDivParser, MatMulParser, MulParser, MulScalarParser, Pad1DParser, Pad2DParser, \ + ReduceMeanParser, ReduceSumParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, \ + SliceParser, TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, iGELURequantMergePass @@ -46,6 +46,7 @@ MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) MaxPool2DMapper = NodeMapper(CMSISMaxPool2DParser(), [CMSISMaxPool2DBinding]) MulMapper = NodeMapper(MulParser(), BasicMulBindings) +MulScalarMapper = NodeMapper(MulScalarParser(), BasicMulScalarBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings) @@ -78,7 +79,7 @@ 'LinearAttention': LinearAttentionLayer([LinearAttention_int16_Mapper]), 'MatMul': MatMulLayer([MatMulMapper]), 'MaxPool': MaxPoolLayer([MaxPool2DMapper]), - 'Mul': MulLayer([MulMapper]), + 'Mul': MulLayer([MulMapper, MulScalarMapper]), 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), diff --git a/Deeploy/Targets/CortexM/Templates/ConvTemplate.py b/Deeploy/Targets/CortexM/Templates/ConvTemplate.py index d5e05c834..6d9984a11 100644 --- a/Deeploy/Targets/CortexM/Templates/ConvTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/ConvTemplate.py @@ -6,12 +6,13 @@ from ortools.constraint_solver.pywrapcp import IntVar -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import RequantizedConvTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.Targets.CortexM.DataTypes import cmsis_nn_context, cmsis_nn_conv_params, cmsis_nn_dims, \ cmsis_nn_per_channel_quant_params -class _Conv2D_8_Template(NodeTemplate): +class _Conv2D_8_Template(RequantizedConvTemplate): def __init__(self, templateStr): 
super().__init__(templateStr) @@ -128,7 +129,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ") -class _Conv1D_16_Template(NodeTemplate): +class _Conv1D_16_Template(RequantizedConvTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 6bfe805b3..b6855f352 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -8,13 +8,13 @@ from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration, MemoryPassthroughGeneration from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ - int8_t, int32_t, uint8_t + int8_t, int32_t, int64_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, ConvTemplate, \ ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \ FloatConvTemplate, FloatDivTemplate, FloatDWConvTemplate, FloatGELUTemplate, FloatGemmTemplate, \ - FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \ + FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulScalarTemplate, FloatPadTemplate, \ FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GatherTemplate, GemmTemplate, \ IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, MaxPoolTemplate, MulTemplate, \ PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, RequantShiftTemplate, ReshapeTemplate, \ @@ -171,9 +171,11 @@ NodeBinding(MulChecker([PointerClass(typeA), PointerClass(typeB)], [PointerClass(int32_t)]), MulTemplate.referenceTemplate, BasicTransformer) for typeA, typeB in itertools.product(SignedIntegerDataTypes, SignedIntegerDataTypes) -] + [ +] + +BasicMulScalarBindings = [ NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), - FloatMulTemplate.referenceTemplate, BasicTransformer) + FloatMulScalarTemplate.referenceTemplate, BasicTransformer) ] BasicPad1DBindings = [ @@ -195,13 +197,11 @@ ] BasicReduceMeanBindings = [ - NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, - BasicTransformer) for type in SignedIntegerDataTypes + NodeBinding(ReduceMeanChecker([PointerClass(ty), PointerClass(int64_t)], [PointerClass(ty)]), + ReduceMeanTemplate.referenceTemplate, BasicTransformer) for ty in SignedIntegerDataTypes ] + [ - NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), - FloatReduceMeanTemplate.referenceTemplate, BasicTransformer) - for integer_type in SignedIntegerDataTypes - for float_type in FloatDataTypes + NodeBinding(ReduceMeanChecker([PointerClass(ty), PointerClass(int64_t)], [PointerClass(ty)]), + FloatReduceMeanTemplate.referenceTemplate, BasicTransformer) for ty in FloatDataTypes ] BasicReduceSumBindings = [ diff --git a/Deeploy/Targets/Generic/Deployer.py b/Deeploy/Targets/Generic/Deployer.py index 3cef57a2e..9bf89a8a0 100644 --- a/Deeploy/Targets/Generic/Deployer.py +++ b/Deeploy/Targets/Generic/Deployer.py @@ -11,7 +11,7 @@ from 
Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ NCHWtoNHWCPass, TransposeMatmulInputsPass -from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentPlatform, OperatorDescriptor, TopologyOptimizer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass @@ -22,6 +22,7 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda x: x, name: str = 'DeeployNetwork', default_channels_first = False, @@ -32,6 +33,7 @@ def __init__(self, deploymentPlatform, inputTypes, loweringOptimizer, + operatorDescriptors, scheduler, name, default_channels_first = default_channels_first, diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index c924895c1..1beb876a8 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -3,11 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 import copy +import math from typing import List, Tuple import numpy as np -from Deeploy.DeeployTypes import NodeMapper, ONNXLayer, OperatorRepresentation, Shape +from Deeploy.DeeployTypes import NodeMapper, ONNXLayer, Shape class ConcatLayer(ONNXLayer): @@ -64,23 +65,6 @@ def __init__(self, maps: List[NodeMapper]): super().__init__(maps) -class iNoNormLayer(ONNXLayer): - - def __init__(self, maps: List[NodeMapper]): - super().__init__(maps) - - def computeOps(self): - return self.mapper.parser.operatorRepresentation['size'] * 4 # 2 mul, 1 add, 1 right shift - - def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation: OperatorRepresentation, - channels_first: bool) -> Tuple[Shape]: - - # JUNGVI: Broadcast the weights and bias to have as many dimensions as the inputs - inputShapes[1] = [1] * (len(inputShapes[0]) - len(inputShapes[1])) + list(inputShapes[1]) - inputShapes[2] = inputShapes[1] - return (inputShapes, outputShapes) - - class RQSiGELULayer(GELULayer): def __init__(self, maps: List[NodeMapper]): @@ -312,15 +296,12 @@ def __init__(self, maps: List[NodeMapper]): def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, channels_first) -> Tuple[Shape, Shape]: - if inputShapes[1] == () or inputShapes[1] == []: inputShapes[1] = (1,) - - if len(inputShapes[0]) > len(inputShapes[1]): - inputShapes[1] = inputShapes[0] + if math.prod(inputShapes[1]) == 1: + return inputShapes, outputShapes else: - inputShapes[0] = inputShapes[1] - return (inputShapes, outputShapes) + return [np.broadcast_shapes(*inputShapes)] * len(inputShapes), outputShapes def computeOps(self): return self.mapper.parser.operatorRepresentation['size'] diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index 7752834c5..e0200b133 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -52,7 +52,7 @@ def parseNode(self, node: gs.Node) -> (bool): if ret: - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels']) + self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D'])) return ret @@ -227,20 +227,25 @@ def __init__(self): super().__init__() def 
parseNode(self, node: gs.Node) -> bool: - ret = super().parseNode(node) - wellFormed = False - if ret: - pads = self.operatorRepresentation['pads'] - kernel_shape = self.operatorRepresentation['kernel_shape'] - strides = self.operatorRepresentation['strides'] - # 1D: pads should be length 2, kernel_shape length 1, strides length 1 - if len(pads) == 2 and len(kernel_shape) == 1 and len(strides) == 1: - wellFormed = True - self.operatorRepresentation['padding_y'] = int(pads[0]) - self.operatorRepresentation['padding_y_right'] = int(pads[1]) - self.operatorRepresentation['stride_y'] = int(strides[0]) - self.operatorRepresentation['dim_kernel_y'] = int(kernel_shape[0]) - return wellFormed + if not super().parseNode(node): + return False + + pads = self.operatorRepresentation['pads'] + kernel_shape = self.operatorRepresentation['kernel_shape'] + strides = self.operatorRepresentation['strides'] + + if not all([ + len(pads) == 2, + len(kernel_shape) == 1, + len(strides) == 1, + ]): + return False + + self.operatorRepresentation['padding_y'] = pads[0] + self.operatorRepresentation['padding_y_right'] = pads[1] + self.operatorRepresentation['stride_y'] = strides[0] + self.operatorRepresentation['dim_kernel_y'] = kernel_shape[0] + return True def parseNodeCtxt(self, ctxt, node, channels_first = True): newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) @@ -269,28 +274,31 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: + if not super().parseNode(node): + return False - ret = super().parseNode(node) - wellFormed = False - if ret: - pads = self.operatorRepresentation['pads'] - kernel_shape = self.operatorRepresentation['kernel_shape'] - strides = self.operatorRepresentation['strides'] - if len(pads) == 4 and len(kernel_shape) == 2 and len(strides) == 2: - wellFormed = True + pads = self.operatorRepresentation['pads'] + kernel_shape = self.operatorRepresentation['kernel_shape'] + strides = self.operatorRepresentation['strides'] - self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0]) - self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1]) - self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][0]) - self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][1]) - self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][2]) - self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][3]) - self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) - self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) + if not all([ + len(pads) == 4, + len(kernel_shape) == 2, + len(strides) == 2, + ]): + return False - return wellFormed + self.operatorRepresentation['padding_x'] = pads[0] + self.operatorRepresentation['padding_y'] = pads[1] + self.operatorRepresentation['padding_x_left'] = pads[0] + self.operatorRepresentation['padding_y_top'] = pads[1] + self.operatorRepresentation['padding_x_right'] = pads[2] + self.operatorRepresentation['padding_y_bottom'] = pads[3] + self.operatorRepresentation['stride_x'] = strides[0] + self.operatorRepresentation['stride_y'] = strides[1] + self.operatorRepresentation['dim_kernel_x'] = 
kernel_shape[0] + self.operatorRepresentation['dim_kernel_y'] = kernel_shape[1] + return True def parseNodeCtxt(self, ctxt: NetworkContext, @@ -669,11 +677,11 @@ def parseNode(self, node: gs.Node) -> bool: ]) if wellFormed: - self.operatorRepresentation['coeffA'] = int(node.attrs['coeffA'].values) - self.operatorRepresentation['coeffB'] = int(node.attrs['coeffB'].values) - self.operatorRepresentation['coeffC'] = int(node.attrs['coeffC'].values) - self.operatorRepresentation['log2'] = int(node.attrs['log2'].values) - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) + self.operatorRepresentation['coeffA'] = node.attrs['coeffA'] + self.operatorRepresentation['coeffB'] = node.attrs['coeffB'] + self.operatorRepresentation['coeffC'] = node.attrs['coeffC'] + self.operatorRepresentation['log2'] = node.attrs['log2'] + self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] return wellFormed @@ -698,7 +706,7 @@ def parseNode(self, node: gs.Node) -> bool: ret = all(['n_levels' in node.attrs]) if ret and wellFormed: - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) + self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] return True return False @@ -725,8 +733,8 @@ def parseNode(self, node: gs.Node) -> bool: ret = all(['group_width' in node.attrs, 'n_levels' in node.attrs]) if ret and wellFormed: - self.operatorRepresentation['group_width'] = int(node.attrs['group_width']) - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) + self.operatorRepresentation['group_width'] = node.attrs['group_width'] + self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] return True return False @@ -848,8 +856,8 @@ def parseNode(self, node: gs.Node) -> bool: if ret: self.operatorRepresentation['D'] = node.attrs['D'] - self.operatorRepresentation['log2D'] = int(np.log2(node.attrs['D'].values).tolist()[0]) - self.operatorRepresentation['mul'] = int(node.attrs['mul'].values.tolist()[0]) + self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D'])) + self.operatorRepresentation['mul'] = node.attrs['mul'] self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] return ret @@ -986,48 +994,23 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> (bool): + if not all(['axes' in node.attrs, len(node.inputs) == 1, len(node.outputs) == 1]): + return False - # ONNX v11: 'axes' is a node attribute - if 'axes' in node.attrs: - ret = all(['axes' in node.attrs, len(node.inputs) == 1, len(node.outputs) == 1]) - # ONNX v13+: 'axes' becomes an input with the data - # Source: https://onnx.ai/onnx/operators/onnx__Unsqueeze.html - else: - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - - if ret and 'axes' in node.attrs: - axes_attr = node.attrs['axes'] - self.operatorRepresentation['axes'] = [int(axes_attr)] if isinstance(axes_attr, int) \ - else [int(a) for a in axes_attr] - # For opset 13+, axes will be extracted from the second input in parseNodeCtxt - - return ret + self.operatorRepresentation['axes'] = node.attrs['axes'] + return True def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: + inputs = ['data_in'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name outputs = ['data_out'] - if len(node.inputs) == 1: - inputs = ['data_in'] - for idx, inputNode in enumerate(node.inputs): - self.operatorRepresentation[inputs[idx]] = 
ctxt.lookup(inputNode.name).name - for idx, outputNode in enumerate(node.outputs): - self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name - else: - data_in = ctxt.lookup(node.inputs[0].name) - data_out = ctxt.lookup(node.outputs[0].name) - self.operatorRepresentation['data_in'] = data_in.name - self.operatorRepresentation['data_out'] = data_out.name - # axes must be a constant; extract values - axes_buf = ctxt.lookup(node.inputs[1].name) - assert hasattr(axes_buf, 'values'), "Unsqueeze: expected constant 'axes' input for opset 13+" - axes_vals = np.array(axes_buf.values).astype(int).flatten().tolist() - self.operatorRepresentation['axes'] = axes_vals - # Do not deploy the axes tensor - axes_buf._live = False - axes_buf._deploy = False + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name return ctxt, True @@ -1131,35 +1114,28 @@ def parseNode(self, node: gs.Node) -> (bool): class MulParser(NodeParser): - def __init__(self): - super().__init__() - - def parseNode(self, node: gs.Node) -> (bool): - - wellFormed = all([ - len(node.inputs) == 2, - len(node.outputs) == 1, - ]) - - return wellFormed + def parseNode(self, node: gs.Node) -> bool: + return len(node.inputs) == 2 and len(node.outputs) == 1 def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: + inBuffers = [ctxt.lookup(t.name) for t in node.inputs] + outBuffers = [ctxt.lookup(t.name) for t in node.outputs] - inputs = ['A', 'B'] - outputs = ['C'] + self.operatorRepresentation.update(dict(zip(['A', 'B'], [b.name for b in inBuffers]))) + self.operatorRepresentation.update(dict(zip(['C'], [b.name for b in outBuffers]))) - for idx, inputNode in enumerate(node.inputs): - self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name - for idx, outputNode in enumerate(node.outputs): - self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name + self.operatorRepresentation['size'] = math.prod(inBuffers[0].shape) + self.operatorRepresentation['sizeB'] = math.prod(inBuffers[1].shape) + return ctxt, True - self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) - self.operatorRepresentation['sizeB'] = np.prod(ctxt.lookup(node.inputs[1].name).shape) - return ctxt, True +class MulScalarParser(MulParser): + + def parseNode(self, node: gs.Node) -> bool: + return super().parseNode(node) and math.prod(node.inputs[1].shape) == 1 class ConvParser(NodeParser): @@ -1408,23 +1384,7 @@ def parseNode(self, node: gs.Node) -> (bool): ]) if ret: - self.operatorRepresentation['preattn_requant_mul'] = node.attrs['preattn_requant_mul'] - self.operatorRepresentation['preattn_requant_div'] = node.attrs['preattn_requant_div'] - self.operatorRepresentation['postattn_requant_mul'] = node.attrs['postattn_requant_mul'] - self.operatorRepresentation['postattn_requant_div'] = node.attrs['postattn_requant_div'] - self.operatorRepresentation['wo_requant_mul'] = node.attrs['wo_requant_mul'] - self.operatorRepresentation['wo_requant_div'] = node.attrs['wo_requant_div'] - self.operatorRepresentation['wq_requant_mul'] = node.attrs['wq_requant_mul'] - self.operatorRepresentation['wq_requant_div'] = node.attrs['wq_requant_div'] - self.operatorRepresentation['wk_requant_mul'] = node.attrs['wk_requant_mul'] - self.operatorRepresentation['wk_requant_div'] = node.attrs['wk_requant_div'] - self.operatorRepresentation['wv_requant_mul'] = 
node.attrs['wv_requant_mul'] - self.operatorRepresentation['wv_requant_div'] = node.attrs['wv_requant_div'] - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels']) - self.operatorRepresentation['dim'] = int(node.attrs['dim']) # Sequence Length - self.operatorRepresentation['dim_head'] = int(node.attrs['dim_head']) # Projection Size - self.operatorRepresentation['heads'] = int(node.attrs['heads']) - self.operatorRepresentation['signed'] = int(node.attrs['signed']) + self.operatorRepresentation.update(node.attrs) return ret @@ -1472,37 +1432,24 @@ def parseNode(self, node: gs.Node) -> (bool): ]) if ret: - self.operatorRepresentation['preattn_requant_mul'] = int(node.attrs['preattn_requant_mul'].values) - self.operatorRepresentation['preattn_requant_shift'] = int(node.attrs['preattn_requant_shift'].values) - self.operatorRepresentation['preattn_requant_div'] = int( - math.log2(int(node.attrs['preattn_requant_div'].values))) - self.operatorRepresentation['normalizer_requant_mul'] = int(node.attrs['normalizer_requant_mul'].values) - self.operatorRepresentation['normalizer_requant_shift'] = int(node.attrs['normalizer_requant_shift'].values) - self.operatorRepresentation['normalizer_requant_div'] = int( - math.log2(int(node.attrs['normalizer_requant_div'].values))) - self.operatorRepresentation['postattn_requant_mul'] = int(node.attrs['postattn_requant_mul'].values) - self.operatorRepresentation['postattn_requant_shift'] = int(node.attrs['postattn_requant_shift'].values) - self.operatorRepresentation['postattn_requant_div'] = int( - math.log2(int(node.attrs['postattn_requant_div'].values))) - self.operatorRepresentation['wo_requant_mul'] = int(node.attrs['wo_requant_mul'].values) - self.operatorRepresentation['wo_requant_shift'] = int(node.attrs['wo_requant_shift'].values) - self.operatorRepresentation['wo_requant_div'] = int(math.log2(int(node.attrs['wo_requant_div'].values))) - self.operatorRepresentation['wq_requant_mul'] = int(node.attrs['wq_requant_mul'].values) - self.operatorRepresentation['wq_requant_shift'] = int(node.attrs['wq_requant_shift'].values) - self.operatorRepresentation['wq_requant_div'] = int(math.log2(int(node.attrs['wq_requant_div'].values))) - self.operatorRepresentation['wk_requant_mul'] = int(node.attrs['wk_requant_mul'].values) - self.operatorRepresentation['wk_requant_shift'] = int(node.attrs['wk_requant_shift'].values) - self.operatorRepresentation['wk_requant_div'] = int(math.log2(int(node.attrs['wk_requant_div'].values))) - self.operatorRepresentation['wv_requant_mul'] = int(node.attrs['wv_requant_mul'].values) - self.operatorRepresentation['wv_requant_shift'] = int(node.attrs['wv_requant_shift'].values) - self.operatorRepresentation['wv_requant_div'] = int(math.log2(int(node.attrs['wv_requant_div'].values))) - self.operatorRepresentation['Delta'] = int(node.attrs['Delta']) - self.operatorRepresentation['eps'] = int(node.attrs['eps']) - self.operatorRepresentation['act_type'] = int(node.attrs['act_type']) - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) - self.operatorRepresentation['dim'] = int(node.attrs['dim'].values) - self.operatorRepresentation['dim_head'] = int(node.attrs['dim_head'].values) - self.operatorRepresentation['heads'] = int(node.attrs['heads'].values) + self.operatorRepresentation.update(node.attrs) + + # All *_div attrs are log2d-ified + log2Attrs = [ + "preattn_requant_div", + "normalizer_requant_div", + "postattn_requant_div", + "wo_requant_div", + "wq_requant_div", + "wk_requant_div", + 
"wv_requant_div", + ] + + for attr in log2Attrs: + value = self.operatorRepresentation[attr] + assert isinstance( + value, int) and value > 0, f"Attribute {attr} must be a positive integer. Received value {value}" + self.operatorRepresentation[attr] = int(math.log2(value)) return ret @@ -1544,15 +1491,7 @@ def parseNode(self, node: gs.Node) -> (bool): ]) if ret: - self.operatorRepresentation['Delta'] = int(node.attrs['Delta']) - self.operatorRepresentation['eps'] = int(node.attrs['eps']) - self.operatorRepresentation['eta'] = int(node.attrs['eta']) - self.operatorRepresentation['act_type'] = int(node.attrs['act_type']) - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) - self.operatorRepresentation['dim'] = int(node.attrs['dim'].values) - self.operatorRepresentation['dim_head'] = int(node.attrs['dim_head'].values) - self.operatorRepresentation['out_dim'] = int(node.attrs['out_dim'].values) - self.operatorRepresentation['heads'] = int(node.attrs['heads'].values) + self.operatorRepresentation.update(node.attrs) return ret @@ -1690,27 +1629,40 @@ def parseNodeCtxt(self, node.inputs.append(zeroTensor) self.operatorRepresentation['C'] = f'{node.name}_C_Tensor' + buffA = ctxt.lookup(node.inputs[0].name) + assert isinstance(buffA, VariableBuffer) + buffB = ctxt.lookup(node.inputs[1].name) + assert isinstance(buffB, VariableBuffer) + buffOut = ctxt.lookup(node.outputs[0].name) + assert isinstance(buffOut, VariableBuffer) + # Store the input and output shapes in the operator representation - self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) - self.operatorRepresentation['A_shape'] = ctxt.lookup(node.inputs[0].name).shape - self.operatorRepresentation['B_shape'] = ctxt.lookup(node.inputs[1].name).shape - self.operatorRepresentation['data_out_shape'] = ctxt.lookup(node.outputs[0].name).shape + self.operatorRepresentation['size'] = np.prod(buffA.shape) + self.operatorRepresentation['A_shape'] = buffA.shape + self.operatorRepresentation['B_shape'] = buffB.shape + self.operatorRepresentation['data_out_shape'] = buffOut.shape + + if self.operatorRepresentation['transA']: + N_A, M = buffA.shape[-2:] + else: + M, N_A = buffA.shape[-2:] + + if self.operatorRepresentation['transB']: + O, N_B = buffB.shape[-2:] + else: + N_B, O = buffB.shape[-2:] # Store the matrix dimensions in the operator representation - self.operatorRepresentation['M'] = ctxt.lookup( - node.inputs[0].name).shape[(-2 + self.operatorRepresentation['transA'])] - self.operatorRepresentation['N'] = ctxt.lookup( - node.inputs[0].name).shape[(-1 - self.operatorRepresentation['transA'])] - self.operatorRepresentation['O'] = ctxt.lookup( - node.inputs[1].name).shape[(-1 - self.operatorRepresentation['transB'])] + self.operatorRepresentation['M'] = M + self.operatorRepresentation['N'] = N_A + self.operatorRepresentation['O'] = O # SCHEREMO: Assert that reduction dimension is the same on both matrices - ret = ret and (self.operatorRepresentation['N'] == ctxt.lookup( - node.inputs[1].name).shape[-2 + self.operatorRepresentation['transB']]) + ret = ret and N_A == N_B # Check if the batch dimensions are compatible - self.operatorRepresentation['batch_A'] = np.prod(ctxt.lookup(node.inputs[0].name).shape[:-2]) - self.operatorRepresentation['batch_B'] = np.prod(ctxt.lookup(node.inputs[1].name).shape[:-2]) + self.operatorRepresentation['batch_A'] = np.prod(buffA.shape[:-2]) + self.operatorRepresentation['batch_B'] = np.prod(buffB.shape[:-2]) self.operatorRepresentation['batch'] = 
max(self.operatorRepresentation['batch_A'], self.operatorRepresentation['batch_B']) @@ -1722,10 +1674,10 @@ def parseNodeCtxt(self, ), "Incompatible dimensions for input matrices. Broadcasting not yet supported for dimensions larger than 1 on one of the inputs, or equal dimensions between the 2." # Create flags for same dimension between each input matrix and the final batch dimension - self.operatorRepresentation['A_batched'] = (self.operatorRepresentation['batch'] == np.prod( - ctxt.lookup(node.inputs[0].name).shape[:-2])) + self.operatorRepresentation['A_batched'] = ( + self.operatorRepresentation['batch'] == self.operatorRepresentation['batch_A']) self.operatorRepresentation['W_batched'] = self.operatorRepresentation['B_batched'] = ( - self.operatorRepresentation['batch'] == np.prod(ctxt.lookup(node.inputs[1].name).shape[:-2])) + self.operatorRepresentation['batch'] == self.operatorRepresentation['batch_B']) return ctxt, ret @@ -2395,32 +2347,12 @@ def parseNode(self, node: gs.Node) -> bool: ]) if ret: - if 'rqs1_n_levels' in node.attrs: - self.operatorRepresentation['rqs1_n_levels'] = int(node.attrs['rqs1_n_levels'].values) - else: - self.operatorRepresentation['rqs1_n_levels'] = int(node.attrs['rqs1_n_levels_out'].values) - self.operatorRepresentation['rqs1_mul'] = int(node.attrs['rqs1_mul']) - self.operatorRepresentation['rqs1_add'] = int(node.attrs['rqs1_add']) - self.operatorRepresentation['rqs1_signed'] = int(node.attrs['rqs1_signed'].values) - self.operatorRepresentation['rqs1_log2D'] = int(math.log2(node.attrs['rqs1_div'].values)) - - if 'rqs2_n_levels' in node.attrs: - self.operatorRepresentation['rqs2_n_levels'] = int(node.attrs['rqs2_n_levels'].values) - else: - self.operatorRepresentation['rqs2_n_levels'] = int(node.attrs['rqs2_n_levels_out'].values) - self.operatorRepresentation['rqs2_mul'] = int(node.attrs['rqs2_mul']) - self.operatorRepresentation['rqs2_add'] = int(node.attrs['rqs2_add']) - self.operatorRepresentation['rqs2_signed'] = int(node.attrs['rqs2_signed'].values) - self.operatorRepresentation['rqs2_log2D'] = int(math.log2(node.attrs['rqs2_div'].values)) - - if 'rqsOut_n_levels' in node.attrs: - self.operatorRepresentation['rqsOut_n_levels'] = int(node.attrs['rqsOut_n_levels'].values) - else: - self.operatorRepresentation['rqsOut_n_levels'] = int(node.attrs['rqsOut_n_levels_out'].values) - self.operatorRepresentation['rqsOut_mul'] = int(node.attrs['rqsOut_mul']) - self.operatorRepresentation['rqsOut_add'] = int(node.attrs['rqsOut_add']) - self.operatorRepresentation['rqsOut_signed'] = int(node.attrs['rqsOut_signed'].values) - self.operatorRepresentation['rqsOut_log2D'] = int(math.log2(node.attrs['rqsOut_div'].values)) + self.operatorRepresentation.update(node.attrs) + + for tensor in ["rqs1", "rqs2", "rqsOut"]: + value = self.operatorRepresentation[f"{tensor}_div"] + assert isinstance(value, int) + self.operatorRepresentation[f"{tensor}_log2D"] = int(math.log2(value)) return ret @@ -2488,12 +2420,10 @@ def parseNode(self, node: gs.Node) -> bool: ]) if ret: - self.operatorRepresentation['scale'] = float(node.attrs['scale']) - self.operatorRepresentation['zero_point'] = float(node.attrs['zero_point']) - self.operatorRepresentation['bit_width'] = int(node.attrs['bit_width']) - - self.operatorRepresentation['signed'] = bool(node.attrs['signed']) - + self.operatorRepresentation['scale'] = node.attrs['scale'] + self.operatorRepresentation['zero_point'] = node.attrs['zero_point'] + self.operatorRepresentation['bit_width'] = node.attrs['bit_width'] + 
self.operatorRepresentation['signed'] = node.attrs['signed'] return ret def parseNodeCtxt(self, diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py index a15b3db2e..69bf41a83 100644 --- a/Deeploy/Targets/Generic/Platform.py +++ b/Deeploy/Targets/Generic/Platform.py @@ -11,9 +11,9 @@ BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \ BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicQuantBindings, BasicReduceMeanBindings, BasicReduceSumBindings, \ - BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, \ - BasicSliceBindings, BasicSoftmaxBindings, BasicTransposeBindings, DummyBinding + BasicMulScalarBindings, BasicPad1DBindings, BasicPad2DBindings, BasicQuantBindings, BasicReduceMeanBindings, \ + BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \ + BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicTransposeBindings, DummyBinding from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \ ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \ LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \ @@ -23,9 +23,9 @@ DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \ - Pad1DParser, Pad2DParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, \ - ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, TransposeParser, UnsqueezeParser, \ - iLayerNormParser, iSoftmaxParser + MulScalarParser, Pad1DParser, Pad2DParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ + RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, \ + TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -52,6 +52,7 @@ MaxPoolMapper = NodeMapper(GenericMaxPool2DParser(), BasicMaxPool2DBindings) MaxPool1DMapper = NodeMapper(MaxPool1DParser(), BasicMaxPool1DBindings) MulMapper = NodeMapper(MulParser(), BasicMulBindings) +MulScalarMapper = NodeMapper(MulScalarParser(), BasicMulScalarBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings) @@ -97,7 +98,7 @@ 'MatMul': GEMMLayer([MatMulMapper]), 'MatMulInteger': MatMulLayer([MatMulMapper]), 'MaxPool': MaxPoolLayer([MaxPool1DMapper, MaxPoolMapper]), - 'Mul': MulLayer([MulMapper]), + 'Mul': MulLayer([MulScalarMapper, MulMapper]), 'Pad': PadLayer([Pad1DMapper, 
Pad2DMapper]), 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), diff --git a/Deeploy/Targets/Generic/Templates/AddTemplate.py b/Deeploy/Targets/Generic/Templates/AddTemplate.py index 75c16ac42..2376e7b6b 100644 --- a/Deeploy/Targets/Generic/Templates/AddTemplate.py +++ b/Deeploy/Targets/Generic/Templates/AddTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import ElementwiseTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _AddTemplate(NodeTemplate): +class _AddTemplate(ElementwiseTemplate): def alignToContext(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: diff --git a/Deeploy/Targets/Generic/Templates/ConvTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTemplate.py index 51f292dca..1966e4889 100644 --- a/Deeploy/Targets/Generic/Templates/ConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConvTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import ConvTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _Conv2D_Template(NodeTemplate): +class _Conv2D_Template(ConvTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py index 69bea8484..b5537ff83 100644 --- a/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py @@ -2,42 +2,42 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from Deeploy.CommonExtensions.NodeTemplate import GemmTemplate -referenceTemplate = NodeTemplate(""" +referenceTemplate = GemmTemplate(""" // GEMM (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${A_type.typeName} ref_${data_out}_${A} = ${A}; - ${B_type.typeName} ref_${data_out}_${B} = ${B}; - ${C_type.typeName} ref_${data_out}_${C} = ${C}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${A_type.typeName} ref_${nodeName}_${A} = ${A}; + ${B_type.typeName} ref_${nodeName}_${B} = ${B}; + ${C_type.typeName} ref_${nodeName}_${C} = ${C}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0; i<${batch}; i++){ Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, ${M}, ${N}, ${O}, - ${transA}, - ${transB} + ${int(transA)}, + ${int(transB)} ); % if A_batched: - ref_${data_out}_${A} += ${M} * ${N}; + ref_${nodeName}_${A} += ${M} * ${N}; % endif % if B_batched: - ref_${data_out}_${B} += ${N} * ${O}; + ref_${nodeName}_${B} += ${N} * ${O}; % endif % if C_batched: - ref_${data_out}_${C} += ${M} * ${O}; + ref_${nodeName}_${C} += ${M} * ${O}; % endif - ref_${data_out}_${data_out} += ${M} * ${O}; + ref_${nodeName}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE """) \ No newline at end of file diff --git 
a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMulScalarTemplate.py similarity index 68% rename from Deeploy/Targets/Generic/Templates/FloatMulTemplate.py rename to Deeploy/Targets/Generic/Templates/FloatMulScalarTemplate.py index 3c8c2da50..03aea61c4 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMulScalarTemplate.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from Deeploy.CommonExtensions.NodeTemplate import ElementwiseScalarTemplate -referenceTemplate = NodeTemplate(""" +referenceTemplate = ElementwiseScalarTemplate(""" // Float Mul (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE for (uint32_t i=0;i<${size};i++){ diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py index 005b0b889..7dbcaed26 100644 --- a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py @@ -18,10 +18,10 @@ def alignToContext(self, ctxt: NetworkContext, data_in = ctxt.lookup(operatorRepresentation['data_in']) data_out = ctxt.lookup(operatorRepresentation['data_out']) operatorRepresentation['input_offset'] = 0 - if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): + if data_in._signed is not None and data_in.nLevels is not None: operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) operatorRepresentation['output_offset'] = 0 - if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + if data_out._signed is not None and data_out.nLevels is not None: operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_in.nLevels / 2) return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/Generic/Templates/GemmTemplate.py b/Deeploy/Targets/Generic/Templates/GemmTemplate.py index 62d760d15..eae375e55 100644 --- a/Deeploy/Targets/Generic/Templates/GemmTemplate.py +++ b/Deeploy/Targets/Generic/Templates/GemmTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import GemmTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _GemmTemplate(NodeTemplate): +class _GemmTemplate(GemmTemplate): def __init__(self, templateStr): super().__init__(templateStr) @@ -40,34 +41,43 @@ def alignToContext(self, ctxt: NetworkContext, referenceTemplate = _GemmTemplate(""" // GEMM (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - ${A_type.typeName} ref_${data_out}_${A} = ${A}; - ${B_type.typeName} ref_${data_out}_${B} = ${B}; - ${C_type.typeName} ref_${data_out}_${C} = ${C}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + ${A_type.typeName} ref_${nodeName}_${A} = ${A}; + ${B_type.typeName} ref_${nodeName}_${B} = ${B}; + ${C_type.typeName} ref_${nodeName}_${C} = ${C}; + ${data_out_type.typeName} ref_${nodeName}_${data_out} = ${data_out}; for(uint32_t i=0;i<${batch};i++){ Gemm_s${A_type.referencedType.typeWidth}_s${B_type.referencedType.typeWidth}_s${C_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}( - ref_${data_out}_${A}, - ref_${data_out}_${B}, - ref_${data_out}_${C}, - ref_${data_out}_${data_out}, + ref_${nodeName}_${A}, + ref_${nodeName}_${B}, + ref_${nodeName}_${C}, + ref_${nodeName}_${data_out}, 
${M}, ${N}, ${O}, ${alpha}, ${beta}, - ${transA}, - ${transB}, + ${int(transA)}, + ${int(transB)}, ${A_offset}, ${B_offset}, ${C_offset}, ${Y_offset} ); - ref_${data_out}_${A} += ${M} * ${N}; - ref_${data_out}_${B} += ${N} * ${O}; - ref_${data_out}_${C} += ${M} * ${O}; - ref_${data_out}_${data_out} += ${M} * ${O}; + % if A_batched: + ref_${nodeName}_${A} += ${M} * ${N}; + % endif + + % if B_batched: + ref_${nodeName}_${B} += ${N} * ${O}; + % endif + + % if C_batched: + ref_${nodeName}_${C} += ${M} * ${O}; + % endif + + ref_${nodeName}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/MulTemplate.py b/Deeploy/Targets/Generic/Templates/MulTemplate.py index 5709eef4b..0db4c6ce6 100644 --- a/Deeploy/Targets/Generic/Templates/MulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/MulTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import ElementwiseTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _MulTemplate(NodeTemplate): +class _MulTemplate(ElementwiseTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Templates/RQAddTemplate.py b/Deeploy/Targets/Generic/Templates/RQAddTemplate.py index 35593ad13..bf4e9d0a0 100644 --- a/Deeploy/Targets/Generic/Templates/RQAddTemplate.py +++ b/Deeploy/Targets/Generic/Templates/RQAddTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import ElementwiseTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class RQAddTemplate(NodeTemplate): +class RQAddTemplate(ElementwiseTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py b/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py index 2fca2e0eb..5518c6300 100644 --- a/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py +++ b/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import RequantShiftTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _RequantShiftTemplate(NodeTemplate): +class _RequantShiftTemplate(RequantShiftTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py index e87f9abb6..487c18e11 100644 --- a/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py @@ -6,4 +6,6 @@ class AddTileConstraint(BOPTileConstraint): - pass + dataIn1Name = "data_in_1" + dataIn2Name = "data_in_2" + dataOutName = "data_out" diff --git a/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py index e1f6f0e71..a2d7da1a9 100644 --- a/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import math from typing import 
Dict, List, Tuple import numpy as np @@ -12,16 +13,17 @@ from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme class BOPTileConstraint(TileConstraint): """Tile constraint class for binary operators, i.e. operators that use two input tensors of equal dimensions """ - dataIn1Name = 'data_in_1' #: str: Name of the first input tensor as defined by the operator's parser - dataIn2Name = 'data_in_2' #: str: Name of the second input tensor as defined by the operator's parser - dataOutName = 'data_out' #: str: Name of the output tensor as defined by the operator's parser + dataIn1Name: str # Name of the first input tensor as defined by the operator's descriptor + dataIn2Name: str # Name of the second input tensor as defined by the operator's descriptor + dataOutName: str # Name of the output tensor as defined by the operator's descriptor @classmethod def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -34,6 +36,15 @@ def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: tilerModel.addTensorDimToModel(ctxt, bufferName) input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + outputShape = ctxt.lookup(outputBufferName).shape + + assert len(input1Shape) == len( + input2Shape + ), f"[{cls.__name__}] Input shape ranks differ. Shape input1: {input1Shape} vs. input2: {input2Shape}" + assert len(input1Shape) == len( + outputShape + ), f"[{cls.__name__}] Input and output shape ranks differ. Shape input: {input1Shape} vs. output: {outputShape}" for dim in range(len(input1Shape)): inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) @@ -77,3 +88,68 @@ def serializeTilingSolution( variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, tilingSchedule + + +class BOPScalarTileConstraint(TileConstraint): + """Tile constraint class for binary operators whose second operand is a scalar, i.e.
a single-element tensor broadcast over the first operand + """ + + dataIn1Name: str # Name of the first input tensor as defined by the operator's descriptor + dataIn2Name: str # Name of the second input tensor as defined by the operator's descriptor + dataOutName: str # Name of the output tensor as defined by the operator's descriptor + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + for bufferName in [inputBuffer1Name, inputBuffer2Name, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + input1Shape = ctxt.lookup(inputBuffer1Name).shape + input2Shape = ctxt.lookup(inputBuffer2Name).shape + assert math.prod(input2Shape) == 1, f"Expected the second operand to be a scalar, got shape {input2Shape}" + + for dim in range(len(input1Shape)): + inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(inputDim1Var == outputDimVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + + replacementTypes = {"size": PointerClass(uint16_t)} + + for cube in outputCubes: + newSize = np.prod(cube.dims) + replacements["size"].append(newSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + # TODO: Optimize to not fetch dataIn2 + scalarCube = HyperRectangle((0,), (1,)) + for cube in outputCubes: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: scalarCube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py index 9f71012ff..178890954 100644 --- a/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py @@ -2,10 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 -from .BOPTileConstraint import BOPTileConstraint +from .BOPTileConstraint import BOPScalarTileConstraint, BOPTileConstraint class MulTileConstraint(BOPTileConstraint): dataIn1Name = "A" dataIn2Name = "B" dataOutName = "C" + + +class MulScalarTileConstraint(BOPScalarTileConstraint): + dataIn1Name = "A" + dataIn2Name = "B" + dataOutName = "C" diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py index b881529f7..09ed0b6c7 100644 --- a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py +++
b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py @@ -353,44 +353,49 @@ def __init__(self): super().__init__(graph, _split_add_fun, name) -def _extract_padding_fun_conv(graph: gs.Graph, match: Match, name: str, value = 0): +def _extract_padding_fun_conv(graph: gs.Graph, match: Match, name: str, value = 0) -> gs.Graph: + conv = list(match.nodes_map.values())[0] - matched_nodes = [m for k, m in match.nodes_map.items()] - conv = matched_nodes[0] - if 'pads' in conv.attrs and np.sum(conv.attrs['pads']) > 1: - pads = copy.deepcopy(conv.attrs['pads']) - shape = copy.deepcopy(conv.inputs[0].shape) - newPads = np.zeros(2 * len(shape)) - assert len(shape) - 2 == len(pads) / 2, "Conv padding dims do not match!" - newShape = shape + if 'pads' not in conv.attrs: + return graph - beginPads = pads[0:len(pads) // 2] - endPads = pads[len(pads) // 2:] - for idx, i in enumerate(beginPads): - newShape[2 + idx] = newShape[2 + idx] + i - newPads[2 + idx] = i + convPads = conv.attrs['pads'] - for idx, i in enumerate(endPads): - newShape[2 + idx] = newShape[2 + idx] + i - newPads[len(newPads) // 2 + 2 + idx] = i + if all(p == 0 for p in convPads): + return graph - newConvInput = gs.Variable(name + '_padded_input', dtype = np.float32, shape = newShape) - #valConst = gs.Constant('value', np.array(0)) - conv.attrs['pads'] = [0 for pad in conv.attrs['pads']] - newPad = gs.Node(op = 'Pad', - name = name + '_pad', - attrs = { - 'pads': newPads, - 'mode': 'constant', - 'value': value - }, - inputs = [conv.inputs[0]], - outputs = [newConvInput]) + inTensor = conv.inputs[0] + assert isinstance(inTensor, gs.Variable) + convShape = inTensor.shape - conv.inputs[0] = newConvInput - graph.nodes.append(newPad) - graph.cleanup().toposort() + beginConvPads = convPads[0:len(convPads) // 2] + endConvPads = convPads[len(convPads) // 2:] + + nonSpatialDimCount = len(convShape) - (len(convPads) // 2) + pads = [0] * nonSpatialDimCount + beginConvPads + [0] * nonSpatialDimCount + endConvPads + shape = [] + for dim, begin, end in zip(convShape, pads[:len(pads) // 2], pads[len(pads) // 2:]): + shape.append(begin + dim + end) + + paddedInput = gs.Variable(f"{name}_{inTensor.name}", dtype = np.float32, shape = shape) + + newPad = gs.Node(op = 'Pad', + name = name + '_pad', + attrs = { + 'pads': pads, + 'mode': 'constant', + 'value': value + }, + inputs = [conv.inputs[0]], + outputs = [paddedInput]) + + graph.nodes.append(newPad) + + conv.attrs['pads'] = [0] * len(convPads) + conv.inputs[0] = paddedInput + + graph.cleanup().toposort() return graph diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f..2e81f259f 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -185,10 +185,8 @@ def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[ def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: - return [ - 2**((self.input_types[0].referencedType.typeWidth) * 2) * - inputs[0].shape[-1 - operatorRepresentation['transA']] - ] + O = inputs[0].shape[-1] if not operatorRepresentation['transA'] else inputs[0].shape[-2] + return [2**((self.input_types[0].referencedType.typeWidth) * 2) * O] def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -368,23 +366,6 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] -class iNoNormChecker(SignPropTypeChecker): - - def 
__init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): - super().__init__(input_types, output_types) - - def _inferNumLevels(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(4 * self.input_types[0].referencedType.typeWidth)] - - def _inferSignedness(self, inputs: List[VariableBuffer], - operatorRepresentation: OperatorRepresentation) -> List[bool]: - if inputs[0]._signed: - return [True] - else: - return [False] - - class GELUChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): @@ -493,6 +474,10 @@ class DummyChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): super().__init__(input_types, output_types) + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [] + def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: return [2**(self.input_types[0].referencedType.typeWidth)] diff --git a/Deeploy/Targets/MemPool/Deployer.py b/Deeploy/Targets/MemPool/Deployer.py index 543132097..968787972 100644 --- a/Deeploy/Targets/MemPool/Deployer.py +++ b/Deeploy/Targets/MemPool/Deployer.py @@ -11,7 +11,7 @@ from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintMergePass from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ NCHWtoNHWCPass, TransposeMatmulInputsPass -from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentPlatform, OperatorDescriptor, TopologyOptimizer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass @@ -22,12 +22,13 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda x: x, name: str = 'DeeployNetwork', default_channels_first: bool = True, deeployStateDir: str = "DeeployState", inputOffsets: Dict[str, int] = {}): - super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, operatorDescriptors, scheduler, name, default_channels_first, deeployStateDir) self.inputOffsets = inputOffsets diff --git a/Deeploy/Targets/MemPool/Platform.py b/Deeploy/Targets/MemPool/Platform.py index 48599736f..478d51422 100644 --- a/Deeploy/Targets/MemPool/Platform.py +++ b/Deeploy/Targets/MemPool/Platform.py @@ -10,18 +10,19 @@ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConv1DBindings, BasicConv2DBindings, \ BasicDebugPrintBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \ - BasicGELUBindings, BasicLayerNormBindings, BasicMulBindings, BasicPad1DBindings, BasicPad2DBindings, \ - BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, BasicRQIntegerDivBinding, \ - BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicTransposeBindings, DummyBinding + BasicGELUBindings, BasicLayerNormBindings, BasicMulBindings, BasicMulScalarBindings, BasicPad1DBindings, \ + 
BasicPad2DBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, \ + BasicRQIntegerDivBinding, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicTransposeBindings, \ + DummyBinding from Deeploy.Targets.Generic.Layers import AddLayer, ConvLayer, DebugPrintLayer, DivLayer, GatherLayer, GELULayer, \ GEMMLayer, ITAMaxLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MHSALayer, MulLayer, PadLayer, ReduceMeanLayer, \ ReduceSumLayer, RequantShiftLayer, ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, RQMatMulLayer, RQSiGELULayer, \ SliceLayer, SoftmaxLayer, TransposeLayer from Deeploy.Targets.Generic.Parsers import AddParser, DebugParser, DummyParser, FlattenParser, GatherParser, \ GELUParser, GenericConv1DParser, GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, \ - GenericGEMMParser, GenericMaxPool2DParser, IntegerDivParser, ITAMaxParser, MatMulParser, MulParser, Pad1DParser, \ - Pad2DParser, ReduceMeanParser, ReduceSumParser, RequantShiftParser, ReshapeParser, RQGEMMParser, \ - RQIntegerDivParser, RQMatMulParser, RQSiGELUParser, SliceParser, TransposeParser, UnsqueezeParser, \ + GenericGEMMParser, GenericMaxPool2DParser, IntegerDivParser, ITAMaxParser, MatMulParser, MulParser, \ + MulScalarParser, Pad1DParser, Pad2DParser, ReduceMeanParser, ReduceSumParser, RequantShiftParser, ReshapeParser, \ + RQGEMMParser, RQIntegerDivParser, RQMatMulParser, RQSiGELUParser, SliceParser, TransposeParser, UnsqueezeParser, \ iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, SplitAddPass, iGELURequantMergePass @@ -54,6 +55,7 @@ IntegerDiv_Mapper = NodeMapper(IntegerDivParser(), BasicDivBindings) ITAMaxMapper = NodeMapper(ITAMaxParser(), [MemPoolITASoftmaxBinding_8_8]) Mul_Mapper = NodeMapper(MulParser(), BasicMulBindings) +MulScalar_Mapper = NodeMapper(MulScalarParser(), BasicMulScalarBindings) Pad1D_Mapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2D_Mapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) ReduceMean_Mapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings) @@ -108,7 +110,7 @@ 'MatMulInteger': MatMulLayer([MatMul_Mapper]), 'MaxPool': MaxPoolLayer([MaxPool_Mapper]), 'MHSA': MHSALayer(MHSA_Mappers), - 'Mul': MulLayer([Mul_Mapper]), + 'Mul': MulLayer([MulScalar_Mapper, Mul_Mapper]), 'Pad': PadLayer([Pad1D_Mapper, Pad2D_Mapper]), 'ReduceMean': ReduceMeanLayer([ReduceMean_Mapper]), 'ReduceSum': ReduceSumLayer([ReduceSum_Mapper]), diff --git a/Deeploy/Targets/MemPool/Templates/GemmTemplate.py b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py index e5d53bd25..54cc86f6a 100644 --- a/Deeploy/Targets/MemPool/Templates/GemmTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py @@ -127,8 +127,8 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${O}, ${alpha}, ${beta}, - ${transA}, - ${transB}, + ${int(transA)}, + ${int(transB)}, ${A_offset}, ${B_offset}, ${C_offset}, diff --git a/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py index e6a42768e..45f6a1e77 100644 --- a/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py @@ -2,19 +2,21 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Tuple +from typing import Dict, List, Sequence, Tuple + +import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import ConstantBuffer, 
NetworkContext, NodeTemplate, OperatorRepresentation -class _RQGemmTemplate(NodeTemplate, OperatorRepresentation): +class _RQGemmTemplate(NodeTemplate): def __init__(self, templateStr): super().__init__(templateStr) - def alignToContext(self, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: - + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: A = ctxt.lookup(operatorRepresentation['A']) B = ctxt.lookup(operatorRepresentation['B']) C = ctxt.lookup(operatorRepresentation['C']) @@ -79,6 +81,16 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, names + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + inShapes, outShapes = [t.shape for t in node.inputs], [t.shape for t in node.outputs] + # rqs bias + inShapes[2] = outShapes[0][-2:] + # rqs add + inShapes[3] = (1,) + # rqs mul + inShapes[4] = (1,) + return inShapes, outShapes + MemPoolParallelTemplate = _RQGemmTemplate(""" <% @@ -145,8 +157,8 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${O}, ${alpha}, ${beta}, - ${transA}, - ${transB}, + ${int(transA)}, + ${int(transB)}, ${mul}, ${add}, ${log2Dstring}, @@ -170,8 +182,8 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${O}, ${alpha}, ${beta}, - ${transA}, - ${transB}, + ${int(transA)}, + ${int(transB)}, ${mul}, ${add}, ${log2Dstring}, diff --git a/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py index 76ad029fb..b38408952 100644 --- a/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py @@ -2,19 +2,21 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Tuple +from typing import Dict, List, Sequence, Tuple + +import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, OperatorRepresentation -class _RQMatMulTemplate(NodeTemplate, OperatorRepresentation): +class _RQMatMulTemplate(NodeTemplate): def __init__(self, templateStr): super().__init__(templateStr) - def alignToContext(self, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: - + def alignToContext( + self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, OperatorRepresentation, List[str]]: A = ctxt.lookup(operatorRepresentation['A']) B = ctxt.lookup(operatorRepresentation['B']) data_out = ctxt.lookup(operatorRepresentation['data_out']) @@ -74,6 +76,14 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, names + def alignShapes(self, node: gs.Node) -> Tuple[List[Sequence[int]], List[Sequence[int]]]: + inShapes, outShapes = [t.shape for t in node.inputs], [t.shape for t in node.outputs] + # rqs mul + inShapes[2] = (1,) + # rqs add + inShapes[3] = (1,) + return inShapes, outShapes + MemPoolParallelTemplate = _RQMatMulTemplate(""" <% diff --git a/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py b/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py index 7898790af..a43fe7755 100644 --- a/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, 
OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import RequantShiftTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _RequantShiftTemplate(NodeTemplate): +class _RequantShiftTemplate(RequantShiftTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py index 49f317caa..46bad04ce 100644 --- a/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py @@ -289,7 +289,7 @@ def get_constant_input_or_zeros(n: gs.Node, shape): name = name + "_sum", attrs = { 'axes': [1], - "keepdims": "0" + "keepdims": 0 }) mhsa_out[0].shape = [_output.shape[0]] + [int(H)] + _output.shape[1:] diff --git a/Deeploy/Targets/Neureka/Deployer.py b/Deeploy/Targets/Neureka/Deployer.py index be34e1f4d..6d96f8d09 100644 --- a/Deeploy/Targets/Neureka/Deployer.py +++ b/Deeploy/Targets/Neureka/Deployer.py @@ -9,7 +9,7 @@ from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ NCHWtoNHWCPass, PULPNCHWtoNHWCPass -from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentPlatform, OperatorDescriptor, TopologyOptimizer from Deeploy.Targets.Neureka.TopologyOptimizationPasses.Passes import ConvEngineDiscolorationPass, \ NeurekaOptimizationPass from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer @@ -22,12 +22,13 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda graph: list(graph.nodes), name: str = 'DeeployNetwork', default_channels_first = False, deeployStateDir: str = "DeeployStateDir", inputOffsets = {}): - super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, operatorDescriptors, scheduler, name, default_channels_first, deeployStateDir, inputOffsets) if self.Platform.engines[0].enable3x3: diff --git a/Deeploy/Targets/Neureka/Parsers.py b/Deeploy/Targets/Neureka/Parsers.py index 3c564c10b..d0d0d7e91 100644 --- a/Deeploy/Targets/Neureka/Parsers.py +++ b/Deeploy/Targets/Neureka/Parsers.py @@ -18,9 +18,9 @@ def parseNode(self, node: gs.Node) -> bool: if not all([ # No dilation support - self.operatorRepresentation['dilations'] == [1, 1], + self.operatorRepresentation['dilations'] == (1, 1), # Channels have to be last - 'channels_first' in self.operatorRepresentation and not self.operatorRepresentation['channels_first'], + 'channels_first' in node.attrs and not node.attrs['channels_first'], # Expect "weight_offset" attribute in the node "weight_offset" in node.attrs, ]): @@ -87,7 +87,7 @@ def parseNode(self, node: gs.Node) -> bool: ch_im_in = node.inputs[1].shape[1] if not all([ - self.operatorRepresentation['kernel_shape'] == [3, 3], + self.operatorRepresentation['kernel_shape'] == (3, 3), self.operatorRepresentation['group'] == ch_im_out, self.operatorRepresentation['group'] == ch_im_in, ]): @@ -129,7 +129,7 @@ def parseNode(self, node: gs.Node) -> bool: return False if not all([ - self.operatorRepresentation['kernel_shape'] == [1, 1], + self.operatorRepresentation['kernel_shape'] == (1, 1), 
self.operatorRepresentation['group'] == 1, ]): return False @@ -169,7 +169,7 @@ def parseNode(self, node: gs.Node) -> bool: return False if not all([ - self.operatorRepresentation['kernel_shape'] == [3, 3], + self.operatorRepresentation['kernel_shape'] == (3, 3), self.operatorRepresentation['group'] == 1, ]): return False diff --git a/Deeploy/Targets/Neureka/Templates/ConvTemplate.py b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py index 97253d6e1..aebe884ca 100644 --- a/Deeploy/Targets/Neureka/Templates/ConvTemplate.py +++ b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py @@ -7,7 +7,8 @@ import numpy as np -from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import RequantizedConvTemplate +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, OperatorRepresentation def _getNumTiles(fullDim: int, tileDim: int) -> int: @@ -47,7 +48,7 @@ def getInputAddrOffset(width_in: int, width_in_stride: int, padding_top: int, pa return (padding_top * width_in + padding_left) * width_in_stride -class NeurekaConvTemplate(NodeTemplate): +class NeurekaConvTemplate(RequantizedConvTemplate): def __init__(self, templateStr: str): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 9ff940b2f..c7d463a3f 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -9,8 +9,8 @@ from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration, MemoryPassthroughGeneration -from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t, \ - uint8_t +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ + int8_t, int32_t, int64_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration @@ -153,11 +153,8 @@ ] PULPReshapeBindings = [ - NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int32_t)], [PointerClass(type)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes -] + [ - NodeBinding(ReshapeChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int64_t)], [PointerClass(type)]), + ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes ] PULPRQAddBindings = [ @@ -376,7 +373,9 @@ NodeBinding(MulChecker([PointerClass(typeA), PointerClass(typeB)], [PointerClass(int32_t)]), MulTemplate.referenceTemplate, ForkTransformer) for typeA, typeB in itertools.product(SignedIntegerDataTypes, SignedIntegerDataTypes) -] + [ +] + +PULPMulScalarBindings = [ NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), FloatMulTemplate.referenceTemplate, ForkTransformer) ] diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py index 86bf02e57..17412c8da 100644 --- 
a/Deeploy/Targets/PULPOpen/Deployer.py +++ b/Deeploy/Targets/PULPOpen/Deployer.py @@ -12,7 +12,8 @@ from Deeploy.CommonExtensions.OptimizationPasses.BindingsOptimizationPasses.AutoTranspose import AutoTransposeMergePass from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ PULPNCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass -from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer +from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, OperatorDescriptor, \ + TopologyOptimizer, VariableBuffer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import RQAddTransposeSquashPass @@ -33,6 +34,7 @@ def __init__(self, deploymentPlatform: DeploymentPlatform, inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: TopologyOptimizer, + operatorDescriptors: Dict[str, OperatorDescriptor], scheduler: Callable = lambda x: x, name: str = 'DeeployNetwork', default_channels_first = False, @@ -42,6 +44,7 @@ def __init__(self, deploymentPlatform, inputTypes, loweringOptimizer, + operatorDescriptors, scheduler, name, default_channels_first = default_channels_first, diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index e94af6e42..b28fb86a2 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -133,13 +133,9 @@ def parseNode(self, node: gs.Node) -> (bool): self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][0]) - if 'n_levels' in node.attrs: - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) - else: - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels_out'].values) - - self.operatorRepresentation['signed'] = int(node.attrs['signed'].values) - self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['div'].values)) + self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] + self.operatorRepresentation['signed'] = node.attrs['signed'] + self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['div'])) return ret def parseNodeCtxt(self, @@ -206,12 +202,9 @@ def parseNode(self, node: gs.Node) -> (bool): self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) - if 'n_levels' in node.attrs: - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels'].values) - else: - self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels_out'].values) - self.operatorRepresentation['signed'] = int(node.attrs['signed'].values) - self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['div'].values)) + self.operatorRepresentation['n_levels'] = node.attrs['n_levels'] + self.operatorRepresentation['signed'] = node.attrs['signed'] + self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['div'])) return ret return False @@ -349,41 +342,16 @@ def parseNodeCtxt(self, class PULPMatrixVecParser(PULPGEMMParser): - def parseNodeCtxt(self, - ctxt: NetworkContext, - node: gs.Node, - channels_first: bool = True) -> Tuple[NetworkContext, bool]: - - newCtxt, ret = 
super().parseNodeCtxt(ctxt, node, channels_first) - - if not ret: - return ctxt, False - - if not (self.operatorRepresentation['M'] == 1 and self.operatorRepresentation['batch'] >= 8): - return ctxt, False - - return newCtxt, True + def parseNode(self, node: gs.Node) -> bool: + M = node.inputs[0].shape[-1 if node.attrs["transA"] else -2] + batch = math.prod(node.inputs[0].shape[:-2]) + return super().parseNode(node) and M == 1 and batch >= 8 class PULPTallGEMMParser(PULPGEMMParser): - def parseNodeCtxt(self, - ctxt: NetworkContext, - node: gs.Node, - channels_first: bool = True) -> Tuple[NetworkContext, bool]: - - newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) - - if not ret: - return ctxt, False - - ret = all([ - self.operatorRepresentation['batch'] < 8, - self.operatorRepresentation['M'] >= 8, - self.operatorRepresentation['M'] % 8 < self.operatorRepresentation['O'] % 8, - ]) - - if not ret: - return ctxt, False - - return newCtxt, True + def parseNode(self, node: gs.Node) -> bool: + M = node.inputs[0].shape[-1 if node.attrs["transA"] else -2] + N = node.inputs[1].shape[-2 if node.attrs["transB"] else -1] + batch = math.prod(node.inputs[0].shape[:-2]) + return super().parseNode(node) and M >= 8 and (M % 8) < (N % 8) and batch < 8 diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 99c1c9335..293f40deb 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -17,9 +17,9 @@ SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ TransposeLayer, iHardswishLayer, iRMSNormLayer from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, \ - QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, \ - RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ + GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, MulScalarParser, Pad1DParser, \ + Pad2DParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, \ + RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \ TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate @@ -36,13 +36,14 @@ PULPConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUTilingReadyBindings, \ PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, \ PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \ - PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ - 
PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ - PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings + PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulScalarTilingReadyBindings, \ + PULPMulTilingReadyBindings, PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, \ + PULPRQAddTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, \ + PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, \ + PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \ + PULPSoftmaxCrossEntropyGradTilingReadyBindings, PULPSoftmaxCrossEntropyTilingReadyBindings, \ + PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, PULPTransposeTilingReadyBindings, \ + PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass @@ -52,6 +53,7 @@ GELUMapper = NodeMapper(GELUParser(), PULPFPGELUTilingReadyBindings) GatherMapper = NodeMapper(GatherParser(), PULPGatherTilingReadyBindings) MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings) +MulScalarMapper = NodeMapper(MulScalarParser(), PULPMulScalarTilingReadyBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) ReshapeMapper = NodeMapper(ReshapeParser(), PULPFlattenTilingReadyBindings) @@ -118,7 +120,7 @@ 'Add': AddLayer([AddMapper]), 'Flatten': ReshapeLayer([FlattenMapper]), 'Gather': GatherLayer([GatherMapper]), - 'Mul': MulLayer([MulMapper]), + 'Mul': MulLayer([MulMapper, MulScalarMapper]), 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), 'Relu': ReluLayer([ReluMapper]), 'Reshape': ReshapeLayer([ReshapeMapper]), diff --git a/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py index ebc614f47..85414c86e 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py @@ -6,10 +6,11 @@ from ortools.constraint_solver.pywrapcp import IntVar -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import RequantizedConvTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class PULP2DConvTemplate(NodeTemplate): +class PULP2DConvTemplate(RequantizedConvTemplate): def __init__(self, templateStr): super().__init__(templateStr) @@ -63,7 +64,7 @@ def alignToContext(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [] -class PULP1DConvTemplate(NodeTemplate): +class PULP1DConvTemplate(RequantizedConvTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py index 7f1c2e21c..a6dd731dd 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from Deeploy.CommonExtensions.NodeTemplate import ElementwiseTemplate -referenceTemplate = NodeTemplate(""" +referenceTemplate = ElementwiseTemplate(""" // Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) int8_t 
${nodeName}_core_id = pi_core_id(); int8_t ${nodeName}_log2Core = log2(NUM_CORES); diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index f4c22b2c2..17b8ec736 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from Deeploy.CommonExtensions.NodeTemplate import GemmTemplate -referenceTemplate = NodeTemplate(""" +referenceTemplate = GemmTemplate(""" // GEMM (Name: ${nodeName}, Op: ${nodeOp}) ${A_type.typeName} ref_${data_out}_${A} = ${A}; ${B_type.typeName} ref_${data_out}_${B} = ${B}; @@ -20,8 +20,8 @@ ${M}, ${N}, ${O}, - ${transA}, - ${transB} + ${int(transA)}, + ${int(transB)} ); ref_${data_out}_${A} += ${M} * ${N}; diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index 2f202b24d..e9927981e 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate +from Deeploy.CommonExtensions.NodeTemplate import ElementwiseScalarTemplate -referenceTemplate = NodeTemplate(""" +referenceTemplate = ElementwiseScalarTemplate(""" // Float Mul with parallelism and 6x unrolling (Name: ${nodeName}, Op: ${nodeOp}) int8_t ${nodeName}_core_id = pi_core_id(); diff --git a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py index 1f7149e1e..26ea28ec6 100644 --- a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple +from Deeploy.CommonExtensions.NodeTemplate import RequantizedGemmTemplate from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation -class PULPGEMMTemplate(NodeTemplate): +class PULPGEMMTemplate(RequantizedGemmTemplate): def __init__(self, templateStr): super().__init__(templateStr) @@ -41,7 +42,7 @@ def alignToContext(self, ctxt: NetworkContext, else: signatureString += '_u8' %> -// PULP NN GEMM +// PULP NN GEMM (Name: ${nodeName}, Op: ${nodeOp}) int8_t* ref_${data_out}_${A} = ${A}; int8_t* ref_${data_out}_${B} = ${B}; int8_t* ref_${data_out}_${data_out} = ${data_out}; @@ -82,11 +83,11 @@ def alignToContext(self, ctxt: NetworkContext, operatorRepresentation['B_offset'] = 0 operatorRepresentation['C_offset'] = 0 - if hasattr(A, "nLevels"): + if A.nLevels is not None: operatorRepresentation['A_offset'] = (A._type.referencedType.typeMin == 0) * int(A.nLevels / 2) - if hasattr(B, "nLevels"): + if B.nLevels is not None: operatorRepresentation['B_offset'] = (B._type.referencedType.typeMin == 0) * int(B.nLevels / 2) - if hasattr(C, "nLevels"): + if C.nLevels is not None: operatorRepresentation['C_offset'] = -(C._type.referencedType.typeMin == 0) * int(C.nLevels / 2) return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py b/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py index e4b834861..d5f316421 100644 --- a/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, 
OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import RequantizedGemmTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _PULPMatrixVectorTemplate(NodeTemplate): +class _PULPMatrixVectorTemplate(RequantizedGemmTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py index 1dbefa328..03a5f7219 100644 --- a/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py @@ -2,14 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from Deeploy.DeeployTypes import NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import ElementwiseTemplate - -class _MulTemplate(NodeTemplate, OperatorRepresentation): - pass - - -referenceTemplate = _MulTemplate(""" +referenceTemplate = ElementwiseTemplate(""" // Mul (Name: ${nodeName}, Op: ${nodeOp}) int8_t ${nodeName}_core_id = pi_core_id(); diff --git a/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py index 76fd47cfb..2fd75cb16 100644 --- a/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py @@ -4,10 +4,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.CommonExtensions.NodeTemplate import RequantizedGemmTemplate +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -class _PULPTallGEMMTemplate(NodeTemplate): +class _PULPTallGEMMTemplate(RequantizedGemmTemplate): def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index c69760df5..156271417 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -54,12 +54,15 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + addBuffer = ctxt.lookup(addBufferName) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = len(addBuffer.shape) - 1) + mulBuffer = ctxt.lookup(mulBufferName) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = len(mulBuffer.shape) - 1) # Map output dims to inputs dims tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + tilerModel.addConstraint(inputChannelVar == weightInChannelVar) # Input channel tilerModel.addConstraint(outputChannelVar == addChannelVar) tilerModel.addConstraint(outputChannelVar == mulChannelVar) @@ -88,10 +91,8 @@ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkCo outputChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) weightWidthVar = tilerModel.getTensorDimVar(tensorName = 
weightBuffer.name, dimIdx = 2) - weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3) strides = parseDict["strides"] - padding = parseDict["pads"] # VIC: Force at least one row of A and one col of B in the GEMM (since it's a im2col Conv) to avoid partial results tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) @@ -101,7 +102,6 @@ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkCo tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) - tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) # VIC: Constraint the minimum tile size such that we can apply at least one kernel on it tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) @@ -174,6 +174,8 @@ def serializeTilingSolution( weightH = ctxt.lookup(varWeight).shape[1] weightW = ctxt.lookup(varWeight).shape[2] weightC = ctxt.lookup(varWeight).shape[3] + shapeMul = ctxt.lookup(operatorRepresentation["mul"]).shape + shapeAdd = ctxt.lookup(operatorRepresentation["add"]).shape pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] @@ -200,12 +202,13 @@ def serializeTilingSolution( inputInCubes.append(InCube) - RequantCube = HyperRectangle((COffset,), (CSize,)) + MulCube = HyperRectangle((0,) * (len(shapeMul) - 1) + (COffset,), (1,) * (len(shapeMul) - 1) + (CSize,)) + AddCube = HyperRectangle((0,) * (len(shapeAdd) - 1) + (COffset,), (1,) * (len(shapeAdd) - 1) + (CSize,)) WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, weightC)) inputWeightCubes.append(WeightCube) - inputAddCubes.append(RequantCube) - inputMulCubes.append(RequantCube) + inputMulCubes.append(MulCube) + inputAddCubes.append(AddCube) inputLoadSchedule = [] outputLoadSchedule = [] diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py index 8d54eea43..2d6ea07a0 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py @@ -60,8 +60,10 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) + addBuffer = ctxt.lookup(addBufferName) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = len(addBuffer.shape) - 1) + mulBuffer = ctxt.lookup(mulBufferName) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = len(mulBuffer.shape) - 1) # map output dims to inputs dims tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch @@ -183,6 +185,8 @@ def serializeTilingSolution( weightH = ctxt.lookup(varWeight).shape[1] weightW = ctxt.lookup(varWeight).shape[2] + shapeMul = ctxt.lookup(operatorRepresentation["mul"]).shape + shapeAdd = ctxt.lookup(operatorRepresentation["add"]).shape pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] @@ -200,7 +204,8 @@ def serializeTilingSolution( NCHWInCube = HyperRectangle((NHWCInCube.offset[0], COffset, NHWCInCube.offset[1], NHWCInCube.offset[2]), (NHWCInCube.dims[0], CSize, 
NHWCInCube.dims[1], NHWCInCube.dims[2]))
-            RequantCube = HyperRectangle((COffset,), (CSize,))
+            MulCube = HyperRectangle((0,) * (len(shapeMul) - 1) + (COffset,), (1,) * (len(shapeMul) - 1) + (CSize,))
+            AddCube = HyperRectangle((0,) * (len(shapeAdd) - 1) + (COffset,), (1,) * (len(shapeAdd) - 1) + (CSize,))
             WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, 1))
 
             replacements['dim_im_in_x'].append(NCHWInCube.dims[2])
@@ -216,8 +221,8 @@ def serializeTilingSolution(
             replacements['padding_x_right'].append(padding_right)
 
             inputInCubes.append(NCHWInCube)
-            inputAddCubes.append(RequantCube)
-            inputMulCubes.append(RequantCube)
+            inputMulCubes.append(MulCube)
+            inputAddCubes.append(AddCube)
             inputWeightCubes.append(WeightCube)
 
         inputLoadSchedule = []
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
index 8b795be88..1df898bab 100644
--- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
@@ -18,61 +18,63 @@ class MatMulTileConstraint(TileConstraint):
 
     @staticmethod
-    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+    def _getIdxMapping(rank: int, isTrans: bool) -> Tuple[int, int]:
+        if isTrans:
+            idxSecondDim, idxFirstDim = rank - 2, rank - 1
+        else:
+            idxFirstDim, idxSecondDim = rank - 2, rank - 1
+        return idxFirstDim, idxSecondDim
 
-        # Get to-be-tiled tensor's buffers
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
         bufferA = ctxt.lookup(name = parseDict['A'])
         bufferB = ctxt.lookup(name = parseDict['B'])
-        outputBuffer = ctxt.lookup(name = parseDict['data_out'])
+        bufferOut = ctxt.lookup(name = parseDict['data_out'])
 
         # Add I/O dimensions to the model as variables
-        for _buffer in [bufferA, bufferB, outputBuffer]:
+        for _buffer in [bufferA, bufferB, bufferOut]:
             tilerModel.addTensorDimToModel(ctxt, _buffer.name)
 
-        tensorsShapeLen = len(bufferA.shape)
-
-        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
-                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transA'])
-        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
-                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
-        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
-                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
-        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
-                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transB'])
-        outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2))
-        outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1))
-
-        # Map output dims to inputs dims
-        for idx in range(tensorsShapeLen - 2):
-            tilerModel.addConstraint(
-                tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar(
-                    tensorName = bufferA.name, dimIdx = idx))
-            tilerModel.addConstraint(
-                tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar(
-                    tensorName = bufferB.name, dimIdx = idx))
+        idxFirstDimA, idxSecondDimA = MatMulTileConstraint._getIdxMapping(len(bufferA.shape), parseDict['transA'])
+        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idxFirstDimA)
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idxSecondDimA)
+
+        idxFirstDimB, idxSecondDimB = MatMulTileConstraint._getIdxMapping(len(bufferB.shape), parseDict['transB'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idxFirstDimB)
+        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idxSecondDimB)
+
+        rankOut = len(bufferOut.shape)
+        outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = rankOut - 2)
+        outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = rankOut - 1)
+
+        # Map input A's batch dims to output batch dims if present
+        for idx in range(len(bufferA.shape) - 2):
+            varA = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idx)
+            varOut = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = idx)
+            tilerModel.addConstraint(varA == varOut)
+
+        # Map input B's batch dims to output batch dims if present
+        for idx in range(len(bufferB.shape) - 2):
+            varB = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idx)
+            varOut = tilerModel.getTensorDimVar(tensorName = bufferOut.name, dimIdx = idx)
+            tilerModel.addConstraint(varB == varOut)
 
         tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar)
         tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar)
-
-        # Add GEMM Geometrical constraints
         tilerModel.addConstraint(ASecondDimVar == BFirstDimVar)
 
         return tilerModel
 
     @staticmethod
     def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
-
         bufferA = ctxt.lookup(name = parseDict['A'])
         bufferB = ctxt.lookup(name = parseDict['B'])
 
-        tensorsShapeLen = len(bufferA.shape)
+        _, idxSecondDimA = MatMulTileConstraint._getIdxMapping(len(bufferA.shape), parseDict['transA'])
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idxSecondDimA)
 
-        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
-                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
-        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
-                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
-        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
-                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transB'])
+        idxFirstDimB, _ = MatMulTileConstraint._getIdxMapping(len(bufferB.shape), parseDict['transB'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idxFirstDimB)
 
         # VIC: We don't want to deal with intermediate results between kernel calls
         tilerModel.addConstraint(ASecondDimVar == parseDict['N'])
diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py
index a6dbaa4e8..1ae43099d 100644
--- a/Deeploy/Targets/PULPOpen/Tiler.py
+++ b/Deeploy/Targets/PULPOpen/Tiler.py
@@ -8,7 +8,7 @@
 from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.iRMSNormTileConstraint import iRMSNormTileConstraint
-from Deeploy.Targets.Generic.TileConstraints.MulTileConstraint import MulTileConstraint
+from Deeploy.Targets.Generic.TileConstraints.MulTileConstraint import MulScalarTileConstraint, MulTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.NOPTileConstraint import NOPTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.RQSiGELUTileConstraint import RQSiGELUTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint
@@ -18,8 +18,8 @@
 from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \
     PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \
     PULPiRQSGELUBindings, PULPLayernormBinding, PULPMatMulBindings, PULPMaxPool2DBindings, PULPMulBindings, \
-    PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, \
-    PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \
+    PULPMulScalarBindings, PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, \
+    PULPRQSBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \
     PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSoftmaxBindings, \
     PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \
     PULPTransposeBindings, PULPUniformRQSBindings
@@ -105,6 +105,9 @@
 PULPMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMulBindings,
                                                      tileConstraint = MulTileConstraint())
 
+PULPMulScalarTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMulScalarBindings,
+                                                           tileConstraint = MulScalarTileConstraint())
+
 PULPReluTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPReluBinding],
                                                       tileConstraint = UnaryTileConstraint())
 
diff --git a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py
index 43d490e80..9ea1eda57 100644
--- a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py
+++ b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import copy
+import math
 from collections import OrderedDict
 
 import numpy as np
@@ -164,23 +165,35 @@ def __init__(self):
 
 
 def _merge_conv_rq_fun(graph: gs.Graph, match: Match, name: str):
-    matched_nodes = [m for k, m in match.nodes_map.items()]
-    conv = matched_nodes[0]
-    rqs = matched_nodes[1]
-
-    totalShift = int(np.log2(rqs.attrs['div'].values))
-
-    # Artifically add half the shift division value to implement rounding
-    rounding = 2**(totalShift - 1) if totalShift > 0 else 0
-
-    rqs.inputs[-1].values = copy.deepcopy(rqs.inputs[-1].values) + rounding
-
-    _inputs = list(conv.inputs) + list(rqs.inputs[1:])
-
-    _outputs = rqs.outputs
-
-    rqsConv = gs.Node(op = 'RequantizedConv', name = name, attrs = {**conv.attrs, **rqs.attrs, "shift": totalShift})
-    graph.replaceInsertNode(_inputs, _outputs, rqsConv)
+    conv, rqs = list(match.nodes_map.values())
+
+    mul, add = rqs.inputs[1:]
+
+    div_attr = rqs.attrs['div']
+    if isinstance(div_attr, gs.Constant):
+        assert div_attr.values.size == 1
+        div = div_attr.values.item()
+    elif isinstance(div_attr, int):
+        div = div_attr
+    elif isinstance(div_attr, float) and div_attr.is_integer():
+        div = int(div_attr)
+    else:
+        raise ValueError(f"Cannot convert div to integer. Received {div_attr}")
+    shift = int(math.log2(div))
+    # Artificially add half the division value as rounding
+    if shift > 0:
+        add.values += 2**(shift - 1)
+
+    rqsConv = gs.Node(
+        op = 'RequantizedConv',
+        name = name,
+        attrs = {
+            **conv.attrs,
+            **rqs.attrs,
+            "shift": shift,
+        },
+    )
+    graph.replaceInsertNode(list(conv.inputs) + [mul, add], rqs.outputs, rqsConv)
 
     return graph
 
diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py
index e9be18a53..1d1af32a3 100644
--- a/Deeploy/Targets/Snitch/Bindings.py
+++ b/Deeploy/Targets/Snitch/Bindings.py
@@ -11,15 +11,16 @@
 from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t
 from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
-from Deeploy.Targets.Generic.Templates import iNoNormTemplate
-from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker
+from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker
 from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \
     SnitchProfileExecutionBlockPass, SnitchSynchCoresPass
 from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma
-from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate
+from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iNoNormTemplate, \
+    iSoftmaxTemplate
 from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template
 from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template
 from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template
+from Deeploy.Targets.Snitch.TypeCheckers import iNoNormChecker
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \
     TilingVariableReplacementUpdate
diff --git a/Deeploy/Targets/Snitch/Deployer.py b/Deeploy/Targets/Snitch/Deployer.py
index 7c3922a6b..4daab3b9f 100644
--- a/Deeploy/Targets/Snitch/Deployer.py
+++ b/Deeploy/Targets/Snitch/Deployer.py
@@ -10,7 +10,7 @@
 from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer
 from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
     NCHWtoNHWCPass, RemoveGlobalOutputReshapePass, TransposeMatmulInputsPass
-from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
+from Deeploy.DeeployTypes import DeploymentPlatform, OperatorDescriptor, TopologyOptimizer
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \
     TransposeMergePass, TransposeSplitPass
 
@@ -22,6 +22,7 @@ def __init__(self,
                  deploymentPlatform: DeploymentPlatform,
                  inputTypes: Dict[str, Type[Pointer]],
                  loweringOptimizer: TopologyOptimizer,
+                 operatorDescriptors: Dict[str, OperatorDescriptor],
                  scheduler: Callable = lambda x: x,
                  name: str = 'DeeployNetwork',
                  default_channels_first = False,
@@ -31,6 +32,7 @@ def __init__(self,
                          deploymentPlatform,
                          inputTypes,
                          loweringOptimizer,
+                         operatorDescriptors,
                          scheduler,
                          name,
                          default_channels_first = default_channels_first,
diff --git a/Deeploy/Targets/Snitch/Layers.py b/Deeploy/Targets/Snitch/Layers.py
new file mode 100644
index 000000000..017d279c3
--- /dev/null
+++ b/Deeploy/Targets/Snitch/Layers.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List, Tuple
+
+import numpy as np
+
+from Deeploy.DeeployTypes import NodeMapper, ONNXLayer, OperatorRepresentation, Shape
+
+
+class iNoNormLayer(ONNXLayer):
+
+    def __init__(self, maps: List[NodeMapper]):
+        super().__init__(maps)
+
+    def computeOps(self):
+        return self.mapper.parser.operatorRepresentation['size'] * 4  # 2 mul, 1 add, 1 right shift
+
+    def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation: OperatorRepresentation,
+                      channels_first: bool) -> Tuple[Shape]:
+        # JUNGVI: Broadcast the weights and bias to have as many dimensions as the inputs
+        shape = np.broadcast_shapes(*inputShapes)
+        return ([shape] * len(inputShapes), outputShapes)
diff --git a/Deeploy/Targets/Snitch/Parsers.py b/Deeploy/Targets/Snitch/Parsers.py
index 005199468..6f7015609 100644
--- a/Deeploy/Targets/Snitch/Parsers.py
+++ b/Deeploy/Targets/Snitch/Parsers.py
@@ -2,11 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from typing import Tuple
 
 import onnx_graphsurgeon as gs
 
-from Deeploy.DeeployTypes import NetworkContext
+from Deeploy.DeeployTypes import NetworkContext, NodeParser
 from Deeploy.Targets.Generic.Parsers import GEMMParser, RQGEMMParser
 
@@ -18,9 +19,7 @@ def parseNode(self, node: gs.Node) -> bool:
         if not ret:
             return False
 
-        if not all([
-                self.operatorRepresentation['transA'] == 0,
-        ]):
+        if self.operatorRepresentation['transA']:
             return False
 
         return True
@@ -50,9 +49,7 @@ def parseNode(self, node: gs.Node) -> bool:
         if not ret:
             return False
 
-        if not all([
-                self.operatorRepresentation['transA'] == 0,
-        ]):
+        if self.operatorRepresentation['transA']:
             return False
 
         return True
@@ -72,3 +69,36 @@ def parseNodeCtxt(self,
             return ctxt, False
 
         return newCtxt, True
+
+
+class iNoNormParser(NodeParser):
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+
+        ret = all(['D' in node.attrs, 'mul' in node.attrs, 'n_levels' in node.attrs])
+
+        if ret:
+            self.operatorRepresentation.update(node.attrs)
+            self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D']))
+
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        data_in = ctxt.lookup(node.inputs[0].name)
+        weights = ctxt.lookup(node.inputs[1].name)
+        bias = ctxt.lookup(node.inputs[2].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['weights'] = weights.name
+        self.operatorRepresentation['bias'] = bias.name
+        self.operatorRepresentation['data_out'] = data_out.name
+        self.operatorRepresentation['size'] = math.prod(data_in.shape)
+
+        return ctxt, True
diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py
index d62d1c380..bb570b588 100644
--- a/Deeploy/Targets/Snitch/Platform.py
+++ b/Deeploy/Targets/Snitch/Platform.py
@@ -11,15 +11,16 @@
 from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicLayerNormBindings, BasicMatMulBindings, \
     BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding
 from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \
-    ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer
+    ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer
 from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \
-    RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser
+    RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \
     IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \
     SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass
 from Deeploy.Targets.PULPOpen.Platform import RQAddMapper
-from Deeploy.Targets.Snitch.Parsers import SnitchGEMMParser, SnitchRQGEMMParser
+from Deeploy.Targets.Snitch.Layers import iNoNormLayer
+from Deeploy.Targets.Snitch.Parsers import SnitchGEMMParser, SnitchRQGEMMParser, iNoNormParser
 from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate
 from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchGemmTilingReadyBindings, \
     SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, SnitchRQAddTilingReadyBindings, \
diff --git a/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py b/Deeploy/Targets/Snitch/Templates/iNoNormTemplate.py
similarity index 62%
rename from Deeploy/Targets/Generic/Templates/iNoNormTemplate.py
rename to Deeploy/Targets/Snitch/Templates/iNoNormTemplate.py
index 562b3168a..f99ffba3d 100644
--- a/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py
+++ b/Deeploy/Targets/Snitch/Templates/iNoNormTemplate.py
@@ -2,16 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from Deeploy.DeeployTypes import NodeTemplate
+from Deeploy.CommonExtensions.NodeTemplate import ElementwiseTemplate
 
-
-class _iNoNormTemplate(NodeTemplate):
-
-    def __init__(self, templateStr):
-        super().__init__(templateStr)
-
-
-referenceTemplate = _iNoNormTemplate("""
+referenceTemplate = ElementwiseTemplate("""
 // iNoNorm (Name: ${nodeName}, Op: ${nodeOp})
 SnitchiNoNorm_s${data_in_type.referencedType.typeWidth}_s${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${weights}, ${bias}, ${size}, ${mul}, ${log2D});
 """)
diff --git a/Deeploy/Targets/Snitch/TypeCheckers.py b/Deeploy/Targets/Snitch/TypeCheckers.py
new file mode 100644
index 000000000..09ef3bc3c
--- /dev/null
+++ b/Deeploy/Targets/Snitch/TypeCheckers.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List, Sequence, Type
+
+from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker
+from Deeploy.DeeployTypes import OperatorRepresentation, Pointer, VariableBuffer
+
+
+class iNoNormChecker(SignPropTypeChecker):
+
+    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
+        super().__init__(input_types, output_types)
+
+    def _inferNumLevels(self, inputs: List[VariableBuffer],
+                        operatorRepresentation: OperatorRepresentation) -> List[int]:
+        return [2**(4 * self.input_types[0].referencedType.typeWidth)]
+
+    def _inferSignedness(self, inputs: List[VariableBuffer],
+                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
+        if inputs[0]._signed:
+            return [True]
+        else:
+            return [False]
diff --git a/Deeploy/Targets/SoftHier/Deployer.py b/Deeploy/Targets/SoftHier/Deployer.py
index e4ab37f29..4827ba83b 100644
--- a/Deeploy/Targets/SoftHier/Deployer.py
+++ b/Deeploy/Targets/SoftHier/Deployer.py
@@ -8,7 +8,7 @@
 from Deeploy.AbstractDataTypes import Pointer
 from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer
-from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
+from Deeploy.DeeployTypes import DeploymentPlatform, OperatorDescriptor, TopologyOptimizer
 
 
 class SoftHierDeployer(SignPropDeployer):
@@ -18,12 +18,13 @@ def __init__(self,
                  deploymentPlatform: DeploymentPlatform,
                  inputTypes: Dict[str, Type[Pointer]],
                  loweringOptimizer: TopologyOptimizer,
+                 operatorDescriptors: Dict[str, OperatorDescriptor],
                  scheduler: Callable = lambda x: x,
                  name: str = 'DeeployNetwork',
                  default_channels_first: bool = True,
                  deeployStateDir: str = "DeeployState",
                  inputOffsets: Dict[str, int] = {}):
-        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
+        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, operatorDescriptors, scheduler, name,
                          default_channels_first, deeployStateDir)
 
         self.inputOffsets = inputOffsets
diff --git a/DeeployTest/Platforms/Generic/CMakeLists.txt b/DeeployTest/Platforms/Generic/CMakeLists.txt
index f97f1cdf1..b2e68b257 100644
--- a/DeeployTest/Platforms/Generic/CMakeLists.txt
+++ b/DeeployTest/Platforms/Generic/CMakeLists.txt
@@ -8,7 +8,7 @@ file(GLOB_RECURSE SOURCES
   main.c
 )
 
-link_directories(${ProjectId}/../../${GENERATED_SOURCE})
+link_directories(${GENERATED_SOURCE})
 add_deeploy_executable(${ProjectId} EXCLUDE_FROM_ALL ${SOURCES} )
 
 target_link_libraries(${ProjectId} PRIVATE network deeploylib)
diff --git a/DeeployTest/testMemoryLevelExtension.py b/DeeployTest/testMemoryLevelExtension.py
index 0e1ed6cc4..a6a1cf37d 100644
--- a/DeeployTest/testMemoryLevelExtension.py
+++ b/DeeployTest/testMemoryLevelExtension.py
@@ -18,6 +18,7 @@
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
 from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper, \
     MemoryLevelAwareSignPropDeployer
+from Deeploy.OperatorDescriptor import defaultOperatorDescriptors
 from Deeploy.Targets.CortexM.Platform import CMSISEngine, CMSISMapping, CMSISOptimizer, CMSISPlatform
 from Deeploy.Targets.Generic.Platform import GenericEngine, GenericMapping, GenericOptimizer, GenericPlatform
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import TransposeConstOptPass, TransposeMergePass
@@ -83,6 +84,7 @@
                                     MockPlatform,
                                     inputTypes,
                                     CMSISOptimizer,
+                                    defaultOperatorDescriptors,
                                     defaultScheduler,
                                     name = "DeeployNetwork",
                                     deeployStateDir = _DEEPLOYSTATEDIR,
@@ -106,6 +108,7 @@
                                     MockPlatform,
                                     inputTypes,
                                     MemPoolOptimizer,
+                                    defaultOperatorDescriptors,
                                     defaultScheduler,
                                     name = "DeeployNetwork",
                                     deeployStateDir = _DEEPLOYSTATEDIR,
@@ -121,6 +124,7 @@
                                     MockPlatform,
                                     inputTypes,
                                     GenericOptimizer,
+                                    defaultOperatorDescriptors,
                                     defaultScheduler,
                                     name = "DeeployNetworkMock",
                                     deeployStateDir = _DEEPLOYSTATEDIRMOCK,
@@ -136,6 +140,7 @@
                                     MockPlatform,
                                     inputTypes,
                                     PULPOptimizer,
+                                    defaultOperatorDescriptors,
                                     defaultScheduler,
                                     name = "DeeployNetworkMock",
                                     deeployStateDir = _DEEPLOYSTATEDIRMOCK,
diff --git a/DeeployTest/testRunner_siracusa_l3dma.py b/DeeployTest/testRunner_siracusa_l3dma.py
index b70d8dda2..937f7e9b2 100644
--- a/DeeployTest/testRunner_siracusa_l3dma.py
+++ b/DeeployTest/testRunner_siracusa_l3dma.py
@@ -6,15 +6,16 @@
 import numpy as np
 from testUtils.codeGenerate import generateTestNetwork
-from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, MemcpyTypeChecker, generate_graph, \
-    memcpyTemplate, prepare_deployer_with_custom_tiling, setup_pulp_deployer
+from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, generate_graph, memcpyTemplate, \
+    prepare_deployer_with_custom_tiling, setup_pulp_deployer
 from testUtils.testRunner import TestRunner, TestRunnerArgumentParser
 from testUtils.typeMapping import baseTypeFromName, dtypeFromDeeployType
 
 from Deeploy.AbstractDataTypes import PointerClass
 from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
     MemoryManagementGeneration
-from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, _NoVerbosity
+from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes
+from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, NodeTypeChecker, _NoVerbosity
 from Deeploy.Targets.PULPOpen.Bindings import L3MemoryAwareFunctionCallClosure, TilingCallClosure
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
 from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack
@@ -74,8 +75,11 @@
     MemoryManagementGeneration(),
 ])
 
-binding = NodeBinding(MemcpyTypeChecker(), memcpyTemplate, transformer)
-tilingReadyBindings = TilingReadyNodeBindings([binding], MemcpyTileConstraint())
+bindings = [
+    NodeBinding(NodeTypeChecker([PointerClass(ty)], [PointerClass(ty)]), memcpyTemplate, transformer)
+    for ty in IntegerDataTypes + FloatDataTypes
+]
+tilingReadyBindings = TilingReadyNodeBindings(bindings, MemcpyTileConstraint())
 memcpyMapper = NodeMapper(MemcpyParser(), tilingReadyBindings)
 memcpyMapping = {"Memcpy": MemcpyLayer([memcpyMapper])}
 deployer.Platform.engines[0].Mapping.update(memcpyMapping)
diff --git a/DeeployTest/testRunner_siracusa_mchandma.py b/DeeployTest/testRunner_siracusa_mchandma.py
index 56ed6f5a1..aeb407d7e 100644
--- a/DeeployTest/testRunner_siracusa_mchandma.py
+++ b/DeeployTest/testRunner_siracusa_mchandma.py
@@ -6,15 +6,16 @@
 import numpy as np
 from testUtils.codeGenerate import generateTestNetwork
-from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, MemcpyTypeChecker, generate_graph, \
-    memcpyTemplate, prepare_deployer_with_custom_tiling, setup_pulp_deployer
+from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, generate_graph, memcpyTemplate, \
+    prepare_deployer_with_custom_tiling, setup_pulp_deployer
 from testUtils.testRunner import TestRunner, TestRunnerArgumentParser
 from testUtils.typeMapping import baseTypeFromName, dtypeFromDeeployType
 
 from Deeploy.AbstractDataTypes import PointerClass
 from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
     MemoryManagementGeneration
-from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, _NoVerbosity
+from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes
+from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, NodeTypeChecker, _NoVerbosity
 from Deeploy.Targets.PULPOpen.Bindings import MemoryAwareFunctionCallClosure, TilingCallClosure
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
 from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma
@@ -75,8 +76,11 @@
     MemoryManagementGeneration(),
 ])
 
-binding = NodeBinding(MemcpyTypeChecker(), memcpyTemplate, transformer)
-tilingReadyBindings = TilingReadyNodeBindings([binding], MemcpyTileConstraint())
+bindings = [
+    NodeBinding(NodeTypeChecker([PointerClass(ty)], [PointerClass(ty)]), memcpyTemplate, transformer)
+    for ty in IntegerDataTypes + FloatDataTypes
+]
+tilingReadyBindings = TilingReadyNodeBindings(bindings, MemcpyTileConstraint())
 memcpyMapper = NodeMapper(MemcpyParser(), tilingReadyBindings)
 memcpyMapping = {"Memcpy": MemcpyLayer([memcpyMapper])}
 deployer.Platform.engines[0].Mapping.update(memcpyMapping)
diff --git a/DeeployTest/testRunner_snitch_dma.py b/DeeployTest/testRunner_snitch_dma.py
index 80073ac5e..ba42b433f 100644
--- a/DeeployTest/testRunner_snitch_dma.py
+++ b/DeeployTest/testRunner_snitch_dma.py
@@ -6,15 +6,16 @@
 import numpy as np
 from testUtils.codeGenerate import generateTestNetwork
-from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, MemcpyTypeChecker, generate_graph, \
-    memcpyTemplate, prepare_deployer_with_custom_tiling, setup_snitch_deployer
+from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, generate_graph, memcpyTemplate, \
+    prepare_deployer_with_custom_tiling, setup_snitch_deployer
 from testUtils.testRunner import TestRunner, TestRunnerArgumentParser
 from testUtils.typeMapping import baseTypeFromName, dtypeFromDeeployType
 
 from Deeploy.AbstractDataTypes import PointerClass
 from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
     MemoryManagementGeneration
-from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, _NoVerbosity
+from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes
+from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, NodeTypeChecker, _NoVerbosity
 from Deeploy.Targets.Snitch.Bindings import MemoryAwareFunctionCallClosure, TilingCallClosure
 from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling
 from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterSynch import SnitchSynchCoresPass
@@ -80,8 +81,11 @@
     MemoryManagementGeneration(),
 ])
 
-binding = NodeBinding(MemcpyTypeChecker(), memcpyTemplate, transformer)
-tilingReadyBindings = TilingReadyNodeBindings([binding], MemcpyTileConstraint())
+bindings = [
+    NodeBinding(NodeTypeChecker([PointerClass(ty)], [PointerClass(ty)]), memcpyTemplate, transformer)
+    for ty in IntegerDataTypes + FloatDataTypes
+]
+tilingReadyBindings = TilingReadyNodeBindings(bindings, MemcpyTileConstraint())
 memcpyMapper = NodeMapper(MemcpyParser(), tilingReadyBindings)
 memcpyMapping = {"Memcpy": MemcpyLayer([memcpyMapper])}
 deployer.Platform.engines[0].Mapping.update(memcpyMapping)
diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py
index 878bc4201..5a4774a44 100644
--- a/DeeployTest/testUtils/codeGenerate.py
+++ b/DeeployTest/testUtils/codeGenerate.py
@@ -2,11 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 import os
 from typing import List, Tuple
 
 import numpy as np
 
+from Deeploy.AbstractDataTypes import FloatImmediate, IntegerImmediate
 from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, NetworkDeployer, VariableBuffer
 from Deeploy.Targets.MemPool.Platform import MemPoolPlatform
 from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPPlatform
@@ -30,6 +32,46 @@ def _shapeBroadcast(ctxt, value, name):
     return broadcastNum
 
 
+def generateArray(name: str, buffer: VariableBuffer, values: np.ndarray) -> str:
+    assert math.prod(buffer.shape) == math.prod(values.shape), \
+        f"Buffer size ({math.prod(buffer.shape)}) and values size ({math.prod(values.shape)}) are not equal."
+    refTy = buffer._type.referencedType
+
+    values = values.flatten()
+
+    if issubclass(refTy, FloatImmediate):
+        if refTy.typeWidth == 32:
+            suffix = "f"
+        elif refTy.typeWidth == 64:
+            suffix = ""
+        else:
+            raise RuntimeError(
+                f"Unimplemented floating-point literal suffix for type {refTy.typeName} of typeWidth {refTy.typeWidth}")
+
+        def formatFloat(x: float, suffix: str = "") -> str:
+            if np.isinf(x) or np.isnan(x):
+                return str(x)
+            else:
+                return str(x) + suffix
+
+        list_str = ",".join(formatFloat(x, suffix) for x in values)
+    elif issubclass(refTy, IntegerImmediate):
+        suffix = "u" if refTy.typeMin >= 0 else ""
+        suffix += "l" if refTy.typeWidth >= 64 else ""
+        list_str = ",".join(str(int(x)) + suffix for x in values)
+    else:
+        list_str = ",".join(str(x) for x in values)
+
+    # WIESEP: Arrays have to be 4 byte aligned (at least in banshee)
+    total_bytes = (values.size * refTy.typeWidth) // 8
+    pad_bytes = (-total_bytes) % 4
+    if pad_bytes:
+        paddingElements = (pad_bytes * 8 + refTy.typeWidth - 1) // refTy.typeWidth
+        list_str += ", " + (", ").join("0" for _ in range(paddingElements))
+
+    return f"{refTy.typeName} {name}[] = {{ {list_str} }};\n"
+
+
 def generateTestInputsHeader(deployer: NetworkDeployer, test_inputs: List) -> str:
     vectors = []
     retStr = ""
@@ -44,69 +86,44 @@ def generateTestInputsHeader(deployer: NetworkDeployer, test_inputs: List) -> st
         if not deployer.ctxt.is_buffer(bufferName):
             continue
 
-        values = _shapeBroadcast(deployer.ctxt, values, bufferName)
-
         buffer = deployer.ctxt.lookup(bufferName)
-        typeName = buffer._type.referencedType.typeName
-        typeWidth = buffer._type.referencedType.typeWidth
+        assert isinstance(buffer, VariableBuffer)
+
+        bufferSize = math.prod(buffer.shape)
+        valuesSize = math.prod(values.shape)
+        assert bufferSize % valuesSize == 0, \
+            f"Values shape {values.shape} of size {valuesSize} cannot be repeated into buffer of shape {buffer.shape} and size {bufferSize}."
+        repeat = bufferSize // valuesSize
+        values = np.tile(values, repeat)
 
         vectorName = f"testInputVector{index}"
+        retStr += generateArray(vectorName, buffer, values)
         vectors.append(vectorName)
 
-        retStr += f"{typeName} {vectorName}[] ="
-        retStr += "{"
-        if typeName == 'float32_t':
-            list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values])
-        else:
-            list_str = (", ").join([str(x) for x in values])
-
-        # WIESEP: Arrays have to be 4 byte aligned (at least in banshee)
-        total_bytes = (values.size * typeWidth) // 8
-        pad_bytes = (-total_bytes) % 4
-        if pad_bytes:
-            paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth
-            list_str += ", " + (", ").join("0" for _ in range(paddingElements))
-
-        retStr += list_str
-        retStr += "};\n"
-
     retStr += f"void* testInputVector[{len(vectors)}] = {{"
-    retStr += ", ".join(vectors)
+    retStr += ",".join(vectors)
     retStr += "};\n"
 
     return retStr
 
 
 def generateTestOutputsHeader(deployer: NetworkDeployer, test_outputs: List[np.ndarray]) -> str:
+    vectors = []
     retStr = ""
 
     for index, values in enumerate(test_outputs):
-        typeName = deployer.ctxt.lookup(f'output_{index}')._type.referencedType.typeName
-        typeWidth = deployer.ctxt.lookup(f'output_{index}')._type.referencedType.typeWidth
+        buffer = deployer.ctxt.lookup(f"output_{index}")
+        assert isinstance(buffer, VariableBuffer)
+        refTy = buffer._type.referencedType
 
-        retStr += f"#define OUTPUTTYPE {typeName}\n"
-        retStr += f"#define ISOUTPUTFLOAT {int(typeName == 'float32_t')}\n"
-        retStr += f"{typeName} testOutputVector{index}[] ="
-        retStr += "{"
+        retStr += f"#define OUTPUTTYPE {refTy.typeName}\n"
+        retStr += f"#define ISOUTPUTFLOAT {int(refTy.typeName == 'float32_t')}\n"
 
-        values = values.flatten()
-
-        if typeName == "float32_t":
-            list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values])
-        else:
-            list_str = (", ").join([str(x) for x in values])
-
-        # WIESEP: Arrays have to be 4 byte aligned (at least in banshee)
-        total_bytes = (len(values) * typeWidth) // 8
-        pad_bytes = (-total_bytes) % 4
-        if pad_bytes:
-            paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth
-            list_str += ", " + (", ").join("0" for _ in range(paddingElements))
-
-        retStr += list_str
-        retStr += "};\n"
+        vectorName = f"testOutputVector{index}"
+        retStr += generateArray(vectorName, buffer, values)
+        vectors.append(vectorName)
 
-    retStr += f"void* testOutputVector[{len(test_outputs)}] = " + "{"
-    retStr += ", ".join([f"testOutputVector{idx}" for idx, _ in enumerate(test_outputs)])
+    retStr += f"void* testOutputVector[{len(vectors)}] = {{"
+    retStr += ",".join(vectors)
     retStr += "};\n"
 
     return retStr
diff --git a/DeeployTest/testUtils/dmaUtils.py b/DeeployTest/testUtils/dmaUtils.py
index 3266ce512..09ce4ef02 100644
--- a/DeeployTest/testUtils/dmaUtils.py
+++ b/DeeployTest/testUtils/dmaUtils.py
@@ -10,8 +10,8 @@
 from Deeploy.AbstractDataTypes import BaseType, Pointer, PointerClass
 from Deeploy.CommonExtensions.DataTypes import minimalIntegerType
-from Deeploy.DeeployTypes import NetworkContext, NetworkDeployer, NodeParser, NodeTemplate, NodeTypeChecker, \
-    ONNXLayer, OperatorRepresentation, VariableBuffer
+from Deeploy.DeeployTypes import IoDesc, NetworkContext, NetworkDeployer, NodeParser, NodeTemplate, ONNXLayer, \
+    OperatorDescriptor, OperatorRepresentation, VariableBuffer
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
 from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper, \
     MemoryPlatformWrapper
@@ -35,28 +35,6 @@
 """)
 
 
-# Same interface as NodeTypeChecker but allow any input type and the
-# output type matches the input type.
-class MemcpyTypeChecker(NodeTypeChecker):
-
-    def __init__(self):
-        super().__init__([], [])
-
-    def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node,
-                        operatorRepresentation: OperatorRepresentation) -> NetworkContext:
-        assert len(node.inputs) == 1 and len(node.outputs) == 1
-        buffer_in = ctxt.lookup(node.inputs[0].name)
-        ctxt.annotateType(node.outputs[0].name, buffer_in._type)
-        return ctxt
-
-    def typeCheckNodeInputs(self, ctxt: NetworkContext, node: gs.Node) -> bool:
-        return True
-
-    def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext:
-        # Whatever it has already annotated, it's good
-        return ctxt
-
-
 class MemcpyTileConstraint(TileConstraint):
 
     @classmethod
@@ -279,6 +257,17 @@ def defaultScheduler(graph: gs.Graph) -> List[List[gs.Node]]:
     return [[node] for node in graph.nodes]
 
 
+memcpyDesc = OperatorDescriptor(
+    inputDescriptor = IoDesc("src"),
+    outputDescriptor = IoDesc("dest"),
+    attrDescriptors = [],
+)
+
+dmaTestOperatorDescriptors = {
+    "Memcpy": memcpyDesc,
+}
+
+
 def setup_pulp_deployer(defaultMemory: str, targetMemory: str, graph: gs.Graph, inputTypes: Dict[str, Type[Pointer]],
                         doublebuffer: bool, deeployStateDir: str) -> NetworkDeployer:
     L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000)
@@ -299,6 +288,7 @@ def setup_pulp_deployer(defaultMemory: str, targetMemory: str, graph: gs.Graph,
                                        platform,
                                        inputTypes,
                                        PULPOptimizer,
+                                       dmaTestOperatorDescriptors,
                                        defaultScheduler,
                                        default_channels_first = True,
                                        deeployStateDir = deeployStateDir)
@@ -340,6 +330,7 @@ def setup_snitch_deployer(defaultMemory: str, targetMemory: str, graph: gs.Graph
                                          platform,
                                          inputTypes,
                                          SnitchOptimizer,
+                                         dmaTestOperatorDescriptors,
                                          defaultScheduler,
                                          deeployStateDir = deeployStateDir)
     memoryLevelAnnotationPasses = [AnnotateIOMemoryLevel(defaultMemory), AnnotateDefaultMemoryLevel(memoryHierarchy)]
diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py
index 48c577790..d02c3da64 100644
--- a/DeeployTest/testUtils/platformMapping.py
+++ b/DeeployTest/testUtils/platformMapping.py
@@ -7,9 +7,10 @@
 import onnx_graphsurgeon as gs
 
 from Deeploy.AbstractDataTypes import Pointer
-from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer
+from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, OperatorDescriptor, TopologyOptimizer
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
 from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
+from Deeploy.OperatorDescriptor import defaultOperatorDescriptors
 from Deeploy.Targets.Chimera.Deployer import ChimeraDeployer
 from Deeploy.Targets.Chimera.Platform import ChimeraOptimizer, ChimeraPlatform
 from Deeploy.Targets.CortexM.Deployer import CMSISDeployer
@@ -93,6 +94,7 @@ def mapDeployer(platform: DeploymentPlatform,
                 graph: gs.Graph,
                 inputTypes: Dict[str, Type[Pointer]],
                 loweringOptimizer: Optional[TopologyOptimizer] = None,
+                operatorDescriptors: Optional[Dict[str, OperatorDescriptor]] = None,
                 scheduler: Optional[Callable] = None,
                 name: Optional[str] = None,
                 default_channels_first: Optional[bool] = None,
@@ -108,6 +110,9 @@ def mapDeployer(platform: DeploymentPlatform,
     if name is None:
         name = "DeeployNetwork"
 
+    if operatorDescriptors is None:
+        operatorDescriptors = defaultOperatorDescriptors
+
     if isinstance(platform, CMSISPlatform):
 
         if loweringOptimizer is None:
@@ -120,6 +125,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
@@ -138,6 +144,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
@@ -156,6 +163,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
@@ -177,6 +185,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
@@ -195,6 +204,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
@@ -212,6 +222,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
@@ -228,6 +239,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,
@@ -244,6 +256,7 @@
                                 platform,
                                 inputTypes,
                                 loweringOptimizer,
+                                operatorDescriptors,
                                 scheduler,
                                 name = name,
                                 default_channels_first = default_channels_first,