diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7c6a5f754..f59f7fa88 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -53,7 +53,15 @@ jobs: testBacktracking testFloatAdder testFloatGEMM + testFloat2DConvolution + testFloat2DConvolutionBias + testFloat2DConvolutionZeroBias + + testFloat2DDWConvolution + testFloat2DDWConvolutionBias + testFloat2DDWConvolutionZeroBias + testFloatLayerNorm testFloatRelu testFloatMaxPool @@ -64,6 +72,7 @@ jobs: Quant Dequant testFloatReduceSum + testFloatReshapeWithSkipConnection testFloatSoftmaxGrad testFloatSoftmaxCrossEntropy testFloatSoftmaxCrossEntropyGrad @@ -87,4 +96,5 @@ jobs: CCT/CCT_1_16_16_8 CCT/CCT_2_32_32_128_Opset20 testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8 + testFloatDemoTinyViT num-cores: 8 diff --git a/CHANGELOG.md b/CHANGELOG.md index faf4de42c..fc7269587 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- TinyViT on non-tiled Siracusa [#117](https://github.com/pulp-platform/Deeploy/pull/117) - Support Fully Asynchronous DMAs [#114](https://github.com/pulp-platform/Deeploy/pull/114) - Disallow shape inference [#128](https://github.com/pulp-platform/Deeploy/pull/128) - Remove memory-aware node bindings [#123](https://github.com/pulp-platform/Deeploy/pull/123) @@ -24,6 +25,13 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Fix bias hoisting in generic GEMM with no bias [#126](https://github.com/pulp-platform/Deeploy/pull/126) ### Added +- PULP 2D FP DW conv Im2Col template and kernel, with bias support. +- Bias support for PULP 2D FP regular conv Im2Col in template & kernel. +- PULP FP DW conv 2D parser. 
+ - FP conv 2D (simple & DW), reshape & skip connection, and TinyViT demo tests to the non-tiled Siracusa CI pipeline. +- FP bindings and mappings for PULP slice, DW conv 2D, and reduce mean operations. +- FP PULP DW conv lowering optimization pass similar to the existing one for the integer version. +- RemoveEmptyConvBiasPass to the PULP optimizer. - Add manual type inference feature (CLI: `--input-type-map`/`--input-offset-map`) to resolve ambiguities when test inputs are not representative enough - Added a `testTypeInferenceDifferentTypes` test case to validate type inference for different input types - Added `_mangleNodeNames` function to avoid duplicate node mappings @@ -60,6 +68,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` ### Changed +- Reduced size of reshape & skip connection test, for non-tiled Siracusa memory compatibility. - Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. - mchan HAL is now reduced to bare-bones - refactor of the IntrospectiveCodeTransformation to work on the Mako template @@ -97,6 +106,9 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode ### Fixed +- Fixed bug for non-batched elements in the PULPOpen FP GEMM and matmul templates. +- Added underscore to the beginning of closure names to avoid naming issues when they start with unsupported first characters (like numbers). +- Data types in the PULPOpen FP add and mul templates. - Prevent node duplication for graphs generated via GraphSurgeon - Resolved issue with missing `id` in the `Build Cache for Docker` step, used in the `Inject build-cache` step. 
- Fix license CI check and prevent potential issues with `jq` installation @@ -185,9 +197,9 @@ This release containing major architectural changes, new platform support, enhan ### Added -- BatchNorm kernel -- ConvTranspose kernel -- MaxPool1D kernel +- BatchNorm kernel +- ConvTranspose kernel +- MaxPool1D kernel - Template for 1D Convolution - Support for float32 data type in the previous kernels - Float binding for Pad1D kernel @@ -326,7 +338,7 @@ This release containing major architectural changes, new platform support, enhan ### Changed - FloatConvTemplate file -- Platform.py file +- Platform.py file - Bump the CMake version to 3.24 as required for the chimera-sdk - Bump GVSoC's version and add chimera simulation target - Rename the generic source util to utils to avoid name collision with chimera-sdk diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py index c5f9c883a..41073ad64 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py @@ -155,7 +155,8 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - self.closureName = name + self.closureSuffix + # Prepend underscore to avoid name issues when beginning with problematic characters (like numbers) + self.closureName = "_" + name + self.closureSuffix self.functionCall = executionBlock.generate(ctxt) self._generateClosureStruct(ctxt, executionBlock) ctxt = self._generateClosureCtxt(ctxt, name) diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index 4f6dba382..c05ea3b9d 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -87,11 +87,11 @@ class float64_t(FloatImmediate): SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] 
= (int8_t, int16_t, int32_t, int64_t) UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t) -IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (sorted(( - *SignedIntegerDataTypes, - *UnsignedIntegerDataTypes, -), - key = lambda _type: _type.typeWidth)) +IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = tuple( + sorted(( + *SignedIntegerDataTypes, + *UnsignedIntegerDataTypes, + ), key = lambda _type: _type.typeWidth)) FloatDataTypes: Tuple[Type[FloatImmediate], ...] = (bfloat16_t, float16_t, float32_t, float64_t) diff --git a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py index f07fe57c9..a8f27b546 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Union - import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkContext, NetworkDeployer, ONNXLayer, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, DeploymentEngine, NetworkContext, NetworkDeployer, _NoVerbosity class NetworkDeployerWrapper(NetworkDeployer): @@ -68,8 +66,8 @@ def generateBufferAllocationCode(self) -> str: return self._innerObject.generateBufferAllocationCode() # MultiEngineDeployer augment - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: - return self._innerObject._mapNode(node) + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: + return self._innerObject._selectEngine(node) def _printMemorySummary(self): return self._innerObject._printMemorySummary() diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 8c2f5d248..5ccfb7dcf 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -325,7 +325,7 @@ def fromNode(cls, node: gs.Node): return (cls(name = node.name, shape = 
node.shape if not isinstance(node, gs.Constant) else node.values.shape)) def has_live_aliases(self, ctxt: NetworkContext) -> bool: - """Checks whether this VariableBuffer has any live ancestors, i.e. buffers that are still live and are aliased by this buffer. + """Checks whether this VariableBuffer has any live aliases, i.e. buffers that are still live and are aliased by this buffer. Parameters ---------- ctxt : NetworkContext @@ -333,7 +333,7 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: Returns ------- bool - True if this VariableBuffer has any live ancestors, False otherwise + True if this VariableBuffer has any live aliases, False otherwise """ # Do a breadth-first search across the aliasing double-linked list live = self._live @@ -2562,10 +2562,10 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): self.ctxt = layer.codeTransform(self.ctxt, verbose) self.transformed = True - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: for engine in self.Platform.engines: if node.op in engine.Mapping: - return engine.Mapping[node.op](node) + return engine raise RuntimeError(f"No mapping found for node {node.name} with op type {node.op}") def _bindLayers(self): @@ -2582,7 +2582,8 @@ def _bindLayers(self): flatSchedule += subGraph for node in flatSchedule: - layer = self._mapNode(node) + engine = self._selectEngine(node) + layer = engine.Mapping[node.op](node) if isinstance(layer, ONNXLayer): log.debug(f" {SUCCESS_MARK} Bind {node.name} to layer {layer.__class__.__name__}") self.layerBinding[layer.node.name] = layer diff --git a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py index 4b05ab5be..570363b9a 100644 --- a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py +++ b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py @@ -2,13 +2,13 @@ # # 
SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Type, Union +from typing import Callable, Dict, Type import onnx_graphsurgeon as gs from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper -from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, Schedule, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentEngine, DeploymentPlatform, NetworkDeployer, Schedule, TopologyOptimizer from Deeploy.EngineExtension.OptimizationPasses.TopologyOptimizationPasses.EngineColoringPasses import \ EngineColoringPass, EngineMapper @@ -48,14 +48,14 @@ def lower(self, graph: gs.Graph) -> gs.Graph: ) == 0, f"Missing engine color for nodes {[node.name for node in uncoloredNodes]} with operations {uncoloredOperations}" return graph - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: assert "engine" in node.attrs, f"Node {node.name} doesn't have an engine color." engineName = node.attrs["engine"] assert isinstance(engineName, str) and engineName in self.engineDict, \ f"Node {node.name} has an invalid engine {engineName} assigned." engine = self.engineDict[engineName] assert node.op in engine.Mapping, f"No mapping found for {node.op} in engine {engine.name}" - return engine.Mapping[node.op](node) + return engine class EngineColoringDeployerWrapper(EngineColoringDeployer, NetworkDeployerWrapper): diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py index 005b0b889..e4d164f6a 100644 --- a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py @@ -8,6 +8,7 @@ class _FloatReduceMeanTemplate(NodeTemplate): + # WARNING: Currently only supports single axis reducing! 
def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py index 93d884eb8..67a476ca6 100644 --- a/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py @@ -8,6 +8,7 @@ class _ReduceMeanTemplate(NodeTemplate): + # WARNING: Currently only supports single axis reducing! def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Templates/SliceTemplate.py b/Deeploy/Targets/Generic/Templates/SliceTemplate.py index 3ffaa4621..5797c9ba6 100644 --- a/Deeploy/Targets/Generic/Templates/SliceTemplate.py +++ b/Deeploy/Targets/Generic/Templates/SliceTemplate.py @@ -10,6 +10,7 @@ class _SliceTemplate(NodeTemplate): + # WARNING: Currently only supports single axis slicing! def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 9ff940b2f..cc81527f3 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -9,13 +9,13 @@ from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration, MemoryPassthroughGeneration -from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t, \ - uint8_t +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ + int8_t, int32_t, int64_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation 
import FutureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ - GatherTemplate, QuantTemplate, RQSiGELUTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \ + FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ @@ -27,11 +27,11 @@ from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, \ - FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, \ - FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, \ - ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ - SliceTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ + FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ + FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \ + MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ + 
SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker @@ -148,16 +148,24 @@ PointerClass(uint8_t), PointerClass(uint8_t), PointerClass(uint8_t) - ], [PULPDMAFuture(underlyingType = type)]), SliceTemplate.referenceTemplate, MemoryAwareForkTransformer) + ], [PULPDMAFuture(underlyingType = type)]), DMASliceTemplate.referenceTemplate, MemoryAwareForkTransformer) for type in IntegerDataTypes ] +PULPSliceBindings = [ + NodeBinding( + SliceChecker([ + PointerClass(type), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t) + ], [PointerClass(type)]), SliceTemplate.referenceTemplate, ForkTransformer) for type in FloatDataTypes +] + PULPReshapeBindings = [ - NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int32_t)], [PointerClass(type)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes -] + [ - NodeBinding(ReshapeChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int64_t)], [PointerClass(type)]), + ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes ] PULPRQAddBindings = [ @@ -225,6 +233,14 @@ ForkTransformer) ] +PULPFloatDWConv2DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(float_type), PointerClass(float_type), + PointerClass(float_type)], [PointerClass(float_type)]), FloatConvTemplate.referenceDW2DIm2ColTemplate, + ForkTransformer) for float_type in FloatDataTypes +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -276,6 +292,11 @@ PULPReduceMeanBindings = [ 
NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), + FloatReduceMeanTemplate.referenceTemplate, ClusterTransformer) + for integer_type in SignedIntegerDataTypes + for float_type in FloatDataTypes ] PULPReduceSumBindings = [ diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py index 86bf02e57..bceea01f4 100644 --- a/Deeploy/Targets/PULPOpen/Deployer.py +++ b/Deeploy/Targets/PULPOpen/Deployer.py @@ -15,6 +15,7 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import RQAddTransposeSquashPass _L3AllocTemplate = NodeTemplate(""" @@ -63,7 +64,15 @@ def __init__(self, self.extNameCount = 0 - def bind(self): + def annotateNCores(self) -> None: + for layer in self.layerBinding.values(): + node = layer.node + engine = self._selectEngine(node) + opRepr = layer.mapper.parser.operatorRepresentation + if isinstance(engine, PULPClusterEngine): + opRepr["n_cores"] = engine.n_cores + + def bind(self) -> bool: # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. # SCHEREMO: The BindingOptimizationPass system is fairly fragile; # it was designed this way because implementing further topology optimizations after @@ -71,11 +80,16 @@ def bind(self): # but if there is only very few cases, this solution is okay. 
autoTransposePass = AutoTransposeMergePass() #self.ctxt, self.layerBinding = autoTransposePass.apply(self.ctxt, self.graph, self.layerBinding) + + # LMACAN: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. + self.annotateNCores() + # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. - ret = super().bind() - if ret: - self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") - return ret + if not super().bind(): + return False + + self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") + return True def _l3ConstBuffer(self) -> List[VariableBuffer]: return [ diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index e94af6e42..ab99fcabc 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -72,24 +72,24 @@ def parseNode(self, node: gs.Node) -> (bool): wellFormed = super().parseNode(node) if wellFormed: ret = all([ - # Make sure padding is square + # Current PULP kernel only supports grouping of 1 self.operatorRepresentation['group'] == 1, + + # Make sure padding is square self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], - len(node.inputs) == 2 + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], ]) - self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) - self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) - self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + # Extract additional attributes 
self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) - self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) return ret return False @@ -102,11 +102,86 @@ def parseNodeCtxt(self, newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) if ret: + # Set inputs names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + return newCtxt, True return ctxt, False +class PULPFPDWConv2DParser(Conv2DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + # Parse root conv 2D information + wellFormed = super().parseNode(node) + + if wellFormed: + # Check if the node is a depthwise convolution + ret = all([ + # Make sure padding is square + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], + ]) + + # Extract additional attributes + self.operatorRepresentation['padding_y_top'] = 
int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return ret + return False + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Parse node context for 2D conv + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + # Define input names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + # Map input nodes to operator representation + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + # Check if DW + if self.operatorRepresentation['group'] == self.operatorRepresentation['ch_im_in']: + return newCtxt, True + + return ctxt, False + + class PULPDWConv1DParser(RQSConv1DParser): def __init__(self, noBiasHoisting = True): diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 99c1c9335..133670da0 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -5,6 +5,8 @@ import numpy as np import onnx_graphsurgeon as gs +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + RemoveEmptyConvBiasPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, 
MemoryLevel @@ -27,20 +29,22 @@ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \ - PULPDMASliceBindings, PULPDWConv1DBinding, PULPReduceMeanBindings + PULPDMASliceBindings, PULPDWConv1DBinding from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ - PULPDWConv2DParser, PULPFPConv2DParser, PULPGEMMParser, PULPMatrixVecParser, PULPTallGEMMParser + PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ + PULPTallGEMMParser from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ - PULPConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUTilingReadyBindings, \ - PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, \ - PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \ - PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ + PULPFPGELUTilingReadyBindings, 
PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, \ + PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, \ + PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \ + PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \ + PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, \ + PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, \ + PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, \ + PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ @@ -61,7 +65,7 @@ RequantShiftMapper = NodeMapper(RequantShiftParser(), PULPRQSTilingReadyBindings) UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), PULPUniformRQSTilingReadyBindings) -ReduceMeanMapper = NodeMapper(ReduceMeanParser(), PULPReduceMeanBindings) +ReduceMeanMapper = NodeMapper(ReduceMeanParser(), PULPReduceMeanTilingReadyBindings) ReduceSumMapper = NodeMapper(ReduceSumParser(), PULPReduceSumTilingReadyBindings) MatMulMapper = NodeMapper(MatMulParser(), PULPMatMulTilingReadyBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) @@ -71,6 +75,7 @@ DWConv1DMapper = NodeMapper(PULPDWConv1DParser(), [PULPDWConv1DBinding]) FPConv2DMapper = NodeMapper(PULPFPConv2DParser(), PULPConv2DTilingReadyBindings) Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings) +FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), 
PULPDWConv2DTilingReadyBindings) DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings) GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings) FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings) @@ -85,7 +90,9 @@ ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) -SliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) +DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) + +SliceMapper = NodeMapper(SliceParser(), PULPSliceTilingReadyBindings) iRMSNormMapper = NodeMapper(iRMSNormParser(), PULPiRMSNormTilingReadyBindings) @@ -99,7 +106,7 @@ DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) PULPMapping = { - 'Conv': ConvLayer([FPConv2DMapper]), + 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]), @@ -125,7 +132,7 @@ 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 'Transpose': TransposeLayer([TransposeMapper]), 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), - 'Slice': SliceLayer([SliceMapper]), + 'Slice': SliceLayer([SliceMapper, DMASliceMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Concat': ConcatLayer([ConcatMapper]), 'iRMSNorm': iRMSNormLayer([iRMSNormMapper]), @@ -225,7 +232,8 @@ class PULPStructBuffer(StructBuffer): MergeConstAddAndRequantPass(), PULPGEMMRequantMergePass(), PULPMatMulRequantMergePass(), - PULPAddRequantMergePass() + PULPAddRequantMergePass(), + RemoveEmptyConvBiasPass(), ], name = "PULPOptimizer") @@ -237,8 +245,14 @@ class PULPStructBuffer(StructBuffer): class PULPClusterEngine(DeploymentEngine): - def __init__(self, name: str, Mapping = PULPMapping, initCode = "", includeList = _includeList) -> None: + def 
__init__(self, + name: str, + Mapping = PULPMapping, + initCode = "", + includeList = _includeList, + n_cores: int = 8) -> None: super().__init__(name, Mapping, initCode, includeList) + self.n_cores = n_cores class PULPPlatform(DeploymentPlatform): diff --git a/Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py b/Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py similarity index 100% rename from Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py rename to Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py index 7f1c2e21c..200ad1b9e 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -6,14 +6,14 @@ referenceTemplate = NodeTemplate(""" // Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); -int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); -int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); -int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); uint32_t i = ${nodeName}_chunk_start; -for (; i+5 < ${nodeName}_chunk_stop; i+=6) { +for (; i + 5 < ${nodeName}_chunk_stop; i += 6) { ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; ${data_out}[i+1] = ${data_in_1}[i+1] + ${data_in_2}[i+1]; ${data_out}[i+2] = 
${data_in_1}[i+2] + ${data_in_2}[i+2]; diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 29a216d72..bfa893db9 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -18,9 +18,13 @@ def __init__(self, templateStr): def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: - im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * - operatorRepresentation['dim_kernel_y']) + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // + 8) * operatorRepresentation["n_cores"] * operatorRepresentation[ + 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] def hoistTransientBuffers(self, ctxt: NetworkContext, @@ -34,6 +38,39 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [im2col_name] +class PULP2DFloatDWConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. 
+ im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[ + "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + # Manually set the type of the im2col buffer to match the input type, since it defaults to void for transient buffers + ctxt.lookup(im2col_name)._type.referencedType = ctxt.lookup( + operatorRepresentation['data_in'])._type.referencedType + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + reference2DTemplate = NodeTemplate(""" // 2D FP Conv HWC with ChannelOut parallelism (Name: ${nodeName}, Op: ${nodeOp}) @@ -47,6 +84,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${stride_y}, ${stride_x}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} ); @@ -66,15 +104,48 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( ref_${data_out}_${data_in}, - ${dim_im_in_y}, ${dim_im_in_x}, + ${dim_im_in_y}, ${ch_im_in}, ${weight}, ${ch_im_out}, - ${dim_kernel_y}, ${dim_kernel_x}, + ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${padding_y_top}, + 
${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${ctxtBuffer} + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} +""") + +referenceDW2DIm2ColTemplate = PULP2DFloatDWConvIm2ColTemplate(""" +// 2D DW FP Conv HWC with Im2Col and ChannelOout parallelism (Name: ${nodeName}, Op: ${nodeOp}) + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_DW_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_x}, + ${dim_im_in_y}, + ${ch_im_in}, + ${weight}, + ${ch_im_out}, + ${dim_kernel_x}, + ${dim_kernel_y}, ${stride_x}, + ${stride_y}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, @@ -86,4 +157,4 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index f4c22b2c2..d007e60df 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -24,9 +24,18 @@ ${transB} ); + % if A_batched: ref_${data_out}_${A} += ${M} * ${N}; + % endif + + % if B_batched: ref_${data_out}_${B} += ${N} * ${O}; + % endif + + % if C_batched: ref_${data_out}_${C} += ${M} * ${O}; + % endif + ref_${data_out}_${data_out} += ${M} * ${O}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py 
b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py index 11b7c9aa2..3cdf26097 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py @@ -8,8 +8,18 @@ // Matmul with row parallelism (Name: ${nodeName}, Op: ${nodeOp}) for(uint32_t b=0; b<${batch}; b++) { + % if A_batched: ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + % else: + ${A_type.typeName} batch_A = ${A}; + % endif + + % if B_batched: ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + % else: + ${B_type.typeName} batch_B = ${B}; + % endif + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; PULP_MatMul_fp32_fp32_fp32_unroll1x7( diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index 2f202b24d..ced6c3cbc 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -7,11 +7,11 @@ referenceTemplate = NodeTemplate(""" // Float Mul with parallelism and 6x unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); +uint32_t ${nodeName}_core_id = pi_core_id(); +uint32_t ${nodeName}_log2Core = (uint32_t) log2(NUM_CORES); uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1)) != 0); -uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${size}); -uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${size}); +uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size}); if (${nodeName}_start < ${nodeName}_end) { float32_t ${nodeName}_scalar = ${B}[0]; diff --git a/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py index 
849f68eef..9dcea4256 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py @@ -8,6 +8,7 @@ class _ReduceMeanTemplate(NodeTemplate): + # WARNING: Currently only supports single axis reducing! def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py index 41c4b5366..a795a555e 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py @@ -25,10 +25,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer +from Deeploy.Targets.Generic.Templates.ReshapeTemplate import _ReshapeTemplate as _GenericReshapeTemplate -class _ReshapeTemplate(NodeTemplate): +class _ReshapeTemplate(_GenericReshapeTemplate): def __init__(self, templateStr): super().__init__(templateStr) @@ -36,19 +37,18 @@ def __init__(self, templateStr): def alignToContext(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - # SCHEREMO: Selectively mark 'indices' dead, since we don't need them - if 'indices' in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False - ctxt.globalObjects[operatorRepresentation['indices']]._live = False + ctxt, operatorRepresentation, _ = super().alignToContext(ctxt, operatorRepresentation) - # Same for "shape" - if "shape" in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False - ctxt.globalObjects[operatorRepresentation["shape"]]._live = False + # Get buffers + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) - inBuffer = 
ctxt.lookup(operatorRepresentation['data_in']) - outBuffer = ctxt.lookup(operatorRepresentation['data_out']) - outBuffer._alias = inBuffer.name + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # HACK: Tiling wasn't updated in the Fix aliasing PR so we have to still + # set the _alias argument + bufferOut._alias = bufferIn.name return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index c69760df5..e6819f81a 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar @@ -141,6 +141,7 @@ def serializeTilingSolution( operatorRepresentation, addrNames) varWeight = operatorRepresentation['weight'] + varIn = operatorRepresentation["data_in"] varOut = operatorRepresentation['data_out'] inputInCubes = [] @@ -182,9 +183,16 @@ def serializeTilingSolution( (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = cube.dims - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube( + kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightC, + outputCube = cube, + inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape, + ) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple replacements['dim_im_in_x'].append(InCube.dims[1]) @@ -230,6 +238,7 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw # Get to-be-tiled 
tensor's buffers inputBufferName = parseDict['data_in'] weightBufferName = parseDict['weight'] + biasBufferName = parseDict['bias'] outputBufferName = parseDict['data_out'] strides = parseDict["strides"] @@ -237,27 +246,38 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw dilation = parseDict["dilations"] # Add I/O dimensions to the model as variables - for bufferName in [inputBufferName, weightBufferName, outputBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) + for bufferName in [inputBufferName, weightBufferName, biasBufferName, outputBufferName]: + if bufferName != "NULL": + tilerModel.addTensorDimToModel(ctxt, bufferName) + # Handle input dimensions inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + # Handle weight dimensions weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + # Handle bias dimensions + if biasBufferName != "NULL": + biasChannelVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + + # Handle output dimensions outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + # Add constraints to the 
optimization problem of the tiler model # Map output dims to inputs dims tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + if biasBufferName != "NULL": + tilerModel.addConstraint(outputChannelVar == biasChannelVar) # Bias inputBuffer = ctxt.lookup(inputBufferName) @@ -317,9 +337,14 @@ def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, return symbolicParseDict @staticmethod - def computeInputCube(kernelShape: Tuple[int, int], pads: Tuple[int, int, int, int], strides: Tuple[int, int], - inputCSize: int, outputCube: HyperRectangle, - outputDims: Tuple[int, int, int]) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: + def computeInputCube( + kernelShape: Tuple[int, int], + pads: Tuple[int, int, int, int], + strides: Tuple[int, int], + inputCSize: int, + outputCube: HyperRectangle, + outputDims: Tuple[int, int, int], + inputDims: Optional[Tuple[int, int, int]] = None) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: (outputBatchOffset, outputHOffset, outputWOffset, outputCOffset) = outputCube.offset (outputBatchSize, outputHSize, outputWSize, outputCSize) = outputCube.dims @@ -338,8 +363,19 @@ def computeInputCube(kernelShape: Tuple[int, int], pads: Tuple[int, int, int, in inputHOffset = max(outputHOffset * strideH - padTop, 0) inputWOffset = max(outputWOffset * strideW - padLeft, 0) - inputHSize = outputHSize * strideH + (kernelShape[0] - 1) - (tilePadTop + tilePadBottom) - inputWSize = outputWSize * strideW + (kernelShape[1] - 1) - (tilePadLeft + tilePadRight) + if inputDims is not None: + # Compute input dimensions according to procedure described in PyTorch's Conv2D documentation + # Assuming worst case (cutting of (stride - 1) elements at the end of each dimension) + inputHSize = outputHSize * strideH + kernelShape[0] - (tilePadTop + tilePadBottom) - 1 + inputWSize = outputWSize * strideW + kernelShape[1] - (tilePadLeft + tilePadRight) - 1 
+ + # Mitigating all situations other than the worst case assumed earlier + inputHSize = min(inputHSize, inputDims[1]) + inputWSize = min(inputWSize, inputDims[2]) + else: + # Use previous version, compatible with RQ layers + inputHSize = outputHSize * strideH + (kernelShape[0] - 1) - (tilePadTop + tilePadBottom) + inputWSize = outputWSize * strideW + (kernelShape[1] - 1) - (tilePadLeft + tilePadRight) InCube = HyperRectangle((outputBatchOffset, inputHOffset, inputWOffset, 0), (outputBatchSize, inputHSize, inputWSize, inputCSize)) @@ -351,17 +387,34 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['data_in', 'weight', 'data_out'] - inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, - operatorRepresentation, addrNames) + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + # Extract required component information from operator representation varWeight = operatorRepresentation['weight'] + varBias = operatorRepresentation['bias'] + varIn = operatorRepresentation["data_in"] varOut = operatorRepresentation['data_out'] + # Prepare address names, also handling bias + if varBias != "NULL": + addrNames = ['data_in', 'weight', 'bias', 'data_out'] + else: + addrNames = ['data_in', 'weight', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare cube lists for components inputInCubes = [] inputWeightCubes = [] + inputBiasCubes = [] + + # 
Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function replacements: Dict[str, List[int]] = { "dim_im_in_x": [], "dim_im_in_y": [], @@ -386,23 +439,36 @@ def serializeTilingSolution( "padding_x_right": PointerClass(uint8_t) } + # Obtain weight dimensions weightH = ctxt.lookup(varWeight).shape[1] weightW = ctxt.lookup(varWeight).shape[2] weightC = ctxt.lookup(varWeight).shape[3] + # Obtain padding and striding information pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] + # Iterate throught the cubes in which the output will be split for tiling for cube in outputCubes: + # Obtain current cube offsets and dimensions (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = cube.dims - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) - + # Compute input cube + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube( + kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightC, + outputCube = cube, + inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape, + ) + + # Extract individual padding padding_left, padding_right, padding_top, padding_bottom = padding_tuple + # Add element information for the operator representation replacements['dim_im_in_x'].append(InCube.dims[1]) replacements['dim_im_in_y'].append(InCube.dims[2]) replacements['dim_im_out_x'].append(HSize) @@ -414,21 +480,37 @@ def serializeTilingSolution( replacements['padding_x_left'].append(padding_left) replacements['padding_x_right'].append(padding_right) + # Add input cube with tiling information to the corresponding list inputInCubes.append(InCube) + # Obtain and add weight cube with tiling information to the corresponding list WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, 
weightC)) - inputWeightCubes.append(WeightCube) + # Obtain and add bias cube with tiling information to the corresponding list, + # if bias exists + if varBias != "NULL": + BiasCube = HyperRectangle((COffset,), (CSize,)) + inputBiasCubes.append(BiasCube) + + # Prepare loading schedule lists inputLoadSchedule = [] outputLoadSchedule = [] - for a, b in zip(inputInCubes, inputWeightCubes): - inputLoadSchedule.append({"data_in": a, "weight": b}) + # Create input schedule lists, with bias handling + if varBias == "NULL": + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + else: + for a, b, c in zip(inputInCubes, inputWeightCubes, inputBiasCubes): + inputLoadSchedule.append({"data_in": a, "weight": b, "bias": c}) + # Create output schedule list for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py index 8d54eea43..71c9fec25 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py @@ -17,7 +17,7 @@ VariableReplacementScheme -class DWConv2DTileConstraint(TileConstraint): +class RQDWConv2DTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -233,3 +233,332 @@ def serializeTilingSolution( variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, 
tilingSchedule + + +class DWConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + ''' + This function adds geometrical constraints for a PULP Im2Col 2D DW Convolution Tilling. + ''' + + # ===== GET NECESSARY INFORMATION ===== + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + weightBufferName = parseDict['weight'] + biasBufferName = parseDict['bias'] + + im2colBufferName = parseDict['ctxtBuffer'] + + # Get other information + has_bias = False if parseDict['has_bias'] == "false" else True + + pads = parseDict['pads'] + strides = parseDict['strides'] + dilations = parseDict['dilations'] + group = parseDict['group'] + n_cores = parseDict['n_cores'] + + im2col_buffer_size = ctxt.lookup(im2colBufferName).size + weight_type_width = ctxt.lookup(weightBufferName)._type.typeWidth // 8 + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== + buffersOfInterest = [inputBufferName, outputBufferName, weightBufferName] + if has_bias: + buffersOfInterest.append(biasBufferName) + + for bufferName in buffersOfInterest: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # Input + # NHWC layout + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Output + # NHWC layout + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = 
tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Weight + # C_out - C_in - H - W layout (depthwise convolution weights, + # with c_in used for grouping different than number of channels) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + # Bias (C_out) + if has_bias: + biasDimVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + + # ===== ADD CONSTRAINTS ===== + # Add constraint for batch size match between input and output + tilerModel.addConstraint(inputBatchVar == outputBatchVar) + + # Add constraint for input width and height sizes match + # (Depends on output height and width, kernel size, padding, dilations, and strides. + # For more information on the connections, see ONNX and/or Torch Conv2D documentation). 
+ tilerModel.addConstraint(outputHeightVar == (((inputHeightVar + pads[0] + pads[2] - dilations[0] * + (weightHeightVar - 1) - 1) // strides[0]) + 1)) + tilerModel.addConstraint(outputWidthVar == (((inputWidthVar + pads[1] + pads[3] - dilations[1] * + (weightWidthVar - 1) - 1) // strides[1]) + 1)) + + # Add constraint for input channel size match + # (Depends on weight output channel and conv grouping) + tilerModel.addConstraint(inputChannelVar == (weightInChannelVar * group)) + + # Add constraint for weight output channels to match + # output number of channels + tilerModel.addConstraint(weightOutChannelVar == outputChannelVar) + + # Add constraint for bias size to match number of output channels + if has_bias: + tilerModel.addConstraint(biasDimVar == outputChannelVar) + + # Add constraint for size of im2col buffer to be equal to + # number of cores * width of weight data type * size of a single convolutional filter + tilerModel.addConstraint(im2col_buffer_size == (n_cores * weight_type_width * weightHeightVar * weightWidthVar)) + + # Add constraint for relationship between in and out number of channels + tilerModel.addConstraint((outputChannelVar % inputChannelVar) == 0) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # ===== GET NECESSARY INFORMATION ===== + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + weightBufferName = parseDict['weight'] + biasBufferName = parseDict['bias'] + + # Get other information + has_bias = False if parseDict['has_bias'] == "false" else True + + pads = parseDict['pads'] + strides = parseDict['strides'] + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== + buffersOfInterest = [inputBufferName, outputBufferName, weightBufferName] + if has_bias: + buffersOfInterest.append(biasBufferName) + + for bufferName in buffersOfInterest: + tilerModel.addTensorDimToModel(ctxt, 
bufferName) + + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # Input + # NHWC layout + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Output + # NHWC layout + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + + # Weight + # C_out - C_in - H - W layout (depthwise convolution weights, + # with c_in used for grouping different than number of channels) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + # Bias (C_out) + if has_bias: + biasDimVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + + # ===== ADD CONSTRAINTS ===== + # Workaround tiling issue with non-wordaligned accesses + if "L3" in ctxt.lookup(parseDict['data_in'])._memoryLevel: + tilerModel.addTileSizeDivisibleConstraint(parseDict, 'ch_im_in', inputChannelVar, 4) + + # Check that height and width of weights match the parsed values + tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) + tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightOutChannelVar == parseDict['ch_im_out']) + + # Check bias dimension + if biasBufferName != "NULL": + tilerModel.addConstraint(biasDimVar == parseDict["ch_im_out"]) + + # Constrain the minimum tile size such that at least one kernel can be applied + # Account for padding + tilerModel.addConstraint(outputHeightVar >= 1 + max([pads[0], pads[2]])) + tilerModel.addConstraint(outputWidthVar >= 1 + max([pads[1], pads[3]])) + 
+ tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x'] + pads[0], strategy = PerformanceHint(1)) + tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y'] + pads[1], strategy = PerformanceHint(1)) + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + symbolicParseDict = parseDict.copy() + symbolicParseDict['ch_im_in'] = tilerModel.getTensorDimVar(inputBuffer.name, 3) + symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 2) + symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 3) + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Extract required component information from operator representation + varIn = operatorRepresentation['data_in'] + varWeight = operatorRepresentation['weight'] + varBias = operatorRepresentation['bias'] + varOut = operatorRepresentation['data_out'] + + # Prepare address names, also handling bias + if varBias != "NULL": + addrNames = ['data_in', 'weight', 'bias', 'data_out'] + else: + addrNames = ['data_in', 'weight', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = 
cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare cube lists for components + inputInCubes = [] + inputWeightCubes = [] + inputBiasCubes = [] + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_out": [], + "ch_im_in": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [] + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + "ch_im_in": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t) + } + + # Obtain weight dimensions + # C_out - C_in - H - W layout (depthwise convolution weights, + # with c_in used for grouping different than number of channels) + weightC_in = ctxt.lookup(varWeight).shape[1] + weightH = ctxt.lookup(varWeight).shape[2] + weightW = ctxt.lookup(varWeight).shape[3] + + # Obtain padding and striding information + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + group = operatorRepresentation['group'] + + # Iterate through the cubes in which the output will be split for tiling + for cube in outputCubes: + # Obtain current cube offsets and dimensions + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + # Compute input cube + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube(kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightC_in * group, + outputCube = cube, + 
inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape) + + # Extract individual padding + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + # Extract InCube hyperrectangle + InCube = HyperRectangle((InCube.offset[0], InCube.offset[1], InCube.offset[2], InCube.offset[3]), + (InCube.dims[0], InCube.dims[1], InCube.dims[2], InCube.dims[3])) + + # Prepare weight cube + WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, InCube.dims[3] // group, weightH, weightW)) + + # Add element information for the operator representation + replacements['dim_im_in_x'].append(InCube.dims[1]) + replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) + replacements['dim_im_out_y'].append(WSize) + replacements['ch_im_out'].append(CSize) + replacements['ch_im_in'].append(InCube.dims[3]) + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + # Add computed cubes to the respective lists + inputInCubes.append(InCube) + inputWeightCubes.append(WeightCube) + + # Obtain and add bias cube with tiling information to the corresponding list, + # if bias exists + if varBias != "NULL": + BiasCube = HyperRectangle((COffset,), (CSize,)) + inputBiasCubes.append(BiasCube) + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Create input schedule lists, with bias handling + if varBias == "NULL": + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + else: + for a, b, c in zip(inputInCubes, inputWeightCubes, inputBiasCubes): + inputLoadSchedule.append({"data_in": a, "weight": b, "bias": c}) + + # Create output schedule list + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + # Prepare containing objects with information 
computed in this function regarding tiling schedule + # and variable replacement inside operator representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py index 8b795be88..db38b841a 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py @@ -19,60 +19,74 @@ class MatMulTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's buffers + # ===== GET NECESSARY INFORMATION ===== bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) outputBuffer = ctxt.lookup(name = parseDict['data_out']) - # Add I/O dimensions to the model as variables + tensorsShapeLenA = len(bufferA.shape) + tensorsShapeLenB = len(bufferB.shape) + tensorsShapeLenOutput = len(outputBuffer.shape) + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== for _buffer in [bufferA, bufferB, outputBuffer]: tilerModel.addTensorDimToModel(ctxt, _buffer.name) - tensorsShapeLen = len(bufferA.shape) - - AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) - ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) - BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) - outputFirstDimVar 
= tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) - outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) - - # Map output dims to inputs dims - for idx in range(tensorsShapeLen - 2): - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferA.name, dimIdx = idx)) - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferB.name, dimIdx = idx)) - - tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) - tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) - - # Add GEMM Geometrical constraints - tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # *Checks on whether dimensions are reversed via the transA and transB flags + # A dims + AMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 2) + parseDict['transA']) + AMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 1) - parseDict['transA']) + + # B dims + BMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 2) + parseDict['transB']) + BMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 1) - parseDict['transB']) + + # Output dims + outputMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 2)) + outputMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 1)) + + # ===== ADD CONSTRAINTS ===== + # Add batch constraints + if (bufferA.shape[:-2] == bufferB.shape[:-2]): + for idx in range(tensorsShapeLenA - 2): + 
tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = tensorsShapeLenA - 3 - idx)) + + for idx in range(tensorsShapeLenB - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = tensorsShapeLenB - 3 - idx)) + + # Add GEMM geometrical constraints + tilerModel.addConstraint(outputMatrixFirstDimVar == AMatrixFirstDimVar) + tilerModel.addConstraint(outputMatrixSecondDimVar == BMatrixSecondDimVar) + + tilerModel.addConstraint(AMatrixSecondDimVar == BMatrixFirstDimVar) return tilerModel @staticmethod def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Get input buffers and other required information bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) tensorsShapeLen = len(bufferA.shape) + # Get dimensions of interest from the 2 inputs ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) # VIC: We don't want to deal with intermediate results between kernel calls tilerModel.addConstraint(ASecondDimVar == parseDict['N']) @@ -85,28 +99,39 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # Get output cubes outputCubes = [cube.rectangle for cube in 
absoluteOutputCubes] + # Get names, optimizer variables, buffers, and other information for elements of interest addrNames = ['A', 'B', 'data_out'] inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) buffA = ctxt.lookup(operatorRepresentation['A']) buffB = ctxt.lookup(operatorRepresentation['B']) + buffOut = ctxt.lookup(operatorRepresentation['data_out']) + + tensorsShapeLenA = len(buffA.shape) + tensorsShapeLenB = len(buffB.shape) + tensorsShapeOutput = len(buffOut.shape) NSize = buffA.shape[-1] NOffset = 0 + # Prepare input cubes lists inputACubes = [] inputBCubes = [] + # Prepare replacements lists replacements = {"M": [], "O": [], "batch": []} # Every output is constructed by a pair of inputs. Reconstruct this pair. for cube in outputCubes: + # Get output dimensions MOffset, OOffset = cube.offset[-2:] MSize, OSize = cube.dims[-2:] + # Check that batch tiling is set up properly if len(cube.offset) > 2: BatchSize = math.prod(cube.dims[:-2]) @@ -117,35 +142,60 @@ def serializeTilingSolution( else: BatchSize = 1 + # Prepare cube dimensions replacements replacements["M"].append(MSize) replacements["O"].append(OSize) replacements["batch"].append(BatchSize) + # Compute A cube information + # Matrix offsets and shape AMatrixOffsets = (MOffset, NOffset) AMatrixShape = (MSize, NSize) - if len(buffA.shape) > 2: - batchDimCount = len(buffA.shape) - 2 - AMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + AMatrixOffsets - AMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + AMatrixShape - - ACube = HyperRectangle(AMatrixOffsets, AMatrixShape) + # Batch offset and shape (with broadcasting handling) + ABatchOffsets = list() + ABatchShape = list() + + for idx in range(tensorsShapeLenA - 2): + if buffA.shape[tensorsShapeLenA - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + ABatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + ABatchShape.append(cube.dims[len(cube.dims) - 3 - 
idx]) + else: + ABatchOffsets.append(0) + ABatchShape.append(1) + + ACube = HyperRectangle( + tuple(reversed(ABatchOffsets)) + tuple(AMatrixOffsets), + tuple(reversed(ABatchShape)) + tuple(AMatrixShape)) inputACubes.append(ACube) + # Compute B cube information + # Matrix offsets and shape BMatrixOffsets = (NOffset, OOffset) BMatrixShape = (NSize, OSize) - if len(buffB.shape) > 2: - batchDimCount = len(buffB.shape) - 2 - BMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + BMatrixOffsets - BMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + BMatrixShape - - BCube = HyperRectangle(BMatrixOffsets, BMatrixShape) + # Batch offset and shape (with broadcasting handling) + BBatchOffsets = list() + BBatchShape = list() + + for idx in range(tensorsShapeLenB - 2): + if buffB.shape[tensorsShapeLenB - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + BBatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + BBatchShape.append(cube.dims[len(cube.dims) - 3 - idx]) + else: + BBatchOffsets.append(0) + BBatchShape.append(1) + + BCube = HyperRectangle( + tuple(reversed(BBatchOffsets)) + tuple(BMatrixOffsets), + tuple(reversed(BBatchShape)) + tuple(BMatrixShape)) inputBCubes.append(BCube) + # Prepare load schedule lists for computed cubes inputLoadSchedule = [] outputLoadSchedule = [] + # Prepare replacements replacements["N"] = [NSize] * len(outputCubes) replacementTypes = { @@ -155,12 +205,14 @@ def serializeTilingSolution( "batch": PointerClass(int8_t) } + # Update load schedule lists for a, b in zip(inputACubes, inputBCubes): inputLoadSchedule.append({"A": a, "B": b}) for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + # Prepare tiling schedule object schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py 
b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py new file mode 100644 index 000000000..9ac444fc1 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +import numpy as np +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ReduceMeanTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Get necessary information + # Get I/O buffer names + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + # Get I/O shapes + outputShape = parseDict['data_out_shape'] + + # Get other necessary information + reduceAxes = parseDict['axes'] + keepDims = parseDict['keepdims'] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # Add constraints for the I/O dimensions + input_ax = 0 + for idx in range(len(outputShape)): + # Get current dimension variables + outputDimensionVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) + + if idx in reduceAxes: + # For reduced axes, constrain to 1 if keepdims is set, + # otherwise skip this axis in the input tensor, + # as it 
needs to be eliminated. + if keepDims: + tilerModel.addConstraint(outputDimensionVar == 1) + input_ax += 1 + else: + # Otherwise, input and output dimensions need to be equal + inputDimensionVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = input_ax) + + tilerModel.addConstraint(outputDimensionVar == inputDimensionVar) + + input_ax += 1 + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @staticmethod + def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict: Dict) -> HyperRectangle: + # Get required parameters + originalInputShape = parseDict['data_in_shape'] + keepDims = parseDict['keepdims'] + + # Start from the output cube dimensions and offsets + in_cube_dims = list(originalInputShape).copy() + in_cube_offset = [ + 0, + ] * len(in_cube_dims) + + # Iterate through input axes + out_idx = 0 + for ax in range(len(in_cube_dims)): + if ax in parseDict['axes']: + # This axis is reduced + if keepDims: + # Keepdims is set, so the output cube has a dimension here (which will be 1, as it's the reduction result) + out_idx += 1 + else: + # This axis is not reduced, so copy from output cube + in_cube_dims[ax] = outputCube.dims[out_idx] + in_cube_offset[ax] = outputCube.offset[out_idx] + out_idx += 1 + + return HyperRectangle(offset = tuple(in_cube_offset), dims = tuple(in_cube_dims)) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + # Prepare address names + 
addrNames = ['data_in', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function + replacements: Dict[str, List[int]] = { + "data_in_shape": [], + "data_out_shape": [], + "size": [], + } + + replacementTypes = { + "data_in_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "data_out_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "size": PointerClass(uint16_t), + } + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Iterate over output cubes to compute corresponding input cubes + for out_cube in [cube.rectangle for cube in absoluteOutputCubes]: + # Compute input cube + in_cube = ReduceMeanTileConstraint.computeInputCubeFromOutputCube(out_cube, + parseDict = operatorRepresentation) + + # Append replacement elements + replacements["data_in_shape"].append(list(in_cube.dims).copy()) + replacements["data_out_shape"].append(list(out_cube.dims).copy()) + replacements["size"].append(int(np.prod(out_cube.dims))) + + # Append new cubes + inputLoadSchedule.append({"data_in": in_cube}) + outputLoadSchedule.append({"data_out": out_cube}) + + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff 
--git a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py new file mode 100644 index 000000000..623aa9a71 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +import numpy as np +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class SliceTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get necessary information + # Get I/O buffer names + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + # Get I/O shapes + inputShape = parseDict['data_in_shape'] + + # Get other necessary information + sliceAxes = parseDict['axes'] + sliceSteps = parseDict['steps'] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # Add constraints for the I/O dimensions + for idx in range(len(inputShape)): + # Get current dimension variables + inputDimensionVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx) + outputDimensionVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) + + if idx in 
sliceAxes: + # For sliced axes, constrain to minimal input dimension + # based on the output dimension and the slicing step + axIndex = list(sliceAxes).index(idx) + axStep = sliceSteps[axIndex] + + tilerModel.addConstraint(inputDimensionVar == ((outputDimensionVar - 1) * axStep + 1)) + else: + # Otherwise, input and output dimensions need to be equal + tilerModel.addConstraint(outputDimensionVar == inputDimensionVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @staticmethod + def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict: Dict) -> HyperRectangle: + # Computes the input cube given the output cube and the slicing parameters. + # + # Will provide a minimal input cube, that only requires the data needed for the output cube + # by ignoring the input data that is outside of the slicing scope, + # as given by the slicing starting and ending parameters. + # + # (It will start with the first element required for the output cube, + # and will end with the last element required for the output cube). + # + # *Function is ready for multiple axes slicing. 
+ + # Start from the output cube dimensions and offsets + in_cube_dims = list(outputCube.dims).copy() + in_cube_offset = list(outputCube.offset).copy() + + # Iterate through the sliced axes + for idx, ax in enumerate(parseDict['axes']): + # Get current sliced ax parameters + start = parseDict['starts'][idx] + step = parseDict['steps'][idx] + + # Compute input cube parameters for the current axis + in_cube_dims[ax] = (outputCube.dims[ax] - 1) * step + 1 + in_cube_offset[ax] = start + outputCube.offset[ax] * step + + return HyperRectangle(offset = tuple(in_cube_offset), dims = tuple(in_cube_dims)) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Prepare address names + addrNames = ['data_in', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function + replacements = { + "data_in_shape": [], + "data_out_shape": [], + "starts": [[ + 0, + ] * len(operatorRepresentation['axes'])] * len(outputCubes), + "ends": [], + "data_in_size": [], + } + + replacementTypes = { + "data_in_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "data_out_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "starts": 
PointerClass(uint16_t), + "ends": PointerClass(uint16_t), + "data_in_size": PointerClass(uint16_t), + } + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out_cube in outputCubes: + # Compute input cube + in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, parseDict = operatorRepresentation) + + # Compute new ends for replacement + new_ends = list() + for ax in operatorRepresentation['axes']: + new_ends.append(in_cube.offset[ax] + in_cube.dims[ax]) + + # Append replacement elements + replacements["data_in_shape"].append(list(in_cube.dims).copy()) + replacements["data_out_shape"].append(list(out_cube.dims).copy()) + replacements["ends"].append(new_ends) + replacements["data_in_size"].append(int(np.prod(in_cube.dims))) + + # Append new cubes + inputLoadSchedule.append({"data_in": in_cube}) + outputLoadSchedule.append({"data_out": out_cube}) + + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index a6dbaa4e8..6de8ca300 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -16,23 +16,26 @@ from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ - PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \ - PULPiRQSGELUBindings, 
PULPLayernormBinding, PULPMatMulBindings, PULPMaxPool2DBindings, PULPMulBindings, \ - PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, \ - PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \ - PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSoftmaxBindings, \ - PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \ - PULPTransposeBindings, PULPUniformRQSBindings + PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, \ + PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, PULPMatMulBindings, \ + PULPMaxPool2DBindings, PULPMulBindings, PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, \ + PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, \ + PULPRQSGEMMBindings, PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, \ + PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, \ + PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ + RQDWConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import iSoftmaxTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint 
import LayernormTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ReduceMeanConstraint import ReduceMeanTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -41,11 +44,14 @@ tileConstraint = RQConv2DTileConstraint()) PULPRQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSDWConv2DBindings, - tileConstraint = DWConv2DTileConstraint()) + tileConstraint = RQDWConv2DTileConstraint()) PULPConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings, tileConstraint = Conv2DTileConstraint()) +PULPDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConv2DBindings, + tileConstraint = DWConv2DTileConstraint()) + PULPRQSGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSGEMMBindings, tileConstraint = GEMMTileConstraint()) @@ -130,4 +136,10 @@ tileConstraint = UntiledTileConstraint()) PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, - tileConstraint = SGDTileConstraint()) \ No newline at end of file + tileConstraint = SGDTileConstraint()) + +PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, + tileConstraint = SliceTileConstraint()) + +PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = 
PULPReduceMeanBindings, + tileConstraint = ReduceMeanTileConstraint()) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index bdae0fbdc..27ca222e4 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -435,7 +435,8 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo if not isinstance(ctxt.lookup(tensorName), TransientBuffer): - tensorShapeLen = len(ctxt.lookup(tensorName).shape) + tensorShapeLen = 1 if isinstance(ctxt.lookup(tensorName).shape, int) else len( + ctxt.lookup(tensorName).shape) newShape: List[int] = [] if isinstance(memoryConstraint.size, int): @@ -446,7 +447,7 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo newShape.append( self.tilerModel._resolveVariable(tilerModel.getTensorDimVar(tensorName, i, copyIdx))) - newMemoryConstraint.shape = tuple(newShape) + newMemoryConstraint.shape = (newShape,) if isinstance(newShape, int) else tuple(newShape) solvedTensorConstraint.addMemoryConstraint(newMemoryConstraint) diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py index 80f0191d7..db83974f0 100644 --- a/Deeploy/TilingExtension/TilerModel.py +++ b/Deeploy/TilingExtension/TilerModel.py @@ -147,7 +147,9 @@ def addTensorDimToModel(self, ctxt: NetworkContext, tensorName: str, copyIdx: Op ''' tensor = ctxt.lookup(tensorName) - for idx, dim in enumerate(tensor.shape): + for idx, dim in enumerate([ + tensor.shape, + ] if isinstance(tensor.shape, int) else tensor.shape): varName = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) @@ -170,7 +172,9 @@ def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyId tensorDimProductExpr = 1 - for idx, _ in enumerate(tensor.shape): + for idx, _ in enumerate([ + tensor.shape, + ] if isinstance(tensor.shape, int) else tensor.shape): varNameIdx = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) 
tensorDimProductExpr *= self._variables[varNameIdx] diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 604ba23c9..da27365c7 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -165,7 +165,16 @@ def minimizeVariableReplacement( newRepTypes = {} for key, value in scheme.perTileReplacements.items(): - if len(set(value)) > 1: + more_than_one_unique_item = False + items_checked = list() + for item in value: + if item not in items_checked: + items_checked.append(item) + if len(items_checked) > 1: + more_than_one_unique_item = True + break + + if more_than_one_unique_item: newPerTileRep[key] = scheme.perTileReplacements[key] newRepTypes[key] = scheme.replacementTypes[key] else: diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz index a98a6c33b..36567a96c 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx index ae1b3ac93..5eb3ae446 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz index a5d4b6e97..0e2e55fcf 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz differ diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index 24f0638f2..cf8acf05d 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -20,7 +20,7 @@ 
from Deeploy.DeeployTypes import _NoVerbosity from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.Targets.CortexM.Platform import CMSISPlatform -from Deeploy.Targets.PULPOpen.Platform import PULPPlatform +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPPlatform def generateNetwork(args): @@ -84,6 +84,10 @@ def generateNetwork(args): platform, signProp = mapPlatform(args.platform) + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + inputTypes = {} inputOffsets = {} @@ -183,6 +187,13 @@ def generateNetwork(args): 'If not specified, offsets are set to 0. ' 'Example: --input-offset-map input_0=0 input_1=128 ...') parser.add_argument('--shouldFail', action = 'store_true') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1.", + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 013f854da..4b1ebef20 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -26,6 +26,7 @@ from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ AnnotateIOMemoryLevel, AnnotateNeurekaWeightMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper @@ -76,6 +77,10 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.enableStrides: platform.engines[0].enableStrides = True + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + for index, num in 
enumerate(test_inputs): _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type @@ -195,6 +200,13 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg parser.add_argument('--plotMemAlloc', action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1." + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 7d1f7f312..a3329ebf7 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -342,6 +342,10 @@ def generate_test(self): generation_script = "generateNetwork.py" command = f"python {generation_script} -d {self._dir_gen} -t {self._dir_test} -p {self._platform} {self.gen_args}" + + if self._platform in ["Siracusa", "Siracusa_w_neureka"]: + command += f" --cores={self._args.cores}" + command += self._argument_parser.generate_cmd_args() log.debug(f"[TestRunner] Generation Command: {command}") diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index f6e8308c9..4da9e2abd 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -32,6 +32,7 @@ #include "kernel/RequantShift.h" #include "kernel/Softmax.h" #include "kernel/UniformRequantShift.h" +#include "kernel/gemm.h" #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index f5382a339..3ebab54a0 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -9,20 +9,30 @@ #include "DeeployPULPMath.h" -void 
PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right); +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer); + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); #endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c 
b/TargetLibraries/PULPOpen/src/Convolution_fp32.c index c33ac31e8..af2129323 100644 --- a/TargetLibraries/PULPOpen/src/Convolution_fp32.c +++ b/TargetLibraries/PULPOpen/src/Convolution_fp32.c @@ -7,18 +7,19 @@ #include "DeeployPULPMath.h" #include "pmsis.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right) { +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) { + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -29,37 +30,72 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; - for (uint32_t h = 0; h < H_out; ++h) { - for (uint32_t w = 0; w < W_out; ++w) { - for (uint32_t f = 0; f < ch_out_count; ++f) { - float32_t sum = 0.0f; + // Compute the output + if (has_bias) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; - for 
(uint32_t p = 0; p < P; ++p) { - for (uint32_t q = 0; q < Q; ++q) { - for (uint32_t c = 0; c < C; ++c) { - int32_t h_in = h * SP + p - pad_top; - int32_t w_in = w * SQ + q - pad_left; + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; - if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || - w_in >= (int32_t)W) { - continue; - } + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } - uint32_t input_idx = (h_in * W + w_in) * C + c; - uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; - sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } } } + + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum + pSrcBias[f + ch_out_start]; } + } + } + } else { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; + + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; + + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } + + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } + } + } - uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); - pDstC[output_idx] = sum; + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum; + } } } } @@ -68,12 +104,17 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, void 
PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer) { + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -84,50 +125,95 @@ void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; uint32_t im2col_size_per_core = C * P * Q; float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; uint32_t kernel_size = P * Q * C; - for (uint32_t h_out = 0; h_out < H_out; h_out++) { - for (uint32_t w_out = 0; w_out < W_out; w_out++) { - int32_t h_in_start = h_out * SP - pad_top; - int32_t w_in_start = w_out * SQ - pad_left; + // Compute the output + if (has_bias) { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c 
= 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } + } + } + } + + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t p = 0; p < P; p++) { - int32_t h_in = h_in_start + p; + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - for (uint32_t q = 0; q < Q; q++) { - int32_t w_in = w_in_start + q; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; - for (uint32_t c = 0; c < C; c++) { - if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && - w_in < (int32_t)W) { - uint32_t in_idx = (h_in * W + w_in) * C + c; - im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; - } else { - im2col_buffer[p * Q * C + q * C + c] = 0.0f; + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } else { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } } } } - } - for (uint32_t f = 0; f < ch_out_count; f++) { - float32_t sum = 0.0f; - const float32_t *local_weight_ptr = weight_ptr + f * kernel_size; + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - 
ch_out_start) * kernel_size; - for (uint32_t k = 0; k < kernel_size; k++) { - sum += im2col_buffer[k] * local_weight_ptr[k]; - } + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - uint32_t out_idx = - (h_out * W_out + w_out) * F_total + (ch_out_start + f); - pDstC[out_idx] = sum; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + pDstC[out_idx] = sum; + } } } } -} \ No newline at end of file +} diff --git a/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c new file mode 100644 index 000000000..88f21b9a2 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c @@ -0,0 +1,251 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core information + int8_t core_id = pi_core_id(); + int8_t log2Core = (int8_t)log2(NUM_CORES); + + // Compute the chunk size for each core + // (Splitting work along the output channels) + uint16_t ch_out_chunk = + (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); + uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); + uint16_t ch_out_stop = MIN(ch_out_start + ch_out_chunk, F_total); + uint16_t ch_out_count = ch_out_stop - ch_out_start; + + // If there is no output channel to process, return + // (when F < NUM_CORES and working on a core with id > F) + if (ch_out_count == 0) { + return; + } + + // Move pointer of 
the weights for the current core + const float32_t *weight_ptr = pSrcB + ch_out_start * P * Q; + + // Move pointer of the im2col buffer for the current core + uint32_t im2col_size_per_core = P * Q; + float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + + // Compute the output dimensions + uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + uint32_t kernel_size = P * Q * F_total; + + // Compute the output + if (has_bias) { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored in memory continuously) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in 
< MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // number which corresponds to the "group" parameter in the Conv ONNX + // operator) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } + } else { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored in memory continuously) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for 
(uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // number which corresponds to the "group" parameter in the Conv ONNX + // operator) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + 
w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum; + } + } + } + } + } + + return; +} diff --git a/TargetLibraries/PULPOpen/src/GELU.c b/TargetLibraries/PULPOpen/src/GELU.c index 281d4674d..ef2319e3b 100644 --- a/TargetLibraries/PULPOpen/src/GELU.c +++ b/TargetLibraries/PULPOpen/src/GELU.c @@ -12,23 +12,21 @@ void PULP_GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize) { + // Get core information int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + + // Split into chunks for each core int16_t chunk = (dataSize >> log2Core) + ((dataSize & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, dataSize); int16_t chunk_stop = MIN(chunk_start + chunk, dataSize); - const float32_t sqrt_2_over_pi = 0.7978845608f; // sqrt(2/π) - const float32_t coeff = 0.044715f; + // Compute GELU on the assigned chunk for (uint32_t i = chunk_start; i < chunk_stop; i++) { float32_t x = data_in[i]; - float32_t x_cubed = x * x * x; - float32_t inner = sqrt_2_over_pi * (x + coeff * x_cubed); - - float32_t exp_2z = expf(2.0f * inner); 
- float32_t tanh_val = (exp_2z - 1.0f) / (exp_2z + 1.0f); + float32_t cdf = 0.5f * (1.0f + tanhf((sqrtf(2.0f / (float)M_PI) * + (x + 0.044715f * powf(x, 3.0f))))); - float32_t cdf = 0.5f * (1.0f + tanh_val); data_out[i] = x * cdf; } }