diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7c6a5f7541..f59f7fa884 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -53,7 +53,15 @@ jobs: testBacktracking testFloatAdder testFloatGEMM + testFloat2DConvolution + testFloat2DConvolutionBias + testFloat2DConvolutionZeroBias + + testFloat2DDWConvolution + testFloat2DDWConvolutionBias + testFloat2DDWConvolutionZeroBias + testFloatLayerNorm testFloatRelu testFloatMaxPool @@ -64,6 +72,7 @@ jobs: Quant Dequant testFloatReduceSum + testFloatReshapeWithSkipConnection testFloatSoftmaxGrad testFloatSoftmaxCrossEntropy testFloatSoftmaxCrossEntropyGrad @@ -87,4 +96,5 @@ jobs: CCT/CCT_1_16_16_8 CCT/CCT_2_32_32_128_Opset20 testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8 + testFloatDemoTinyViT num-cores: 8 diff --git a/CHANGELOG.md b/CHANGELOG.md index faf4de42c5..1ed34f8da9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- TinyViT on non-tiled Siracusa [#117](https://github.com/pulp-platform/Deeploy/pull/117) - Support Fully Asynchronous DMAs [#114](https://github.com/pulp-platform/Deeploy/pull/114) - Disallow shape inference [#128](https://github.com/pulp-platform/Deeploy/pull/128) - Remove memory-aware node bindings [#123](https://github.com/pulp-platform/Deeploy/pull/123) @@ -24,6 +25,13 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Fix bias hoisting in generic GEMM with no bias [#126](https://github.com/pulp-platform/Deeploy/pull/126) ### Added +- PULP 2D FP DW conv Im2Col template and kernel, with bias support. +- Bias support for PULP 2D FP regular conv Im2Col in template & kernel. +- PULP FP DW conv 2D parser. +- FP conv 2D (simple & DW), reshape & skip connection, and TinyViT demo tests to the non-tiled Siracusa CI pipeline. +- FP bindings and mappings for PULP slice, DW conv 2D, and reduce mean operations. +- FP PULP DW conv lowering optimization pass, similar to the existing one for the integer version. +- RemoveEmptyConvBiasPass to the PULP optimizer. - Add manual type inference feature (CLI: `--input-type-map`/`--input-offset-map`) to resolve ambiguities when test inputs are not representative enough - Added a `testTypeInferenceDifferentTypes` test case to validate type inference for different input types - Added `_mangleNodeNames` function to avoid duplicate node mappings @@ -58,8 +66,11 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added testFloatGEMMnobias - Profiling support and optional comments in generated DMA code for better traceability - Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` +- PULPClusterEngine now accepts an `n_cores` parameter to set the number of cores used +- `annotateNCores` method to PULPDeployer that adds an `n_cores` key to all PULPClusterEngine templates' operatorRepresentations ### Changed +- Reduced the size of the reshape & skip connection test for non-tiled Siracusa memory compatibility. - Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. - mchan HAL is now reduced to bare-bones - refactor of the IntrospectiveCodeTransformation to work on the Mako template @@ -95,8 +106,12 @@ This file contains the changelog for the Deeploy project. 
The changelog is divid - Disabled ICCT_ITA_8 MemPool test because it was using a lowering that created shapeless tensors - Added missing shape annotation to the testTypeInferenceDifferentTypes - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode +- Changed `_mapNode` to `_selectEngine`, which reduces the responsibility of that function to, as the name states, just engine selection ### Fixed +- Fixed handling of non-batched operands in the PULPOpen FP GEMM and matmul templates. +- Prepended an underscore to closure names to avoid naming issues when they start with unsupported first characters (like numbers). +- Fixed the data types used for chunk indices in the PULPOpen FP add and mul templates. - Prevent node duplication for graphs generated via GraphSurgeon - Resolved issue with missing `id` in the `Build Cache for Docker` step, used in the `Inject build-cache` step. - Fix license CI check and prevent potential issues with `jq` installation @@ -185,9 +200,9 @@ This release containing major architectural changes, new platform support, enhan ### Added -- BatchNorm kernel -- ConvTranspose kernel -- MaxPool1D kernel +- BatchNorm kernel +- ConvTranspose kernel +- MaxPool1D kernel - Template for 1D Convolution - Support for float32 data type in the previous kernels - Float binding for Pad1D kernel @@ -326,7 +341,7 @@ This release containing major architectural changes, new platform support, enhan ### Changed - FloatConvTemplate file -- Platform.py file +- Platform.py file - Bump the CMake version to 3.24 as required for the chimera-sdk - Bump GVSoC's version and add chimera simulation target - Rename the generic source util to utils to avoid name collision with chimera-sdk diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py index c5f9c883af..41073ad646 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py @@ -155,7 +155,8 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - self.closureName = name + self.closureSuffix + # Prepend an underscore to avoid name issues when the name begins with a problematic character (like a number) + self.closureName = "_" + name + self.closureSuffix self.functionCall = executionBlock.generate(ctxt) self._generateClosureStruct(ctxt, executionBlock) ctxt = self._generateClosureCtxt(ctxt, name) diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index 4f6dba3827..c05ea3b9d9 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -87,11 +87,11 @@ class float64_t(FloatImmediate): SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (int8_t, int16_t, int32_t, int64_t) UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t) -IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (sorted(( - *SignedIntegerDataTypes, - *UnsignedIntegerDataTypes, -), - key = lambda _type: _type.typeWidth)) +IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = tuple( + sorted(( + *SignedIntegerDataTypes, + *UnsignedIntegerDataTypes, + ), key = lambda _type: _type.typeWidth)) FloatDataTypes: Tuple[Type[FloatImmediate], ...] 
= (bfloat16_t, float16_t, float32_t, float64_t) diff --git a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py index f07fe57c96..a8f27b5463 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Union - import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkContext, NetworkDeployer, ONNXLayer, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, DeploymentEngine, NetworkContext, NetworkDeployer, _NoVerbosity class NetworkDeployerWrapper(NetworkDeployer): @@ -68,8 +66,8 @@ def generateBufferAllocationCode(self) -> str: return self._innerObject.generateBufferAllocationCode() # MultiEngineDeployer augment - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: - return self._innerObject._mapNode(node) + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: + return self._innerObject._selectEngine(node) def _printMemorySummary(self): return self._innerObject._printMemorySummary() diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 8c2f5d2485..5ccfb7dcf7 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -325,7 +325,7 @@ def fromNode(cls, node: gs.Node): return (cls(name = node.name, shape = node.shape if not isinstance(node, gs.Constant) else node.values.shape)) def has_live_aliases(self, ctxt: NetworkContext) -> bool: - """Checks whether this VariableBuffer has any live ancestors, i.e. buffers that are still live and are aliased by this buffer. + """Checks whether this VariableBuffer has any live aliases, i.e. buffers that are still live and are aliased by this buffer. 
Parameters ---------- ctxt : NetworkContext @@ -333,7 +333,7 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: Returns ------- bool - True if this VariableBuffer has any live ancestors, False otherwise + True if this VariableBuffer has any live aliases, False otherwise """ # Do a breadth-first search across the aliasing double-linked list live = self._live @@ -2562,10 +2562,10 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): self.ctxt = layer.codeTransform(self.ctxt, verbose) self.transformed = True - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: for engine in self.Platform.engines: if node.op in engine.Mapping: - return engine.Mapping[node.op](node) + return engine raise RuntimeError(f"No mapping found for node {node.name} with op type {node.op}") def _bindLayers(self): @@ -2582,7 +2582,8 @@ def _bindLayers(self): flatSchedule += subGraph for node in flatSchedule: - layer = self._mapNode(node) + engine = self._selectEngine(node) + layer = engine.Mapping[node.op](node) if isinstance(layer, ONNXLayer): log.debug(f" {SUCCESS_MARK} Bind {node.name} to layer {layer.__class__.__name__}") self.layerBinding[layer.node.name] = layer diff --git a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py index 4b05ab5be4..570363b9a2 100644 --- a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py +++ b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py @@ -2,13 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Type, Union +from typing import Callable, Dict, Type import onnx_graphsurgeon as gs from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper -from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, Schedule, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentEngine, DeploymentPlatform, NetworkDeployer, Schedule, TopologyOptimizer from Deeploy.EngineExtension.OptimizationPasses.TopologyOptimizationPasses.EngineColoringPasses import \ EngineColoringPass, EngineMapper @@ -48,14 +48,14 @@ def lower(self, graph: gs.Graph) -> gs.Graph: ) == 0, f"Missing engine color for nodes {[node.name for node in uncoloredNodes]} with operations {uncoloredOperations}" return graph - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: assert "engine" in node.attrs, f"Node {node.name} doesn't have an engine color." engineName = node.attrs["engine"] assert isinstance(engineName, str) and engineName in self.engineDict, \ f"Node {node.name} has an invalid engine {engineName} assigned." 
engine = self.engineDict[engineName] assert node.op in engine.Mapping, f"No mapping found for {node.op} in engine {engine.name}" - return engine.Mapping[node.op](node) + return engine class EngineColoringDeployerWrapper(EngineColoringDeployer, NetworkDeployerWrapper): diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 9ff940b2f0..cc81527f32 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -9,13 +9,13 @@ from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration, MemoryPassthroughGeneration -from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t, \ - uint8_t +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ + int8_t, int32_t, int64_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ - GatherTemplate, QuantTemplate, RQSiGELUTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \ + FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ @@ -27,11 +27,11 @@ from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, \ - FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, \ - FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, \ - ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ - SliceTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ + FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ + FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \ + MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ + SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, 
\ PULPRequantShiftChecker @@ -148,16 +148,24 @@ PointerClass(uint8_t), PointerClass(uint8_t), PointerClass(uint8_t) - ], [PULPDMAFuture(underlyingType = type)]), SliceTemplate.referenceTemplate, MemoryAwareForkTransformer) + ], [PULPDMAFuture(underlyingType = type)]), DMASliceTemplate.referenceTemplate, MemoryAwareForkTransformer) for type in IntegerDataTypes ] +PULPSliceBindings = [ + NodeBinding( + SliceChecker([ + PointerClass(type), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t) + ], [PointerClass(type)]), SliceTemplate.referenceTemplate, ForkTransformer) for type in FloatDataTypes +] + PULPReshapeBindings = [ - NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int32_t)], [PointerClass(type)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes -] + [ - NodeBinding(ReshapeChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int64_t)], [PointerClass(type)]), + ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes ] PULPRQAddBindings = [ @@ -225,6 +233,14 @@ ForkTransformer) ] +PULPFloatDWConv2DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(float_type), PointerClass(float_type), + PointerClass(float_type)], [PointerClass(float_type)]), FloatConvTemplate.referenceDW2DIm2ColTemplate, + ForkTransformer) for float_type in FloatDataTypes +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -276,6 +292,11 @@ PULPReduceMeanBindings = [ NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), + FloatReduceMeanTemplate.referenceTemplate, ClusterTransformer) + for integer_type in SignedIntegerDataTypes + for float_type in FloatDataTypes ] PULPReduceSumBindings = [ diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py index 86bf02e578..bceea01f4d 100644 --- a/Deeploy/Targets/PULPOpen/Deployer.py +++ b/Deeploy/Targets/PULPOpen/Deployer.py @@ -15,6 +15,7 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import RQAddTransposeSquashPass _L3AllocTemplate = NodeTemplate(""" @@ -63,7 +64,15 @@ def __init__(self, self.extNameCount = 0 - def bind(self): + def annotateNCores(self) -> None: + for layer in self.layerBinding.values(): + node = layer.node + engine = self._selectEngine(node) + opRepr = layer.mapper.parser.operatorRepresentation + if isinstance(engine, PULPClusterEngine): + opRepr["n_cores"] = engine.n_cores + + def bind(self) -> bool: # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. 
# SCHEREMO: The BindingOptimizationPass system is fairly fragile; # it was designed this way because implementing further topology optimizations after # but if there is only very few cases, this solution is okay. autoTransposePass = AutoTransposeMergePass() #self.ctxt, self.layerBinding = autoTransposePass.apply(self.ctxt, self.graph, self.layerBinding) + + # LMACAN: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. + self.annotateNCores() + # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. - ret = super().bind() - if ret: - self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") - return ret + if not super().bind(): + return False + + self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") + return True def _l3ConstBuffer(self) -> List[VariableBuffer]: return [ diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index e94af6e420..ab99fcabc6 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -72,24 +72,24 @@ def parseNode(self, node: gs.Node) -> (bool): wellFormed = super().parseNode(node) if wellFormed: ret = all([ - # Make sure padding is square + # The current PULP kernel only supports a group size of 1 self.operatorRepresentation['group'] == 1, + + # Make sure padding is square self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], - len(node.inputs) == 2 + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], ]) - self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) - self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) - self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + # Extract additional attributes self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) - self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) return ret return False @@ -102,11 +102,86 @@ def parseNodeCtxt(self, newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) if ret: + # Set input names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + return newCtxt, True return ctxt, False +class PULPFPDWConv2DParser(Conv2DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): 
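+        # NOTE: whether the node is actually depthwise (group == ch_im_in) can only be verified once shapes are known, so that check happens in parseNodeCtxt below.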
+ # Parse the base conv 2D information via the parent parser + wellFormed = super().parseNode(node) + + if wellFormed: + # Check padding and input-count constraints + ret = all([ + # Make sure padding is square + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], + ]) + + # Extract additional attributes + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return ret + return False + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Parse node context for 2D conv + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + # Define input names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + # Map input nodes to operator representation + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + # Check that the convolution is depthwise: group must equal the number of input channels + if self.operatorRepresentation['group'] == self.operatorRepresentation['ch_im_in']: + return newCtxt, True + + return ctxt, False + + class PULPDWConv1DParser(RQSConv1DParser): def __init__(self, noBiasHoisting = True): super().__init__(noBiasHoisting) diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 99c1c93351..fc2ae8a1fa 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -5,6 +5,8 @@ import numpy as np import onnx_graphsurgeon as gs +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + RemoveEmptyConvBiasPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel @@ -27,10 +29,11 @@ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \ PULPDMASliceBindings, PULPDWConv1DBinding, PULPFloatDWConv2DBindings, PULPReduceMeanBindings, PULPSliceBindings from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ PULPTallGEMMParser 
from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ PULPConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUTilingReadyBindings, \ @@ -71,6 +74,7 @@ DWConv1DMapper = NodeMapper(PULPDWConv1DParser(), [PULPDWConv1DBinding]) FPConv2DMapper = NodeMapper(PULPFPConv2DParser(), PULPConv2DTilingReadyBindings) Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings) +FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), PULPFloatDWConv2DBindings) DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings) GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings) FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings) @@ -85,7 +89,9 @@ ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) -SliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) +DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) + +SliceMapper = NodeMapper(SliceParser(), PULPSliceBindings) iRMSNormMapper = NodeMapper(iRMSNormParser(), PULPiRMSNormTilingReadyBindings) @@ -99,7 +105,7 @@ DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) PULPMapping = { - 'Conv': ConvLayer([FPConv2DMapper]), + 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]), @@ -125,7 +131,7 @@ 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 'Transpose': TransposeLayer([TransposeMapper]), 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), - 'Slice': SliceLayer([SliceMapper]), + 'Slice': SliceLayer([SliceMapper, DMASliceMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Concat': ConcatLayer([ConcatMapper]), 'iRMSNorm': iRMSNormLayer([iRMSNormMapper]), @@ -225,7 +231,8 @@ class PULPStructBuffer(StructBuffer): MergeConstAddAndRequantPass(), PULPGEMMRequantMergePass(), PULPMatMulRequantMergePass(), - PULPAddRequantMergePass() + PULPAddRequantMergePass(), + RemoveEmptyConvBiasPass(), ], name = "PULPOptimizer") @@ -237,8 +244,14 @@ class PULPStructBuffer(StructBuffer): class PULPClusterEngine(DeploymentEngine): - def __init__(self, name: str, Mapping = PULPMapping, initCode = "", includeList = _includeList) -> None: + def __init__(self, + name: str, + Mapping = PULPMapping, + initCode = "", + includeList = _includeList, + n_cores: int = 8) -> None: super().__init__(name, Mapping, initCode, includeList) + self.n_cores = n_cores class PULPPlatform(DeploymentPlatform): diff --git a/Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py b/Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py similarity index 100% rename from Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py rename to Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py index 7f1c2e21c6..200ad1b9ea 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -6,14 +6,14 @@ referenceTemplate = NodeTemplate(""" // Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t 
${nodeName}_log2Core = log2(NUM_CORES); -int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); -int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); -int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); uint32_t i = ${nodeName}_chunk_start; -for (; i+5 < ${nodeName}_chunk_stop; i+=6) { +for (; i + 5 < ${nodeName}_chunk_stop; i += 6) { ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; ${data_out}[i+1] = ${data_in_1}[i+1] + ${data_in_2}[i+1]; ${data_out}[i+2] = ${data_in_1}[i+2] + ${data_in_2}[i+2]; diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 29a216d728..bfa893db94 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -18,9 +18,13 @@ def __init__(self, templateStr): def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: - im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * - operatorRepresentation['dim_kernel_y']) + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // + 8) * operatorRepresentation["n_cores"] * operatorRepresentation[ + 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] def hoistTransientBuffers(self, ctxt: NetworkContext, @@ -34,6 +38,39 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [im2col_name] +class PULP2DFloatDWConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. 
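+        # Each core gets a private patch of dim_kernel_x * dim_kernel_y elements, so the total byte size is the element width (typeWidth // 8) times n_cores times the kernel area.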
+ im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[ + "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + # Manually set the type of the im2col buffer to match the input type, since it defaults to void for transient buffers + ctxt.lookup(im2col_name)._type.referencedType = ctxt.lookup( + operatorRepresentation['data_in'])._type.referencedType + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + reference2DTemplate = NodeTemplate(""" // 2D FP Conv HWC with ChannelOut parallelism (Name: ${nodeName}, Op: ${nodeOp}) @@ -47,6 +84,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${stride_y}, ${stride_x}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} ); @@ -66,15 +104,48 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( ref_${data_out}_${data_in}, - ${dim_im_in_y}, ${dim_im_in_x}, + ${dim_im_in_y}, ${ch_im_in}, ${weight}, ${ch_im_out}, - ${dim_kernel_y}, ${dim_kernel_x}, + ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${ctxtBuffer} + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} +""") + +referenceDW2DIm2ColTemplate = PULP2DFloatDWConvIm2ColTemplate(""" +// 2D DW FP Conv HWC with Im2Col and ChannelOut parallelism (Name: ${nodeName}, Op: ${nodeOp}) + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_DW_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_x}, + ${dim_im_in_y}, + ${ch_im_in}, + ${weight}, + ${ch_im_out}, + ${dim_kernel_x}, + ${dim_kernel_y}, ${stride_x}, + ${stride_y}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, @@ -86,4 +157,4 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index f4c22b2c22..d007e60df0 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ 
b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -24,9 +24,18 @@ ${transB} ); + % if A_batched: ref_${data_out}_${A} += ${M} * ${N}; + % endif + + % if B_batched: ref_${data_out}_${B} += ${N} * ${O}; + % endif + + % if C_batched: ref_${data_out}_${C} += ${M} * ${O}; + % endif + ref_${data_out}_${data_out} += ${M} * ${O}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py index 11b7c9aa2a..3cdf26097b 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py @@ -8,8 +8,18 @@ // Matmul with row parallelism (Name: ${nodeName}, Op: ${nodeOp}) for(uint32_t b=0; b<${batch}; b++) { + % if A_batched: ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + % else: + ${A_type.typeName} batch_A = ${A}; + % endif + + % if B_batched: ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + % else: + ${B_type.typeName} batch_B = ${B}; + % endif + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; PULP_MatMul_fp32_fp32_fp32_unroll1x7( diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index 2f202b24d2..ced6c3cbcf 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -7,11 +7,11 @@ referenceTemplate = NodeTemplate(""" // Float Mul with parallelism and 6x unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); +uint32_t ${nodeName}_core_id = pi_core_id(); +uint32_t ${nodeName}_log2Core = (uint32_t) log2(NUM_CORES); uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1)) != 0); -uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${size}); -uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${size}); +uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size}); if (${nodeName}_start < ${nodeName}_end) { float32_t ${nodeName}_scalar = ${B}[0]; diff --git a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py index 41c4b5366c..a795a555ed 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py @@ -25,10 +25,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer +from Deeploy.Targets.Generic.Templates.ReshapeTemplate import _ReshapeTemplate as _GenericReshapeTemplate -class _ReshapeTemplate(NodeTemplate): +class _ReshapeTemplate(_GenericReshapeTemplate): def __init__(self, templateStr): super().__init__(templateStr) @@ -36,19 +37,18 @@ def __init__(self, templateStr): def alignToContext(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - # SCHEREMO: Selectively mark 'indices' dead, since we don't need them - if 'indices' in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False - ctxt.globalObjects[operatorRepresentation['indices']]._live = False + ctxt, 
operatorRepresentation, _ = super().alignToContext(ctxt, operatorRepresentation) - # Same for "shape" - if "shape" in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False - ctxt.globalObjects[operatorRepresentation["shape"]]._live = False + # Get buffers + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) - inBuffer = ctxt.lookup(operatorRepresentation['data_in']) - outBuffer = ctxt.lookup(operatorRepresentation['data_out']) - outBuffer._alias = inBuffer.name + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # HACK: Tiling wasn't updated in the "Fix aliasing" PR, so we still have to + # set the _alias attribute + bufferOut._alias = bufferIn.name return ctxt, operatorRepresentation, [] diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz index a98a6c33b9..36567a96ce 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx index ae1b3ac939..5eb3ae446e 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz index a5d4b6e974..0e2e55fcfe 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz differ diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index 24f0638f21..cf8acf05db 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -20,7 +20,7 @@ from Deeploy.DeeployTypes import _NoVerbosity from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.Targets.CortexM.Platform import CMSISPlatform -from Deeploy.Targets.PULPOpen.Platform import PULPPlatform +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPPlatform def generateNetwork(args): @@ -84,6 +84,10 @@ def generateNetwork(args): platform, signProp = mapPlatform(args.platform) + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + inputTypes = {} inputOffsets = {} @@ -183,6 +187,13 @@ def generateNetwork(args): 'If not specified, offsets are set to 0. ' 'Example: --input-offset-map input_0=0 input_1=128 ...') parser.add_argument('--shouldFail', action = 'store_true') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently required for im2col buffer sizing on Siracusa. 
Default: 1.", + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 013f854daa..4b1ebef20b 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -26,6 +26,7 @@ from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ AnnotateIOMemoryLevel, AnnotateNeurekaWeightMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper @@ -76,6 +77,10 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.enableStrides: platform.engines[0].enableStrides = True + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + for index, num in enumerate(test_inputs): _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type @@ -195,6 +200,13 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg parser.add_argument('--plotMemAlloc', action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1." + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 7d1f7f312a..a3329ebf73 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -342,6 +342,10 @@ def generate_test(self): generation_script = "generateNetwork.py" command = f"python {generation_script} -d {self._dir_gen} -t {self._dir_test} -p {self._platform} {self.gen_args}" + + if self._platform in ["Siracusa", "Siracusa_w_neureka"]: + command += f" --cores={self._args.cores}" + command += self._argument_parser.generate_cmd_args() log.debug(f"[TestRunner] Generation Command: {command}") diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index f5382a339b..3ebab54a0b 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -9,20 +9,30 @@ #include "DeeployPULPMath.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right); +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t 
pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer); + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); #endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c b/TargetLibraries/PULPOpen/src/Convolution_fp32.c index c33ac31e88..af21293233 100644 --- a/TargetLibraries/PULPOpen/src/Convolution_fp32.c +++ b/TargetLibraries/PULPOpen/src/Convolution_fp32.c @@ -7,18 +7,19 @@ #include "DeeployPULPMath.h" #include "pmsis.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right) { +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) { + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -29,37 +30,72 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; - for (uint32_t h = 0; h < H_out; ++h) { - for (uint32_t w = 0; w < W_out; ++w) { - for (uint32_t f = 0; f < ch_out_count; ++f) { - float32_t sum = 0.0f; + // Compute the output + if (has_bias) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; - for (uint32_t p = 0; p < P; ++p) { - for (uint32_t q = 0; q < Q; ++q) { - for (uint32_t c = 0; c < C; ++c) { - int32_t h_in = h * SP + p - pad_top; - int32_t w_in = w * SQ + q - pad_left; + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; - if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || - w_in >= (int32_t)W) { - continue; - } + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } - uint32_t input_idx = (h_in * W + w_in) * 
C + c; - uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; - sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } } } + + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum + pSrcBias[f + ch_out_start]; } + } + } + } else { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; + + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; + + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } + + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } + } + } - uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); - pDstC[output_idx] = sum; + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum; + } } } } @@ -68,12 +104,17 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer) { + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -84,50 +125,95 @@ void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; uint32_t im2col_size_per_core = C * P * Q; float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; uint32_t kernel_size = P * Q * C; - for (uint32_t h_out = 0; h_out < H_out; h_out++) { - for (uint32_t w_out = 0; w_out < W_out; w_out++) { - int32_t h_in_start = h_out * SP - pad_top; - int32_t w_in_start = w_out * SQ - pad_left; + // Compute the output + if (has_bias) { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C 
+ q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } + } + } + } + + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t p = 0; p < P; p++) { - int32_t h_in = h_in_start + p; + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - for (uint32_t q = 0; q < Q; q++) { - int32_t w_in = w_in_start + q; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; - for (uint32_t c = 0; c < C; c++) { - if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && - w_in < (int32_t)W) { - uint32_t in_idx = (h_in * W + w_in) * C + c; - im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; - } else { - im2col_buffer[p * Q * C + q * C + c] = 0.0f; + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } else { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } } } } - } - for (uint32_t f = 0; f < ch_out_count; f++) { - float32_t sum = 0.0f; - const float32_t *local_weight_ptr = weight_ptr + f * kernel_size; + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t k = 0; k < kernel_size; k++) { - sum += im2col_buffer[k] * local_weight_ptr[k]; - } + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - uint32_t out_idx = - (h_out * W_out + w_out) * F_total + (ch_out_start + f); - pDstC[out_idx] = sum; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + pDstC[out_idx] = sum; + } } } } -} \ No newline at end of file +} diff --git a/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c new file mode 100644 index 0000000000..b0a06c66eb --- /dev/null +++ b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c @@ -0,0 +1,251 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core information + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + + // Compute the chunk size for each core + // (Splitting work along the output channels) + uint16_t ch_out_chunk = + (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); + uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); + uint16_t ch_out_stop = MIN(ch_out_start + 
ch_out_chunk, F_total); + uint16_t ch_out_count = ch_out_stop - ch_out_start; + + // If there is no output channel to process, return + // (when F_total < NUM_CORES and this core's id is >= F_total) + if (ch_out_count == 0) { + return; + } + + // Move pointer of the weights for the current core + const float32_t *weight_ptr = pSrcB + ch_out_start * P * Q; + + // Move pointer of the im2col buffer for the current core + uint32_t im2col_size_per_core = P * Q; + float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + + // Compute the output dimensions + uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + uint32_t kernel_size = P * Q * F_total; + + // Compute the output + if (has_bias) { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored contiguously in memory) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // i.e. the depth multiplier; for a depthwise conv the ONNX "group" + // attribute equals the input channel count C) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // 
Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } + } else { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored contiguously in memory) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // i.e. the depth multiplier; for a depthwise conv the ONNX "group" + // attribute equals the input channel count C) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * 
weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum; + } + } + } + } + } + + return; +}
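For illustration, a minimal host-side sketch of how the new depthwise kernel could be launched on the PULP cluster. Only the PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC signature and the per-core im2col scratch sizing (NUM_CORES * P * Q elements) come from this diff; the shapes, the argument struct, and the entry function are hypothetical. Because the kernel derives its output-channel slice from pi_core_id(), it must run on every cluster core, e.g. through a PMSIS team fork:

#include <stdbool.h>

#include "DeeployPULPMath.h"
#include "pmsis.h"

// Hypothetical shapes: 8x8 input, C = 4 channels, depthwise 3x3 (group == C,
// depth multiplier 1, so F_total == 4), stride 1, 1-pixel padding all around.
typedef struct {
  const float32_t *input;   // HWC layout, H * W * C elements
  const float32_t *weights; // F_total * P * Q elements
  const float32_t *bias;    // F_total elements
  float32_t *output;        // H_out * W_out * F_total elements, HWC layout
  float32_t *scratch;       // NUM_CORES * P * Q elements, per-core im2col
} dw_conv_args_t;

static void dw_conv_entry(void *arg) {
  dw_conv_args_t *a = (dw_conv_args_t *)arg;
  // Each core computes its own slice of the output channels and returns
  // early when F_total < NUM_CORES leaves it without work.
  PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC(
      a->input, /*H=*/8, /*W=*/8, /*C=*/4, a->weights, /*F_total=*/4,
      /*P=*/3, /*Q=*/3, /*SP=*/1, /*SQ=*/1, a->bias, /*has_bias=*/true,
      a->output, /*pad_top=*/1, /*pad_bottom=*/1, /*pad_left=*/1,
      /*pad_right=*/1, a->scratch);
}

static void run_dw_conv_on_cluster(dw_conv_args_t *args) {
  // Fork the kernel onto all cluster cores (PMSIS cluster-team API).
  pi_cl_team_fork(NUM_CORES, dw_conv_entry, args);
}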