diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7c6a5f7541..f59f7fa884 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -53,7 +53,15 @@ jobs: testBacktracking testFloatAdder testFloatGEMM + testFloat2DConvolution + testFloat2DConvolutionBias + testFloat2DConvolutionZeroBias + + testFloat2DDWConvolution + testFloat2DDWConvolutionBias + testFloat2DDWConvolutionZeroBias + testFloatLayerNorm testFloatRelu testFloatMaxPool @@ -64,6 +72,7 @@ jobs: Quant Dequant testFloatReduceSum + testFloatReshapeWithSkipConnection testFloatSoftmaxGrad testFloatSoftmaxCrossEntropy testFloatSoftmaxCrossEntropyGrad @@ -87,4 +96,5 @@ jobs: CCT/CCT_1_16_16_8 CCT/CCT_2_32_32_128_Opset20 testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8 + testFloatDemoTinyViT num-cores: 8 diff --git a/CHANGELOG.md b/CHANGELOG.md index faf4de42c5..1ed34f8da9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- TinyViT on non-tiled Siracusa [#117](https://github.com/pulp-platform/Deeploy/pull/117) - Support Fully Asynchronous DMAs [#114](https://github.com/pulp-platform/Deeploy/pull/114) - Disallow shape inference [#128](https://github.com/pulp-platform/Deeploy/pull/128) - Remove memory-aware node bindings [#123](https://github.com/pulp-platform/Deeploy/pull/123) @@ -24,6 +25,13 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Fix bias hoisting in generic GEMM with no bias [#126](https://github.com/pulp-platform/Deeploy/pull/126) ### Added +- PULP 2D FP DW conv Im2Col template and kernel, with bias support. +- Bias support for PULP 2D FP regular conv Im2Col in template & kernel. +- PULP FP DW conv 2D parser. +- FP conv 2D (simple & DW), reshape & skip connection, and TinyViT demo tests to the non-tiled Siracusa CI pipeline. +- FP bindings and mappings for PULP slice, DW conv 2D, and reduce mean operations. +- FP PULP DW conv lowering optimization pass, similar to the existing one for the integer version. +- RemoveEmptyConvBiasPass to the PULP optimizer. - Add manual type inference feature (CLI: `--input-type-map`/`--input-offset-map`) to resolve ambiguities when test inputs are not representative enough - Added a `testTypeInferenceDifferentTypes` test case to validate type inference for different input types - Added `_mangleNodeNames` function to avoid duplicate node mappings @@ -58,8 +66,11 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added testFloatGEMMnobias - Profiling support and optional comments in generated DMA code for better traceability - Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` +- PULPClusterEngine now accepts an `n_cores` parameter to set the number of cores used +- `annotateNCores` method to PULPDeployer that adds an `n_cores` key to all PULPClusterEngine templates' operatorRepresentations ### Changed +- Reduced the size of the reshape & skip connection test for non-tiled Siracusa memory compatibility. - Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. - mchan HAL is now reduced to bare-bones - refactor of the IntrospectiveCodeTransformation to work on the Mako template @@ -95,8 +106,12 @@ This file contains the changelog for the Deeploy project. 
The changelog is divid - Disabled ICCT_ITA_8 MemPool test because it was using a lowering that created shapeless tensors - Added missing shape annotation to the testTypeInferenceDifferentTypes - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode +- Changed `_mapNode` to `_selectEngine`, which reduces the responsibility of that function to, as the name states, just engine selection ### Fixed +- Fixed handling of non-batched operands in the PULPOpen FP GEMM and matmul templates. +- Prepended an underscore to closure names to avoid naming issues when they start with unsupported first characters (like numbers). +- Fixed the data types used for chunk indices in the PULPOpen FP add and mul templates. - Prevent node duplication for graphs generated via GraphSurgeon - Resolved issue with missing `id` in the `Build Cache for Docker` step, used in the `Inject build-cache` step. - Fix license CI check and prevent potential issues with `jq` installation @@ -185,9 +200,9 @@ This release containing major architectural changes, new platform support, enhan ### Added -- BatchNorm kernel -- ConvTranspose kernel -- MaxPool1D kernel +- BatchNorm kernel +- ConvTranspose kernel +- MaxPool1D kernel - Template for 1D Convolution - Support for float32 data type in the previous kernels - Float binding for Pad1D kernel @@ -326,7 +341,7 @@ This release containing major architectural changes, new platform support, enhan ### Changed - FloatConvTemplate file -- Platform.py file +- Platform.py file - Bump the CMake version to 3.24 as required for the chimera-sdk - Bump GVSoC's version and add chimera simulation target - Rename the generic source util to utils to avoid name collision with chimera-sdk diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py index c5f9c883af..41073ad646 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py @@ -155,7 +155,8 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - self.closureName = name + self.closureSuffix + # Prepend an underscore to avoid name issues when the name begins with a problematic character (like a number) + self.closureName = "_" + name + self.closureSuffix self.functionCall = executionBlock.generate(ctxt) self._generateClosureStruct(ctxt, executionBlock) ctxt = self._generateClosureCtxt(ctxt, name) diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index 4f6dba3827..c05ea3b9d9 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -87,11 +87,11 @@ class float64_t(FloatImmediate): SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (int8_t, int16_t, int32_t, int64_t) UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t) -IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (sorted(( - *SignedIntegerDataTypes, - *UnsignedIntegerDataTypes, -), - key = lambda _type: _type.typeWidth)) +IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = tuple( + sorted(( + *SignedIntegerDataTypes, + *UnsignedIntegerDataTypes, + ), key = lambda _type: _type.typeWidth)) FloatDataTypes: Tuple[Type[FloatImmediate], ...] 
= (bfloat16_t, float16_t, float32_t, float64_t) diff --git a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py index f07fe57c96..a8f27b5463 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Union - import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkContext, NetworkDeployer, ONNXLayer, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, DeploymentEngine, NetworkContext, NetworkDeployer, _NoVerbosity class NetworkDeployerWrapper(NetworkDeployer): @@ -68,8 +66,8 @@ def generateBufferAllocationCode(self) -> str: return self._innerObject.generateBufferAllocationCode() # MultiEngineDeployer augment - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: - return self._innerObject._mapNode(node) + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: + return self._innerObject._selectEngine(node) def _printMemorySummary(self): return self._innerObject._printMemorySummary() diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 8c2f5d2485..5ccfb7dcf7 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -325,7 +325,7 @@ def fromNode(cls, node: gs.Node): return (cls(name = node.name, shape = node.shape if not isinstance(node, gs.Constant) else node.values.shape)) def has_live_aliases(self, ctxt: NetworkContext) -> bool: - """Checks whether this VariableBuffer has any live ancestors, i.e. buffers that are still live and are aliased by this buffer. + """Checks whether this VariableBuffer has any live aliases, i.e. buffers that are still live and are aliased by this buffer. 
Parameters ---------- ctxt : NetworkContext @@ -333,7 +333,7 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: Returns ------- bool - True if this VariableBuffer has any live ancestors, False otherwise + True if this VariableBuffer has any live aliases, False otherwise """ # Do a breadth-first search across the aliasing double-linked list live = self._live @@ -2562,10 +2562,10 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): self.ctxt = layer.codeTransform(self.ctxt, verbose) self.transformed = True - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: for engine in self.Platform.engines: if node.op in engine.Mapping: - return engine.Mapping[node.op](node) + return engine raise RuntimeError(f"No mapping found for node {node.name} with op type {node.op}") def _bindLayers(self): @@ -2582,7 +2582,8 @@ def _bindLayers(self): flatSchedule += subGraph for node in flatSchedule: - layer = self._mapNode(node) + engine = self._selectEngine(node) + layer = engine.Mapping[node.op](node) if isinstance(layer, ONNXLayer): log.debug(f" {SUCCESS_MARK} Bind {node.name} to layer {layer.__class__.__name__}") self.layerBinding[layer.node.name] = layer diff --git a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py index 4b05ab5be4..570363b9a2 100644 --- a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py +++ b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py @@ -2,13 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Type, Union +from typing import Callable, Dict, Type import onnx_graphsurgeon as gs from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper -from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, Schedule, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentEngine, DeploymentPlatform, NetworkDeployer, Schedule, TopologyOptimizer from Deeploy.EngineExtension.OptimizationPasses.TopologyOptimizationPasses.EngineColoringPasses import \ EngineColoringPass, EngineMapper @@ -48,14 +48,14 @@ def lower(self, graph: gs.Graph) -> gs.Graph: ) == 0, f"Missing engine color for nodes {[node.name for node in uncoloredNodes]} with operations {uncoloredOperations}" return graph - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: assert "engine" in node.attrs, f"Node {node.name} doesn't have an engine color." engineName = node.attrs["engine"] assert isinstance(engineName, str) and engineName in self.engineDict, \ f"Node {node.name} has an invalid engine {engineName} assigned." 
engine = self.engineDict[engineName] assert node.op in engine.Mapping, f"No mapping found for {node.op} in engine {engine.name}" - return engine.Mapping[node.op](node) + return engine class EngineColoringDeployerWrapper(EngineColoringDeployer, NetworkDeployerWrapper): diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 9ff940b2f0..cc81527f32 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -9,13 +9,13 @@ from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration, MemoryPassthroughGeneration -from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t, \ - uint8_t +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ + int8_t, int32_t, int64_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ - GatherTemplate, QuantTemplate, RQSiGELUTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \ + FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ @@ -27,11 +27,11 @@ from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, \ - FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, \ - FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, \ - ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ - SliceTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ + FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ + FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \ + MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ + SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, 
\ PULPRequantShiftChecker @@ -148,16 +148,24 @@ PointerClass(uint8_t), PointerClass(uint8_t), PointerClass(uint8_t) - ], [PULPDMAFuture(underlyingType = type)]), SliceTemplate.referenceTemplate, MemoryAwareForkTransformer) + ], [PULPDMAFuture(underlyingType = type)]), DMASliceTemplate.referenceTemplate, MemoryAwareForkTransformer) for type in IntegerDataTypes ] +PULPSliceBindings = [ + NodeBinding( + SliceChecker([ + PointerClass(type), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t) + ], [PointerClass(type)]), SliceTemplate.referenceTemplate, ForkTransformer) for type in FloatDataTypes +] + PULPReshapeBindings = [ - NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int32_t)], [PointerClass(type)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes -] + [ - NodeBinding(ReshapeChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int64_t)], [PointerClass(type)]), + ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes ] PULPRQAddBindings = [ @@ -225,6 +233,14 @@ ForkTransformer) ] +PULPFloatDWConv2DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(float_type), PointerClass(float_type), + PointerClass(float_type)], [PointerClass(float_type)]), FloatConvTemplate.referenceDW2DIm2ColTemplate, + ForkTransformer) for float_type in FloatDataTypes +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -276,6 +292,11 @@ PULPReduceMeanBindings = [ NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), + FloatReduceMeanTemplate.referenceTemplate, ClusterTransformer) + for integer_type in SignedIntegerDataTypes + for float_type in FloatDataTypes ] PULPReduceSumBindings = [ diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py index 86bf02e578..bceea01f4d 100644 --- a/Deeploy/Targets/PULPOpen/Deployer.py +++ b/Deeploy/Targets/PULPOpen/Deployer.py @@ -15,6 +15,7 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import RQAddTransposeSquashPass _L3AllocTemplate = NodeTemplate(""" @@ -63,7 +64,15 @@ def __init__(self, self.extNameCount = 0 - def bind(self): + def annotateNCores(self) -> None: + for layer in self.layerBinding.values(): + node = layer.node + engine = self._selectEngine(node) + opRepr = layer.mapper.parser.operatorRepresentation + if isinstance(engine, PULPClusterEngine): + opRepr["n_cores"] = engine.n_cores + + def bind(self) -> bool: # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. 
# SCHEREMO: The BindingOptimizationPass system is fairly fragile; # it was designed this way because implementing further topology optimizations after # but if there is only very few cases, this solution is okay. autoTransposePass = AutoTransposeMergePass() #self.ctxt, self.layerBinding = autoTransposePass.apply(self.ctxt, self.graph, self.layerBinding) + + # LMACAN: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. + self.annotateNCores() + # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. - ret = super().bind() - if ret: - self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") - return ret + if not super().bind(): + return False + + self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") + return True def _l3ConstBuffer(self) -> List[VariableBuffer]: return [ diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index e94af6e420..ab99fcabc6 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -72,24 +72,24 @@ def parseNode(self, node: gs.Node) -> (bool): wellFormed = super().parseNode(node) if wellFormed: ret = all([ - # Make sure padding is square + # The current PULP kernel only supports a group size of 1 self.operatorRepresentation['group'] == 1, + + # Make sure padding is square self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], - len(node.inputs) == 2 + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], ]) - self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) - self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) - self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + # Extract additional attributes self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) - self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) return ret return False @@ -102,11 +102,86 @@ def parseNodeCtxt(self, newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) if ret: + # Set input names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + return newCtxt, True return ctxt, False +class PULPFPDWConv2DParser(Conv2DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): 
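+        # NOTE: whether the node is actually depthwise (group == ch_im_in) can only be verified once shapes are known, so that check happens in parseNodeCtxt below.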
+ # Parse the base conv 2D information via the parent parser + wellFormed = super().parseNode(node) + + if wellFormed: + # Check padding and input-count constraints + ret = all([ + # Make sure padding is square + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], + ]) + + # Extract additional attributes + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return ret + return False + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Parse node context for 2D conv + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + # Define input names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + # Map input nodes to operator representation + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + # Check that the convolution is depthwise: group must equal the number of input channels + if self.operatorRepresentation['group'] == self.operatorRepresentation['ch_im_in']: + return newCtxt, True + + return ctxt, False + + class PULPDWConv1DParser(RQSConv1DParser): def __init__(self, noBiasHoisting = True): super().__init__(noBiasHoisting) diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 99c1c93351..fc2ae8a1fa 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -5,6 +5,8 @@ import numpy as np import onnx_graphsurgeon as gs +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + RemoveEmptyConvBiasPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel @@ -27,10 +29,11 @@ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \ PULPDMASliceBindings, PULPDWConv1DBinding, PULPFloatDWConv2DBindings, PULPReduceMeanBindings, PULPSliceBindings from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ PULPTallGEMMParser 
from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ PULPConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUTilingReadyBindings, \ @@ -71,6 +74,7 @@ DWConv1DMapper = NodeMapper(PULPDWConv1DParser(), [PULPDWConv1DBinding]) FPConv2DMapper = NodeMapper(PULPFPConv2DParser(), PULPConv2DTilingReadyBindings) Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings) +FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), PULPFloatDWConv2DBindings) DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings) GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings) FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings) @@ -85,7 +89,9 @@ ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) -SliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) +DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) + +SliceMapper = NodeMapper(SliceParser(), PULPSliceBindings) iRMSNormMapper = NodeMapper(iRMSNormParser(), PULPiRMSNormTilingReadyBindings) @@ -99,7 +105,7 @@ DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) PULPMapping = { - 'Conv': ConvLayer([FPConv2DMapper]), + 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]), @@ -125,7 +131,7 @@ 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 'Transpose': TransposeLayer([TransposeMapper]), 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), - 'Slice': SliceLayer([SliceMapper]), + 'Slice': SliceLayer([SliceMapper, DMASliceMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Concat': ConcatLayer([ConcatMapper]), 'iRMSNorm': iRMSNormLayer([iRMSNormMapper]), @@ -225,7 +231,8 @@ class PULPStructBuffer(StructBuffer): MergeConstAddAndRequantPass(), PULPGEMMRequantMergePass(), PULPMatMulRequantMergePass(), - PULPAddRequantMergePass() + PULPAddRequantMergePass(), + RemoveEmptyConvBiasPass(), ], name = "PULPOptimizer") @@ -237,8 +244,14 @@ class PULPStructBuffer(StructBuffer): class PULPClusterEngine(DeploymentEngine): - def __init__(self, name: str, Mapping = PULPMapping, initCode = "", includeList = _includeList) -> None: + def __init__(self, + name: str, + Mapping = PULPMapping, + initCode = "", + includeList = _includeList, + n_cores: int = 8) -> None: super().__init__(name, Mapping, initCode, includeList) + self.n_cores = n_cores class PULPPlatform(DeploymentPlatform): diff --git a/Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py b/Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py similarity index 100% rename from Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py rename to Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py index 7f1c2e21c6..200ad1b9ea 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -6,14 +6,14 @@ referenceTemplate = NodeTemplate(""" // Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t 
${nodeName}_log2Core = log2(NUM_CORES); -int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); -int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); -int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); uint32_t i = ${nodeName}_chunk_start; -for (; i+5 < ${nodeName}_chunk_stop; i+=6) { +for (; i + 5 < ${nodeName}_chunk_stop; i += 6) { ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; ${data_out}[i+1] = ${data_in_1}[i+1] + ${data_in_2}[i+1]; ${data_out}[i+2] = ${data_in_1}[i+2] + ${data_in_2}[i+2]; diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 29a216d728..bfa893db94 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -18,9 +18,13 @@ def __init__(self, templateStr): def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: - im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * - operatorRepresentation['dim_kernel_y']) + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // + 8) * operatorRepresentation["n_cores"] * operatorRepresentation[ + 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] def hoistTransientBuffers(self, ctxt: NetworkContext, @@ -34,6 +38,39 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [im2col_name] +class PULP2DFloatDWConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. 
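+        # Each core gets a private patch of dim_kernel_x * dim_kernel_y elements, so the total byte size is the element width (typeWidth // 8) times n_cores times the kernel area.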
+ im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[ + "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + # Manually set the type of the im2col buffer to match the input type, since it defaults to void for transient buffers + ctxt.lookup(im2col_name)._type.referencedType = ctxt.lookup( + operatorRepresentation['data_in'])._type.referencedType + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + reference2DTemplate = NodeTemplate(""" // 2D FP Conv HWC with ChannelOut parallelism (Name: ${nodeName}, Op: ${nodeOp}) @@ -47,6 +84,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${stride_y}, ${stride_x}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} ); @@ -66,15 +104,48 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( ref_${data_out}_${data_in}, - ${dim_im_in_y}, ${dim_im_in_x}, + ${dim_im_in_y}, ${ch_im_in}, ${weight}, ${ch_im_out}, - ${dim_kernel_y}, ${dim_kernel_x}, + ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${ctxtBuffer} + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} +""") + +referenceDW2DIm2ColTemplate = PULP2DFloatDWConvIm2ColTemplate(""" +// 2D DW FP Conv HWC with Im2Col and ChannelOut parallelism (Name: ${nodeName}, Op: ${nodeOp}) + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_DW_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_x}, + ${dim_im_in_y}, + ${ch_im_in}, + ${weight}, + ${ch_im_out}, + ${dim_kernel_x}, + ${dim_kernel_y}, ${stride_x}, + ${stride_y}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, @@ -86,4 +157,4 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index f4c22b2c22..d007e60df0 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ 
b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -24,9 +24,18 @@ ${transB} ); + % if A_batched: ref_${data_out}_${A} += ${M} * ${N}; + % endif + + % if B_batched: ref_${data_out}_${B} += ${N} * ${O}; + % endif + + % if C_batched: ref_${data_out}_${C} += ${M} * ${O}; + % endif + ref_${data_out}_${data_out} += ${M} * ${O}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py index 11b7c9aa2a..3cdf26097b 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py @@ -8,8 +8,18 @@ // Matmul with row parallelism (Name: ${nodeName}, Op: ${nodeOp}) for(uint32_t b=0; b<${batch}; b++) { + % if A_batched: ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + % else: + ${A_type.typeName} batch_A = ${A}; + % endif + + % if B_batched: ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + % else: + ${B_type.typeName} batch_B = ${B}; + % endif + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; PULP_MatMul_fp32_fp32_fp32_unroll1x7( diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index 2f202b24d2..ced6c3cbcf 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -7,11 +7,11 @@ referenceTemplate = NodeTemplate(""" // Float Mul with parallelism and 6x unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); +uint32_t ${nodeName}_core_id = pi_core_id(); +uint32_t ${nodeName}_log2Core = (uint32_t) log2(NUM_CORES); uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1)) != 0); -uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${size}); -uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${size}); +uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size}); if (${nodeName}_start < ${nodeName}_end) { float32_t ${nodeName}_scalar = ${B}[0]; diff --git a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py index 41c4b5366c..a795a555ed 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py @@ -25,10 +25,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer +from Deeploy.Targets.Generic.Templates.ReshapeTemplate import _ReshapeTemplate as _GenericReshapeTemplate -class _ReshapeTemplate(NodeTemplate): +class _ReshapeTemplate(_GenericReshapeTemplate): def __init__(self, templateStr): super().__init__(templateStr) @@ -36,19 +37,18 @@ def __init__(self, templateStr): def alignToContext(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - # SCHEREMO: Selectively mark 'indices' dead, since we don't need them - if 'indices' in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False - ctxt.globalObjects[operatorRepresentation['indices']]._live = False + ctxt, 
operatorRepresentation, _ = super().alignToContext(ctxt, operatorRepresentation) - # Same for "shape" - if "shape" in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False - ctxt.globalObjects[operatorRepresentation["shape"]]._live = False + # Get buffers + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) - inBuffer = ctxt.lookup(operatorRepresentation['data_in']) - outBuffer = ctxt.lookup(operatorRepresentation['data_out']) - outBuffer._alias = inBuffer.name + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # HACK: Tiling wasn't updated in the "Fix aliasing" PR, so we still have to + # set the _alias attribute + bufferOut._alias = bufferIn.name return ctxt, operatorRepresentation, [] diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz index a98a6c33b9..36567a96ce 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx index ae1b3ac939..5eb3ae446e 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz index a5d4b6e974..0e2e55fcfe 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz differ diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index 24f0638f21..cf8acf05db 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -20,7 +20,7 @@ from Deeploy.DeeployTypes import _NoVerbosity from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.Targets.CortexM.Platform import CMSISPlatform -from Deeploy.Targets.PULPOpen.Platform import PULPPlatform +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPPlatform def generateNetwork(args): @@ -84,6 +84,10 @@ def generateNetwork(args): platform, signProp = mapPlatform(args.platform) + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + inputTypes = {} inputOffsets = {} @@ -183,6 +187,13 @@ def generateNetwork(args): 'If not specified, offsets are set to 0. ' 'Example: --input-offset-map input_0=0 input_1=128 ...') parser.add_argument('--shouldFail', action = 'store_true') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently required for im2col buffer sizing on Siracusa. 
Default: 1.", + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 013f854daa..4b1ebef20b 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -26,6 +26,7 @@ from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ AnnotateIOMemoryLevel, AnnotateNeurekaWeightMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper @@ -76,6 +77,10 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.enableStrides: platform.engines[0].enableStrides = True + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + for index, num in enumerate(test_inputs): _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type @@ -195,6 +200,13 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg parser.add_argument('--plotMemAlloc', action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1." + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 7d1f7f312a..a3329ebf73 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -342,6 +342,10 @@ def generate_test(self): generation_script = "generateNetwork.py" command = f"python {generation_script} -d {self._dir_gen} -t {self._dir_test} -p {self._platform} {self.gen_args}" + + if self._platform in ["Siracusa", "Siracusa_w_neureka"]: + command += f" --cores={self._args.cores}" + command += self._argument_parser.generate_cmd_args() log.debug(f"[TestRunner] Generation Command: {command}") diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index f5382a339b..3ebab54a0b 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -9,20 +9,30 @@ #include "DeeployPULPMath.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right); +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t 
pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer); + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); #endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c b/TargetLibraries/PULPOpen/src/Convolution_fp32.c index c33ac31e88..af21293233 100644 --- a/TargetLibraries/PULPOpen/src/Convolution_fp32.c +++ b/TargetLibraries/PULPOpen/src/Convolution_fp32.c @@ -7,18 +7,19 @@ #include "DeeployPULPMath.h" #include "pmsis.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right) { +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) { + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -29,37 +30,72 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; - for (uint32_t h = 0; h < H_out; ++h) { - for (uint32_t w = 0; w < W_out; ++w) { - for (uint32_t f = 0; f < ch_out_count; ++f) { - float32_t sum = 0.0f; + // Compute the output + if (has_bias) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; - for (uint32_t p = 0; p < P; ++p) { - for (uint32_t q = 0; q < Q; ++q) { - for (uint32_t c = 0; c < C; ++c) { - int32_t h_in = h * SP + p - pad_top; - int32_t w_in = w * SQ + q - pad_left; + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; - if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || - w_in >= (int32_t)W) { - continue; - } + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } - uint32_t input_idx = (h_in * W + w_in) * 
C + c; - uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; - sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } } } + + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum + pSrcBias[f + ch_out_start]; } + } + } + } else { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; + + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; + + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } + + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } + } + } - uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); - pDstC[output_idx] = sum; + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum; + } } } } @@ -68,12 +104,17 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer) { + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -84,50 +125,95 @@ void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; uint32_t im2col_size_per_core = C * P * Q; float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; uint32_t kernel_size = P * Q * C; - for (uint32_t h_out = 0; h_out < H_out; h_out++) { - for (uint32_t w_out = 0; w_out < W_out; w_out++) { - int32_t h_in_start = h_out * SP - pad_top; - int32_t w_in_start = w_out * SQ - pad_left; + // Compute the output + if (has_bias) { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C 
+ q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } + } + } + } + + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t p = 0; p < P; p++) { - int32_t h_in = h_in_start + p; + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - for (uint32_t q = 0; q < Q; q++) { - int32_t w_in = w_in_start + q; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; - for (uint32_t c = 0; c < C; c++) { - if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && - w_in < (int32_t)W) { - uint32_t in_idx = (h_in * W + w_in) * C + c; - im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; - } else { - im2col_buffer[p * Q * C + q * C + c] = 0.0f; + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } else { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } } } } - } - for (uint32_t f = 0; f < ch_out_count; f++) { - float32_t sum = 0.0f; - const float32_t *local_weight_ptr = weight_ptr + f * kernel_size; + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t k = 0; k < kernel_size; k++) { - sum += im2col_buffer[k] * local_weight_ptr[k]; - } + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - uint32_t out_idx = - (h_out * W_out + w_out) * F_total + (ch_out_start + f); - pDstC[out_idx] = sum; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + pDstC[out_idx] = sum; + } } } } -} \ No newline at end of file +} diff --git a/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c new file mode 100644 index 0000000000..b0a06c66eb --- /dev/null +++ b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c @@ -0,0 +1,251 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core information + int8_t core_id = pi_core_id(); + int8_t log2Core = log2(NUM_CORES); + + // Compute the chunk size for each core + // (Splitting work along the output channels) + uint16_t ch_out_chunk = + (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); + uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); + uint16_t ch_out_stop = MIN(ch_out_start + 
ch_out_chunk, F_total); + uint16_t ch_out_count = ch_out_stop - ch_out_start; + + // If there is no output channel to process, return + // (when F_total < NUM_CORES and this core's id is >= F_total) + if (ch_out_count == 0) { + return; + } + + // Move pointer of the weights for the current core + const float32_t *weight_ptr = pSrcB + ch_out_start * P * Q; + + // Move pointer of the im2col buffer for the current core + uint32_t im2col_size_per_core = P * Q; + float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + + // Compute the output dimensions + uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + uint32_t kernel_size = P * Q * F_total; + + // Compute the output + if (has_bias) { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored contiguously in memory) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // i.e. the depth multiplier; for a depthwise conv the ONNX "group" + // attribute equals the input channel count C) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // 
Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } + } else { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored contiguously in memory) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // i.e. the depth multiplier; for a depthwise conv the ONNX "group" + // attribute equals the input channel count C) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * 
weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum; + } + } + } + } + } + + return; +}
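For illustration, a minimal host-side sketch of how the new depthwise kernel could be launched on the PULP cluster. Only the PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC signature and the per-core im2col scratch sizing (NUM_CORES * P * Q elements) come from this diff; the shapes, the argument struct, and the entry function are hypothetical. Because the kernel derives its output-channel slice from pi_core_id(), it must run on every cluster core, e.g. through a PMSIS team fork:

#include <stdbool.h>

#include "DeeployPULPMath.h"
#include "pmsis.h"

// Hypothetical shapes: 8x8 input, C = 4 channels, depthwise 3x3 (group == C,
// depth multiplier 1, so F_total == 4), stride 1, 1-pixel padding all around.
typedef struct {
  const float32_t *input;   // HWC layout, H * W * C elements
  const float32_t *weights; // F_total * P * Q elements
  const float32_t *bias;    // F_total elements
  float32_t *output;        // H_out * W_out * F_total elements, HWC layout
  float32_t *scratch;       // NUM_CORES * P * Q elements, per-core im2col
} dw_conv_args_t;

static void dw_conv_entry(void *arg) {
  dw_conv_args_t *a = (dw_conv_args_t *)arg;
  // Each core computes its own slice of the output channels and returns
  // early when F_total < NUM_CORES leaves it without work.
  PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC(
      a->input, /*H=*/8, /*W=*/8, /*C=*/4, a->weights, /*F_total=*/4,
      /*P=*/3, /*Q=*/3, /*SP=*/1, /*SQ=*/1, a->bias, /*has_bias=*/true,
      a->output, /*pad_top=*/1, /*pad_bottom=*/1, /*pad_left=*/1,
      /*pad_right=*/1, a->scratch);
}

static void run_dw_conv_on_cluster(dw_conv_args_t *args) {
  // Fork the kernel onto all cluster cores (PMSIS cluster-team API).
  pi_cl_team_fork(NUM_CORES, dw_conv_entry, args);
}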