diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7c6a5f754..f59f7fa88 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -53,7 +53,15 @@ jobs: testBacktracking testFloatAdder testFloatGEMM + testFloat2DConvolution + testFloat2DConvolutionBias + testFloat2DConvolutionZeroBias + + testFloat2DDWConvolution + testFloat2DDWConvolutionBias + testFloat2DDWConvolutionZeroBias + testFloatLayerNorm testFloatRelu testFloatMaxPool @@ -64,6 +72,7 @@ jobs: Quant Dequant testFloatReduceSum + testFloatReshapeWithSkipConnection testFloatSoftmaxGrad testFloatSoftmaxCrossEntropy testFloatSoftmaxCrossEntropyGrad @@ -87,4 +96,5 @@ jobs: CCT/CCT_1_16_16_8 CCT/CCT_2_32_32_128_Opset20 testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8 + testFloatDemoTinyViT num-cores: 8 diff --git a/CHANGELOG.md b/CHANGELOG.md index faf4de42c..fc7269587 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- TinyViT on non-tiled Siracusa [#117](https://github.com/pulp-platform/Deeploy/pull/117) - Support Fully Asynchronous DMAs [#114](https://github.com/pulp-platform/Deeploy/pull/114) - Disallow shape inference [#128](https://github.com/pulp-platform/Deeploy/pull/128) - Remove memory-aware node bindings [#123](https://github.com/pulp-platform/Deeploy/pull/123) @@ -24,6 +25,13 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Fix bias hoisting in generic GEMM with no bias [#126](https://github.com/pulp-platform/Deeploy/pull/126) ### Added +- PULP 2D FP DW conv Im2Col template and kernel, with bias support. +- Bias support for PULP 2D FP regular conv Im2Col in template & kernel. +- PULP FP DW conv 2D parser. 
+ - FP conv 2D (simple & DW), reshape & skip connection, and TinyViT demo tests to the non-tiled Siracusa CI pipeline. +- FP bindings and mappings for PULP slice, DW conv 2D, and reduce mean operations. +- FP PULP DW conv lowering optimization pass similar to the existing one for the integer version. +- RemoveEmptyConvBiasPass to the PULP optimizer. - Add manual type inference feature (CLI: `--input-type-map`/`--input-offset-map`) to resolve ambiguities when test inputs are not representative enough - Added a `testTypeInferenceDifferentTypes` test case to validate type inference for different input types - Added `_mangleNodeNames` function to avoid duplicate node mappings @@ -60,6 +68,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` ### Changed +- Reduced size of reshape & skip connection test, for non-tiled Siracusa memory compatibility. - Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. - mchan HAL is now reduced to bare-bones - refactor of the IntrospectiveCodeTransformation to work on the Mako template @@ -97,6 +106,9 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode ### Fixed +- Fixed bug for non-batched elements in the PULPOpen FP GEMM and matmul templates. +- Added underscore to the beginning of closure names to avoid naming issues when they start with unsupported first characters (like numbers). +- Data types in the PULPOpen FP add and mul templates. - Prevent node duplication for graphs generated via GraphSurgeon - Resolved issue with missing `id` in the `Build Cache for Docker` step, used in the `Inject build-cache` step. 
- Fix license CI check and prevent potential issues with `jq` installation @@ -185,9 +197,9 @@ This release containing major architectural changes, new platform support, enhan ### Added -- BatchNorm kernel -- ConvTranspose kernel -- MaxPool1D kernel +- BatchNorm kernel +- ConvTranspose kernel +- MaxPool1D kernel - Template for 1D Convolution - Support for float32 data type in the previous kernels - Float binding for Pad1D kernel @@ -326,7 +338,7 @@ This release containing major architectural changes, new platform support, enhan ### Changed - FloatConvTemplate file -- Platform.py file +- Platform.py file - Bump the CMake version to 3.24 as required for the chimera-sdk - Bump GVSoC's version and add chimera simulation target - Rename the generic source util to utils to avoid name collision with chimera-sdk diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py index c5f9c883a..41073ad64 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py @@ -155,7 +155,8 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - self.closureName = name + self.closureSuffix + # Prepend underscore to avoid name issues when beginning with problematic characters (like numbers) + self.closureName = "_" + name + self.closureSuffix self.functionCall = executionBlock.generate(ctxt) self._generateClosureStruct(ctxt, executionBlock) ctxt = self._generateClosureCtxt(ctxt, name) diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index 4f6dba382..c05ea3b9d 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -87,11 +87,11 @@ class float64_t(FloatImmediate): SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] 
= (int8_t, int16_t, int32_t, int64_t) UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t) -IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (sorted(( - *SignedIntegerDataTypes, - *UnsignedIntegerDataTypes, -), - key = lambda _type: _type.typeWidth)) +IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = tuple( + sorted(( + *SignedIntegerDataTypes, + *UnsignedIntegerDataTypes, + ), key = lambda _type: _type.typeWidth)) FloatDataTypes: Tuple[Type[FloatImmediate], ...] = (bfloat16_t, float16_t, float32_t, float64_t) diff --git a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py index f07fe57c9..a8f27b546 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Union - import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkContext, NetworkDeployer, ONNXLayer, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, DeploymentEngine, NetworkContext, NetworkDeployer, _NoVerbosity class NetworkDeployerWrapper(NetworkDeployer): @@ -68,8 +66,8 @@ def generateBufferAllocationCode(self) -> str: return self._innerObject.generateBufferAllocationCode() # MultiEngineDeployer augment - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: - return self._innerObject._mapNode(node) + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: + return self._innerObject._selectEngine(node) def _printMemorySummary(self): return self._innerObject._printMemorySummary() diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 8c2f5d248..5ccfb7dcf 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -325,7 +325,7 @@ def fromNode(cls, node: gs.Node): return (cls(name = node.name, shape = 
node.shape if not isinstance(node, gs.Constant) else node.values.shape)) def has_live_aliases(self, ctxt: NetworkContext) -> bool: - """Checks whether this VariableBuffer has any live ancestors, i.e. buffers that are still live and are aliased by this buffer. + """Checks whether this VariableBuffer has any live aliases, i.e. buffers that are still live and are aliased by this buffer. Parameters ---------- ctxt : NetworkContext @@ -333,7 +333,7 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: Returns ------- bool - True if this VariableBuffer has any live ancestors, False otherwise + True if this VariableBuffer has any live aliases, False otherwise """ # Do a breadth-first search across the aliasing double-linked list live = self._live @@ -2562,10 +2562,10 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): self.ctxt = layer.codeTransform(self.ctxt, verbose) self.transformed = True - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: for engine in self.Platform.engines: if node.op in engine.Mapping: - return engine.Mapping[node.op](node) + return engine raise RuntimeError(f"No mapping found for node {node.name} with op type {node.op}") def _bindLayers(self): @@ -2582,7 +2582,8 @@ def _bindLayers(self): flatSchedule += subGraph for node in flatSchedule: - layer = self._mapNode(node) + engine = self._selectEngine(node) + layer = engine.Mapping[node.op](node) if isinstance(layer, ONNXLayer): log.debug(f" {SUCCESS_MARK} Bind {node.name} to layer {layer.__class__.__name__}") self.layerBinding[layer.node.name] = layer diff --git a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py index 4b05ab5be..570363b9a 100644 --- a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py +++ b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py @@ -2,13 +2,13 @@ # # 
SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Type, Union +from typing import Callable, Dict, Type import onnx_graphsurgeon as gs from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper -from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, Schedule, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentEngine, DeploymentPlatform, NetworkDeployer, Schedule, TopologyOptimizer from Deeploy.EngineExtension.OptimizationPasses.TopologyOptimizationPasses.EngineColoringPasses import \ EngineColoringPass, EngineMapper @@ -48,14 +48,14 @@ def lower(self, graph: gs.Graph) -> gs.Graph: ) == 0, f"Missing engine color for nodes {[node.name for node in uncoloredNodes]} with operations {uncoloredOperations}" return graph - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: assert "engine" in node.attrs, f"Node {node.name} doesn't have an engine color." engineName = node.attrs["engine"] assert isinstance(engineName, str) and engineName in self.engineDict, \ f"Node {node.name} has an invalid engine {engineName} assigned." engine = self.engineDict[engineName] assert node.op in engine.Mapping, f"No mapping found for {node.op} in engine {engine.name}" - return engine.Mapping[node.op](node) + return engine class EngineColoringDeployerWrapper(EngineColoringDeployer, NetworkDeployerWrapper): diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py index 005b0b889..e4d164f6a 100644 --- a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py @@ -8,6 +8,7 @@ class _FloatReduceMeanTemplate(NodeTemplate): + # WARNING: Currently only supports single axis reducing! 
def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py index 93d884eb8..67a476ca6 100644 --- a/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py @@ -8,6 +8,7 @@ class _ReduceMeanTemplate(NodeTemplate): + # WARNING: Currently only supports single axis reducing! def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/Generic/Templates/SliceTemplate.py b/Deeploy/Targets/Generic/Templates/SliceTemplate.py index 3ffaa4621..5797c9ba6 100644 --- a/Deeploy/Targets/Generic/Templates/SliceTemplate.py +++ b/Deeploy/Targets/Generic/Templates/SliceTemplate.py @@ -10,6 +10,7 @@ class _SliceTemplate(NodeTemplate): + # WARNING: Currently only supports single axis slicing! def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 9ff940b2f..cc81527f3 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -9,13 +9,13 @@ from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration, MemoryPassthroughGeneration -from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t, \ - uint8_t +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ + int8_t, int32_t, int64_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation 
import FutureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ - GatherTemplate, QuantTemplate, RQSiGELUTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \ + FloatReduceSumTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ @@ -27,11 +27,11 @@ from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, \ - FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, \ - FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, \ - ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ - SliceTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ + FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ + FloatMulTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \ + MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, \ + 
SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker @@ -148,16 +148,24 @@ PointerClass(uint8_t), PointerClass(uint8_t), PointerClass(uint8_t) - ], [PULPDMAFuture(underlyingType = type)]), SliceTemplate.referenceTemplate, MemoryAwareForkTransformer) + ], [PULPDMAFuture(underlyingType = type)]), DMASliceTemplate.referenceTemplate, MemoryAwareForkTransformer) for type in IntegerDataTypes ] +PULPSliceBindings = [ + NodeBinding( + SliceChecker([ + PointerClass(type), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t), + PointerClass(uint8_t) + ], [PointerClass(type)]), SliceTemplate.referenceTemplate, ForkTransformer) for type in FloatDataTypes +] + PULPReshapeBindings = [ - NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int32_t)], [PointerClass(type)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes -] + [ - NodeBinding(ReshapeChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), - ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int64_t)], [PointerClass(type)]), + ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes ] PULPRQAddBindings = [ @@ -225,6 +233,14 @@ ForkTransformer) ] +PULPFloatDWConv2DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(float_type), PointerClass(float_type), + PointerClass(float_type)], [PointerClass(float_type)]), FloatConvTemplate.referenceDW2DIm2ColTemplate, + ForkTransformer) for float_type in FloatDataTypes +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -276,6 +292,11 @@ PULPReduceMeanBindings = [ 
NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), + FloatReduceMeanTemplate.referenceTemplate, ClusterTransformer) + for integer_type in SignedIntegerDataTypes + for float_type in FloatDataTypes ] PULPReduceSumBindings = [ diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py index 86bf02e57..bceea01f4 100644 --- a/Deeploy/Targets/PULPOpen/Deployer.py +++ b/Deeploy/Targets/PULPOpen/Deployer.py @@ -15,6 +15,7 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import RQAddTransposeSquashPass _L3AllocTemplate = NodeTemplate(""" @@ -63,7 +64,15 @@ def __init__(self, self.extNameCount = 0 - def bind(self): + def annotateNCores(self) -> None: + for layer in self.layerBinding.values(): + node = layer.node + engine = self._selectEngine(node) + opRepr = layer.mapper.parser.operatorRepresentation + if isinstance(engine, PULPClusterEngine): + opRepr["n_cores"] = engine.n_cores + + def bind(self) -> bool: # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. # SCHEREMO: The BindingOptimizationPass system is fairly fragile; # it was designed this way because implementing further topology optimizations after @@ -71,11 +80,16 @@ def bind(self): # but if there is only very few cases, this solution is okay. 
autoTransposePass = AutoTransposeMergePass() #self.ctxt, self.layerBinding = autoTransposePass.apply(self.ctxt, self.graph, self.layerBinding) + + # LMACAN: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. + self.annotateNCores() + # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. - ret = super().bind() - if ret: - self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") - return ret + if not super().bind(): + return False + + self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") + return True def _l3ConstBuffer(self) -> List[VariableBuffer]: return [ diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index e94af6e42..ab99fcabc 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -72,24 +72,24 @@ def parseNode(self, node: gs.Node) -> (bool): wellFormed = super().parseNode(node) if wellFormed: ret = all([ - # Make sure padding is square + # Current PULP kernel only supports grouping of 1 self.operatorRepresentation['group'] == 1, + + # Make sure padding is square self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], - len(node.inputs) == 2 + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], ]) - self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) - self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) - self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + # Extract additional attributes 
self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) - self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) return ret return False @@ -102,11 +102,86 @@ def parseNodeCtxt(self, newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) if ret: + # Set inputs names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + return newCtxt, True return ctxt, False +class PULPFPDWConv2DParser(Conv2DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + # Parse root conv 2D information + wellFormed = super().parseNode(node) + + if wellFormed: + # Check if the node is a depthwise convolution + ret = all([ + # Make sure padding is square + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], + ]) + + # Extract additional attributes + self.operatorRepresentation['padding_y_top'] = 
int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return ret + return False + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Parse node context for 2D conv + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + # Define input names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + # Map input nodes to operator representation + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + # Check if DW + if self.operatorRepresentation['group'] == self.operatorRepresentation['ch_im_in']: + return newCtxt, True + + return ctxt, False + + class PULPDWConv1DParser(RQSConv1DParser): def __init__(self, noBiasHoisting = True): diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 99c1c9335..133670da0 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -5,6 +5,8 @@ import numpy as np import onnx_graphsurgeon as gs +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + RemoveEmptyConvBiasPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, 
MemoryLevel @@ -27,20 +29,22 @@ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \ - PULPDMASliceBindings, PULPDWConv1DBinding, PULPReduceMeanBindings + PULPDMASliceBindings, PULPDWConv1DBinding from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ - PULPDWConv2DParser, PULPFPConv2DParser, PULPGEMMParser, PULPMatrixVecParser, PULPTallGEMMParser + PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ + PULPTallGEMMParser from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ - PULPConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUTilingReadyBindings, \ - PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, \ - PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \ - PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ + PULPFPGELUTilingReadyBindings, 
PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, \ + PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, \ + PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \ + PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \ + PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, \ + PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, \ + PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, \ + PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ @@ -61,7 +65,7 @@ RequantShiftMapper = NodeMapper(RequantShiftParser(), PULPRQSTilingReadyBindings) UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), PULPUniformRQSTilingReadyBindings) -ReduceMeanMapper = NodeMapper(ReduceMeanParser(), PULPReduceMeanBindings) +ReduceMeanMapper = NodeMapper(ReduceMeanParser(), PULPReduceMeanTilingReadyBindings) ReduceSumMapper = NodeMapper(ReduceSumParser(), PULPReduceSumTilingReadyBindings) MatMulMapper = NodeMapper(MatMulParser(), PULPMatMulTilingReadyBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) @@ -71,6 +75,7 @@ DWConv1DMapper = NodeMapper(PULPDWConv1DParser(), [PULPDWConv1DBinding]) FPConv2DMapper = NodeMapper(PULPFPConv2DParser(), PULPConv2DTilingReadyBindings) Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings) +FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), 
PULPDWConv2DTilingReadyBindings) DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings) GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings) FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings) @@ -85,7 +90,9 @@ ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) -SliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) +DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) + +SliceMapper = NodeMapper(SliceParser(), PULPSliceTilingReadyBindings) iRMSNormMapper = NodeMapper(iRMSNormParser(), PULPiRMSNormTilingReadyBindings) @@ -99,7 +106,7 @@ DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) PULPMapping = { - 'Conv': ConvLayer([FPConv2DMapper]), + 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]), @@ -125,7 +132,7 @@ 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 'Transpose': TransposeLayer([TransposeMapper]), 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), - 'Slice': SliceLayer([SliceMapper]), + 'Slice': SliceLayer([SliceMapper, DMASliceMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Concat': ConcatLayer([ConcatMapper]), 'iRMSNorm': iRMSNormLayer([iRMSNormMapper]), @@ -225,7 +232,8 @@ class PULPStructBuffer(StructBuffer): MergeConstAddAndRequantPass(), PULPGEMMRequantMergePass(), PULPMatMulRequantMergePass(), - PULPAddRequantMergePass() + PULPAddRequantMergePass(), + RemoveEmptyConvBiasPass(), ], name = "PULPOptimizer") @@ -237,8 +245,14 @@ class PULPStructBuffer(StructBuffer): class PULPClusterEngine(DeploymentEngine): - def __init__(self, name: str, Mapping = PULPMapping, initCode = "", includeList = _includeList) -> None: + def 
__init__(self, + name: str, + Mapping = PULPMapping, + initCode = "", + includeList = _includeList, + n_cores: int = 8) -> None: super().__init__(name, Mapping, initCode, includeList) + self.n_cores = n_cores class PULPPlatform(DeploymentPlatform): diff --git a/Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py b/Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py similarity index 100% rename from Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py rename to Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py index 7f1c2e21c..200ad1b9e 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -6,14 +6,14 @@ referenceTemplate = NodeTemplate(""" // Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); -int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); -int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); -int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); uint32_t i = ${nodeName}_chunk_start; -for (; i+5 < ${nodeName}_chunk_stop; i+=6) { +for (; i + 5 < ${nodeName}_chunk_stop; i += 6) { ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; ${data_out}[i+1] = ${data_in_1}[i+1] + ${data_in_2}[i+1]; ${data_out}[i+2] = 
${data_in_1}[i+2] + ${data_in_2}[i+2]; diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 29a216d72..bfa893db9 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -18,9 +18,13 @@ def __init__(self, templateStr): def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: - im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * - operatorRepresentation['dim_kernel_y']) + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // + 8) * operatorRepresentation["n_cores"] * operatorRepresentation[ + 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] def hoistTransientBuffers(self, ctxt: NetworkContext, @@ -34,6 +38,39 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [im2col_name] +class PULP2DFloatDWConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. 
+ im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[ + "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + # Manually set the type of the im2col buffer to match the input type, since it defaults to void for transient buffers + ctxt.lookup(im2col_name)._type.referencedType = ctxt.lookup( + operatorRepresentation['data_in'])._type.referencedType + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + reference2DTemplate = NodeTemplate(""" // 2D FP Conv HWC with ChannelOut parallelism (Name: ${nodeName}, Op: ${nodeOp}) @@ -47,6 +84,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${stride_y}, ${stride_x}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} ); @@ -66,15 +104,48 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( ref_${data_out}_${data_in}, - ${dim_im_in_y}, ${dim_im_in_x}, + ${dim_im_in_y}, ${ch_im_in}, ${weight}, ${ch_im_out}, - ${dim_kernel_y}, ${dim_kernel_x}, + ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${padding_y_top}, + 
${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${ctxtBuffer} + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} +""") + +referenceDW2DIm2ColTemplate = PULP2DFloatDWConvIm2ColTemplate(""" +// 2D DW FP Conv HWC with Im2Col and ChannelOout parallelism (Name: ${nodeName}, Op: ${nodeOp}) + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_DW_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_x}, + ${dim_im_in_y}, + ${ch_im_in}, + ${weight}, + ${ch_im_out}, + ${dim_kernel_x}, + ${dim_kernel_y}, ${stride_x}, + ${stride_y}, + ${bias}, ${has_bias}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, @@ -86,4 +157,4 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index f4c22b2c2..d007e60df 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -24,9 +24,18 @@ ${transB} ); + % if A_batched: ref_${data_out}_${A} += ${M} * ${N}; + % endif + + % if B_batched: ref_${data_out}_${B} += ${N} * ${O}; + % endif + + % if C_batched: ref_${data_out}_${C} += ${M} * ${O}; + % endif + ref_${data_out}_${data_out} += ${M} * ${O}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py 
b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py index 11b7c9aa2..3cdf26097 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py @@ -8,8 +8,18 @@ // Matmul with row parallelism (Name: ${nodeName}, Op: ${nodeOp}) for(uint32_t b=0; b<${batch}; b++) { + % if A_batched: ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + % else: + ${A_type.typeName} batch_A = ${A}; + % endif + + % if B_batched: ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + % else: + ${B_type.typeName} batch_B = ${B}; + % endif + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; PULP_MatMul_fp32_fp32_fp32_unroll1x7( diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index 2f202b24d..ced6c3cbc 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -7,11 +7,11 @@ referenceTemplate = NodeTemplate(""" // Float Mul with parallelism and 6x unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); +uint32_t ${nodeName}_core_id = pi_core_id(); +uint32_t ${nodeName}_log2Core = (uint32_t) log2(NUM_CORES); uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1)) != 0); -uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${size}); -uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${size}); +uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size}); if (${nodeName}_start < ${nodeName}_end) { float32_t ${nodeName}_scalar = ${B}[0]; diff --git a/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py index 
849f68eef..9dcea4256 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py @@ -8,6 +8,7 @@ class _ReduceMeanTemplate(NodeTemplate): + # WARNING: Currently only supports single axis reducing! def __init__(self, templateStr): super().__init__(templateStr) diff --git a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py index 41c4b5366..a795a555e 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py @@ -25,10 +25,11 @@ from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer +from Deeploy.Targets.Generic.Templates.ReshapeTemplate import _ReshapeTemplate as _GenericReshapeTemplate -class _ReshapeTemplate(NodeTemplate): +class _ReshapeTemplate(_GenericReshapeTemplate): def __init__(self, templateStr): super().__init__(templateStr) @@ -36,19 +37,18 @@ def __init__(self, templateStr): def alignToContext(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - # SCHEREMO: Selectively mark 'indices' dead, since we don't need them - if 'indices' in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation['indices']]._deploy = False - ctxt.globalObjects[operatorRepresentation['indices']]._live = False + ctxt, operatorRepresentation, _ = super().alignToContext(ctxt, operatorRepresentation) - # Same for "shape" - if "shape" in operatorRepresentation.keys(): - ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False - ctxt.globalObjects[operatorRepresentation["shape"]]._live = False + # Get buffers + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) - inBuffer = 
ctxt.lookup(operatorRepresentation['data_in']) - outBuffer = ctxt.lookup(operatorRepresentation['data_out']) - outBuffer._alias = inBuffer.name + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # HACK: Tiling wasn't updated in the Fix aliasing PR so we have to still + # set the _alias argument + bufferOut._alias = bufferIn.name return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index c69760df5..e6819f81a 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar @@ -141,6 +141,7 @@ def serializeTilingSolution( operatorRepresentation, addrNames) varWeight = operatorRepresentation['weight'] + varIn = operatorRepresentation["data_in"] varOut = operatorRepresentation['data_out'] inputInCubes = [] @@ -182,9 +183,16 @@ def serializeTilingSolution( (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = cube.dims - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube( + kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightC, + outputCube = cube, + inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape, + ) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple replacements['dim_im_in_x'].append(InCube.dims[1]) @@ -230,6 +238,7 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw # Get to-be-tiled 
tensor's buffers inputBufferName = parseDict['data_in'] weightBufferName = parseDict['weight'] + biasBufferName = parseDict['bias'] outputBufferName = parseDict['data_out'] strides = parseDict["strides"] @@ -237,27 +246,38 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw dilation = parseDict["dilations"] # Add I/O dimensions to the model as variables - for bufferName in [inputBufferName, weightBufferName, outputBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) + for bufferName in [inputBufferName, weightBufferName, biasBufferName, outputBufferName]: + if bufferName != "NULL": + tilerModel.addTensorDimToModel(ctxt, bufferName) + # Handle input dimensions inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + # Handle weight dimensions weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + # Handle bias dimensions + if biasBufferName != "NULL": + biasChannelVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + + # Handle output dimensions outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + # Add constraints to the 
optimization problem of the tiler model # Map output dims to inputs dims tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + if biasBufferName != "NULL": + tilerModel.addConstraint(outputChannelVar == biasChannelVar) # Bias inputBuffer = ctxt.lookup(inputBufferName) @@ -317,9 +337,14 @@ def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, return symbolicParseDict @staticmethod - def computeInputCube(kernelShape: Tuple[int, int], pads: Tuple[int, int, int, int], strides: Tuple[int, int], - inputCSize: int, outputCube: HyperRectangle, - outputDims: Tuple[int, int, int]) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: + def computeInputCube( + kernelShape: Tuple[int, int], + pads: Tuple[int, int, int, int], + strides: Tuple[int, int], + inputCSize: int, + outputCube: HyperRectangle, + outputDims: Tuple[int, int, int], + inputDims: Optional[Tuple[int, int, int]] = None) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: (outputBatchOffset, outputHOffset, outputWOffset, outputCOffset) = outputCube.offset (outputBatchSize, outputHSize, outputWSize, outputCSize) = outputCube.dims @@ -338,8 +363,19 @@ def computeInputCube(kernelShape: Tuple[int, int], pads: Tuple[int, int, int, in inputHOffset = max(outputHOffset * strideH - padTop, 0) inputWOffset = max(outputWOffset * strideW - padLeft, 0) - inputHSize = outputHSize * strideH + (kernelShape[0] - 1) - (tilePadTop + tilePadBottom) - inputWSize = outputWSize * strideW + (kernelShape[1] - 1) - (tilePadLeft + tilePadRight) + if inputDims is not None: + # Compute input dimensions according to procedure described in PyTorch's Conv2D documentation + # Assuming worst case (cutting of (stride - 1) elements at the end of each dimension) + inputHSize = outputHSize * strideH + kernelShape[0] - (tilePadTop + tilePadBottom) - 1 + inputWSize = outputWSize * strideW + kernelShape[1] - (tilePadLeft + tilePadRight) - 1 
+ + # Mitigating all situations other than the worst case assumed earlier + inputHSize = min(inputHSize, inputDims[1]) + inputWSize = min(inputWSize, inputDims[2]) + else: + # Use previous version, compatible with RQ layers + inputHSize = outputHSize * strideH + (kernelShape[0] - 1) - (tilePadTop + tilePadBottom) + inputWSize = outputWSize * strideW + (kernelShape[1] - 1) - (tilePadLeft + tilePadRight) InCube = HyperRectangle((outputBatchOffset, inputHOffset, inputWOffset, 0), (outputBatchSize, inputHSize, inputWSize, inputCSize)) @@ -351,17 +387,34 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['data_in', 'weight', 'data_out'] - inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, - operatorRepresentation, addrNames) + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + # Extract required component information from operator representation varWeight = operatorRepresentation['weight'] + varBias = operatorRepresentation['bias'] + varIn = operatorRepresentation["data_in"] varOut = operatorRepresentation['data_out'] + # Prepare address names, also handling bias + if varBias != "NULL": + addrNames = ['data_in', 'weight', 'bias', 'data_out'] + else: + addrNames = ['data_in', 'weight', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare cube lists for components inputInCubes = [] inputWeightCubes = [] + inputBiasCubes = [] + + # 
Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function replacements: Dict[str, List[int]] = { "dim_im_in_x": [], "dim_im_in_y": [], @@ -386,23 +439,36 @@ def serializeTilingSolution( "padding_x_right": PointerClass(uint8_t) } + # Obtain weight dimensions weightH = ctxt.lookup(varWeight).shape[1] weightW = ctxt.lookup(varWeight).shape[2] weightC = ctxt.lookup(varWeight).shape[3] + # Obtain padding and striding information pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] + # Iterate throught the cubes in which the output will be split for tiling for cube in outputCubes: + # Obtain current cube offsets and dimensions (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = cube.dims - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) - + # Compute input cube + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube( + kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightC, + outputCube = cube, + inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape, + ) + + # Extract individual padding padding_left, padding_right, padding_top, padding_bottom = padding_tuple + # Add element information for the operator representation replacements['dim_im_in_x'].append(InCube.dims[1]) replacements['dim_im_in_y'].append(InCube.dims[2]) replacements['dim_im_out_x'].append(HSize) @@ -414,21 +480,37 @@ def serializeTilingSolution( replacements['padding_x_left'].append(padding_left) replacements['padding_x_right'].append(padding_right) + # Add input cube with tiling information to the corresponding list inputInCubes.append(InCube) + # Obtain and add weight cube with tiling information to the corresponding list WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, 
weightC)) - inputWeightCubes.append(WeightCube) + # Obtain and add bias cube with tiling information to the corresponding list, + # if bias exists + if varBias != "NULL": + BiasCube = HyperRectangle((COffset,), (CSize,)) + inputBiasCubes.append(BiasCube) + + # Prepare loading schedule lists inputLoadSchedule = [] outputLoadSchedule = [] - for a, b in zip(inputInCubes, inputWeightCubes): - inputLoadSchedule.append({"data_in": a, "weight": b}) + # Create input schedule lists, with bias handling + if varBias == "NULL": + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + else: + for a, b, c in zip(inputInCubes, inputWeightCubes, inputBiasCubes): + inputLoadSchedule.append({"data_in": a, "weight": b, "bias": c}) + # Create output schedule list for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py index 8d54eea43..71c9fec25 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py @@ -17,7 +17,7 @@ VariableReplacementScheme -class DWConv2DTileConstraint(TileConstraint): +class RQDWConv2DTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -233,3 +233,332 @@ def serializeTilingSolution( variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, 
tilingSchedule + + +class DWConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + ''' + This function adds geometrical constraints for a PULP Im2Col 2D DW Convolution Tilling. + ''' + + # ===== GET NECESSARY INFORMATION ===== + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + weightBufferName = parseDict['weight'] + biasBufferName = parseDict['bias'] + + im2colBufferName = parseDict['ctxtBuffer'] + + # Get other information + has_bias = False if parseDict['has_bias'] == "false" else True + + pads = parseDict['pads'] + strides = parseDict['strides'] + dilations = parseDict['dilations'] + group = parseDict['group'] + n_cores = parseDict['n_cores'] + + im2col_buffer_size = ctxt.lookup(im2colBufferName).size + weight_type_width = ctxt.lookup(weightBufferName)._type.typeWidth // 8 + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== + buffersOfInterest = [inputBufferName, outputBufferName, weightBufferName] + if has_bias: + buffersOfInterest.append(biasBufferName) + + for bufferName in buffersOfInterest: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # Input + # NHWC layout + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Output + # NHWC layout + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = 
tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Weight + # C_out - C_in - H - W layout (depthwise convolution weights, + # with c_in used for grouping different than number of channels) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + # Bias (C_out) + if has_bias: + biasDimVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + + # ===== ADD CONSTRAINTS ===== + # Add constraint for batch size match between input and output + tilerModel.addConstraint(inputBatchVar == outputBatchVar) + + # Add constraint for input width and height sizes match + # (Depends on output height and width, kernel size, padding, dilations, and strides. + # For more information on the connections, see ONNX and/or Torch Conv2D documentation). 
+ tilerModel.addConstraint(outputHeightVar == (((inputHeightVar + pads[0] + pads[2] - dilations[0] * + (weightHeightVar - 1) - 1) // strides[0]) + 1)) + tilerModel.addConstraint(outputWidthVar == (((inputWidthVar + pads[1] + pads[3] - dilations[1] * + (weightWidthVar - 1) - 1) // strides[1]) + 1)) + + # Add constraint for input channel size match + # (Depends on weight output channel and conv grouping) + tilerModel.addConstraint(inputChannelVar == (weightInChannelVar * group)) + + # Add constraint for weight output channels to match + # output number of channels + tilerModel.addConstraint(weightOutChannelVar == outputChannelVar) + + # Add constraint for bias size to match number of output channels + if has_bias: + tilerModel.addConstraint(biasDimVar == outputChannelVar) + + # Add constraint for size of im2col buffer to be equal to + # number of cores * width of weight data type * size of a single convolutional filter + tilerModel.addConstraint(im2col_buffer_size == (n_cores * weight_type_width * weightHeightVar * weightWidthVar)) + + # Add constraint for relationship between in and out number of channels + tilerModel.addConstraint((outputChannelVar % inputChannelVar) == 0) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # ===== GET NECESSARY INFORMATION ===== + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + weightBufferName = parseDict['weight'] + biasBufferName = parseDict['bias'] + + # Get other information + has_bias = False if parseDict['has_bias'] == "false" else True + + pads = parseDict['pads'] + strides = parseDict['strides'] + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== + buffersOfInterest = [inputBufferName, outputBufferName, weightBufferName] + if has_bias: + buffersOfInterest.append(biasBufferName) + + for bufferName in buffersOfInterest: + tilerModel.addTensorDimToModel(ctxt, 
bufferName) + + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # Input + # NHWC layout + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Output + # NHWC layout + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + + # Weight + # C_out - C_in - H - W layout (depthwise convolution weights, + # with c_in used for grouping different than number of channels) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + # Bias (C_out) + if has_bias: + biasDimVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + + # ===== ADD CONSTRAINTS ===== + # Workaround tiling issue with non-wordaligned accesses + if "L3" in ctxt.lookup(parseDict['data_in'])._memoryLevel: + tilerModel.addTileSizeDivisibleConstraint(parseDict, 'ch_im_in', inputChannelVar, 4) + + # Check that height and width of weights match the parsed values + tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) + tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightOutChannelVar == parseDict['ch_im_out']) + + # Check bias dimension + if biasBufferName != "NULL": + tilerModel.addConstraint(biasDimVar == parseDict["ch_im_out"]) + + # Constrain the minimum tile size such that at least one kernel can be applied + # Account for padding + tilerModel.addConstraint(outputHeightVar >= 1 + max([pads[0], pads[2]])) + tilerModel.addConstraint(outputWidthVar >= 1 + max([pads[1], pads[3]])) + 
+ tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x'] + pads[0], strategy = PerformanceHint(1)) + tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y'] + pads[1], strategy = PerformanceHint(1)) + + tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) + tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + symbolicParseDict = parseDict.copy() + symbolicParseDict['ch_im_in'] = tilerModel.getTensorDimVar(inputBuffer.name, 3) + symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 2) + symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 3) + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Extract required component information from operator representation + varIn = operatorRepresentation['data_in'] + varWeight = operatorRepresentation['weight'] + varBias = operatorRepresentation['bias'] + varOut = operatorRepresentation['data_out'] + + # Prepare address names, also handling bias + if varBias != "NULL": + addrNames = ['data_in', 'weight', 'bias', 'data_out'] + else: + addrNames = ['data_in', 'weight', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = 
cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare cube lists for components + inputInCubes = [] + inputWeightCubes = [] + inputBiasCubes = [] + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_out": [], + "ch_im_in": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [] + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + "ch_im_in": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t) + } + + # Obtain weight dimensions + # C_out - C_in - H - W layout (depthwise convolution weights, + # with c_in used for grouping different than number of channels) + weightC_in = ctxt.lookup(varWeight).shape[1] + weightH = ctxt.lookup(varWeight).shape[2] + weightW = ctxt.lookup(varWeight).shape[3] + + # Obtain padding and striding information + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + group = operatorRepresentation['group'] + + # Iterate through the cubes in which the output will be split for tiling + for cube in outputCubes: + # Obtain current cube offsets and dimensions + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + # Compute input cube + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube(kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightC_in * group, + outputCube = cube, + 
inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape) + + # Extract individual padding + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + # Extract InCube hyperrectangle + InCube = HyperRectangle((InCube.offset[0], InCube.offset[1], InCube.offset[2], InCube.offset[3]), + (InCube.dims[0], InCube.dims[1], InCube.dims[2], InCube.dims[3])) + + # Prepare weight cube + WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, InCube.dims[3] // group, weightH, weightW)) + + # Add element information for the operator representation + replacements['dim_im_in_x'].append(InCube.dims[1]) + replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) + replacements['dim_im_out_y'].append(WSize) + replacements['ch_im_out'].append(CSize) + replacements['ch_im_in'].append(InCube.dims[3]) + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + # Add computed cubes to the respective lists + inputInCubes.append(InCube) + inputWeightCubes.append(WeightCube) + + # Obtain and add bias cube with tiling information to the corresponding list, + # if bias exists + if varBias != "NULL": + BiasCube = HyperRectangle((COffset,), (CSize,)) + inputBiasCubes.append(BiasCube) + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Create input schedule lists, with bias handling + if varBias == "NULL": + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + else: + for a, b, c in zip(inputInCubes, inputWeightCubes, inputBiasCubes): + inputLoadSchedule.append({"data_in": a, "weight": b, "bias": c}) + + # Create output schedule list + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + # Prepare containing objects with information 
computed in this function regarding tiling schedule + # and variable replacement inside operator representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py index 8b795be88..db38b841a 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py @@ -19,60 +19,74 @@ class MatMulTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's buffers + # ===== GET NECESSARY INFORMATION ===== bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) outputBuffer = ctxt.lookup(name = parseDict['data_out']) - # Add I/O dimensions to the model as variables + tensorsShapeLenA = len(bufferA.shape) + tensorsShapeLenB = len(bufferB.shape) + tensorsShapeLenOutput = len(outputBuffer.shape) + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== for _buffer in [bufferA, bufferB, outputBuffer]: tilerModel.addTensorDimToModel(ctxt, _buffer.name) - tensorsShapeLen = len(bufferA.shape) - - AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) - ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) - BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) - outputFirstDimVar 
= tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) - outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) - - # Map output dims to inputs dims - for idx in range(tensorsShapeLen - 2): - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferA.name, dimIdx = idx)) - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferB.name, dimIdx = idx)) - - tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) - tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) - - # Add GEMM Geometrical constraints - tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # *Checks on whether dimensions are reversed via the transA and transB flags + # A dims + AMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 2) + parseDict['transA']) + AMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 1) - parseDict['transA']) + + # B dims + BMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 2) + parseDict['transB']) + BMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 1) - parseDict['transB']) + + # Output dims + outputMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 2)) + outputMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 1)) + + # ===== ADD CONSTRAINTS ===== + # Add batch constraints + if (bufferA.shape[:-2] == bufferB.shape[:-2]): + for idx in range(tensorsShapeLenA - 2): + 
tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = tensorsShapeLenA - 3 - idx)) + + for idx in range(tensorsShapeLenB - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = tensorsShapeLenB - 3 - idx)) + + # Add GEMM geometrical constraints + tilerModel.addConstraint(outputMatrixFirstDimVar == AMatrixFirstDimVar) + tilerModel.addConstraint(outputMatrixSecondDimVar == BMatrixSecondDimVar) + + tilerModel.addConstraint(AMatrixSecondDimVar == BMatrixFirstDimVar) return tilerModel @staticmethod def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Get input buffers and other required information bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) tensorsShapeLen = len(bufferA.shape) + # Get dimensions of interest from the 2 inputs ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) # VIC: We don't want to deal with intermediate results between kernel calls tilerModel.addConstraint(ASecondDimVar == parseDict['N']) @@ -85,28 +99,39 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # Get output cubes outputCubes = [cube.rectangle for cube in 
absoluteOutputCubes] + # Get names, optimizer variables, buffers, and other information for elements of interest addrNames = ['A', 'B', 'data_out'] inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) buffA = ctxt.lookup(operatorRepresentation['A']) buffB = ctxt.lookup(operatorRepresentation['B']) + buffOut = ctxt.lookup(operatorRepresentation['data_out']) + + tensorsShapeLenA = len(buffA.shape) + tensorsShapeLenB = len(buffB.shape) + tensorsShapeOutput = len(buffOut.shape) NSize = buffA.shape[-1] NOffset = 0 + # Prepare input cubes lists inputACubes = [] inputBCubes = [] + # Prepare replacements lists replacements = {"M": [], "O": [], "batch": []} # Every output is constructed by a pair of inputs. Reconstruct this pair. for cube in outputCubes: + # Get output dimensions MOffset, OOffset = cube.offset[-2:] MSize, OSize = cube.dims[-2:] + # Check that batch tiling is set up properly if len(cube.offset) > 2: BatchSize = math.prod(cube.dims[:-2]) @@ -117,35 +142,60 @@ def serializeTilingSolution( else: BatchSize = 1 + # Prepare cube dimensions replacements replacements["M"].append(MSize) replacements["O"].append(OSize) replacements["batch"].append(BatchSize) + # Compute A cube information + # Matrix offsets and shape AMatrixOffsets = (MOffset, NOffset) AMatrixShape = (MSize, NSize) - if len(buffA.shape) > 2: - batchDimCount = len(buffA.shape) - 2 - AMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + AMatrixOffsets - AMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + AMatrixShape - - ACube = HyperRectangle(AMatrixOffsets, AMatrixShape) + # Batch offset and shape (with broadcasting handling) + ABatchOffsets = list() + ABatchShape = list() + + for idx in range(tensorsShapeLenA - 2): + if buffA.shape[tensorsShapeLenA - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + ABatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + ABatchShape.append(cube.dims[len(cube.dims) - 3 - 
idx]) + else: + ABatchOffsets.append(0) + ABatchShape.append(1) + + ACube = HyperRectangle( + tuple(reversed(ABatchOffsets)) + tuple(AMatrixOffsets), + tuple(reversed(ABatchShape)) + tuple(AMatrixShape)) inputACubes.append(ACube) + # Compute B cube information + # Matrix offsets and shape BMatrixOffsets = (NOffset, OOffset) BMatrixShape = (NSize, OSize) - if len(buffB.shape) > 2: - batchDimCount = len(buffB.shape) - 2 - BMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + BMatrixOffsets - BMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + BMatrixShape - - BCube = HyperRectangle(BMatrixOffsets, BMatrixShape) + # Batch offset and shape (with broadcasting handling) + BBatchOffsets = list() + BBatchShape = list() + + for idx in range(tensorsShapeLenB - 2): + if buffB.shape[tensorsShapeLenB - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + BBatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + BBatchShape.append(cube.dims[len(cube.dims) - 3 - idx]) + else: + BBatchOffsets.append(0) + BBatchShape.append(1) + + BCube = HyperRectangle( + tuple(reversed(BBatchOffsets)) + tuple(BMatrixOffsets), + tuple(reversed(BBatchShape)) + tuple(BMatrixShape)) inputBCubes.append(BCube) + # Prepare load schedule lists for computed cubes inputLoadSchedule = [] outputLoadSchedule = [] + # Prepare replacements replacements["N"] = [NSize] * len(outputCubes) replacementTypes = { @@ -155,12 +205,14 @@ def serializeTilingSolution( "batch": PointerClass(int8_t) } + # Update load schedule lists for a, b in zip(inputACubes, inputBCubes): inputLoadSchedule.append({"A": a, "B": b}) for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + # Prepare tiling schedule object schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py 
b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py new file mode 100644 index 000000000..9ac444fc1 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +import numpy as np +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ReduceMeanTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Get necessary information + # Get I/O buffer names + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + # Get I/O shapes + outputShape = parseDict['data_out_shape'] + + # Get other necessary information + reduceAxes = parseDict['axes'] + keepDims = parseDict['keepdims'] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # Add constraints for the I/O dimensions + input_ax = 0 + for idx in range(len(outputShape)): + # Get current dimension variables + outputDimensionVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) + + if idx in reduceAxes: + # For reduced axes, constrain to 1 if keepdims is set, + # otherwise skip this axis in the input tensor, + # as it 
needs to be eliminated. + if keepDims: + tilerModel.addConstraint(outputDimensionVar == 1) + input_ax += 1 + else: + # Otherwise, input and output dimensions need to be equal + inputDimensionVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = input_ax) + + tilerModel.addConstraint(outputDimensionVar == inputDimensionVar) + + input_ax += 1 + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @staticmethod + def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict: Dict) -> HyperRectangle: + # Get required parameters + originalInputShape = parseDict['data_in_shape'] + keepDims = parseDict['keepdims'] + + # Start from the output cube dimensions and offsets + in_cube_dims = list(originalInputShape).copy() + in_cube_offset = [ + 0, + ] * len(in_cube_dims) + + # Iterate through input axes + out_idx = 0 + for ax in range(len(in_cube_dims)): + if ax in parseDict['axes']: + # This axis is reduced + if keepDims: + # Keepdims is set, so the output cube has a dimension here (which will be 1, as it's the reduction result) + out_idx += 1 + else: + # This axis is not reduced, so copy from output cube + in_cube_dims[ax] = outputCube.dims[out_idx] + in_cube_offset[ax] = outputCube.offset[out_idx] + out_idx += 1 + + return HyperRectangle(offset = tuple(in_cube_offset), dims = tuple(in_cube_dims)) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + # Prepare address names + 
addrNames = ['data_in', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function + replacements: Dict[str, List[int]] = { + "data_in_shape": [], + "data_out_shape": [], + "size": [], + } + + replacementTypes = { + "data_in_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "data_out_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "size": PointerClass(uint16_t), + } + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Iterate over output cubes to compute corresponding input cubes + for out_cube in [cube.rectangle for cube in absoluteOutputCubes]: + # Compute input cube + in_cube = ReduceMeanTileConstraint.computeInputCubeFromOutputCube(out_cube, + parseDict = operatorRepresentation) + + # Append replacement elements + replacements["data_in_shape"].append(list(in_cube.dims).copy()) + replacements["data_out_shape"].append(list(out_cube.dims).copy()) + replacements["size"].append(int(np.prod(out_cube.dims))) + + # Append new cubes + inputLoadSchedule.append({"data_in": in_cube}) + outputLoadSchedule.append({"data_out": out_cube}) + + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff 
--git a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py new file mode 100644 index 000000000..623aa9a71 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +import numpy as np +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class SliceTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get necessary information + # Get I/O buffer names + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + # Get I/O shapes + inputShape = parseDict['data_in_shape'] + + # Get other necessary information + sliceAxes = parseDict['axes'] + sliceSteps = parseDict['steps'] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # Add constraints for the I/O dimensions + for idx in range(len(inputShape)): + # Get current dimension variables + inputDimensionVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx) + outputDimensionVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) + + if idx in 
sliceAxes: + # For sliced axes, constrain to minimal input dimension + # based on the output dimension and the slicing step + axIndex = list(sliceAxes).index(idx) + axStep = sliceSteps[axIndex] + + tilerModel.addConstraint(inputDimensionVar == ((outputDimensionVar - 1) * axStep + 1)) + else: + # Otherwise, input and output dimensions need to be equal + tilerModel.addConstraint(outputDimensionVar == inputDimensionVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @staticmethod + def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict: Dict) -> HyperRectangle: + # Computes the input cube given the output cube and the slicing parameters. + # + # Will provide a minimal input cube, that only requires the data needed for the output cube + # by ignoring the input data that is outside of the slicing scope, + # as given by the slicing starting and ending parameters. + # + # (It will start with the first element required for the output cube, + # and will end with the last element required for the output cube). + # + # *Function is ready for multiple axes slicing. 
+ + # Start from the output cube dimensions and offsets + in_cube_dims = list(outputCube.dims).copy() + in_cube_offset = list(outputCube.offset).copy() + + # Iterate through the sliced axes + for idx, ax in enumerate(parseDict['axes']): + # Get current sliced ax parameters + start = parseDict['starts'][idx] + step = parseDict['steps'][idx] + + # Compute input cube parameters for the current axis + in_cube_dims[ax] = (outputCube.dims[ax] - 1) * step + 1 + in_cube_offset[ax] = start + outputCube.offset[ax] * step + + return HyperRectangle(offset = tuple(in_cube_offset), dims = tuple(in_cube_dims)) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Prepare address names + addrNames = ['data_in', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function + replacements = { + "data_in_shape": [], + "data_out_shape": [], + "starts": [[ + 0, + ] * len(operatorRepresentation['axes'])] * len(outputCubes), + "ends": [], + "data_in_size": [], + } + + replacementTypes = { + "data_in_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "data_out_shape": [ + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t), + PointerClass(uint16_t) + ], + "starts": 
PointerClass(uint16_t), + "ends": PointerClass(uint16_t), + "data_in_size": PointerClass(uint16_t), + } + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out_cube in outputCubes: + # Compute input cube + in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, parseDict = operatorRepresentation) + + # Compute new ends for replacement + new_ends = list() + for ax in operatorRepresentation['axes']: + new_ends.append(in_cube.offset[ax] + in_cube.dims[ax]) + + # Append replacement elements + replacements["data_in_shape"].append(list(in_cube.dims).copy()) + replacements["data_out_shape"].append(list(out_cube.dims).copy()) + replacements["ends"].append(new_ends) + replacements["data_in_size"].append(int(np.prod(in_cube.dims))) + + # Append new cubes + inputLoadSchedule.append({"data_in": in_cube}) + outputLoadSchedule.append({"data_out": out_cube}) + + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index a6dbaa4e8..6de8ca300 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -16,23 +16,26 @@ from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ - PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \ - PULPiRQSGELUBindings, 
PULPLayernormBinding, PULPMatMulBindings, PULPMaxPool2DBindings, PULPMulBindings, \ - PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, \ - PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \ - PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSoftmaxBindings, \ - PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \ - PULPTransposeBindings, PULPUniformRQSBindings + PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, \ + PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, PULPMatMulBindings, \ + PULPMaxPool2DBindings, PULPMulBindings, PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, \ + PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, \ + PULPRQSGEMMBindings, PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, \ + PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, \ + PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ + RQDWConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import iSoftmaxTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint 
import LayernormTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ReduceMeanConstraint import ReduceMeanTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -41,11 +44,14 @@ tileConstraint = RQConv2DTileConstraint()) PULPRQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSDWConv2DBindings, - tileConstraint = DWConv2DTileConstraint()) + tileConstraint = RQDWConv2DTileConstraint()) PULPConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings, tileConstraint = Conv2DTileConstraint()) +PULPDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConv2DBindings, + tileConstraint = DWConv2DTileConstraint()) + PULPRQSGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSGEMMBindings, tileConstraint = GEMMTileConstraint()) @@ -130,4 +136,10 @@ tileConstraint = UntiledTileConstraint()) PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, - tileConstraint = SGDTileConstraint()) \ No newline at end of file + tileConstraint = SGDTileConstraint()) + +PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, + tileConstraint = SliceTileConstraint()) + +PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = 
PULPReduceMeanBindings, + tileConstraint = ReduceMeanTileConstraint()) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index bdae0fbdc..27ca222e4 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -435,7 +435,8 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo if not isinstance(ctxt.lookup(tensorName), TransientBuffer): - tensorShapeLen = len(ctxt.lookup(tensorName).shape) + tensorShapeLen = 1 if isinstance(ctxt.lookup(tensorName).shape, int) else len( + ctxt.lookup(tensorName).shape) newShape: List[int] = [] if isinstance(memoryConstraint.size, int): @@ -446,7 +447,7 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo newShape.append( self.tilerModel._resolveVariable(tilerModel.getTensorDimVar(tensorName, i, copyIdx))) - newMemoryConstraint.shape = tuple(newShape) + newMemoryConstraint.shape = (newShape,) if isinstance(newShape, int) else tuple(newShape) solvedTensorConstraint.addMemoryConstraint(newMemoryConstraint) diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py index 80f0191d7..db83974f0 100644 --- a/Deeploy/TilingExtension/TilerModel.py +++ b/Deeploy/TilingExtension/TilerModel.py @@ -147,7 +147,9 @@ def addTensorDimToModel(self, ctxt: NetworkContext, tensorName: str, copyIdx: Op ''' tensor = ctxt.lookup(tensorName) - for idx, dim in enumerate(tensor.shape): + for idx, dim in enumerate([ + tensor.shape, + ] if isinstance(tensor.shape, int) else tensor.shape): varName = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) @@ -170,7 +172,9 @@ def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyId tensorDimProductExpr = 1 - for idx, _ in enumerate(tensor.shape): + for idx, _ in enumerate([ + tensor.shape, + ] if isinstance(tensor.shape, int) else tensor.shape): varNameIdx = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) 
tensorDimProductExpr *= self._variables[varNameIdx] diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 604ba23c9..da27365c7 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -165,7 +165,16 @@ def minimizeVariableReplacement( newRepTypes = {} for key, value in scheme.perTileReplacements.items(): - if len(set(value)) > 1: + more_than_one_unique_item = False + items_checked = list() + for item in value: + if item not in items_checked: + items_checked.append(item) + if len(items_checked) > 1: + more_than_one_unique_item = True + break + + if more_than_one_unique_item: newPerTileRep[key] = scheme.perTileReplacements[key] newRepTypes[key] = scheme.replacementTypes[key] else: diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz index a98a6c33b..36567a96c 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx index ae1b3ac93..5eb3ae446 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz index a5d4b6e97..0e2e55fcf 100644 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz and b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz differ diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index 24f0638f2..cf8acf05d 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -20,7 +20,7 @@ 
from Deeploy.DeeployTypes import _NoVerbosity from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.Targets.CortexM.Platform import CMSISPlatform -from Deeploy.Targets.PULPOpen.Platform import PULPPlatform +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPPlatform def generateNetwork(args): @@ -84,6 +84,10 @@ def generateNetwork(args): platform, signProp = mapPlatform(args.platform) + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + inputTypes = {} inputOffsets = {} @@ -183,6 +187,13 @@ def generateNetwork(args): 'If not specified, offsets are set to 0. ' 'Example: --input-offset-map input_0=0 input_1=128 ...') parser.add_argument('--shouldFail', action = 'store_true') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1.", + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 013f854da..4b1ebef20 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -26,6 +26,7 @@ from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ AnnotateIOMemoryLevel, AnnotateNeurekaWeightMemoryLevel +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper @@ -76,6 +77,10 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.enableStrides: platform.engines[0].enableStrides = True + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + for index, num in 
enumerate(test_inputs): _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type @@ -195,6 +200,13 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg parser.add_argument('--plotMemAlloc', action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1." + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 7d1f7f312..a3329ebf7 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -342,6 +342,10 @@ def generate_test(self): generation_script = "generateNetwork.py" command = f"python {generation_script} -d {self._dir_gen} -t {self._dir_test} -p {self._platform} {self.gen_args}" + + if self._platform in ["Siracusa", "Siracusa_w_neureka"]: + command += f" --cores={self._args.cores}" + command += self._argument_parser.generate_cmd_args() log.debug(f"[TestRunner] Generation Command: {command}") diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index f6e8308c9..4da9e2abd 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -32,6 +32,7 @@ #include "kernel/RequantShift.h" #include "kernel/Softmax.h" #include "kernel/UniformRequantShift.h" +#include "kernel/gemm.h" #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index f5382a339..3ebab54a0 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -9,20 +9,30 @@ #include "DeeployPULPMath.h" -void 
PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right); +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer); + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); #endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c 
b/TargetLibraries/PULPOpen/src/Convolution_fp32.c index c33ac31e8..af2129323 100644 --- a/TargetLibraries/PULPOpen/src/Convolution_fp32.c +++ b/TargetLibraries/PULPOpen/src/Convolution_fp32.c @@ -7,18 +7,19 @@ #include "DeeployPULPMath.h" #include "pmsis.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right) { +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) { + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -29,37 +30,72 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; - for (uint32_t h = 0; h < H_out; ++h) { - for (uint32_t w = 0; w < W_out; ++w) { - for (uint32_t f = 0; f < ch_out_count; ++f) { - float32_t sum = 0.0f; + // Compute the output + if (has_bias) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; - for 
(uint32_t p = 0; p < P; ++p) { - for (uint32_t q = 0; q < Q; ++q) { - for (uint32_t c = 0; c < C; ++c) { - int32_t h_in = h * SP + p - pad_top; - int32_t w_in = w * SQ + q - pad_left; + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; - if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || - w_in >= (int32_t)W) { - continue; - } + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } - uint32_t input_idx = (h_in * W + w_in) * C + c; - uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; - sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } } } + + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum + pSrcBias[f + ch_out_start]; } + } + } + } else { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; + + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; + + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } + + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } + } + } - uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); - pDstC[output_idx] = sum; + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum; + } } } } @@ -68,12 +104,17 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, void 
PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer) { + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -84,50 +125,95 @@ void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; uint32_t im2col_size_per_core = C * P * Q; float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; uint32_t kernel_size = P * Q * C; - for (uint32_t h_out = 0; h_out < H_out; h_out++) { - for (uint32_t w_out = 0; w_out < W_out; w_out++) { - int32_t h_in_start = h_out * SP - pad_top; - int32_t w_in_start = w_out * SQ - pad_left; + // Compute the output + if (has_bias) { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c 
= 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } + } + } + } + + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t p = 0; p < P; p++) { - int32_t h_in = h_in_start + p; + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - for (uint32_t q = 0; q < Q; q++) { - int32_t w_in = w_in_start + q; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; - for (uint32_t c = 0; c < C; c++) { - if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && - w_in < (int32_t)W) { - uint32_t in_idx = (h_in * W + w_in) * C + c; - im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; - } else { - im2col_buffer[p * Q * C + q * C + c] = 0.0f; + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } else { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } } } } - } - for (uint32_t f = 0; f < ch_out_count; f++) { - float32_t sum = 0.0f; - const float32_t *local_weight_ptr = weight_ptr + f * kernel_size; + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - 
ch_out_start) * kernel_size; - for (uint32_t k = 0; k < kernel_size; k++) { - sum += im2col_buffer[k] * local_weight_ptr[k]; - } + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } - uint32_t out_idx = - (h_out * W_out + w_out) * F_total + (ch_out_start + f); - pDstC[out_idx] = sum; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + pDstC[out_idx] = sum; + } } } } -} \ No newline at end of file +} diff --git a/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c new file mode 100644 index 000000000..88f21b9a2 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c @@ -0,0 +1,251 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core information + int8_t core_id = pi_core_id(); + int8_t log2Core = (int8_t)log2(NUM_CORES); + + // Compute the chunk size for each core + // (Splitting work along the output channels) + uint16_t ch_out_chunk = + (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); + uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); + uint16_t ch_out_stop = MIN(ch_out_start + ch_out_chunk, F_total); + uint16_t ch_out_count = ch_out_stop - ch_out_start; + + // If there is no output channel to process, return + // (when F < NUM_CORES and working on a core with id > F) + if (ch_out_count == 0) { + return; + } + + // Move pointer of 
the weights for the current core + const float32_t *weight_ptr = pSrcB + ch_out_start * P * Q; + + // Move pointer of the im2col buffer for the current core + uint32_t im2col_size_per_core = P * Q; + float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + + // Compute the output dimensions + uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + uint32_t kernel_size = P * Q * F_total; + + // Compute the output + if (has_bias) { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored in memory continuously) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in 
< MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // number which corresponds to the "group" parameter in the Conv ONNX + // operator) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } + } else { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored in memory continuously) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for 
(uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // number which corresponds to the "group" parameter in the Conv ONNX + // operator) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + 
w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum; + } + } + } + } + } + + return; +} diff --git a/TargetLibraries/PULPOpen/src/GELU.c b/TargetLibraries/PULPOpen/src/GELU.c index 281d4674d..ef2319e3b 100644 --- a/TargetLibraries/PULPOpen/src/GELU.c +++ b/TargetLibraries/PULPOpen/src/GELU.c @@ -12,23 +12,21 @@ void PULP_GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize) { + // Get core information int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + + // Split into chunks for each core int16_t chunk = (dataSize >> log2Core) + ((dataSize & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, dataSize); int16_t chunk_stop = MIN(chunk_start + chunk, dataSize); - const float32_t sqrt_2_over_pi = 0.7978845608f; // sqrt(2/π) - const float32_t coeff = 0.044715f; + // Compute GELU on the assigned chunk for (uint32_t i = chunk_start; i < chunk_stop; i++) { float32_t x = data_in[i]; - float32_t x_cubed = x * x * x; - float32_t inner = sqrt_2_over_pi * (x + coeff * x_cubed); - - float32_t exp_2z = expf(2.0f * inner); 
- float32_t tanh_val = (exp_2z - 1.0f) / (exp_2z + 1.0f); + float32_t cdf = 0.5f * (1.0f + tanhf((sqrtf(2.0f / (float)M_PI) * + (x + 0.044715f * powf(x, 3.0f))))); - float32_t cdf = 0.5f * (1.0f + tanh_val); data_out[i] = x * cdf; } }