diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 551a1e96..9a98fdb8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -87,15 +87,16 @@ sim-vsim: - vsim-compile script: # Run the simulation - - make vsim-run-batch + - make vsim-run-batch-verify # Check either success or failure for non-zero exit codes - 'if [ -z "${NZ_EXIT_CODE}" ]; then grep "] SUCCESS" transcript || (exit 1); else grep "] FAILED: return code ${NZ_EXIT_CODE}" transcript || (exit 1); fi' # Check for UART output - 'if [ ! -z "${USTR}" ]; then (grep " \[UART\] ${USTR}" transcript); fi' # Check for any fatal errors - 'if grep "Fatal:" transcript; then exit 1; fi' - # Check for any errors (except one for non-zero exit codes) - - 'if [ ! -z "${NZ_EXIT_CODE}" ]; then count=$(grep -c "Error:" transcript); if [ "$count" -ne 1 ]; then exit 1; fi; else if grep -q "Error:" transcript; then exit 1; fi; fi' + # Check for any non-fatal errors. One and only one error is expected with a non-zero exit code. + # Ignore all errors when using a separate verification script. + - 'if [ -z "${VERIFY_PY}" ]; then if [ ! 
-z "${NZ_EXIT_CODE}" ]; then count=$(grep -c "Error:" transcript); if [ "$count" -ne 1 ]; then exit 1; fi; else if grep -q "Error:" transcript; then exit 1; fi; fi; fi' artifacts: paths: - transcript diff --git a/.gitlab/common.yml b/.gitlab/common.yml index 3091b58e..26aaca7a 100644 --- a/.gitlab/common.yml +++ b/.gitlab/common.yml @@ -4,7 +4,7 @@ variables: PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" - PD_COMMIT: "c22baf1fcce6b0cf76b85d2009e146ba3bbb6807" + PD_COMMIT: "245dd3d4a7c33f6052a913691382871bd2c53fc0" SPU_COMMIT: "c2e8815487bd713624d74ef3e3e0465196b6d67f" # Check the cache for bender and python dependencies @@ -66,11 +66,13 @@ variables: - .cache-deps - .init-env script: - - make sn-tests + - make sn-tests DEBUG=ON + - make sn-apps DEBUG=ON - make pb-sn-tests artifacts: paths: - sw/snitch/tests/build/*.elf + - sw/snitch/apps/**/build/*.elf expire_in: 1 day # Compile the cheshire software tests diff --git a/.gitlab/sw-tests.yml b/.gitlab/sw-tests.yml index 5a4bbcb6..dd4b62c0 100644 --- a/.gitlab/sw-tests.yml +++ b/.gitlab/sw-tests.yml @@ -7,6 +7,7 @@ variables: CHS_BUILD_DIR: sw/cheshire/tests SN_BUILD_DIR: sw/snitch/tests/build + SN_ROOT: .deps/snitch_cluster parallel: matrix: - { CHS_BINARY: $CHS_BUILD_DIR/sanity.spm.elf, PRELMODE: 0 } @@ -29,3 +30,8 @@ - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/redmule.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/redmule_quant.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/datamover.elf } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/gemm_2d/build/gemm_2d.elf, VERIFY_PY: $SN_ROOT/sw/blas/gemm/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/fused_concat_linear/build/fused_concat_linear.elf, VERIFY_PY: $SN_ROOT/sw/dnn/fused_concat_linear/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: 
$CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/mha/build/mha.elf, VERIFY_PY: $SN_ROOT/sw/dnn/mha/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/gemm/build/gemm.elf, VERIFY_PY: $SN_ROOT/sw/blas/gemm/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/axpy/build/axpy.elf, VERIFY_PY: $SN_ROOT/sw/blas/axpy/scripts/verify.py, PRELMODE: 3 } diff --git a/Bender.lock b/Bender.lock index c8cba67f..02dd028e 100644 --- a/Bender.lock +++ b/Bender.lock @@ -146,7 +146,7 @@ packages: dependencies: - common_cells common_cells: - revision: bef3d3c5ed0e2cc211e69a6dbd81c4fe3a97025c + revision: e1c09c75775c5f03eb45906d5145dbd2f5bcfb95 version: null source: Git: https://github.com/pulp-platform/common_cells.git @@ -383,7 +383,7 @@ packages: - common_cells - register_interface snitch_cluster: - revision: e4eaa0fb64767bb8f6b7d1f5fa705928171092b2 + revision: ef3ece6c9e119fbfc25b26bb89a429ccdaacb5c6 version: null source: Git: https://github.com/pulp-platform/snitch_cluster.git diff --git a/Bender.yml b/Bender.yml index a2172691..b342a501 100644 --- a/Bender.yml +++ b/Bender.yml @@ -12,7 +12,7 @@ dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: "0.39.6" } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", rev: "snitch" } cheshire: { git: "https://github.com/pulp-platform/cheshire.git", rev: "picobello" } - snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "e4eaa0fb64767bb8f6b7d1f5fa705928171092b2" } + snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "ef3ece6c9e119fbfc25b26bb89a429ccdaacb5c6" } floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "develop" } obi: { git: "https://github.com/pulp-platform/obi.git", rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" } redmule: { git: 
"https://github.com/pulp-platform/redmule.git", rev: "picobello" } diff --git a/Makefile b/Makefile index 29d13c89..0833fff7 100644 --- a/Makefile +++ b/Makefile @@ -218,7 +218,7 @@ python-venv: .venv python -m pip install --upgrade pip setuptools && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path floo_noc) --no-deps && \ - python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path snitch_cluster) + python -m pip install --cache-dir $(PIP_CACHE_DIR) "$(shell $(BENDER) path snitch_cluster)[kernels]" python-venv-clean: rm -rf .venv diff --git a/README.md b/README.md index 7c17363b..a91d7e00 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,14 @@ Use the `vsim-run-batch` command to run tests in batch mode with RTL optimizatio Use the `PRELMODE=3` flag to enable fast preload of the Snitch binary, and speed up the simulation. +Some applications produce a lot of output data, which would be time-consuming to check in simulation. +Said applications usually come with a Python verification script that can check the results from a dump of the memory contents at the end of the simulation. 
+For example, a verification script for the GEMM kernel can be found under `$(bender path snitch_cluster)/sw/blas/gemm/scripts/verify.py` +To run an application on Snitch and verify its results, do: +```bash +make vsim-run-batch-verify VERIFY_PY=$(bender path snitch_cluster)/sw/blas/gemm/scripts/verify.py PRELMODE=3 CHS_BINARY=sw/cheshire/tests/simple_offload.spm.elf SN_BINARY=sw/snitch/apps/blas/gemm/build/gemm.elf +``` + ### Additional help Additionally, you can run the following command to get a list of all available commands: diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 1458fda0..faf4249a 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -16,7 +16,8 @@ module mem_tile #( parameter bit AxiUserAtop = 1'b1, parameter int unsigned AxiUserAtopMsb = 3, - parameter int unsigned AxiUserAtopLsb = 0 + parameter int unsigned AxiUserAtopLsb = 0, + parameter int unsigned MemTileId = 0 ) ( input logic clk_i, input logic rst_ni, @@ -275,6 +276,25 @@ module mem_tile end end +`ifndef SYNTHESIS + // AXI Monitor dumper to improvce debiugging + axi_dumper #( + .BusName ($sformatf("mem_tile_%d", MemTileId)), + .LogAW (1'b1), + .LogAR (1'b1), + .LogW (1'b1), + .LogB (1'b1), + .LogR (1'b1), + .axi_req_t (axi_nw_join_req_t), + .axi_resp_t(axi_nw_join_rsp_t) + ) i_axi_monitor ( + .clk_i, + .rst_ni, + .axi_req_i (axi_req), + .axi_resp_i(axi_rsp) + ); +`endif + axi_to_obi #( .ObiCfg (MgrObiCfg), .obi_req_t (mgr_obi_req_t), diff --git a/hw/picobello_top.sv b/hw/picobello_top.sv index 7a150820..e8d737fe 100644 --- a/hw/picobello_top.sv +++ b/hw/picobello_top.sv @@ -273,7 +273,9 @@ module picobello_top localparam int MemTileX = int'(MemTilePhysicalId.x); localparam int MemTileY = int'(MemTilePhysicalId.y); - mem_tile i_mem_tile ( + mem_tile #( + .MemTileId(int'(m)) + ) i_mem_tile ( .clk_i, .rst_ni, .test_enable_i (test_mode_i), diff --git a/requirements.txt b/requirements.txt index 4892f657..78565385 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,5 +25,5 @@ hjson # for 
reggen # For peakrdl peakrdl -peakrdl-rawheader @ git+https://github.com/micprog/peakrdl-rawheader.git +peakrdl-rawheader @ git+https://github.com/colluca/PeakRDL-rawheader.git@7b8dbc9ad5854dc1cdaf36d4ea024c29ffb00a4c peakrdl-markdown diff --git a/sw/snitch/apps/axpy/app.mk b/sw/snitch/apps/axpy/app.mk new file mode 100644 index 00000000..9e032762 --- /dev/null +++ b/sw/snitch/apps/axpy/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := axpy +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/blas/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/axpy/data/params.json b/sw/snitch/apps/axpy/data/params.json new file mode 100644 index 00000000..eafb7128 --- /dev/null +++ b/sw/snitch/apps/axpy/data/params.json @@ -0,0 +1,9 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "n_tiles": 5, + "n": 2560, + "funcptr": "axpy_opt" +} diff --git a/sw/snitch/apps/fused_concat_linear/app.mk b/sw/snitch/apps/fused_concat_linear/app.mk new file mode 100644 index 00000000..fd0eb531 --- /dev/null +++ b/sw/snitch/apps/fused_concat_linear/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := fused_concat_linear +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/dnn/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/dnn/src $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/fused_concat_linear/data/params.json b/sw/snitch/apps/fused_concat_linear/data/params.json new file mode 100644 index 00000000..935cbd99 --- /dev/null +++ b/sw/snitch/apps/fused_concat_linear/data/params.json @@ -0,0 +1,11 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +{ + num_inputs: 2, + input_shape: [16, 16], + output_shape: [16, 16], + dtype: "FP64", + gemm_implementation: "gemm_fp64_opt" +} \ No newline at end of file diff --git a/sw/snitch/apps/gemm/app.mk b/sw/snitch/apps/gemm/app.mk new file mode 100644 index 00000000..1dd86b44 --- /dev/null +++ b/sw/snitch/apps/gemm/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := gemm +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/blas/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/gemm/data/params.json b/sw/snitch/apps/gemm/data/params.json new file mode 100644 index 00000000..524ad42b --- /dev/null +++ b/sw/snitch/apps/gemm/data/params.json @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 64, // number of tiles in M dimension + n_tiles: 1, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 2048, + n: 16, + k: 16, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/sw/snitch/apps/gemm_2d/app.mk b/sw/snitch/apps/gemm_2d/app.mk new file mode 100644 index 00000000..f0155c5b --- /dev/null +++ b/sw/snitch/apps/gemm_2d/app.mk @@ -0,0 +1,17 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Lorenzo Leone + +APP := gemm_2d +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +SRC_DIR := $(PB_SNITCH_SW_DIR)/apps/$(APP)/src +SRCS := $(SRC_DIR)/gemm_2d.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/blas $(SN_ROOT)/sw/blas/gemm/src + +# Refer to Snitch scripts +$(APP)_SCRIPT_DIR := $(SN_ROOT)/sw/blas/gemm/scripts + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/gemm_2d/data/params.json b/sw/snitch/apps/gemm_2d/data/params.json new file mode 100644 index 00000000..afc30631 --- /dev/null +++ b/sw/snitch/apps/gemm_2d/data/params.json @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 16, // number of tiles in M dimension + n_tiles: 4, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 128, + n: 32, + k: 16, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/sw/snitch/apps/gemm_2d/roi.json b/sw/snitch/apps/gemm_2d/roi.json new file mode 100644 index 00000000..4249f886 --- /dev/null +++ b/sw/snitch/apps/gemm_2d/roi.json @@ -0,0 +1,30 @@ +[ + <% N_TILES = 4 %> + + % for cluster in range(0,16): + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{cluster * 9 + j + 1}'}", + "roi": [ + % for i in range(0, N_TILES): + {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"}, + % endfor + ] + }, + % endfor + + // DMA core + { + "thread": "${f'hart_{cluster * 9 + 8 + 1}'}", + "roi": [ + {"idx": 1, "label": "${f'tile_in_0'}"}, + % for i in range(0, N_TILES - 1): + {"idx": ${4*i + 3}, "label": "${f'tile_in_{i+1}'}"}, + {"idx": ${4*i + 5}, "label": 
"${f'tile_out_{i}'}"}, + % endfor + {"idx": ${N_TILES * 4 - 1}, "label": "${f'tile_out_{N_TILES-1}'}"}, + ] + }, + % endfor +] diff --git a/sw/snitch/apps/gemm_2d/src/gemm_2d.c b/sw/snitch/apps/gemm_2d/src/gemm_2d.c new file mode 100644 index 00000000..57dc4797 --- /dev/null +++ b/sw/snitch/apps/gemm_2d/src/gemm_2d.c @@ -0,0 +1,488 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Tim Fischer +// Luca Bertaccini +// Luca Colagrande +// Viviane Potocnik +// Lorenzo leone + +// TODO (lleone): LIMITATIONS +// +// - Works only when M_tile = snrt_cluster_num() +// - Works only if parallelized on M + +#include "snrt.h" +#include +#include + +#include +#include "blas.h" + +// #define HW_MCAST + +// #define JOB_ARGS_PRELOADED + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreorder-init-list" +#include "data.h" +#pragma clang diagnostic pop + +// Allocate data in L2 to better map kernel in NoC system +static inline void allocate_l2_buffers(gemm_args_t *largs ) { + + uint32_t prec = largs->prec; + uint32_t mem_tile_idx = pb_closest_mem_tile(snrt_cluster_idx()); + + uintptr_t a_off = pb_l2_tile_offset((uintptr_t) largs -> a); + uintptr_t c_off = pb_l2_tile_offset((uintptr_t) largs -> c); + uintptr_t a_dst = pb_l2_tile_address(mem_tile_idx) + a_off; + uintptr_t c_dst = pb_l2_tile_address(mem_tile_idx) + c_off; + + // Move data in the correct memory tile location + uint32_t size_a = (size_t)largs->m * (size_t)largs->k; + uint32_t size_c = (size_t)largs->m * (size_t)largs->n; + snrt_dma_start_1d((void *) a_dst, (void *) largs->a, size_a * prec); + snrt_dma_start_1d((void *) c_dst, (void *) largs->c, size_c * prec); + + // Update A and C local pointer to the relocated memory tile address + largs->a = (void *) a_dst; + largs->c = (void *) c_dst; +} + +// Write back C tiles in original memory tile for verification 
purposes +static inline void write_back_c_tiles(gemm_args_t* largs, uint32_t m_tile_size, + uint32_t n_tile_size) { + uintptr_t c_src, c_dst; + uint32_t transfer_size; + int c_m_abs, c_n_abs; + + + // Position of the first element in the Tile to be written back + c_src = (uintptr_t )largs->c + (snrt_cluster_idx() * largs->n * m_tile_size) * largs->prec; + c_dst = pb_l2_tile_address(0) + pb_l2_tile_offset( (uintptr_t) c_src); + transfer_size = m_tile_size * largs->n * largs->prec; + + if (c_src != c_dst) snrt_dma_start_1d((void *) c_dst, (void *) c_src, transfer_size); +} + +/** + * @brief Performs a General Matrix Multiplication (GEMM) operation on a + * Snitch-based multiple-cluster architecture with support for + * parallelization, tiling, and data movement optimizations. + * + * @param args Pointer to a `gemm_args_t` structure containing arguments + * for the GEMM operation. + * + * @details + * The function performs the following steps: + * 1. Copies the input arguments to local memory for faster access. + * 2. Calculates tile sizes based on the input dimensions and number of tiles. + * 3. Allocates space in TCDM for local copies of matrix tiles, unless + * matrix tiles are already stored in TCDM (see `load_* arguments`). + * 4. Distributes tiles to clusters for parallel processing. + * 5. Iterates over the tiles, performing the following: + * - Copies data for the current tile into local memory. + * - Performs the tile computation using the `sc_st_gemm` function. + * - Performs a logarithmic reduction to combine partial results across + * clusters, if `parallelize_k` is enabled. + * - Writes the result back to global memory. + * + * @note Current implementation assumes that `parallelize_m` and + * `parallelize_k` options are mutually exclusive. 
+ */ +static inline int gemm_picobello(const gemm_args_t *args) { +#ifndef JOB_ARGS_PRELOADED + // Copy the arguments to local memory + gemm_args_t *largs = (gemm_args_t *)snrt_l1_alloc_cluster_local( + sizeof(gemm_args_t), alignof(gemm_args_t)); + if (snrt_is_dm_core()) { + snrt_dma_start_1d((void *)largs, (void *)args, sizeof(gemm_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); +#else + const gemm_args_t *largs = args; +#endif + + // Calculate tile sizes + uint32_t tile_m = largs->m / largs->m_tiles; + uint32_t tile_n = largs->n / largs->n_tiles; + uint32_t tile_k = largs->k / largs->k_tiles; + uint32_t tile_a_size = tile_m * tile_k * largs->prec; + uint32_t tile_b_size = tile_k * tile_n * largs->prec; + uint32_t tile_c_size = tile_m * tile_n * largs->prec; + + // Allocate space for local tile buffers in TCDM, unless preloaded + void *a0, *a1, *b0, *b1, *c0, *c1; + void *la[2], *lb[2], *lc[2], *lcr; + int banks_per_buffer = snrt_cluster_compute_core_num(); + allocate_buffers(tile_a_size, tile_b_size, tile_c_size, largs, + banks_per_buffer, la, lb, lc, &lcr); + if (snrt_cluster_core_idx() == 0) { + DUMP(la[0]); + DUMP(la[1]); + DUMP(lb[0]); + DUMP(lb[1]); + DUMP(lc[0]); + DUMP(lc[1]); + } + snrt_cluster_hw_barrier(); + + // NoC layout (6 columns x 4 rows) + /* + // + |------| |------| |------| |------| |------| |------| + | M3 |---| C3 |---| C7 |---| C11 |---| C15 |---| M7 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M2 |---| C2 |---| C6 |---| C10 |---| C14 |---| M6 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M1 |---| C1 |---| C5 |---| C9 |---| C13 |---| M5 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M0 |---| C0 |---| C4 |---| C8 |---| C12 |---| 
M4 | + |------| |------| |------| |------| |------| |------| + // + */ + + // Use the DMA core of cluster 0 to place all data in the correct positions + // so the problem becomes NoC-optimized. + // + // Case: parallelization over M, with tiling along both M and N. + // - Each cluster processes a set of rows of A: + // [Cluster_idx * Mt : (Cluster_idx + 1) * Mt - 1]. + // - All clusters share the same set of columns of B: + // [num_iter * Nt : (num_iter + 1) * Nt - 1]. + // - Each cluster computes a full tile of C (no partial results - no reduction). + // + // Memory tile mapping: + // - Rows of A for a given cluster are placed in the same row. + // * The first half of the clusters load A from the memory tiles on the left [tile 0 - 3]. + // * The second half of the clusters load A from the memory tiles on the right [tile 4 - 7]. + // - The same scheme is used to store the corresponding tile of C. + // - Matrix B is stored entirely in the first memory tile. + // * Since all clusters need access to B, its exact location does not affect + // performance significantly. + // + // Notes: + // - All data movement to arrange memory tiles is performed before measuring + // kernel execution time. + // - With a proper linker script, data could be placed directly in the correct + // memory tiles without requiring extra DMA work from cluster 0. 
+ + // Distribute m and k tiles to clusters + uint32_t cluster_m_tiles = largs->m_tiles; + uint32_t cluster_k_tiles = largs->k_tiles; + uint32_t num_working_clusters = snrt_cluster_num(); + if (largs->parallelize_m) { + uint32_t m_tiles_quotient = cluster_m_tiles / snrt_cluster_num(); + uint32_t m_tiles_remainder = cluster_m_tiles % snrt_cluster_num(); + cluster_m_tiles = m_tiles_quotient; + if (snrt_cluster_idx() < m_tiles_remainder) cluster_m_tiles++; + if (m_tiles_quotient == 0) num_working_clusters = m_tiles_remainder; + } + if (largs->parallelize_k) cluster_k_tiles /= snrt_cluster_num(); + + snrt_comm_t comm; + snrt_comm_create(num_working_clusters, &comm); + + // Calculate number of iterations + uint32_t num_tiles = cluster_m_tiles * largs->n_tiles * cluster_k_tiles; + uint32_t num_iters = num_tiles; + if (largs->double_buffer) + num_iters += 2; + else + num_iters += 1; + + // Place data in the correct memory tile pre-kernel. + // TODO (lleone): Improve copying only the necessary information and not the full data stack + if (snrt_is_dm_core()) + { + allocate_l2_buffers(largs); + snrt_dma_wait_all(); + } + snrt_global_barrier(comm); + + + + // Iterate over all tiles + for (uint32_t i = 0; i < num_iters; i++) { + // Calculate tile indices (we iterate in k->n->m order) + int dma_in_i = i; + int comp_i = largs->double_buffer ? i - 1 : i; + int dma_out_i = largs->double_buffer ? 
i - 2 : i - 1; + int dma_in_k = dma_in_i % cluster_k_tiles; + int dma_in_mn = dma_in_i / cluster_k_tiles; + int dma_in_n = dma_in_mn % largs->n_tiles; + int dma_in_m = dma_in_mn / largs->n_tiles; + int comp_k = comp_i % cluster_k_tiles; + int comp_mn = comp_i / cluster_k_tiles; + int comp_n = comp_mn % largs->n_tiles; + int comp_m = comp_mn / largs->n_tiles; + int dma_out_k = dma_out_i % cluster_k_tiles; + int dma_out_mn = dma_out_i / cluster_k_tiles; + int dma_out_n = dma_out_mn % largs->n_tiles; + int dma_out_m = dma_out_mn / largs->n_tiles; + + // If m and k tiles are parallelized across clusters, + // calculate the absolute m and k indices for each cluster + int dma_in_m_abs = dma_in_m; + int comp_m_abs = comp_m; + int dma_out_m_abs = dma_out_m; + int dma_in_k_abs = dma_in_k; + int comp_k_abs = comp_k; + int dma_out_k_abs = dma_out_k; + if (largs->parallelize_m) { + dma_in_m_abs += snrt_cluster_idx() * cluster_m_tiles; + comp_m_abs += snrt_cluster_idx() * cluster_m_tiles; + dma_out_m_abs += snrt_cluster_idx() * cluster_m_tiles; + } + if (largs->parallelize_k) { + dma_in_k_abs += snrt_cluster_idx() * cluster_k_tiles; + comp_k_abs += snrt_cluster_idx() * cluster_k_tiles; + dma_out_k_abs += snrt_cluster_idx() * cluster_k_tiles; + } + + // In the first k iteration we accumulate with the C matrix + // scaled by beta, in successive iterations we accumulate + // the previous partial result. The tile-level beta is thus + // a function of k: beta(k). + uint32_t comp_k_beta = comp_k_abs == 0 ? largs->beta : 1; + uint32_t dma_in_k_beta = dma_in_k_abs == 0 ? largs->beta : 1; + + // DMA out phase + if (snrt_is_dm_core()) { + if (dma_out_i >= 0) { + snrt_mcycle(); + // Switch buffers + int buff_idx = largs->double_buffer ? 
dma_out_mn % 2 : 0; + + // Store C + // If parallelize_k, then only cluster 0 must writeback + if ((snrt_cluster_idx() == 0) || !(largs->parallelize_k)) { + if (largs->partition_banks) { + snrt_dma_2d_to_1d( + (void *)((uintptr_t)largs->c + + dma_out_m_abs * tile_c_size), + lc[buff_idx], tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_store_2d_tile(largs->c, lc[buff_idx], + dma_out_m_abs, dma_out_n, tile_m, + tile_n, largs->ldc, largs->prec); + } + snrt_dma_wait_all(); + } + snrt_mcycle(); + } + } + + // DMA in phase + if (snrt_is_dm_core()) { + if (dma_in_i < num_tiles) { + snrt_mcycle(); + // Switch buffers + // A and B buffers are switched every iteration, while the C + // buffer only needs to be switched after fully accumulating + // the result, i.e. after finishing the K loop. + int buff_idx = largs->double_buffer ? dma_in_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? dma_in_mn % 2 : 0; + int load_a = largs->double_buffer ? (dma_in_i < 2) : (dma_in_i < 1); + + // Load A + // TODO (lleone): When tiling on M and parallelizing on M there is no need + // to load At multiple times. + // If you have DOBU, you load twice and then At is available + // in both buffers. This can be done only when Mt is fully parallelizable + // in you system. 
+ if (largs->load_a) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + la[buff_idx], + (void *)((uintptr_t)largs->a + + dma_in_m_abs * tile_a_size), + tile_a_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + if (load_a) { + snrt_dma_load_2d_tile( + la[buff_idx], largs->a, dma_in_m_abs, dma_in_k_abs, + tile_m, tile_k, largs->lda, largs->prec); + } + } + } + + // Load B + if (largs->load_b) { + if (largs->transb) { + snrt_dma_load_2d_tile(lb[buff_idx], largs->b, dma_in_n, + dma_in_k_abs, tile_n, tile_k, + largs->ldb, largs->prec); + } else { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lb[buff_idx], + (void *)((uintptr_t)largs->b + + dma_in_k_abs * tile_b_size), + tile_b_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + // TODO (lleone): Is it really necessary? + if (largs->parallelize_k) { + snrt_dma_load_2d_tile( + lb[buff_idx], largs->b, dma_in_k_abs, dma_in_n, + tile_k, tile_n, largs->ldb, largs->prec); + } else { + // Multicast B to all clusters + #ifdef HW_MCAST + if (snrt_cluster_idx() == 0) { + // Load B from L2 + snrt_dma_load_2d_tile_mcast( + lb[buff_idx], largs->b, dma_in_k_abs, dma_in_n, + tile_k, tile_n, largs->ldb, largs->prec, 0x003C0000); + } + #else + snrt_dma_load_2d_tile( + lb[buff_idx], largs->b, dma_in_k_abs, dma_in_n, + tile_k, tile_n, largs->ldb, largs->prec); + #endif + } + } + } + } + + // Load C + // C tile is loaded only upon the first k iteration, then + // the C array will contain the partial results from the + // previous iteration + if (largs->load_c && dma_in_k_beta != 0) { + if (dma_in_k_abs == 0) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], + (void *)((uintptr_t)largs->c + + dma_in_m_abs * tile_c_size), + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_load_2d_tile(lc[c_buff_idx], largs->c, + dma_in_m_abs, dma_in_n, + tile_m, tile_n, largs->ldc, + 
largs->prec); + } + } else if (dma_in_k == 0) { + // Clusters other than the first need to initialize + // the C array to zero in their first iteration + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], snrt_cluster()->zeromem.mem, + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_start_1d(lc[c_buff_idx], + snrt_cluster()->zeromem.mem, + tile_c_size); + } + } + } + snrt_dma_wait_all(); + snrt_mcycle(); + } + } + + // Additional barrier required when not double buffering + if (!largs->double_buffer) snrt_global_barrier(comm); + + // Compute phase + if (comp_i >= 0 && comp_i < num_tiles) { + // Switch buffers + int buff_idx = largs->double_buffer ? comp_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? comp_mn % 2 : 0; + + // Only compute cores participate in the tile computation + if (!snrt_is_dm_core()) { + // uint32_t start_cycle = snrt_mcycle(); + + // Tile computation + sc_st_gemm_args_t sc_st_args; + sc_st_args.prec = largs->prec; + sc_st_args.setup_ssr = largs->setup_ssr; + sc_st_args.partition_banks = largs->partition_banks; + sc_st_args.transa = largs->transa; + sc_st_args.transb = largs->transb; + sc_st_args.a = la[buff_idx]; + if (largs->transa) { + sc_st_args.lda = tile_m; + } else if (largs->partition_banks) { + sc_st_args.lda = calculate_partitioned_banks_stride( + banks_per_buffer, tile_k, largs->prec); + } else { + sc_st_args.lda = tile_k; + } + sc_st_args.b = lb[buff_idx]; + if (largs->transb) { + sc_st_args.ldb = tile_k; + } else if (largs->partition_banks) { + sc_st_args.ldb = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldb = tile_n; + } + sc_st_args.beta = comp_k_beta; + sc_st_args.c = lc[c_buff_idx]; + if (largs->partition_banks) { + sc_st_args.ldc = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldc = tile_n; + } + sc_st_args.m = tile_m; + sc_st_args.n 
= tile_n; + sc_st_args.k = tile_k; + sc_st_gemm(largs->gemm_fp, &sc_st_args); + + // uint32_t end_cycle = snrt_mcycle(); + } + + // Add the partial result tiles from the various clusters together + // in a logarithmic reduction fashion. + // Note: both compute and DMA cores participate in this step. + if (largs->parallelize_k && (comp_k == (cluster_k_tiles - 1))) { + snrt_global_reduction_dma( + (double *)lcr, (double *)lc[c_buff_idx], tile_m * tile_n); + } + } + + // Synchronize cores after every iteration + snrt_global_barrier(comm); + } + + // Before completing the kernel, each cluster writes back its C tiles in the + // original memory tile. This is necessary only to run teh verify.py script + + if (snrt_is_dm_core() && snrt_cluster_idx() < num_working_clusters) { + write_back_c_tiles(largs, tile_m, tile_n); + } + + return 0; +} + + +int main () { + gemm_picobello(&args); + return 0; +} diff --git a/sw/snitch/apps/mha/app.mk b/sw/snitch/apps/mha/app.mk new file mode 100644 index 00000000..4af8cb3b --- /dev/null +++ b/sw/snitch/apps/mha/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := mha +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/dnn/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/dnn/src $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/mha/data/params.json b/sw/snitch/apps/mha/data/params.json new file mode 100644 index 00000000..2c63ab3d --- /dev/null +++ b/sw/snitch/apps/mha/data/params.json @@ -0,0 +1,14 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +{ + num_heads: 2, + L: 16, + S: 16, + d: 16, + B_r: 16, + B_c: 16, + dtype: "FP32", + baseline: true +} \ No newline at end of file diff --git a/sw/snitch/runtime/src/pb_noc_cfg.h b/sw/snitch/runtime/src/pb_noc_cfg.h new file mode 100644 index 00000000..bc285ba6 --- /dev/null +++ b/sw/snitch/runtime/src/pb_noc_cfg.h @@ -0,0 +1,8 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Lorenzo Leone + +#define PB_CLUSTER_PER_ROW 4 +#define PB_CLUSTER_PER_COL 4 diff --git a/sw/snitch/runtime/src/pb_team.c b/sw/snitch/runtime/src/pb_team.c new file mode 100644 index 00000000..cd280f61 --- /dev/null +++ b/sw/snitch/runtime/src/pb_team.c @@ -0,0 +1,19 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +extern inline uintptr_t pb_l2_tile_address(uint32_t tile_idx); + +extern inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr); + +extern inline uint32_t pb_cluster_row(uint32_t cidx); + +extern inline uint32_t pb_cluster_row(); + +extern inline uint32_t pb_cluster_col(uint32_t cidx); + +extern inline uint32_t pb_cluster_col(); + +extern inline uint32_t pb_closest_mem_tile(uint32_t cidx); + +extern inline uint32_t pb_closest_mem_tile(); diff --git a/sw/snitch/runtime/src/pb_team.h b/sw/snitch/runtime/src/pb_team.h new file mode 100644 index 00000000..1fa8d8d0 --- /dev/null +++ b/sw/snitch/runtime/src/pb_team.h @@ -0,0 +1,95 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Lorenzo Leone + + +/** + * @file + * @brief This file contains functions and macros related to Picobello team + * management. 
+ */ + +/** + * @brief Get start address of a memory tile + * @param tile_idx The memory tile idx in the NoC + * @return Start address of memory tile idx + */ +inline uintptr_t pb_l2_tile_address(uint32_t tile_idx) { + return (uintptr_t) (picobello_addrmap.l2_spm[tile_idx].mem); +} + +/** + * @brief Get the address offset of a data with respect to the memory tile start address + * @param src_addr The data absolute address + * @return Address location offset with respect to the tile start address + */ +inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr) { + return (src_addr - PICOBELLO_ADDRMAP_L2_SPM_0_BASE_ADDR) % + PICOBELLO_ADDRMAP_L2_SPM_0_SIZE; +} + + +/** + * @brief Get the NoC row index + * @param cidx The cluster index + * @return The Row index + */ +inline uint32_t pb_cluster_row(uint32_t cidx) +{ + return cidx % PB_CLUSTER_PER_ROW; +} + +/** + * @brief Get the NoC row index + * This is a convenience overload of pb_cluster_row() + * @return The Row index + */ +inline uint32_t pb_cluster_row() +{ + return pb_cluster_row(snrt_cluster_idx()); +} + + +/** + * @brief Get the NoC column index + * @param cidx The cluster index + * @return The Column index + */ +inline uint32_t pb_cluster_col(uint32_t cidx) +{ + return cidx / PB_CLUSTER_PER_COL; +} + +/** + * @brief Get the NoC column index + * This is a convenience overload of pb_cluster_col() + * @return The Column index + */ +inline uint32_t pb_cluster_col() +{ + return pb_cluster_col(snrt_cluster_idx()); +} + + +/** + * @brief Get the index of the closest memory tile + * @param cidx The cluster index + * @return Index of the closest memory tile to cidx + */ +inline uint32_t pb_closest_mem_tile(uint32_t cidx) { + uint32_t row = pb_cluster_row(cidx); + // e.g. with 4x4 matrix + // first 8 clusters -> left column tiles 0..3 + // clusters >= 8 -> right column tiles 4..7 + return (cidx < (snrt_cluster_num() / 2)) ?
row : (row + PB_CLUSTER_PER_COL); +} + +/** + * @brief Get the index of the closest memory tile + * This is a convenience overload of pb_closest_mem_tile() + */ +inline uint32_t pb_closest_mem_tile() { + return pb_closest_mem_tile(snrt_cluster_idx()); +} diff --git a/sw/snitch/runtime/src/snitch_cluster_memory.c b/sw/snitch/runtime/src/snitch_cluster_memory.c new file mode 120000 index 00000000..adb289b9 --- /dev/null +++ b/sw/snitch/runtime/src/snitch_cluster_memory.c @@ -0,0 +1 @@ +../../../../.deps/snitch_cluster/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.c \ No newline at end of file diff --git a/sw/snitch/runtime/src/snrt.S b/sw/snitch/runtime/src/snrt.S deleted file mode 120000 index 6fb619df..00000000 --- a/sw/snitch/runtime/src/snrt.S +++ /dev/null @@ -1 +0,0 @@ -../../../../.deps/snitch_cluster/target/snitch_cluster/sw/runtime/rtl/src/snrt.S \ No newline at end of file diff --git a/sw/snitch/runtime/src/snrt.S b/sw/snitch/runtime/src/snrt.S new file mode 100644 index 00000000..89fa8995 --- /dev/null +++ b/sw/snitch/runtime/src/snrt.S @@ -0,0 +1,18 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#define SNRT_INIT_INT_REGS +#define SNRT_INIT_FP_REGS +#define SNRT_INIT_GP +#define SNRT_INIT_CORE_INFO +#define SNRT_INIT_CLS +#define SNRT_INIT_STACK +#define SNRT_INIT_TLS +#define SNRT_CRT0_PARK + +#include "pb_raw_addrmap.h" +#define SNRT_TCDM_START_ADDR PICOBELLO_ADDRMAP_CLUSTER_0_TCDM_BASE_ADDR + +#include "snitch_cluster_cfg.h" +#include "start.S" diff --git a/sw/snitch/runtime/src/snrt.h b/sw/snitch/runtime/src/snrt.h index 137a3737..8959c80a 100644 --- a/sw/snitch/runtime/src/snrt.h +++ b/sw/snitch/runtime/src/snrt.h @@ -11,7 +11,9 @@ #include "pb_addrmap.h" #include "snitch_cluster_cfg.h" #include "snitch_cluster_peripheral_addrmap.h" -#include "snitch_cluster_raw_addrmap.h" +#include "pb_raw_addrmap.h" +#include "pb_noc_cfg.h" +#define SNRT_TCDM_START_ADDR PICOBELLO_ADDRMAP_CLUSTER_0_TCDM_BASE_ADDR // TODO: the 40000 stride is hardcoded here, but it would better be // autogenerated by Floogen. At the same time that would be @@ -50,6 +52,7 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t; #include "sync.h" #include "team.h" #include "types.h" +#include "pb_team.h" // Accelerators #include "datamover/archi_datamover.h" diff --git a/sw/sw.mk b/sw/sw.mk index c4008512..49144ecf 100644 --- a/sw/sw.mk +++ b/sw/sw.mk @@ -28,12 +28,24 @@ SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build SNRT_INCDIRS = $(PB_INCDIR) $(PB_GEN_DIR) SNRT_BUILD_APPS = OFF SNRT_MEMORY_LD = $(PB_SNITCH_SW_DIR)/memory.ld +SNRT_HAL_BUILD_DIR = $(PB_SNITCH_SW_DIR)/runtime/build SNRT_HAL_HDRS = $(PB_GEN_DIR)/pb_addrmap.h +SNRT_HAL_HDRS += $(PB_GEN_DIR)/pb_raw_addrmap.h -ifneq (,$(filter chs-bootrom% chs-sw% sn% pb-sn-tests% sw%,$(MAKECMDGOALS))) +SNRT_APPS = $(PB_SNITCH_SW_DIR)/apps/gemm_2d +SNRT_APPS += $(PB_SNITCH_SW_DIR)/apps/gemm +SNRT_APPS += $(PB_SNITCH_SW_DIR)/apps/axpy +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/flashattention_2 +SNRT_APPS += $(PB_SNITCH_SW_DIR)/apps/fused_concat_linear +SNRT_APPS += 
$(PB_SNITCH_SW_DIR)/apps/mha + +ifneq (,$(filter $(PB_SNITCH_SW_DIR)% chs-bootrom% chs-sw% sn% pb-sn-tests% sw%,$(MAKECMDGOALS))) include $(SN_ROOT)/target/snitch_cluster/sw.mk endif +$(PB_GEN_DIR)/pb_raw_addrmap.h: $(PB_RDL_ALL) + $(PEAKRDL) raw-header $< -o $@ $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) --base_name $(notdir $(basename $@)) --format c + # Collect Snitch tests which should be built PB_SNRT_TESTS_DIR = $(PB_SNITCH_SW_DIR)/tests PB_SNRT_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build @@ -67,7 +79,7 @@ PB_LINK_MODE ?= spm # We need to include the address map and snitch cluster includes CHS_SW_INCLUDES += -I$(PB_INCDIR) -CHS_SW_INCLUDES += -I$(SNRT_HAL_HDRS_DIR) +CHS_SW_INCLUDES += -I$(SNRT_HAL_BUILD_DIR) CHS_SW_INCLUDES += -I$(PB_GEN_DIR) # Collect tests, which should be build for all modes, and their .dump targets @@ -95,8 +107,9 @@ chs-sw-tests-clean: # Alias targets to align them with Picobello naming convention sn-tests-clean: sn-clean-tests sn-runtime-clean: sn-clean-runtime +sn-apps-clean: sn-clean-apps .PHONY: sw sw-tests sw-clean sw-tests-clean -sw sw-tests: chs-sw-tests sn-tests pb-sn-tests +sw sw-tests: chs-sw-tests sn-tests pb-sn-tests sn-apps -sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean clean-pb-sn-tests +sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean clean-pb-sn-tests sn-apps-clean diff --git a/target/sim/include/tb_picobello_tasks.svh b/target/sim/include/tb_picobello_tasks.svh index 40db1f36..9158a327 100644 --- a/target/sim/include/tb_picobello_tasks.svh +++ b/target/sim/include/tb_picobello_tasks.svh @@ -12,6 +12,9 @@ import "DPI-C" context function byte read_section(input longint address, inout b import picobello_pkg::*; `include "pb_addrmap.svh" +`include "cheshire/typedef.svh" + +`CHESHIRE_TYPEDEF_ALL(, fix.vip.DutCfg) task automatic jtag_enable_tiles(); $display("Resetting tiles and enabling clock..."); @@ -151,3 +154,128 @@ task automatic fastmode_elf_preload(input string 
binary, output cheshire_pkg::do void'(get_entry(entry)); $display("[FAST_PRELOAD] Preload complete"); endtask + +// Suitable for loading ELFs with 32b-aligned sections +task automatic jtag_32b_elf_preload(input string binary, output bit [63:0] entry); + longint sec_addr, sec_len; + dm::sbcs_t sbcs = dm::sbcs_t +'{sbautoincrement: 1'b1, sbreadondata: 1'b1, sbaccess: 2, default: '0}; + $display("[JTAG] Preloading ELF binary: %s", binary); + if (fix.vip.read_elf(binary)) $fatal(1, "[JTAG] Failed to load ELF!"); + while (fix.vip.get_section( + sec_addr, sec_len + )) begin + byte bf[] = new[sec_len]; + $display("[JTAG] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); + if (fix.vip.read_section(sec_addr, bf, sec_len)) + $fatal(1, "[JTAG] Failed to read ELF section!"); + fix.vip.jtag_write(dm::SBCS, sbcs, 1, 1); + // Write address as 64-bit double + fix.vip.jtag_write(dm::SBAddress1, sec_addr[63:32]); + fix.vip.jtag_write(dm::SBAddress0, sec_addr[31:0]); + for (longint i = 0; i <= sec_len; i += 4) begin + bit checkpoint = (i != 0 && i % 512 == 0); + if (checkpoint) + $display( + "[JTAG] - %0d/%0d bytes (%0d%%)", + i, + sec_len, + i * 100 / (sec_len > 1 ? sec_len - 1 : 1) + ); + fix.vip.jtag_write(dm::SBData0, {bf[i+3], bf[i+2], bf[i+1], bf[i]}, checkpoint, checkpoint); + end + end + void'(get_entry(entry)); + $display("[JTAG] Preload complete"); +endtask + +// Handles misalignments, burst limits and 4KiB crossings +task automatic slink_write_generic(input addr_t addr, input longint size, ref byte bytes[]); + // Using `slink_write_beats`, writes must be beat-aligned and beat-sized (strobing is not + // possible). If we have a misaligned transfer of arbitrary size we may have at most two + // incomplete beats (start and end) and one misaligned beat (start). In case of an incomplete + // beat we read-modify-write the full beat. 
+ + // Burst and beat geometry + const int beat_bytes = fix.vip.AxiStrbWidth; + const int beat_mask = beat_bytes - 1; + const int SlinkBurstBeats = fix.vip.SlinkBurstBytes / beat_bytes; + + // Iterate beat-by-beat over the address range [addr, addr+size) + addr_t first_aligned = addr_t'(addr) & ~addr_t'(beat_mask); + addr_t end_addr = addr_t'(addr + size); + addr_t last_aligned = addr_t'((end_addr - 1) & ~addr_t'(beat_mask)); + + // Running index into bytes[]: "how many bytes have we already consumed?" + longint base_idx = 0; + + // Group beats in a burst + addr_t batch_addr = first_aligned; + axi_data_t burst [$]; + burst = {}; + + for (addr_t beat_addr = first_aligned; beat_addr <= last_aligned; beat_addr += beat_bytes) begin + addr_t next_addr; + bit crosses_4k_next, exceeds_burst_length, last_beat_in_section; + + // Window of the current beat that has to be written + int start_off = (beat_addr == first_aligned) ? int'(addr & beat_mask) : 0; + int end_off_excl = (beat_addr == last_aligned) ? 
int'(end_addr - last_aligned) : beat_bytes; + int win_len = end_off_excl - start_off; + + // Compose beat + axi_data_t beat = '0; + if (win_len == beat_bytes && start_off == 0) begin + // FULL BEAT: write directly, no RMW + for (int e = 0; e < beat_bytes; e++) begin + beat[8*e+:8] = bytes[base_idx+e]; + end + end else begin + // PARTIAL BEAT: RMW + axi_data_t rd[$]; + fix.vip.slink_read_beats(beat_addr, fix.vip.AxiStrbBits, 0, rd); + beat = rd[0]; + for (int i = 0; i < win_len; i++) begin + beat[8*(start_off+i)+:8] = bytes[base_idx+i]; + end + end + + // Accumulate and advance + burst.push_back(beat); + base_idx += win_len; + + // Decide if the next beat would cross a 4 KiB boundary, exceed the maximum burst length + // or this is the last beat + next_addr = beat_addr + win_len; + crosses_4k_next = ((next_addr & 12'hFFF) == 12'h000); // next beat starts a new page + exceeds_burst_length = (burst.size() == SlinkBurstBeats); + last_beat_in_section = (beat_addr == last_aligned); + + if (crosses_4k_next || exceeds_burst_length || last_beat_in_section) begin + // Flush accumulated beats for this page + fix.vip.slink_write_beats(batch_addr, fix.vip.AxiStrbBits, burst); + burst = {}; + batch_addr = next_addr; + end + end +endtask + +task automatic slink_32b_elf_preload(input string binary, output bit [63:0] entry); + longint sec_addr, sec_len; + + $display("[SLINK] Preloading ELF binary: %s", binary); + if (fix.vip.read_elf(binary)) $fatal(1, "[SLINK] Failed to load ELF!"); + + while (fix.vip.get_section( + sec_addr, sec_len + )) begin + byte bf[] = new[sec_len]; + $display("[SLINK] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); + if (fix.vip.read_section(sec_addr, bf, sec_len)) + $fatal(1, "[SLINK] Failed to read ELF section!"); + slink_write_generic(sec_addr, sec_len, bf); + end + + void'(fix.vip.get_entry(entry)); + $display("[SLINK] Preload complete"); +endtask diff --git a/target/sim/src/tb_picobello_top.sv b/target/sim/src/tb_picobello_top.sv index 
dcf8920b..673cbe88 100644 --- a/target/sim/src/tb_picobello_top.sv +++ b/target/sim/src/tb_picobello_top.sv @@ -10,9 +10,6 @@ module tb_picobello_top; gen_sram_banks[j].gen_sram_macros[k].i_mem.sram `include "tb_picobello_tasks.svh" - `include "cheshire/typedef.svh" - - `CHESHIRE_TYPEDEF_ALL(, fix.vip.DutCfg) // Instantiate the fixture fixture_picobello_top fix (); @@ -28,90 +25,6 @@ module tb_picobello_top; int snitch_fn; int chs_fn; - // Load Snitch binary - task automatic jtag_32b_elf_preload(input string binary, output bit [63:0] entry); - longint sec_addr, sec_len; - dm::sbcs_t sbcs = dm::sbcs_t -'{sbautoincrement: 1'b1, sbreadondata: 1'b1, sbaccess: 2, default: '0}; - $display("[JTAG] Preloading ELF binary: %s", binary); - if (fix.vip.read_elf(binary)) $fatal(1, "[JTAG] Failed to load ELF!"); - while (fix.vip.get_section( - sec_addr, sec_len - )) begin - byte bf[] = new[sec_len]; - $display("[JTAG] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); - if (fix.vip.read_section(sec_addr, bf, sec_len)) - $fatal(1, "[JTAG] Failed to read ELF section!"); - fix.vip.jtag_write(dm::SBCS, sbcs, 1, 1); - // Write address as 64-bit double - fix.vip.jtag_write(dm::SBAddress1, sec_addr[63:32]); - fix.vip.jtag_write(dm::SBAddress0, sec_addr[31:0]); - for (longint i = 0; i <= sec_len; i += 4) begin - bit checkpoint = (i != 0 && i % 512 == 0); - if (checkpoint) - $display( - "[JTAG] - %0d/%0d bytes (%0d%%)", - i, - sec_len, - i * 100 / (sec_len > 1 ? 
sec_len - 1 : 1) - ); - fix.vip.jtag_write(dm::SBData0, {bf[i+3], bf[i+2], bf[i+1], bf[i]}, checkpoint, checkpoint); - end - end - void'(get_entry(entry)); - $display("[JTAG] Preload complete"); - endtask - - task automatic slink_32b_elf_preload(input string binary, output bit [63:0] entry); - longint sec_addr, sec_len; - $display("[SLINK] Preloading ELF binary: %s", binary); - if (fix.vip.read_elf(binary)) $fatal(1, "[SLINK] Failed to load ELF!"); - while (fix.vip.get_section( - sec_addr, sec_len - )) begin - byte bf [] = new[sec_len]; - int burst_len; - $display("[SLINK] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); - if (fix.vip.read_section(sec_addr, bf, sec_len)) - $fatal(1, "[SLINK] Failed to read ELF section!"); - // Write section in bursts <= SlinkBurstBytes that never cross a 4 KiB page - for (longint sec_offs = 0; sec_offs < sec_len; sec_offs += burst_len) begin - longint sec_left, page_left; - axi_data_t beats [$]; - int bus_offs; - addr_t addr_cur = sec_addr + sec_offs; - if (sec_offs != 0) begin - $display("[SLINK] - %0d/%0d bytes (%0d%%)", sec_offs, sec_len, - sec_offs * 100 / (sec_len > 1 ? 
sec_len - 1 : 1)); - end - // By default the burst length is SlinkBurstBytes - burst_len = fix.vip.SlinkBurstBytes; - // Cut the burst length if it exceeds the remaining section length - // or it crosses a 4 KiB page boundary - sec_left = sec_len - sec_offs; - page_left = 4096 - (addr_cur & 12'hFFF); - if (burst_len > sec_left) burst_len = int'(sec_left); - if (burst_len > page_left) burst_len = int'(page_left); - bus_offs = addr_cur[fix.vip.AxiStrbBits-1:0]; - - // If the address is not aligned subtract the offset from the burst length to avoid an additional write - burst_len = burst_len - bus_offs; - // Assemble beats, handling unaligned start in the first beat - for (int b = -bus_offs; b < burst_len; b += fix.vip.AxiStrbWidth) begin - axi_data_t beat = '0; - for (int e = 0; e < fix.vip.AxiStrbWidth; ++e) - if (b + e >= 0 && b + e < burst_len) beat[8*e+:8] = bf[sec_offs+b+e]; - beats.push_back(beat); - end - // Address must be beat‑aligned for slink_write_beats - fix.vip.slink_write_beats(addr_cur - bus_offs, fix.vip.AxiStrbBits, beats); - end - end - void'(fix.vip.get_entry(entry)); - $display("[SLINK] Preload complete"); - endtask - - initial begin // Fetch plusargs or use safe (fail-fast) defaults if (!$value$plusargs("BOOTMODE=%d", boot_mode)) boot_mode = 0; diff --git a/target/sim/vsim/vsim.mk b/target/sim/vsim/vsim.mk index 438489fc..40e0ad30 100644 --- a/target/sim/vsim/vsim.mk +++ b/target/sim/vsim/vsim.mk @@ -53,3 +53,8 @@ vsim-run: vsim-run-batch: $(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit" + +vsim-run-batch-verify: vsim-run-batch +ifdef VERIFY_PY + $(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000 +endif \ No newline at end of file