diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 551a1e96..9a98fdb8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -87,15 +87,16 @@ sim-vsim: - vsim-compile script: # Run the simulation - - make vsim-run-batch + - make vsim-run-batch-verify # Check either success or failure for non-zero exit codes - 'if [ -z "${NZ_EXIT_CODE}" ]; then grep "] SUCCESS" transcript || (exit 1); else grep "] FAILED: return code ${NZ_EXIT_CODE}" transcript || (exit 1); fi' # Check for UART output - 'if [ ! -z "${USTR}" ]; then (grep " \[UART\] ${USTR}" transcript); fi' # Check for any fatal errors - 'if grep "Fatal:" transcript; then exit 1; fi' - # Check for any errors (except one for non-zero exit codes) - - 'if [ ! -z "${NZ_EXIT_CODE}" ]; then count=$(grep -c "Error:" transcript); if [ "$count" -ne 1 ]; then exit 1; fi; else if grep -q "Error:" transcript; then exit 1; fi; fi' + # Check for any non-fatal errors. One and only one error is expected with a non-zero exit code. + # Ignore all errors when using a separate verification script. + - 'if [ -z "${VERIFY_PY}" ]; then if [ ! 
-z "${NZ_EXIT_CODE}" ]; then count=$(grep -c "Error:" transcript); if [ "$count" -ne 1 ]; then exit 1; fi; else if grep -q "Error:" transcript; then exit 1; fi; fi; fi' artifacts: paths: - transcript diff --git a/.gitlab/common.yml b/.gitlab/common.yml index 3091b58e..26aaca7a 100644 --- a/.gitlab/common.yml +++ b/.gitlab/common.yml @@ -4,7 +4,7 @@ variables: PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" - PD_COMMIT: "c22baf1fcce6b0cf76b85d2009e146ba3bbb6807" + PD_COMMIT: "245dd3d4a7c33f6052a913691382871bd2c53fc0" SPU_COMMIT: "c2e8815487bd713624d74ef3e3e0465196b6d67f" # Check the cache for bender and python dependencies @@ -66,11 +66,13 @@ variables: - .cache-deps - .init-env script: - - make sn-tests + - make sn-tests DEBUG=ON + - make sn-apps DEBUG=ON - make pb-sn-tests artifacts: paths: - sw/snitch/tests/build/*.elf + - sw/snitch/apps/**/build/*.elf expire_in: 1 day # Compile the cheshire software tests diff --git a/.gitlab/sw-tests.yml b/.gitlab/sw-tests.yml index 5a4bbcb6..dd4b62c0 100644 --- a/.gitlab/sw-tests.yml +++ b/.gitlab/sw-tests.yml @@ -7,6 +7,7 @@ variables: CHS_BUILD_DIR: sw/cheshire/tests SN_BUILD_DIR: sw/snitch/tests/build + SN_ROOT: .deps/snitch_cluster parallel: matrix: - { CHS_BINARY: $CHS_BUILD_DIR/sanity.spm.elf, PRELMODE: 0 } @@ -29,3 +30,8 @@ - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/redmule.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/redmule_quant.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/datamover.elf } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/gemm_2d/build/gemm_2d.elf, VERIFY_PY: $SN_ROOT/sw/blas/gemm/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/fused_concat_linear/build/fused_concat_linear.elf, VERIFY_PY: $SN_ROOT/sw/dnn/fused_concat_linear/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: 
$CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/mha/build/mha.elf, VERIFY_PY: $SN_ROOT/sw/dnn/mha/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/gemm/build/gemm.elf, VERIFY_PY: $SN_ROOT/sw/blas/gemm/scripts/verify.py, PRELMODE: 3 } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: sw/snitch/apps/axpy/build/axpy.elf, VERIFY_PY: $SN_ROOT/sw/blas/axpy/scripts/verify.py, PRELMODE: 3 } diff --git a/Bender.lock b/Bender.lock index c8cba67f..02dd028e 100644 --- a/Bender.lock +++ b/Bender.lock @@ -146,7 +146,7 @@ packages: dependencies: - common_cells common_cells: - revision: bef3d3c5ed0e2cc211e69a6dbd81c4fe3a97025c + revision: e1c09c75775c5f03eb45906d5145dbd2f5bcfb95 version: null source: Git: https://github.com/pulp-platform/common_cells.git @@ -383,7 +383,7 @@ packages: - common_cells - register_interface snitch_cluster: - revision: e4eaa0fb64767bb8f6b7d1f5fa705928171092b2 + revision: ef3ece6c9e119fbfc25b26bb89a429ccdaacb5c6 version: null source: Git: https://github.com/pulp-platform/snitch_cluster.git diff --git a/Bender.yml b/Bender.yml index a2172691..b342a501 100644 --- a/Bender.yml +++ b/Bender.yml @@ -12,7 +12,7 @@ dependencies: axi: { git: "https://github.com/pulp-platform/axi.git", version: "0.39.6" } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", rev: "snitch" } cheshire: { git: "https://github.com/pulp-platform/cheshire.git", rev: "picobello" } - snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "e4eaa0fb64767bb8f6b7d1f5fa705928171092b2" } + snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "ef3ece6c9e119fbfc25b26bb89a429ccdaacb5c6" } floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "develop" } obi: { git: "https://github.com/pulp-platform/obi.git", rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" } redmule: { git: 
"https://github.com/pulp-platform/redmule.git", rev: "picobello" } diff --git a/Makefile b/Makefile index 29d13c89..0833fff7 100644 --- a/Makefile +++ b/Makefile @@ -218,7 +218,7 @@ python-venv: .venv python -m pip install --upgrade pip setuptools && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path floo_noc) --no-deps && \ - python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path snitch_cluster) + python -m pip install --cache-dir $(PIP_CACHE_DIR) "$(shell $(BENDER) path snitch_cluster)[kernels]" python-venv-clean: rm -rf .venv diff --git a/README.md b/README.md index 7c17363b..a91d7e00 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,14 @@ Use the `vsim-run-batch` command to run tests in batch mode with RTL optimizatio Use the `PRELMODE=3` flag to enable fast preload of the Snitch binary, and speed up the simulation. +Some applications produce a lot of output data, which would be time-consuming to check in simulation. +Said applications usually come with a Python verification script that can check the results from a dump of the memory contents at the end of the simulation. 
+For example, a verification script for the GEMM kernel can be found under `$(bender path snitch_cluster)/sw/blas/gemm/scripts/verify.py` +To run an application on Snitch and verify its results, do: +```bash +make vsim-run-batch-verify VERIFY_PY=$(bender path snitch_cluster)/sw/blas/gemm/scripts/verify.py PRELMODE=3 CHS_BINARY=sw/cheshire/tests/simple_offload.spm.elf SN_BINARY=sw/snitch/apps/blas/gemm/build/gemm.elf +``` + ### Additional help Additionally, you can run the following command to get a list of all available commands: diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 1458fda0..faf4249a 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -16,7 +16,8 @@ module mem_tile #( parameter bit AxiUserAtop = 1'b1, parameter int unsigned AxiUserAtopMsb = 3, - parameter int unsigned AxiUserAtopLsb = 0 + parameter int unsigned AxiUserAtopLsb = 0, + parameter int unsigned MemTileId = 0 ) ( input logic clk_i, input logic rst_ni, @@ -275,6 +276,25 @@ module mem_tile end end +`ifndef SYNTHESIS + // AXI Monitor dumper to improvce debiugging + axi_dumper #( + .BusName ($sformatf("mem_tile_%d", MemTileId)), + .LogAW (1'b1), + .LogAR (1'b1), + .LogW (1'b1), + .LogB (1'b1), + .LogR (1'b1), + .axi_req_t (axi_nw_join_req_t), + .axi_resp_t(axi_nw_join_rsp_t) + ) i_axi_monitor ( + .clk_i, + .rst_ni, + .axi_req_i (axi_req), + .axi_resp_i(axi_rsp) + ); +`endif + axi_to_obi #( .ObiCfg (MgrObiCfg), .obi_req_t (mgr_obi_req_t), diff --git a/hw/picobello_top.sv b/hw/picobello_top.sv index 7a150820..e8d737fe 100644 --- a/hw/picobello_top.sv +++ b/hw/picobello_top.sv @@ -273,7 +273,9 @@ module picobello_top localparam int MemTileX = int'(MemTilePhysicalId.x); localparam int MemTileY = int'(MemTilePhysicalId.y); - mem_tile i_mem_tile ( + mem_tile #( + .MemTileId(int'(m)) + ) i_mem_tile ( .clk_i, .rst_ni, .test_enable_i (test_mode_i), diff --git a/requirements.txt b/requirements.txt index 4892f657..78565385 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,5 +25,5 @@ hjson # for 
reggen # For peakrdl peakrdl -peakrdl-rawheader @ git+https://github.com/micprog/peakrdl-rawheader.git +peakrdl-rawheader @ git+https://github.com/colluca/PeakRDL-rawheader.git@7b8dbc9ad5854dc1cdaf36d4ea024c29ffb00a4c peakrdl-markdown diff --git a/sw/snitch/apps/axpy/app.mk b/sw/snitch/apps/axpy/app.mk new file mode 100644 index 00000000..9e032762 --- /dev/null +++ b/sw/snitch/apps/axpy/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := axpy +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/blas/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/axpy/data/params.json b/sw/snitch/apps/axpy/data/params.json new file mode 100644 index 00000000..eafb7128 --- /dev/null +++ b/sw/snitch/apps/axpy/data/params.json @@ -0,0 +1,9 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + "n_tiles": 5, + "n": 2560, + "funcptr": "axpy_opt" +} diff --git a/sw/snitch/apps/fused_concat_linear/app.mk b/sw/snitch/apps/fused_concat_linear/app.mk new file mode 100644 index 00000000..fd0eb531 --- /dev/null +++ b/sw/snitch/apps/fused_concat_linear/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := fused_concat_linear +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/dnn/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/dnn/src $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/fused_concat_linear/data/params.json b/sw/snitch/apps/fused_concat_linear/data/params.json new file mode 100644 index 00000000..935cbd99 --- /dev/null +++ b/sw/snitch/apps/fused_concat_linear/data/params.json @@ -0,0 +1,11 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +{ + num_inputs: 2, + input_shape: [16, 16], + output_shape: [16, 16], + dtype: "FP64", + gemm_implementation: "gemm_fp64_opt" +} \ No newline at end of file diff --git a/sw/snitch/apps/gemm/app.mk b/sw/snitch/apps/gemm/app.mk new file mode 100644 index 00000000..1dd86b44 --- /dev/null +++ b/sw/snitch/apps/gemm/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := gemm +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/blas/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/gemm/data/params.json b/sw/snitch/apps/gemm/data/params.json new file mode 100644 index 00000000..524ad42b --- /dev/null +++ b/sw/snitch/apps/gemm/data/params.json @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 64, // number of tiles in M dimension + n_tiles: 1, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 2048, + n: 16, + k: 16, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/sw/snitch/apps/gemm_2d/app.mk b/sw/snitch/apps/gemm_2d/app.mk new file mode 100644 index 00000000..f0155c5b --- /dev/null +++ b/sw/snitch/apps/gemm_2d/app.mk @@ -0,0 +1,17 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Lorenzo Leone + +APP := gemm_2d +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +SRC_DIR := $(PB_SNITCH_SW_DIR)/apps/$(APP)/src +SRCS := $(SRC_DIR)/gemm_2d.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/blas $(SN_ROOT)/sw/blas/gemm/src + +# Refer to Snitch scripts +$(APP)_SCRIPT_DIR := $(SN_ROOT)/sw/blas/gemm/scripts + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/gemm_2d/data/params.json b/sw/snitch/apps/gemm_2d/data/params.json new file mode 100644 index 00000000..afc30631 --- /dev/null +++ b/sw/snitch/apps/gemm_2d/data/params.json @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 16, // number of tiles in M dimension + n_tiles: 4, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 128, + n: 32, + k: 16, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/sw/snitch/apps/gemm_2d/roi.json b/sw/snitch/apps/gemm_2d/roi.json new file mode 100644 index 00000000..4249f886 --- /dev/null +++ b/sw/snitch/apps/gemm_2d/roi.json @@ -0,0 +1,30 @@ +[ + <% N_TILES = 4 %> + + % for cluster in range(0,16): + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{cluster * 9 + j + 1}'}", + "roi": [ + % for i in range(0, N_TILES): + {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"}, + % endfor + ] + }, + % endfor + + // DMA core + { + "thread": "${f'hart_{cluster * 9 + 8 + 1}'}", + "roi": [ + {"idx": 1, "label": "${f'tile_in_0'}"}, + % for i in range(0, N_TILES - 1): + {"idx": ${4*i + 3}, "label": "${f'tile_in_{i+1}'}"}, + {"idx": ${4*i + 5}, "label": 
"${f'tile_out_{i}'}"}, + % endfor + {"idx": ${N_TILES * 4 - 1}, "label": "${f'tile_out_{N_TILES-1}'}"}, + ] + }, + % endfor +] diff --git a/sw/snitch/apps/gemm_2d/src/gemm_2d.c b/sw/snitch/apps/gemm_2d/src/gemm_2d.c new file mode 100644 index 00000000..57dc4797 --- /dev/null +++ b/sw/snitch/apps/gemm_2d/src/gemm_2d.c @@ -0,0 +1,488 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Tim Fischer +// Luca Bertaccini +// Luca Colagrande +// Viviane Potocnik +// Lorenzo leone + +// TODO (lleone): LIMITATIONS +// +// - Works only when M_tile = snrt_cluster_num() +// - Works only if parallelized on M + +#include "snrt.h" +#include +#include + +#include +#include "blas.h" + +// #define HW_MCAST + +// #define JOB_ARGS_PRELOADED + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreorder-init-list" +#include "data.h" +#pragma clang diagnostic pop + +// Allocate data in L2 to better map kernel in NoC system +static inline void allocate_l2_buffers(gemm_args_t *largs ) { + + uint32_t prec = largs->prec; + uint32_t mem_tile_idx = pb_closest_mem_tile(snrt_cluster_idx()); + + uintptr_t a_off = pb_l2_tile_offset((uintptr_t) largs -> a); + uintptr_t c_off = pb_l2_tile_offset((uintptr_t) largs -> c); + uintptr_t a_dst = pb_l2_tile_address(mem_tile_idx) + a_off; + uintptr_t c_dst = pb_l2_tile_address(mem_tile_idx) + c_off; + + // Move data in the correct memory tile location + uint32_t size_a = (size_t)largs->m * (size_t)largs->k; + uint32_t size_c = (size_t)largs->m * (size_t)largs->n; + snrt_dma_start_1d((void *) a_dst, (void *) largs->a, size_a * prec); + snrt_dma_start_1d((void *) c_dst, (void *) largs->c, size_c * prec); + + // Update A and C local pointer to the relocated memory tile address + largs->a = (void *) a_dst; + largs->c = (void *) c_dst; +} + +// Write back C tiles in original memory tile for verification 
purposes +static inline void write_back_c_tiles(gemm_args_t* largs, uint32_t m_tile_size, + uint32_t n_tile_size) { + uintptr_t c_src, c_dst; + uint32_t transfer_size; + int c_m_abs, c_n_abs; + + + // Position of the first element in the Tile to be written back + c_src = (uintptr_t )largs->c + (snrt_cluster_idx() * largs->n * m_tile_size) * largs->prec; + c_dst = pb_l2_tile_address(0) + pb_l2_tile_offset( (uintptr_t) c_src); + transfer_size = m_tile_size * largs->n * largs->prec; + + if (c_src != c_dst) snrt_dma_start_1d((void *) c_dst, (void *) c_src, transfer_size); +} + +/** + * @brief Performs a General Matrix Multiplication (GEMM) operation on a + * Snitch-based multiple-cluster architecture with support for + * parallelization, tiling, and data movement optimizations. + * + * @param args Pointer to a `gemm_args_t` structure containing arguments + * for the GEMM operation. + * + * @details + * The function performs the following steps: + * 1. Copies the input arguments to local memory for faster access. + * 2. Calculates tile sizes based on the input dimensions and number of tiles. + * 3. Allocates space in TCDM for local copies of matrix tiles, unless + * matrix tiles are already stored in TCDM (see `load_* arguments`). + * 4. Distributes tiles to clusters for parallel processing. + * 5. Iterates over the tiles, performing the following: + * - Copies data for the current tile into local memory. + * - Performs the tile computation using the `sc_st_gemm` function. + * - Performs a logarithmic reduction to combine partial results across + * clusters, if `parallelize_k` is enabled. + * - Writes the result back to global memory. + * + * @note Current implementation assumes that `parallelize_m` and + * `parallelize_k` options are mutually exclusive. 
+ */ +static inline int gemm_picobello(const gemm_args_t *args) { +#ifndef JOB_ARGS_PRELOADED + // Copy the arguments to local memory + gemm_args_t *largs = (gemm_args_t *)snrt_l1_alloc_cluster_local( + sizeof(gemm_args_t), alignof(gemm_args_t)); + if (snrt_is_dm_core()) { + snrt_dma_start_1d((void *)largs, (void *)args, sizeof(gemm_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); +#else + const gemm_args_t *largs = args; +#endif + + // Calculate tile sizes + uint32_t tile_m = largs->m / largs->m_tiles; + uint32_t tile_n = largs->n / largs->n_tiles; + uint32_t tile_k = largs->k / largs->k_tiles; + uint32_t tile_a_size = tile_m * tile_k * largs->prec; + uint32_t tile_b_size = tile_k * tile_n * largs->prec; + uint32_t tile_c_size = tile_m * tile_n * largs->prec; + + // Allocate space for local tile buffers in TCDM, unless preloaded + void *a0, *a1, *b0, *b1, *c0, *c1; + void *la[2], *lb[2], *lc[2], *lcr; + int banks_per_buffer = snrt_cluster_compute_core_num(); + allocate_buffers(tile_a_size, tile_b_size, tile_c_size, largs, + banks_per_buffer, la, lb, lc, &lcr); + if (snrt_cluster_core_idx() == 0) { + DUMP(la[0]); + DUMP(la[1]); + DUMP(lb[0]); + DUMP(lb[1]); + DUMP(lc[0]); + DUMP(lc[1]); + } + snrt_cluster_hw_barrier(); + + // NoC layout (6 columns x 4 rows) + /* + // + |------| |------| |------| |------| |------| |------| + | M3 |---| C3 |---| C7 |---| C11 |---| C15 |---| M7 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M2 |---| C2 |---| C6 |---| C10 |---| C14 |---| M6 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M1 |---| C1 |---| C5 |---| C9 |---| C13 |---| M5 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M0 |---| C0 |---| C4 |---| C8 |---| C12 |---| 
M4 | + |------| |------| |------| |------| |------| |------| + // + */ + + // Use the DMA core of cluster 0 to place all data in the correct positions + // so the problem becomes NoC-optimized. + // + // Case: parallelization over M, with tiling along both M and N. + // - Each cluster processes a set of rows of A: + // [Cluster_idx * Mt : (Cluster_idx + 1) * Mt - 1]. + // - All clusters share the same set of columns of B: + // [num_iter * Nt : (num_iter + 1) * Nt - 1]. + // - Each cluster computes a full tile of C (no partial results - no reduction). + // + // Memory tile mapping: + // - Rows of A for a given cluster are placed in the same row. + // * The first half of the clusters load A from the memory tiles on the left [tile 0 - 3]. + // * The second half of the clusters load A from the memory tiles on the right [tile 4 - 7]. + // - The same scheme is used to store the corresponding tile of C. + // - Matrix B is stored entirely in the first memory tile. + // * Since all clusters need access to B, its exact location does not affect + // performance significantly. + // + // Notes: + // - All data movement to arrange memory tiles is performed before measuring + // kernel execution time. + // - With a proper linker script, data could be placed directly in the correct + // memory tiles without requiring extra DMA work from cluster 0. 
+ + // Distribute m and k tiles to clusters + uint32_t cluster_m_tiles = largs->m_tiles; + uint32_t cluster_k_tiles = largs->k_tiles; + uint32_t num_working_clusters = snrt_cluster_num(); + if (largs->parallelize_m) { + uint32_t m_tiles_quotient = cluster_m_tiles / snrt_cluster_num(); + uint32_t m_tiles_remainder = cluster_m_tiles % snrt_cluster_num(); + cluster_m_tiles = m_tiles_quotient; + if (snrt_cluster_idx() < m_tiles_remainder) cluster_m_tiles++; + if (m_tiles_quotient == 0) num_working_clusters = m_tiles_remainder; + } + if (largs->parallelize_k) cluster_k_tiles /= snrt_cluster_num(); + + snrt_comm_t comm; + snrt_comm_create(num_working_clusters, &comm); + + // Calculate number of iterations + uint32_t num_tiles = cluster_m_tiles * largs->n_tiles * cluster_k_tiles; + uint32_t num_iters = num_tiles; + if (largs->double_buffer) + num_iters += 2; + else + num_iters += 1; + + // Place data in the correct memory tile pre-kernel. + // TODO (lleone): Improve copying only the necessary information and not the full data stack + if (snrt_is_dm_core()) + { + allocate_l2_buffers(largs); + snrt_dma_wait_all(); + } + snrt_global_barrier(comm); + + + + // Iterate over all tiles + for (uint32_t i = 0; i < num_iters; i++) { + // Calculate tile indices (we iterate in k->n->m order) + int dma_in_i = i; + int comp_i = largs->double_buffer ? i - 1 : i; + int dma_out_i = largs->double_buffer ? 
i - 2 : i - 1; + int dma_in_k = dma_in_i % cluster_k_tiles; + int dma_in_mn = dma_in_i / cluster_k_tiles; + int dma_in_n = dma_in_mn % largs->n_tiles; + int dma_in_m = dma_in_mn / largs->n_tiles; + int comp_k = comp_i % cluster_k_tiles; + int comp_mn = comp_i / cluster_k_tiles; + int comp_n = comp_mn % largs->n_tiles; + int comp_m = comp_mn / largs->n_tiles; + int dma_out_k = dma_out_i % cluster_k_tiles; + int dma_out_mn = dma_out_i / cluster_k_tiles; + int dma_out_n = dma_out_mn % largs->n_tiles; + int dma_out_m = dma_out_mn / largs->n_tiles; + + // If m and k tiles are parallelized across clusters, + // calculate the absolute m and k indices for each cluster + int dma_in_m_abs = dma_in_m; + int comp_m_abs = comp_m; + int dma_out_m_abs = dma_out_m; + int dma_in_k_abs = dma_in_k; + int comp_k_abs = comp_k; + int dma_out_k_abs = dma_out_k; + if (largs->parallelize_m) { + dma_in_m_abs += snrt_cluster_idx() * cluster_m_tiles; + comp_m_abs += snrt_cluster_idx() * cluster_m_tiles; + dma_out_m_abs += snrt_cluster_idx() * cluster_m_tiles; + } + if (largs->parallelize_k) { + dma_in_k_abs += snrt_cluster_idx() * cluster_k_tiles; + comp_k_abs += snrt_cluster_idx() * cluster_k_tiles; + dma_out_k_abs += snrt_cluster_idx() * cluster_k_tiles; + } + + // In the first k iteration we accumulate with the C matrix + // scaled by beta, in successive iterations we accumulate + // the previous partial result. The tile-level beta is thus + // a function of k: beta(k). + uint32_t comp_k_beta = comp_k_abs == 0 ? largs->beta : 1; + uint32_t dma_in_k_beta = dma_in_k_abs == 0 ? largs->beta : 1; + + // DMA out phase + if (snrt_is_dm_core()) { + if (dma_out_i >= 0) { + snrt_mcycle(); + // Switch buffers + int buff_idx = largs->double_buffer ? 
dma_out_mn % 2 : 0; + + // Store C + // If parallelize_k, then only cluster 0 must writeback + if ((snrt_cluster_idx() == 0) || !(largs->parallelize_k)) { + if (largs->partition_banks) { + snrt_dma_2d_to_1d( + (void *)((uintptr_t)largs->c + + dma_out_m_abs * tile_c_size), + lc[buff_idx], tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_store_2d_tile(largs->c, lc[buff_idx], + dma_out_m_abs, dma_out_n, tile_m, + tile_n, largs->ldc, largs->prec); + } + snrt_dma_wait_all(); + } + snrt_mcycle(); + } + } + + // DMA in phase + if (snrt_is_dm_core()) { + if (dma_in_i < num_tiles) { + snrt_mcycle(); + // Switch buffers + // A and B buffers are switched every iteration, while the C + // buffer only needs to be switched after fully accumulating + // the result, i.e. after finishing the K loop. + int buff_idx = largs->double_buffer ? dma_in_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? dma_in_mn % 2 : 0; + int load_a = largs->double_buffer ? (dma_in_i < 2) : (dma_in_i < 1); + + // Load A + // TODO (lleone): When tiling on M and parallelizing on M there is no need + // to load At multiple times. + // If you have DOBU, you load twice and then At is available + // in both buffers. This can be done only when Mt is fully parallelizable + // in you system. 
+ if (largs->load_a) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + la[buff_idx], + (void *)((uintptr_t)largs->a + + dma_in_m_abs * tile_a_size), + tile_a_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + if (load_a) { + snrt_dma_load_2d_tile( + la[buff_idx], largs->a, dma_in_m_abs, dma_in_k_abs, + tile_m, tile_k, largs->lda, largs->prec); + } + } + } + + // Load B + if (largs->load_b) { + if (largs->transb) { + snrt_dma_load_2d_tile(lb[buff_idx], largs->b, dma_in_n, + dma_in_k_abs, tile_n, tile_k, + largs->ldb, largs->prec); + } else { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lb[buff_idx], + (void *)((uintptr_t)largs->b + + dma_in_k_abs * tile_b_size), + tile_b_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + // TODO (lleone): Is it really necessary? + if (largs->parallelize_k) { + snrt_dma_load_2d_tile( + lb[buff_idx], largs->b, dma_in_k_abs, dma_in_n, + tile_k, tile_n, largs->ldb, largs->prec); + } else { + // Multicast B to all clusters + #ifdef HW_MCAST + if (snrt_cluster_idx() == 0) { + // Load B from L2 + snrt_dma_load_2d_tile_mcast( + lb[buff_idx], largs->b, dma_in_k_abs, dma_in_n, + tile_k, tile_n, largs->ldb, largs->prec, 0x003C0000); + } + #else + snrt_dma_load_2d_tile( + lb[buff_idx], largs->b, dma_in_k_abs, dma_in_n, + tile_k, tile_n, largs->ldb, largs->prec); + #endif + } + } + } + } + + // Load C + // C tile is loaded only upon the first k iteration, then + // the C array will contain the partial results from the + // previous iteration + if (largs->load_c && dma_in_k_beta != 0) { + if (dma_in_k_abs == 0) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], + (void *)((uintptr_t)largs->c + + dma_in_m_abs * tile_c_size), + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_load_2d_tile(lc[c_buff_idx], largs->c, + dma_in_m_abs, dma_in_n, + tile_m, tile_n, largs->ldc, + 
largs->prec); + } + } else if (dma_in_k == 0) { + // Clusters other than the first need to initialize + // the C array to zero in their first iteration + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], snrt_cluster()->zeromem.mem, + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_start_1d(lc[c_buff_idx], + snrt_cluster()->zeromem.mem, + tile_c_size); + } + } + } + snrt_dma_wait_all(); + snrt_mcycle(); + } + } + + // Additional barrier required when not double buffering + if (!largs->double_buffer) snrt_global_barrier(comm); + + // Compute phase + if (comp_i >= 0 && comp_i < num_tiles) { + // Switch buffers + int buff_idx = largs->double_buffer ? comp_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? comp_mn % 2 : 0; + + // Only compute cores participate in the tile computation + if (!snrt_is_dm_core()) { + // uint32_t start_cycle = snrt_mcycle(); + + // Tile computation + sc_st_gemm_args_t sc_st_args; + sc_st_args.prec = largs->prec; + sc_st_args.setup_ssr = largs->setup_ssr; + sc_st_args.partition_banks = largs->partition_banks; + sc_st_args.transa = largs->transa; + sc_st_args.transb = largs->transb; + sc_st_args.a = la[buff_idx]; + if (largs->transa) { + sc_st_args.lda = tile_m; + } else if (largs->partition_banks) { + sc_st_args.lda = calculate_partitioned_banks_stride( + banks_per_buffer, tile_k, largs->prec); + } else { + sc_st_args.lda = tile_k; + } + sc_st_args.b = lb[buff_idx]; + if (largs->transb) { + sc_st_args.ldb = tile_k; + } else if (largs->partition_banks) { + sc_st_args.ldb = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldb = tile_n; + } + sc_st_args.beta = comp_k_beta; + sc_st_args.c = lc[c_buff_idx]; + if (largs->partition_banks) { + sc_st_args.ldc = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldc = tile_n; + } + sc_st_args.m = tile_m; + sc_st_args.n 
= tile_n; + sc_st_args.k = tile_k; + sc_st_gemm(largs->gemm_fp, &sc_st_args); + + // uint32_t end_cycle = snrt_mcycle(); + } + + // Add the partial result tiles from the various clusters together + // in a logarithmic reduction fashion. + // Note: both compute and DMA cores participate in this step. + if (largs->parallelize_k && (comp_k == (cluster_k_tiles - 1))) { + snrt_global_reduction_dma( + (double *)lcr, (double *)lc[c_buff_idx], tile_m * tile_n); + } + } + + // Synchronize cores after every iteration + snrt_global_barrier(comm); + } + + // Before completing the kernel, each cluster writes back its C tiles in the + // original memory tile. This is necessary only to run teh verify.py script + + if (snrt_is_dm_core() && snrt_cluster_idx() < num_working_clusters) { + write_back_c_tiles(largs, tile_m, tile_n); + } + + return 0; +} + + +int main () { + gemm_picobello(&args); + return 0; +} diff --git a/sw/snitch/apps/mha/app.mk b/sw/snitch/apps/mha/app.mk new file mode 100644 index 00000000..4af8cb3b --- /dev/null +++ b/sw/snitch/apps/mha/app.mk @@ -0,0 +1,15 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := mha +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +$(APP)_DATA_CFG := $(PB_SNITCH_SW_DIR)/apps/$(APP)/data/params.json +SRC_DIR := $(SN_ROOT)/sw/dnn/$(APP)/src +SRCS := $(SRC_DIR)/main.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/dnn/src $(SN_ROOT)/sw/blas + +include $(SN_ROOT)/sw/apps/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/sw/snitch/apps/mha/data/params.json b/sw/snitch/apps/mha/data/params.json new file mode 100644 index 00000000..2c63ab3d --- /dev/null +++ b/sw/snitch/apps/mha/data/params.json @@ -0,0 +1,14 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +{ + num_heads: 2, + L: 16, + S: 16, + d: 16, + B_r: 16, + B_c: 16, + dtype: "FP32", + baseline: true +} \ No newline at end of file diff --git a/sw/snitch/runtime/src/pb_noc_cfg.h b/sw/snitch/runtime/src/pb_noc_cfg.h new file mode 100644 index 00000000..bc285ba6 --- /dev/null +++ b/sw/snitch/runtime/src/pb_noc_cfg.h @@ -0,0 +1,8 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Lorenzo Leone + +#define PB_CLUSTER_PER_ROW 4 +#define PB_CLUSTER_PER_COL 4 diff --git a/sw/snitch/runtime/src/pb_team.c b/sw/snitch/runtime/src/pb_team.c new file mode 100644 index 00000000..cd280f61 --- /dev/null +++ b/sw/snitch/runtime/src/pb_team.c @@ -0,0 +1,19 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +extern inline uintptr_t pb_l2_tile_address(uint32_t tile_idx); + +extern inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr); + +extern inline uint32_t pb_cluster_row(uint32_t cidx); + +extern inline uint32_t pb_cluster_row(); + +extern inline uint32_t pb_cluster_col(uint32_t cidx); + +extern inline uint32_t pb_cluster_col(); + +extern inline uint32_t pb_closest_mem_tile(uint32_t cidx); + +extern inline uint32_t pb_closest_mem_tile(); diff --git a/sw/snitch/runtime/src/pb_team.h b/sw/snitch/runtime/src/pb_team.h new file mode 100644 index 00000000..1fa8d8d0 --- /dev/null +++ b/sw/snitch/runtime/src/pb_team.h @@ -0,0 +1,95 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Lorenzo Leone + + +/** + * @file + * @brief This file contains functions and macros related to Picobello team + * management. 
+ */ + +/** + * @brief Get start address of a memory tile + * @param tile_idx The memory tile idx in the NoC + * @return Start address of memory tile idx + */ +inline uintptr_t pb_l2_tile_address(uint32_t tile_idx) { + return (uintptr_t) (picobello_addrmap.l2_spm[tile_idx].mem); +} + +/** + * @brief Get the address offset of a data with respect to the memory tile start address + * @param src_addr The data absolute address + * @return Address location offset with respect to the tile start address + */ +inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr) { + return (src_addr - PICOBELLO_ADDRMAP_L2_SPM_0_BASE_ADDR) % + PICOBELLO_ADDRMAP_L2_SPM_0_SIZE; +} + + +/** + * @brief Get the NoC row index + * @param cidx The cluster index + * @return The Row index + */ +inline uint32_t pb_cluster_row(uint32_t cidx) +{ + return cidx % PB_CLUSTER_PER_ROW; +} + +/** + * @brief Get the NoC row index + * This is a convenience overload of pb_cluster_row() + * @return The Row index + */ +inline uint32_t pb_cluster_row() +{ + return pb_cluster_row(snrt_cluster_idx()); +} + + +/** + * @brief Get the NoC column index + * @param cidx The cluster index + * @return The Column index + */ +inline uint32_t pb_cluster_col(uint32_t cidx) +{ + return cidx / PB_CLUSTER_PER_COL; +} + +/** + * @brief Get the NoC column index + * This is a convenience overload of pb_cluster_col() + * @return The Column index + */ +inline uint32_t pb_cluster_col() +{ + return pb_cluster_col(snrt_cluster_idx()); +} + + +/** + * @brief Get the index of the closest memory tile + * @param cidx The cluster index + * @return Index of the closest memory tile to cidx + */ +inline uint32_t pb_closest_mem_tile(uint32_t cidx) { + uint32_t row = pb_cluster_row(cidx); + // e.g. with 4x4 matrix + // first 8 clusters -> left column tiles 0..3 + // clusters >= 8 -> right column tiles 4..7 + return (cidx < (snrt_cluster_num() / 2)) ?
row : (row + PB_CLUSTER_PER_COL); +} + +/** + * @brief Get the index of the closest memory tile + * This is a convenience overload of pb_closest_mem_tile() + */ +inline uint32_t pb_closest_mem_tile() { + return pb_closest_mem_tile(snrt_cluster_idx()); +} diff --git a/sw/snitch/runtime/src/snitch_cluster_memory.c b/sw/snitch/runtime/src/snitch_cluster_memory.c new file mode 120000 index 00000000..adb289b9 --- /dev/null +++ b/sw/snitch/runtime/src/snitch_cluster_memory.c @@ -0,0 +1 @@ +../../../../.deps/snitch_cluster/target/snitch_cluster/sw/runtime/common/snitch_cluster_memory.c \ No newline at end of file diff --git a/sw/snitch/runtime/src/snrt.S b/sw/snitch/runtime/src/snrt.S deleted file mode 120000 index 6fb619df..00000000 --- a/sw/snitch/runtime/src/snrt.S +++ /dev/null @@ -1 +0,0 @@ -../../../../.deps/snitch_cluster/target/snitch_cluster/sw/runtime/rtl/src/snrt.S \ No newline at end of file diff --git a/sw/snitch/runtime/src/snrt.S b/sw/snitch/runtime/src/snrt.S new file mode 100644 index 00000000..89fa8995 --- /dev/null +++ b/sw/snitch/runtime/src/snrt.S @@ -0,0 +1,18 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#define SNRT_INIT_INT_REGS +#define SNRT_INIT_FP_REGS +#define SNRT_INIT_GP +#define SNRT_INIT_CORE_INFO +#define SNRT_INIT_CLS +#define SNRT_INIT_STACK +#define SNRT_INIT_TLS +#define SNRT_CRT0_PARK + +#include "pb_raw_addrmap.h" +#define SNRT_TCDM_START_ADDR PICOBELLO_ADDRMAP_CLUSTER_0_TCDM_BASE_ADDR + +#include "snitch_cluster_cfg.h" +#include "start.S" diff --git a/sw/snitch/runtime/src/snrt.h b/sw/snitch/runtime/src/snrt.h index 137a3737..8959c80a 100644 --- a/sw/snitch/runtime/src/snrt.h +++ b/sw/snitch/runtime/src/snrt.h @@ -11,7 +11,9 @@ #include "pb_addrmap.h" #include "snitch_cluster_cfg.h" #include "snitch_cluster_peripheral_addrmap.h" -#include "snitch_cluster_raw_addrmap.h" +#include "pb_raw_addrmap.h" +#include "pb_noc_cfg.h" +#define SNRT_TCDM_START_ADDR PICOBELLO_ADDRMAP_CLUSTER_0_TCDM_BASE_ADDR // TODO: the 40000 stride is hardcoded here, but it would better be // autogenerated by Floogen. At the same time that would be @@ -50,6 +52,7 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t; #include "sync.h" #include "team.h" #include "types.h" +#include "pb_team.h" // Accelerators #include "datamover/archi_datamover.h" diff --git a/sw/sw.mk b/sw/sw.mk index c4008512..49144ecf 100644 --- a/sw/sw.mk +++ b/sw/sw.mk @@ -28,12 +28,24 @@ SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build SNRT_INCDIRS = $(PB_INCDIR) $(PB_GEN_DIR) SNRT_BUILD_APPS = OFF SNRT_MEMORY_LD = $(PB_SNITCH_SW_DIR)/memory.ld +SNRT_HAL_BUILD_DIR = $(PB_SNITCH_SW_DIR)/runtime/build SNRT_HAL_HDRS = $(PB_GEN_DIR)/pb_addrmap.h +SNRT_HAL_HDRS += $(PB_GEN_DIR)/pb_raw_addrmap.h -ifneq (,$(filter chs-bootrom% chs-sw% sn% pb-sn-tests% sw%,$(MAKECMDGOALS))) +SNRT_APPS = $(PB_SNITCH_SW_DIR)/apps/gemm_2d +SNRT_APPS += $(PB_SNITCH_SW_DIR)/apps/gemm +SNRT_APPS += $(PB_SNITCH_SW_DIR)/apps/axpy +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/flashattention_2 +SNRT_APPS += $(PB_SNITCH_SW_DIR)/apps/fused_concat_linear +SNRT_APPS += 
$(PB_SNITCH_SW_DIR)/apps/mha + +ifneq (,$(filter $(PB_SNITCH_SW_DIR)% chs-bootrom% chs-sw% sn% pb-sn-tests% sw%,$(MAKECMDGOALS))) include $(SN_ROOT)/target/snitch_cluster/sw.mk endif +$(PB_GEN_DIR)/pb_raw_addrmap.h: $(PB_RDL_ALL) + $(PEAKRDL) raw-header $< -o $@ $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) --base_name $(notdir $(basename $@)) --format c + # Collect Snitch tests which should be built PB_SNRT_TESTS_DIR = $(PB_SNITCH_SW_DIR)/tests PB_SNRT_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build @@ -67,7 +79,7 @@ PB_LINK_MODE ?= spm # We need to include the address map and snitch cluster includes CHS_SW_INCLUDES += -I$(PB_INCDIR) -CHS_SW_INCLUDES += -I$(SNRT_HAL_HDRS_DIR) +CHS_SW_INCLUDES += -I$(SNRT_HAL_BUILD_DIR) CHS_SW_INCLUDES += -I$(PB_GEN_DIR) # Collect tests, which should be build for all modes, and their .dump targets @@ -95,8 +107,9 @@ chs-sw-tests-clean: # Alias targets to align them with Picobello naming convention sn-tests-clean: sn-clean-tests sn-runtime-clean: sn-clean-runtime +sn-apps-clean: sn-clean-apps .PHONY: sw sw-tests sw-clean sw-tests-clean -sw sw-tests: chs-sw-tests sn-tests pb-sn-tests +sw sw-tests: chs-sw-tests sn-tests pb-sn-tests sn-apps -sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean clean-pb-sn-tests +sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean clean-pb-sn-tests sn-apps-clean diff --git a/target/sim/include/tb_picobello_tasks.svh b/target/sim/include/tb_picobello_tasks.svh index 40db1f36..9158a327 100644 --- a/target/sim/include/tb_picobello_tasks.svh +++ b/target/sim/include/tb_picobello_tasks.svh @@ -12,6 +12,9 @@ import "DPI-C" context function byte read_section(input longint address, inout b import picobello_pkg::*; `include "pb_addrmap.svh" +`include "cheshire/typedef.svh" + +`CHESHIRE_TYPEDEF_ALL(, fix.vip.DutCfg) task automatic jtag_enable_tiles(); $display("Resetting tiles and enabling clock..."); @@ -151,3 +154,128 @@ task automatic fastmode_elf_preload(input string 
binary, output cheshire_pkg::do void'(get_entry(entry)); $display("[FAST_PRELOAD] Preload complete"); endtask + +// Suitable for loading ELFs with 32b-aligned sections +task automatic jtag_32b_elf_preload(input string binary, output bit [63:0] entry); + longint sec_addr, sec_len; + dm::sbcs_t sbcs = dm::sbcs_t +'{sbautoincrement: 1'b1, sbreadondata: 1'b1, sbaccess: 2, default: '0}; + $display("[JTAG] Preloading ELF binary: %s", binary); + if (fix.vip.read_elf(binary)) $fatal(1, "[JTAG] Failed to load ELF!"); + while (fix.vip.get_section( + sec_addr, sec_len + )) begin + byte bf[] = new[sec_len]; + $display("[JTAG] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); + if (fix.vip.read_section(sec_addr, bf, sec_len)) + $fatal(1, "[JTAG] Failed to read ELF section!"); + fix.vip.jtag_write(dm::SBCS, sbcs, 1, 1); + // Write address as 64-bit double + fix.vip.jtag_write(dm::SBAddress1, sec_addr[63:32]); + fix.vip.jtag_write(dm::SBAddress0, sec_addr[31:0]); + for (longint i = 0; i <= sec_len; i += 4) begin + bit checkpoint = (i != 0 && i % 512 == 0); + if (checkpoint) + $display( + "[JTAG] - %0d/%0d bytes (%0d%%)", + i, + sec_len, + i * 100 / (sec_len > 1 ? sec_len - 1 : 1) + ); + fix.vip.jtag_write(dm::SBData0, {bf[i+3], bf[i+2], bf[i+1], bf[i]}, checkpoint, checkpoint); + end + end + void'(get_entry(entry)); + $display("[JTAG] Preload complete"); +endtask + +// Handles misalignments, burst limits and 4KiB crossings +task automatic slink_write_generic(input addr_t addr, input longint size, ref byte bytes[]); + // Using `slink_write_beats`, writes must be beat-aligned and beat-sized (strobing is not + // possible). If we have a misaligned transfer of arbitrary size we may have at most two + // incomplete beats (start and end) and one misaligned beat (start). In case of an incomplete + // beat we read-modify-write the full beat. 
+ + // Burst and beat geometry + const int beat_bytes = fix.vip.AxiStrbWidth; + const int beat_mask = beat_bytes - 1; + const int SlinkBurstBeats = fix.vip.SlinkBurstBytes / beat_bytes; + + // Iterate beat-by-beat over the address range [addr, addr+size) + addr_t first_aligned = addr_t'(addr) & ~addr_t'(beat_mask); + addr_t end_addr = addr_t'(addr + size); + addr_t last_aligned = addr_t'((end_addr - 1) & ~addr_t'(beat_mask)); + + // Running index into bytes[]: "how many bytes have we already consumed?" + longint base_idx = 0; + + // Group beats in a burst + addr_t batch_addr = first_aligned; + axi_data_t burst [$]; + burst = {}; + + for (addr_t beat_addr = first_aligned; beat_addr <= last_aligned; beat_addr += beat_bytes) begin + addr_t next_addr; + bit crosses_4k_next, exceeds_burst_length, last_beat_in_section; + + // Window of the current beat that has to be written + int start_off = (beat_addr == first_aligned) ? int'(addr & beat_mask) : 0; + int end_off_excl = (beat_addr == last_aligned) ? 
int'(end_addr - last_aligned) : beat_bytes; + int win_len = end_off_excl - start_off; + + // Compose beat + axi_data_t beat = '0; + if (win_len == beat_bytes && start_off == 0) begin + // FULL BEAT: write directly, no RMW + for (int e = 0; e < beat_bytes; e++) begin + beat[8*e+:8] = bytes[base_idx+e]; + end + end else begin + // PARTIAL BEAT: RMW + axi_data_t rd[$]; + fix.vip.slink_read_beats(beat_addr, fix.vip.AxiStrbBits, 0, rd); + beat = rd[0]; + for (int i = 0; i < win_len; i++) begin + beat[8*(start_off+i)+:8] = bytes[base_idx+i]; + end + end + + // Accumulate and advance + burst.push_back(beat); + base_idx += win_len; + + // Decide if the next beat would cross a 4 KiB boundary, exceed the maximum burst length + // or this is the last beat + next_addr = beat_addr + win_len; + crosses_4k_next = ((next_addr & 12'hFFF) == 12'h000); // next beat starts a new page + exceeds_burst_length = (burst.size() == SlinkBurstBeats); + last_beat_in_section = (beat_addr == last_aligned); + + if (crosses_4k_next || exceeds_burst_length || last_beat_in_section) begin + // Flush accumulated beats for this page + fix.vip.slink_write_beats(batch_addr, fix.vip.AxiStrbBits, burst); + burst = {}; + batch_addr = next_addr; + end + end +endtask + +task automatic slink_32b_elf_preload(input string binary, output bit [63:0] entry); + longint sec_addr, sec_len; + + $display("[SLINK] Preloading ELF binary: %s", binary); + if (fix.vip.read_elf(binary)) $fatal(1, "[SLINK] Failed to load ELF!"); + + while (fix.vip.get_section( + sec_addr, sec_len + )) begin + byte bf[] = new[sec_len]; + $display("[SLINK] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); + if (fix.vip.read_section(sec_addr, bf, sec_len)) + $fatal(1, "[SLINK] Failed to read ELF section!"); + slink_write_generic(sec_addr, sec_len, bf); + end + + void'(fix.vip.get_entry(entry)); + $display("[SLINK] Preload complete"); +endtask diff --git a/target/sim/src/tb_picobello_top.sv b/target/sim/src/tb_picobello_top.sv index 
dcf8920b..673cbe88 100644 --- a/target/sim/src/tb_picobello_top.sv +++ b/target/sim/src/tb_picobello_top.sv @@ -10,9 +10,6 @@ module tb_picobello_top; gen_sram_banks[j].gen_sram_macros[k].i_mem.sram `include "tb_picobello_tasks.svh" - `include "cheshire/typedef.svh" - - `CHESHIRE_TYPEDEF_ALL(, fix.vip.DutCfg) // Instantiate the fixture fixture_picobello_top fix (); @@ -28,90 +25,6 @@ module tb_picobello_top; int snitch_fn; int chs_fn; - // Load Snitch binary - task automatic jtag_32b_elf_preload(input string binary, output bit [63:0] entry); - longint sec_addr, sec_len; - dm::sbcs_t sbcs = dm::sbcs_t -'{sbautoincrement: 1'b1, sbreadondata: 1'b1, sbaccess: 2, default: '0}; - $display("[JTAG] Preloading ELF binary: %s", binary); - if (fix.vip.read_elf(binary)) $fatal(1, "[JTAG] Failed to load ELF!"); - while (fix.vip.get_section( - sec_addr, sec_len - )) begin - byte bf[] = new[sec_len]; - $display("[JTAG] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); - if (fix.vip.read_section(sec_addr, bf, sec_len)) - $fatal(1, "[JTAG] Failed to read ELF section!"); - fix.vip.jtag_write(dm::SBCS, sbcs, 1, 1); - // Write address as 64-bit double - fix.vip.jtag_write(dm::SBAddress1, sec_addr[63:32]); - fix.vip.jtag_write(dm::SBAddress0, sec_addr[31:0]); - for (longint i = 0; i <= sec_len; i += 4) begin - bit checkpoint = (i != 0 && i % 512 == 0); - if (checkpoint) - $display( - "[JTAG] - %0d/%0d bytes (%0d%%)", - i, - sec_len, - i * 100 / (sec_len > 1 ? 
sec_len - 1 : 1) - ); - fix.vip.jtag_write(dm::SBData0, {bf[i+3], bf[i+2], bf[i+1], bf[i]}, checkpoint, checkpoint); - end - end - void'(get_entry(entry)); - $display("[JTAG] Preload complete"); - endtask - - task automatic slink_32b_elf_preload(input string binary, output bit [63:0] entry); - longint sec_addr, sec_len; - $display("[SLINK] Preloading ELF binary: %s", binary); - if (fix.vip.read_elf(binary)) $fatal(1, "[SLINK] Failed to load ELF!"); - while (fix.vip.get_section( - sec_addr, sec_len - )) begin - byte bf [] = new[sec_len]; - int burst_len; - $display("[SLINK] Preloading section at 0x%h (%0d bytes)", sec_addr, sec_len); - if (fix.vip.read_section(sec_addr, bf, sec_len)) - $fatal(1, "[SLINK] Failed to read ELF section!"); - // Write section in bursts <= SlinkBurstBytes that never cross a 4 KiB page - for (longint sec_offs = 0; sec_offs < sec_len; sec_offs += burst_len) begin - longint sec_left, page_left; - axi_data_t beats [$]; - int bus_offs; - addr_t addr_cur = sec_addr + sec_offs; - if (sec_offs != 0) begin - $display("[SLINK] - %0d/%0d bytes (%0d%%)", sec_offs, sec_len, - sec_offs * 100 / (sec_len > 1 ? 
sec_len - 1 : 1)); - end - // By default the burst length is SlinkBurstBytes - burst_len = fix.vip.SlinkBurstBytes; - // Cut the burst length if it exceeds the remaining section length - // or it crosses a 4 KiB page boundary - sec_left = sec_len - sec_offs; - page_left = 4096 - (addr_cur & 12'hFFF); - if (burst_len > sec_left) burst_len = int'(sec_left); - if (burst_len > page_left) burst_len = int'(page_left); - bus_offs = addr_cur[fix.vip.AxiStrbBits-1:0]; - - // If the address is not aligned subtract the offset from the burst length to avoid an additional write - burst_len = burst_len - bus_offs; - // Assemble beats, handling unaligned start in the first beat - for (int b = -bus_offs; b < burst_len; b += fix.vip.AxiStrbWidth) begin - axi_data_t beat = '0; - for (int e = 0; e < fix.vip.AxiStrbWidth; ++e) - if (b + e >= 0 && b + e < burst_len) beat[8*e+:8] = bf[sec_offs+b+e]; - beats.push_back(beat); - end - // Address must be beat‑aligned for slink_write_beats - fix.vip.slink_write_beats(addr_cur - bus_offs, fix.vip.AxiStrbBits, beats); - end - end - void'(fix.vip.get_entry(entry)); - $display("[SLINK] Preload complete"); - endtask - - initial begin // Fetch plusargs or use safe (fail-fast) defaults if (!$value$plusargs("BOOTMODE=%d", boot_mode)) boot_mode = 0; diff --git a/target/sim/vsim/vsim.mk b/target/sim/vsim/vsim.mk index 438489fc..40e0ad30 100644 --- a/target/sim/vsim/vsim.mk +++ b/target/sim/vsim/vsim.mk @@ -53,3 +53,8 @@ vsim-run: vsim-run-batch: $(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit" + +vsim-run-batch-verify: vsim-run-batch +ifdef VERIFY_PY + $(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000 +endif \ No newline at end of file