From 8e0540b5e6da47fe1715348819980d459deed3e0 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 11:14:44 +1000 Subject: [PATCH 01/12] setup-guide.rst: note amaranth-yosys requirement Notes that amaranth-yosys may be required. Installation of amaranth-yosys needs pip3 which is usually supplied by conda or a Python virtual environement. Signed-off-by: Alan Green --- docs/source/setup-guide.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/setup-guide.rst b/docs/source/setup-guide.rst index 4d0856092..e3b26c879 100644 --- a/docs/source/setup-guide.rst +++ b/docs/source/setup-guide.rst @@ -45,6 +45,12 @@ This updates submodules, builds some local executables, and installs missing Lin cd CFU-Playground ./scripts/setup +If you intend to use Amaranth to build CFUs, you may need a compatible version of Yosys, +which can be installed with: + +.. code-block:: bash + + pip3 install amaranth-yosys Step 4: Install Toolchain -------------------------------------------- From 5d91f189d6ef2a58999c675dc32c02153675848e Mon Sep 17 00:00:00 2001 From: Alan Green Date: Mon, 18 Apr 2022 12:43:39 +1000 Subject: [PATCH 02/12] fccm_tutorial: copy from proj_template Creates a new directory, fccm_tutorial, which is a (largely) a copy of proj_template. This directory will contain code for CFU Playground tutorial to be held at FCCM 2022. 
Signed-off-by: Alan Green --- proj/fccm_tutorial/Makefile | 47 +++++++++++++ proj/fccm_tutorial/cfu.py | 67 +++++++++++++++++++ proj/fccm_tutorial/cfu_gen.py | 38 +++++++++++ proj/fccm_tutorial/ci/ci_build_params.txt | 0 proj/fccm_tutorial/ci/ci_exclude_targets.txt | 23 +++++++ proj/fccm_tutorial/proj_template.robot | 20 ++++++ proj/fccm_tutorial/src/README.md | 13 ++++ proj/fccm_tutorial/src/proj_menu.cc | 69 ++++++++++++++++++++ proj/fccm_tutorial/src/software_cfu.cc | 31 +++++++++ 9 files changed, 308 insertions(+) create mode 100644 proj/fccm_tutorial/Makefile create mode 100644 proj/fccm_tutorial/cfu.py create mode 100644 proj/fccm_tutorial/cfu_gen.py create mode 100644 proj/fccm_tutorial/ci/ci_build_params.txt create mode 100644 proj/fccm_tutorial/ci/ci_exclude_targets.txt create mode 100644 proj/fccm_tutorial/proj_template.robot create mode 100644 proj/fccm_tutorial/src/README.md create mode 100644 proj/fccm_tutorial/src/proj_menu.cc create mode 100644 proj/fccm_tutorial/src/software_cfu.cc diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile new file mode 100644 index 000000000..e7e13296d --- /dev/null +++ b/proj/fccm_tutorial/Makefile @@ -0,0 +1,47 @@ +#!/bin/env python +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This variable lists symbols to define to the C preprocessor +export DEFINES := + +# Uncomment this line to use software defined CFU functions in software_cfu.cc +#DEFINES += CFU_SOFTWARE_DEFINED + +# Uncomment this line to skip debug code (large effect on performance) +DEFINES += NDEBUG + +# Uncomment this line to skip individual profiling output (has minor effect on performance). +#DEFINES += NPROFILE + +# Uncomment to include specified model in built binary +DEFINES += INCLUDE_MODEL_PDTI8 +#DEFINES += INCLUDE_MODEL_MICRO_SPEECH +#DEFINES += INCLUDE_MODEL_MAGIC_WAND +#DEFINES += INCLUDE_MODEL_MNV2 +#DEFINES += INCLUDE_MODEL_HPS +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_ANOMD +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_IMGC +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_KWS +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_VWW + +# Uncomment to include all TFLM examples (pdti8, micro_speech, magic_wand) +#DEFINES += INCLUDE_ALL_TFLM_EXAMPLES + + +# Defaults to Symbiflow and Arty +TARGET=digilent_arty +USE_SYMBIFLOW=1 + +include ../proj.mk diff --git a/proj/fccm_tutorial/cfu.py b/proj/fccm_tutorial/cfu.py new file mode 100644 index 000000000..3761c1231 --- /dev/null +++ b/proj/fccm_tutorial/cfu.py @@ -0,0 +1,67 @@ +#!/bin/env python +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from amaranth import * +from amaranth_cfu import InstructionBase, InstructionTestBase, simple_cfu, CfuTestBase +import unittest + +# See proj_example for further example instructions + + +class TemplateInstruction(InstructionBase): + """Template instruction + """ + + def elab(self, m): + with m.If(self.start): + m.d.sync += self.output.eq(self.in0 + self.in1) + m.d.sync += self.done.eq(1) + with m.Else(): + m.d.sync += self.done.eq(0) + + +class TemplateInstructionTest(InstructionTestBase): + def create_dut(self): + return TemplateInstruction() + + def test(self): + self.verify([ + (0, 0, 0), + (4, 5, 9), + (0xffffffff, 0xffffffff, 0xfffffffe), + ]) + + +def make_cfu(): + return simple_cfu({ + # Add instructions here... + 0: TemplateInstruction(), + }) + + +class CfuTest(CfuTestBase): + def create_dut(self): + return make_cfu() + + def test(self): + DATA = [ + # Test CFU calls here... + ((0, 22, 22), 44), + ] + return self.run_ops(DATA) + + +if __name__ == '__main__': + unittest.main() diff --git a/proj/fccm_tutorial/cfu_gen.py b/proj/fccm_tutorial/cfu_gen.py new file mode 100644 index 000000000..7761d9841 --- /dev/null +++ b/proj/fccm_tutorial/cfu_gen.py @@ -0,0 +1,38 @@ +# Copyright 2021 The CFU-Playground Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path +from amaranth import * +from amaranth.back import rtlil, verilog + +from cfu import make_cfu + +VERILOG_FILENAME = "cfu.v" + +def read_file(): + if os.path.exists(VERILOG_FILENAME): + with open(VERILOG_FILENAME, "r") as f: + return f.read() + return None + +def main(): + cfu = make_cfu() + new_verilog = verilog.convert(cfu, name='Cfu', ports=cfu.ports) + old_verilog = read_file() + if new_verilog != old_verilog: + with open(VERILOG_FILENAME, "w") as f: + f.write(new_verilog) + +if __name__ == '__main__': + main() diff --git a/proj/fccm_tutorial/ci/ci_build_params.txt b/proj/fccm_tutorial/ci/ci_build_params.txt new file mode 100644 index 000000000..e69de29bb diff --git a/proj/fccm_tutorial/ci/ci_exclude_targets.txt b/proj/fccm_tutorial/ci/ci_exclude_targets.txt new file mode 100644 index 000000000..8d0ea72f0 --- /dev/null +++ b/proj/fccm_tutorial/ci/ci_exclude_targets.txt @@ -0,0 +1,23 @@ +1bitsquared_icebreaker_bitsy +colorlight_5a_75x +decklink_intensity_pro_4k +digilent_basys3 +digilent_cmod_a7 +ego1 +lattice_crosslink_nx_evn +lattice_crosslink_nx_vip +lattice_ecp5_evn +lattice_ice40up5k_evn +micronova_mercury2 +muselab_icesugar +pano_logic_g2 +redpitaya +simple +sqrl_fk33 +terasic_deca +terasic_sockit +tinyfpga_bx +trenz_te0725 +xilinx_zybo_z7 +qmtech_xc7a35t +hps diff --git a/proj/fccm_tutorial/proj_template.robot b/proj/fccm_tutorial/proj_template.robot new file mode 100644 index 000000000..4044b734c --- /dev/null +++ b/proj/fccm_tutorial/proj_template.robot @@ -0,0 +1,20 @@ +*** Settings *** +Suite Setup Setup +Suite Teardown Teardown +Test Setup Reset Emulation +Test Teardown Test Teardown +Resource ${RENODEKEYWORDS} + +*** Test Cases *** +Should Walk The Menu + Execute Command include @${CURDIR}/TARGET.resc + Create Terminal Tester sysbus.uart + + Start Emulation + + Wait For Line On Uart CFU Playground + Wait For Prompt On Uart main> + Write Line To Uart 3 + Wait For Line On Uart Project Menu + Write Line To Uart h + Wait For Line On 
Uart Hello, World! diff --git a/proj/fccm_tutorial/src/README.md b/proj/fccm_tutorial/src/README.md new file mode 100644 index 000000000..0f41d6261 --- /dev/null +++ b/proj/fccm_tutorial/src/README.md @@ -0,0 +1,13 @@ +# Source Overlay directory + +The files in this directory will be "overlaid" on a copy of the common source tree and Tensorflow Lite for Microcontrollers. + +For example: + + * the proj_menu.c file in this directory replaces the proj_menu.c file in + common when building. + * to replace the integer 2D convolution implementation, you could create a + file with the path: + + tensorflow/lite/kernels/interal/reference/integer_ops/conv.h. + diff --git a/proj/fccm_tutorial/src/proj_menu.cc b/proj/fccm_tutorial/src/proj_menu.cc new file mode 100644 index 000000000..a0e5f794a --- /dev/null +++ b/proj/fccm_tutorial/src/proj_menu.cc @@ -0,0 +1,69 @@ +/* + * Copyright 2021 The CFU-Playground Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "proj_menu.h" + +#include + +#include "cfu.h" +#include "menu.h" + +namespace { + +// Template Fn +void do_hello_world(void) { puts("Hello, World!!!\n"); } + +// Test template instruction +void do_exercise_cfu_op0(void) { + puts("\r\nExercise CFU Op0 aka ADD\r\n"); + + unsigned int a = 0; + unsigned int b = 0; + unsigned int cfu = 0; + unsigned int count = 0; + unsigned int pass_count = 0; + unsigned int fail_count = 0; + + for (a = 0x00004567; a < 0xF8000000; a += 0x00212345) { + for (b = 0x0000ba98; b < 0xFF000000; b += 0x00770077) { + cfu = cfu_op0(0, a, b); + if (cfu != a + b) { + printf("[%4d] a: %08x b:%08x a+b=%08x cfu=%08x FAIL\r\n", count, a, b, + a + b, cfu); + fail_count++; + } else { + pass_count++; + } + count++; + } + } + + printf("\r\nPerformed %d comparisons, %d pass, %d fail\r\n", count, + pass_count, fail_count); +} + +struct Menu MENU = { + "Project Menu", + "project", + { + MENU_ITEM('0', "exercise cfu op0", do_exercise_cfu_op0), + MENU_ITEM('h', "say Hello", do_hello_world), + MENU_END, + }, +}; +}; // anonymous namespace + +extern "C" void do_proj_menu() { menu_run(&MENU); } diff --git a/proj/fccm_tutorial/src/software_cfu.cc b/proj/fccm_tutorial/src/software_cfu.cc new file mode 100644 index 000000000..a8886ff54 --- /dev/null +++ b/proj/fccm_tutorial/src/software_cfu.cc @@ -0,0 +1,31 @@ +/* + * Copyright 2021 The CFU-Playground Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "software_cfu.h" + +// +// In this function, place C code to emulate your CFU. You can switch between +// hardware and emulated CFU by setting the CFU_SOFTWARE_DEFINED DEFINE in +// the Makefile. +uint32_t software_cfu(int funct3, int funct7, uint32_t rs1, uint32_t rs2) +{ + if (funct3 == 0) + { + return rs1 + rs2; + } + return rs1; +} From e75445de139d0bdd0c9af0aa68b588e1117bcec0 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 15:57:55 +1000 Subject: [PATCH 03/12] fccm_tutorial/software_cfu.cc: New Software CFU Adds a software based CFU emulation and a test in proj_menu.cc. Signed-off-by: Alan Green --- proj/fccm_tutorial/Makefile | 2 +- proj/fccm_tutorial/src/fccm_cfu.h | 26 +++++++++++++++ proj/fccm_tutorial/src/proj_menu.cc | 46 +++++++++----------------- proj/fccm_tutorial/src/software_cfu.cc | 38 ++++++++++++++++----- 4 files changed, 72 insertions(+), 40 deletions(-) create mode 100644 proj/fccm_tutorial/src/fccm_cfu.h diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile index e7e13296d..31e022b84 100644 --- a/proj/fccm_tutorial/Makefile +++ b/proj/fccm_tutorial/Makefile @@ -17,7 +17,7 @@ export DEFINES := # Uncomment this line to use software defined CFU functions in software_cfu.cc -#DEFINES += CFU_SOFTWARE_DEFINED +DEFINES += CFU_SOFTWARE_DEFINED # Uncomment this line to skip debug code (large effect on performance) DEFINES += NDEBUG diff --git a/proj/fccm_tutorial/src/fccm_cfu.h b/proj/fccm_tutorial/src/fccm_cfu.h new file mode 100644 index 000000000..97da8dd85 --- /dev/null +++ b/proj/fccm_tutorial/src/fccm_cfu.h @@ -0,0 +1,26 @@ +/* + * Copyright 2022 The CFU-Playground Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cfu.h" + +#ifndef _FCCM_CFH_H +#define _FCCM_CFU_H + +#define cfu_reset() cfu_op0(0, 0, 0) +#define cfu_accumulate(a, b) cfu_op1(0, a, b) +#define cfu_read() cfu_op1(0, 0, 0) + +#endif diff --git a/proj/fccm_tutorial/src/proj_menu.cc b/proj/fccm_tutorial/src/proj_menu.cc index a0e5f794a..3714ecfee 100644 --- a/proj/fccm_tutorial/src/proj_menu.cc +++ b/proj/fccm_tutorial/src/proj_menu.cc @@ -1,5 +1,5 @@ /* - * Copyright 2021 The CFU-Playground Authors + * Copyright 2022 The CFU-Playground Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ #include -#include "cfu.h" +#include "fccm_cfu.h" #include "menu.h" namespace { @@ -26,44 +26,28 @@ namespace { // Template Fn void do_hello_world(void) { puts("Hello, World!!!\n"); } -// Test template instruction -void do_exercise_cfu_op0(void) { - puts("\r\nExercise CFU Op0 aka ADD\r\n"); - - unsigned int a = 0; - unsigned int b = 0; - unsigned int cfu = 0; - unsigned int count = 0; - unsigned int pass_count = 0; - unsigned int fail_count = 0; - - for (a = 0x00004567; a < 0xF8000000; a += 0x00212345) { - for (b = 0x0000ba98; b < 0xFF000000; b += 0x00770077) { - cfu = cfu_op0(0, a, b); - if (cfu != a + b) { - printf("[%4d] a: %08x b:%08x a+b=%08x cfu=%08x FAIL\r\n", count, a, b, - a + b, cfu); - fail_count++; - } else { - pass_count++; - } - count++; - } - } - - printf("\r\nPerformed %d comparisons, %d pass, %d fail\r\n", count, - pass_count, fail_count); +// Tests multiply-add CFU +void do_test_cfu(void) { + printf("\r\nCFU Test... "); + + // Calculated on a spreadsheet + cfu_reset(); + cfu_accumulate(0x884CF61A, 0xE49F2F1C); + cfu_accumulate(0x0BE31854, 0x527FDBCF); + cfu_accumulate(0xADB43251, 0x4E36D172); + cfu_accumulate(0x6F867FFB, 0x442C7B76); + printf("%s\n", static_cast(cfu_read()) == 81978 ? "PASS!" 
: "FAIL"); } struct Menu MENU = { "Project Menu", "project", { - MENU_ITEM('0', "exercise cfu op0", do_exercise_cfu_op0), + MENU_ITEM('1', "test cfu", do_test_cfu), MENU_ITEM('h', "say Hello", do_hello_world), MENU_END, }, }; }; // anonymous namespace -extern "C" void do_proj_menu() { menu_run(&MENU); } +extern "C" void do_proj_menu() { menu_run(&MENU); } \ No newline at end of file diff --git a/proj/fccm_tutorial/src/software_cfu.cc b/proj/fccm_tutorial/src/software_cfu.cc index a8886ff54..028599fac 100644 --- a/proj/fccm_tutorial/src/software_cfu.cc +++ b/proj/fccm_tutorial/src/software_cfu.cc @@ -1,5 +1,5 @@ /* - * Copyright 2021 The CFU-Playground Authors + * Copyright 2022 The CFU-Playground Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,40 @@ * limitations under the License. */ -#include #include "software_cfu.h" +#include + +int32_t accumulator; + +// Gets a byte as an int8 from the given word +inline int8_t extract_byte(uint32_t word, int num) { + return static_cast(0xff & (word >> (num * 8))); +} + +// Multipy rs1 bytes by rs2 bytes and sum everything together +int32_t multiply_add4(uint32_t rs1, uint32_t rs2){ + return ( + (128 + extract_byte(rs1, 0)) * extract_byte(rs2, 0) + + (128 + extract_byte(rs1, 1)) * extract_byte(rs2, 1) + + (128 + extract_byte(rs1, 2)) * extract_byte(rs2, 2) + + (128 + extract_byte(rs1, 3)) * extract_byte(rs2, 3)); +} + + // // In this function, place C code to emulate your CFU. You can switch between // hardware and emulated CFU by setting the CFU_SOFTWARE_DEFINED DEFINE in // the Makefile. 
-uint32_t software_cfu(int funct3, int funct7, uint32_t rs1, uint32_t rs2) -{ - if (funct3 == 0) - { - return rs1 + rs2; +uint32_t software_cfu(int funct3, int funct7, uint32_t rs1, uint32_t rs2) { + switch (funct3) { + case 0: + accumulator = 0; + break; + case 1: + accumulator += multiply_add4(rs1, rs2); + default: + break; } - return rs1; + return static_cast(accumulator); } From 50b1d67101f87be921fd9b3d81fbcbc86e1d8de3 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Mon, 18 Apr 2022 12:45:37 +1000 Subject: [PATCH 04/12] fccm_tutorial/README.md: new readme file Adds a readme file, outlining the eventual content of this directory. Signed-off-by: Alan Green --- proj/fccm_tutorial/README.md | 74 ++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 proj/fccm_tutorial/README.md diff --git a/proj/fccm_tutorial/README.md b/proj/fccm_tutorial/README.md new file mode 100644 index 000000000..c0a9f0b71 --- /dev/null +++ b/proj/fccm_tutorial/README.md @@ -0,0 +1,74 @@ +# FCCM Tutorial Example + +This example accelerator was created for the FCCM 2022 tutorial +"CFU-Playground: Build Your Own Custom TinyML Processor". + +https://www.fccm.org/workshop-tutorial-2022/ + +## Amaranth CFU + +`cfu.py` contains a complete CFU written in Amaranth. It can perform +these functions: + +* Operation 0: Reset accumulator +* Operation 1: Read accumulator +* Operation 2: Set signed offset +* Operation 3: 4-way multiply accumulate. + +Test cases can be run by executing `cfu.py`: + +``` +$ ../../scripts/pyrun cfu.py +``` + +## Building and Running + +To build and program a Digilent Arty board, first follow the standard [setup +instructions](https://cfu-playground.readthedocs.io/en/latest/setup-guide.html) +to install Symbiflow and a RISCV compiler. 
Then: + +``` +$ make TARGET=digilent_arty USE_SYMBIFLOW=1 prog +``` + +You should see the familiar flashing lights, then: + +``` +$ make TARGET=digilent_arty USE_SYMBIFLOW=1 BUILD_JOBS=8 prog +``` + +This will load the software and start a terminal. Interesting options are: + +* 1 (Models), 1 (person detection int 8), 1 (person) [[check this]] +* 3 (Project menu), 1 (Exercise CFU) + +To ignore the CFU when running models, comment out this line: + +``` +# DEFINES += ACCEL_CONV2D +``` + +With gateware ignored, the inference times are very close to the inference +times as measured with `proj/proj_template`. + +To use the CFU operation emulator defined in the `src/software_cfu.cc` file, +uncomment this line: + +``` +# DEFINES += CFU_SOFTWARE_DEFINED +``` + +While it is much slower, it is often convenient to use emulated operations +while debugging. + + +## `proj_menu.cc` + +Contains snippets demonstrating the integration of the CFU with the SoC. + +[[Insert instructions here]] + +## Tensorflow Lite for Microcontrollers + + + From 7552608cf2d6e50cc6514dea8090f8aa795d9dd3 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Mon, 18 Apr 2022 14:39:42 +1000 Subject: [PATCH 05/12] fccm_tutorial/cfu.py: Add 4x Multiply-Accumulate Adds a 4x parallel multiply accumulate operation. Signed-off-by: Alan Green --- proj/fccm_tutorial/cfu.py | 102 ++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 31 deletions(-) diff --git a/proj/fccm_tutorial/cfu.py b/proj/fccm_tutorial/cfu.py index 3761c1231..855dab6be 100644 --- a/proj/fccm_tutorial/cfu.py +++ b/proj/fccm_tutorial/cfu.py @@ -14,54 +14,94 @@ # limitations under the License. 
from amaranth import * -from amaranth_cfu import InstructionBase, InstructionTestBase, simple_cfu, CfuTestBase +from amaranth.sim import Delay, Tick +from amaranth_cfu import TestBase, SimpleElaboratable, pack_vals import unittest -# See proj_example for further example instructions +class MultiplyAdd4(SimpleElaboratable): + """Performs four, 8 bit wide multiply-accumulates in parallel. - -class TemplateInstruction(InstructionBase): - """Template instruction + Uses `SimpleElaboratable` helper class as a convenience. """ + def __init__(self): + # "a" and "b" inputs - each four, 8 bit signed numbers + self.a_word = Signal(32) + self.b_word = Signal(32) - def elab(self, m): - with m.If(self.start): - m.d.sync += self.output.eq(self.in0 + self.in1) - m.d.sync += self.done.eq(1) - with m.Else(): - m.d.sync += self.done.eq(0) - + # clear to reset accumulator, enable to perform multiply- accumulate + self.clear = Signal() + self.enable = Signal() -class TemplateInstructionTest(InstructionTestBase): - def create_dut(self): - return TemplateInstruction() + # result + self.accumulator = Signal(signed(32)) - def test(self): - self.verify([ - (0, 0, 0), - (4, 5, 9), - (0xffffffff, 0xffffffff, 0xfffffffe), - ]) + def elab(self, m): + """The actual gateware produced""" + # Divide a_word and b_word each into four, 8-bit parts + a_bytes = [self.a_word[i:i+8].as_signed() for i in range(0, 32, 8)] + b_bytes = [self.b_word[i:i+8].as_signed() for i in range(0, 32, 8)] -def make_cfu(): - return simple_cfu({ - # Add instructions here... 
- 0: TemplateInstruction(), - }) + # Calculate the sum of (a+offset)*b for each part + calculations = [(a + Const(128)) * b for a, b in zip(a_bytes, b_bytes)] + summed = Signal(signed(32)) + m.d.comb += summed.eq(sum(calculations)) + + with m.If(self.clear): + m.d.sync += self.accumulator.eq(0) + with m.Elif(self.enable): + m.d.sync += self.accumulator.eq(self.accumulator + summed) -class CfuTest(CfuTestBase): +class MultiplyAdd4Test(TestBase): def create_dut(self): - return make_cfu() + return MultiplyAdd4() def test(self): + + def a(a, b, c, d): return pack_vals(a, b, c, d, offset=-128) + def b(a, b, c, d): return pack_vals(a, b, c, d, offset=0) DATA = [ - # Test CFU calls here... - ((0, 22, 22), 44), + # (a_word, b_word, enable, clear), expected accumulator + ((a(0, 0, 0, 0), b(0, 0, 0, 0), 0, 0), 0), + + # Simple tests: with just first byte + ((a(10, 0, 0, 0), b(3, 0, 0, 0), 1, 0), 0), + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 1, 0), 30), + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 0), -14), + # Since was not enabled last cycle, accumulator will not change + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 1, 0), -14), + # Since was enabled last cycle, will change accumlator + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 1), -58), + # Accumulator cleared + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 0), 0), + + # Uses all bytes (calculated on a spreadsheet) + ((a(99, 22, 2, 1), b(-2, 6, 7, 111), 1, 0), 0), + ((a(2, 45, 79, 22), b(-33, 6, -97, -22), 1, 0), 59), + ((a(23, 34, 45, 56), b(-128, -121, 119, 117), 1, 0), -7884), + ((a(188, 34, 236, 246), b(-87, 56, 52, -117), 1, 0), -3035), + ((a(131, 92, 21, 83), b(-114, -72, -31, -44), 1, 0), -33997), + ((a(74, 68, 170, 39), b(102, 12, 53, -128), 1, 0), -59858), + ((a(16, 63, 1, 198), b(29, 36, 106, 62), 1, 0), -47476), + ((a(0, 0, 0, 0), b(0, 0, 0, 0), 0, 1), -32362), ] - return self.run_ops(DATA) + dut = self.dut + + def process(): + for (a_word, b_word, enable, clear), expected in DATA: + yield dut.a_word.eq(a_word) + yield dut.b_word.eq(b_word) + yield 
dut.enable.eq(enable) + yield dut.clear.eq(clear) + yield Delay(0.1) # Wait for input values to settle + + # Check on accumulator, as calcuated last cycle + self.assertEqual(expected, (yield dut.accumulator)) + yield Tick() + + self.run_sim(process, write_trace=False) if __name__ == '__main__': unittest.main() From ff6e022d1140ddd0ee3736f1cc8bfb0d3de3928d Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 11:19:40 +1000 Subject: [PATCH 06/12] fccm_tutorial/cfu.py: New CFU Adds a CFU that does a four-way parallel multiply add. Signed-off-by: Alan Green --- proj/fccm_tutorial/Makefile | 4 +- proj/fccm_tutorial/README.md | 5 +-- proj/fccm_tutorial/cfu.py | 74 ++++++++++++++++++++++++++++++++++-- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile index 31e022b84..9c4e50752 100644 --- a/proj/fccm_tutorial/Makefile +++ b/proj/fccm_tutorial/Makefile @@ -26,8 +26,8 @@ DEFINES += NDEBUG #DEFINES += NPROFILE # Uncomment to include specified model in built binary -DEFINES += INCLUDE_MODEL_PDTI8 -#DEFINES += INCLUDE_MODEL_MICRO_SPEECH +#DEFINES += INCLUDE_MODEL_PDTI8 +DEFINES += INCLUDE_MODEL_MICRO_SPEECH #DEFINES += INCLUDE_MODEL_MAGIC_WAND #DEFINES += INCLUDE_MODEL_MNV2 #DEFINES += INCLUDE_MODEL_HPS diff --git a/proj/fccm_tutorial/README.md b/proj/fccm_tutorial/README.md index c0a9f0b71..e2e8b68ca 100644 --- a/proj/fccm_tutorial/README.md +++ b/proj/fccm_tutorial/README.md @@ -11,9 +11,8 @@ https://www.fccm.org/workshop-tutorial-2022/ these functions: * Operation 0: Reset accumulator -* Operation 1: Read accumulator -* Operation 2: Set signed offset -* Operation 3: 4-way multiply accumulate. +* Operation 1: 4-way multiply accumulate. 
+* Operation 2: Read accumulator Test cases can be run by executing `cfu.py`: diff --git a/proj/fccm_tutorial/cfu.py b/proj/fccm_tutorial/cfu.py index 855dab6be..8ba429747 100644 --- a/proj/fccm_tutorial/cfu.py +++ b/proj/fccm_tutorial/cfu.py @@ -15,10 +15,10 @@ from amaranth import * from amaranth.sim import Delay, Tick -from amaranth_cfu import TestBase, SimpleElaboratable, pack_vals +from amaranth_cfu import TestBase, SimpleElaboratable, pack_vals, CfuBase, CfuTestBase import unittest -class MultiplyAdd4(SimpleElaboratable): +class MultiplyAccumulate4(SimpleElaboratable): """Performs four, 8 bit wide multiply-accumulates in parallel. Uses `SimpleElaboratable` helper class as a convenience. @@ -53,9 +53,9 @@ def elab(self, m): m.d.sync += self.accumulator.eq(self.accumulator + summed) -class MultiplyAdd4Test(TestBase): +class MultiplyAccumulate4Test(TestBase): def create_dut(self): - return MultiplyAdd4() + return MultiplyAccumulate4() def test(self): @@ -103,5 +103,71 @@ def process(): self.run_sim(process, write_trace=False) + +class Cfu(CfuBase): + """Simple CFU that provides access to a MultiplyAccumulate4. + + The supported operations are: + * Operation 0: Reset accumulator + * Operation 1: 4-way multiply accumulate. + * Operation 2: Read accumulator + + The implementation here assumes the CPU is always ready to read a response. + """ + + def elab(self, m): + # Build the submodule + m.submodules.macc4 = macc4 = MultiplyAccumulate4() + + # Check operation number + funct3 = Signal(3) + m.d.comb += funct3.eq(self.cmd_function_id[:3]) + + # All commands take 1 cycle. 
CFU is always ready to receive a command
Signed-off-by: Alan Green --- .../reference/integer_ops/depthwise_conv.h | 289 ++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h new file mode 100644 index 000000000..f0ca09c74 --- /dev/null +++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -0,0 +1,289 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ + +#include "tensorflow/lite/kernels/internal/common.h" + +namespace tflite { +namespace reference_integer_ops { +inline void DepthwiseConvPerChannel( + const DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + // Get parameters. 
+ // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Check dimensions of the tensors. + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + 
const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, output_channel)]; + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(b/174275578): Add a check to make sure the + // accumulator depth is smaller than 2^16. 
+                  acc += filter_val * (input_val + input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] = static_cast<int8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Check dimensions of the tensors.
+ TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + std::int64_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. 
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with 64 bits accumulator.
+                  // We assume maximum of 2^16 accumulations as with the 8-bit
+                  // case so actually the value in the accumulator should not
+                  // exceed 40 bits
+                  acc += static_cast<int64_t>(filter_val) *
+                         static_cast<int64_t>(input_val);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            scaled_acc = std::max(scaled_acc, output_activation_min);
+            scaled_acc = std::min(scaled_acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                static_cast<int16_t>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConvHybridPerChannel(
+    const DepthwiseParams& params, float* scaling_factors_ptr,
+    const RuntimeShape& input_shape, const int8_t* input_data,
+    const RuntimeShape& filter_shape, const int8_t* filter_data,
+    const RuntimeShape& bias_shape, const float* bias_data,
+    const RuntimeShape& output_shape, float* output_data,
+    const float* per_channel_scale, int32_t* input_offset) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int bias_depth = bias_shape.FlatSize();
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_depth, output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  acc += filter_val * (input_val - input_offset[batch]);
+                }
+              }
+            }
+            float acc_float = static_cast<float>(acc);
+            acc_float *=
+                per_channel_scale[output_channel] * scaling_factors_ptr[batch];
+            if (bias_data && output_channel < bias_depth) {
+              acc_float += bias_data[output_channel];
+            }
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                ActivationFunctionWithMinMax(acc_float, output_activation_min,
+                                             output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_

From 9316743c78740eada852fb39c072bb4aa735fcdf Mon Sep 17 00:00:00 2001
From: Alan Green
Date: Tue, 19 Apr 2022 16:11:51 +1000
Subject: [PATCH 08/12] fccm_tutorial/depthwise_conv.h: add indirection

Adds an indirect call to the original Depthwise Conv 2D implementation.
This gives us a place from which we can add calls to an accelerated
version.

Signed-off-by: Alan Green
---
 .../reference/integer_ops/depthwise_conv.h    | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
index f0ca09c74..563b49b2f 100644
--- a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
+++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@@ -19,7 +19,8 @@ limitations under the License.
namespace tflite { namespace reference_integer_ops { -inline void DepthwiseConvPerChannel( + +inline void OriginalDepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int8_t* input_data, const RuntimeShape& filter_shape, @@ -119,6 +120,22 @@ inline void DepthwiseConvPerChannel( } } +inline void DepthwiseConvPerChannel( + const DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + + // Call original + OriginalDepthwiseConvPerChannel(params, output_multiplier, output_shift, + input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, + output_shape, output_data); +} + + inline void DepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, From fdb76a25b234b6c372a3e30183543bfba104b734 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 16:37:49 +1000 Subject: [PATCH 09/12] fccm_tutorial: output parameters Based on this work, I think the accelerator I already wrote isn't quite right for this. 
Let's investigate 1x1 Conv2Ds --- proj/fccm_tutorial/Makefile | 8 +++++++- .../internal/reference/integer_ops/depthwise_conv.h | 10 +++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile index 9c4e50752..aa0f48d8f 100644 --- a/proj/fccm_tutorial/Makefile +++ b/proj/fccm_tutorial/Makefile @@ -17,7 +17,13 @@ export DEFINES := # Uncomment this line to use software defined CFU functions in software_cfu.cc -DEFINES += CFU_SOFTWARE_DEFINED +#DEFINES += CFU_SOFTWARE_DEFINED + +# Uncomment this line to print parameters of the conv2d operation +DEFINES += DEPTHWISE_PRIMT_PARAMS + +# Uncomment this line to allow acceleration +DEFINES += DEPTHWISE_ACCELERATE # Uncomment this line to skip debug code (large effect on performance) DEFINES += NDEBUG diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index 563b49b2f..8adea40ce 100644 --- a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -15,6 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ +#include "playground_util/print_params.h" #include "tensorflow/lite/kernels/internal/common.h" namespace tflite { @@ -128,6 +129,14 @@ inline void DepthwiseConvPerChannel( const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data) { +#ifdef DEPTHWISE_PRINT_PARAMS + print_depthwise_params(params, input_shape, filter_shape, output_shape); +#endif + +#ifdef DEPTHWISE_ACCELERATE + // Check whether we can accelerate +#endif + // Call original OriginalDepthwiseConvPerChannel(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, @@ -135,7 +144,6 @@ inline void DepthwiseConvPerChannel( output_shape, output_data); } - inline void DepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, From 0bc16f639ff40ae7b4b666b25dea37f4bbcd1c62 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 16:53:14 +1000 Subject: [PATCH 10/12] fccm_tutorial/conv.h: Copied file Copied file from TfLM directory. Signed-off-by: Alan Green --- .../internal/reference/integer_ops/conv.h | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h new file mode 100644 index 000000000..3f869a3af --- /dev/null +++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h @@ -0,0 +1,236 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_ + +#include "tensorflow/lite/kernels/internal/common.h" + +namespace tflite { +namespace reference_integer_ops { + +// Fixed-point per-channel-quantization convolution reference kernel. +inline void ConvPerChannel( + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + // Get parameters. + const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. 
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. 
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+
+                if (!is_point_inside_image) {
+                  continue;
+                }
+
+                for (int in_channel = 0; in_channel < filter_input_depth;
+                     ++in_channel) {
+                  int32_t input_val =
+                      input_data[Offset(input_shape, batch, in_y, in_x,
+                                        in_channel + group * filter_input_depth)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarantees that the input_offset is a int8_t, even though
+                  // it is represented using int32_t. int32_t += int8_t *
+                  // (int8_t - int8_t) so the highest value we can get from each
+                  // accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumulation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(b/174275578): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val + input_offset);
+                }
+              }
+            }
+
+            if (bias_data) {
+              acc += bias_data[out_channel];
+            }
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[out_channel], output_shift[out_channel]);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+                static_cast<int8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Fixed-point per-channel-quantization convolution reference kernel.
+// 16-bit data and 8-bit filter
+template <typename AccumScalar>
+inline void ConvPerChannel(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const AccumScalar* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+ const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; + AccumScalar acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + + if (!is_point_inside_image) { + continue; + } + + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; + int32_t filter_val = filter_data[Offset( + filter_shape, out_channel, filter_y, filter_x, in_channel)]; + // Accumulate with 64 bits accumulator. + // int64_t += int8_t * int16_t so the highest value we can + // get from each accumulation is [-127, 127] * ([-32768, + // 32767] - + // [-32768, 32767]), which is [-8322945, 8322945]. 
+                  // log2(8322945) = 22.99.
+                  acc += filter_val * input_val;
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[out_channel];
+            }
+            int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[out_channel], output_shift[out_channel]);
+            scaled_acc = std::max(scaled_acc, output_activation_min);
+            scaled_acc = std::min(scaled_acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+                static_cast<int16_t>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_

From e0028c4d28e54e3cdbc537f757a2d6bfe1bb416a Mon Sep 17 00:00:00 2001
From: Alan Green
Date: Thu, 28 Apr 2022 10:07:25 +1000
Subject: [PATCH 11/12] fccm_tutorial/conv.h: Rename function to "original"

Signed-off-by: Alan Green
---
 .../internal/reference/integer_ops/conv.h     | 25 ++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
index 3f869a3af..d0644f320 100644
--- a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
+++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
@@ -15,13 +15,14 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
 
+#include "playground_util/print_params.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
 namespace reference_integer_ops {
 
 // Fixed-point per-channel-quantization convolution reference kernel.
-inline void ConvPerChannel( +inline void OriginalConvPerChannel( const ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int8_t* input_data, const RuntimeShape& filter_shape, @@ -131,6 +132,28 @@ inline void ConvPerChannel( } } +inline void ConvPerChannel( + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + + +#ifdef CONV_PRINT_PARAMS + print_conv_params(params, input_shape, filter_shape, output_shape); +#endif + +#ifdef CONV_ACCELERATE + // Check whether we can accelerate +#endif + // Call original + OriginalConvPerChannel(params, output_multiplier, output_shift, input_shape, + input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data); +} + // Fixed-point per-channel-quantization convolution reference kernel. 
 // 16-bit data and 8-bit filter
 template <typename AccumScalar>

From 83700c5fe898308d55c29e893b7a84bba9ae55df Mon Sep 17 00:00:00 2001
From: Alan Green
Date: Thu, 28 Apr 2022 10:08:26 +1000
Subject: [PATCH 12/12] fccm_tutorial/Makefile: use KWS model

Signed-off-by: Alan Green
---
 proj/fccm_tutorial/Makefile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile
index aa0f48d8f..8225b99d3 100644
--- a/proj/fccm_tutorial/Makefile
+++ b/proj/fccm_tutorial/Makefile
@@ -19,10 +19,12 @@ export DEFINES :=
 # Uncomment this line to use software defined CFU functions in software_cfu.cc
 #DEFINES += CFU_SOFTWARE_DEFINED
 
-# Uncomment this line to print parameters of the conv2d operation
-DEFINES += DEPTHWISE_PRIMT_PARAMS
+# Uncomment these lines to print parameters of the conv2d operation
+DEFINES += CONV_PRINT_PARAMS
+DEFINES += DEPTHWISE_PRINT_PARAMS
 
 # Uncomment this line to allow acceleration
+DEFINES += CONV_ACCELERATE
 DEFINES += DEPTHWISE_ACCELERATE
 
 # Uncomment this line to skip debug code (large effect on performance)
@@ -33,13 +35,13 @@ DEFINES += NDEBUG
 
 # Uncomment to include specified model in built binary
 #DEFINES += INCLUDE_MODEL_PDTI8
-DEFINES += INCLUDE_MODEL_MICRO_SPEECH
+#DEFINES += INCLUDE_MODEL_MICRO_SPEECH
 #DEFINES += INCLUDE_MODEL_MAGIC_WAND
 #DEFINES += INCLUDE_MODEL_MNV2
 #DEFINES += INCLUDE_MODEL_HPS
 #DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_ANOMD
 #DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_IMGC
-#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_KWS
+DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_KWS
 #DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_VWW
 
 # Uncomment to include all TFLM examples (pdti8, micro_speech, magic_wand)