From 8e0540b5e6da47fe1715348819980d459deed3e0 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 11:14:44 +1000 Subject: [PATCH 01/12] setup-guide.rst: note amaranth-yosys requirement Notes that amaranth-yosys may be required. Installation of amaranth-yosys needs pip3 which is usually supplied by conda or a Python virtual environement. Signed-off-by: Alan Green --- docs/source/setup-guide.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/setup-guide.rst b/docs/source/setup-guide.rst index 4d0856092..e3b26c879 100644 --- a/docs/source/setup-guide.rst +++ b/docs/source/setup-guide.rst @@ -45,6 +45,12 @@ This updates submodules, builds some local executables, and installs missing Lin cd CFU-Playground ./scripts/setup +If you intend to use Amaranth to build CFUs, you may need a compatible version of Yosys, +which can be installed with: + +.. code-block:: bash + + pip3 install amaranth-yosys Step 4: Install Toolchain -------------------------------------------- From 5d91f189d6ef2a58999c675dc32c02153675848e Mon Sep 17 00:00:00 2001 From: Alan Green Date: Mon, 18 Apr 2022 12:43:39 +1000 Subject: [PATCH 02/12] fccm_tutorial: copy from proj_template Creates a new directory, fccm_tutorial, which is a (largely) a copy of proj_template. This directory will contain code for CFU Playground tutorial to be held at FCCM 2022. 
Signed-off-by: Alan Green --- proj/fccm_tutorial/Makefile | 47 +++++++++++++ proj/fccm_tutorial/cfu.py | 67 +++++++++++++++++++ proj/fccm_tutorial/cfu_gen.py | 38 +++++++++++ proj/fccm_tutorial/ci/ci_build_params.txt | 0 proj/fccm_tutorial/ci/ci_exclude_targets.txt | 23 +++++++ proj/fccm_tutorial/proj_template.robot | 20 ++++++ proj/fccm_tutorial/src/README.md | 13 ++++ proj/fccm_tutorial/src/proj_menu.cc | 69 ++++++++++++++++++++ proj/fccm_tutorial/src/software_cfu.cc | 31 +++++++++ 9 files changed, 308 insertions(+) create mode 100644 proj/fccm_tutorial/Makefile create mode 100644 proj/fccm_tutorial/cfu.py create mode 100644 proj/fccm_tutorial/cfu_gen.py create mode 100644 proj/fccm_tutorial/ci/ci_build_params.txt create mode 100644 proj/fccm_tutorial/ci/ci_exclude_targets.txt create mode 100644 proj/fccm_tutorial/proj_template.robot create mode 100644 proj/fccm_tutorial/src/README.md create mode 100644 proj/fccm_tutorial/src/proj_menu.cc create mode 100644 proj/fccm_tutorial/src/software_cfu.cc diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile new file mode 100644 index 000000000..e7e13296d --- /dev/null +++ b/proj/fccm_tutorial/Makefile @@ -0,0 +1,47 @@ +#!/bin/env python +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This variable lists symbols to define to the C preprocessor +export DEFINES := + +# Uncomment this line to use software defined CFU functions in software_cfu.cc +#DEFINES += CFU_SOFTWARE_DEFINED + +# Uncomment this line to skip debug code (large effect on performance) +DEFINES += NDEBUG + +# Uncomment this line to skip individual profiling output (has minor effect on performance). +#DEFINES += NPROFILE + +# Uncomment to include specified model in built binary +DEFINES += INCLUDE_MODEL_PDTI8 +#DEFINES += INCLUDE_MODEL_MICRO_SPEECH +#DEFINES += INCLUDE_MODEL_MAGIC_WAND +#DEFINES += INCLUDE_MODEL_MNV2 +#DEFINES += INCLUDE_MODEL_HPS +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_ANOMD +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_IMGC +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_KWS +#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_VWW + +# Uncomment to include all TFLM examples (pdti8, micro_speech, magic_wand) +#DEFINES += INCLUDE_ALL_TFLM_EXAMPLES + + +# Defaults to Symbiflow and Arty +TARGET=digilent_arty +USE_SYMBIFLOW=1 + +include ../proj.mk diff --git a/proj/fccm_tutorial/cfu.py b/proj/fccm_tutorial/cfu.py new file mode 100644 index 000000000..3761c1231 --- /dev/null +++ b/proj/fccm_tutorial/cfu.py @@ -0,0 +1,67 @@ +#!/bin/env python +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from amaranth import * +from amaranth_cfu import InstructionBase, InstructionTestBase, simple_cfu, CfuTestBase +import unittest + +# See proj_example for further example instructions + + +class TemplateInstruction(InstructionBase): + """Template instruction + """ + + def elab(self, m): + with m.If(self.start): + m.d.sync += self.output.eq(self.in0 + self.in1) + m.d.sync += self.done.eq(1) + with m.Else(): + m.d.sync += self.done.eq(0) + + +class TemplateInstructionTest(InstructionTestBase): + def create_dut(self): + return TemplateInstruction() + + def test(self): + self.verify([ + (0, 0, 0), + (4, 5, 9), + (0xffffffff, 0xffffffff, 0xfffffffe), + ]) + + +def make_cfu(): + return simple_cfu({ + # Add instructions here... + 0: TemplateInstruction(), + }) + + +class CfuTest(CfuTestBase): + def create_dut(self): + return make_cfu() + + def test(self): + DATA = [ + # Test CFU calls here... + ((0, 22, 22), 44), + ] + return self.run_ops(DATA) + + +if __name__ == '__main__': + unittest.main() diff --git a/proj/fccm_tutorial/cfu_gen.py b/proj/fccm_tutorial/cfu_gen.py new file mode 100644 index 000000000..7761d9841 --- /dev/null +++ b/proj/fccm_tutorial/cfu_gen.py @@ -0,0 +1,38 @@ +# Copyright 2021 The CFU-Playground Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path +from amaranth import * +from amaranth.back import rtlil, verilog + +from cfu import make_cfu + +VERILOG_FILENAME = "cfu.v" + +def read_file(): + if os.path.exists(VERILOG_FILENAME): + with open(VERILOG_FILENAME, "r") as f: + return f.read() + return None + +def main(): + cfu = make_cfu() + new_verilog = verilog.convert(cfu, name='Cfu', ports=cfu.ports) + old_verilog = read_file() + if new_verilog != old_verilog: + with open(VERILOG_FILENAME, "w") as f: + f.write(new_verilog) + +if __name__ == '__main__': + main() diff --git a/proj/fccm_tutorial/ci/ci_build_params.txt b/proj/fccm_tutorial/ci/ci_build_params.txt new file mode 100644 index 000000000..e69de29bb diff --git a/proj/fccm_tutorial/ci/ci_exclude_targets.txt b/proj/fccm_tutorial/ci/ci_exclude_targets.txt new file mode 100644 index 000000000..8d0ea72f0 --- /dev/null +++ b/proj/fccm_tutorial/ci/ci_exclude_targets.txt @@ -0,0 +1,23 @@ +1bitsquared_icebreaker_bitsy +colorlight_5a_75x +decklink_intensity_pro_4k +digilent_basys3 +digilent_cmod_a7 +ego1 +lattice_crosslink_nx_evn +lattice_crosslink_nx_vip +lattice_ecp5_evn +lattice_ice40up5k_evn +micronova_mercury2 +muselab_icesugar +pano_logic_g2 +redpitaya +simple +sqrl_fk33 +terasic_deca +terasic_sockit +tinyfpga_bx +trenz_te0725 +xilinx_zybo_z7 +qmtech_xc7a35t +hps diff --git a/proj/fccm_tutorial/proj_template.robot b/proj/fccm_tutorial/proj_template.robot new file mode 100644 index 000000000..4044b734c --- /dev/null +++ b/proj/fccm_tutorial/proj_template.robot @@ -0,0 +1,20 @@ +*** Settings *** +Suite Setup Setup +Suite Teardown Teardown +Test Setup Reset Emulation +Test Teardown Test Teardown +Resource ${RENODEKEYWORDS} + +*** Test Cases *** +Should Walk The Menu + Execute Command include @${CURDIR}/TARGET.resc + Create Terminal Tester sysbus.uart + + Start Emulation + + Wait For Line On Uart CFU Playground + Wait For Prompt On Uart main> + Write Line To Uart 3 + Wait For Line On Uart Project Menu + Write Line To Uart h + Wait For Line On 
Uart Hello, World! diff --git a/proj/fccm_tutorial/src/README.md b/proj/fccm_tutorial/src/README.md new file mode 100644 index 000000000..0f41d6261 --- /dev/null +++ b/proj/fccm_tutorial/src/README.md @@ -0,0 +1,13 @@ +# Source Overlay directory + +The files in this directory will be "overlaid" on a copy of the common source tree and Tensorflow Lite for Microcontrollers. + +For example: + + * the proj_menu.c file in this directory replaces the proj_menu.c file in + common when building. + * to replace the integer 2D convolution implementation, you could create a + file with the path: + + tensorflow/lite/kernels/interal/reference/integer_ops/conv.h. + diff --git a/proj/fccm_tutorial/src/proj_menu.cc b/proj/fccm_tutorial/src/proj_menu.cc new file mode 100644 index 000000000..a0e5f794a --- /dev/null +++ b/proj/fccm_tutorial/src/proj_menu.cc @@ -0,0 +1,69 @@ +/* + * Copyright 2021 The CFU-Playground Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "proj_menu.h" + +#include + +#include "cfu.h" +#include "menu.h" + +namespace { + +// Template Fn +void do_hello_world(void) { puts("Hello, World!!!\n"); } + +// Test template instruction +void do_exercise_cfu_op0(void) { + puts("\r\nExercise CFU Op0 aka ADD\r\n"); + + unsigned int a = 0; + unsigned int b = 0; + unsigned int cfu = 0; + unsigned int count = 0; + unsigned int pass_count = 0; + unsigned int fail_count = 0; + + for (a = 0x00004567; a < 0xF8000000; a += 0x00212345) { + for (b = 0x0000ba98; b < 0xFF000000; b += 0x00770077) { + cfu = cfu_op0(0, a, b); + if (cfu != a + b) { + printf("[%4d] a: %08x b:%08x a+b=%08x cfu=%08x FAIL\r\n", count, a, b, + a + b, cfu); + fail_count++; + } else { + pass_count++; + } + count++; + } + } + + printf("\r\nPerformed %d comparisons, %d pass, %d fail\r\n", count, + pass_count, fail_count); +} + +struct Menu MENU = { + "Project Menu", + "project", + { + MENU_ITEM('0', "exercise cfu op0", do_exercise_cfu_op0), + MENU_ITEM('h', "say Hello", do_hello_world), + MENU_END, + }, +}; +}; // anonymous namespace + +extern "C" void do_proj_menu() { menu_run(&MENU); } diff --git a/proj/fccm_tutorial/src/software_cfu.cc b/proj/fccm_tutorial/src/software_cfu.cc new file mode 100644 index 000000000..a8886ff54 --- /dev/null +++ b/proj/fccm_tutorial/src/software_cfu.cc @@ -0,0 +1,31 @@ +/* + * Copyright 2021 The CFU-Playground Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "software_cfu.h" + +// +// In this function, place C code to emulate your CFU. You can switch between +// hardware and emulated CFU by setting the CFU_SOFTWARE_DEFINED DEFINE in +// the Makefile. +uint32_t software_cfu(int funct3, int funct7, uint32_t rs1, uint32_t rs2) +{ + if (funct3 == 0) + { + return rs1 + rs2; + } + return rs1; +} From e75445de139d0bdd0c9af0aa68b588e1117bcec0 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 15:57:55 +1000 Subject: [PATCH 03/12] fccm_tutorial/software_cfu.cc: New Software CFU Adds a software based CFU emulation and a test in proj_menu.cc. Signed-off-by: Alan Green --- proj/fccm_tutorial/Makefile | 2 +- proj/fccm_tutorial/src/fccm_cfu.h | 26 +++++++++++++++ proj/fccm_tutorial/src/proj_menu.cc | 46 +++++++++----------------- proj/fccm_tutorial/src/software_cfu.cc | 38 ++++++++++++++++----- 4 files changed, 72 insertions(+), 40 deletions(-) create mode 100644 proj/fccm_tutorial/src/fccm_cfu.h diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile index e7e13296d..31e022b84 100644 --- a/proj/fccm_tutorial/Makefile +++ b/proj/fccm_tutorial/Makefile @@ -17,7 +17,7 @@ export DEFINES := # Uncomment this line to use software defined CFU functions in software_cfu.cc -#DEFINES += CFU_SOFTWARE_DEFINED +DEFINES += CFU_SOFTWARE_DEFINED # Uncomment this line to skip debug code (large effect on performance) DEFINES += NDEBUG diff --git a/proj/fccm_tutorial/src/fccm_cfu.h b/proj/fccm_tutorial/src/fccm_cfu.h new file mode 100644 index 000000000..97da8dd85 --- /dev/null +++ b/proj/fccm_tutorial/src/fccm_cfu.h @@ -0,0 +1,26 @@ +/* + * Copyright 2022 The CFU-Playground Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cfu.h" + +#ifndef _FCCM_CFH_H +#define _FCCM_CFU_H + +#define cfu_reset() cfu_op0(0, 0, 0) +#define cfu_accumulate(a, b) cfu_op1(0, a, b) +#define cfu_read() cfu_op1(0, 0, 0) + +#endif diff --git a/proj/fccm_tutorial/src/proj_menu.cc b/proj/fccm_tutorial/src/proj_menu.cc index a0e5f794a..3714ecfee 100644 --- a/proj/fccm_tutorial/src/proj_menu.cc +++ b/proj/fccm_tutorial/src/proj_menu.cc @@ -1,5 +1,5 @@ /* - * Copyright 2021 The CFU-Playground Authors + * Copyright 2022 The CFU-Playground Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ #include -#include "cfu.h" +#include "fccm_cfu.h" #include "menu.h" namespace { @@ -26,44 +26,28 @@ namespace { // Template Fn void do_hello_world(void) { puts("Hello, World!!!\n"); } -// Test template instruction -void do_exercise_cfu_op0(void) { - puts("\r\nExercise CFU Op0 aka ADD\r\n"); - - unsigned int a = 0; - unsigned int b = 0; - unsigned int cfu = 0; - unsigned int count = 0; - unsigned int pass_count = 0; - unsigned int fail_count = 0; - - for (a = 0x00004567; a < 0xF8000000; a += 0x00212345) { - for (b = 0x0000ba98; b < 0xFF000000; b += 0x00770077) { - cfu = cfu_op0(0, a, b); - if (cfu != a + b) { - printf("[%4d] a: %08x b:%08x a+b=%08x cfu=%08x FAIL\r\n", count, a, b, - a + b, cfu); - fail_count++; - } else { - pass_count++; - } - count++; - } - } - - printf("\r\nPerformed %d comparisons, %d pass, %d fail\r\n", count, - pass_count, fail_count); +// Tests multiply-add CFU +void do_test_cfu(void) { + printf("\r\nCFU Test... "); + + // Calculated on a spreadsheet + cfu_reset(); + cfu_accumulate(0x884CF61A, 0xE49F2F1C); + cfu_accumulate(0x0BE31854, 0x527FDBCF); + cfu_accumulate(0xADB43251, 0x4E36D172); + cfu_accumulate(0x6F867FFB, 0x442C7B76); + printf("%s\n", static_cast(cfu_read()) == 81978 ? "PASS!" 
: "FAIL"); } struct Menu MENU = { "Project Menu", "project", { - MENU_ITEM('0', "exercise cfu op0", do_exercise_cfu_op0), + MENU_ITEM('1', "test cfu", do_test_cfu), MENU_ITEM('h', "say Hello", do_hello_world), MENU_END, }, }; }; // anonymous namespace -extern "C" void do_proj_menu() { menu_run(&MENU); } +extern "C" void do_proj_menu() { menu_run(&MENU); } \ No newline at end of file diff --git a/proj/fccm_tutorial/src/software_cfu.cc b/proj/fccm_tutorial/src/software_cfu.cc index a8886ff54..028599fac 100644 --- a/proj/fccm_tutorial/src/software_cfu.cc +++ b/proj/fccm_tutorial/src/software_cfu.cc @@ -1,5 +1,5 @@ /* - * Copyright 2021 The CFU-Playground Authors + * Copyright 2022 The CFU-Playground Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,40 @@ * limitations under the License. */ -#include #include "software_cfu.h" +#include + +int32_t accumulator; + +// Gets a byte as an int8 from the given word +inline int8_t extract_byte(uint32_t word, int num) { + return static_cast(0xff & (word >> (num * 8))); +} + +// Multipy rs1 bytes by rs2 bytes and sum everything together +int32_t multiply_add4(uint32_t rs1, uint32_t rs2){ + return ( + (128 + extract_byte(rs1, 0)) * extract_byte(rs2, 0) + + (128 + extract_byte(rs1, 1)) * extract_byte(rs2, 1) + + (128 + extract_byte(rs1, 2)) * extract_byte(rs2, 2) + + (128 + extract_byte(rs1, 3)) * extract_byte(rs2, 3)); +} + + // // In this function, place C code to emulate your CFU. You can switch between // hardware and emulated CFU by setting the CFU_SOFTWARE_DEFINED DEFINE in // the Makefile. 
-uint32_t software_cfu(int funct3, int funct7, uint32_t rs1, uint32_t rs2) -{ - if (funct3 == 0) - { - return rs1 + rs2; +uint32_t software_cfu(int funct3, int funct7, uint32_t rs1, uint32_t rs2) { + switch (funct3) { + case 0: + accumulator = 0; + break; + case 1: + accumulator += multiply_add4(rs1, rs2); + default: + break; } - return rs1; + return static_cast(accumulator); } From 50b1d67101f87be921fd9b3d81fbcbc86e1d8de3 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Mon, 18 Apr 2022 12:45:37 +1000 Subject: [PATCH 04/12] fccm_tutorial/README.md: new readme file Adds a readme file, outlining the eventual content of this directory. Signed-off-by: Alan Green --- proj/fccm_tutorial/README.md | 74 ++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 proj/fccm_tutorial/README.md diff --git a/proj/fccm_tutorial/README.md b/proj/fccm_tutorial/README.md new file mode 100644 index 000000000..c0a9f0b71 --- /dev/null +++ b/proj/fccm_tutorial/README.md @@ -0,0 +1,74 @@ +# FCCM Tutorial Example + +This example accelerator was created for the FCCM 2022 tutorial +"CFU-Playground: Build Your Own Custom TinyML Processor". + +https://www.fccm.org/workshop-tutorial-2022/ + +## Amaranth CFU + +`cfu.py` contains a complete CFU written in Amaranth. It can perform +these functions: + +* Operation 0: Reset accumulator +* Operation 1: Read accumulator +* Operation 2: Set signed offset +* Operation 3: 4-way multiply accumulate. + +Test cases can be run by executing `cfu.py`: + +``` +$ ../../scripts/pyrun cfu.py +``` + +## Building and Running + +To build and program a Digilent Arty board, first follow the standard [setup +instructions](https://cfu-playground.readthedocs.io/en/latest/setup-guide.html) +to install Symbiflow and a RISCV compiler. 
Then: + +``` +$ make TARGET=digilent_arty USE_SYMBIFLOW=1 prog +``` + +You should see the familiar flashing lights, then: + +``` +$ make TARGET=digilent_arty USE_SYMBIFLOW=1 BUILD_JOBS=8 prog +``` + +This will load the software and start a terminal. Interesting options are: + +* 1 (Models), 1 (person detection int 8), 1 (person) [[check this]] +* 3 (Project menu), 1 (Exercise CFU) + +To ignore the CFU when running models, comment out this line: + +``` +# DEFINES += ACCEL_CONV2D +``` + +With gateware ignored, the inference times are very close to the inference +times as measured with `proj/proj_template`. + +To use the CFU operation emulator defined in the `src/software_cfu.cc` file, +uncomment this line: + +``` +# DEFINES += CFU_SOFTWARE_DEFINED +``` + +While it is much slower, it is often convenient to use emulated operations +while debugging. + + +## `proj_menu.cc` + +Contains snippets demonstrating the integration of the CFU with the SoC. + +[[Insert instructions here]] + +## Tensorflow Lite for Microcontrollers + + + From 7552608cf2d6e50cc6514dea8090f8aa795d9dd3 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Mon, 18 Apr 2022 14:39:42 +1000 Subject: [PATCH 05/12] fccm_tutorial/cfu.py: Add 4x Multiply-Accumulate Adds a 4x parallel multiply accumulate operation. Signed-off-by: Alan Green --- proj/fccm_tutorial/cfu.py | 102 ++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 31 deletions(-) diff --git a/proj/fccm_tutorial/cfu.py b/proj/fccm_tutorial/cfu.py index 3761c1231..855dab6be 100644 --- a/proj/fccm_tutorial/cfu.py +++ b/proj/fccm_tutorial/cfu.py @@ -14,54 +14,94 @@ # limitations under the License. 
from amaranth import * -from amaranth_cfu import InstructionBase, InstructionTestBase, simple_cfu, CfuTestBase +from amaranth.sim import Delay, Tick +from amaranth_cfu import TestBase, SimpleElaboratable, pack_vals import unittest -# See proj_example for further example instructions +class MultiplyAdd4(SimpleElaboratable): + """Performs four, 8 bit wide multiply-accumulates in parallel. - -class TemplateInstruction(InstructionBase): - """Template instruction + Uses `SimpleElaboratable` helper class as a convenience. """ + def __init__(self): + # "a" and "b" inputs - each four, 8 bit signed numbers + self.a_word = Signal(32) + self.b_word = Signal(32) - def elab(self, m): - with m.If(self.start): - m.d.sync += self.output.eq(self.in0 + self.in1) - m.d.sync += self.done.eq(1) - with m.Else(): - m.d.sync += self.done.eq(0) - + # clear to reset accumulator, enable to perform multiply- accumulate + self.clear = Signal() + self.enable = Signal() -class TemplateInstructionTest(InstructionTestBase): - def create_dut(self): - return TemplateInstruction() + # result + self.accumulator = Signal(signed(32)) - def test(self): - self.verify([ - (0, 0, 0), - (4, 5, 9), - (0xffffffff, 0xffffffff, 0xfffffffe), - ]) + def elab(self, m): + """The actual gateware produced""" + # Divide a_word and b_word each into four, 8-bit parts + a_bytes = [self.a_word[i:i+8].as_signed() for i in range(0, 32, 8)] + b_bytes = [self.b_word[i:i+8].as_signed() for i in range(0, 32, 8)] -def make_cfu(): - return simple_cfu({ - # Add instructions here... 
- 0: TemplateInstruction(), - }) + # Calculate the sum of (a+offset)*b for each part + calculations = [(a + Const(128)) * b for a, b in zip(a_bytes, b_bytes)] + summed = Signal(signed(32)) + m.d.comb += summed.eq(sum(calculations)) + + with m.If(self.clear): + m.d.sync += self.accumulator.eq(0) + with m.Elif(self.enable): + m.d.sync += self.accumulator.eq(self.accumulator + summed) -class CfuTest(CfuTestBase): +class MultiplyAdd4Test(TestBase): def create_dut(self): - return make_cfu() + return MultiplyAdd4() def test(self): + + def a(a, b, c, d): return pack_vals(a, b, c, d, offset=-128) + def b(a, b, c, d): return pack_vals(a, b, c, d, offset=0) DATA = [ - # Test CFU calls here... - ((0, 22, 22), 44), + # (a_word, b_word, enable, clear), expected accumulator + ((a(0, 0, 0, 0), b(0, 0, 0, 0), 0, 0), 0), + + # Simple tests: with just first byte + ((a(10, 0, 0, 0), b(3, 0, 0, 0), 1, 0), 0), + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 1, 0), 30), + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 0), -14), + # Since was not enabled last cycle, accumulator will not change + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 1, 0), -14), + # Since was enabled last cycle, will change accumlator + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 1), -58), + # Accumulator cleared + ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 0), 0), + + # Uses all bytes (calculated on a spreadsheet) + ((a(99, 22, 2, 1), b(-2, 6, 7, 111), 1, 0), 0), + ((a(2, 45, 79, 22), b(-33, 6, -97, -22), 1, 0), 59), + ((a(23, 34, 45, 56), b(-128, -121, 119, 117), 1, 0), -7884), + ((a(188, 34, 236, 246), b(-87, 56, 52, -117), 1, 0), -3035), + ((a(131, 92, 21, 83), b(-114, -72, -31, -44), 1, 0), -33997), + ((a(74, 68, 170, 39), b(102, 12, 53, -128), 1, 0), -59858), + ((a(16, 63, 1, 198), b(29, 36, 106, 62), 1, 0), -47476), + ((a(0, 0, 0, 0), b(0, 0, 0, 0), 0, 1), -32362), ] - return self.run_ops(DATA) + dut = self.dut + + def process(): + for (a_word, b_word, enable, clear), expected in DATA: + yield dut.a_word.eq(a_word) + yield dut.b_word.eq(b_word) + yield 
dut.enable.eq(enable) + yield dut.clear.eq(clear) + yield Delay(0.1) # Wait for input values to settle + + # Check on accumulator, as calcuated last cycle + self.assertEqual(expected, (yield dut.accumulator)) + yield Tick() + + self.run_sim(process, write_trace=False) if __name__ == '__main__': unittest.main() From ff6e022d1140ddd0ee3736f1cc8bfb0d3de3928d Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 11:19:40 +1000 Subject: [PATCH 06/12] fccm_tutorial/cfu.py: New CFU Adds a CFU that does a four-way parallel multiply add. Signed-off-by: Alan Green --- proj/fccm_tutorial/Makefile | 4 +- proj/fccm_tutorial/README.md | 5 +-- proj/fccm_tutorial/cfu.py | 74 ++++++++++++++++++++++++++++++++++-- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile index 31e022b84..9c4e50752 100644 --- a/proj/fccm_tutorial/Makefile +++ b/proj/fccm_tutorial/Makefile @@ -26,8 +26,8 @@ DEFINES += NDEBUG #DEFINES += NPROFILE # Uncomment to include specified model in built binary -DEFINES += INCLUDE_MODEL_PDTI8 -#DEFINES += INCLUDE_MODEL_MICRO_SPEECH +#DEFINES += INCLUDE_MODEL_PDTI8 +DEFINES += INCLUDE_MODEL_MICRO_SPEECH #DEFINES += INCLUDE_MODEL_MAGIC_WAND #DEFINES += INCLUDE_MODEL_MNV2 #DEFINES += INCLUDE_MODEL_HPS diff --git a/proj/fccm_tutorial/README.md b/proj/fccm_tutorial/README.md index c0a9f0b71..e2e8b68ca 100644 --- a/proj/fccm_tutorial/README.md +++ b/proj/fccm_tutorial/README.md @@ -11,9 +11,8 @@ https://www.fccm.org/workshop-tutorial-2022/ these functions: * Operation 0: Reset accumulator -* Operation 1: Read accumulator -* Operation 2: Set signed offset -* Operation 3: 4-way multiply accumulate. +* Operation 1: 4-way multiply accumulate. 
+* Operation 2: Read accumulator Test cases can be run by executing `cfu.py`: diff --git a/proj/fccm_tutorial/cfu.py b/proj/fccm_tutorial/cfu.py index 855dab6be..8ba429747 100644 --- a/proj/fccm_tutorial/cfu.py +++ b/proj/fccm_tutorial/cfu.py @@ -15,10 +15,10 @@ from amaranth import * from amaranth.sim import Delay, Tick -from amaranth_cfu import TestBase, SimpleElaboratable, pack_vals +from amaranth_cfu import TestBase, SimpleElaboratable, pack_vals, CfuBase, CfuTestBase import unittest -class MultiplyAdd4(SimpleElaboratable): +class MultiplyAccumulate4(SimpleElaboratable): """Performs four, 8 bit wide multiply-accumulates in parallel. Uses `SimpleElaboratable` helper class as a convenience. @@ -53,9 +53,9 @@ def elab(self, m): m.d.sync += self.accumulator.eq(self.accumulator + summed) -class MultiplyAdd4Test(TestBase): +class MultiplyAccumulate4Test(TestBase): def create_dut(self): - return MultiplyAdd4() + return MultiplyAccumulate4() def test(self): @@ -103,5 +103,71 @@ def process(): self.run_sim(process, write_trace=False) + +class Cfu(CfuBase): + """Simple CFU that provides access to a MultiplyAccumulate4. + + The supported operations are: + * Operation 0: Reset accumulator + * Operation 1: 4-way multiply accumulate. + * Operation 2: Read accumulator + + The implementation here assumes the CPU is always ready to read a response. + """ + + def elab(self, m): + # Build the submodule + m.submodules.macc4 = macc4 = MultiplyAccumulate4() + + # Check operation number + funct3 = Signal(3) + m.d.comb += funct3.eq(self.cmd_function_id[:3]) + + # All commands take 1 cycle. 
CFU is always ready to receive a command
Signed-off-by: Alan Green --- .../reference/integer_ops/depthwise_conv.h | 289 ++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h new file mode 100644 index 000000000..f0ca09c74 --- /dev/null +++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -0,0 +1,289 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ + +#include "tensorflow/lite/kernels/internal/common.h" + +namespace tflite { +namespace reference_integer_ops { +inline void DepthwiseConvPerChannel( + const DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + // Get parameters. 
+ // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Check dimensions of the tensors. + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + 
const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, output_channel)]; + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(b/174275578): Add a check to make sure the + // accumulator depth is smaller than 2^16. 
+                  acc += filter_val * (input_val + input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] = static_cast<int8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Check dimensions of the tensors.
+ TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + std::int64_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. 
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with 64 bits accumulator.
+                  // We assume maximum of 2^16 accumulations as with the 8-bit
+                  // case so actually the value in the accumulator should not
+                  // exceed 40 bits
+                  acc += static_cast<int64_t>(filter_val) *
+                         static_cast<int64_t>(input_val);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            scaled_acc = std::max(scaled_acc, output_activation_min);
+            scaled_acc = std::min(scaled_acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                static_cast<int16_t>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConvHybridPerChannel(
+    const DepthwiseParams& params, float* scaling_factors_ptr,
+    const RuntimeShape& input_shape, const int8_t* input_data,
+    const RuntimeShape& filter_shape, const int8_t* filter_data,
+    const RuntimeShape& bias_shape, const float* bias_data,
+    const RuntimeShape& output_shape, float* output_data,
+    const float* per_channel_scale, int32_t* input_offset) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int bias_depth = bias_shape.FlatSize();
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_depth, output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32_t input_val = input_data[Offset(
+                      input_shape, batch, in_y, in_x, in_channel)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  acc += filter_val * (input_val - input_offset[batch]);
+                }
+              }
+            }
+            float acc_float = static_cast<float>(acc);
+            acc_float *=
+                per_channel_scale[output_channel] * scaling_factors_ptr[batch];
+            if (bias_data && output_channel < bias_depth) {
+              acc_float += bias_data[output_channel];
+            }
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                ActivationFunctionWithMinMax(acc_float, output_activation_min,
+                                             output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_

From 9316743c78740eada852fb39c072bb4aa735fcdf Mon Sep 17 00:00:00 2001
From: Alan Green
Date: Tue, 19 Apr 2022 16:11:51 +1000
Subject: [PATCH 08/12] fccm_tutorial/depthwise_conv.h: add indirection

Adds an indirect call to the original Depthwise Conv 2D implementation.
This gives us a place from which we can add calls to an accelerated
version.

Signed-off-by: Alan Green
---
 .../reference/integer_ops/depthwise_conv.h    | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
index f0ca09c74..563b49b2f 100644
--- a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
+++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@@ -19,7 +19,8 @@ limitations under the License.
namespace tflite { namespace reference_integer_ops { -inline void DepthwiseConvPerChannel( + +inline void OriginalDepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int8_t* input_data, const RuntimeShape& filter_shape, @@ -119,6 +120,22 @@ inline void DepthwiseConvPerChannel( } } +inline void DepthwiseConvPerChannel( + const DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + + // Call original + OriginalDepthwiseConvPerChannel(params, output_multiplier, output_shift, + input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, + output_shape, output_data); +} + + inline void DepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, From fdb76a25b234b6c372a3e30183543bfba104b734 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 16:37:49 +1000 Subject: [PATCH 09/12] fccm_tutorial: output parameters Based on this work, I think the accelerator I already wrote isn't quite right for this. 
Let's investigate 1x1 Conv2Ds --- proj/fccm_tutorial/Makefile | 8 +++++++- .../internal/reference/integer_ops/depthwise_conv.h | 10 +++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile index 9c4e50752..aa0f48d8f 100644 --- a/proj/fccm_tutorial/Makefile +++ b/proj/fccm_tutorial/Makefile @@ -17,7 +17,13 @@ export DEFINES := # Uncomment this line to use software defined CFU functions in software_cfu.cc -DEFINES += CFU_SOFTWARE_DEFINED +#DEFINES += CFU_SOFTWARE_DEFINED + +# Uncomment this line to print parameters of the conv2d operation +DEFINES += DEPTHWISE_PRIMT_PARAMS + +# Uncomment this line to allow acceleration +DEFINES += DEPTHWISE_ACCELERATE # Uncomment this line to skip debug code (large effect on performance) DEFINES += NDEBUG diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index 563b49b2f..8adea40ce 100644 --- a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -15,6 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_ +#include "playground_util/print_params.h" #include "tensorflow/lite/kernels/internal/common.h" namespace tflite { @@ -128,6 +129,14 @@ inline void DepthwiseConvPerChannel( const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data) { +#ifdef DEPTHWISE_PRINT_PARAMS + print_depthwise_params(params, input_shape, filter_shape, output_shape); +#endif + +#ifdef DEPTHWISE_ACCELERATE + // Check whether we can accelerate +#endif + // Call original OriginalDepthwiseConvPerChannel(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, @@ -135,7 +144,6 @@ inline void DepthwiseConvPerChannel( output_shape, output_data); } - inline void DepthwiseConvPerChannel( const DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, From 0bc16f639ff40ae7b4b666b25dea37f4bbcd1c62 Mon Sep 17 00:00:00 2001 From: Alan Green Date: Tue, 19 Apr 2022 16:53:14 +1000 Subject: [PATCH 10/12] fccm_tutorial/conv.h: Copied file Copied file from TfLM directory. Signed-off-by: Alan Green --- .../internal/reference/integer_ops/conv.h | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h new file mode 100644 index 000000000..3f869a3af --- /dev/null +++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h @@ -0,0 +1,236 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_ + +#include "tensorflow/lite/kernels/internal/common.h" + +namespace tflite { +namespace reference_integer_ops { + +// Fixed-point per-channel-quantization convolution reference kernel. +inline void ConvPerChannel( + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + // Get parameters. + const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. 
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. 
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+
+                if (!is_point_inside_image) {
+                  continue;
+                }
+
+                for (int in_channel = 0; in_channel < filter_input_depth;
+                     ++in_channel) {
+                  int32_t input_val =
+                      input_data[Offset(input_shape, batch, in_y, in_x,
+                                        in_channel + group * filter_input_depth)];
+                  int32_t filter_val = filter_data[Offset(
+                      filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarantees that the input_offset is a int8_t, even though
+                  // it is represented using int32_t. int32_t += int8_t *
+                  // (int8_t - int8_t) so the highest value we can get from each
+                  // accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumulation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(b/174275578): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val + input_offset);
+                }
+              }
+            }
+
+            if (bias_data) {
+              acc += bias_data[out_channel];
+            }
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[out_channel], output_shift[out_channel]);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+                static_cast<int8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Fixed-point per-channel-quantization convolution reference kernel.
+// 16-bit data and 8-bit filter
+template <typename AccumScalar>
+inline void ConvPerChannel(
+    const ConvParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const RuntimeShape& input_shape,
+    const int16_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const AccumScalar* bias_data, const RuntimeShape& output_shape,
+    int16_t* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+
+  // Set min and max value of the output.
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+
+  // Consistency check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+ const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; + AccumScalar acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + + if (!is_point_inside_image) { + continue; + } + + for (int in_channel = 0; in_channel < filter_input_depth; + ++in_channel) { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; + int32_t filter_val = filter_data[Offset( + filter_shape, out_channel, filter_y, filter_x, in_channel)]; + // Accumulate with 64 bits accumulator. + // int64_t += int8_t * int16_t so the highest value we can + // get from each accumulation is [-127, 127] * ([-32768, + // 32767] - + // [-32768, 32767]), which is [-8322945, 8322945]. 
+                  // log2(8322945) = 22.99.
+                  acc += filter_val * input_val;
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[out_channel];
+            }
+            int32_t scaled_acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[out_channel], output_shift[out_channel]);
+            scaled_acc = std::max(scaled_acc, output_activation_min);
+            scaled_acc = std::min(scaled_acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+                static_cast<int16_t>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_

From e0028c4d28e54e3cdbc537f757a2d6bfe1bb416a Mon Sep 17 00:00:00 2001
From: Alan Green
Date: Thu, 28 Apr 2022 10:07:25 +1000
Subject: [PATCH 11/12] fccm_tutorial/conv.h: Rename function to "original"

Signed-off-by: Alan Green
---
 .../internal/reference/integer_ops/conv.h     | 25 ++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
index 3f869a3af..d0644f320 100644
--- a/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
+++ b/proj/fccm_tutorial/src/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
@@ -15,13 +15,14 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
 
+#include "playground_util/print_params.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 
 namespace tflite {
 namespace reference_integer_ops {
 
 // Fixed-point per-channel-quantization convolution reference kernel.
-inline void ConvPerChannel( +inline void OriginalConvPerChannel( const ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const RuntimeShape& input_shape, const int8_t* input_data, const RuntimeShape& filter_shape, @@ -131,6 +132,28 @@ inline void ConvPerChannel( } } +inline void ConvPerChannel( + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { + + +#ifdef CONV_PRINT_PARAMS + print_conv_params(params, input_shape, filter_shape, output_shape); +#endif + +#ifdef CONV_ACCELERATE + // Check whether we can accelerate +#endif + // Call original + OriginalConvPerChannel(params, output_multiplier, output_shift, input_shape, + input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data); +} + // Fixed-point per-channel-quantization convolution reference kernel. 
 // 16-bit data and 8-bit filter
 template <typename AccumScalar>

From 83700c5fe898308d55c29e893b7a84bba9ae55df Mon Sep 17 00:00:00 2001
From: Alan Green
Date: Thu, 28 Apr 2022 10:08:26 +1000
Subject: [PATCH 12/12] fccm_tutorial/Makefile: use KWS model

Signed-off-by: Alan Green
---
 proj/fccm_tutorial/Makefile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile
index aa0f48d8f..8225b99d3 100644
--- a/proj/fccm_tutorial/Makefile
+++ b/proj/fccm_tutorial/Makefile
@@ -19,10 +19,12 @@ export DEFINES :=
 # Uncomment this line to use software defined CFU functions in software_cfu.cc
 #DEFINES += CFU_SOFTWARE_DEFINED
 
-# Uncomment this line to print parameters of the conv2d operation
-DEFINES += DEPTHWISE_PRIMT_PARAMS
+# Uncomment these lines to print parameters of the conv2d operation
+DEFINES += CONV_PRINT_PARAMS
+DEFINES += DEPTHWISE_PRINT_PARAMS
 
 # Uncomment this line to allow acceleration
+DEFINES += CONV_ACCELERATE
 DEFINES += DEPTHWISE_ACCELERATE
 
 # Uncomment this line to skip debug code (large effect on performance)
@@ -33,13 +35,13 @@ DEFINES += NDEBUG
 
 # Uncomment to include specified model in built binary
 #DEFINES += INCLUDE_MODEL_PDTI8
-DEFINES += INCLUDE_MODEL_MICRO_SPEECH
+#DEFINES += INCLUDE_MODEL_MICRO_SPEECH
 #DEFINES += INCLUDE_MODEL_MAGIC_WAND
 #DEFINES += INCLUDE_MODEL_MNV2
 #DEFINES += INCLUDE_MODEL_HPS
 #DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_ANOMD
 #DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_IMGC
-#DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_KWS
+DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_KWS
 #DEFINES += INLCUDE_MODEL_MLCOMMONS_TINY_V01_VWW
 
 # Uncomment to include all TFLM examples (pdti8, micro_speech, magic_wand)