diff --git a/README.md b/README.md
index 50f0ac1..b9c4039 100644
--- a/README.md
+++ b/README.md
@@ -39,10 +39,14 @@ These contain shared IP cores that are used by the projects in this repo. These
 * Hash map implementation
   - Fully parameterized for bit widths and uses CRC as the hashing function
 * Blocks for parsing/processing streams
-* Karabutsa multiplier
+* Karatsuba multiplier
   - Fully parameterized for number of levels
 * Barret reduction for modulo reduction when the modulus does not allow fast reduction
   - Both a fully pipelined high performance version and a slower but smaller resource utilization version
+* Fully parallel multiplier with carry save adder tree and RAM for modular reduction
+  - Fully pipelined, 3x performance over Karatsuba + Barrett, but uses FPGA RAM
+* Multiplier using carry tree to accumulate products with BRAM for modular reduction
+  - 3x performance over Karatsuba + Barrett approach
 * Addition and subtraction modules
   - Fully parameterized so that they can be used for large bit-width arithmetic
 * Extended Euclidean algorithm for calculating multiplicative inverses
@@ -60,7 +64,7 @@ It optionally contains the following top-level engines (you can include in a bui
   - Verifies the equihash solution and difficulty filters
 * Transparent Signature Verification Engine (secp256k1 ECDSA core)
   - Uses efficient endomorphism to reduce key bit size
-  - Signature verification calculates multiple EC point operations in parallel, using a resource-shared single fully pipelined karabutsa multiplier and quick modulo reduction technique
+  - Signature verification calculates multiple EC point operations in parallel, using a resource-shared single fully pipelined karatsuba multiplier and quick modulo reduction technique
 * BLS12-381 coprocessor (zk-SNARK accelerator)
   - Custom instruction set with 2kB instruction memory
   - 12kB Data slot URAM at curve native bit width of 381b
diff --git a/ip_cores/accum_mult_mod/data/.gitignore b/ip_cores/accum_mult_mod/data/.gitignore
new file mode 100644
index 0000000..a3a0c8b
--- /dev/null
+++ b/ip_cores/accum_mult_mod/data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/scripts/generate_files.py b/ip_cores/accum_mult_mod/scripts/generate_files.py
new file mode 100644
index 0000000..5d03de6
--- /dev/null
+++ b/ip_cores/accum_mult_mod/scripts/generate_files.py
@@ -0,0 +1,310 @@
+#!/usr/bin/python3
+
+import math
+
+#  This needs to be called before simulation / synthesis to make sure the
+#  reduction ram files and include files are created.
+#
+#  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#  You should have received a copy of the GNU General Public License
+#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+####################
+# Generate the multiplier output to carry-save adder tree mapping
+####################
+
+BITS = 381  # Operand width in bits; sizes the multiplier grid and MAX_COEF
+MODULUS = 0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab  # Modulus the lookup tables reduce by
+A_DSP_W = 26  # Bits of the first operand consumed per multiplier column
+B_DSP_W = 17  # Bits of the second operand consumed per multiplier row
+GRID_BIT = 64  # Bit width of each accumulator coefficient window
+RAM_A_W = 10  # Address width of each reduction lookup RAM
+
+URAM_PERCENT = 50  # Percentage of the RAMs (lowest indices first) tagged ram_style="ultra"
+USE_INIT = 1  # Non-zero: emit an initial $readmemh for every reduction RAM
+# NOTE(review): the two values below look like a reference test vector (x, then x^2 mod MODULUS) - confirm
+#8b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af600db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1  ^2 =
+#64a3a594868a2a4dab071ff6d880ae0f459c87e11ab01b3454b95a7d6a93f853f6e07f754b6e7933799e0afe2779a56
+
+
+RES_W = A_DSP_W+B_DSP_W  # Width of one partial product
+NUM_COL = (BITS+A_DSP_W-1)//A_DSP_W;  # ceil(BITS / A_DSP_W)
+NUM_ROW = (BITS+B_DSP_W-1)//B_DSP_W;  # ceil(BITS / B_DSP_W)
+
+A_DIFF = A_DSP_W//GRID_BIT  # Not referenced by get_accum_gen()
+B_DIFF = B_DSP_W//GRID_BIT  # Not referenced by get_accum_gen()
+
+
+def get_accum_gen():
+  """Return the generated SystemVerilog source; also writes the ../data/mod_ram_<n>.mem lookup tables."""
+  MAX_COEF = ((2*BITS)+GRID_BIT-1)//GRID_BIT
+  accum_s = '\n'
+  ram_s = '\n'
+  products = list()
+  # Make a list of all offsets where products start
+  for x in range(NUM_COL):
+    for y in range(NUM_ROW):
+      products.append((x, y, x*A_DSP_W+y*B_DSP_W))
+
+  # Now match these to coef
+  coef = list()
+  max_bits_l = list()
+  for i in range(MAX_COEF):
+    size = list()
+    # First do a pass just to check bit sizes  - also need to account for offset
+    for j in products:
+      start = max(j[2], i*GRID_BIT)
+      end = min(j[2]+RES_W, (i+1)*GRID_BIT)
+      if (end > start):
+        size.append(end-i*GRID_BIT)#start)
+    # Highest bit reached in this window plus clog2(number of terms) bits of headroom
+    #max_bits = max(size) + math.ceil(math.log2(size.count(max(size))))
+    max_bits = max(size) + math.ceil(math.log2(len(size)))
+    max_bits_l.append(max_bits)
+
+    coef_l = list()
+    for j in products:
+      # Check if we are in range
+      offset = (j[0]*A_DSP_W)+(j[1]*B_DSP_W)
+      start = max(j[2], i*GRID_BIT)
+      end = min(j[2]+RES_W, (i+1)*GRID_BIT)
+      if (end > start):
+        bitwidth = end-start
+        start_padding = max(start - i*GRID_BIT, 0)
+        end_padding = max(start+max_bits-end-start_padding, 0)
+        coef_l.append('{{{{{}{{1\'d0}}}},mul_grid[{}][{}][{}+:{}],{{{}{{1\'d0}}}}}}'.format(end_padding, j[0], j[1], start-offset, bitwidth, start_padding))
+
+
+
+    coef.append(coef_l)
+
+  # Create compressor trees and output
+  for idx, i in enumerate(coef):
+    if (len(i) == 1):
+      accum_s +='''
+// Coef {}
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= {};
+'''.format(idx, idx, i[0])
+    elif (len(i) == 2):
+      accum_s +='''
+// Coef {}
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= {};
+'''.format(idx, idx, ' + '.join(i))
+    else:
+      accum_s +='''
+// Coef {}
+logic [{}:0] accum_i_{} [{}];
+logic [{}:0] accum_o_c_{}, accum_o_s_{};
+compressor_tree_3_to_2 #(
+  .NUM_ELEMENTS({}),
+  .BIT_LEN({})
+)
+ct_{} (
+  .terms(accum_i_{}),
+  .C(accum_o_c_{}),
+  .S(accum_o_s_{})
+);
+always_comb accum_i_{} = {{{}}};
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= accum_o_c_{} + accum_o_s_{};
+'''.format(idx, max_bits_l[idx]-1, idx, len(i), max_bits_l[idx]-1, idx, idx, len(i), max_bits_l[idx], idx, idx, idx, idx, idx, ','.join(i), idx, idx, idx)
+
+  # If the bits of this coef are above the modulus, we start generating lookup RAM
+  # and output of RAM goes into address trees together with other partial products
+
+  # Address bits already assigned in the lookup RAM currently being filled.
+  # (curr_bit and coef, the cursor into the accumulator bits, are set from
+  # the modulus width just below.)
+  ram_bit_low = 0
+  ram_addr_bits = list()
+
+  curr_bit = MODULUS.bit_length() % GRID_BIT
+  coef = (MODULUS.bit_length()//GRID_BIT)
+  reduc_coef = coef
+  reduc_bit = curr_bit
+  ram_s += 'always_ff @ (posedge i_clk) if (o_mul.rdy) begin\n'
+  mem_s = ''
+  #Reduce all bits after this
+  while(coef < MAX_COEF):
+    # Get max bits we can take from this coef
+    max_bits = min(max_bits_l[coef]-curr_bit, RAM_A_W-ram_bit_low)
+    ram_s += '  mod_ram_{}_a[{}+:{}] <= accum_grid_o[{}][{}+:{}];\n'.format(len(ram_addr_bits), ram_bit_low, max_bits, coef, curr_bit, max_bits)
+
+    if ((ram_bit_low + max_bits == RAM_A_W) or (coef == MAX_COEF - 1 and curr_bit + max_bits == max_bits_l[coef])):
+      if (ram_bit_low + max_bits != RAM_A_W):
+        ram_s += '  mod_ram_{}_a[{}+:{}] <= 0;\n'.format(len(ram_addr_bits), ram_bit_low+max_bits, RAM_A_W-(ram_bit_low+max_bits))
+
+      # Generate the init file lines - need to take into account earlier address bits
+      max_bits_value = max_bits + ram_bit_low
+      #print("max_bits {} ram_bit_low {}".format( max_bits, ram_bit_low))
+      for i in range(1 << max_bits_value):
+        # The value of a bit here will depend on the GRID and position of bit
+        # Assume (?) any bits not in this GRID are from previous
+        if (ram_bit_low != 0):
+          bit_l = i % (1 << ram_bit_low)
+          value_l = bit_l << ((max_bits_l[coef-1]-ram_bit_low)+(coef-1)*GRID_BIT)
+        else:
+          value_l = 0
+        bit_h = (i >> ram_bit_low)
+        value_h = bit_h << (coef*GRID_BIT + curr_bit)
+        value = hex((value_l + value_h) % MODULUS)[2:]
+
+        mem_s += "{}\n".format(value.zfill(math.ceil(MODULUS.bit_length()/4)))
+
+      # Context manager closes the file even if the write raises
+      with open('../data/mod_ram_{}.mem'.format(len(ram_addr_bits)), 'w') as f:
+        f.write(mem_s)
+      mem_s = ''
+
+      ram_addr_bits.append(ram_bit_low + max_bits)
+      ram_bit_low = 0
+    else:
+      ram_bit_low += max_bits
+
+
+    if (curr_bit + max_bits == max_bits_l[coef]):
+      coef += 1
+      curr_bit = 0
+    else:
+      curr_bit += max_bits
+
+  ram_s += 'end\n'
+  # Add the RAMs
+  ram_s1 = ''
+  for idx, _ in enumerate(ram_addr_bits):
+    uram_s = '(* ram_style="ultra" *)' if URAM_PERCENT > 100*idx/len(ram_addr_bits) else ''
+    init_s = 'initial $readmemh( "mod_ram_{}.mem", mod_ram_{}_ram);'.format(idx, idx) if USE_INIT else ''
+    ram_s1 += '''
+logic [{}:0]    mod_ram_{}_a;
+(* DONT_TOUCH = "yes" *) logic [{}:0]    mod_ram_{}_q;
+logic [{}:0]    mod_ram_{}_d;
+{}logic [{}:0]    mod_ram_{}_ram [{}];
+always_ff @ (posedge i_clk) if (o_mul.rdy) begin
+  mod_ram_{}_q <= mod_ram_{}_ram[mod_ram_{}_a];
+end
+{}
+'''.format(RAM_A_W-1, idx, MODULUS.bit_length()-1, idx, MODULUS.bit_length()-1, idx, uram_s, MODULUS.bit_length()-1, idx, 1 << RAM_A_W, idx, idx, idx, init_s)
+
+  # We now generate the tree adders to sum the reduction values with the accum_grid_o values
+  accum2_s = '\n'
+  for coef in range(math.ceil(MODULUS.bit_length()/GRID_BIT)):
+    # Make sure we have the right bit widths
+    if (coef == reduc_coef):
+      ram_bits = min(GRID_BIT, reduc_bit)
+    else:
+      ram_bits = GRID_BIT
+    # Add clog2(number of RAMs) bits of headroom to this coefficient for the
+    # sum, then zero-extend each RAM slice up to the widened coefficient
+    max_bits_l[coef] += math.ceil(math.log2(len(ram_addr_bits)))
+    padding = max_bits_l[coef] - ram_bits
+    in_s = ['{{{{{}{{1\'d0}}}}, mod_ram_{}_q[{}+:{}]}}'.format(padding, i, coef*GRID_BIT, ram_bits) for i in range(len(ram_addr_bits))]
+    # Need to check if we also had reduction in this range
+    end = max_bits_l[coef]-1
+    padding = 0
+    if (reduc_coef == coef):
+      padding = end - reduc_bit
+      end = reduc_bit-1
+    in_s.append('{{{{{}{{1\'d0}}}}, accum_grid_o_rr[{}][{}:0]}}'.format(padding, coef, end))
+    accum2_s +='''
+// Coef {} accum 2 stage
+logic [{}:0] accum2_i_{} [{}];
+logic [{}:0] accum2_o_c_{}, accum2_o_s_{};
+compressor_tree_3_to_2 #(
+  .NUM_ELEMENTS({}),
+  .BIT_LEN({})
+)
+ct2_{} (
+  .terms(accum2_i_{}),
+  .C(accum2_o_c_{}),
+  .S(accum2_o_s_{})
+);
+always_comb accum2_i_{} = {{{}}};
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum2_grid_o[{}] <= accum2_o_c_{} + accum2_o_s_{};
+'''.format(coef, max_bits_l[coef]-1, coef, len(ram_addr_bits)+1, max_bits_l[coef]-1, coef, coef, len(ram_addr_bits)+1, max_bits_l[coef], coef, coef, coef, coef, coef, ','.join(in_s), coef, coef, coef)
+
+  ram_s = ram_s1 + ram_s
+
+  # We also need to do a final level reduction
+  accum3_s = '''
+logic [{}:0]    mod_ram2_0_a;
+logic [{}:0]    mod_ram2_0_q;
+always_comb begin
+  mod_ram2_0_a = res0_r[{}+:{}];
+end
+always_ff @ (posedge i_clk) if (o_mul.rdy) begin
+  mod_ram2_0_q <= mod_ram_0_ram[mod_ram2_0_a];
+end
+
+always_comb begin
+  res1_c = res0_rr[{}:0] + mod_ram2_0_q;
+  res1_m_c = res0_rr[{}:0] + mod_ram2_0_q - MODULUS;
+  res1_m_c_ = res0_rr[{}:0] + mod_ram2_0_q - 2*MODULUS;
+end
+'''.format(RAM_A_W-1, MODULUS.bit_length()-1, MODULUS.bit_length(), RAM_A_W, MODULUS.bit_length()-1, MODULUS.bit_length()-1, MODULUS.bit_length()-1)
+
+  # We also generate the arrays since we know the max sizes
+  logic_s = '''
+
+logic [{}:0]                  accum_grid_o [{}];
+logic [{}:0]                  accum_grid_o_r [{}];
+logic [{}:0]                  accum_grid_o_rr [{}];
+logic [{}:0]                  accum2_grid_o [{}];
+'''.format(max(max_bits_l)-1, MAX_COEF, max(max_bits_l)-1, MAX_COEF//2, max(max_bits_l)-1, MAX_COEF//2, max(max_bits_l)-1, MAX_COEF//2)
+
+  # Add logic for writing to memory
+  # Make long scan chain, width of RAM_D_W
+  ram_write_s = '''
+localparam int RAM_PIPE = 4;
+logic [RAM_PIPE:0][RAM_A_W-1:0] addr;
+logic [RAM_PIPE:0][RAM_D_W-1:0] ram_d;
+logic [RAM_PIPE:0]              ram_we;
+logic [RAM_PIPE:0]              ram_se;
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    addr <= 0;
+    ram_we <= 0;
+    ram_se <= 0;
+    ram_d <= 0;
+  end else begin
+    ram_we <= {ram_we, i_ram_we};
+    ram_d  <= {ram_d, i_ram_d};
+    ram_se <= {ram_se, i_ram_se};
+    if (ram_we[RAM_PIPE]) begin
+      addr <= addr + 1;'''
+  for idx, _ in enumerate(ram_addr_bits):
+    ram_write_s+= '''
+      mod_ram_{}_ram[addr] <= mod_ram_{}_d;'''.format(idx, idx)
+  ram_write_s += '''
+    end
+'''
+  ram_write_s += '''
+    if (ram_se[RAM_PIPE]) begin'''
+  for idx, _ in enumerate(ram_addr_bits):
+    previous_ram = "ram_d[RAM_PIPE]" if idx == 0 else "mod_ram_{}_d[{}:({}%RAM_D_W)]".format(idx-1, MODULUS.bit_length()-1, MODULUS.bit_length())
+    ram_write_s += '''
+      mod_ram_{}_d <= {{mod_ram_{}_d, {}}};'''.format(idx, idx, previous_ram)
+
+  ram_write_s += '''
+    end
+  end
+end
+'''
+  return logic_s + accum_s + ram_s + accum2_s + accum3_s + ram_write_s
+
+
+
+# Paths are relative to the working directory: run this from the scripts/ folder
+with open('../src/rtl/accum_mult_mod_generated.sv', 'w') as f:
+  f.write(get_accum_gen())
+
diff --git a/ip_cores/accum_mult_mod/src/rtl/.gitignore b/ip_cores/accum_mult_mod/src/rtl/.gitignore
new file mode 100644
index 0000000..467c5c9
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/.gitignore
@@ -0,0 +1,2 @@
+accum_mult_mod_generated.sv
+!.gitignore
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv
new file mode 100644
index 0000000..3b222ab
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv
@@ -0,0 +1,137 @@
+/*
+  This does a BITS multiplication using adder tree and parameterizable
+  DSP sizes. A python script generates the accum_mult_mod_generated.sv file.
+
+  Does modulus reduction using RAM tables. Multiplication and reduction has
+  latency of 9 clock cycles (PIPE) and a throughput of 1 clock cycle per result.
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+module accum_mult_mod #(
+  parameter DAT_BITS,  // Operand width in bits
+  parameter MODULUS,   // Modulus subtracted in the final correction step
+  parameter CTL_BITS,  // Width of the control field carried alongside the data
+  parameter A_DSP_W,   // Bits of operand A per multiplier column
+  parameter B_DSP_W,   // Bits of operand B per multiplier row
+  parameter GRID_BIT,  // Accumulator coefficient width; the generated include is built for one fixed value
+  parameter RAM_A_W,   // Reduction RAM address width; the generated include is built for one fixed value
+  parameter RAM_D_W    // Width of the RAM load data port
+)(
+  input i_clk,
+  input i_rst,
+  if_axi_stream.sink   i_mul,  // dat carries operand A in [0+:DAT_BITS] and operand B in [DAT_BITS+:DAT_BITS]
+  if_axi_stream.source o_mul,  // dat carries the reduced product
+  input [RAM_D_W-1:0] i_ram_d,   // RAM load data (shift chain input)
+  input               i_ram_we,  // RAM write strobe
+  input               i_ram_se   // RAM shift enable
+);
+
+localparam int TOT_DSP_W = A_DSP_W+B_DSP_W;
+localparam int NUM_COL = (DAT_BITS+A_DSP_W-1)/A_DSP_W;
+localparam int NUM_ROW = (DAT_BITS+B_DSP_W-1)/B_DSP_W;
+localparam int MAX_COEF = (2*DAT_BITS+GRID_BIT-1)/GRID_BIT;
+localparam int PIPE = 9;  // Depth of the val/sop/eop/ctl pipeline that tracks the data path
+
+logic [A_DSP_W*NUM_COL-1:0]             dat_a;
+logic [B_DSP_W*NUM_ROW-1:0]             dat_b;
+(* DONT_TOUCH = "yes" *) logic [A_DSP_W+B_DSP_W-1:0] mul_grid [NUM_COL][NUM_ROW];
+logic [2*DAT_BITS:0] res0_c, res0_r, res0_rr;
+logic [DAT_BITS:0]   res1_c, res1_m_c, res1_m_c_;
+
+// Most of the code is generated
+`include "accum_mult_mod_generated.sv"
+
+logic [PIPE-1:0] val, sop, eop;
+logic [PIPE-1:0][CTL_BITS-1:0] ctl;
+
+genvar gx, gy;
+
+// Flow control
+always_comb begin
+  i_mul.rdy = o_mul.rdy;
+  o_mul.val = val[PIPE-1];
+  o_mul.sop = sop[PIPE-1];
+  o_mul.eop = eop[PIPE-1];
+  o_mul.ctl = ctl[PIPE-1];
+  o_mul.err = 0;
+  o_mul.mod = 0;
+end
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    val <= 0;
+    sop <= 0;
+    eop <= 0;
+    ctl <= 0;
+  end else begin
+    if (o_mul.rdy) begin
+      val <= {val, i_mul.val};
+      sop <= {sop, i_mul.sop};
+      eop <= {eop, i_mul.eop};
+      ctl <= {ctl, i_mul.ctl};
+    end
+  end
+end
+
+// Register the two input operands
+always_ff @ (posedge i_clk) begin
+  if (o_mul.rdy) begin
+    // dat_a/dat_b are padded up to a whole number of DSP slices; the DAT_BITS
+    // wide part-selects are zero-extended on assignment, so no separate
+    // clearing pass is needed.
+    dat_a <= i_mul.dat[0+:DAT_BITS];
+    dat_b <= i_mul.dat[DAT_BITS+:DAT_BITS];
+  end
+end
+
+
+always_ff @ (posedge i_clk) begin
+  for (int i = 0; i < NUM_COL; i++)
+    for (int j = 0; j < NUM_ROW; j++) begin
+      if (o_mul.rdy)
+        mul_grid[i][j] <= dat_a[i*A_DSP_W +: A_DSP_W] * dat_b[j*B_DSP_W +: B_DSP_W];
+    end
+end
+
+// Register lower half accumulator output while we lookup BRAM
+always_ff @ (posedge i_clk)
+  for (int i = 0; i < MAX_COEF/2; i++) begin
+    if (o_mul.rdy) begin
+      accum_grid_o_r[i] <= accum_grid_o[i];
+      accum_grid_o_rr[i] <= accum_grid_o_r[i];
+    end
+  end
+
+// Recombine the second-stage coefficients into a single value
+always_comb begin
+  res0_c = 0;
+  for (int i = 0; i < MAX_COEF/2; i++)
+      res0_c += accum2_grid_o[i] << (i*GRID_BIT);
+end
+
+// We do a second level reduction to get back within MODULUS bits
+
+always_ff @ (posedge i_clk) begin
+  if (o_mul.rdy) begin
+    res0_r <= res0_c;
+    res0_rr <= res0_r;
+    // Do final adjustment: pick x-2M, x or x-M; a subtraction that underflows wraps to a larger unsigned value and loses the compare
+    o_mul.dat <= res1_m_c_ < res1_c ? res1_m_c_ : res1_c < res1_m_c ? res1_c : res1_m_c;
+  end
+end
+
+endmodule
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod_wrapper.sv b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod_wrapper.sv
new file mode 100644
index 0000000..663b8b5
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod_wrapper.sv
@@ -0,0 +1,83 @@
+/*
+  Wrapper for synthesis.
+
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+module accum_mult_mod_wrapper #(
+  parameter BITS = 381,
+  parameter [380:0] MODULUS = 381'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab,
+  parameter A_DSP_W = 26,
+  parameter B_DSP_W = 17,
+  parameter GRID_BIT = 32,  // NOTE(review): generate_files.py and the testbench use 64 - confirm
+  parameter RAM_A_W = 8,    // NOTE(review): generate_files.py and the testbench use 10 - confirm
+  parameter RAM_D_W = 32
+)(
+  input i_clk,
+  input i_rst,
+  input i_val,
+  input i_rdy,
+  output logic o_val,
+  output logic o_rdy,
+  input        [BITS-1:0] i_dat_a,
+  input        [BITS-1:0] i_dat_b,
+  output logic [BITS-1:0] o_dat,
+  input [RAM_D_W-1:0] i_ram_d,
+  input               i_ram_we,
+  input               i_ram_se
+);
+
+logic [RAM_D_W-1:0] ram_d_r;
+logic               ram_we_r;
+logic               ram_se_r;
+
+if_axi_stream #(.DAT_BYTS((2*BITS+7)/8), .CTL_BITS(8)) in_if(i_clk);  // DAT_BYTS is in bytes (same sizing as the testbench)
+if_axi_stream #(.DAT_BYTS((BITS+7)/8), .CTL_BITS(8)) out_if(i_clk);   // DAT_BYTS is in bytes (same sizing as the testbench)
+
+always_ff @ (posedge i_clk) begin  // Register every top-level pin
+  in_if.dat[0+:BITS] <= i_dat_a;
+  in_if.dat[BITS+:BITS] <= i_dat_b;
+  o_dat <= out_if.dat;
+  in_if.val <= i_val;
+  o_rdy <= in_if.rdy;
+  out_if.rdy <= i_rdy;
+  o_val <= out_if.val;
+  ram_d_r <= i_ram_d;
+  ram_we_r <= i_ram_we;
+  ram_se_r <= i_ram_se;
+end
+
+accum_mult_mod #(
+  .DAT_BITS ( BITS     ),
+  .CTL_BITS ( 8        ),
+  .MODULUS  ( MODULUS  ),
+  .A_DSP_W  ( A_DSP_W  ),
+  .B_DSP_W  ( B_DSP_W  ),
+  .GRID_BIT ( GRID_BIT ),
+  .RAM_A_W  ( RAM_A_W  ),
+  .RAM_D_W  ( RAM_D_W  )
+)
+accum_mult_mod (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
+  .i_mul ( in_if ),
+  .o_mul ( out_if ),
+  .i_ram_d  ( ram_d_r  ),
+  .i_ram_we ( ram_we_r ),
+  .i_ram_se ( ram_se_r )
+);
+
+endmodule
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv b/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv
new file mode 100644
index 0000000..e3683f5
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv
@@ -0,0 +1,56 @@
+/*******************************************************************************
+  Copyright 2019 Supranational LLC
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*******************************************************************************/
+
+/*    
+  A parameterized carry save adder (CSA)
+  Loops through each input bit and feeds a full adder (FA)
+             --------------------------------
+            | CSA                            |
+            |         for each i in BIT_LEN  |
+            |            -------             |
+            |           | FA    |            |
+  A[]   --> |  Ai   --> |       | --> Si     | --> S[]
+  B[]   --> |  Bi   --> |       |            |
+  Cin[] --> |  Cini --> |       | --> Couti  | --> Cout[]
+            |            -------             |
+             --------------------------------
+*/
+
+module carry_save_adder
+   #(
+     parameter int BIT_LEN = 19  // Width of every operand and result
+    )
+   (
+    input  logic [BIT_LEN-1:0] A,
+    input  logic [BIT_LEN-1:0] B,
+    input  logic [BIT_LEN-1:0] Cin,
+    output logic [BIT_LEN-1:0] Cout,  // Per-bit carries, not shifted; callers apply the weight
+    output logic [BIT_LEN-1:0] S      // Per-bit sums
+   );
+
+   genvar i;
+   generate
+      for (i=0; i<BIT_LEN; i++) begin : csa_fas  // One independent full adder per bit position
+         full_adder full_adder(
+                               .A(A[i]),
+                               .B(B[i]),
+                               .Cin(Cin[i]),
+                               .Cout(Cout[i]),
+                               .S(S[i])
+                              );
+      end
+   endgenerate
+endmodule
diff --git a/ip_cores/accum_mult_mod/src/rtl/carry_save_adder_tree_level.sv b/ip_cores/accum_mult_mod/src/rtl/carry_save_adder_tree_level.sv
new file mode 100644
index 0000000..ed6795b
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/carry_save_adder_tree_level.sv
@@ -0,0 +1,60 @@
+/*******************************************************************************
+  Copyright 2019 Supranational LLC
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*******************************************************************************/
+
+/*
+  Group the input terms into sets of three for input into a carry save adder
+  Shift the CSA carry output by 1 for use in the next level
+  The sum already has the correct weight, therefore we only pad for consistency
+  Any leftover terms that did not fit into a set are returned padded
+*/
+
+module carry_save_adder_tree_level
+   #(
+     parameter int NUM_ELEMENTS = 3,   // Number of input terms at this level
+     parameter int BIT_LEN      = 19,  // Width of every term
+
+     parameter int NUM_RESULTS  = (integer'(NUM_ELEMENTS/3) * 2) + 
+                                   (NUM_ELEMENTS%3)  // Two outputs per full group of three, plus the leftovers
+    )
+   (
+    input  logic [BIT_LEN-1:0] terms[NUM_ELEMENTS],
+    output logic [BIT_LEN-1:0] results[NUM_RESULTS]
+   );
+
+   genvar i;
+   generate
+      for (i=0; i<(NUM_ELEMENTS / 3); i++) begin : csa_insts
+         // Add three consecutive terms 
+         carry_save_adder #(.BIT_LEN(BIT_LEN))
+            carry_save_adder (
+                              .A(terms[i*3]),
+                              .B(terms[(i*3)+1]),
+                              .Cin(terms[(i*3)+2]),
+                              .Cout({results[i*2][0],  // NOTE(review): carry MSB lands in bit 0, so this is a rotate - relies on that bit being 0
+                                     results[i*2][BIT_LEN-1:1]}),  // Carry bits [BIT_LEN-2:0] move up one position
+                              .S(results[(i*2)+1][BIT_LEN-1:0])
+                             );
+      end
+
+      // Save any unused terms for the next level 
+      for (i=0; i<(NUM_ELEMENTS % 3); i++) begin : csa_level_extras
+         always_comb begin
+            results[(NUM_RESULTS - 1) - i][BIT_LEN-1:0] = 
+               terms[(NUM_ELEMENTS- 1) - i][BIT_LEN-1:0];  // Leftovers are copied through, last term to last result
+         end
+      end
+   endgenerate
+endmodule
diff --git a/ip_cores/accum_mult_mod/src/rtl/compressor_tree_3_to_2.sv b/ip_cores/accum_mult_mod/src/rtl/compressor_tree_3_to_2.sv
new file mode 100644
index 0000000..39e87ad
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/compressor_tree_3_to_2.sv
@@ -0,0 +1,107 @@
+/*******************************************************************************
+  Copyright 2019 Supranational LLC
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*******************************************************************************/
+
+/*
+  Tree built out of 3:2 compressors.  
+  Parameterized to take any number of inputs, each of a common size
+*/
+
+module compressor_tree_3_to_2
+   #(
+     parameter int NUM_ELEMENTS      = 9,   // Number of terms to reduce
+     parameter int BIT_LEN           = 16   // Width of every term and of C/S
+    )
+   (
+    input  logic [BIT_LEN-1:0] terms[NUM_ELEMENTS],
+    output logic [BIT_LEN-1:0] C,  // Carry vector; the caller adds C + S for the final sum
+    output logic [BIT_LEN-1:0] S   // Sum vector
+   );
+
+`ifdef FASTSIM
+   // This is intended for simulation only to improve compile and run time
+   always_comb begin
+      C = 0;
+      S = 0;
+      for(int k = 0; k < NUM_ELEMENTS; k++) begin
+         S += terms[k];  // Behavioural sum: the whole total is returned in S, C stays 0
+      end
+   end
+   
+`else
+
+   // If there is only one or two elements, then return the input (no tree)
+   // If there are three elements, this is the last level in the tree
+   // For greater than three elements:
+   //   Instantiate a set of carry save adders to process this level's terms
+   //   Recursive instantiate this module to complete the rest of the tree
+   generate
+      if (NUM_ELEMENTS == 1) begin // Return value
+         always_comb begin
+            C[BIT_LEN-1:0] = '0;
+            S[BIT_LEN-1:0] = terms[0];
+         end
+      end
+      else if (NUM_ELEMENTS == 2) begin // Return value
+         always_comb begin
+            C[BIT_LEN-1:0] = terms[1];
+            S[BIT_LEN-1:0] = terms[0];
+         end
+      end
+      else if (NUM_ELEMENTS == 3) begin // last level
+         /* verilator lint_off UNUSED */
+        logic [BIT_LEN-1:0] Cout;
+         /* verilator lint_on UNUSED */
+         
+         carry_save_adder #(.BIT_LEN(BIT_LEN))
+            carry_save_adder (
+                              .A(terms[0]),
+                              .B(terms[1]),
+                              .Cin(terms[2]),
+                              .Cout(Cout),
+                             .S(S[BIT_LEN-1:0])
+                             );
+         always_comb begin
+            C[BIT_LEN-1:0] = {Cout[BIT_LEN-2:0], 1'b0};  // Shift carries up one bit; Cout[BIT_LEN-1] is dropped
+         end
+      end
+      else begin
+         //localparam integer NUM_RESULTS = ($rtoi($floor(NUM_ELEMENTS/3)) * 2) + 
+         //                                 (NUM_ELEMENTS%3);
+         localparam integer NUM_RESULTS = (integer'(NUM_ELEMENTS/3) * 2) + 
+                                          (NUM_ELEMENTS%3);  // Term count after one level of 3:2 compression
+
+         logic [BIT_LEN-1:0] next_level_terms[NUM_RESULTS];
+
+         carry_save_adder_tree_level #(.NUM_ELEMENTS(NUM_ELEMENTS),
+                                       .BIT_LEN(BIT_LEN)
+                                      )
+            carry_save_adder_tree_level (
+                                         .terms(terms),
+                                         .results(next_level_terms)
+                                        );
+
+         compressor_tree_3_to_2 #(.NUM_ELEMENTS(NUM_RESULTS),  // Recurse on the reduced term list
+                                  .BIT_LEN(BIT_LEN)
+                                 )
+            compressor_tree_3_to_2 (
+                                    .terms(next_level_terms),
+                                    .C(C),
+                                    .S(S)
+                                   );
+      end
+   endgenerate
+`endif
+endmodule
diff --git a/ip_cores/accum_mult_mod/src/rtl/full_adder.sv b/ip_cores/accum_mult_mod/src/rtl/full_adder.sv
new file mode 100644
index 0000000..40cbd25
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/full_adder.sv
@@ -0,0 +1,40 @@
+/*******************************************************************************
+  Copyright 2019 Supranational LLC
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*******************************************************************************/
+
+/*
+  A basic 1-bit full adder
+              -------
+             | FA    |
+    A    --> |       | --> S
+    B    --> |       |
+    Cin  --> |       | --> Cout
+              -------
+*/
+
+module full_adder
+   (
+    input  logic A,
+    input  logic B,
+    input  logic Cin,
+    output logic Cout,
+    output logic S
+   );
+
+   always_comb begin
+      S    =  A ^ B ^ Cin;                  // Sum bit: parity of the three inputs
+      Cout = (A & B) | (Cin & (A ^ B));     // Carry: A and B both set, or Cin set with exactly one of A/B
+   end
+endmodule
diff --git a/ip_cores/accum_mult_mod/src/tb/accum_mult_mod_tb.sv b/ip_cores/accum_mult_mod/src/tb/accum_mult_mod_tb.sv
new file mode 100644
index 0000000..517e186
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/tb/accum_mult_mod_tb.sv
@@ -0,0 +1,127 @@
+/*
+  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+`define SIMULATION
+
+module accum_mult_mod_tb ();
+import common_pkg::*;
+
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+parameter            BITS = 381;
+parameter [BITS-1:0] MODULUS = 'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab;
+parameter            A_DSP_W = 26;
+parameter            B_DSP_W = 17;
+parameter            GRID_BIT = 64;
+parameter            RAM_A_W = 10;
+parameter            RAM_D_W = 32;
+
+// This is the max size we can expect on the output
+
+if_axi_stream #(.DAT_BYTS((2*BITS+7)/8), .CTL_BITS(8)) in_if(clk);
+if_axi_stream #(.DAT_BYTS((BITS+7)/8), .CTL_BITS(8)) out_if(clk);
+
+initial begin                              // Reset generator
+  rst = 0;
+  repeat(2) #(20*CLK_PERIOD) rst = ~rst;   // rst goes high at 20*CLK_PERIOD and low again at 40*CLK_PERIOD
+end
+
+initial begin                              // Free-running clock generator
+  clk = 0;
+  forever #CLK_PERIOD clk = ~clk;          // Toggles every CLK_PERIOD, so one full clock cycle is 2*CLK_PERIOD
+end
+
+// Check for errors: report every clock edge on which the output stream is valid with .err set
+always_ff @ (posedge clk)
+  if (out_if.val && out_if.err)
+    $error("%m %t ERROR: output .err asserted", $time); // $error takes no finish_number (only $fatal does)
+
+
+accum_mult_mod #(                 // Device under test
+  .DAT_BITS ( BITS     ),
+  .MODULUS  ( MODULUS  ),
+  .CTL_BITS ( 8        ),
+  .A_DSP_W  ( A_DSP_W  ),
+  .B_DSP_W  ( B_DSP_W  ),
+  .GRID_BIT ( GRID_BIT ),
+  .RAM_A_W  ( RAM_A_W  ),
+  .RAM_D_W  ( RAM_D_W  )
+)
+accum_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_mul ( in_if  ),
+  .o_mul ( out_if ),
+  .i_ram_d  ( '0 ),               // RAM load inputs are not driven by this testbench:
+  .i_ram_we ( '0 ),               // tie them low instead of leaving them floating,
+  .i_ram_se ( '0 )                // matching the other accum_mult_mod testbenches
+);
+
+task test_loop();  // Sends `max` random operand pairs through the DUT and checks each result against (in_a * in_b) % MODULUS
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+  logic [BITS-1:0] in_a, in_b, out;
+  logic [BITS*2-1:0] expected;  // Twice the operand width so the full product fits before reduction
+  integer t;
+  integer i, max;
+
+  $display("Running test_loop...");
+  i = 0;
+  max = 1000;  // Number of random iterations
+
+  while (i < max) begin
+    in_a = random_vector((BITS+7)/8);  // Random value truncated to BITS bits on assignment
+    in_b = random_vector((BITS+7)/8);
+    expected = (in_a * in_b);          // Reference model: full-width product ...
+    expected = expected % MODULUS;     // ... reduced modulo MODULUS
+
+    fork  // Drive the input and collect the output concurrently; join waits for both
+      in_if.put_stream({in_b, in_a}, ((BITS*2)+7)/8, i);  // Operands packed with in_a in the low bits; third argument presumably the ctl value - confirm against if_axi_stream
+      out_if.get_stream(get_dat, get_len, 0);
+    join
+
+    out = get_dat;  // Keep only the low BITS bits of the received data
+
+    t = out / MODULUS;    // How many multiples of MODULUS the raw DUT output contained (reported only)
+    out = out % MODULUS;  // Reduce before comparing, so a congruent but not fully reduced output still passes
+
+    assert(out == expected) else begin
+      $display("Expected: 0x%0x", expected);
+      $display("Was:      0x%0x (t=%0d)", out, t);
+      $fatal(1, "ERROR: Output did not match");  // Stop the simulation on the first mismatch
+    end
+    $display("test_loop PASSED loop %d/%d - 0x%0x (t=%0d)", i, max, out, t);
+    i = i + 1;
+  end
+
+  $display("test_loop PASSED");
+end
+endtask;
+
+initial begin                 // Main test sequence
+  out_if.rdy = 0;
+  in_if.reset_source();
+  #(40*CLK_PERIOD);           // Wait until the reset generator has dropped rst again (at 40*CLK_PERIOD)
+
+  test_loop();
+
+  #1us $finish();             // Let the simulation run 1us longer, then end it
+end
+endmodule
\ No newline at end of file
diff --git a/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv b/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
index ec7b44e..aa9217f 100644
--- a/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
+++ b/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
@@ -49,7 +49,7 @@ localparam NUM_OVR_WRT_BIT = 5;
 FE2_TYPE a_a, b_b, c_c, t;
 FE6_TYPE out, in_a, in_b;
 
-logic [22:0] eq_val, eq_wait;
+logic [22:0] eq_val, eq_wait, eq_sent;
 logic mul_cnt, add_cnt, sub_cnt, mnr_cnt;
 logic mul_en, add_en, sub_en, mnr_en;
 logic [4:0] nxt_fe2_mul, nxt_fe2_mnr, nxt_fe_add, nxt_fe_sub;
@@ -71,6 +71,7 @@ always_ff @ (posedge i_clk) begin
     i_mnr_fe2_if.rdy <= 0;
     eq_val <= 0;
     eq_wait <= 0;
+    eq_sent <= 0;
     rdy_l <= 0;
     a_a <= 0;
     b_b <= 0;
@@ -94,6 +95,7 @@ always_ff @ (posedge i_clk) begin
     if (o_sub_fe_if.rdy) o_sub_fe_if.val <= 0;
     if (o_add_fe_if.rdy) o_add_fe_if.val <= 0;
     if (o_mnr_fe2_if.rdy) o_mnr_fe2_if.val <= 0;
+    
 
     if (~sub_en) get_next_sub();
     if (~add_en) get_next_add();
@@ -116,6 +118,7 @@ always_ff @ (posedge i_clk) begin
       if(out_cnt == 5) begin
         eq_val <= 0;
         eq_wait <= 0;
+        eq_sent <= 0;
         rdy_l <= 0;
         a_a <= 0;
         b_b <= 0;
@@ -253,6 +256,7 @@ task fe2_subtraction(input int unsigned ctl, input FE2_TYPE a, b);
     eq_wait[ctl] <= 1;
     if (sub_cnt == 1) begin
       get_next_sub();
+      eq_sent[ctl] <= 1;
     end
     sub_cnt <= sub_cnt + 1;
   end
@@ -270,6 +274,7 @@ task fe2_addition(input int unsigned ctl, input FE2_TYPE a, b);
     eq_wait[ctl] <= 1;
     if (add_cnt == 1) begin
       get_next_add();
+      eq_sent[ctl] <= 1;
     end
     add_cnt <= add_cnt + 1;
   end
@@ -287,6 +292,7 @@ task fe2_multiply(input int unsigned ctl, input FE2_TYPE a, b);
     eq_wait[ctl] <= 1;
     if (mul_cnt == 1) begin
       get_next_fe2_mul();
+      eq_sent[ctl] <= 1;
     end
     mul_cnt <= mul_cnt + 1;
   end
@@ -303,6 +309,7 @@ task fe2_mnr(input int unsigned ctl, input FE2_TYPE a);
     eq_wait[ctl] <= 1;
     if (mnr_cnt == 1) begin
       get_next_fe2_mnr();
+      eq_sent[ctl] <= 1;
     end
     mnr_cnt <= mnr_cnt + 1;
   end
@@ -334,13 +341,13 @@ task get_next_add();
     nxt_fe_add <= 4;
   else if(~eq_wait[8] && rdy_l)
     nxt_fe_add <= 8;
-  else if(~eq_wait[9] && eq_wait[5] && rdy_l)
+  else if(~eq_wait[9] && eq_sent[5] && rdy_l)
     nxt_fe_add <= 9;
   else if (~eq_wait[12] && eq_val[11] && eq_val[1])
     nxt_fe_add <= 12;
   else if(~eq_wait[13] && rdy_l)
     nxt_fe_add <= 13;
-  else if(~eq_wait[14] && eq_wait[10] && rdy_l)
+  else if(~eq_wait[14] && eq_sent[10] && rdy_l)
     nxt_fe_add <= 14;
   else if(~eq_wait[19] && eq_val[18] && eq_val[0])
     nxt_fe_add <= 19;
@@ -372,7 +379,7 @@ task get_next_fe2_mnr();
   mnr_en <= 1;
   if(~eq_wait[18] && eq_val[7])
     nxt_fe2_mnr <= 18;
-  else if(~eq_wait[21] && eq_wait[20])
+  else if(~eq_wait[21] && eq_sent[20])
     nxt_fe2_mnr <= 21;
   else
     mnr_en <= 0;
diff --git a/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv b/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv
index 01d3921..8e6da3d 100644
--- a/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv
@@ -15,6 +15,8 @@
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 `timescale 1ps/1ps
+`define SIMULATION
+`define BL12_381_NEWMULT
 
 module ec_fe12_pow_s_tb ();
 
@@ -70,17 +72,23 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if
 if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(POW_BITS)) pow_fe12_o_if (clk);
 if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(POW_BITS)) pow_fe12_i_if (clk);
 
-
-ec_fp_mult_mod #(
-  .P             ( P        ),
-  .KARATSUBA_LVL ( 3        ),
-  .CTL_BITS      ( CTL_BITS )
+accum_mult_mod #(
+  .DAT_BITS ( $bits(FE_TYPE)),
+  .CTL_BITS ( CTL_BITS ),
+  .A_DSP_W  ( 26 ),
+  .B_DSP_W  ( 17 ),
+  .GRID_BIT ( 64 ),
+  .RAM_A_W  ( 8  ),
+  .RAM_D_W  ( 32 )
 )
-ec_fp_mult_mod (
-  .i_clk( clk         ),
-  .i_rst( rst         ),
+accum_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
   .i_mul ( mul_fe_o_if ),
-  .o_mul ( mul_fe_i_if )
+  .o_mul ( mul_fe_i_if ),
+  .i_ram_d  ( '0 ), // RAM load inputs are not driven by this testbench:
+  .i_ram_we ( '0 ), // tie them low instead of leaving them floating,
+  .i_ram_se ( '0 )  // matching the other accum_mult_mod testbenches
 );
 
 adder_pipe # (
diff --git a/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv b/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv
index d1c802c..b04dcbb 100644
--- a/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv
@@ -15,6 +15,8 @@
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 `timescale 1ps/1ps
+`define SIMULATION
+`define BL12_381_NEWMULT
 
 module ec_fe6_mul_s_tb ();
 
@@ -59,20 +61,28 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mnr_fe2_i_if (cl
 if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) o_mul_fe6_if (clk);
 if_axi_stream #(.DAT_BYTS((2*$bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) i_mul_fe6_if (clk);
 
-ec_fp_mult_mod #(
-  .P             ( P        ),
-  .KARATSUBA_LVL ( 3        ),
-  .CTL_BITS      ( CTL_BITS )
+
+accum_mult_mod #(
+  .DAT_BITS ( $bits(FE_TYPE) ),
+  .CTL_BITS ( CTL_BITS ),
+  .A_DSP_W  ( 26 ),
+  .B_DSP_W  ( 17 ),
+  .GRID_BIT ( 32 ),
+  .RAM_A_W  ( 8  ),
+  .RAM_D_W  ( 32 )
 )
-ec_fp_mult_mod (
-  .i_clk( clk         ),
-  .i_rst( rst         ),
+accum_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
   .i_mul ( mul_fe_o_if ),
-  .o_mul ( mul_fe_i_if )
+  .o_mul ( mul_fe_i_if ),
+  .i_ram_d  ( '0 ), // RAM load inputs are not driven by this testbench:
+  .i_ram_we ( '0 ), // tie them low instead of leaving them floating,
+  .i_ram_se ( '0 )  // matching the other accum_mult_mod testbenches
 );
 
 adder_pipe # (
-  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .BITS     ( $bits(FE_TYPE) ),
   .P        ( P        ),
   .CTL_BITS ( CTL_BITS ),
   .LEVEL    ( 2        )
@@ -85,7 +95,7 @@ adder_pipe (
 );
 
 subtractor_pipe # (
-  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .BITS     ( $bits(FE_TYPE) ),
   .P        ( P        ),
   .CTL_BITS ( CTL_BITS ),
   .LEVEL    ( 2        )
@@ -231,8 +241,6 @@ task test();
 
 endtask
 
-
-
 initial begin
   i_mul_fe6_if.reset_source();
   o_mul_fe6_if.rdy = 0;
diff --git a/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv b/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv
index a279849..ee8bee4 100644
--- a/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv
@@ -174,20 +174,28 @@ fe6_mul_by_nonresidue_i (
   .i_mnr_fe2_if ( mnr_fe2_o_if[1] )
 );
 
-ec_fp_mult_mod #(
-  .P             ( P        ),
-  .KARATSUBA_LVL ( 3        ),
-  .CTL_BITS      ( CTL_BITS )
+accum_mult_mod #(
+  .DAT_BITS ( $bits(FE_TYPE) ),
+  .MODULUS  ( P ),
+  .CTL_BITS ( CTL_BITS ),
+  .A_DSP_W  ( 26 ),
+  .B_DSP_W  ( 17 ),
+  .GRID_BIT ( 64 ),
+  .RAM_A_W  ( 8  ),
+  .RAM_D_W  ( 32 )
 )
-ec_fp_mult_mod (
-  .i_clk( clk         ),
-  .i_rst( rst         ),
+accum_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
   .i_mul ( mul_fe_in_if  ),
-  .o_mul ( mul_fe_out_if )
+  .o_mul ( mul_fe_out_if ),
+  .i_ram_d ( '0 ),
+  .i_ram_we ( '0 ),
+  .i_ram_se ( '0 )
 );
 
 adder_pipe # (
-  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .BITS     ( $bits(FE_TYPE) ),
   .P        ( P        ),
   .CTL_BITS ( CTL_BITS ),
   .LEVEL    ( 2        )
@@ -200,7 +208,7 @@ adder_pipe (
 );
 
 subtractor_pipe # (
-  .BITS     ( bls12_381_pkg::DAT_BITS ),
+  .BITS     ( $bits(FE_TYPE) ),
   .P        ( P        ),
   .CTL_BITS ( CTL_BITS ),
   .LEVEL    ( 2        )
diff --git a/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv b/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
index eba7ae7..2d9b6d1 100644
--- a/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
@@ -197,16 +197,24 @@ resource_share_add (
   .o_axi ( add_out_if[1:0] )
 );
 
-ec_fp_mult_mod #(
-  .P             ( P   ),
-  .KARATSUBA_LVL ( 3   ),
-  .CTL_BITS      ( 16  )
+accum_mult_mod #(
+  .DAT_BITS ( $bits(FE_TYPE) ),
+  .MODULUS  ( P ),
+  .CTL_BITS ( 16 ),
+  .A_DSP_W  ( 26 ),
+  .B_DSP_W  ( 17 ),
+  .GRID_BIT ( 64 ),
+  .RAM_A_W  ( 8  ),
+  .RAM_D_W  ( 32 )
 )
-ec_fp_mult_mod (
-  .i_clk( clk         ),
-  .i_rst( rst         ),
+accum_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
   .i_mul ( mult_in_if[2] ),
-  .o_mul ( mult_out_if[2] )
+  .o_mul ( mult_out_if[2] ),
+  .i_ram_d ( '0 ),
+  .i_ram_we ( '0 ),
+  .i_ram_se ( '0 )
 );
 
 adder_pipe # (
@@ -264,8 +272,6 @@ begin
 end
 endtask;
 
-logic [380:0] in_k;
-
 initial begin
   out_if.rdy = 0;
   in_if.val = 0;
diff --git a/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv b/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
index 269299b..d18a98b 100644
--- a/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
@@ -14,6 +14,7 @@
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
+
 `timescale 1ps/1ps
 
 module ec_fp_point_mult_tb ();
@@ -36,8 +37,6 @@ parameter P            = bls12_381_pkg::P;
 if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8), .CTL_BITS(KEY_BITS)) in_if(clk);
 if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8)) out_if(clk);
 
-
-
 if_axi_stream #(.DAT_BITS(2*$bits(FP_TYPE))) add_i_if(clk);
 if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) add_o_if(clk);
 if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) dbl_i_if(clk);
@@ -187,20 +186,30 @@ resource_share_add (
   .o_axi ( add_out_if[1:0] )
 );
 
-ec_fp_mult_mod #(
-  .P             ( P   ),
-  .KARATSUBA_LVL ( 3   ),
-  .CTL_BITS      ( 16  )
+
+accum_mult_mod #(
+  .DAT_BITS ( $bits(FE_TYPE) ),
+  .MODULUS  ( P ),
+  .CTL_BITS ( 16 ),
+  .A_DSP_W  ( 26 ),
+  .B_DSP_W  ( 17 ),
+  .GRID_BIT ( 64 ),
+  .RAM_A_W  ( 8  ),
+  .RAM_D_W  ( 32 )
 )
-ec_fp_mult_mod (
-  .i_clk( clk         ),
-  .i_rst( rst         ),
+accum_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
   .i_mul ( mult_in_if[2] ),
-  .o_mul ( mult_out_if[2] )
+  .o_mul ( mult_out_if[2] ),
+  .i_ram_d ( '0 ),
+  .i_ram_we ( '0 ),
+  .i_ram_se ( '0 )
 );
 
 adder_pipe # (
   .P        ( P   ),
+  .BITS     ( $bits(FE_TYPE) ),
   .CTL_BITS ( 16  ),
   .LEVEL    ( 2   )
 )
@@ -213,6 +222,7 @@ adder_pipe (
 
 subtractor_pipe # (
   .P        ( P   ),
+  .BITS     ( $bits(FE_TYPE) ),
   .CTL_BITS ( 16  ),
   .LEVEL    ( 2   )
 )
@@ -230,7 +240,7 @@ begin
   logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
   integer start_time, finish_time;
   FP_TYPE  p_out, p_exp;
-  $display("Running test with k= %d", k);
+  $display("Running test with k= %0d", k);
   p_exp = `MULT_FUNC(k, `G_POINT);
   start_time = $time;
   fork
@@ -262,6 +272,7 @@ initial begin
   #(40*CLK_PERIOD);
 
    in_k = P-1;
+   test(381'h2);
    test(381'haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);
    test(in_k);
    
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv
index 16ba363..2109691 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv
@@ -31,7 +31,12 @@ module bls12_381_axi_bridge (
   output logic [31:0] o_new_inst_pt,
   output logic        o_new_inst_pt_val,
   output logic        o_reset_inst_ram,
-  output logic        o_reset_data_ram
+  output logic        o_reset_data_ram,
+  
+  // Interface to memory used in multiplier
+  output logic [31:0] o_ram_d,
+  output logic        o_ram_we,
+  output logic        o_ram_se
 );
 
 import bls12_381_pkg::*;
@@ -47,7 +52,6 @@ logic [31:0] last_inst_cnt;
 always_ff @ (posedge i_clk) begin
   curr_inst_pt <= i_curr_inst_pt;
   last_inst_cnt <= i_last_inst_cnt;
-
 end
 
 always_ff @ (posedge i_clk) begin
@@ -63,6 +67,10 @@ always_ff @ (posedge i_clk) begin
     o_new_inst_pt <= 0;
     o_reset_inst_ram <= 0;
     o_reset_data_ram <= 0;
+    
+    o_ram_d <= 0;
+    o_ram_we <= 0;
+    o_ram_se <= 0;
   end else begin
 
     o_reset_inst_ram <= 0;
@@ -150,6 +158,13 @@ always_ff @ (posedge i_clk) begin
             o_reset_inst_ram <= axi_lite_if.wdata[0]; // This will reset the instruction ram
             o_reset_data_ram <= axi_lite_if.wdata[1]; // This will reset the data ram
           end
+          32'h18: begin
+            o_ram_d <= axi_lite_if.wdata;
+          end
+          32'h1c: begin
+            o_ram_we <= axi_lite_if.wdata[0];
+            o_ram_se <= axi_lite_if.wdata[1];
+          end          
         endcase
       end else
       if (wr_addr < DATA_AXIL_START) begin
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
index 34fc144..59cd8e7 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@@ -21,11 +21,11 @@ package bls12_381_pkg;
   localparam DAT_BITS = 381;
   localparam MUL_BITS = 384;
   localparam [DAT_BITS-1:0] P = 381'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab;
+  
+  typedef logic [380:0] fe_t;
 
-  typedef logic [DAT_BITS-1:0] fe_t;
-
-  fe_t Gx = 381'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
-  fe_t Gy = 381'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;
+  fe_t Gx = 'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
+  fe_t Gy = 'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;
 
   localparam [63:0] ATE_X = 64'hd201000000010000;
   localparam ATE_X_START = 63;
@@ -280,7 +280,7 @@ package bls12_381_pkg;
      fe2_mul[1] = fe_add(fe_mul(a[0], b[1]), fe_mul(a[1], b[0]));
    endfunction
 
-      // Function to double point in Jacobian coordinates (for comparison in testbench)
+   // Function to double point in Jacobian coordinates (for comparison in testbench)
    // Here a is 0, and we also mod the result
    function jb_point_t dbl_jb_point(input jb_point_t p);
      fe_t I_X, I_Y, I_Z, A, B, C, D, X, Y, Z;
@@ -291,6 +291,7 @@ package bls12_381_pkg;
      I_Y = p.y;
      I_Z = p.z;
      A = fe_mul(I_Y, I_Y);
+
      B = fe_mul(fe_mul(4, I_X), A);
      C = fe_mul(fe_mul(8, A), A);
      D = fe_mul(fe_mul(3, I_X), I_X);
@@ -379,7 +380,7 @@ package bls12_381_pkg;
        if (c[0]) begin
          result = add_jb_point(result, addend);
        end
-       addend = dbl_jb_point(addend);
+       addend = dbl_jb_point(addend);  
        c = c >> 1;
      end
      return result;
@@ -530,7 +531,8 @@ package bls12_381_pkg;
      a_a = fe2_mul(a[0], b[0]);  // 0. a_a = fe2_mul(a[0], b[0])
      b_b = fe2_mul(a[1], b[1]);  // 1. b_b = fe2_mul(a[1], b[1])
      c_c = fe2_mul(a[2], b[2]);  // 2. c_c = fe2_mul(a[2], b[2])
-
+     
+  
      fe6_mul[0] = fe2_add(a[1], a[2]); // 3. fe6_mul[0] = fe2_add(a[1], a[2])
      t = fe2_add(b[1], b[2]);         // 4. t =  fe2_add(b[1], b[2])
 
@@ -540,7 +542,6 @@ package bls12_381_pkg;
 
      fe6_mul[2] = fe2_add(b[0], b[2]);  // 8. fe6_mul[2] = fe2_add(b[0], b[2])
      t = fe2_add(a[0], a[2]);           // 9. t = fe2_add(a[0], a[2])    [wait 5]
-
      fe6_mul[2] = fe2_mul(fe6_mul[2], t);  // 10. fe6_mul[2] = fe2_mul(fe6_mul[2], t)   [8, 9]
      fe6_mul[2] = fe2_sub(fe6_mul[2], a_a); // 11. fe6_mul[2] = fe2_sub(fe6_mul[2], a_a)  [10, 0]
      fe6_mul[2] = fe2_add(fe6_mul[2], b_b);  // 12. fe6_mul[2] = fe2_add(fe6_mul[2], b_b) [11, 1]
@@ -600,12 +601,14 @@ package bls12_381_pkg;
 
      fe12_sqr[0] = fe6_add(fe12_sqr[0], a[0]);
      fe12_sqr[0] = fe6_mul(fe12_sqr[0], c0c1);
+     
 
      fe12_sqr[0] = fe6_sub(fe12_sqr[0], ab);
      fe12_sqr[1] = fe6_add(ab, ab);
 
      ab = fe6_mul_by_nonresidue(ab);
      fe12_sqr[0] = fe6_sub(fe12_sqr[0], ab);
+     
    endfunction
 
 
@@ -681,15 +684,14 @@ package bls12_381_pkg;
      t3 = fe2_add(R.x, t1); // 6. [4]
      t3 = fe2_mul(t3, t3); // 7. [6]
      t3 = fe2_sub(t3, t0); // 8. [7, 1]
-
      t3 = fe2_sub(t3, t2); // 9. [8, 5]
-
      t3 = fe2_add(t3, t3); // 10. [9]
-
      t6 = fe2_add(R.x, t4); // 11. [3]
      t5 = fe2_mul(t4, t4); // 12. [3]
 
      R.x = fe2_sub(t5, t3); // 13. [12, 10]
+
+    
      R.x = fe2_sub(R.x, t3); // 14. [13]
 
      R.z = fe2_add(R.z, R.y); // 15. [R.val, wait 0]
@@ -730,7 +732,7 @@ package bls12_381_pkg;
    // This performs both the line evaluation and the addition
    task automatic miller_add_step(ref fp2_jb_point_t R, input fp2_af_point_t Q, input af_point_t P, ref fe12_t f);
      fe2_t zsquared, ysquared, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
-
+     
      zsquared = fe2_mul(R.z, R.z); // 0. [R.val]
      ysquared = fe2_mul(Q.y, Q.y); // 1. [Q.val]
 
@@ -797,7 +799,7 @@ package bls12_381_pkg;
      t1[1]  = fe_mul(t1[1], P.x); // 42. [38]
 
      f = {{FE2_zero, t10, FE2_zero}, {FE2_zero, t1, t9}};
-
+     
    endtask
 
    function fe2_t fe2_fmap(input fe2_t a, input int pow);
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
index ee04004..180599b 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
@@ -49,6 +49,9 @@ logic [7:0] interrupt_hdr_byt;
 logic [READ_CYCLE:0] inst_ram_read, data_ram_read;
 logic reset_inst_ram, reset_data_ram;
 
+logic [31:0] mult_ram_d;
+logic mult_ram_we, mult_ram_se;
+
 // Instruction RAM
 if_ram #(.RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH)) inst_ram_sys_if(.i_clk(i_clk), .i_rst(i_rst || reset_inst_ram));
 if_ram #(.RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH)) inst_ram_usr_if(.i_clk(i_clk), .i_rst(i_rst || reset_inst_ram));
@@ -270,7 +273,10 @@ bls12_381_axi_bridge bls12_381_axi_bridge (
   .o_new_inst_pt      ( new_inst_pt     ),
   .o_new_inst_pt_val  ( new_inst_pt_val ),
   .o_reset_inst_ram   ( reset_inst_ram  ),
-  .o_reset_data_ram   ( reset_data_ram  )
+  .o_reset_data_ram   ( reset_data_ram  ),
+  .o_ram_d            ( mult_ram_d      ),
+  .o_ram_we           ( mult_ram_we     ),
+  .o_ram_se           ( mult_ram_se     )
 );
 
 always_comb begin
@@ -339,16 +345,24 @@ resource_share_mul (
   .o_axi ( mul_out_if[1:0] )
 );
 
-ec_fp_mult_mod #(
-  .P             ( P        ),
-  .KARATSUBA_LVL ( 3        ),
-  .CTL_BITS      ( CTL_BITS )
+accum_mult_mod #(
+  .DAT_BITS ( $bits(FE_TYPE) ),
+  .MODULUS  ( P ),
+  .CTL_BITS ( CTL_BITS ),
+  .A_DSP_W  ( 26 ),
+  .B_DSP_W  ( 17 ),
+  .GRID_BIT ( 64 ),
+  .RAM_A_W  ( 8  ),
+  .RAM_D_W  ( 32 )
 )
-ec_fp_mult_mod (
-  .i_clk( i_clk ),
-  .i_rst( i_rst ),
+accum_mult_mod (
+  .i_clk ( i_clk ),
+  .i_rst ( i_rst ),
   .i_mul ( mul_in_if[2]  ),
-  .o_mul ( mul_out_if[2] )
+  .o_mul ( mul_out_if[2] ),
+  .i_ram_d  ( mult_ram_d ),
+  .i_ram_we ( mult_ram_we ),
+  .i_ram_se ( mult_ram_se )
 );
 
 adder_pipe # (
@@ -560,7 +574,7 @@ task task_mul_element();
         new_data.dat <= mul_out_if[1].dat;
         new_data.pt <= pt_l;
         data_ram_sys_if.we <= 1;
-        cnt <= 34;
+        cnt <= 33;
       end
     end
     3: begin
@@ -625,7 +639,7 @@ task task_mul_element();
         new_data.pt <= pt_l;
         data_ram_sys_if.we <= 1;
         data_ram_sys_if.a <=  curr_inst.c + 1;
-        cnt <= 34;
+        cnt <= 33;
       end
     end
     // FE12 multiplication
@@ -633,7 +647,6 @@ task task_mul_element();
     20,21,22,23,24,25,26,27,28,29,30,31: begin
       mul_fe12_i_if.rdy <= 0;
       
-
       if (|data_ram_read[READ_CYCLE:1]== 0 && (~mul_fe12_o_if.val || (mul_fe12_o_if.val && mul_fe12_o_if.rdy))) begin
         if (data_ram_read[0]) begin
           data_ram_read[0] <= 1;
diff --git a/zcash_fpga/src/rtl/top/include.f b/zcash_fpga/src/rtl/top/include.f
index 0208da0..a781022 100644
--- a/zcash_fpga/src/rtl/top/include.f
+++ b/zcash_fpga/src/rtl/top/include.f
@@ -54,6 +54,12 @@
 ${ZCASH_DIR}/ip_cores/util/src/rtl/adder_pipe.sv
 ${ZCASH_DIR}/ip_cores/util/src/rtl/subtracter_pipe.sv
 
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/carry_save_adder_tree_level.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/compressor_tree_3_to_2.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/full_adder.sv
+
 ${ZCASH_DIR}/ip_cores/ec/src/rtl/ec_fp_mult_mod.sv
 ${ZCASH_DIR}/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv
 ${ZCASH_DIR}/ip_cores/ec/src/rtl/ec_fp2_point_add.sv
diff --git a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
index 7ab1ed2..4ff0bb9 100644
--- a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
+++ b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
@@ -27,7 +27,7 @@ package zcash_fpga_pkg;
 
   import bls12_381_pkg::point_type_t;
 
-  parameter FPGA_VERSION = 32'h01_03_00;  //v1.3.0
+  parameter FPGA_VERSION = 32'h01_04_00;  //v1.4.0
 
   // What features are enabled in this build
   parameter bit ENB_VERIFY_SECP256K1_SIG = 1;
diff --git a/zcash_fpga/src/tb/bls12_381_fe12_final_exp_tb.sv b/zcash_fpga/src/tb/bls12_381_fe12_final_exp_tb.sv
deleted file mode 100644
index 1157fa6..0000000
--- a/zcash_fpga/src/tb/bls12_381_fe12_final_exp_tb.sv
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
-  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
-
-  This program is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <https://www.gnu.org/licenses/>.
-*/
-`timescale 1ps/1ps
-
-module bls12_381_fe12_final_exp_tb ();
-
-import common_pkg::*;
-import bls12_381_pkg::*;
-
-parameter type FE_TYPE   = bls12_381_pkg::fe_t;
-parameter type FE2_TYPE  = bls12_381_pkg::fe2_t;
-parameter type FE6_TYPE  = bls12_381_pkg::fe6_t;
-parameter type FE12_TYPE = bls12_381_pkg::fe12_t;
-parameter P              = bls12_381_pkg::P;
-
-localparam POW_BITS = $bits(ATE_X);
-localparam POW_BIT =  64;
-localparam FMAP_BIT = 56;
-localparam SQ_BIT   = 60;
-localparam CTL_BITS = POW_BIT + POW_BITS;
-
-localparam CLK_PERIOD = 100;
-
-logic clk, rst;
-
-initial begin
-  rst = 0;
-  repeat(2) #(20*CLK_PERIOD) rst = ~rst;
-end
-
-initial begin
-  clk = 0;
-  forever #(CLK_PERIOD/2) clk = ~clk;
-end
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if [3:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe_i_if [3:0] (clk);
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_o_if [4:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   add_fe_i_if [4:0] (clk);
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_o_if [6:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   sub_fe_i_if [6:0] (clk);
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_o_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe2_i_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe2_o_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe2_i_if [2:0] (clk);
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe6_o_if       (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe6_i_if       (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_o_if       (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_i_if       (clk);
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if [2:0] (clk);
-
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe12_o_if      (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe12_i_if      (clk);
-
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   fmap_fe12_o_if     (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   fmap_fe12_i_if     (clk);
-
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   pow_fe12_o_if      (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   pow_fe12_i_if      (clk);
-
-if_axi_stream #(.DAT_BYTS((7+$bits(FE_TYPE))/8), .CTL_BITS(CTL_BITS))   final_exp_fe12_o_if      (clk);
-if_axi_stream #(.DAT_BYTS((7+$bits(FE_TYPE))/8), .CTL_BITS(CTL_BITS))   final_exp_fe12_i_if      (clk);
-
-ec_fp_mult_mod #(
-  .P             ( P        ),
-  .KARATSUBA_LVL ( 3        ),
-  .CTL_BITS      ( CTL_BITS )
-)
-ec_fp_mult_mod (
-  .i_clk( clk          ),
-  .i_rst( rst          ),
-  .i_mul ( mul_fe_o_if[3] ),
-  .o_mul ( mul_fe_i_if[3] )
-);
-
-adder_pipe # (
-  .BITS     ( bls12_381_pkg::DAT_BITS ),
-  .P        ( P        ),
-  .CTL_BITS ( CTL_BITS ),
-  .LEVEL    ( 2        )
-)
-adder_pipe (
-  .i_clk ( clk        ),
-  .i_rst ( rst        ),
-  .i_add ( add_fe_o_if[4] ),
-  .o_add ( add_fe_i_if[4] )
-);
-
-subtractor_pipe # (
-  .BITS     ( bls12_381_pkg::DAT_BITS ),
-  .P        ( P        ),
-  .CTL_BITS ( CTL_BITS ),
-  .LEVEL    ( 2        )
-)
-subtractor_pipe (
-  .i_clk ( clk        ),
-  .i_rst ( rst        ),
-  .i_sub ( sub_fe_o_if[6] ),
-  .o_sub ( sub_fe_i_if[6] )
-);
-
-ec_fe2_mul_s #(
-  .FE_TYPE  ( FE_TYPE  ),
-  .CTL_BITS ( CTL_BITS )
-)
-ec_fe2_mul_s (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_mul_fe2_if ( mul_fe2_i_if[2] ),
-  .i_mul_fe2_if ( mul_fe2_o_if[2] ),
-  .o_add_fe_if ( add_fe_o_if[0] ),
-  .i_add_fe_if ( add_fe_i_if[0] ),
-  .o_sub_fe_if ( sub_fe_o_if[0] ),
-  .i_sub_fe_if ( sub_fe_i_if[0] ),
-  .o_mul_fe_if ( mul_fe_o_if[0] ),
-  .i_mul_fe_if ( mul_fe_i_if[0] )
-);
-
-fe2_mul_by_nonresidue_s #(
-  .FE_TYPE  ( FE_TYPE  )
-)
-fe2_mul_by_nonresidue_s (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_mnr_fe2_if ( mnr_fe2_i_if[2] ),
-  .i_mnr_fe2_if ( mnr_fe2_o_if[2] ),
-  .o_add_fe_if ( add_fe_o_if[1] ),
-  .i_add_fe_if ( add_fe_i_if[1] ),
-  .o_sub_fe_if ( sub_fe_o_if[1] ),
-  .i_sub_fe_if ( sub_fe_i_if[1] )
-);
-
-ec_fe6_mul_s #(
-  .FE_TYPE  ( FE_TYPE  ),
-  .FE2_TYPE ( FE2_TYPE ),
-  .FE6_TYPE ( FE6_TYPE ),
-  .OVR_WRT_BIT ( 0 )
-)
-ec_fe6_mul_s (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_mul_fe2_if ( mul_fe2_o_if[0] ),
-  .i_mul_fe2_if ( mul_fe2_i_if[0] ),
-  .o_add_fe_if ( add_fe_o_if[2] ),
-  .i_add_fe_if ( add_fe_i_if[2] ),
-  .o_sub_fe_if ( sub_fe_o_if[2] ),
-  .i_sub_fe_if ( sub_fe_i_if[2] ),
-  .o_mnr_fe2_if ( mnr_fe2_o_if[0] ),
-  .i_mnr_fe2_if ( mnr_fe2_i_if[0] ),
-  .o_mul_fe6_if ( mul_fe6_i_if ),
-  .i_mul_fe6_if ( mul_fe6_o_if )
-);
-
-fe6_mul_by_nonresidue_s #(
-  .FE_TYPE  ( FE_TYPE  )
-)
-fe6_mul_by_nonresidue_s (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_mnr_fe2_if ( mnr_fe2_o_if[1] ),
-  .i_mnr_fe2_if ( mnr_fe2_i_if[1] ),
-  .o_mnr_fe6_if ( mnr_fe6_i_if ),
-  .i_mnr_fe6_if ( mnr_fe6_o_if )
-);
-
-ec_fe12_mul_s #(
-  .FE_TYPE  ( FE_TYPE  ),
-  .OVR_WRT_BIT ( 8 ),
-  .SQ_BIT      ( SQ_BIT )
-)
-ec_fe12_mul_s (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_mul_fe6_if ( mul_fe6_o_if ),
-  .i_mul_fe6_if ( mul_fe6_i_if ),
-  .o_add_fe_if ( add_fe_o_if[3] ),
-  .i_add_fe_if ( add_fe_i_if[3] ),
-  .o_sub_fe_if ( sub_fe_o_if[3] ),
-  .i_sub_fe_if ( sub_fe_i_if[3] ),
-  .o_mnr_fe6_if ( mnr_fe6_o_if ),
-  .i_mnr_fe6_if ( mnr_fe6_i_if ),
-  .o_mul_fe12_if ( mul_fe12_i_if[2] ),
-  .i_mul_fe12_if ( mul_fe12_o_if[2] )
-);
-
-bls12_381_fe12_fmap_wrapper #(
-  .FE_TYPE     ( FE_TYPE  ),
-  .CTL_BITS    ( CTL_BITS ),
-  .CTL_BIT_POW ( FMAP_BIT  )
-)
-bls12_381_fe12_fmap_wrapper (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_fmap_fe12_if ( fmap_fe12_i_if ),
-  .i_fmap_fe12_if ( fmap_fe12_o_if ),
-  .o_mul_fe2_if ( mul_fe2_o_if[1] ),
-  .i_mul_fe2_if ( mul_fe2_i_if[1] ),
-  .o_mul_fe_if ( mul_fe_o_if[1] ),
-  .i_mul_fe_if ( mul_fe_i_if[1] )
-);
-
-bls12_381_fe12_inv_wrapper #(
-  .FE_TYPE  ( FE_TYPE ),
-  .CTL_BITS ( CTL_BITS ),
-  .OVR_WRT_BIT ( 0 )
-)
-bls12_381_fe12_inv_wrapper (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_inv_fe12_if ( inv_fe12_i_if ),
-  .i_inv_fe12_if ( inv_fe12_o_if ),
-  .o_mul_fe_if   ( mul_fe_o_if[2]   ),
-  .i_mul_fe_if   ( mul_fe_i_if[2]   )
-);
-
-ec_fe12_pow_s #(
-  .FE_TYPE  ( FE_TYPE  ),
-  .CTL_BIT_POW ( POW_BIT   ),
-  .POW_BITS ( POW_BITS ),
-  .SQ_BIT   ( SQ_BIT   )
-)
-ec_fe12_pow_s (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_mul_fe12_if ( mul_fe12_o_if[0] ),
-  .i_mul_fe12_if ( mul_fe12_i_if[0] ),
-  .o_sub_fe_if ( sub_fe_o_if[4] ),
-  .i_sub_fe_if ( sub_fe_i_if[4] ),
-  .o_pow_fe12_if ( pow_fe12_i_if ),
-  .i_pow_fe12_if ( pow_fe12_o_if )
-);
-
-bls12_381_final_exponent #(
-  .OVR_WRT_BIT ( 32 ),
-  .FMAP_BIT    ( FMAP_BIT ),
-  .POW_BIT     ( POW_BIT  ),
-  .SQ_BIT      ( SQ_BIT   )
-)
-bls12_381_final_exponent (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .o_mul_fe12_if ( mul_fe12_o_if[1] ),
-  .i_mul_fe12_if ( mul_fe12_i_if[1] ),
-  .o_pow_fe12_if ( pow_fe12_o_if ),
-  .i_pow_fe12_if ( pow_fe12_i_if ),
-  .o_fmap_fe12_if ( fmap_fe12_o_if ),
-  .i_fmap_fe12_if ( fmap_fe12_i_if ),
-  .o_inv_fe12_if ( inv_fe12_o_if ),
-  .i_inv_fe12_if ( inv_fe12_i_if ),
-  .o_sub_fe_if ( sub_fe_o_if[5] ),
-  .i_sub_fe_if ( sub_fe_i_if[5] ),
-  .o_final_exp_fe12_if ( final_exp_fe12_i_if ),
-  .i_final_exp_fe12_if ( final_exp_fe12_o_if )
-);
-
-
-resource_share # (
-  .NUM_IN       ( 4                ),
-  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
-  .CTL_BITS     ( CTL_BITS         ),
-  .OVR_WRT_BIT  ( 40 ),
-  .PIPELINE_IN  ( 1                ),
-  .PIPELINE_OUT ( 1                )
-)
-resource_share_fe_add (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .i_axi ( add_fe_o_if[3:0] ),
-  .o_res ( add_fe_o_if[4]   ),
-  .i_res ( add_fe_i_if[4]   ),
-  .o_axi ( add_fe_i_if[3:0] )
-);
-
-resource_share # (
-  .NUM_IN       ( 6                ),
-  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
-  .CTL_BITS     ( CTL_BITS         ),
-  .OVR_WRT_BIT  ( 40 ),
-  .PIPELINE_IN  ( 1                ),
-  .PIPELINE_OUT ( 1                )
-)
-resource_share_fe_sub (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .i_axi ( sub_fe_o_if[5:0] ),
-  .o_res ( sub_fe_o_if[6]   ),
-  .i_res ( sub_fe_i_if[6]   ),
-  .o_axi ( sub_fe_i_if[5:0] )
-);
-
-resource_share # (
-  .NUM_IN       ( 3                ),
-  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
-  .CTL_BITS     ( CTL_BITS         ),
-  .OVR_WRT_BIT  ( 40 ),
-  .PIPELINE_IN  ( 1                ),
-  .PIPELINE_OUT ( 1                )
-)
-resource_share_fe_mul (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .i_axi ( mul_fe_o_if[2:0] ),
-  .o_res ( mul_fe_o_if[3]   ),
-  .i_res ( mul_fe_i_if[3]   ),
-  .o_axi ( mul_fe_i_if[2:0] )
-);
-
-resource_share # (
-  .NUM_IN       ( 2                ),
-  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
-  .CTL_BITS     ( CTL_BITS         ),
-  .OVR_WRT_BIT  ( 44               ),
-  .PIPELINE_IN  ( 1                ),
-  .PIPELINE_OUT ( 1                )
-)
-resource_share_fe2_mul (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .i_axi ( mul_fe2_o_if[1:0] ),
-  .o_res ( mul_fe2_o_if[2]   ),
-  .i_res ( mul_fe2_i_if[2]   ),
-  .o_axi ( mul_fe2_i_if[1:0] )
-);
-
-resource_share # (
-  .NUM_IN       ( 2                ),
-  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
-  .CTL_BITS     ( CTL_BITS         ),
-  .OVR_WRT_BIT  ( 48 ),
-  .PIPELINE_IN  ( 1                ),
-  .PIPELINE_OUT ( 1                )
-)
-resource_share_fe12_mul (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .i_axi ( mul_fe12_o_if[1:0] ),
-  .o_res ( mul_fe12_o_if[2]   ),
-  .i_res ( mul_fe12_i_if[2]   ),
-  .o_axi ( mul_fe12_i_if[1:0] )
-);
-
-resource_share # (
-  .NUM_IN       ( 2                ),
-  .DAT_BITS     ( 2*$bits(FE_TYPE) ),
-  .CTL_BITS     ( CTL_BITS         ),
-  .OVR_WRT_BIT  ( 52               ),
-  .PIPELINE_IN  ( 1                ),
-  .PIPELINE_OUT ( 1                )
-)
-resource_share_fe2_mnr (
-  .i_clk ( clk ),
-  .i_rst ( rst ),
-  .i_axi ( mnr_fe2_o_if[1:0] ),
-  .o_res ( mnr_fe2_o_if[2]   ),
-  .i_res ( mnr_fe2_i_if[2]   ),
-  .o_axi ( mnr_fe2_i_if[1:0] )
-);
-
-
-// This just tests our software model vs a known good result
-task test_sw();
-  af_point_t P;
-  fp2_af_point_t Q;
-  fe12_t f, f_exp;
-
-  $display("Running test_sw ...");
-
-  // Known good result from zcash rust code
-  f_exp = {381'h0f41e58663bf08cf068672cbd01a7ec73baca4d72ca93544deff686bfd6df543d48eaa24afe47e1efde449383b676631,
-           381'h04c581234d086a9902249b64728ffd21a189e87935a954051c7cdba7b3872629a4fafc05066245cb9108f0242d0fe3ef,
-           381'h03350f55a7aefcd3c31b4fcb6ce5771cc6a0e9786ab5973320c806ad360829107ba810c5a09ffdd9be2291a0c25a99a2,
-           381'h11b8b424cd48bf38fcef68083b0b0ec5c81a93b330ee1a677d0d15ff7b984e8978ef48881e32fac91b93b47333e2ba57,
-           381'h06fba23eb7c5af0d9f80940ca771b6ffd5857baaf222eb95a7d2809d61bfe02e1bfd1b68ff02f0b8102ae1c2d5d5ab1a,
-           381'h19f26337d205fb469cd6bd15c3d5a04dc88784fbb3d0b2dbdea54d43b2b73f2cbb12d58386a8703e0f948226e47ee89d,
-           381'h018107154f25a764bd3c79937a45b84546da634b8f6be14a8061e55cceba478b23f7dacaa35c8ca78beae9624045b4b6,
-           381'h01b2f522473d171391125ba84dc4007cfbf2f8da752f7c74185203fcca589ac719c34dffbbaad8431dad1c1fb597aaa5,
-           381'h193502b86edb8857c273fa075a50512937e0794e1e65a7617c90d8bd66065b1fffe51d7a579973b1315021ec3c19934f,
-           381'h1368bb445c7c2d209703f239689ce34c0378a68e72a6b3b216da0e22a5031b54ddff57309396b38c881c4c849ec23e87,
-           381'h089a1c5b46e5110b86750ec6a532348868a84045483c92b7af5af689452eafabf1a8943e50439f1d59882a98eaa0170f,
-           381'h1250ebd871fc0a92a7b2d83168d0d727272d441befa15c503dd8e90ce98db3e7b6d194f60839c508a84305aaca1789b6};
-
-  // Output of miller loop - input to our model
-  f =    {381'h049eaeacea5c5e9ad17ab1909cb31c653b0cb7184cc9187f77a934b1189b088d4ca64d0ff60eb0b6be8805757ba3df04,
-          381'h0198faba7d94607ce154e6a711ef859a5c4623722d4136c961a801c2b984aae5838a532aae5c2211660d3b8689b8f015,
-          381'h12b091c5b34124368d2e95a7fd6cfa3b456447e49cd298de506572c5f3afb8727f2a186f0ea14bf5eed2171c4568b5c5,
-          381'h05cfef8c26f3886e502008fc1fd74b86d400c32cb432323f994c060db185e9f8519cf76afcc9969379c2967f2f6ba36a,
-          381'h0465162c766430cf4a98e217e3d765643118598715cc2538c56e933f0528f56dd6ac82507df446545a2fde77349ad37e,
-          381'h1427e91ee8eff7e7187d560c375f5da3a9f0f162192ac4277bff1b14f560355e0b5cf069f452ab4d35ce11b39facc280,
-          381'h087d1320fe5bad5c2d8e12c49e6aff41a0b80e1497bbe85682e22ed853f256041bdf97ef02bdb5d80a5f9bc31d85f25e,
-          381'h159ef660e2d84185f55c0ccae1dd7f8f71b12c0beb7a431fede9e62794d9154e9a0ce4715f64b032492459076224c99b,
-          381'h0cbc592a19a3f60c9938676b257b9c01ed9d708f9428b29e272a811d13d734485970d9d3f1c097b12bfa3d1678096b1d,
-          381'h0751a051e0beb4a0e2351a7527d813b371e189056307d718a446e4016a3df787568a842f3401768dc03b966bd1db90ac,
-          381'h0e760e96f911ae38a6042da82d7b0e30787864e725e9d5462d224c91c4497104d838d566d894564bc19e09d8af706c3f,
-          381'h05194f5785436c8debf0eb2bab4c6ef3de7dc0633c85769173777b782bf897fa45025fd03e7be941123c4ee19910e62e};
-
-  final_exponent(f);
-  $display("After final exponent:");
-  print_fe12(f);
-  assert(f == f_exp) else $fatal(1, "Test_sw final exp. did not match known good result");
-  $display("test_sw PASSED");
-
-endtask
-
-
-task test_hw();
-begin
-  integer signed get_len;
-  logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in, get_dat;
-  integer start_time, finish_time;
-  FE12_TYPE  f_in, f_out, f_exp;
-  $display("Running hw test ...");
-
-  for (int lp = 0; lp < 10; lp++) begin
-    $display("Loop %d", lp);
-    dat_in = 0;
-    for (int i = 0; i < 2; i++)
-      for (int j = 0; j < 3; j++)
-        for (int k = 0; k < 2; k++) begin
-          f_in[i][j][k] = random_vector(384/8) % P;
-          dat_in[(i*6+j*2+k)*384 +: $bits(FE_TYPE)] = {f_in[i][j][k]};
-        end
-
-    f_exp = f_in;
-    final_exponent(f_exp);
-
-    start_time = $time;
-    fork
-      final_exp_fe12_o_if.put_stream(dat_in, 12*384/8);
-      final_exp_fe12_i_if.get_stream(get_dat, get_len);
-    join
-    finish_time = $time;
-
-    for (int i = 0; i < 2; i++)
-      for (int j = 0; j < 3; j++)
-        for (int k = 0; k < 2; k++)
-          f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)];
-
-    $display("hw test finished in %d clocks", (finish_time-start_time)/(CLK_PERIOD));
-
-    if (f_exp != f_out) begin
-      $display("Input:");
-      print_fe12(f_in);
-      $display("Output:");
-      print_fe12(f_out);
-      $display("Expected:");
-      print_fe12(f_exp);
-       $fatal(1, "%m %t ERROR: output was wrong", $time);
-    end
-  end
-
-  $display("all hw tests PASSED");
-end
-endtask;
-
-initial begin
-  final_exp_fe12_o_if.reset_source();
-  final_exp_fe12_i_if.rdy = 0;
-  #100ns;
-
-  test_sw();
-  test_hw();
-
-  #1us $finish();
-end
-
-endmodule
\ No newline at end of file
diff --git a/zcash_fpga/src/tb/bls12_381_fmap_tb.sv b/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
index 791218b..d3f0fb3 100644
--- a/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
@@ -15,6 +15,8 @@
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 `timescale 1ps/1ps
+`define BL12_381_NEWMULT
+`define SIMULATION
 
 module bls12_381_fmap_tb ();
 
diff --git a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
index d74171e..aa2c831 100644
--- a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
@@ -64,16 +64,24 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_i_if(clk)
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_o_if(clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_i_if(clk);
 
-ec_fp_mult_mod #(
-  .P             ( P        ),
-  .KARATSUBA_LVL ( 3        ),
-  .CTL_BITS      ( CTL_BITS )
+accum_mult_mod #(
+  .DAT_BITS ( $bits(FE_TYPE) ),
+  .MODULUS  ( P ),
+  .CTL_BITS ( CTL_BITS ),
+  .A_DSP_W  ( 26 ),
+  .B_DSP_W  ( 17 ),
+  .GRID_BIT ( 64 ),
+  .RAM_A_W  ( 8  ),
+  .RAM_D_W  ( 32 )
 )
-ec_fp_mult_mod (
-  .i_clk( clk          ),
-  .i_rst( rst          ),
-  .i_mul ( mul_fe_o_if ),
-  .o_mul ( mul_fe_i_if )
+accum_mult_mod (
+  .i_clk ( clk ),
+  .i_rst ( rst ),
+  .i_mul ( mul_fe_o_if  ),
+  .o_mul ( mul_fe_i_if ),
+  .i_ram_d ( '0 ),
+  .i_ram_we ( '0 ),
+  .i_ram_se ( '0 )
 );
 
 bls12_381_pairing_wrapper #(