New multiplier with RAM for modulo reduction, pairing is 30% faster.

Updated several testbenches to use new multiplier.
HowToLoveChina · Sep 15, 2019 · 57ee6c8 · 57ee6c8
1 parent a598d71
commit 57ee6c8
Show file tree

Hide file tree

Showing 25 changed files with 1,117 additions and 579 deletions.
diff --git a/README.md b/README.md
@@ -39,10 +39,14 @@ These contain shared IP cores that are used by the projects in this repo. These
 * Hash map implementation
   - Fully parameterized for bit widths and uses CRC as the hashing function
 * Blocks for parsing/processing streams
-* Karabutsa multiplier
+* Karatsuba multiplier
   - Fully parameterized for number of levels
 * Barrett reduction for modulo reduction when the modulus does not allow fast reduction
   - Both a fully pipelined high performance version and a slower but smaller resource utilization version
+* Fully parallel multiplier with carry save adder tree and RAM for modular reduction
+  - Fully pipelined, 3x performance over Karatsuba + Barrett, but uses FPGA RAM  
+* Multiplier using carry tree to accumulate products with BRAM for modular reduction
+  - 3x performance over Karatsuba + Barrett approach
 * Addition and subtraction modules
   - Fully parameterized so that they can be used for large bit-width arithmetic
 * Extended Euclidean algorithm for calculating multiplicative inverses
@@ -60,7 +64,7 @@ It optionally contains the following top-level engines (you can include in a bui
   - Verifies the equihash solution and difficulty filters
 * Transparent Signature Verification Engine (secp256k1 ECDSA core)
   - Uses efficient endomorphism to reduce key bit size
-  - Signature verification calculates multiple EC point operations in parallel, using a resource-shared single fully pipelined karabutsa multiplier and quick modulo reduction technique
+  - Signature verification calculates multiple EC point operations in parallel, using a resource-shared single fully pipelined karatsuba multiplier and quick modulo reduction technique
 * BLS12-381 coprocessor (zk-SNARK accelerator)
   - Custom instruction set with 2kB instruction memory
   - 12kB Data slot URAM at curve native bit width of 381b

diff --git a/ip_cores/accum_mult_mod/data/.gitignore b/ip_cores/accum_mult_mod/data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/ip_cores/accum_mult_mod/scripts/generate_files.py b/ip_cores/accum_mult_mod/scripts/generate_files.py
@@ -0,0 +1,310 @@
+#!/usr/bin/python3
+
+import math
+
+#  This needs to be called before simulation / synthesis to make sure the
+#  reduction ram files and include files are created.
+#
+#  Copyright (C) 2019  Benjamin Devlin and Zcash Foundation
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#  You should have received a copy of the GNU General Public License
+#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+####################
+# Generate the multiplier output to carry-save adder tree mapping
+####################
+
+BITS = 381  # Operand width in bits; get_accum_gen sizes the product as 2*BITS
+MODULUS = 0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab  # Reduction RAM entries are stored modulo this value
+A_DSP_W = 26  # Bit step between partial-product columns (x*A_DSP_W)
+B_DSP_W = 17  # Bit step between partial-product rows (y*B_DSP_W)
+GRID_BIT = 64  # Width of each accumulator coefficient window
+RAM_A_W = 10  # Address width of each reduction RAM (depth is 1 << RAM_A_W)
+
+URAM_PERCENT = 50  # RAM idx gets ram_style="ultra" while 100*idx/num_rams is below this
+USE_INIT = 1  # When truthy, an initial $readmemh line is emitted for every reduction RAM
+
+#8b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af600db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1  ^2 =
+#64a3a594868a2a4dab071ff6d880ae0f459c87e11ab01b3454b95a7d6a93f853f6e07f754b6e7933799e0afe2779a56
+
+
+RES_W = A_DSP_W+B_DSP_W  # Width of one partial product in the multiplier grid
+NUM_COL = (BITS+A_DSP_W-1)//A_DSP_W;  # ceil(BITS / A_DSP_W)
+NUM_ROW = (BITS+B_DSP_W-1)//B_DSP_W;  # ceil(BITS / B_DSP_W)
+
+A_DIFF = A_DSP_W//GRID_BIT  # Not referenced anywhere else in the visible script
+B_DIFF = B_DSP_W//GRID_BIT  # Not referenced anywhere else in the visible script
+
+
+def get_accum_gen():  # Returns the generated SystemVerilog as one string; also writes ../data/mod_ram_<n>.mem files
+  MAX_COEF = ((2*BITS)+GRID_BIT-1)//GRID_BIT  # ceil(2*BITS / GRID_BIT) windows cover the full product
+  accum_s = '\n'  # Text of the first-stage accumulators
+  ram_s = '\n'  # Text of the always_ff block that loads the RAM address registers
+  products = list()
+  # Record every grid cell as (column, row, bit offset of its partial product)
+  for x in range(NUM_COL):
+    for y in range(NUM_ROW):
+      products.append((x, y, x*A_DSP_W+y*B_DSP_W))
+
+
+  # Assign the partial products to the GRID_BIT-wide coefficient windows they overlap
+  coef = list()  # coef[i] is the list of SystemVerilog terms summed into window i
+  max_bits_l = list()  # max_bits_l[i] is the accumulator width chosen for window i
+  for i in range(MAX_COEF):
+    size = list()
+    # First pass only measures bit sizes, counted from the base of the window
+    for j in products:
+      start = max(j[2], i*GRID_BIT)
+      end = min(j[2]+RES_W, (i+1)*GRID_BIT)
+      if (end > start):
+        size.append(end-i*GRID_BIT)  # Measured from the window base rather than from start
+    # Width = widest term + ceil(log2(number of terms)); max() raises if no product overlaps the window
+    #max_bits = max(size) + math.ceil(math.log2(size.count(max(size))))
+    max_bits = max(size) + math.ceil(math.log2(len(size)))
+    max_bits_l.append(max_bits)
+
+    coef_l = list()
+    for j in products:
+      # Keep only the products that overlap this window
+      offset = (j[0]*A_DSP_W)+(j[1]*B_DSP_W)  # Same value as j[2]
+      start = max(j[2], i*GRID_BIT)
+      end = min(j[2]+RES_W, (i+1)*GRID_BIT)
+      if (end > start):
+        bitwidth = end-start
+        start_padding = max(start - i*GRID_BIT, 0)  # Zero bits below the slice
+        end_padding = max(start+max_bits-end-start_padding, 0)  # Zero bits above, so the term is max_bits wide
+        coef_l.append('{{{{{}{{1\'d0}}}},mul_grid[{}][{}][{}+:{}],{{{}{{1\'d0}}}}}}'.format(end_padding, j[0], j[1], start-offset, bitwidth, start_padding))
+
+
+
+    coef.append(coef_l)
+
+  # Emit one accumulator per window: direct register, plain add, or a compressor tree for 3+ terms
+  for idx, i in enumerate(coef):
+    if (len(i) == 1):
+      accum_s +='''
+// Coef {}
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= {};
+'''.format(idx, idx, i[0])
+    elif (len(i) == 2):
+      accum_s +='''
+// Coef {}
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= {};
+'''.format(idx, idx, ' + '.join(i))
+    else:
+      accum_s +='''
+// Coef {}
+logic [{}:0] accum_i_{} [{}];
+logic [{}:0] accum_o_c_{}, accum_o_s_{};
+compressor_tree_3_to_2 #(
+  .NUM_ELEMENTS({}),
+  .BIT_LEN({})
+)
+ct_{} (
+  .terms(accum_i_{}),
+  .C(accum_o_c_{}),
+  .S(accum_o_s_{})
+);
+always_comb accum_i_{} = {{{}}};
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= accum_o_c_{} + accum_o_s_{};
+'''.format(idx, max_bits_l[idx]-1, idx, len(i), max_bits_l[idx]-1, idx, idx, len(i), max_bits_l[idx], idx, idx, idx, idx, idx, ','.join(i), idx, idx, idx)
+
+  # Accumulator bits at or above MODULUS.bit_length() become lookup-RAM address bits;
+  # each RAM entry holds the weighted address value modulo MODULUS and is summed in the second stage
+
+  curr_bit = 0
+  curr_bit_cnt = 0  # Not used again in this function
+  coef = 0  # Rebinds coef: from here on it is an integer window index, not the list of terms
+  ram_bit_low = 0  # Address bits of the current RAM that are already assigned
+  ram_addr_bits = list()  # Number of used address bits for each completed RAM
+
+  curr_bit = MODULUS.bit_length() % GRID_BIT  # Bit position inside the window where reduction starts
+  coef = (MODULUS.bit_length()//GRID_BIT)  # Window in which reduction starts
+  reduc_coef = coef
+  reduc_bit = curr_bit
+  ram_s += 'always_ff @ (posedge i_clk) if (o_mul.rdy) begin\n'
+  mem_s = ''
+  # Consume every accumulator bit from (reduc_coef, reduc_bit) upward
+  while(coef < MAX_COEF):
+    # Take the bits left in this coef, limited by the free address bits of the current RAM
+    max_bits = min(max_bits_l[coef]-curr_bit, RAM_A_W-ram_bit_low)
+    ram_s += '  mod_ram_{}_a[{}+:{}] <= accum_grid_o[{}][{}+:{}];\n'.format(len(ram_addr_bits), ram_bit_low, max_bits, coef, curr_bit, max_bits)
+
+    if ((ram_bit_low + max_bits == RAM_A_W) or (coef == MAX_COEF - 1 and curr_bit + max_bits == max_bits_l[coef])):  # RAM address full, or last bits of the last coef
+      if (ram_bit_low + max_bits != RAM_A_W):
+        ram_s += '  mod_ram_{}_a[{}+:{}] <= 0;\n'.format(len(ram_addr_bits), ram_bit_low+max_bits, RAM_A_W-(ram_bit_low+max_bits))  # Tie unused address bits to zero
+
+      # Generate the init file lines - need to take into account earlier address bits
+      max_bits_value = max_bits + ram_bit_low  # Address bits actually used by this RAM
+      #print("max_bits {} ram_bit_low {}".format( max_bits, ram_bit_low))
+      for i in range(1 << max_bits_value):
+        # The value of a bit here will depend on the GRID and position of bit
+        # Assume (?) any bits not in this GRID are from previous
+        if (ram_bit_low != 0):
+          bit_l = i % (1 << ram_bit_low)
+          value_l = bit_l << ((max_bits_l[coef-1]-ram_bit_low)+(coef-1)*GRID_BIT)  # Low address bits weighted as the top ram_bit_low bits of coef-1
+        else:
+          value_l = 0
+        bit_h = (i >> ram_bit_low)
+        value_h = bit_h << (coef*GRID_BIT + curr_bit)  # Remaining address bits weighted from curr_bit of this coef
+        value = hex((value_l + value_h) % MODULUS)[2:]
+
+        mem_s += "{}\n".format(value.zfill(math.ceil(MODULUS.bit_length()/4)))  # One zero-padded hex word per address
+
+      f = open('../data/mod_ram_{}.mem'.format(len(ram_addr_bits)), 'w')  # Path is relative to the current working directory
+      f.write(mem_s)
+      f.close()
+      mem_s = ''
+
+      ram_addr_bits.append(ram_bit_low + max_bits)
+      ram_bit_low = 0
+    else:
+      ram_bit_low += max_bits  # RAM not full yet; keep filling it from the next coef
+
+
+    if (curr_bit + max_bits == max_bits_l[coef]):  # This coef is fully consumed
+      coef += 1
+      curr_bit = 0
+    else:
+      curr_bit += max_bits
+
+  ram_s += 'end\n'
+  # Declare each reduction RAM: address, registered read data, write data and the memory array
+  ram_s1 = ''
+  for idx, i in enumerate(ram_addr_bits):
+    uram_s = '(* ram_style="ultra" *)' if URAM_PERCENT > 100*idx/len(ram_addr_bits) else ''
+    init_s = 'initial $readmemh( "mod_ram_{}.mem", mod_ram_{}_ram);'.format(idx, idx) if USE_INIT else ''
+    ram_s1 += '''
+logic [{}:0]    mod_ram_{}_a;
+(* DONT_TOUCH = "yes" *) logic [{}:0]    mod_ram_{}_q;
+logic [{}:0]    mod_ram_{}_d;
+{}logic [{}:0]    mod_ram_{}_ram [{}];
+always_ff @ (posedge i_clk) if (o_mul.rdy) begin
+  mod_ram_{}_q <= mod_ram_{}_ram[mod_ram_{}_a];
+end
+{}
+'''.format(RAM_A_W-1, idx, MODULUS.bit_length()-1, idx, MODULUS.bit_length()-1, idx, uram_s, MODULUS.bit_length()-1, idx, 1 << RAM_A_W, idx, idx, idx, init_s)
+
+  # Second stage: per window below the modulus width, sum every RAM output slice with the unreduced accum_grid_o bits
+  accum2_s = '\n'
+  for coef in range(math.ceil(MODULUS.bit_length()/GRID_BIT)):
+    # The RAM word is MODULUS.bit_length() bits wide, so its top window holds only reduc_bit bits
+    if (coef == reduc_coef):
+      ram_bits = min(GRID_BIT, reduc_bit)
+    else:
+      ram_bits = GRID_BIT
+    padding = max_bits_l[coef] - ram_bits  # Overwritten below once max_bits_l[coef] has been widened
+    #if (padding == 0):
+    max_bits_l[coef] += math.ceil(math.log2(len(ram_addr_bits)))  # Mutates max_bits_l; logic_s below sees the widened values
+    padding = max_bits_l[coef] - ram_bits
+    in_s = ['{{{{{}{{1\'d0}}}}, mod_ram_{}_q[{}+:{}]}}'.format(padding, i, coef*GRID_BIT, ram_bits) for i in range(len(ram_addr_bits))]
+    # In the window where reduction starts, only the bits below reduc_bit are added directly
+    end = max_bits_l[coef]-1
+    padding = 0
+    if (reduc_coef == coef):
+      padding = end - reduc_bit
+      end = reduc_bit-1  # Bits from reduc_bit upward were routed to the RAM addresses instead
+    in_s.append('{{{{{}{{1\'d0}}}}, accum_grid_o_rr[{}][{}:0]}}'.format(padding, coef, end))
+    accum2_s +='''
+// Coef {} accum 2 stage
+logic [{}:0] accum2_i_{} [{}];
+logic [{}:0] accum2_o_c_{}, accum2_o_s_{};
+compressor_tree_3_to_2 #(
+  .NUM_ELEMENTS({}),
+  .BIT_LEN({})
+)
+ct2_{} (
+  .terms(accum2_i_{}),
+  .C(accum2_o_c_{}),
+  .S(accum2_o_s_{})
+);
+always_comb accum2_i_{} = {{{}}};
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum2_grid_o[{}] <= accum2_o_c_{} + accum2_o_s_{};
+'''.format(coef, max_bits_l[coef]-1, coef, len(ram_addr_bits)+1, max_bits_l[coef]-1, coef, coef, len(ram_addr_bits)+1, max_bits_l[coef], coef, coef, coef, coef, coef, ','.join(in_s), coef, coef, coef)
+
+  ram_s = ram_s1 + ram_s  # RAM declarations first, then the address-register always_ff block
+
+  # Final reduction level: look up the bits of res0_r above the modulus width in mod_ram_0_ram and add the result back
+  accum3_s = '''
+logic [{}:0]    mod_ram2_0_a;
+logic [{}:0]    mod_ram2_0_q;
+always_comb begin
+  mod_ram2_0_a = res0_r[{}+:{}];
+end
+always_ff @ (posedge i_clk) if (o_mul.rdy) begin
+  mod_ram2_0_q <= mod_ram_0_ram[mod_ram2_0_a];
+end
+
+always_comb begin
+  res1_c = res0_rr[{}:0] + mod_ram2_0_q;
+  res1_m_c = res0_rr[{}:0] + mod_ram2_0_q - MODULUS;
+  res1_m_c_ = res0_rr[{}:0] + mod_ram2_0_q - 2*MODULUS;
+end
+'''.format(RAM_A_W-1, MODULUS.bit_length()-1, MODULUS.bit_length(), RAM_A_W, MODULUS.bit_length()-1, MODULUS.bit_length()-1, MODULUS.bit_length()-1)
+
+  # Declare the accumulator arrays, all as wide as the largest entry of max_bits_l
+  logic_s = '''
+
+logic [{}:0]                  accum_grid_o [{}];
+logic [{}:0]                  accum_grid_o_r [{}];
+logic [{}:0]                  accum_grid_o_rr [{}];
+logic [{}:0]                  accum2_grid_o [{}];
+'''.format(max(max_bits_l)-1, MAX_COEF, max(max_bits_l)-1, MAX_COEF//2, max(max_bits_l)-1, MAX_COEF//2, max(max_bits_l)-1, MAX_COEF//2)
+
+  # Add logic for writing to memory: ram_we / ram_d / ram_se are delayed through RAM_PIPE-deep shift registers
+  # The mod_ram_<n>_d registers form one long scan chain fed RAM_D_W bits at a time from ram_d
+  ram_write_s = '''
+localparam int RAM_PIPE = 4;
+logic [RAM_PIPE:0][RAM_A_W-1:0] addr;
+logic [RAM_PIPE:0][RAM_D_W-1:0] ram_d;
+logic [RAM_PIPE:0]              ram_we;
+logic [RAM_PIPE:0]              ram_se;
+
+always_ff @ (posedge i_clk) begin
+  if (i_rst) begin
+    addr <= 0;
+    ram_we <= 0;
+    ram_se <= 0;
+    ram_d <= 0;
+  end else begin
+    ram_we <= {ram_we, i_ram_we};
+    ram_d  <= {ram_d, i_ram_d};
+    ram_se <= {ram_se, i_ram_se};
+    if (ram_we[RAM_PIPE]) begin
+      addr <= addr + 1;'''
+  for idx, i in enumerate(ram_addr_bits):  # On write enable every RAM stores its d register at addr
+    ram_write_s+= '''
+      mod_ram_{}_ram[addr] <= mod_ram_{}_d;'''.format(idx, idx)
+  ram_write_s += '''
+    end
+'''
+  ram_write_s += '''
+    if (ram_se[RAM_PIPE]) begin'''
+  for idx, i in enumerate(ram_addr_bits):  # On shift enable RAM 0 takes ram_d[RAM_PIPE]; later RAMs take the top bits of the previous d register
+    previous_ram = "ram_d[RAM_PIPE]" if idx == 0 else "mod_ram_{}_d[{}:({}%RAM_D_W)]".format(idx-1, MODULUS.bit_length()-1, MODULUS.bit_length())
+    ram_write_s += '''
+      mod_ram_{}_d <= {{mod_ram_{}_d, {}}};'''.format(idx, idx, previous_ram)
+
+  ram_write_s += '''
+    end
+  end
+end
+'''
+  return logic_s + accum_s + ram_s + accum2_s + accum3_s + ram_write_s  # Declarations, stage 1, RAMs, stage 2, final reduction, RAM write logic
+
+
+
+f = open('../src/rtl/accum_mult_mod_generated.sv', 'w')
+f.write(get_accum_gen())
+f.close()
+
diff --git a/ip_cores/accum_mult_mod/src/rtl/.gitignore b/ip_cores/accum_mult_mod/src/rtl/.gitignore
@@ -0,0 +1,2 @@
+accum_mult_mod_generated.sv
+!.gitignore