diff --git a/README.md b/README.md
index 50f0ac1..b9c4039 100644
--- a/README.md
+++ b/README.md
@@ -39,10 +39,14 @@ These contain shared IP cores that are used by the projects in this repo. These
* Hash map implementation
- Fully parameterized for bit widths and uses CRC as the hashing function
* Blocks for parsing/processing streams
-* Karabutsa multiplier
+* Karatsuba multiplier
- Fully parameterized for number of levels
* Barret reduction for modulo reduction when the modulus does not allow fast reduction
- Both a fully pipelined high performance version and a slower but smaller resource utilization version
+* Fully parallel multiplier with carry save adder tree and RAM for modular reduction
+ - Fully pipelined, 3x performance over Karatsuba + Barrett, but uses FPGA RAM
+* Multiplier using carry tree to accumulate products with BRAM for modular reduction
+ - 3x performance over Karatsuba + Barrett approach
* Addition and subtraction modules
- Fully parameterized so that they can be used for large bit-width arithmetic
* Extended Euclidean algorithm for calculating multiplicative inverses
@@ -60,7 +64,7 @@ It optionally contains the following top-level engines (you can include in a bui
- Verifies the equihash solution and difficulty filters
* Transparent Signature Verification Engine (secp256k1 ECDSA core)
- Uses efficient endomorphism to reduce key bit size
- - Signature verification calculates multiple EC point operations in parallel, using a resource-shared single fully pipelined karabutsa multiplier and quick modulo reduction technique
+ - Signature verification calculates multiple EC point operations in parallel, using a resource-shared single fully pipelined karatsuba multiplier and quick modulo reduction technique
* BLS12-381 coprocessor (zk-SNARK accelerator)
- Custom instruction set with 2kB instruction memory
- 12kB Data slot URAM at curve native bit width of 381b
diff --git a/ip_cores/accum_mult_mod/data/.gitignore b/ip_cores/accum_mult_mod/data/.gitignore
new file mode 100644
index 0000000..a3a0c8b
--- /dev/null
+++ b/ip_cores/accum_mult_mod/data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/scripts/generate_files.py b/ip_cores/accum_mult_mod/scripts/generate_files.py
new file mode 100644
index 0000000..5d03de6
--- /dev/null
+++ b/ip_cores/accum_mult_mod/scripts/generate_files.py
@@ -0,0 +1,310 @@
+#!/usr/bin/python3
+
+import math
+
+# This needs to be run from the scripts/ directory before simulation / synthesis:
+# it writes the reduction RAM files to ../data and the include file to ../src/rtl.
+#
+# Copyright (C) 2019 Benjamin Devlin and Zcash Foundation
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+####################
+# Generate the multiplier output to carry-save adder tree mapping
+####################
+
+BITS = 381
+MODULUS = 0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab
+A_DSP_W = 26
+B_DSP_W = 17
+GRID_BIT = 64
+RAM_A_W = 10
+
+URAM_PERCENT = 50
+USE_INIT = 1
+
+#8b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af600db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 ^2 =
+#64a3a594868a2a4dab071ff6d880ae0f459c87e11ab01b3454b95a7d6a93f853f6e07f754b6e7933799e0afe2779a56
+
+
+RES_W = A_DSP_W+B_DSP_W
+NUM_COL = (BITS+A_DSP_W-1)//A_DSP_W;
+NUM_ROW = (BITS+B_DSP_W-1)//B_DSP_W;
+
+A_DIFF = A_DSP_W//GRID_BIT
+B_DIFF = B_DSP_W//GRID_BIT
+
+
+def get_accum_gen():
+ MAX_COEF = ((2*BITS)+GRID_BIT-1)//GRID_BIT
+ accum_s = '\n'
+ ram_s = '\n'
+ products = list()
+ # Make a list of all offsets where products start
+ for x in range(NUM_COL):
+ for y in range(NUM_ROW):
+ products.append((x, y, x*A_DSP_W+y*B_DSP_W))
+
+
+ # Now match these to coef
+ coef = list()
+ max_bits_l = list()
+ for i in range(MAX_COEF):
+ size = list()
+ # First do a pass just to check bit sizes - also need to account for offset
+ for j in products:
+ start = max(j[2], i*GRID_BIT)
+ end = min(j[2]+RES_W, (i+1)*GRID_BIT)
+ if (end > start):
+ size.append(end-i*GRID_BIT)#start)
+ # Max bits = widest term + clog2(number of terms) so the sum cannot overflow
+ #max_bits = max(size) + math.ceil(math.log2(size.count(max(size))))
+ max_bits = max(size) + math.ceil(math.log2(len(size)))
+ max_bits_l.append(max_bits)
+
+ coef_l = list()
+ for j in products:
+ # Check if we are in range
+ offset = (j[0]*A_DSP_W)+(j[1]*B_DSP_W)
+ start = max(j[2], i*GRID_BIT)
+ end = min(j[2]+RES_W, (i+1)*GRID_BIT)
+ if (end > start):
+ bitwidth = end-start
+ start_padding = max(start - i*GRID_BIT, 0)
+ end_padding = max(start+max_bits-end-start_padding, 0)
+ coef_l.append('{{{{{}{{1\'d0}}}},mul_grid[{}][{}][{}+:{}],{{{}{{1\'d0}}}}}}'.format(end_padding, j[0], j[1], start-offset, bitwidth, start_padding))
+
+
+
+ coef.append(coef_l)
+
+ # Create compressor trees and output
+ for idx, i in enumerate(coef):
+ if (len(i) == 1):
+ accum_s +='''
+// Coef {}
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= {};
+'''.format(idx, idx, i[0])
+ elif (len(i) == 2):
+ accum_s +='''
+// Coef {}
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= {};
+'''.format(idx, idx, ' + '.join(i))
+ else:
+ accum_s +='''
+// Coef {}
+logic [{}:0] accum_i_{} [{}];
+logic [{}:0] accum_o_c_{}, accum_o_s_{};
+compressor_tree_3_to_2 #(
+ .NUM_ELEMENTS({}),
+ .BIT_LEN({})
+)
+ct_{} (
+ .terms(accum_i_{}),
+ .C(accum_o_c_{}),
+ .S(accum_o_s_{})
+);
+always_comb accum_i_{} = {{{}}};
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum_grid_o[{}] <= accum_o_c_{} + accum_o_s_{};
+'''.format(idx, max_bits_l[idx]-1, idx, len(i), max_bits_l[idx]-1, idx, idx, len(i), max_bits_l[idx], idx, idx, idx, idx, idx, ','.join(i), idx, idx, idx)
+
+ # If the bits of this coef are above the modulus, we start generating lookup RAM
+ # and output of RAM goes into adder trees together with other partial products
+
+ curr_bit = 0
+ curr_bit_cnt = 0
+ coef = 0
+ ram_bit_low = 0
+ ram_addr_bits = list()
+
+ curr_bit = MODULUS.bit_length() % GRID_BIT
+ coef = (MODULUS.bit_length()//GRID_BIT)
+ reduc_coef = coef
+ reduc_bit = curr_bit
+ ram_s += 'always_ff @ (posedge i_clk) if (o_mul.rdy) begin\n'
+ mem_s = ''
+ #Reduce all bits after this
+ while(coef < MAX_COEF):
+ # Get max bits we can take from this coef
+ max_bits = min(max_bits_l[coef]-curr_bit, RAM_A_W-ram_bit_low)
+ ram_s += ' mod_ram_{}_a[{}+:{}] <= accum_grid_o[{}][{}+:{}];\n'.format(len(ram_addr_bits), ram_bit_low, max_bits, coef, curr_bit, max_bits)
+
+ if ((ram_bit_low + max_bits == RAM_A_W) or (coef == MAX_COEF - 1 and curr_bit + max_bits == max_bits_l[coef])):
+ if (ram_bit_low + max_bits != RAM_A_W):
+ ram_s += ' mod_ram_{}_a[{}+:{}] <= 0;\n'.format(len(ram_addr_bits), ram_bit_low+max_bits, RAM_A_W-(ram_bit_low+max_bits))
+
+ # Generate the init file lines - need to take into account earlier address bits
+ max_bits_value = max_bits + ram_bit_low
+ #print("max_bits {} ram_bit_low {}".format( max_bits, ram_bit_low))
+ for i in range(1 << max_bits_value):
+ # The value of a bit here will depend on the GRID and position of bit
+ # Assume (?) any bits not in this GRID are from previous
+ if (ram_bit_low != 0):
+ bit_l = i % (1 << ram_bit_low)
+ value_l = bit_l << ((max_bits_l[coef-1]-ram_bit_low)+(coef-1)*GRID_BIT)
+ else:
+ value_l = 0
+ bit_h = (i >> ram_bit_low)
+ value_h = bit_h << (coef*GRID_BIT + curr_bit)
+ value = hex((value_l + value_h) % MODULUS)[2:]
+
+ mem_s += "{}\n".format(value.zfill(math.ceil(MODULUS.bit_length()/4)))
+
+ f = open('../data/mod_ram_{}.mem'.format(len(ram_addr_bits)), 'w')
+ f.write(mem_s)
+ f.close()
+ mem_s = ''
+
+ ram_addr_bits.append(ram_bit_low + max_bits)
+ ram_bit_low = 0
+ else:
+ ram_bit_low += max_bits
+
+
+ if (curr_bit + max_bits == max_bits_l[coef]):
+ coef += 1
+ curr_bit = 0
+ else:
+ curr_bit += max_bits
+
+ ram_s += 'end\n'
+ # Add the RAMs
+ ram_s1 = ''
+ for idx, i in enumerate(ram_addr_bits):
+ uram_s = '(* ram_style="ultra" *)' if URAM_PERCENT > 100*idx/len(ram_addr_bits) else ''
+ init_s = 'initial $readmemh( "mod_ram_{}.mem", mod_ram_{}_ram);'.format(idx, idx) if USE_INIT else ''
+ ram_s1 += '''
+logic [{}:0] mod_ram_{}_a;
+(* DONT_TOUCH = "yes" *) logic [{}:0] mod_ram_{}_q;
+logic [{}:0] mod_ram_{}_d;
+{}logic [{}:0] mod_ram_{}_ram [{}];
+always_ff @ (posedge i_clk) if (o_mul.rdy) begin
+ mod_ram_{}_q <= mod_ram_{}_ram[mod_ram_{}_a];
+end
+{}
+'''.format(RAM_A_W-1, idx, MODULUS.bit_length()-1, idx, MODULUS.bit_length()-1, idx, uram_s, MODULUS.bit_length()-1, idx, 1 << RAM_A_W, idx, idx, idx, init_s)
+
+ # We now generate the tree adders to sum the reduction values with the accum_grid_o values
+ accum2_s = '\n'
+ for coef in range(math.ceil(MODULUS.bit_length()/GRID_BIT)):
+ # Make sure we have the right bit widths
+ if (coef == reduc_coef):
+ ram_bits = min(GRID_BIT, reduc_bit)
+ else:
+ ram_bits = GRID_BIT
+ padding = max_bits_l[coef] - ram_bits
+ #if (padding == 0):
+ max_bits_l[coef] += math.ceil(math.log2(len(ram_addr_bits)))
+ padding = max_bits_l[coef] - ram_bits
+ in_s = ['{{{{{}{{1\'d0}}}}, mod_ram_{}_q[{}+:{}]}}'.format(padding, i, coef*GRID_BIT, ram_bits) for i in range(len(ram_addr_bits))]
+ # Need to check if we also had reduction in this range
+ end = max_bits_l[coef]-1
+ padding = 0
+ if (reduc_coef == coef):
+ padding = end - reduc_bit
+ end = reduc_bit-1
+ in_s.append('{{{{{}{{1\'d0}}}}, accum_grid_o_rr[{}][{}:0]}}'.format(padding, coef, end))
+ accum2_s +='''
+// Coef {} accum 2 stage
+logic [{}:0] accum2_i_{} [{}];
+logic [{}:0] accum2_o_c_{}, accum2_o_s_{};
+compressor_tree_3_to_2 #(
+ .NUM_ELEMENTS({}),
+ .BIT_LEN({})
+)
+ct2_{} (
+ .terms(accum2_i_{}),
+ .C(accum2_o_c_{}),
+ .S(accum2_o_s_{})
+);
+always_comb accum2_i_{} = {{{}}};
+always_ff @ (posedge i_clk) if (o_mul.rdy) accum2_grid_o[{}] <= accum2_o_c_{} + accum2_o_s_{};
+'''.format(coef, max_bits_l[coef]-1, coef, len(ram_addr_bits)+1, max_bits_l[coef]-1, coef, coef, len(ram_addr_bits)+1, max_bits_l[coef], coef, coef, coef, coef, coef, ','.join(in_s), coef, coef, coef)
+
+ ram_s = ram_s1 + ram_s
+
+ # We also need to do a final level reduction
+ accum3_s = '''
+logic [{}:0] mod_ram2_0_a;
+logic [{}:0] mod_ram2_0_q;
+always_comb begin
+ mod_ram2_0_a = res0_r[{}+:{}];
+end
+always_ff @ (posedge i_clk) if (o_mul.rdy) begin
+ mod_ram2_0_q <= mod_ram_0_ram[mod_ram2_0_a];
+end
+
+always_comb begin
+ res1_c = res0_rr[{}:0] + mod_ram2_0_q;
+ res1_m_c = res0_rr[{}:0] + mod_ram2_0_q - MODULUS;
+ res1_m_c_ = res0_rr[{}:0] + mod_ram2_0_q - 2*MODULUS;
+end
+'''.format(RAM_A_W-1, MODULUS.bit_length()-1, MODULUS.bit_length(), RAM_A_W, MODULUS.bit_length()-1, MODULUS.bit_length()-1, MODULUS.bit_length()-1)
+
+ # We also generate the arrays since we know the max sizes
+ logic_s = '''
+
+logic [{}:0] accum_grid_o [{}];
+logic [{}:0] accum_grid_o_r [{}];
+logic [{}:0] accum_grid_o_rr [{}];
+logic [{}:0] accum2_grid_o [{}];
+'''.format(max(max_bits_l)-1, MAX_COEF, max(max_bits_l)-1, MAX_COEF//2, max(max_bits_l)-1, MAX_COEF//2, max(max_bits_l)-1, MAX_COEF//2)
+
+ # Add logic for writing to memory
+ # Make long scan chain, width of RAM_D_W
+ ram_write_s = '''
+localparam int RAM_PIPE = 4;
+logic [RAM_PIPE:0][RAM_A_W-1:0] addr;
+logic [RAM_PIPE:0][RAM_D_W-1:0] ram_d;
+logic [RAM_PIPE:0] ram_we;
+logic [RAM_PIPE:0] ram_se;
+
+always_ff @ (posedge i_clk) begin
+ if (i_rst) begin
+ addr <= 0;
+ ram_we <= 0;
+ ram_se <= 0;
+ ram_d <= 0;
+ end else begin
+ ram_we <= {ram_we, i_ram_we};
+ ram_d <= {ram_d, i_ram_d};
+ ram_se <= {ram_se, i_ram_se};
+ if (ram_we[RAM_PIPE]) begin
+ addr <= addr + 1;'''
+ for idx, i in enumerate(ram_addr_bits):
+ ram_write_s+= '''
+ mod_ram_{}_ram[addr] <= mod_ram_{}_d;'''.format(idx, idx)
+ ram_write_s += '''
+ end
+'''
+ ram_write_s += '''
+ if (ram_se[RAM_PIPE]) begin'''
+ for idx, i in enumerate(ram_addr_bits):
+ previous_ram = "ram_d[RAM_PIPE]" if idx == 0 else "mod_ram_{}_d[{}:({}%RAM_D_W)]".format(idx-1, MODULUS.bit_length()-1, MODULUS.bit_length())
+ ram_write_s += '''
+ mod_ram_{}_d <= {{mod_ram_{}_d, {}}};'''.format(idx, idx, previous_ram)
+
+ ram_write_s += '''
+ end
+ end
+end
+'''
+ return logic_s + accum_s + ram_s + accum2_s + accum3_s + ram_write_s
+
+
+
+f = open('../src/rtl/accum_mult_mod_generated.sv', 'w')
+f.write(get_accum_gen())
+f.close()
+
diff --git a/ip_cores/accum_mult_mod/src/rtl/.gitignore b/ip_cores/accum_mult_mod/src/rtl/.gitignore
new file mode 100644
index 0000000..467c5c9
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/.gitignore
@@ -0,0 +1,2 @@
+accum_mult_mod_generated.sv
+!.gitignore
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv
new file mode 100644
index 0000000..3b222ab
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv
@@ -0,0 +1,137 @@
+/*
+ This does a BITS multiplication using adder tree and parameterizable
+ DSP sizes. A python script generates the accum_mult_mod_generated.sv file.
+
+ Does modulus reduction using RAM tables. Multiplication and reduction has
+ latency of 9 clock cycles (PIPE) and a throughput of 1 clock cycle per result.
+
+ Copyright (C) 2019 Benjamin Devlin and Zcash Foundation
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+module accum_mult_mod #(
+ parameter DAT_BITS,
+ parameter MODULUS,
+ parameter CTL_BITS,
+ parameter A_DSP_W,
+ parameter B_DSP_W,
+ parameter GRID_BIT,
+ parameter RAM_A_W,
+ parameter RAM_D_W
+)(
+ input i_clk,
+ input i_rst,
+ if_axi_stream.sink i_mul,
+ if_axi_stream.source o_mul,
+ input [RAM_D_W-1:0] i_ram_d,
+ input i_ram_we,
+ input i_ram_se
+);
+
+localparam int TOT_DSP_W = A_DSP_W+B_DSP_W;
+localparam int NUM_COL = (DAT_BITS+A_DSP_W-1)/A_DSP_W;
+localparam int NUM_ROW = (DAT_BITS+B_DSP_W-1)/B_DSP_W;
+localparam int MAX_COEF = (2*DAT_BITS+GRID_BIT-1)/GRID_BIT;
+localparam int PIPE = 9;
+
+logic [A_DSP_W*NUM_COL-1:0] dat_a;
+logic [B_DSP_W*NUM_ROW-1:0] dat_b;
+(* DONT_TOUCH = "yes" *) logic [A_DSP_W+B_DSP_W-1:0] mul_grid [NUM_COL][NUM_ROW];
+logic [2*DAT_BITS:0] res0_c, res0_r, res0_rr;
+logic [DAT_BITS:0] res1_c, res1_m_c, res1_m_c_;
+
+// Most of the code is generated
+`include "accum_mult_mod_generated.sv"
+
+logic [PIPE-1:0] val, sop, eop;
+logic [PIPE-1:0][CTL_BITS-1:0] ctl;
+
+genvar gx, gy;
+
+// Flow control
+always_comb begin
+ i_mul.rdy = o_mul.rdy;
+ o_mul.val = val[PIPE-1];
+ o_mul.sop = sop[PIPE-1];
+ o_mul.eop = eop[PIPE-1];
+ o_mul.ctl = ctl[PIPE-1];
+ o_mul.err = 0;
+ o_mul.mod = 0;
+end
+
+always_ff @ (posedge i_clk) begin
+ if (i_rst) begin
+ val <= 0;
+ sop <= 0;
+ eop <= 0;
+ ctl <= 0;
+ end else begin
+ if (o_mul.rdy) begin
+ val <= {val, i_mul.val};
+ sop <= {sop, i_mul.sop};
+ eop <= {eop, i_mul.eop};
+ ctl <= {ctl, i_mul.ctl};
+ end
+ end
+end
+
+// Logic for handling multiple pipelines
+always_ff @ (posedge i_clk) begin
+ if (o_mul.rdy) begin
+ for (int i = 0; i < NUM_COL; i++)
+ dat_a <= 0;
+ dat_b <= 0;
+ dat_a <= i_mul.dat[0+:DAT_BITS];
+ dat_b <= i_mul.dat[DAT_BITS+:DAT_BITS];
+ end
+end
+
+
+always_ff @ (posedge i_clk) begin
+ for (int i = 0; i < NUM_COL; i++)
+ for (int j = 0; j < NUM_ROW; j++) begin
+ if (o_mul.rdy)
+ mul_grid[i][j] <= dat_a[i*A_DSP_W +: A_DSP_W] * dat_b[j*B_DSP_W +: B_DSP_W];
+ end
+end
+
+// Register lower half accumulator output while we lookup BRAM
+always_ff @ (posedge i_clk)
+ for (int i = 0; i < MAX_COEF/2; i++) begin
+ if (o_mul.rdy) begin
+ accum_grid_o_r[i] <= accum_grid_o[i];
+ accum_grid_o_rr[i] <= accum_grid_o_r[i];
+ end
+ end
+
+// Two paths to make sure we are < MODULUS
+always_comb begin
+ res0_c = 0;
+ for (int i = 0; i < MAX_COEF/2; i++)
+ res0_c += accum2_grid_o[i] << (i*GRID_BIT);
+end
+
+// We do a second level reduction to get back within MODULUS bits
+
+always_ff @ (posedge i_clk) begin
+ if (o_mul.rdy) begin
+ res0_r <= res0_c;
+ res0_rr <= res0_r;
+ // Do final adjustment
+ o_mul.dat <= res1_m_c_ < res1_c ? res1_m_c_ : res1_c < res1_m_c ? res1_c : res1_m_c;
+ end
+end
+
+endmodule
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod_wrapper.sv b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod_wrapper.sv
new file mode 100644
index 0000000..663b8b5
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod_wrapper.sv
@@ -0,0 +1,83 @@
+/*
+ Wrapper for synthesis.
+
+ Copyright (C) 2019 Benjamin Devlin and Zcash Foundation
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+module accum_mult_mod_wrapper #(
+ parameter BITS = 381,
+ parameter [380:0] MODULUS = 381'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab,
+ parameter A_DSP_W = 26,
+ parameter B_DSP_W = 17,
+ parameter GRID_BIT = 32,
+ parameter RAM_A_W = 8,
+ parameter RAM_D_W = 32
+)(
+ input i_clk,
+ input i_rst,
+ input i_val,
+ input i_rdy,
+ output logic o_val,
+ output logic o_rdy,
+ input [BITS-1:0] i_dat_a,
+ input [BITS-1:0] i_dat_b,
+ output logic [BITS-1:0] o_dat,
+ input [RAM_D_W-1:0] i_ram_d,
+ input i_ram_we,
+ input i_ram_se
+);
+
+logic [RAM_D_W-1:0] ram_d_r;
+logic ram_we_r;
+logic ram_se_r;
+
+if_axi_stream #(.DAT_BYTS(BITS*2), .CTL_BITS(8)) in_if(i_clk);
+if_axi_stream #(.DAT_BYTS(BITS), .CTL_BITS(8)) out_if(i_clk);
+
+always_ff @ (posedge i_clk) begin
+ in_if.dat[0+:BITS] <= i_dat_a;
+ in_if.dat[BITS+:BITS] <= i_dat_b;
+ o_dat <= out_if.dat;
+ in_if.val <= i_val;
+ o_rdy <= in_if.rdy;
+ out_if.rdy <= i_rdy;
+ o_val <= out_if.val;
+ ram_d_r <= i_ram_d;
+ ram_we_r <= i_ram_we;
+ ram_se_r <= i_ram_se;
+end
+
+accum_mult_mod #(
+ .DAT_BITS ( BITS ),
+ .CTL_BITS ( 8 ),
+ .MODULUS ( MODULUS ),
+ .A_DSP_W ( A_DSP_W ),
+ .B_DSP_W ( B_DSP_W ),
+ .GRID_BIT ( GRID_BIT ),
+ .RAM_A_W ( RAM_A_W ),
+ .RAM_D_W ( RAM_D_W )
+)
+accum_mult_mod (
+ .i_clk ( i_clk ),
+ .i_rst ( i_rst ),
+ .i_mul ( in_if ),
+ .o_mul ( out_if ),
+ .i_ram_d ( ram_d_r ),
+ .i_ram_we ( ram_we_r ),
+ .i_ram_se ( ram_se_r )
+);
+
+endmodule
\ No newline at end of file
diff --git a/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv b/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv
new file mode 100644
index 0000000..e3683f5
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv
@@ -0,0 +1,56 @@
+/*******************************************************************************
+ Copyright 2019 Supranational LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*******************************************************************************/
+
+/*
+ A parameterized carry save adder (CSA)
+ Loops through each input bit and feeds a full adder (FA)
+ --------------------------------
+ | CSA |
+ | for each i in BIT_LEN |
+ | ------- |
+ | | FA | |
+ A[] --> | Ai --> | | --> Si | --> S[]
+ B[] --> | Bi --> | | |
+ Cin[] --> | Cini --> | | --> Couti | --> Cout[]
+ | ------- |
+ --------------------------------
+*/
+
+module carry_save_adder
+ #(
+ parameter int BIT_LEN = 19
+ )
+ (
+ input logic [BIT_LEN-1:0] A,
+ input logic [BIT_LEN-1:0] B,
+ input logic [BIT_LEN-1:0] Cin,
+ output logic [BIT_LEN-1:0] Cout,
+ output logic [BIT_LEN-1:0] S
+ );
+
+ genvar i;
+ generate
+ for (i=0; i | | --> S
+ B --> | |
+ Cin --> | | --> Cout
+ -------
+*/
+
+module full_adder
+ (
+ input logic A,
+ input logic B,
+ input logic Cin,
+ output logic Cout,
+ output logic S
+ );
+
+ always_comb begin
+ S = A ^ B ^ Cin;
+ Cout = (A & B) | (Cin & (A ^ B));
+ end
+endmodule
diff --git a/ip_cores/accum_mult_mod/src/tb/accum_mult_mod_tb.sv b/ip_cores/accum_mult_mod/src/tb/accum_mult_mod_tb.sv
new file mode 100644
index 0000000..517e186
--- /dev/null
+++ b/ip_cores/accum_mult_mod/src/tb/accum_mult_mod_tb.sv
@@ -0,0 +1,127 @@
+/*
+ Copyright (C) 2019 Benjamin Devlin and Zcash Foundation
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+`timescale 1ps/1ps
+`define SIMULATION
+
+module accum_mult_mod_tb ();
+import common_pkg::*;
+
+localparam CLK_PERIOD = 100;
+
+logic clk, rst;
+
+parameter BITS = 381;
+parameter [BITS-1:0] MODULUS = 'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab;
+parameter A_DSP_W = 26;
+parameter B_DSP_W = 17;
+parameter GRID_BIT = 64;
+parameter RAM_A_W = 10;
+parameter RAM_D_W = 32;
+
+// This is the max size we can expect on the output
+
+if_axi_stream #(.DAT_BYTS((2*BITS+7)/8), .CTL_BITS(8)) in_if(clk);
+if_axi_stream #(.DAT_BYTS((BITS+7)/8), .CTL_BITS(8)) out_if(clk);
+
+initial begin
+ rst = 0;
+ repeat(2) #(20*CLK_PERIOD) rst = ~rst;
+end
+
+initial begin
+ clk = 0;
+ forever #CLK_PERIOD clk = ~clk;
+end
+
+// Check for errors
+always_ff @ (posedge clk)
+ if (out_if.val && out_if.err)
+ $error(1, "%m %t ERROR: output .err asserted", $time);
+
+
+accum_mult_mod #(
+ .DAT_BITS ( BITS ),
+ .MODULUS ( MODULUS ),
+ .CTL_BITS ( 8 ),
+ .A_DSP_W ( A_DSP_W ),
+ .B_DSP_W ( B_DSP_W ),
+ .GRID_BIT ( GRID_BIT ),
+ .RAM_A_W ( RAM_A_W ),
+ .RAM_D_W ( RAM_D_W )
+)
+accum_mult_mod (
+ .i_clk ( clk ),
+ .i_rst ( rst ),
+ .i_mul ( in_if ),
+ .o_mul ( out_if ),
+ .i_ram_d (),
+ .i_ram_we (),
+ .i_ram_se ()
+);
+
+task test_loop();
+begin
+ integer signed get_len;
+ logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+ logic [BITS-1:0] in_a, in_b, out;
+ logic [BITS*2-1:0] expected;
+ integer t;
+ integer i, max;
+
+ $display("Running test_loop...");
+ i = 0;
+ max = 1000;
+
+ while (i < max) begin
+ in_a = random_vector((BITS+7)/8);
+ in_b = random_vector((BITS+7)/8);
+ expected = (in_a * in_b);
+ expected = expected % MODULUS;
+
+ fork
+ in_if.put_stream({in_b, in_a}, ((BITS*2)+7)/8, i);
+ out_if.get_stream(get_dat, get_len, 0);
+ join
+
+ out = get_dat;
+
+ t = out / MODULUS;
+ out = out % MODULUS;
+
+ assert(out == expected) else begin
+ $display("Expected: 0x%0x", expected);
+ $display("Was: 0x%0x (t=%0d)", out, t);
+ $fatal(1, "ERROR: Output did not match");
+ end
+ $display("test_loop PASSED loop %d/%d - 0x%0x (t=%0d)", i, max, out, t);
+ i = i + 1;
+ end
+
+ $display("test_loop PASSED");
+end
+endtask;
+
+initial begin
+ out_if.rdy = 0;
+ in_if.reset_source();
+ #(40*CLK_PERIOD);
+
+ test_loop();
+
+ #1us $finish();
+end
+endmodule
\ No newline at end of file
diff --git a/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv b/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
index ec7b44e..aa9217f 100644
--- a/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
+++ b/ip_cores/ec/src/rtl/ec_fe6_mul_s.sv
@@ -49,7 +49,7 @@ localparam NUM_OVR_WRT_BIT = 5;
FE2_TYPE a_a, b_b, c_c, t;
FE6_TYPE out, in_a, in_b;
-logic [22:0] eq_val, eq_wait;
+logic [22:0] eq_val, eq_wait, eq_sent;
logic mul_cnt, add_cnt, sub_cnt, mnr_cnt;
logic mul_en, add_en, sub_en, mnr_en;
logic [4:0] nxt_fe2_mul, nxt_fe2_mnr, nxt_fe_add, nxt_fe_sub;
@@ -71,6 +71,7 @@ always_ff @ (posedge i_clk) begin
i_mnr_fe2_if.rdy <= 0;
eq_val <= 0;
eq_wait <= 0;
+ eq_sent <= 0;
rdy_l <= 0;
a_a <= 0;
b_b <= 0;
@@ -94,6 +95,7 @@ always_ff @ (posedge i_clk) begin
if (o_sub_fe_if.rdy) o_sub_fe_if.val <= 0;
if (o_add_fe_if.rdy) o_add_fe_if.val <= 0;
if (o_mnr_fe2_if.rdy) o_mnr_fe2_if.val <= 0;
+
if (~sub_en) get_next_sub();
if (~add_en) get_next_add();
@@ -116,6 +118,7 @@ always_ff @ (posedge i_clk) begin
if(out_cnt == 5) begin
eq_val <= 0;
eq_wait <= 0;
+ eq_sent <= 0;
rdy_l <= 0;
a_a <= 0;
b_b <= 0;
@@ -253,6 +256,7 @@ task fe2_subtraction(input int unsigned ctl, input FE2_TYPE a, b);
eq_wait[ctl] <= 1;
if (sub_cnt == 1) begin
get_next_sub();
+ eq_sent[ctl] <= 1;
end
sub_cnt <= sub_cnt + 1;
end
@@ -270,6 +274,7 @@ task fe2_addition(input int unsigned ctl, input FE2_TYPE a, b);
eq_wait[ctl] <= 1;
if (add_cnt == 1) begin
get_next_add();
+ eq_sent[ctl] <= 1;
end
add_cnt <= add_cnt + 1;
end
@@ -287,6 +292,7 @@ task fe2_multiply(input int unsigned ctl, input FE2_TYPE a, b);
eq_wait[ctl] <= 1;
if (mul_cnt == 1) begin
get_next_fe2_mul();
+ eq_sent[ctl] <= 1;
end
mul_cnt <= mul_cnt + 1;
end
@@ -303,6 +309,7 @@ task fe2_mnr(input int unsigned ctl, input FE2_TYPE a);
eq_wait[ctl] <= 1;
if (mnr_cnt == 1) begin
get_next_fe2_mnr();
+ eq_sent[ctl] <= 1;
end
mnr_cnt <= mnr_cnt + 1;
end
@@ -334,13 +341,13 @@ task get_next_add();
nxt_fe_add <= 4;
else if(~eq_wait[8] && rdy_l)
nxt_fe_add <= 8;
- else if(~eq_wait[9] && eq_wait[5] && rdy_l)
+ else if(~eq_wait[9] && eq_sent[5] && rdy_l)
nxt_fe_add <= 9;
else if (~eq_wait[12] && eq_val[11] && eq_val[1])
nxt_fe_add <= 12;
else if(~eq_wait[13] && rdy_l)
nxt_fe_add <= 13;
- else if(~eq_wait[14] && eq_wait[10] && rdy_l)
+ else if(~eq_wait[14] && eq_sent[10] && rdy_l)
nxt_fe_add <= 14;
else if(~eq_wait[19] && eq_val[18] && eq_val[0])
nxt_fe_add <= 19;
@@ -372,7 +379,7 @@ task get_next_fe2_mnr();
mnr_en <= 1;
if(~eq_wait[18] && eq_val[7])
nxt_fe2_mnr <= 18;
- else if(~eq_wait[21] && eq_wait[20])
+ else if(~eq_wait[21] && eq_sent[20])
nxt_fe2_mnr <= 21;
else
mnr_en <= 0;
diff --git a/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv b/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv
index 01d3921..8e6da3d 100644
--- a/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fe12_pow_s_tb.sv
@@ -15,6 +15,8 @@
along with this program. If not, see .
*/
`timescale 1ps/1ps
+`define SIMULATION
+`define BL12_381_NEWMULT
module ec_fe12_pow_s_tb ();
@@ -70,17 +72,23 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_i_if
if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(POW_BITS)) pow_fe12_o_if (clk);
if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(POW_BITS)) pow_fe12_i_if (clk);
-
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( CTL_BITS )
+accum_mult_mod #(
+ .DAT_BITS ( $bits(FE_TYPE)),
+ .CTL_BITS ( CTL_BITS ),
+ .A_DSP_W ( 26 ),
+ .B_DSP_W ( 17 ),
+ .GRID_BIT ( 64 ),
+ .RAM_A_W ( 8 ),
+ .RAM_D_W ( 32 )
)
-ec_fp_mult_mod (
- .i_clk( clk ),
- .i_rst( rst ),
+accum_mult_mod (
+ .i_clk ( clk ),
+ .i_rst ( rst ),
.i_mul ( mul_fe_o_if ),
- .o_mul ( mul_fe_i_if )
+ .o_mul ( mul_fe_i_if ),
+ .i_ram_d (),
+ .i_ram_we (),
+ .i_ram_se ()
);
adder_pipe # (
diff --git a/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv b/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv
index d1c802c..b04dcbb 100644
--- a/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fe6_mul_s_tb.sv
@@ -15,6 +15,8 @@
along with this program. If not, see .
*/
`timescale 1ps/1ps
+`define SIMULATION
+`define BL12_381_NEWMULT
module ec_fe6_mul_s_tb ();
@@ -59,20 +61,28 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mnr_fe2_i_if (cl
if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) o_mul_fe6_if (clk);
if_axi_stream #(.DAT_BYTS((2*$bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) i_mul_fe6_if (clk);
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( CTL_BITS )
+
+accum_mult_mod #(
+ .DAT_BITS ( $bits(FE_TYPE) ),
+ .CTL_BITS ( CTL_BITS ),
+ .A_DSP_W ( 26 ),
+ .B_DSP_W ( 17 ),
+ .GRID_BIT ( 32 ),
+ .RAM_A_W ( 8 ),
+ .RAM_D_W ( 32 )
)
-ec_fp_mult_mod (
- .i_clk( clk ),
- .i_rst( rst ),
+accum_mult_mod (
+ .i_clk ( clk ),
+ .i_rst ( rst ),
.i_mul ( mul_fe_o_if ),
- .o_mul ( mul_fe_i_if )
+ .o_mul ( mul_fe_i_if ),
+ .i_ram_d (),
+ .i_ram_we (),
+ .i_ram_se ()
);
adder_pipe # (
- .BITS ( bls12_381_pkg::DAT_BITS ),
+ .BITS ( $bits(FE_TYPE) ),
.P ( P ),
.CTL_BITS ( CTL_BITS ),
.LEVEL ( 2 )
@@ -85,7 +95,7 @@ adder_pipe (
);
subtractor_pipe # (
- .BITS ( bls12_381_pkg::DAT_BITS ),
+ .BITS ( $bits(FE_TYPE) ),
.P ( P ),
.CTL_BITS ( CTL_BITS ),
.LEVEL ( 2 )
@@ -231,8 +241,6 @@ task test();
endtask
-
-
initial begin
i_mul_fe6_if.reset_source();
o_mul_fe6_if.rdy = 0;
diff --git a/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv b/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv
index a279849..ee8bee4 100644
--- a/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp12_arithmetic_tb.sv
@@ -174,20 +174,28 @@ fe6_mul_by_nonresidue_i (
.i_mnr_fe2_if ( mnr_fe2_o_if[1] )
);
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( CTL_BITS )
+accum_mult_mod #(
+ .DAT_BITS ( $bits(FE_TYPE) ),
+ .MODULUS ( P ),
+ .CTL_BITS ( CTL_BITS ),
+ .A_DSP_W ( 26 ),
+ .B_DSP_W ( 17 ),
+ .GRID_BIT ( 64 ),
+ .RAM_A_W ( 8 ),
+ .RAM_D_W ( 32 )
)
-ec_fp_mult_mod (
- .i_clk( clk ),
- .i_rst( rst ),
+accum_mult_mod (
+ .i_clk ( clk ),
+ .i_rst ( rst ),
.i_mul ( mul_fe_in_if ),
- .o_mul ( mul_fe_out_if )
+ .o_mul ( mul_fe_out_if ),
+ .i_ram_d ( '0 ),
+ .i_ram_we ( '0 ),
+ .i_ram_se ( '0 )
);
adder_pipe # (
- .BITS ( bls12_381_pkg::DAT_BITS ),
+ .BITS ( $bits(FE_TYPE) ),
.P ( P ),
.CTL_BITS ( CTL_BITS ),
.LEVEL ( 2 )
@@ -200,7 +208,7 @@ adder_pipe (
);
subtractor_pipe # (
- .BITS ( bls12_381_pkg::DAT_BITS ),
+ .BITS ( $bits(FE_TYPE) ),
.P ( P ),
.CTL_BITS ( CTL_BITS ),
.LEVEL ( 2 )
diff --git a/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv b/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
index eba7ae7..2d9b6d1 100644
--- a/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp2_point_mult_tb.sv
@@ -197,16 +197,24 @@ resource_share_add (
.o_axi ( add_out_if[1:0] )
);
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( 16 )
+accum_mult_mod #(
+ .DAT_BITS ( $bits(FE_TYPE) ),
+ .MODULUS ( P ),
+ .CTL_BITS ( 16 ),
+ .A_DSP_W ( 26 ),
+ .B_DSP_W ( 17 ),
+ .GRID_BIT ( 64 ),
+ .RAM_A_W ( 8 ),
+ .RAM_D_W ( 32 )
)
-ec_fp_mult_mod (
- .i_clk( clk ),
- .i_rst( rst ),
+accum_mult_mod (
+ .i_clk ( clk ),
+ .i_rst ( rst ),
.i_mul ( mult_in_if[2] ),
- .o_mul ( mult_out_if[2] )
+ .o_mul ( mult_out_if[2] ),
+ .i_ram_d ( '0 ),
+ .i_ram_we ( '0 ),
+ .i_ram_se ( '0 )
);
adder_pipe # (
@@ -264,8 +272,6 @@ begin
end
endtask;
-logic [380:0] in_k;
-
initial begin
out_if.rdy = 0;
in_if.val = 0;
diff --git a/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv b/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
index 269299b..d18a98b 100644
--- a/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
+++ b/ip_cores/ec/src/tb/ec_fp_point_mult_tb.sv
@@ -14,6 +14,7 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
+
`timescale 1ps/1ps
module ec_fp_point_mult_tb ();
@@ -36,8 +37,6 @@ parameter P = bls12_381_pkg::P;
if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8), .CTL_BITS(KEY_BITS)) in_if(clk);
if_axi_stream #(.DAT_BYTS(($bits(FP_TYPE)+7)/8)) out_if(clk);
-
-
if_axi_stream #(.DAT_BITS(2*$bits(FP_TYPE))) add_i_if(clk);
if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) add_o_if(clk);
if_axi_stream #(.DAT_BITS($bits(FP_TYPE))) dbl_i_if(clk);
@@ -187,20 +186,30 @@ resource_share_add (
.o_axi ( add_out_if[1:0] )
);
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( 16 )
+
+accum_mult_mod #(
+ .DAT_BITS ( $bits(FE_TYPE) ),
+ .MODULUS ( P ),
+ .CTL_BITS ( 16 ),
+ .A_DSP_W ( 26 ),
+ .B_DSP_W ( 17 ),
+ .GRID_BIT ( 64 ),
+ .RAM_A_W ( 8 ),
+ .RAM_D_W ( 32 )
)
-ec_fp_mult_mod (
- .i_clk( clk ),
- .i_rst( rst ),
+accum_mult_mod (
+ .i_clk ( clk ),
+ .i_rst ( rst ),
.i_mul ( mult_in_if[2] ),
- .o_mul ( mult_out_if[2] )
+ .o_mul ( mult_out_if[2] ),
+ .i_ram_d ( '0 ),
+ .i_ram_we ( '0 ),
+ .i_ram_se ( '0 )
);
adder_pipe # (
.P ( P ),
+ .BITS ( $bits(FE_TYPE) ),
.CTL_BITS ( 16 ),
.LEVEL ( 2 )
)
@@ -213,6 +222,7 @@ adder_pipe (
subtractor_pipe # (
.P ( P ),
+ .BITS ( $bits(FE_TYPE) ),
.CTL_BITS ( 16 ),
.LEVEL ( 2 )
)
@@ -230,7 +240,7 @@ begin
logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
integer start_time, finish_time;
FP_TYPE p_out, p_exp;
- $display("Running test with k= %d", k);
+ $display("Running test with k= %0d", k);
p_exp = `MULT_FUNC(k, `G_POINT);
start_time = $time;
fork
@@ -262,6 +272,7 @@ initial begin
#(40*CLK_PERIOD);
in_k = P-1;
+ test(381'h2);
test(381'haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa);
test(in_k);
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv
index 16ba363..2109691 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_axi_bridge.sv
@@ -31,7 +31,12 @@ module bls12_381_axi_bridge (
output logic [31:0] o_new_inst_pt,
output logic o_new_inst_pt_val,
output logic o_reset_inst_ram,
- output logic o_reset_data_ram
+ output logic o_reset_data_ram,
+
+ // Interface to memory used in multiplier
+ output logic [31:0] o_ram_d,
+ output logic o_ram_we,
+ output logic o_ram_se
);
import bls12_381_pkg::*;
@@ -47,7 +52,6 @@ logic [31:0] last_inst_cnt;
always_ff @ (posedge i_clk) begin
curr_inst_pt <= i_curr_inst_pt;
last_inst_cnt <= i_last_inst_cnt;
-
end
always_ff @ (posedge i_clk) begin
@@ -63,6 +67,10 @@ always_ff @ (posedge i_clk) begin
o_new_inst_pt <= 0;
o_reset_inst_ram <= 0;
o_reset_data_ram <= 0;
+
+ o_ram_d <= 0;
+ o_ram_we <= 0;
+ o_ram_se <= 0;
end else begin
o_reset_inst_ram <= 0;
@@ -150,6 +158,13 @@ always_ff @ (posedge i_clk) begin
o_reset_inst_ram <= axi_lite_if.wdata[0]; // This will reset the instruction ram
o_reset_data_ram <= axi_lite_if.wdata[1]; // This will reset the data ram
end
+ 32'h18: begin
+ o_ram_d <= axi_lite_if.wdata;
+ end
+ 32'h1c: begin
+ o_ram_we <= axi_lite_if.wdata[0];
+ o_ram_se <= axi_lite_if.wdata[1];
+ end
endcase
end else
if (wr_addr < DATA_AXIL_START) begin
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
index 34fc144..59cd8e7 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@@ -21,11 +21,11 @@ package bls12_381_pkg;
localparam DAT_BITS = 381;
localparam MUL_BITS = 384;
localparam [DAT_BITS-1:0] P = 381'h1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab;
+
+ typedef logic [380:0] fe_t;
- typedef logic [DAT_BITS-1:0] fe_t;
-
- fe_t Gx = 381'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
- fe_t Gy = 381'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;
+ fe_t Gx = 'h17F1D3A73197D7942695638C4FA9AC0FC3688C4F9774B905A14E3A3F171BAC586C55E83FF97A1AEFFB3AF00ADB22C6BB;
+ fe_t Gy = 'h08B3F481E3AAA0F1A09E30ED741D8AE4FCF5E095D5D00AF600DB18CB2C04B3EDD03CC744A2888AE40CAA232946C5E7E1;
localparam [63:0] ATE_X = 64'hd201000000010000;
localparam ATE_X_START = 63;
@@ -280,7 +280,7 @@ package bls12_381_pkg;
fe2_mul[1] = fe_add(fe_mul(a[0], b[1]), fe_mul(a[1], b[0]));
endfunction
- // Function to double point in Jacobian coordinates (for comparison in testbench)
+ // Function to double point in Jacobian coordinates (for comparison in testbench)
// Here a is 0, and we also mod the result
function jb_point_t dbl_jb_point(input jb_point_t p);
fe_t I_X, I_Y, I_Z, A, B, C, D, X, Y, Z;
@@ -291,6 +291,7 @@ package bls12_381_pkg;
I_Y = p.y;
I_Z = p.z;
A = fe_mul(I_Y, I_Y);
+
B = fe_mul(fe_mul(4, I_X), A);
C = fe_mul(fe_mul(8, A), A);
D = fe_mul(fe_mul(3, I_X), I_X);
@@ -379,7 +380,7 @@ package bls12_381_pkg;
if (c[0]) begin
result = add_jb_point(result, addend);
end
- addend = dbl_jb_point(addend);
+ addend = dbl_jb_point(addend);
c = c >> 1;
end
return result;
@@ -530,7 +531,8 @@ package bls12_381_pkg;
a_a = fe2_mul(a[0], b[0]); // 0. a_a = fe2_mul(a[0], b[0])
b_b = fe2_mul(a[1], b[1]); // 1. b_b = fe2_mul(a[1], b[1])
c_c = fe2_mul(a[2], b[2]); // 2. c_c = fe2_mul(a[2], b[2])
-
+
+
fe6_mul[0] = fe2_add(a[1], a[2]); // 3. fe6_mul[0] = fe2_add(a[1], a[2])
t = fe2_add(b[1], b[2]); // 4. t = fe2_add(b[1], b[2])
@@ -540,7 +542,6 @@ package bls12_381_pkg;
fe6_mul[2] = fe2_add(b[0], b[2]); // 8. fe6_mul[2] = fe2_add(b[0], b[2])
t = fe2_add(a[0], a[2]); // 9. t = fe2_add(a[0], a[2]) [wait 5]
-
fe6_mul[2] = fe2_mul(fe6_mul[2], t); // 10. fe6_mul[2] = fe2_mul(fe6_mul[2], t) [8, 9]
fe6_mul[2] = fe2_sub(fe6_mul[2], a_a); // 11. fe6_mul[2] = fe2_sub(fe6_mul[2], a_a) [10, 0]
fe6_mul[2] = fe2_add(fe6_mul[2], b_b); // 12. fe6_mul[2] = fe2_add(fe6_mul[2], b_b) [11, 1]
@@ -600,12 +601,14 @@ package bls12_381_pkg;
fe12_sqr[0] = fe6_add(fe12_sqr[0], a[0]);
fe12_sqr[0] = fe6_mul(fe12_sqr[0], c0c1);
+
fe12_sqr[0] = fe6_sub(fe12_sqr[0], ab);
fe12_sqr[1] = fe6_add(ab, ab);
ab = fe6_mul_by_nonresidue(ab);
fe12_sqr[0] = fe6_sub(fe12_sqr[0], ab);
+
endfunction
@@ -681,15 +684,14 @@ package bls12_381_pkg;
t3 = fe2_add(R.x, t1); // 6. [4]
t3 = fe2_mul(t3, t3); // 7. [6]
t3 = fe2_sub(t3, t0); // 8. [7, 1]
-
t3 = fe2_sub(t3, t2); // 9. [8, 5]
-
t3 = fe2_add(t3, t3); // 10. [9]
-
t6 = fe2_add(R.x, t4); // 11. [3]
t5 = fe2_mul(t4, t4); // 12. [3]
R.x = fe2_sub(t5, t3); // 13. [12, 10]
+
+
R.x = fe2_sub(R.x, t3); // 14. [13]
R.z = fe2_add(R.z, R.y); // 15. [R.val, wait 0]
@@ -730,7 +732,7 @@ package bls12_381_pkg;
// This performs both the line evaluation and the addition
task automatic miller_add_step(ref fp2_jb_point_t R, input fp2_af_point_t Q, input af_point_t P, ref fe12_t f);
fe2_t zsquared, ysquared, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
-
+
zsquared = fe2_mul(R.z, R.z); // 0. [R.val]
ysquared = fe2_mul(Q.y, Q.y); // 1. [Q.val]
@@ -797,7 +799,7 @@ package bls12_381_pkg;
t1[1] = fe_mul(t1[1], P.x); // 42. [38]
f = {{FE2_zero, t10, FE2_zero}, {FE2_zero, t1, t9}};
-
+
endtask
function fe2_t fe2_fmap(input fe2_t a, input int pow);
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
index ee04004..180599b 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
@@ -49,6 +49,9 @@ logic [7:0] interrupt_hdr_byt;
logic [READ_CYCLE:0] inst_ram_read, data_ram_read;
logic reset_inst_ram, reset_data_ram;
+logic [31:0] mult_ram_d;
+logic mult_ram_we, mult_ram_se;
+
// Instruction RAM
if_ram #(.RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH)) inst_ram_sys_if(.i_clk(i_clk), .i_rst(i_rst || reset_inst_ram));
if_ram #(.RAM_WIDTH(bls12_381_pkg::INST_RAM_WIDTH), .RAM_DEPTH(bls12_381_pkg::INST_RAM_DEPTH)) inst_ram_usr_if(.i_clk(i_clk), .i_rst(i_rst || reset_inst_ram));
@@ -270,7 +273,10 @@ bls12_381_axi_bridge bls12_381_axi_bridge (
.o_new_inst_pt ( new_inst_pt ),
.o_new_inst_pt_val ( new_inst_pt_val ),
.o_reset_inst_ram ( reset_inst_ram ),
- .o_reset_data_ram ( reset_data_ram )
+ .o_reset_data_ram ( reset_data_ram ),
+ .o_ram_d ( mult_ram_d ),
+ .o_ram_we ( mult_ram_we ),
+ .o_ram_se ( mult_ram_se )
);
always_comb begin
@@ -339,16 +345,24 @@ resource_share_mul (
.o_axi ( mul_out_if[1:0] )
);
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( CTL_BITS )
+accum_mult_mod #(
+ .DAT_BITS ( $bits(FE_TYPE) ),
+ .MODULUS ( P ),
+ .CTL_BITS ( CTL_BITS ),
+ .A_DSP_W ( 26 ),
+ .B_DSP_W ( 17 ),
+ .GRID_BIT ( 64 ),
+ .RAM_A_W ( 8 ),
+ .RAM_D_W ( 32 )
)
-ec_fp_mult_mod (
- .i_clk( i_clk ),
- .i_rst( i_rst ),
+accum_mult_mod (
+ .i_clk ( i_clk ),
+ .i_rst ( i_rst ),
.i_mul ( mul_in_if[2] ),
- .o_mul ( mul_out_if[2] )
+ .o_mul ( mul_out_if[2] ),
+ .i_ram_d ( mult_ram_d ),
+ .i_ram_we ( mult_ram_we ),
+ .i_ram_se ( mult_ram_se )
);
adder_pipe # (
@@ -560,7 +574,7 @@ task task_mul_element();
new_data.dat <= mul_out_if[1].dat;
new_data.pt <= pt_l;
data_ram_sys_if.we <= 1;
- cnt <= 34;
+ cnt <= 33;
end
end
3: begin
@@ -625,7 +639,7 @@ task task_mul_element();
new_data.pt <= pt_l;
data_ram_sys_if.we <= 1;
data_ram_sys_if.a <= curr_inst.c + 1;
- cnt <= 34;
+ cnt <= 33;
end
end
// FE12 multiplication
@@ -633,7 +647,6 @@ task task_mul_element();
20,21,22,23,24,25,26,27,28,29,30,31: begin
mul_fe12_i_if.rdy <= 0;
-
if (|data_ram_read[READ_CYCLE:1]== 0 && (~mul_fe12_o_if.val || (mul_fe12_o_if.val && mul_fe12_o_if.rdy))) begin
if (data_ram_read[0]) begin
data_ram_read[0] <= 1;
diff --git a/zcash_fpga/src/rtl/top/include.f b/zcash_fpga/src/rtl/top/include.f
index 0208da0..a781022 100644
--- a/zcash_fpga/src/rtl/top/include.f
+++ b/zcash_fpga/src/rtl/top/include.f
@@ -54,6 +54,12 @@
${ZCASH_DIR}/ip_cores/util/src/rtl/adder_pipe.sv
${ZCASH_DIR}/ip_cores/util/src/rtl/subtracter_pipe.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/accum_mult_mod.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/carry_save_adder_tree_level.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/carry_save_adder.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/compressor_tree_3_to_2.sv
+${ZCASH_DIR}/ip_cores/accum_mult_mod/src/rtl/full_adder.sv
+
${ZCASH_DIR}/ip_cores/ec/src/rtl/ec_fp_mult_mod.sv
${ZCASH_DIR}/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv
${ZCASH_DIR}/ip_cores/ec/src/rtl/ec_fp2_point_add.sv
diff --git a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
index 7ab1ed2..4ff0bb9 100644
--- a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
+++ b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
@@ -27,7 +27,7 @@ package zcash_fpga_pkg;
import bls12_381_pkg::point_type_t;
- parameter FPGA_VERSION = 32'h01_03_00; //v1.3.0
+ parameter FPGA_VERSION = 32'h01_04_00; //v1.4.0
// What features are enabled in this build
parameter bit ENB_VERIFY_SECP256K1_SIG = 1;
diff --git a/zcash_fpga/src/tb/bls12_381_fe12_final_exp_tb.sv b/zcash_fpga/src/tb/bls12_381_fe12_final_exp_tb.sv
deleted file mode 100644
index 1157fa6..0000000
--- a/zcash_fpga/src/tb/bls12_381_fe12_final_exp_tb.sv
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
- Copyright (C) 2019 Benjamin Devlin and Zcash Foundation
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
-*/
-`timescale 1ps/1ps
-
-module bls12_381_fe12_final_exp_tb ();
-
-import common_pkg::*;
-import bls12_381_pkg::*;
-
-parameter type FE_TYPE = bls12_381_pkg::fe_t;
-parameter type FE2_TYPE = bls12_381_pkg::fe2_t;
-parameter type FE6_TYPE = bls12_381_pkg::fe6_t;
-parameter type FE12_TYPE = bls12_381_pkg::fe12_t;
-parameter P = bls12_381_pkg::P;
-
-localparam POW_BITS = $bits(ATE_X);
-localparam POW_BIT = 64;
-localparam FMAP_BIT = 56;
-localparam SQ_BIT = 60;
-localparam CTL_BITS = POW_BIT + POW_BITS;
-
-localparam CLK_PERIOD = 100;
-
-logic clk, rst;
-
-initial begin
- rst = 0;
- repeat(2) #(20*CLK_PERIOD) rst = ~rst;
-end
-
-initial begin
- clk = 0;
- forever #(CLK_PERIOD/2) clk = ~clk;
-end
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if [3:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_i_if [3:0] (clk);
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_o_if [4:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) add_fe_i_if [4:0] (clk);
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_o_if [6:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) sub_fe_i_if [6:0] (clk);
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_o_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe2_i_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mnr_fe2_o_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mnr_fe2_i_if [2:0] (clk);
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe6_o_if (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe6_i_if (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mnr_fe6_o_if (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mnr_fe6_i_if (clk);
-
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [2:0] (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_i_if [2:0] (clk);
-
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe12_o_if (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe12_i_if (clk);
-
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) fmap_fe12_o_if (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) fmap_fe12_i_if (clk);
-
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) pow_fe12_o_if (clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) pow_fe12_i_if (clk);
-
-if_axi_stream #(.DAT_BYTS((7+$bits(FE_TYPE))/8), .CTL_BITS(CTL_BITS)) final_exp_fe12_o_if (clk);
-if_axi_stream #(.DAT_BYTS((7+$bits(FE_TYPE))/8), .CTL_BITS(CTL_BITS)) final_exp_fe12_i_if (clk);
-
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( CTL_BITS )
-)
-ec_fp_mult_mod (
- .i_clk( clk ),
- .i_rst( rst ),
- .i_mul ( mul_fe_o_if[3] ),
- .o_mul ( mul_fe_i_if[3] )
-);
-
-adder_pipe # (
- .BITS ( bls12_381_pkg::DAT_BITS ),
- .P ( P ),
- .CTL_BITS ( CTL_BITS ),
- .LEVEL ( 2 )
-)
-adder_pipe (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_add ( add_fe_o_if[4] ),
- .o_add ( add_fe_i_if[4] )
-);
-
-subtractor_pipe # (
- .BITS ( bls12_381_pkg::DAT_BITS ),
- .P ( P ),
- .CTL_BITS ( CTL_BITS ),
- .LEVEL ( 2 )
-)
-subtractor_pipe (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_sub ( sub_fe_o_if[6] ),
- .o_sub ( sub_fe_i_if[6] )
-);
-
-ec_fe2_mul_s #(
- .FE_TYPE ( FE_TYPE ),
- .CTL_BITS ( CTL_BITS )
-)
-ec_fe2_mul_s (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_mul_fe2_if ( mul_fe2_i_if[2] ),
- .i_mul_fe2_if ( mul_fe2_o_if[2] ),
- .o_add_fe_if ( add_fe_o_if[0] ),
- .i_add_fe_if ( add_fe_i_if[0] ),
- .o_sub_fe_if ( sub_fe_o_if[0] ),
- .i_sub_fe_if ( sub_fe_i_if[0] ),
- .o_mul_fe_if ( mul_fe_o_if[0] ),
- .i_mul_fe_if ( mul_fe_i_if[0] )
-);
-
-fe2_mul_by_nonresidue_s #(
- .FE_TYPE ( FE_TYPE )
-)
-fe2_mul_by_nonresidue_s (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_mnr_fe2_if ( mnr_fe2_i_if[2] ),
- .i_mnr_fe2_if ( mnr_fe2_o_if[2] ),
- .o_add_fe_if ( add_fe_o_if[1] ),
- .i_add_fe_if ( add_fe_i_if[1] ),
- .o_sub_fe_if ( sub_fe_o_if[1] ),
- .i_sub_fe_if ( sub_fe_i_if[1] )
-);
-
-ec_fe6_mul_s #(
- .FE_TYPE ( FE_TYPE ),
- .FE2_TYPE ( FE2_TYPE ),
- .FE6_TYPE ( FE6_TYPE ),
- .OVR_WRT_BIT ( 0 )
-)
-ec_fe6_mul_s (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_mul_fe2_if ( mul_fe2_o_if[0] ),
- .i_mul_fe2_if ( mul_fe2_i_if[0] ),
- .o_add_fe_if ( add_fe_o_if[2] ),
- .i_add_fe_if ( add_fe_i_if[2] ),
- .o_sub_fe_if ( sub_fe_o_if[2] ),
- .i_sub_fe_if ( sub_fe_i_if[2] ),
- .o_mnr_fe2_if ( mnr_fe2_o_if[0] ),
- .i_mnr_fe2_if ( mnr_fe2_i_if[0] ),
- .o_mul_fe6_if ( mul_fe6_i_if ),
- .i_mul_fe6_if ( mul_fe6_o_if )
-);
-
-fe6_mul_by_nonresidue_s #(
- .FE_TYPE ( FE_TYPE )
-)
-fe6_mul_by_nonresidue_s (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_mnr_fe2_if ( mnr_fe2_o_if[1] ),
- .i_mnr_fe2_if ( mnr_fe2_i_if[1] ),
- .o_mnr_fe6_if ( mnr_fe6_i_if ),
- .i_mnr_fe6_if ( mnr_fe6_o_if )
-);
-
-ec_fe12_mul_s #(
- .FE_TYPE ( FE_TYPE ),
- .OVR_WRT_BIT ( 8 ),
- .SQ_BIT ( SQ_BIT )
-)
-ec_fe12_mul_s (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_mul_fe6_if ( mul_fe6_o_if ),
- .i_mul_fe6_if ( mul_fe6_i_if ),
- .o_add_fe_if ( add_fe_o_if[3] ),
- .i_add_fe_if ( add_fe_i_if[3] ),
- .o_sub_fe_if ( sub_fe_o_if[3] ),
- .i_sub_fe_if ( sub_fe_i_if[3] ),
- .o_mnr_fe6_if ( mnr_fe6_o_if ),
- .i_mnr_fe6_if ( mnr_fe6_i_if ),
- .o_mul_fe12_if ( mul_fe12_i_if[2] ),
- .i_mul_fe12_if ( mul_fe12_o_if[2] )
-);
-
-bls12_381_fe12_fmap_wrapper #(
- .FE_TYPE ( FE_TYPE ),
- .CTL_BITS ( CTL_BITS ),
- .CTL_BIT_POW ( FMAP_BIT )
-)
-bls12_381_fe12_fmap_wrapper (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_fmap_fe12_if ( fmap_fe12_i_if ),
- .i_fmap_fe12_if ( fmap_fe12_o_if ),
- .o_mul_fe2_if ( mul_fe2_o_if[1] ),
- .i_mul_fe2_if ( mul_fe2_i_if[1] ),
- .o_mul_fe_if ( mul_fe_o_if[1] ),
- .i_mul_fe_if ( mul_fe_i_if[1] )
-);
-
-bls12_381_fe12_inv_wrapper #(
- .FE_TYPE ( FE_TYPE ),
- .CTL_BITS ( CTL_BITS ),
- .OVR_WRT_BIT ( 0 )
-)
-bls12_381_fe12_inv_wrapper (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_inv_fe12_if ( inv_fe12_i_if ),
- .i_inv_fe12_if ( inv_fe12_o_if ),
- .o_mul_fe_if ( mul_fe_o_if[2] ),
- .i_mul_fe_if ( mul_fe_i_if[2] )
-);
-
-ec_fe12_pow_s #(
- .FE_TYPE ( FE_TYPE ),
- .CTL_BIT_POW ( POW_BIT ),
- .POW_BITS ( POW_BITS ),
- .SQ_BIT ( SQ_BIT )
-)
-ec_fe12_pow_s (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_mul_fe12_if ( mul_fe12_o_if[0] ),
- .i_mul_fe12_if ( mul_fe12_i_if[0] ),
- .o_sub_fe_if ( sub_fe_o_if[4] ),
- .i_sub_fe_if ( sub_fe_i_if[4] ),
- .o_pow_fe12_if ( pow_fe12_i_if ),
- .i_pow_fe12_if ( pow_fe12_o_if )
-);
-
-bls12_381_final_exponent #(
- .OVR_WRT_BIT ( 32 ),
- .FMAP_BIT ( FMAP_BIT ),
- .POW_BIT ( POW_BIT ),
- .SQ_BIT ( SQ_BIT )
-)
-bls12_381_final_exponent (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .o_mul_fe12_if ( mul_fe12_o_if[1] ),
- .i_mul_fe12_if ( mul_fe12_i_if[1] ),
- .o_pow_fe12_if ( pow_fe12_o_if ),
- .i_pow_fe12_if ( pow_fe12_i_if ),
- .o_fmap_fe12_if ( fmap_fe12_o_if ),
- .i_fmap_fe12_if ( fmap_fe12_i_if ),
- .o_inv_fe12_if ( inv_fe12_o_if ),
- .i_inv_fe12_if ( inv_fe12_i_if ),
- .o_sub_fe_if ( sub_fe_o_if[5] ),
- .i_sub_fe_if ( sub_fe_i_if[5] ),
- .o_final_exp_fe12_if ( final_exp_fe12_i_if ),
- .i_final_exp_fe12_if ( final_exp_fe12_o_if )
-);
-
-
-resource_share # (
- .NUM_IN ( 4 ),
- .DAT_BITS ( 2*$bits(FE_TYPE) ),
- .CTL_BITS ( CTL_BITS ),
- .OVR_WRT_BIT ( 40 ),
- .PIPELINE_IN ( 1 ),
- .PIPELINE_OUT ( 1 )
-)
-resource_share_fe_add (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_axi ( add_fe_o_if[3:0] ),
- .o_res ( add_fe_o_if[4] ),
- .i_res ( add_fe_i_if[4] ),
- .o_axi ( add_fe_i_if[3:0] )
-);
-
-resource_share # (
- .NUM_IN ( 6 ),
- .DAT_BITS ( 2*$bits(FE_TYPE) ),
- .CTL_BITS ( CTL_BITS ),
- .OVR_WRT_BIT ( 40 ),
- .PIPELINE_IN ( 1 ),
- .PIPELINE_OUT ( 1 )
-)
-resource_share_fe_sub (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_axi ( sub_fe_o_if[5:0] ),
- .o_res ( sub_fe_o_if[6] ),
- .i_res ( sub_fe_i_if[6] ),
- .o_axi ( sub_fe_i_if[5:0] )
-);
-
-resource_share # (
- .NUM_IN ( 3 ),
- .DAT_BITS ( 2*$bits(FE_TYPE) ),
- .CTL_BITS ( CTL_BITS ),
- .OVR_WRT_BIT ( 40 ),
- .PIPELINE_IN ( 1 ),
- .PIPELINE_OUT ( 1 )
-)
-resource_share_fe_mul (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_axi ( mul_fe_o_if[2:0] ),
- .o_res ( mul_fe_o_if[3] ),
- .i_res ( mul_fe_i_if[3] ),
- .o_axi ( mul_fe_i_if[2:0] )
-);
-
-resource_share # (
- .NUM_IN ( 2 ),
- .DAT_BITS ( 2*$bits(FE_TYPE) ),
- .CTL_BITS ( CTL_BITS ),
- .OVR_WRT_BIT ( 44 ),
- .PIPELINE_IN ( 1 ),
- .PIPELINE_OUT ( 1 )
-)
-resource_share_fe2_mul (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_axi ( mul_fe2_o_if[1:0] ),
- .o_res ( mul_fe2_o_if[2] ),
- .i_res ( mul_fe2_i_if[2] ),
- .o_axi ( mul_fe2_i_if[1:0] )
-);
-
-resource_share # (
- .NUM_IN ( 2 ),
- .DAT_BITS ( 2*$bits(FE_TYPE) ),
- .CTL_BITS ( CTL_BITS ),
- .OVR_WRT_BIT ( 48 ),
- .PIPELINE_IN ( 1 ),
- .PIPELINE_OUT ( 1 )
-)
-resource_share_fe12_mul (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_axi ( mul_fe12_o_if[1:0] ),
- .o_res ( mul_fe12_o_if[2] ),
- .i_res ( mul_fe12_i_if[2] ),
- .o_axi ( mul_fe12_i_if[1:0] )
-);
-
-resource_share # (
- .NUM_IN ( 2 ),
- .DAT_BITS ( 2*$bits(FE_TYPE) ),
- .CTL_BITS ( CTL_BITS ),
- .OVR_WRT_BIT ( 52 ),
- .PIPELINE_IN ( 1 ),
- .PIPELINE_OUT ( 1 )
-)
-resource_share_fe2_mnr (
- .i_clk ( clk ),
- .i_rst ( rst ),
- .i_axi ( mnr_fe2_o_if[1:0] ),
- .o_res ( mnr_fe2_o_if[2] ),
- .i_res ( mnr_fe2_i_if[2] ),
- .o_axi ( mnr_fe2_i_if[1:0] )
-);
-
-
-// This just tests our software model vs a known good result
-task test_sw();
- af_point_t P;
- fp2_af_point_t Q;
- fe12_t f, f_exp;
-
- $display("Running test_sw ...");
-
- // Known good result from zcash rust code
- f_exp = {381'h0f41e58663bf08cf068672cbd01a7ec73baca4d72ca93544deff686bfd6df543d48eaa24afe47e1efde449383b676631,
- 381'h04c581234d086a9902249b64728ffd21a189e87935a954051c7cdba7b3872629a4fafc05066245cb9108f0242d0fe3ef,
- 381'h03350f55a7aefcd3c31b4fcb6ce5771cc6a0e9786ab5973320c806ad360829107ba810c5a09ffdd9be2291a0c25a99a2,
- 381'h11b8b424cd48bf38fcef68083b0b0ec5c81a93b330ee1a677d0d15ff7b984e8978ef48881e32fac91b93b47333e2ba57,
- 381'h06fba23eb7c5af0d9f80940ca771b6ffd5857baaf222eb95a7d2809d61bfe02e1bfd1b68ff02f0b8102ae1c2d5d5ab1a,
- 381'h19f26337d205fb469cd6bd15c3d5a04dc88784fbb3d0b2dbdea54d43b2b73f2cbb12d58386a8703e0f948226e47ee89d,
- 381'h018107154f25a764bd3c79937a45b84546da634b8f6be14a8061e55cceba478b23f7dacaa35c8ca78beae9624045b4b6,
- 381'h01b2f522473d171391125ba84dc4007cfbf2f8da752f7c74185203fcca589ac719c34dffbbaad8431dad1c1fb597aaa5,
- 381'h193502b86edb8857c273fa075a50512937e0794e1e65a7617c90d8bd66065b1fffe51d7a579973b1315021ec3c19934f,
- 381'h1368bb445c7c2d209703f239689ce34c0378a68e72a6b3b216da0e22a5031b54ddff57309396b38c881c4c849ec23e87,
- 381'h089a1c5b46e5110b86750ec6a532348868a84045483c92b7af5af689452eafabf1a8943e50439f1d59882a98eaa0170f,
- 381'h1250ebd871fc0a92a7b2d83168d0d727272d441befa15c503dd8e90ce98db3e7b6d194f60839c508a84305aaca1789b6};
-
- // Output of miller loop - input to our model
- f = {381'h049eaeacea5c5e9ad17ab1909cb31c653b0cb7184cc9187f77a934b1189b088d4ca64d0ff60eb0b6be8805757ba3df04,
- 381'h0198faba7d94607ce154e6a711ef859a5c4623722d4136c961a801c2b984aae5838a532aae5c2211660d3b8689b8f015,
- 381'h12b091c5b34124368d2e95a7fd6cfa3b456447e49cd298de506572c5f3afb8727f2a186f0ea14bf5eed2171c4568b5c5,
- 381'h05cfef8c26f3886e502008fc1fd74b86d400c32cb432323f994c060db185e9f8519cf76afcc9969379c2967f2f6ba36a,
- 381'h0465162c766430cf4a98e217e3d765643118598715cc2538c56e933f0528f56dd6ac82507df446545a2fde77349ad37e,
- 381'h1427e91ee8eff7e7187d560c375f5da3a9f0f162192ac4277bff1b14f560355e0b5cf069f452ab4d35ce11b39facc280,
- 381'h087d1320fe5bad5c2d8e12c49e6aff41a0b80e1497bbe85682e22ed853f256041bdf97ef02bdb5d80a5f9bc31d85f25e,
- 381'h159ef660e2d84185f55c0ccae1dd7f8f71b12c0beb7a431fede9e62794d9154e9a0ce4715f64b032492459076224c99b,
- 381'h0cbc592a19a3f60c9938676b257b9c01ed9d708f9428b29e272a811d13d734485970d9d3f1c097b12bfa3d1678096b1d,
- 381'h0751a051e0beb4a0e2351a7527d813b371e189056307d718a446e4016a3df787568a842f3401768dc03b966bd1db90ac,
- 381'h0e760e96f911ae38a6042da82d7b0e30787864e725e9d5462d224c91c4497104d838d566d894564bc19e09d8af706c3f,
- 381'h05194f5785436c8debf0eb2bab4c6ef3de7dc0633c85769173777b782bf897fa45025fd03e7be941123c4ee19910e62e};
-
- final_exponent(f);
- $display("After final exponent:");
- print_fe12(f);
- assert(f == f_exp) else $fatal(1, "Test_sw final exp. did not match known good result");
- $display("test_sw PASSED");
-
-endtask
-
-
-task test_hw();
-begin
- integer signed get_len;
- logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in, get_dat;
- integer start_time, finish_time;
- FE12_TYPE f_in, f_out, f_exp;
- $display("Running hw test ...");
-
- for (int lp = 0; lp < 10; lp++) begin
- $display("Loop %d", lp);
- dat_in = 0;
- for (int i = 0; i < 2; i++)
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 2; k++) begin
- f_in[i][j][k] = random_vector(384/8) % P;
- dat_in[(i*6+j*2+k)*384 +: $bits(FE_TYPE)] = {f_in[i][j][k]};
- end
-
- f_exp = f_in;
- final_exponent(f_exp);
-
- start_time = $time;
- fork
- final_exp_fe12_o_if.put_stream(dat_in, 12*384/8);
- final_exp_fe12_i_if.get_stream(get_dat, get_len);
- join
- finish_time = $time;
-
- for (int i = 0; i < 2; i++)
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 2; k++)
- f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)];
-
- $display("hw test finished in %d clocks", (finish_time-start_time)/(CLK_PERIOD));
-
- if (f_exp != f_out) begin
- $display("Input:");
- print_fe12(f_in);
- $display("Output:");
- print_fe12(f_out);
- $display("Expected:");
- print_fe12(f_exp);
- $fatal(1, "%m %t ERROR: output was wrong", $time);
- end
- end
-
- $display("all hw tests PASSED");
-end
-endtask;
-
-initial begin
- final_exp_fe12_o_if.reset_source();
- final_exp_fe12_i_if.rdy = 0;
- #100ns;
-
- test_sw();
- test_hw();
-
- #1us $finish();
-end
-
-endmodule
\ No newline at end of file
diff --git a/zcash_fpga/src/tb/bls12_381_fmap_tb.sv b/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
index 791218b..d3f0fb3 100644
--- a/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_fmap_tb.sv
@@ -15,6 +15,8 @@
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
`timescale 1ps/1ps
+`define BL12_381_NEWMULT
+`define SIMULATION
module bls12_381_fmap_tb ();
diff --git a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
index d74171e..aa2c831 100644
--- a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
@@ -64,16 +64,24 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_i_if(clk)
if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_o_if(clk);
if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_i_if(clk);
-ec_fp_mult_mod #(
- .P ( P ),
- .KARATSUBA_LVL ( 3 ),
- .CTL_BITS ( CTL_BITS )
+accum_mult_mod #(
+ .DAT_BITS ( $bits(FE_TYPE) ),
+ .MODULUS ( P ),
+ .CTL_BITS ( CTL_BITS ),
+ .A_DSP_W ( 26 ),
+ .B_DSP_W ( 17 ),
+ .GRID_BIT ( 64 ),
+ .RAM_A_W ( 8 ),
+ .RAM_D_W ( 32 )
)
-ec_fp_mult_mod (
- .i_clk( clk ),
- .i_rst( rst ),
- .i_mul ( mul_fe_o_if ),
- .o_mul ( mul_fe_i_if )
+accum_mult_mod (
+ .i_clk ( clk ),
+ .i_rst ( rst ),
+ .i_mul ( mul_fe_o_if ),
+ .o_mul ( mul_fe_i_if ),
+ .i_ram_d ( '0 ),
+ .i_ram_we ( '0 ),
+ .i_ram_se ( '0 )
);
bls12_381_pairing_wrapper #(