From 6d540f04913cff751ef5befa96bc7a0d37ce1927 Mon Sep 17 00:00:00 2001 From: bsdevlin Date: Sun, 25 Aug 2019 00:30:08 +0800 Subject: [PATCH] Update bls12_381 processor to version 1.3, now has split functions for miller loop and final exponentiation, to support multi-pairing. --- README.md | 2 +- aws/cl_zcash/software/runtime/zcash_fpga.hpp | 9 +- .../src/rtl/bls12_381/bls12_381_pairing.sv | 119 ++++--- .../bls12_381/bls12_381_pairing_wrapper.sv | 48 +-- zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv | 10 +- zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv | 263 ++++++++------- zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv | 2 +- zcash_fpga/src/tb/bls12_381_pairing_tb.sv | 135 ++++++-- zcash_fpga/src/tb/bls12_381_top_tb.sv | 311 ++++++++---------- 9 files changed, 501 insertions(+), 398 deletions(-) diff --git a/README.md b/README.md index 36f1c13..e1c2709 100644 --- a/README.md +++ b/README.md @@ -67,5 +67,5 @@ It optionally contains the following top-level engines (you can include in a bui - General arithmetic over bls12-381 curve - Dual Point multiplication in Fp and Fp^2 (G1 and G2) - Frobenius map operations - - The ate pairing + - The optimal ate pairing - Miller loop and final exponentiation stage diff --git a/aws/cl_zcash/software/runtime/zcash_fpga.hpp b/aws/cl_zcash/software/runtime/zcash_fpga.hpp index f3f66d3..961355b 100644 --- a/aws/cl_zcash/software/runtime/zcash_fpga.hpp +++ b/aws/cl_zcash/software/runtime/zcash_fpga.hpp @@ -88,11 +88,10 @@ class zcash_fpga { MUL_ELEMENT = 0x12, INV_ELEMENT = 0x13, - POINT_MULT = 0x24, - FP_FPOINT_MULT = 0x25, - FP2_FPOINT_MULT = 0x26, - - ATE_PAIRING = 0x28 + POINT_MULT = 0x20, + MILLER_LOOP = 0x21, + FINAL_EXP = 0x22, + ATE_PAIRING = 0x23 } bls12_381_code_t; // Instruction format diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv index 3b642c6..e7e9728 100644 --- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv +++ 
b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv @@ -1,6 +1,6 @@ /* This is the top level for the bls12-381 pairing engine. - It performs both the miller loop and final exponentiation required for ate pairing (G2 x G1). + It performs both the miller loop and final exponentiation required for optimal ate pairing w = e(Q, P), where Q is in G2, P is in G1. Inputs are points in G1 and G2 (affine coordinates) Output is a Fp12 element. @@ -40,14 +40,11 @@ module bls12_381_pairing )( input i_clk, i_rst, // Inputs - input i_val, - input i_mode, // 0 == ate pairing, 1 == only point multiplication + input [1:0] i_mode, // 0 == ate pairing, 1 == only point multiplication, 2 == only miller loop, 3 == only final exponentiation input FE_TYPE i_key, // Input key when in mode == 1 - output logic o_rdy, - input G1_FP_AF_TYPE i_g1_af, - input G2_FP_AF_TYPE i_g2_af, - if_axi_stream.source o_fe12_if, - if_axi_stream.source o_p_jb_if, // Output point if we did a point multiplication + if_axi_stream.sink i_pair_af_if, // Input for G1 and G2 points, Input fe_12 for final exponentiation when mode == 3 + if_axi_stream.source o_fe12_if, // Result fe12 of ate pairing / final exponentiation (if mode was 0/3) + if_axi_stream.source o_p_jb_if, // Result of point multiplication / miller loop (if mode was 1/2) // Interface to FE_TYPE multiplier (mod P) if_axi_stream.source o_mul_fe_if, if_axi_stream.sink i_mul_fe_if, @@ -94,7 +91,7 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) final_exp_fe12_o logic dbl_i_val, dbl_o_rdy; logic add_i_val, add_o_rdy; -logic wait_dbl, wait_add, stage_done; +logic wait_dbl, wait_add; G1_FP_AF_TYPE g1_af_i; G2_FP_JB_TYPE g2_r_jb_i, add_g2_o, dbl_g2_o; @@ -106,12 +103,13 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) dbl_f12_o_if ( logic [$clog2($bits(FE_TYPE))-1:0] ate_loop_cnt; logic [1:0] miller_mult_cnt; -enum {IDLE, POINT_MULT_DBL, POINT_MULT_ADD, POINT_MULT_DONE, MILLER_LOOP, FINAL_EXP} pair_state; +enum {IDLE, 
INPUT_LOAD0, INPUT_LOAD1, POINT_MULT_DBL, POINT_MULT_ADD, POINT_MULT_DONE, MILLER_LOOP, MILLER_ONLY_DONE, FINAL_EXP} pair_state; FE12_TYPE f; logic f_val; logic [3:0] out_cnt; -logic point_mul_mode, found_one; +logic found_one; +logic [1:0] mode; FE_TYPE key; @@ -129,12 +127,12 @@ always_ff @ (posedge i_clk) begin final_exp_fe12_o_if.eop <= 0; g1_af_i <= 0; g2_r_jb_i <= 0; + g2_af_i <= 0; mul_fe12_i_if[0].rdy <= 0; mul_fe12_o_if[0].reset_source(); pair_state <= IDLE; add_i_val <= 0; dbl_i_val <= 0; - o_rdy <= 0; wait_dbl <= 0; wait_add <= 0; miller_mult_cnt <= 0; @@ -143,17 +141,18 @@ always_ff @ (posedge i_clk) begin f <= FE12_one; f_val <= 0; out_cnt <= 0; - point_mul_mode <= 0; + mode <= 0; key <= 0; found_one <= 0; - stage_done <= 0; - + o_p_jb_if.reset_source(); - + dbl_f12_o_if.rdy <= 0; add_f12_o_if.rdy <= 0; + i_pair_af_if.rdy <= 0; + end else begin if (add_o_rdy) add_i_val <= 0; @@ -166,13 +165,13 @@ always_ff @ (posedge i_clk) begin f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]}; f_val <= mul_fe12_i_if[0].eop; end - + dbl_f12_o_if.rdy <= 0; add_f12_o_if.rdy <= 0; case(pair_state) IDLE: begin - ate_loop_cnt <= i_mode == 0 ? ATE_X_START-1 : $bits(FE_TYPE)-1; + ate_loop_cnt <= i_mode == 0 || i_mode == 2 ? ATE_X_START-1 : $bits(FE_TYPE)-1; f <= FE12_one; add_i_val <= 0; dbl_i_val <= 0; @@ -180,31 +179,52 @@ always_ff @ (posedge i_clk) begin wait_add <= 0; out_cnt <= 0; f_val <= 0; - o_rdy <= 1; miller_mult_cnt <= 0; found_one <= 0; - stage_done <= 0; - if (i_val && o_rdy) begin - pair_state <= i_mode == 0 ? 
MILLER_LOOP : POINT_MULT_DBL; - key <= i_key; - point_mul_mode <= i_mode; - o_rdy <= 0; + i_pair_af_if.rdy <= 0; - g1_af_i <= i_g1_af; - g2_af_i <= i_g2_af; + g2_r_jb_i.x <= 0; + g2_r_jb_i.y <= 0; + g2_r_jb_i.z <= 1; - g2_r_jb_i.x <= i_g2_af.x; - g2_r_jb_i.y <= i_g2_af.y; - g2_r_jb_i.z <= 1; + if (i_pair_af_if.val) begin + pair_state <= INPUT_LOAD0; + key <= i_key; + mode <= i_mode; + i_pair_af_if.rdy <= 1; + end + end + INPUT_LOAD0: begin + if (i_pair_af_if.eop && i_pair_af_if.val && i_pair_af_if.rdy) i_pair_af_if.rdy <= 0; + if (i_pair_af_if.val && i_pair_af_if.rdy) begin + if (mode == 1) begin + g2_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_af_i.y, g2_af_i.x[1]}; + {g2_r_jb_i.y, g2_r_jb_i.x} <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_r_jb_i.y, g2_r_jb_i.x[1]}; + if (i_pair_af_if.eop) pair_state <= POINT_MULT_DBL; + end else + if (mode == 3) begin + f <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], f[1], f[0][2:1], f[0][0][1]}; + if (i_pair_af_if.eop) pair_state <= FINAL_EXP; + end else begin + g1_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g1_af_i.y}; + if (i_pair_af_if.eop) pair_state <= INPUT_LOAD1; + i_pair_af_if.rdy <= 1; + end + end + end + INPUT_LOAD1: begin + if (i_pair_af_if.eop && i_pair_af_if.val && i_pair_af_if.rdy) i_pair_af_if.rdy <= 0; + if (i_pair_af_if.val && i_pair_af_if.rdy) begin + g2_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_af_i.y, g2_af_i.x[1]}; + {g2_r_jb_i.y, g2_r_jb_i.x} <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_r_jb_i.y, g2_r_jb_i.x[1]}; + if (i_pair_af_if.eop) pair_state <= MILLER_LOOP; end end MILLER_LOOP: begin - if (~wait_dbl) begin dbl_i_val <= 1; wait_dbl <= 1; end - if (wait_dbl && dbl_f12_o_if.val && dbl_f12_o_if.sop && dbl_f12_o_if.rdy) begin g2_r_jb_i <= dbl_g2_o; if (~wait_add && ATE_X[ate_loop_cnt] == 1) begin @@ -242,18 +262,18 @@ always_ff @ (posedge i_clk) begin 0,1,4: mul_fe12_o_if[0].dat <= {dbl_f12_o_if.dat, f[0][0][0]}; default: mul_fe12_o_if[0].dat <= {381'd0, f[0][0][0]}; endcase - + out_cnt 
<= out_cnt + 1; f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]}; if (out_cnt == 11) begin f_val <= 0; out_cnt <= 0; miller_mult_cnt <= ATE_X[ate_loop_cnt] == 0 ? 3 : 2; - end - + end + mul_fe12_o_if[0].ctl <= miller_mult_cnt; mul_fe12_o_if[0].ctl[SQ_BIT] <= 0; - + end end end @@ -290,7 +310,7 @@ always_ff @ (posedge i_clk) begin miller_mult_cnt <= 0; ate_loop_cnt <= ate_loop_cnt - 1; if (ate_loop_cnt == 0) begin - pair_state <= FINAL_EXP; + pair_state <= mode == 0 ? FINAL_EXP : MILLER_ONLY_DONE; end end end @@ -352,7 +372,22 @@ always_ff @ (posedge i_clk) begin key <= key << 1; pair_state <= POINT_MULT_DBL; end - end + end + end + MILLER_ONLY_DONE: begin + if (~o_p_jb_if.val || (o_p_jb_if.val && o_p_jb_if.rdy)) begin + o_p_jb_if.val <= 1; + o_p_jb_if.sop <= out_cnt == 0; + o_p_jb_if.eop <= out_cnt == 11; + o_p_jb_if.dat <= f[0][0][0]; + f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]}; + out_cnt <= out_cnt + 1; + if (o_p_jb_if.val && o_p_jb_if.rdy && o_p_jb_if.eop) begin + pair_state <= IDLE; + out_cnt <= 0; + o_p_jb_if.val <= 0; + end + end end POINT_MULT_DONE: begin if (~o_p_jb_if.val || (o_p_jb_if.val && o_p_jb_if.rdy)) begin @@ -360,7 +395,7 @@ always_ff @ (posedge i_clk) begin o_p_jb_if.sop <= out_cnt == 0; o_p_jb_if.eop <= out_cnt == 5; o_p_jb_if.dat <= g2_r_jb_i; - + out_cnt <= out_cnt + 1; g2_r_jb_i <= g2_r_jb_i >> $bits(FE_TYPE); if (o_p_jb_if.val && o_p_jb_if.rdy && o_p_jb_if.eop) begin @@ -386,7 +421,7 @@ bls12_381_pairing_miller_dbl ( .i_clk ( i_clk ), .i_rst ( i_rst ), .i_val ( dbl_i_val ), - .i_point_mul_mode ( point_mul_mode ), + .i_point_mul_mode ( mode == 1 ), .o_rdy ( dbl_o_rdy ), .i_g1_af ( g1_af_i ), .i_g2_jb ( g2_r_jb_i ), @@ -413,8 +448,8 @@ bls12_381_pairing_miller_add #( bls12_381_pairing_miller_add ( .i_clk ( i_clk ), .i_rst ( i_rst ), - .i_val ( add_i_val ), - .i_point_mul_mode ( point_mul_mode ), + .i_val ( add_i_val ), + .i_point_mul_mode ( mode == 1 ), .o_rdy ( add_o_rdy ), .i_g1_af ( g1_af_i ), .i_g2_jb ( dbl_g2_o ), 
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv index 76d04e9..ae0b43e 100644 --- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv +++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv @@ -32,23 +32,22 @@ module bls12_381_pairing_wrapper )( input i_clk, i_rst, // Inputs - input i_val, - output logic o_rdy, - input G1_FP_AF_TYPE i_g1_af, // G1 input point - input G2_FP_AF_TYPE i_g2_af, // G2 input point - input i_mode, // 0 == ate pairing, 1 == only point multiplication - input FE_TYPE i_key, // Input key when in mode == 1 - if_axi_stream.source o_fe12_if, // Result fe12 of ate pairing (or point mult) - if_axi_stream.source o_p_jb_if, // Result of point multiplication + if_axi_stream.sink i_pair_af_if, // G1 and G2 input point - or Fe12 element if we are only performing the final exponentiation + input [1:0] i_mode, // 0 == ate pairing, 1 == only point multiplication, 2 == only miller loop, 3 == only final exponentiation + input FE_TYPE i_key, // Input key when in mode == 1 + if_axi_stream.source o_fe12_if, // Result fe12 of ate pairing / final exponentiation (if mode was 0/3) + if_axi_stream.source o_p_jb_if, // Result of point multiplication / miller loop (if mode was 1/2) // Interface to FE_TYPE multiplier (mod P) if_axi_stream.source o_mul_fe_if, if_axi_stream.sink i_mul_fe_if, + // Interface to FE12_TYPE multiplier (mod P) (Implemented internally) + if_axi_stream.source o_mul_fe12_if, + if_axi_stream.sink i_mul_fe12_if, // We provide interfaces to the inversion module if_axi_stream.source o_inv_fe2_if, if_axi_stream.sink i_inv_fe2_if, if_axi_stream.source o_inv_fe_if, if_axi_stream.sink i_inv_fe_if - ); if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if [3:0] (i_clk); @@ -68,8 +67,8 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe6_i_if if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) 
mnr_fe6_o_if [2:0] (i_clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mnr_fe6_i_if [2:0] (i_clk); -if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [2:0] (i_clk); -if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_i_if [2:0] (i_clk); +if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [3:0] (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_i_if [3:0] (i_clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) pow_fe12_o_if (i_clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) pow_fe12_i_if (i_clk); @@ -80,6 +79,14 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) fmap_fe12_i_if if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe12_o_if (i_clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe12_i_if (i_clk); +always_comb begin + i_mul_fe12_if.rdy = mul_fe12_o_if[2].rdy; + mul_fe12_o_if[2].copy_if_comb(i_mul_fe12_if.dat, i_mul_fe12_if.val, i_mul_fe12_if.sop, i_mul_fe12_if.eop, i_mul_fe12_if.err, i_mul_fe12_if.mod, i_mul_fe12_if.ctl); + + mul_fe12_i_if[2].rdy = o_mul_fe12_if.rdy; + o_mul_fe12_if.copy_if_comb(mul_fe12_i_if[2].dat, mul_fe12_i_if[2].val, mul_fe12_i_if[2].sop, mul_fe12_i_if[2].eop, mul_fe12_i_if[2].err, mul_fe12_i_if[2].mod, mul_fe12_i_if[2].ctl); +end + bls12_381_pairing #( .FE_TYPE ( FE_TYPE ), .FE2_TYPE ( FE2_TYPE ), @@ -93,10 +100,7 @@ bls12_381_pairing #( bls12_381_pairing ( .i_clk ( i_clk ), .i_rst ( i_rst ), - .i_val ( i_val ), - .o_rdy ( o_rdy ), - .i_g1_af ( i_g1_af ), - .i_g2_af ( i_g2_af ), + .i_pair_af_if ( i_pair_af_if ), .i_mode ( i_mode ), .i_key ( i_key ), .o_fe12_if ( o_fe12_if ), @@ -262,8 +266,8 @@ ec_fe12_mul_s ( .i_sub_fe_if ( sub_fe_i_if[3] ), .o_mnr_fe6_if ( mnr_fe6_o_if[0] ), .i_mnr_fe6_if ( mnr_fe6_i_if[0] ), - .o_mul_fe12_if ( mul_fe12_i_if[2] ), - .i_mul_fe12_if ( mul_fe12_o_if[2] ) + .o_mul_fe12_if ( 
mul_fe12_i_if[3] ), + .i_mul_fe12_if ( mul_fe12_o_if[3] ) ); adder_pipe # ( @@ -378,7 +382,7 @@ resource_share_fe6_mul ( ); resource_share # ( - .NUM_IN ( 2 ), + .NUM_IN ( 3 ), .DAT_BITS ( 2*$bits(FE_TYPE) ), .CTL_BITS ( CTL_BITS ), .OVR_WRT_BIT ( OVR_WRT_BIT + 42 ), // 2 bits @@ -388,10 +392,10 @@ resource_share # ( resource_share_fe12_mul ( .i_clk ( i_clk ), .i_rst ( i_rst ), - .i_axi ( mul_fe12_o_if[1:0] ), - .o_res ( mul_fe12_o_if[2] ), - .i_res ( mul_fe12_i_if[2] ), - .o_axi ( mul_fe12_i_if[1:0] ) + .i_axi ( mul_fe12_o_if[2:0] ), + .o_res ( mul_fe12_o_if[3] ), + .i_res ( mul_fe12_i_if[3] ), + .o_axi ( mul_fe12_i_if[2:0] ) ); resource_share # ( diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv index f9bbbce..34fc144 100644 --- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv +++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv @@ -165,11 +165,10 @@ package bls12_381_pkg; MUL_ELEMENT = 8'h12, INV_ELEMENT = 8'h13, - POINT_MULT = 8'h24, - FP_FPOINT_MULT = 8'h25, - FP2_FPOINT_MULT = 8'h26, - - ATE_PAIRING = 8'h28 + POINT_MULT = 8'h20, + MILLER_LOOP = 8'h21, + FINAL_EXP = 8'h22, + ATE_PAIRING = 8'h23 } code_t; // Instruction format @@ -663,7 +662,6 @@ package bls12_381_pkg; endtask task automatic ate_pairing(input af_point_t P, input fp2_af_point_t Q, ref fe12_t f); - fp2_jb_point_t R; // This is only used for point multiplication miller_loop(P, Q, f); final_exponent(f); endtask; diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv index 777f32e..ee04004 100644 --- a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv +++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv @@ -66,7 +66,7 @@ if_axi_stream #(.DAT_BYTS(3)) idx_in_if(i_clk); if_axi_stream #(.DAT_BYTS(3)) idx_out_if(i_clk); // Point multiplication -logic pair_mode; +logic [1:0] pair_mode; fe_t pair_key; if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mult_pt_if (i_clk); @@ -89,12 +89,13 @@ 
if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_i_if if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_o_if (i_clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe2_i_if (i_clk); -logic pair_i_val, pair_o_rdy; -if_axi_stream #(.DAT_BITS($bits(FE_TYPE))) pair_o_res_if (i_clk); ; -bls12_381_pkg::af_point_t pair_i_g1; -bls12_381_pkg::fp2_af_point_t pair_i_g2; +if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_i_if (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE_TYPE))) pair_i_af_if (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE_TYPE))) pair_o_res_if (i_clk); + logic [31:0] new_inst_pt; logic new_inst_pt_val, new_inst_pt_val_l; logic reset_done_inst, reset_done_data; @@ -146,14 +147,15 @@ always_ff @ (posedge i_clk) begin add_out_if.rdy <= 0; sub_out_if.rdy <= 0; - pair_i_val <= 0; - pair_i_g1 <= 0; - pair_i_g2 <= 0; + pair_i_af_if.reset_source(); pair_mode <= 0; pair_key <= 0; mult_pt_if.rdy <= 0; + mul_fe12_o_if.reset_source(); + mul_fe12_i_if.rdy <= 0; + end else begin mul_in_if[1].sop <= 1; @@ -179,9 +181,11 @@ always_ff @ (posedge i_clk) begin if (add_in_if.rdy) add_in_if.val <= 0; if (sub_in_if.rdy) sub_in_if.val <= 0; if (mul_in_if[1].rdy) mul_in_if[1].val <= 0; - if (pair_o_rdy) pair_i_val <= 0; + if (pair_i_af_if.rdy) pair_i_af_if.val <= 0; + if (mul_fe12_o_if.rdy) mul_fe12_o_if.val <= 0; mult_pt_if.rdy <= 1; + mul_fe12_i_if.rdy <= 1; if (idx_in_if.val && idx_in_if.rdy) idx_in_if.val <= 0; if (interrupt_in_if.val && interrupt_in_if.rdy) interrupt_in_if.val <= 0; @@ -234,18 +238,19 @@ always_ff @ (posedge i_clk) begin if (cnt == 0) last_inst_cnt <= 0; task_point_mult(); end - // We don't use precaculation for fixed point but could be used as optimizations - FP_FPOINT_MULT: begin + ATE_PAIRING: begin if (cnt == 0) last_inst_cnt <= 0; - task_fp_fpoint_mult(); + pair_mode <= 0; + 
task_pairing(); end - FP2_FPOINT_MULT: begin + MILLER_LOOP: begin if (cnt == 0) last_inst_cnt <= 0; - task_fp2_fpoint_mult(); + pair_mode <= 2; + task_pairing(); end - ATE_PAIRING: begin + FINAL_EXP: begin if (cnt == 0) last_inst_cnt <= 0; - task_pairing(); + task_final_exp(); end default: get_next_inst(); endcase @@ -302,20 +307,19 @@ bls12_381_pairing_wrapper #( bls12_381_pairing_wrapper ( .i_clk ( i_clk ), .i_rst ( i_rst ), - .i_val ( pair_i_val ), - .o_rdy ( pair_o_rdy ), - .i_g1_af ( pair_i_g1 ), - .i_g2_af ( pair_i_g2 ), + .i_pair_af_if ( pair_i_af_if ), .i_mode ( pair_mode ), .i_key ( pair_key ), - .o_fe12_if ( pair_o_res_if ), - .o_p_jb_if ( mult_pt_if ), - .o_mul_fe_if ( mul_in_if[0] ), - .i_mul_fe_if ( mul_out_if[0] ), - .o_inv_fe2_if ( inv_fe2_i_if ), - .i_inv_fe2_if ( inv_fe2_o_if ), - .o_inv_fe_if ( inv_fe_i_if ), - .i_inv_fe_if ( inv_fe_o_if ) + .o_fe12_if ( pair_o_res_if ), + .o_p_jb_if ( mult_pt_if ), + .o_mul_fe_if ( mul_in_if[0] ), + .i_mul_fe_if ( mul_out_if[0] ), + .i_mul_fe12_if ( mul_fe12_o_if ), + .o_mul_fe12_if ( mul_fe12_i_if ), + .o_inv_fe2_if ( inv_fe2_i_if ), + .i_inv_fe2_if ( inv_fe2_o_if ), + .o_inv_fe_if ( inv_fe_i_if ), + .i_inv_fe_if ( inv_fe_o_if ) ); resource_share # ( @@ -532,6 +536,10 @@ task task_mul_element(); data_ram_sys_if.a <= curr_inst.b; data_ram_read[0] <= 1; cnt <= 2; + if (curr_data.pt == FE12) begin + cnt <= 8; + data_ram_sys_if.a <= curr_inst.a; + end end end 2: begin @@ -552,7 +560,7 @@ task task_mul_element(); new_data.dat <= mul_out_if[1].dat; new_data.pt <= pt_l; data_ram_sys_if.we <= 1; - cnt <= 8; + cnt <= 34; end end 3: begin @@ -617,10 +625,51 @@ task task_mul_element(); new_data.pt <= pt_l; data_ram_sys_if.we <= 1; data_ram_sys_if.a <= curr_inst.c + 1; - cnt <= 8; + cnt <= 34; end end - 8: begin + // FE12 multiplication + 8,9,10,11,12,13,14,15,16,17,18,19, + 20,21,22,23,24,25,26,27,28,29,30,31: begin + mul_fe12_i_if.rdy <= 0; + + + if (|data_ram_read[READ_CYCLE:1]== 0 && (~mul_fe12_o_if.val || 
(mul_fe12_o_if.val && mul_fe12_o_if.rdy))) begin + if (data_ram_read[0]) begin + data_ram_read[0] <= 1; + data_ram_sys_if.a <= curr_inst.b + ((cnt-8)/2); + end else begin + data_ram_read[0] <= 1; + data_ram_sys_if.a <= curr_inst.a + ((cnt-8)/2); + end + end + + if (data_ram_read[READ_CYCLE]) begin + cnt <= cnt + 1; + if (cnt % 2 == 1) begin + mul_fe12_o_if.sop <= cnt == 9; + mul_fe12_o_if.eop <= cnt == 31; + mul_fe12_o_if.val <= 1; + mul_fe12_o_if.dat[$bits(fe_t) +: $bits(fe_t)] <= curr_data.dat; + end else begin + mul_fe12_o_if.dat[0 +: $bits(fe_t)] <= curr_data.dat; + end + end + end + 32: begin + mul_fe12_i_if.rdy <= 1; + if (mul_fe12_i_if.val && mul_fe12_i_if.rdy) begin + if (mul_fe12_i_if.sop) + data_ram_sys_if.a <= curr_inst.c; + else + data_ram_sys_if.a <= data_ram_sys_if.a + 1; + data_ram_sys_if.we <= 1; + new_data.dat <= mul_fe12_i_if.dat; + new_data.pt <= pt_l; + if (mul_fe12_i_if.eop) cnt <= cnt + 1; + end + end + 33: begin get_next_inst(); end endcase @@ -803,20 +852,19 @@ task task_point_mult(); end end 1,2,3,4: begin - if (data_ram_read[READ_CYCLE]) begin + if (|data_ram_read == 0 && (~pair_i_af_if.val || (pair_i_af_if.val && pair_i_af_if.rdy))) begin data_ram_read[0] <= 1; data_ram_sys_if.a <= data_ram_sys_if.a + 1; if (curr_data.pt == FP_AF && cnt % 2 == 0) data_ram_sys_if.a <= data_ram_sys_if.a; - case(cnt) - 1: pair_i_g2.x[0] <= curr_data.dat; - 2: pair_i_g2.x[1] <= curr_data.pt == FP_AF ? 0 : curr_data.dat; - 3: pair_i_g2.y[0] <= curr_data.dat; - 4: pair_i_g2.y[1] <= curr_data.pt == FP_AF ? 0 : curr_data.dat; - endcase + end + if (data_ram_read[READ_CYCLE]) begin + pair_i_af_if.val <= 1; + pair_i_af_if.sop <= cnt == 1; + pair_i_af_if.eop <= cnt == 4; + pair_i_af_if.dat <= (curr_data.pt == FP_AF && cnt % 2 == 0) ? 
0 : curr_data.dat; cnt <= cnt + 1; if (cnt == 1) pt_l <= curr_data.pt; if (cnt == 4) begin - pair_i_val <= 1; data_ram_sys_if.a <= curr_inst.c; end end @@ -843,136 +891,105 @@ task task_point_mult(); endcase endtask -task task_fp_fpoint_mult(); - pair_mode <= 1; +task task_pairing(); case(cnt) inside 0: begin data_ram_sys_if.a <= curr_inst.a; data_ram_read[0] <= 1; cnt <= cnt + 1; end - 1: begin + // Load G1 affine point + 1,2: begin if (data_ram_read[READ_CYCLE]) begin - data_ram_sys_if.a <= curr_inst.b; - pair_key <= curr_data.dat; - pair_i_g2 <= bls12_381_pkg::g_af_point_fp2; - pair_i_val <= 1; + data_ram_sys_if.a <= data_ram_sys_if.a + 1; + data_ram_read[0] <= 1; + pair_i_af_if.dat <= curr_data.dat; + pair_i_af_if.val <= 1; + pair_i_af_if.sop <= cnt == 1; + pair_i_af_if.eop <= cnt == 2; cnt <= cnt + 1; + if (cnt == 2) begin + data_ram_sys_if.a <= curr_inst.b; + end end end - // Wait for result - 2,3,4,5,6,7: begin - mult_pt_if.rdy <= 1; - if (mult_pt_if.val) begin - new_data.pt <= FP_JB; - new_data.dat <= mult_pt_if.dat; - data_ram_sys_if.we <= 1; - if (cnt > 2) data_ram_sys_if.a <= data_ram_sys_if.a + 1; - if (cnt % 2 == 1) begin // Odd elements will be 0 - data_ram_sys_if.a <= data_ram_sys_if.a; - data_ram_sys_if.we <= 0; - end - cnt <= cnt + 1; - end - end - 8: begin - pair_mode <= 0; - get_next_inst(); - end - endcase -endtask - -task task_fp2_fpoint_mult(); - pair_mode <= 1; - case(cnt) inside - 0: begin - data_ram_sys_if.a <= curr_inst.a; - data_ram_read[0] <= 1; - cnt <= cnt + 1; - end - 1: begin + // Load G2 affine point + 3,4,5,6: begin if (data_ram_read[READ_CYCLE]) begin - data_ram_sys_if.a <= curr_inst.b; - pair_key <= curr_data.dat; - pair_i_g2 <= bls12_381_pkg::g2_af_point_fp2; - pair_i_val <= 1; + data_ram_sys_if.a <= data_ram_sys_if.a + 1; + data_ram_read[0] <= 1; + pair_i_af_if.dat <= curr_data.dat; + pair_i_af_if.val <= 1; + pair_i_af_if.sop <= cnt == 3; + pair_i_af_if.eop <= cnt == 6; cnt <= cnt + 1; + if (cnt == 6) begin + data_ram_sys_if.a 
<= curr_inst.c; + pair_o_res_if.rdy <= 1; + mult_pt_if.rdy <= 1; + end end end // Wait for result - 2,3,4,5,6,7: begin - mult_pt_if.rdy <= 1; - if (mult_pt_if.val) begin - new_data.pt <= FP2_JB; - new_data.dat <= mult_pt_if.dat; + 7: begin + if (pair_o_res_if.val || mult_pt_if.val) begin + new_data.pt <= FE12; + new_data.dat <= pair_o_res_if.val ? pair_o_res_if.dat : mult_pt_if.dat; data_ram_sys_if.we <= 1; - if (cnt > 2) data_ram_sys_if.a <= data_ram_sys_if.a + 1; - cnt <= cnt + 1; + if ((pair_o_res_if.val && ~pair_o_res_if.sop) || + (mult_pt_if.val && ~mult_pt_if.sop)) + data_ram_sys_if.a <= data_ram_sys_if.a + 1; + if (pair_o_res_if.eop || mult_pt_if.eop) begin + mult_pt_if.rdy <= 0; + pair_o_res_if.rdy <= 0; + cnt <= cnt + 1; + end end end 8: begin - pair_mode <= 0; get_next_inst(); end endcase endtask -task task_pairing(); - pair_mode <= 0; +task task_final_exp(); + pair_mode <= 3; case(cnt) inside 0: begin data_ram_sys_if.a <= curr_inst.a; data_ram_read[0] <= 1; cnt <= cnt + 1; end - // Load G1 affine point - 1,2: begin + // Load FE12 + 1,2,3,4,5,6,7,8,9,10,11,12: begin if (data_ram_read[READ_CYCLE]) begin data_ram_sys_if.a <= data_ram_sys_if.a + 1; data_ram_read[0] <= 1; - case(cnt) - 1: pair_i_g1.x <= curr_data.dat; - 2: pair_i_g1.y <= curr_data.dat; - endcase + pair_i_af_if.dat <= curr_data.dat; + pair_i_af_if.val <= 1; + pair_i_af_if.sop <= cnt == 1; + pair_i_af_if.eop <= cnt == 12; cnt <= cnt + 1; - if (cnt == 2) begin + if (cnt == 12) begin data_ram_sys_if.a <= curr_inst.b; end end end - // Load G2 affine point - 3,4,5,6: begin - if (data_ram_read[READ_CYCLE]) begin - data_ram_sys_if.a <= data_ram_sys_if.a + 1; - data_ram_read[0] <= 1; - case(cnt) - 3: pair_i_g2.x[0] <= curr_data.dat; - 4: pair_i_g2.x[1] <= curr_data.dat; - 5: pair_i_g2.y[0] <= curr_data.dat; - 6: pair_i_g2.y[1] <= curr_data.dat; - endcase - cnt <= cnt + 1; - if (cnt == 6) begin - data_ram_sys_if.a <= curr_inst.c; - pair_i_val <= 1; - pair_o_res_if.rdy <= 1; - end - end - end // Wait 
for result - 7,8,9,10,11,12,13,14,15,16,17,18: begin - if (pair_o_res_if.val) begin + 13: begin + pair_o_res_if.rdy <= 1; + if (pair_o_res_if.val && pair_o_res_if.rdy) begin new_data.pt <= FE12; new_data.dat <= pair_o_res_if.dat; data_ram_sys_if.we <= 1; - if (cnt > 7) data_ram_sys_if.a <= data_ram_sys_if.a + 1; - cnt <= cnt + 1; - if (cnt == 18) begin + if (~pair_o_res_if.sop) data_ram_sys_if.a <= data_ram_sys_if.a + 1; + if (pair_o_res_if.eop) begin + cnt <= cnt + 1; pair_o_res_if.rdy <= 0; end end end - 19: begin + 14: begin get_next_inst(); end endcase diff --git a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv index c6881ba..7ab1ed2 100644 --- a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv +++ b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv @@ -27,7 +27,7 @@ package zcash_fpga_pkg; import bls12_381_pkg::point_type_t; - parameter FPGA_VERSION = 32'h01_02_02; //v1.2.2 + parameter FPGA_VERSION = 32'h01_03_00; //v1.3.0 // What features are enabled in this build parameter bit ENB_VERIFY_SECP256K1_SIG = 1; diff --git a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv index 1b7a3de..d74171e 100644 --- a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv +++ b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv @@ -46,13 +46,18 @@ initial begin forever #(CLK_PERIOD/2) clk = ~clk; end -if_axi_stream #(.DAT_BYTS(($bits(af_point_t) + $bits(fp2_af_point_t)+7)/8), .CTL_BITS(CTL_BITS)) in_if(clk); +logic [1:0] mode; if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) out_if(clk); if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if(clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_i_if(clk); -if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) o_p_jb_if(clk); +if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if(clk); +if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_i_if(clk); + +if_axi_stream 
#(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) o_p_jb_if(clk); + +if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) pair_af_if(clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_o_if(clk); if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_i_if(clk); @@ -78,20 +83,19 @@ bls12_381_pairing_wrapper #( bls12_381_pairing_wrapper ( .i_clk ( clk ), .i_rst ( rst ), - .i_val ( in_if.val ), - .o_rdy ( in_if.rdy ), - .i_g1_af ( in_if.dat[0 +: $bits(af_point_t)] ), - .i_g2_af ( in_if.dat[$bits(af_point_t) +: $bits(fp2_af_point_t)] ), - .i_mode ( 1'd0 ), + .i_pair_af_if ( pair_af_if ), + .i_mode ( mode ), .i_key ( 381'd0 ), .o_p_jb_if ( o_p_jb_if ), .o_fe12_if ( out_if ), .o_mul_fe_if ( mul_fe_o_if ), .i_mul_fe_if ( mul_fe_i_if ), + .o_mul_fe12_if ( mul_fe12_i_if ), + .i_mul_fe12_if ( mul_fe12_o_if ), .o_inv_fe2_if ( inv_fe2_i_if ), .i_inv_fe2_if ( inv_fe2_o_if ), .o_inv_fe_if ( inv_fe_i_if ), - .i_inv_fe_if ( inv_fe_o_if ) + .i_inv_fe_if ( inv_fe_o_if ) ); // This just tests our software model vs a known good result @@ -126,16 +130,30 @@ endtask task test1(input af_point_t G1_p, fp2_af_point_t G2_p); begin integer signed get_len; - logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat; + logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in0, dat_in1, get_dat; integer start_time, finish_time; FE12_TYPE f_out, f_exp; $display("Running test1 ..."); + + dat_in0 = 0; + dat_in0[0*384 +: $bits(FE_TYPE)] = G1_p.x; + dat_in0[1*384 +: $bits(FE_TYPE)] = G1_p.y; + + dat_in1 = 0; + dat_in1[0*384 +: $bits(FE_TYPE)] = G2_p.x[0]; + dat_in1[1*384 +: $bits(FE_TYPE)] = G2_p.x[1]; + dat_in1[2*384 +: $bits(FE_TYPE)] = G2_p.y[0]; + dat_in1[3*384 +: $bits(FE_TYPE)] = G2_p.y[1]; + mode = 0; ate_pairing(G1_p, G2_p, f_exp); start_time = $time; fork - in_if.put_stream({G2_p, G1_p}, (($bits(af_point_t) + $bits(fp2_af_point_t))+7)/8); + begin + pair_af_if.put_stream(dat_in0, (($bits(af_point_t))+7)/8); + pair_af_if.put_stream(dat_in1, 
(($bits(fp2_af_point_t))+7)/8); + end out_if.get_stream(get_dat, get_len); join finish_time = $time; @@ -145,7 +163,7 @@ begin for (int k = 0; k < 2; k++) f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)]; - + $display("Expected:"); print_fe12(f_exp); @@ -165,32 +183,46 @@ endtask; task test_linear(); begin integer signed get_len; - logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat; + logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in0, dat_in1, get_dat; integer start_time, finish_time, n; FE12_TYPE f_out, f_exp0, f_exp1; af_point_t G1_a, G1_a_n; fp2_af_point_t G1_j, G2_a_n; fp2_af_point_t G2_a; fp2_jb_point_t G2_j; - + $display("Running test_linear ..."); - + G1_a = {Gy, Gx}; G2_a = {G2y, G2x}; G1_j = {381'd1, Gy, Gx}; - G2_j = {381'd1, G2y, G2x}; + G2_j = {381'd1, G2y, G2x}; n = 2; G1_a_n = to_affine(point_mult(n, G1_j)); G2_a_n = fp2_to_affine(fp2_point_mult(n, G2_j)); - + ate_pairing(G1_a, G2_a_n, f_exp0); ate_pairing(G1_a_n, G2_a, f_exp1); - + assert(f_exp0 == f_exp1) else $fatal(1, "Error in test_linear with sw model"); + + dat_in0 = 0; + dat_in0[0*384 +: $bits(FE_TYPE)] = G1_a_n.x; + dat_in0[1*384 +: $bits(FE_TYPE)] = G1_a_n.y; + + dat_in1 = 0; + dat_in1[0*384 +: $bits(FE_TYPE)] = G2_a.x[0]; + dat_in1[1*384 +: $bits(FE_TYPE)] = G2_a.x[1]; + dat_in1[2*384 +: $bits(FE_TYPE)] = G2_a.y[0]; + dat_in1[3*384 +: $bits(FE_TYPE)] = G2_a.y[1]; + mode = 0; /* 0 is the full ate pairing encoding listed for i_mode in bls12_381_pairing.sv; NOTE(review): assumes the TB wires mode to i_mode, confirm */ start_time = $time; fork - in_if.put_stream({G2_a, G1_a_n}, (($bits(af_point_t) + $bits(fp2_af_point_t))+7)/8); + begin + pair_af_if.put_stream(dat_in0, (($bits(af_point_t))+7)/8); + pair_af_if.put_stream(dat_in1, (($bits(fp2_af_point_t))+7)/8); + end out_if.get_stream(get_dat, get_len); join finish_time = $time; @@ -215,19 +247,80 @@ begin end endtask; +task test_miller_only(); /* Checks the Miller-loop-only path: streams the G1 and G2 points built from Gx, Gy, G2x, G2y into pair_af_if with mode set to 2, reads the result from o_p_jb_if, and compares it with the value produced by miller_loop(). */ +begin + integer signed get_len; + logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in0, dat_in1, get_dat; + integer start_time, finish_time, n; + FE12_TYPE f_out, f_exp0; + af_point_t G1_a; + fp2_af_point_t G2_a; + + $display("Running
test_miller_only ..."); + + G1_a = {Gy, Gx}; + G2_a = {G2y, G2x}; + miller_loop(G1_a, G2_a, f_exp0); /* expected value, computed inside the testbench */ + + dat_in0 = 0; + dat_in0[0*384 +: $bits(FE_TYPE)] = G1_a.x; + dat_in0[1*384 +: $bits(FE_TYPE)] = G1_a.y; + + dat_in1 = 0; + dat_in1[0*384 +: $bits(FE_TYPE)] = G2_a.x[0]; + dat_in1[1*384 +: $bits(FE_TYPE)] = G2_a.x[1]; + dat_in1[2*384 +: $bits(FE_TYPE)] = G2_a.y[0]; + dat_in1[3*384 +: $bits(FE_TYPE)] = G2_a.y[1]; + mode = 2; /* 2 is the Miller-loop-only encoding listed for i_mode in bls12_381_pairing.sv; NOTE(review): assumes the TB wires mode to i_mode, confirm */ + + start_time = $time; + fork + begin + pair_af_if.put_stream(dat_in0, (($bits(af_point_t))+7)/8); + pair_af_if.put_stream(dat_in1, (($bits(fp2_af_point_t))+7)/8); + end + o_p_jb_if.get_stream(get_dat, get_len); /* the result is read from o_p_jb_if here, not from out_if as in test_linear */ + join + finish_time = $time; + + for (int i = 0; i < 2; i++) + for (int j = 0; j < 3; j++) + for (int k = 0; k < 2; k++) + f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)]; /* each of the 12 coefficients is taken from its own 384-bit slot of get_dat */ + + $display("Expected:"); + print_fe12(f_exp0); + $display("Was:"); + print_fe12(f_out); + + $display("test_miller_only finished in %d clocks", (finish_time-start_time)/(CLK_PERIOD)); + + if (f_exp0 != f_out) begin + $fatal(1, "%m %t ERROR: output was wrong", $time); + end + + $display("test_miller_only PASSED"); +end +endtask; + initial begin - in_if.reset_source(); + mul_fe12_o_if.reset_source(); + mul_fe12_i_if.rdy = 0; + o_p_jb_if.rdy = 0; + pair_af_if.reset_source(); inv_fe2_o_if.reset_source(); inv_fe_o_if.reset_source(); inv_fe2_i_if.rdy = 0; inv_fe_i_if.rdy = 0; out_if.rdy = 0; + mode = 0; /* give mode a defined value before the first test runs */ #100ns; - test0(); // Test SW model + test0(); // Test SW model test1(G1, G2); // Pairing of generators test_linear(); // test linear properties e(n*G1,G2) == e(G1, n*G2), ...
- + test_miller_only(); /* runs after the full-pairing tests above */ + #1us $finish(); end diff --git a/zcash_fpga/src/tb/bls12_381_top_tb.sv b/zcash_fpga/src/tb/bls12_381_top_tb.sv index 28b28c6..8076f88 100644 --- a/zcash_fpga/src/tb/bls12_381_top_tb.sv +++ b/zcash_fpga/src/tb/bls12_381_top_tb.sv @@ -51,175 +51,6 @@ bls12_381_top bls12_381_top ( .axi_lite_if ( axi_lite_if ) ); - -task test_fp_fpoint_mult(); -begin - integer signed get_len; - logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat; - inst_t inst; - logic failed; - data_t data; - logic [31:0] rdata; - jb_point_t out_p, exp_p; - logic [DAT_BITS-1:0] in_k; - bls12_381_interrupt_rpl_t interrupt_rpl; - - failed = 0; - in_k = 1 << 379; - exp_p = point_mult(in_k, g_point); - - $display("Running test_fp_fpoint_mult..."); - - axi_lite_if.peek(.addr(0), .data(rdata)); - assert(rdata == INST_AXIL_START) else $fatal("ERROR: AXI lite register returned wrong value"); - - axi_lite_if.peek(.addr(4), .data(rdata)); - assert(rdata == DATA_AXIL_START) else $fatal("ERROR: AXI lite register returned wrong value"); - - axi_lite_if.peek(.addr(8), .data(rdata)); - assert(rdata == DATA_RAM_DEPTH) else $fatal("ERROR: AXI lite register returned wrong value"); - - axi_lite_if.peek(.addr(12), .data(rdata)); - assert(rdata == INST_RAM_DEPTH) else $fatal("ERROR: AXI lite register returned wrong value"); - - data = '{dat:in_k, pt:SCALAR}; - axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START), .len(48)); - - inst = '{code:SEND_INTERRUPT, a:16'd1, b:16'hbeef, c:16'd0}; - axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + 8), .len(8)); - - - // Write slot 0 to start - inst = '{code:FP_FPOINT_MULT, a:16'd0, b:16'd1, c:16'd0}; - axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START), .len(8)); - - fork - begin - out_if.get_stream(get_dat, get_len, 50); - interrupt_rpl = get_dat; - - assert(interrupt_rpl.hdr.cmd == BLS12_381_INTERRUPT_RPL) else $fatal(1, "ERROR: Received non-interrupt message"); - assert(interrupt_rpl.index == 16'hbeef) else
$fatal(1, "ERROR: Received wrong index value in message"); - assert(interrupt_rpl.data_type == FP_JB) else $fatal(1, "ERROR: Received wrong data type value in message"); - - get_dat = get_dat >> $bits(bls12_381_interrupt_rpl_t); - - for (int i = 0; i < 3; i++) - out_p[i*381 +: 381] = get_dat[i*(48*8) +: 381]; - - if (to_affine(out_p) == to_affine(exp_p)) begin - $display("INFO: Output point matched expected:"); - print_jb_point(out_p); - end else begin - $display("ERROR: Output point did NOT match expected:"); - print_jb_point(out_p); - $display("Expected:"); - print_jb_point(exp_p); - failed = 1; - end - end - begin - repeat(1000000) @(posedge out_if.i_clk); - $fatal("ERROR: Timeout while waiting for result"); - end - join_any - disable fork; - - axi_lite_if.peek(.addr(32'h14), .data(rdata)); - $display("INFO: Last cycle count was %d", rdata); - - if(failed) - $fatal(1, "ERROR: test_fp_fpoint_mult FAILED"); - else - $display("INFO: test_fp_fpoint_mult PASSED"); -end -endtask; - -task test_fp2_fpoint_mult(); -begin - integer signed get_len; - logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat; - inst_t inst; - logic failed; - data_t data; - logic [31:0] rdata; - fp2_jb_point_t out_p, exp_p; - logic [DAT_BITS-1:0] in_k; - bls12_381_interrupt_rpl_t interrupt_rpl; - - failed = 0; - in_k = 381'h33333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333; - exp_p = fp2_point_mult(in_k, g2_point); - $display("Running test_fp2_fpoint_mult..."); - - // See what current instruction pointer is - axi_lite_if.peek(.addr(32'h10), .data(rdata)); - - data = '{dat:in_k, pt:SCALAR}; - axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + 64), .len(48)); // Scalar to multiply by goes in data slot 1 - - inst = '{code:SEND_INTERRUPT, a:16'd3, b:16'habcd, c:16'd0}; - axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+1)*8), .len(8)); - - // Write to current slot to start - inst = '{code:FP2_FPOINT_MULT, a:16'd1, 
b:16'd3, c:16'd0}; - axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata)*8), .len(8)); - - fork - begin - out_if.get_stream(get_dat, get_len, 0); - interrupt_rpl = get_dat; - - assert(interrupt_rpl.hdr.cmd == BLS12_381_INTERRUPT_RPL) else $fatal(1, "ERROR: Received non-interrupt message"); - assert(interrupt_rpl.index == 16'habcd) else $fatal(1, "ERROR: Received wrong index value in message"); - assert(interrupt_rpl.data_type == FP2_JB) else $fatal(1, "ERROR: Received wrong data type value in message"); - - get_dat = get_dat >> $bits(bls12_381_interrupt_rpl_t); - - for (int i = 0; i < 6; i++) - out_p[i*381 +: 381] = get_dat[i*(48*8) +: 381]; - - if (fp2_to_affine(out_p) == fp2_to_affine(exp_p)) begin - $display("INFO: Output point matched expected:"); - print_fp2_jb_point(out_p); - end else begin - $display("ERROR: Output point did NOT match expected:"); - print_fp2_jb_point(out_p); - $display("Expected:"); - print_fp2_jb_point(exp_p); - failed = 1; - end - end - begin - repeat(1000000) @(posedge out_if.i_clk); - $fatal("ERROR: Timeout while waiting for result"); - end - join_any - disable fork; - - axi_lite_if.peek(.addr(32'h14), .data(rdata)); - $display("INFO: Last cycle count was %d", rdata); - - // See what current instruction pointer is - axi_lite_if.peek(.addr(32'h10), .data(rdata)); - - $display("INFO: Current instruction pointer is 0x%x, setting to 0 and writing NULL instruction", rdata); - - inst = '{code:NOOP_WAIT, a:16'd0, b:16'h0, c:16'd0}; - axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START), .len(8)); - - axi_lite_if.poke(.addr(32'h10), .data(32'd0)); - repeat(10) @(posedge clk); - axi_lite_if.peek(.addr(32'h10), .data(rdata)); - assert(rdata == 32'd0) else $fatal(1, "ERROR: could not set instruction pointer"); - - if(failed) - $fatal(1, "ERROR: test_fp2_fpoint_mult FAILED"); - else - $display("INFO: test_fp2_fpoint_mult PASSED"); -end -endtask; - task test_inv_element(); integer signed get_len; logic 
[common_pkg::MAX_SIM_BYTS*8-1:0] get_dat; @@ -232,7 +63,7 @@ task test_inv_element(); bls12_381_interrupt_rpl_t interrupt_rpl; failed = 0; - in = random_vector(384/8) % P; + in = (1 + random_vector(384/8)) % P; exp = fe_inv(in); $display("Running test_inv_element..."); $display("First trying FE element ..."); @@ -766,9 +597,9 @@ task test_control_logic(); failed = 0; in = 0; in[7:0] = $random(); - + $display("Running test_control_logic..."); - + //Reset the RAM axi_lite_if.poke(.addr(32'h0), .data(2'b11)); @@ -780,7 +611,7 @@ task test_control_logic(); axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + 1*64), .len(48)); data = '{dat:381'd1, pt:SCALAR}; axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + 2*64), .len(48)); - + inst = '{code:SEND_INTERRUPT, a:16'd3, b:16'h8888, c:16'd0}; axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + 7*8), .len(8)); @@ -797,7 +628,7 @@ task test_control_logic(); axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + 4*8), .len(8)); axi_lite_if.poke(.addr(32'h10), .data(1)); - + fork begin out_if.get_stream(get_dat, get_len, 0); @@ -839,19 +670,145 @@ task test_control_logic(); endtask; +task test_multi_pairing(); /* Multi-pairing check through the instruction interface: programs two MILLER_LOOP instructions on the same G1 and G2 points, one MUL_ELEMENT, one FINAL_EXP and a SEND_INTERRUPT, then compares the Fp12 value carried by the interrupt reply with the same sequence computed by miller_loop(), fe12_mul() and final_exponent() in the testbench. */ +begin + integer signed get_len; + logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat; + inst_t inst; + logic failed; + data_t data; + logic [31:0] rdata; + logic [DAT_BITS-1:0] in_k; + bls12_381_interrupt_rpl_t interrupt_rpl; + fe12_t f_out, f_exp0, f_exp1; + af_point_t G1_p; + fp2_af_point_t G2_p; + fp2_jb_point_t R; + failed = 0; + + G1_p = {Gy, Gx}; + G2_p = {bls12_381_pkg::G2y, bls12_381_pkg::G2x}; + + miller_loop(G1_p, G2_p, f_exp0); + miller_loop(G1_p, G2_p, f_exp1); + f_exp0 = fe12_mul(f_exp0, f_exp1); + final_exponent(f_exp0); /* f_exp0 is the value the DUT output is compared against below */ + + $display("Running test_multi_pairing..."); + + // See what current instruction pointer is + axi_lite_if.peek(.addr(32'h10), .data(rdata)); + + // First load generator points into memory + // G1 = ((1 << DATA_RAM_DEPTH) -1 -6) + //
G2 = ((1 << DATA_RAM_DEPTH) -1 -4) + + // G1 + data = '{dat:G1_p.x, pt:FP_AF}; + axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -6)*64), .len(48)); + data = '{dat:G1_p.y, pt:FP_AF}; + axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -5)*64), .len(48)); + + // G2 + data = '{dat:G2_p.x[0], pt:FP2_AF}; + axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -4)*64), .len(48)); + data = '{dat:G2_p.x[1], pt:FP2_AF}; + axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -3)*64), .len(48)); + + data = '{dat:G2_p.y[0], pt:FP2_AF}; + axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -2)*64), .len(48)); + data = '{dat:G2_p.y[1], pt:FP2_AF}; + axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -1)*64), .len(48)); + + // Program instruction memory + + // Do two miller loops + inst = '{code:MILLER_LOOP, a:((1 << DATA_RAM_DEPTH) -1 -6), b:((1 << DATA_RAM_DEPTH) -1 -4), c:16'd0}; + axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+1)*8), .len(8)); + inst = '{code:MILLER_LOOP, a:((1 << DATA_RAM_DEPTH) -1 -6), b:((1 << DATA_RAM_DEPTH) -1 -4), c:16'd12}; /* same operands as the first loop, but c is 12 instead of 0; NOTE(review): assumes field c is the destination slot, confirm against the instruction format */ + axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+2)*8), .len(8)); + + // Multiply result + inst = '{code:MUL_ELEMENT , a:16'd0, b:16'd12, c:16'd0}; /* operands 0 and 12 are the c values used by the two MILLER_LOOP instructions */ + axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+3)*8), .len(8)); + + // Do final exp.
+ inst = '{code:FINAL_EXP , a:16'd0, b:16'd0, c:16'd0}; + axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+4)*8), .len(8)); + + inst = '{code:SEND_INTERRUPT, a:16'd0, b:16'h4321, c:16'd0}; + axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+5)*8), .len(8)); + + axi_lite_if.poke(.addr(32'h10), .data(rdata+1)); /* point the instruction pointer register at the first instruction written above */ + + fork + begin + out_if.get_stream(get_dat, get_len, 0); + interrupt_rpl = get_dat; + + assert(interrupt_rpl.hdr.cmd == BLS12_381_INTERRUPT_RPL) else $fatal(1, "ERROR: Received non-interrupt message"); + assert(interrupt_rpl.index == 16'h4321) else $fatal(1, "ERROR: Received wrong index value in message"); + assert(interrupt_rpl.data_type == FE12) else $fatal(1, "ERROR: Received wrong data type value in message"); + + get_dat = get_dat >> $bits(bls12_381_interrupt_rpl_t); /* drop the reply header so the payload starts at bit 0 */ + + for (int i = 0; i < 2; i++) + for (int j = 0; j < 3; j++) + for (int k = 0; k < 2; k++) + f_out[i][j][k] = get_dat[(i*6+j*2+k)*(48*8) +: 381]; + + if (f_out == f_exp0) begin + $display("INFO: Output matched expected:"); + print_fe12(f_out); + end else begin + $display("ERROR: Output did NOT match expected:"); + print_fe12(f_out); + $display("Expected:"); + print_fe12(f_exp0); + failed = 1; + end + end + begin + repeat(1000000) @(posedge out_if.i_clk); + $fatal(1, "ERROR: Timeout while waiting for result"); /* finish_number is now passed explicitly, as in the other $fatal calls of this task */ + end + join_any + disable fork; /* stop whichever branch is still running once the other has finished */ + + axi_lite_if.peek(.addr(32'h14), .data(rdata)); + $display("INFO: Last cycle count was %d", rdata); + + // See what current instruction pointer is + axi_lite_if.peek(.addr(32'h10), .data(rdata)); + + $display("INFO: Current instruction pointer is 0x%x, setting to 0 and writing NULL instruction", rdata); + + inst = '{code:NOOP_WAIT, a:16'd0, b:16'h0, c:16'd0}; + axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START), .len(8)); + + axi_lite_if.poke(.addr(32'h10), .data(32'd0)); + repeat(10) @(posedge clk); + axi_lite_if.peek(.addr(32'h10), .data(rdata)); + assert(rdata == 32'd0) else $fatal(1,
"ERROR: could not set instruction pointer"); + + if(failed) + $fatal(1, "ERROR: test_multi_pairing FAILED"); + else + $display("INFO: test_multi_pairing PASSED"); +end +endtask; + initial begin axi_lite_if.reset_source(); out_if.rdy = 0; #100ns; - test_fp_fpoint_mult(); - test_fp2_fpoint_mult(); test_inv_element(); test_mul_add_sub_element(); test_point_mult(); test_pairing(); test_control_logic(); - + test_multi_pairing(); #1us $finish(); end