Update bls12_381 processor to version 1.3: the Miller loop and final exponentiation are now split into separate functions to support multi-pairing.
HowToLoveChina · Aug 24, 2019 · 6d540f0 · 6d540f0
1 parent 3a8c799
commit 6d540f0
Show file tree

Hide file tree

Showing 9 changed files with 501 additions and 398 deletions.
diff --git a/README.md b/README.md
@@ -67,5 +67,5 @@ It optionally contains the following top-level engines (you can include in a bui
   - General arithmetic over bls12-381 curve
   - Dual Point multiplication in Fp and Fp^2 (G1 and G2)
   - Frobenius map operations
-  - The ate pairing
+  - The optimal ate pairing
     - Miller loop and final exponentiation stage
diff --git a/aws/cl_zcash/software/runtime/zcash_fpga.hpp b/aws/cl_zcash/software/runtime/zcash_fpga.hpp
@@ -88,11 +88,10 @@ class zcash_fpga {
       MUL_ELEMENT     = 0x12,
       INV_ELEMENT     = 0x13,
 
-      POINT_MULT      = 0x24,
-      FP_FPOINT_MULT  = 0x25,
-      FP2_FPOINT_MULT = 0x26,
-
-      ATE_PAIRING     = 0x28
+      POINT_MULT      = 0x20,
+      MILLER_LOOP     = 0x21,
+      FINAL_EXP       = 0x22,
+      ATE_PAIRING     = 0x23
     } bls12_381_code_t;
 
     // Instruction format

diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
@@ -1,6 +1,6 @@
 /*
   This is the top level for the bls12-381 pairing engine.
-  It performs both the miller loop and final exponentiation required for ate pairing (G2 x G1).
+  It performs both the miller loop and final exponentiation required for optimal ate pairing w = e(Q, P), where Q is in G2, P is in G1.
   Inputs are points in G1 and G2 (affine coordinates)
   Output is a Fp12 element.
 
@@ -40,14 +40,11 @@ module bls12_381_pairing
 )(
   input i_clk, i_rst,
   // Inputs
-  input                i_val,
-  input                i_mode,  // 0 == ate pairing, 1 == only point multiplication
+  input [1:0]          i_mode,  // 0 == ate pairing, 1 == only point multiplication, 2 == only miller loop, 3 == only final exponentiation
   input FE_TYPE        i_key,   // Input key when in mode == 1
-  output logic         o_rdy,
-  input G1_FP_AF_TYPE  i_g1_af,
-  input G2_FP_AF_TYPE  i_g2_af,
-  if_axi_stream.source o_fe12_if,
-  if_axi_stream.source o_p_jb_if,     // Output point if we did a point multiplication
+  if_axi_stream.sink   i_pair_af_if,  // Input for G1 and G2 points, Input fe_12 for final exponentiation when mode == 3
+  if_axi_stream.source o_fe12_if,     // Result fe12 of ate pairing / final exponentiation (if mode was 0/3)
+  if_axi_stream.source o_p_jb_if,     // Result of point multiplication / miller loop  (if mode was 1/2)
   // Interface to FE_TYPE multiplier (mod P)
   if_axi_stream.source o_mul_fe_if,
   if_axi_stream.sink   i_mul_fe_if,
@@ -94,7 +91,7 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) final_exp_fe12_o
 logic dbl_i_val, dbl_o_rdy;
 logic add_i_val, add_o_rdy;
 
-logic wait_dbl, wait_add, stage_done;
+logic wait_dbl, wait_add;
 
 G1_FP_AF_TYPE g1_af_i;
 G2_FP_JB_TYPE g2_r_jb_i, add_g2_o, dbl_g2_o;
@@ -106,12 +103,13 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   dbl_f12_o_if (
 logic [$clog2($bits(FE_TYPE))-1:0] ate_loop_cnt;
 logic [1:0] miller_mult_cnt;
 
-enum {IDLE, POINT_MULT_DBL, POINT_MULT_ADD, POINT_MULT_DONE, MILLER_LOOP, FINAL_EXP} pair_state;
+enum {IDLE, INPUT_LOAD0, INPUT_LOAD1, POINT_MULT_DBL, POINT_MULT_ADD, POINT_MULT_DONE, MILLER_LOOP, MILLER_ONLY_DONE, FINAL_EXP} pair_state;
 
 FE12_TYPE f;
 logic f_val;
 logic [3:0] out_cnt;
-logic point_mul_mode, found_one;
+logic found_one;
+logic [1:0] mode;
 
 FE_TYPE key;
 
@@ -129,12 +127,12 @@ always_ff @ (posedge i_clk) begin
     final_exp_fe12_o_if.eop <= 0;
     g1_af_i <= 0;
     g2_r_jb_i <= 0;
+    g2_af_i <= 0;
     mul_fe12_i_if[0].rdy <= 0;
     mul_fe12_o_if[0].reset_source();
     pair_state <= IDLE;
     add_i_val <= 0;
     dbl_i_val <= 0;
-    o_rdy <= 0;
     wait_dbl <= 0;
     wait_add <= 0;
     miller_mult_cnt <= 0;
@@ -143,17 +141,18 @@ always_ff @ (posedge i_clk) begin
     f <= FE12_one;
     f_val <= 0;
     out_cnt <= 0;
-    point_mul_mode <= 0;
+    mode <= 0;
 
     key <= 0;
     found_one <= 0;
-    stage_done <= 0;
-
+
     o_p_jb_if.reset_source();
-    
+
     dbl_f12_o_if.rdy <= 0;
     add_f12_o_if.rdy <= 0;
 
+    i_pair_af_if.rdy <= 0;
+
   end else begin
 
     if (add_o_rdy) add_i_val <= 0;
@@ -166,45 +165,66 @@ always_ff @ (posedge i_clk) begin
       f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]};
       f_val <= mul_fe12_i_if[0].eop;
     end
-    
+
     dbl_f12_o_if.rdy <= 0;
     add_f12_o_if.rdy <= 0;
 
     case(pair_state)
       IDLE: begin
-        ate_loop_cnt <= i_mode == 0 ? ATE_X_START-1 : $bits(FE_TYPE)-1;
+        ate_loop_cnt <= i_mode == 0 || i_mode == 2 ? ATE_X_START-1 : $bits(FE_TYPE)-1;
         f <= FE12_one;
         add_i_val <= 0;
         dbl_i_val <= 0;
         wait_dbl <= 0;
         wait_add <= 0;
         out_cnt <= 0;
         f_val <= 0;
-        o_rdy <= 1;
         miller_mult_cnt <= 0;
         found_one <= 0;
-        stage_done <= 0;
-        if (i_val && o_rdy) begin
-          pair_state <= i_mode == 0 ? MILLER_LOOP : POINT_MULT_DBL;
-          key <= i_key;
-          point_mul_mode <= i_mode;
-          o_rdy <= 0;
+        i_pair_af_if.rdy <= 0;
 
-          g1_af_i <= i_g1_af;
-          g2_af_i <= i_g2_af;
+        g2_r_jb_i.x <= 0;
+        g2_r_jb_i.y <= 0;
+        g2_r_jb_i.z <= 1;
 
-          g2_r_jb_i.x <= i_g2_af.x;
-          g2_r_jb_i.y <= i_g2_af.y;
-          g2_r_jb_i.z <= 1;
+        if (i_pair_af_if.val) begin
+          pair_state <= INPUT_LOAD0;
+          key <= i_key;
+          mode <= i_mode;
+          i_pair_af_if.rdy <= 1;
+        end
+      end
+      INPUT_LOAD0: begin
+        if (i_pair_af_if.eop && i_pair_af_if.val && i_pair_af_if.rdy) i_pair_af_if.rdy <= 0;
+        if (i_pair_af_if.val && i_pair_af_if.rdy) begin
+          if (mode == 1) begin
+            g2_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_af_i.y, g2_af_i.x[1]};
+            {g2_r_jb_i.y, g2_r_jb_i.x} <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_r_jb_i.y, g2_r_jb_i.x[1]};
+            if (i_pair_af_if.eop) pair_state <= POINT_MULT_DBL;
+          end else
+          if (mode == 3) begin
+            f <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], f[1], f[0][2:1], f[0][0][1]};
+            if (i_pair_af_if.eop) pair_state <= FINAL_EXP;
+          end else begin
+            g1_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g1_af_i.y};
+            if (i_pair_af_if.eop) pair_state <= INPUT_LOAD1;
+            i_pair_af_if.rdy <= 1;
+          end
+        end
+      end
+      INPUT_LOAD1: begin
+        if (i_pair_af_if.eop && i_pair_af_if.val && i_pair_af_if.rdy) i_pair_af_if.rdy <= 0;
+        if (i_pair_af_if.val && i_pair_af_if.rdy) begin
+          g2_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_af_i.y, g2_af_i.x[1]};
+          {g2_r_jb_i.y, g2_r_jb_i.x} <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_r_jb_i.y, g2_r_jb_i.x[1]};
+          if (i_pair_af_if.eop) pair_state <= MILLER_LOOP;
         end
       end
       MILLER_LOOP: begin
-
         if (~wait_dbl) begin
           dbl_i_val <= 1;
           wait_dbl <= 1;
         end
-
         if (wait_dbl && dbl_f12_o_if.val && dbl_f12_o_if.sop && dbl_f12_o_if.rdy) begin
           g2_r_jb_i <= dbl_g2_o;
           if (~wait_add && ATE_X[ate_loop_cnt] == 1) begin
@@ -242,18 +262,18 @@ always_ff @ (posedge i_clk) begin
                   0,1,4: mul_fe12_o_if[0].dat <= {dbl_f12_o_if.dat, f[0][0][0]};
                   default: mul_fe12_o_if[0].dat <= {381'd0, f[0][0][0]};
                 endcase
-                
+
                 out_cnt <= out_cnt + 1;
                 f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]};
                 if (out_cnt == 11) begin
                   f_val <= 0;
                   out_cnt <= 0;
                   miller_mult_cnt <= ATE_X[ate_loop_cnt] == 0 ? 3 : 2;
-                end                  
-                 
+                end
+
                 mul_fe12_o_if[0].ctl <= miller_mult_cnt;
                 mul_fe12_o_if[0].ctl[SQ_BIT] <= 0;
-                
+
               end
             end
           end
@@ -290,7 +310,7 @@ always_ff @ (posedge i_clk) begin
               miller_mult_cnt <= 0;
               ate_loop_cnt <= ate_loop_cnt - 1;
               if (ate_loop_cnt == 0) begin
-                pair_state <= FINAL_EXP;
+                pair_state <= mode == 0 ? FINAL_EXP : MILLER_ONLY_DONE;
               end
             end
           end
@@ -352,15 +372,30 @@ always_ff @ (posedge i_clk) begin
             key <= key << 1;
             pair_state <= POINT_MULT_DBL;
           end
-        end      
+        end
+      end
+      MILLER_ONLY_DONE: begin
+        if (~o_p_jb_if.val || (o_p_jb_if.val && o_p_jb_if.rdy)) begin
+          o_p_jb_if.val <= 1;
+          o_p_jb_if.sop <= out_cnt == 0;
+          o_p_jb_if.eop <= out_cnt == 11;
+          o_p_jb_if.dat <= f[0][0][0];
+          f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]};
+          out_cnt <= out_cnt + 1;
+          if (o_p_jb_if.val && o_p_jb_if.rdy && o_p_jb_if.eop) begin
+            pair_state <= IDLE;
+            out_cnt <= 0;
+            o_p_jb_if.val <= 0;
+          end
+        end
       end
       POINT_MULT_DONE: begin
         if (~o_p_jb_if.val || (o_p_jb_if.val && o_p_jb_if.rdy)) begin
           o_p_jb_if.val <= 1;
           o_p_jb_if.sop <= out_cnt == 0;
           o_p_jb_if.eop <= out_cnt == 5;
           o_p_jb_if.dat <= g2_r_jb_i;
-          
+
           out_cnt <= out_cnt + 1;
           g2_r_jb_i <= g2_r_jb_i >> $bits(FE_TYPE);
           if (o_p_jb_if.val && o_p_jb_if.rdy && o_p_jb_if.eop) begin
@@ -386,7 +421,7 @@ bls12_381_pairing_miller_dbl (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
   .i_val            ( dbl_i_val      ),
-  .i_point_mul_mode ( point_mul_mode ),
+  .i_point_mul_mode ( mode == 1      ),
   .o_rdy            ( dbl_o_rdy      ),
   .i_g1_af          ( g1_af_i        ),
   .i_g2_jb          ( g2_r_jb_i      ),
@@ -413,8 +448,8 @@ bls12_381_pairing_miller_add #(
 bls12_381_pairing_miller_add (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
-  .i_val            ( add_i_val     ),
-  .i_point_mul_mode ( point_mul_mode ),
+  .i_val            ( add_i_val      ),
+  .i_point_mul_mode ( mode == 1      ),
   .o_rdy            ( add_o_rdy      ),
   .i_g1_af          ( g1_af_i        ),
   .i_g2_jb          ( dbl_g2_o       ),

diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv
@@ -32,23 +32,22 @@ module bls12_381_pairing_wrapper
 )(
   input i_clk, i_rst,
   // Inputs
-  input               i_val,
-  output logic        o_rdy,
-  input G1_FP_AF_TYPE i_g1_af,    // G1 input point
-  input G2_FP_AF_TYPE i_g2_af,    // G2 input point
-  input                i_mode,    // 0 == ate pairing, 1 == only point multiplication
-  input FE_TYPE        i_key,     // Input key when in mode == 1
-  if_axi_stream.source o_fe12_if, // Result fe12 of ate pairing (or point mult)
-  if_axi_stream.source o_p_jb_if, // Result of point multiplication
+  if_axi_stream.sink   i_pair_af_if, // G1 and G2 input point - or Fe12 element if we are only performing the final exponentiation
+  input [1:0]          i_mode,       // 0 == ate pairing, 1 == only point multiplication, 2 == only miller loop, 3 == only final exponentiation
+  input FE_TYPE        i_key,        // Input key when in mode == 1
+  if_axi_stream.source o_fe12_if,    // Result fe12 of ate pairing / final exponentiation (if mode was 0/3)
+  if_axi_stream.source o_p_jb_if,    // Result of point multiplication / miller loop  (if mode was 1/2)
   // Interface to FE_TYPE multiplier (mod P)
   if_axi_stream.source o_mul_fe_if,
   if_axi_stream.sink   i_mul_fe_if,
+  // Interface to FE12_TYPE multiplier (mod P) (Implemented internally)
+  if_axi_stream.source o_mul_fe12_if,
+  if_axi_stream.sink   i_mul_fe12_if,
   // We provide interfaces to the inversion module
   if_axi_stream.source o_inv_fe2_if,
   if_axi_stream.sink   i_inv_fe2_if,
   if_axi_stream.source o_inv_fe_if,
   if_axi_stream.sink   i_inv_fe_if
-
 );
 
 if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if  [3:0] (i_clk);
@@ -68,8 +67,8 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe6_i_if
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_o_if  [2:0] (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_i_if  [2:0] (i_clk);
 
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [2:0] (i_clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [3:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if [3:0] (i_clk);
 
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   pow_fe12_o_if       (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   pow_fe12_i_if       (i_clk);
@@ -80,6 +79,14 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   fmap_fe12_i_if
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe12_o_if       (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe12_i_if       (i_clk);
 
+always_comb begin
+  i_mul_fe12_if.rdy = mul_fe12_o_if[2].rdy;
+  mul_fe12_o_if[2].copy_if_comb(i_mul_fe12_if.dat, i_mul_fe12_if.val, i_mul_fe12_if.sop, i_mul_fe12_if.eop, i_mul_fe12_if.err, i_mul_fe12_if.mod, i_mul_fe12_if.ctl);
+
+  mul_fe12_i_if[2].rdy = o_mul_fe12_if.rdy;
+  o_mul_fe12_if.copy_if_comb(mul_fe12_i_if[2].dat, mul_fe12_i_if[2].val, mul_fe12_i_if[2].sop, mul_fe12_i_if[2].eop, mul_fe12_i_if[2].err, mul_fe12_i_if[2].mod, mul_fe12_i_if[2].ctl);
+end
+
 bls12_381_pairing #(
   .FE_TYPE     ( FE_TYPE   ),
   .FE2_TYPE    ( FE2_TYPE  ),
@@ -93,10 +100,7 @@ bls12_381_pairing #(
 bls12_381_pairing (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
-  .i_val ( i_val ),
-  .o_rdy ( o_rdy ),
-  .i_g1_af ( i_g1_af ),
-  .i_g2_af ( i_g2_af ),
+  .i_pair_af_if ( i_pair_af_if ),
   .i_mode  ( i_mode  ),
   .i_key   ( i_key   ),
   .o_fe12_if      ( o_fe12_if        ),
@@ -262,8 +266,8 @@ ec_fe12_mul_s (
   .i_sub_fe_if    ( sub_fe_i_if[3]   ),
   .o_mnr_fe6_if   ( mnr_fe6_o_if[0]  ),
   .i_mnr_fe6_if   ( mnr_fe6_i_if[0]  ),
-  .o_mul_fe12_if  ( mul_fe12_i_if[2] ),
-  .i_mul_fe12_if  ( mul_fe12_o_if[2] )
+  .o_mul_fe12_if  ( mul_fe12_i_if[3] ),
+  .i_mul_fe12_if  ( mul_fe12_o_if[3] )
 );
 
 adder_pipe # (
@@ -378,7 +382,7 @@ resource_share_fe6_mul (
 );
 
 resource_share # (
-  .NUM_IN       ( 2                ),
+  .NUM_IN       ( 3                ),
   .DAT_BITS     ( 2*$bits(FE_TYPE) ),
   .CTL_BITS     ( CTL_BITS         ),
   .OVR_WRT_BIT  ( OVR_WRT_BIT + 42 ), // 2 bits
@@ -388,10 +392,10 @@ resource_share # (
 resource_share_fe12_mul (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
-  .i_axi ( mul_fe12_o_if[1:0] ),
-  .o_res ( mul_fe12_o_if[2]   ),
-  .i_res ( mul_fe12_i_if[2]   ),
-  .o_axi ( mul_fe12_i_if[1:0] )
+  .i_axi ( mul_fe12_o_if[2:0] ),
+  .o_res ( mul_fe12_o_if[3]   ),
+  .i_res ( mul_fe12_i_if[3]   ),
+  .o_axi ( mul_fe12_i_if[2:0] )
 );
 
 resource_share # (