From 6d540f04913cff751ef5befa96bc7a0d37ce1927 Mon Sep 17 00:00:00 2001
From: bsdevlin <bsdevlin@LAPTOP-AL12KDU7>
Date: Sun, 25 Aug 2019 00:30:08 +0800
Subject: [PATCH] Update bls12_381 processor to version 1.3, now has split
 functions for miller loop and final exponentiation, to support multi-pairing.

---
 README.md                                     |   2 +-
 aws/cl_zcash/software/runtime/zcash_fpga.hpp  |   9 +-
 .../src/rtl/bls12_381/bls12_381_pairing.sv    | 119 ++++---
 .../bls12_381/bls12_381_pairing_wrapper.sv    |  48 +--
 zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv |  10 +-
 zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv | 263 ++++++++-------
 zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv      |   2 +-
 zcash_fpga/src/tb/bls12_381_pairing_tb.sv     | 135 ++++++--
 zcash_fpga/src/tb/bls12_381_top_tb.sv         | 311 ++++++++----------
 9 files changed, 501 insertions(+), 398 deletions(-)

diff --git a/README.md b/README.md
index 36f1c13..e1c2709 100644
--- a/README.md
+++ b/README.md
@@ -67,5 +67,5 @@ It optionally contains the following top-level engines (you can include in a bui
   - General arithmetic over bls12-381 curve
   - Dual Point multiplication in Fp and Fp^2 (G1 and G2)
   - Frobenius map operations
-  - The ate pairing
+  - The optimal ate pairing
     - Miller loop and final exponentiation stage
diff --git a/aws/cl_zcash/software/runtime/zcash_fpga.hpp b/aws/cl_zcash/software/runtime/zcash_fpga.hpp
index f3f66d3..961355b 100644
--- a/aws/cl_zcash/software/runtime/zcash_fpga.hpp
+++ b/aws/cl_zcash/software/runtime/zcash_fpga.hpp
@@ -88,11 +88,10 @@ class zcash_fpga {
       MUL_ELEMENT     = 0x12,
       INV_ELEMENT     = 0x13,
 
-      POINT_MULT      = 0x24,
-      FP_FPOINT_MULT  = 0x25,
-      FP2_FPOINT_MULT = 0x26,
-
-      ATE_PAIRING     = 0x28
+      POINT_MULT      = 0x20,
+      MILLER_LOOP     = 0x21,
+      FINAL_EXP       = 0x22,
+      ATE_PAIRING     = 0x23
     } bls12_381_code_t;
 
     // Instruction format
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
index 3b642c6..e7e9728 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing.sv
@@ -1,6 +1,6 @@
 /*
   This is the top level for the bls12-381 pairing engine.
-  It performs both the miller loop and final exponentiation required for ate pairing (G2 x G1).
+  It performs both the miller loop and final exponentiation required for optimal ate pairing w = e(Q, P), where Q is in G2, P is in G1.
   Inputs are points in G1 and G2 (affine coordinates)
   Output is a Fp12 element.
 
@@ -40,14 +40,11 @@ module bls12_381_pairing
 )(
   input i_clk, i_rst,
   // Inputs
-  input                i_val,
-  input                i_mode,  // 0 == ate pairing, 1 == only point multiplication
+  input [1:0]          i_mode,  // 0 == ate pairing, 1 == only point multiplication, 2 == only miller loop, 3 == only final exponentiation
   input FE_TYPE        i_key,   // Input key when in mode == 1
-  output logic         o_rdy,
-  input G1_FP_AF_TYPE  i_g1_af,
-  input G2_FP_AF_TYPE  i_g2_af,
-  if_axi_stream.source o_fe12_if,
-  if_axi_stream.source o_p_jb_if,     // Output point if we did a point multiplication
+  if_axi_stream.sink   i_pair_af_if,  // Input for G1 and G2 points, Input fe_12 for final exponentiation when mode == 3
+  if_axi_stream.source o_fe12_if,     // Result fe12 of ate pairing / final exponentiation (if mode was 0/3)
+  if_axi_stream.source o_p_jb_if,     // Result of point multiplication / miller loop  (if mode was 1/2)
   // Interface to FE_TYPE multiplier (mod P)
   if_axi_stream.source o_mul_fe_if,
   if_axi_stream.sink   i_mul_fe_if,
@@ -94,7 +91,7 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) final_exp_fe12_o
 logic dbl_i_val, dbl_o_rdy;
 logic add_i_val, add_o_rdy;
 
-logic wait_dbl, wait_add, stage_done;
+logic wait_dbl, wait_add;
 
 G1_FP_AF_TYPE g1_af_i;
 G2_FP_JB_TYPE g2_r_jb_i, add_g2_o, dbl_g2_o;
@@ -106,12 +103,13 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   dbl_f12_o_if (
 logic [$clog2($bits(FE_TYPE))-1:0] ate_loop_cnt;
 logic [1:0] miller_mult_cnt;
 
-enum {IDLE, POINT_MULT_DBL, POINT_MULT_ADD, POINT_MULT_DONE, MILLER_LOOP, FINAL_EXP} pair_state;
+enum {IDLE, INPUT_LOAD0, INPUT_LOAD1, POINT_MULT_DBL, POINT_MULT_ADD, POINT_MULT_DONE, MILLER_LOOP, MILLER_ONLY_DONE, FINAL_EXP} pair_state;
 
 FE12_TYPE f;
 logic f_val;
 logic [3:0] out_cnt;
-logic point_mul_mode, found_one;
+logic found_one;
+logic [1:0] mode;
 
 FE_TYPE key;
 
@@ -129,12 +127,12 @@ always_ff @ (posedge i_clk) begin
     final_exp_fe12_o_if.eop <= 0;
     g1_af_i <= 0;
     g2_r_jb_i <= 0;
+    g2_af_i <= 0;
     mul_fe12_i_if[0].rdy <= 0;
     mul_fe12_o_if[0].reset_source();
     pair_state <= IDLE;
     add_i_val <= 0;
     dbl_i_val <= 0;
-    o_rdy <= 0;
     wait_dbl <= 0;
     wait_add <= 0;
     miller_mult_cnt <= 0;
@@ -143,17 +141,18 @@ always_ff @ (posedge i_clk) begin
     f <= FE12_one;
     f_val <= 0;
     out_cnt <= 0;
-    point_mul_mode <= 0;
+    mode <= 0;
 
     key <= 0;
     found_one <= 0;
-    stage_done <= 0;
-    
+
     o_p_jb_if.reset_source();
-    
+
     dbl_f12_o_if.rdy <= 0;
     add_f12_o_if.rdy <= 0;
 
+    i_pair_af_if.rdy <= 0;
+
   end else begin
 
     if (add_o_rdy) add_i_val <= 0;
@@ -166,13 +165,13 @@ always_ff @ (posedge i_clk) begin
       f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]};
       f_val <= mul_fe12_i_if[0].eop;
     end
-    
+
     dbl_f12_o_if.rdy <= 0;
     add_f12_o_if.rdy <= 0;
 
     case(pair_state)
       IDLE: begin
-        ate_loop_cnt <= i_mode == 0 ? ATE_X_START-1 : $bits(FE_TYPE)-1;
+        ate_loop_cnt <= i_mode == 0 || i_mode == 2 ? ATE_X_START-1 : $bits(FE_TYPE)-1;
         f <= FE12_one;
         add_i_val <= 0;
         dbl_i_val <= 0;
@@ -180,31 +179,52 @@ always_ff @ (posedge i_clk) begin
         wait_add <= 0;
         out_cnt <= 0;
         f_val <= 0;
-        o_rdy <= 1;
         miller_mult_cnt <= 0;
         found_one <= 0;
-        stage_done <= 0;
-        if (i_val && o_rdy) begin
-          pair_state <= i_mode == 0 ? MILLER_LOOP : POINT_MULT_DBL;
-          key <= i_key;
-          point_mul_mode <= i_mode;
-          o_rdy <= 0;
+        i_pair_af_if.rdy <= 0;
 
-          g1_af_i <= i_g1_af;
-          g2_af_i <= i_g2_af;
+        g2_r_jb_i.x <= 0;
+        g2_r_jb_i.y <= 0;
+        g2_r_jb_i.z <= 1;
 
-          g2_r_jb_i.x <= i_g2_af.x;
-          g2_r_jb_i.y <= i_g2_af.y;
-          g2_r_jb_i.z <= 1;
+        if (i_pair_af_if.val) begin
+          pair_state <= INPUT_LOAD0;
+          key <= i_key;
+          mode <= i_mode;
+          i_pair_af_if.rdy <= 1;
+        end
+      end
+      INPUT_LOAD0: begin // First input packet: G2 point (mode 1), Fe12 element (mode 3), otherwise the G1 point (modes 0 and 2)
+        if (i_pair_af_if.eop && i_pair_af_if.val && i_pair_af_if.rdy) i_pair_af_if.rdy <= 0; // Drop rdy once the last word of the packet is accepted (overridden below for the G1 case)
+        if (i_pair_af_if.val && i_pair_af_if.rdy) begin // One FE_TYPE word is transferred this cycle
+          if (mode == 1) begin // Point multiplication only: the packet is the G2 affine point
+            g2_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_af_i.y, g2_af_i.x[1]}; // Shift the new word into the 4-word affine point register
+            {g2_r_jb_i.y, g2_r_jb_i.x} <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_r_jb_i.y, g2_r_jb_i.x[1]}; // Same shift into x/y of the working jacobian point (z was set to 1 in IDLE)
+            if (i_pair_af_if.eop) pair_state <= POINT_MULT_DBL;
+          end else
+          if (mode == 3) begin // Final exponentiation only: the packet is the Fe12 element, loaded directly into f
+            f <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], f[1], f[0][2:1], f[0][0][1]}; // One-word shift of f, new word inserted at the top
+            if (i_pair_af_if.eop) pair_state <= FINAL_EXP;
+          end else begin // Ate pairing or miller loop only: this packet is the G1 point, the G2 point follows in INPUT_LOAD1
+            g1_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g1_af_i.y};
+            if (i_pair_af_if.eop) pair_state <= INPUT_LOAD1;
+            i_pair_af_if.rdy <= 1; // Last non-blocking assignment wins: rdy stays high across the G1 eop so the G2 packet can follow
+          end
+        end
+      end
+      INPUT_LOAD1: begin
+        if (i_pair_af_if.eop && i_pair_af_if.val && i_pair_af_if.rdy) i_pair_af_if.rdy <= 0;
+        if (i_pair_af_if.val && i_pair_af_if.rdy) begin
+          g2_af_i <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_af_i.y, g2_af_i.x[1]};
+          {g2_r_jb_i.y, g2_r_jb_i.x} <= {i_pair_af_if.dat[0 +: $bits(FE_TYPE)], g2_r_jb_i.y, g2_r_jb_i.x[1]};
+          if (i_pair_af_if.eop) pair_state <= MILLER_LOOP;
         end
       end
       MILLER_LOOP: begin
-
         if (~wait_dbl) begin
           dbl_i_val <= 1;
           wait_dbl <= 1;
         end
-
         if (wait_dbl && dbl_f12_o_if.val && dbl_f12_o_if.sop && dbl_f12_o_if.rdy) begin
           g2_r_jb_i <= dbl_g2_o;
           if (~wait_add && ATE_X[ate_loop_cnt] == 1) begin
@@ -242,18 +262,18 @@ always_ff @ (posedge i_clk) begin
                   0,1,4: mul_fe12_o_if[0].dat <= {dbl_f12_o_if.dat, f[0][0][0]};
                   default: mul_fe12_o_if[0].dat <= {381'd0, f[0][0][0]};
                 endcase
-                
+
                 out_cnt <= out_cnt + 1;
                 f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]};
                 if (out_cnt == 11) begin
                   f_val <= 0;
                   out_cnt <= 0;
                   miller_mult_cnt <= ATE_X[ate_loop_cnt] == 0 ? 3 : 2;
-                end                  
-                 
+                end
+
                 mul_fe12_o_if[0].ctl <= miller_mult_cnt;
                 mul_fe12_o_if[0].ctl[SQ_BIT] <= 0;
-                
+
               end
             end
           end
@@ -290,7 +310,7 @@ always_ff @ (posedge i_clk) begin
               miller_mult_cnt <= 0;
               ate_loop_cnt <= ate_loop_cnt - 1;
               if (ate_loop_cnt == 0) begin
-                pair_state <= FINAL_EXP;
+                pair_state <= mode == 0 ? FINAL_EXP : MILLER_ONLY_DONE;
               end
             end
           end
@@ -352,7 +372,22 @@ always_ff @ (posedge i_clk) begin
             key <= key << 1;
             pair_state <= POINT_MULT_DBL;
           end
-        end      
+        end
+      end
+      MILLER_ONLY_DONE: begin // Entered after the miller loop when mode != 0: stream f out on o_p_jb_if as 12 FE_TYPE words, skipping the final exponentiation
+        if (~o_p_jb_if.val || (o_p_jb_if.val && o_p_jb_if.rdy)) begin // Output register is empty, or its current word is being accepted this cycle
+          o_p_jb_if.val <= 1;
+          o_p_jb_if.sop <= out_cnt == 0;  // First word of the result
+          o_p_jb_if.eop <= out_cnt == 11; // Twelfth (last) word of the result
+          o_p_jb_if.dat <= f[0][0][0];    // Next word to send is always taken from f[0][0][0]
+          f <= {mul_fe12_i_if[0].dat, f[1], f[0][2:1], f[0][0][1]}; // Shift f by one word so the following word moves into f[0][0][0]
+          out_cnt <= out_cnt + 1;
+          if (o_p_jb_if.val && o_p_jb_if.rdy && o_p_jb_if.eop) begin // The eop word was just accepted - finished, overrides the assignments above
+            pair_state <= IDLE;
+            out_cnt <= 0;
+            o_p_jb_if.val <= 0;
+          end
+        end
       end
       POINT_MULT_DONE: begin
         if (~o_p_jb_if.val || (o_p_jb_if.val && o_p_jb_if.rdy)) begin
@@ -360,7 +395,7 @@ always_ff @ (posedge i_clk) begin
           o_p_jb_if.sop <= out_cnt == 0;
           o_p_jb_if.eop <= out_cnt == 5;
           o_p_jb_if.dat <= g2_r_jb_i;
-          
+
           out_cnt <= out_cnt + 1;
           g2_r_jb_i <= g2_r_jb_i >> $bits(FE_TYPE);
           if (o_p_jb_if.val && o_p_jb_if.rdy && o_p_jb_if.eop) begin
@@ -386,7 +421,7 @@ bls12_381_pairing_miller_dbl (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
   .i_val            ( dbl_i_val      ),
-  .i_point_mul_mode ( point_mul_mode ),
+  .i_point_mul_mode ( mode == 1      ),
   .o_rdy            ( dbl_o_rdy      ),
   .i_g1_af          ( g1_af_i        ),
   .i_g2_jb          ( g2_r_jb_i      ),
@@ -413,8 +448,8 @@ bls12_381_pairing_miller_add #(
 bls12_381_pairing_miller_add (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
-  .i_val            ( add_i_val     ),
-  .i_point_mul_mode ( point_mul_mode ),
+  .i_val            ( add_i_val      ),
+  .i_point_mul_mode ( mode == 1      ),
   .o_rdy            ( add_o_rdy      ),
   .i_g1_af          ( g1_af_i        ),
   .i_g2_jb          ( dbl_g2_o       ),
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv
index 76d04e9..ae0b43e 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pairing_wrapper.sv
@@ -32,23 +32,22 @@ module bls12_381_pairing_wrapper
 )(
   input i_clk, i_rst,
   // Inputs
-  input               i_val,
-  output logic        o_rdy,
-  input G1_FP_AF_TYPE i_g1_af,    // G1 input point
-  input G2_FP_AF_TYPE i_g2_af,    // G2 input point
-  input                i_mode,    // 0 == ate pairing, 1 == only point multiplication
-  input FE_TYPE        i_key,     // Input key when in mode == 1
-  if_axi_stream.source o_fe12_if, // Result fe12 of ate pairing (or point mult)
-  if_axi_stream.source o_p_jb_if, // Result of point multiplication
+  if_axi_stream.sink   i_pair_af_if, // G1 and G2 input point - or Fe12 element if we are only performing the final exponentiation
+  input [1:0]          i_mode,       // 0 == ate pairing, 1 == only point multiplication, 2 == only miller loop, 3 == only final exponentiation
+  input FE_TYPE        i_key,        // Input key when in mode == 1
+  if_axi_stream.source o_fe12_if,    // Result fe12 of ate pairing / final exponentiation (if mode was 0/3)
+  if_axi_stream.source o_p_jb_if,    // Result of point multiplication / miller loop  (if mode was 1/2)
   // Interface to FE_TYPE multiplier (mod P)
   if_axi_stream.source o_mul_fe_if,
   if_axi_stream.sink   i_mul_fe_if,
+  // Interface to FE12_TYPE multiplier (mod P) (Implemented internally)
+  if_axi_stream.source o_mul_fe12_if,
+  if_axi_stream.sink   i_mul_fe12_if,
   // We provide interfaces to the inversion module
   if_axi_stream.source o_inv_fe2_if,
   if_axi_stream.sink   i_inv_fe2_if,
   if_axi_stream.source o_inv_fe_if,
   if_axi_stream.sink   i_inv_fe_if
-
 );
 
 if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if  [3:0] (i_clk);
@@ -68,8 +67,8 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe6_i_if
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_o_if  [2:0] (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mnr_fe6_i_if  [2:0] (i_clk);
 
-if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [2:0] (i_clk);
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if [2:0] (i_clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if [3:0] (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if [3:0] (i_clk);
 
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   pow_fe12_o_if       (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   pow_fe12_i_if       (i_clk);
@@ -80,6 +79,14 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   fmap_fe12_i_if
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe12_o_if       (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe12_i_if       (i_clk);
 
+always_comb begin
+  i_mul_fe12_if.rdy = mul_fe12_o_if[2].rdy;
+  mul_fe12_o_if[2].copy_if_comb(i_mul_fe12_if.dat, i_mul_fe12_if.val, i_mul_fe12_if.sop, i_mul_fe12_if.eop, i_mul_fe12_if.err, i_mul_fe12_if.mod, i_mul_fe12_if.ctl);
+
+  mul_fe12_i_if[2].rdy = o_mul_fe12_if.rdy;
+  o_mul_fe12_if.copy_if_comb(mul_fe12_i_if[2].dat, mul_fe12_i_if[2].val, mul_fe12_i_if[2].sop, mul_fe12_i_if[2].eop, mul_fe12_i_if[2].err, mul_fe12_i_if[2].mod, mul_fe12_i_if[2].ctl);
+end
+
 bls12_381_pairing #(
   .FE_TYPE     ( FE_TYPE   ),
   .FE2_TYPE    ( FE2_TYPE  ),
@@ -93,10 +100,7 @@ bls12_381_pairing #(
 bls12_381_pairing (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
-  .i_val ( i_val ),
-  .o_rdy ( o_rdy ),
-  .i_g1_af ( i_g1_af ),
-  .i_g2_af ( i_g2_af ),
+  .i_pair_af_if ( i_pair_af_if ),
   .i_mode  ( i_mode  ),
   .i_key   ( i_key   ),
   .o_fe12_if      ( o_fe12_if        ),
@@ -262,8 +266,8 @@ ec_fe12_mul_s (
   .i_sub_fe_if    ( sub_fe_i_if[3]   ),
   .o_mnr_fe6_if   ( mnr_fe6_o_if[0]  ),
   .i_mnr_fe6_if   ( mnr_fe6_i_if[0]  ),
-  .o_mul_fe12_if  ( mul_fe12_i_if[2] ),
-  .i_mul_fe12_if  ( mul_fe12_o_if[2] )
+  .o_mul_fe12_if  ( mul_fe12_i_if[3] ),
+  .i_mul_fe12_if  ( mul_fe12_o_if[3] )
 );
 
 adder_pipe # (
@@ -378,7 +382,7 @@ resource_share_fe6_mul (
 );
 
 resource_share # (
-  .NUM_IN       ( 2                ),
+  .NUM_IN       ( 3                ),
   .DAT_BITS     ( 2*$bits(FE_TYPE) ),
   .CTL_BITS     ( CTL_BITS         ),
   .OVR_WRT_BIT  ( OVR_WRT_BIT + 42 ), // 2 bits
@@ -388,10 +392,10 @@ resource_share # (
 resource_share_fe12_mul (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
-  .i_axi ( mul_fe12_o_if[1:0] ),
-  .o_res ( mul_fe12_o_if[2]   ),
-  .i_res ( mul_fe12_i_if[2]   ),
-  .o_axi ( mul_fe12_i_if[1:0] )
+  .i_axi ( mul_fe12_o_if[2:0] ),
+  .o_res ( mul_fe12_o_if[3]   ),
+  .i_res ( mul_fe12_i_if[3]   ),
+  .o_axi ( mul_fe12_i_if[2:0] )
 );
 
 resource_share # (
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
index f9bbbce..34fc144 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_pkg.sv
@@ -165,11 +165,10 @@ package bls12_381_pkg;
     MUL_ELEMENT     = 8'h12,
     INV_ELEMENT     = 8'h13,
 
-    POINT_MULT      = 8'h24,
-    FP_FPOINT_MULT  = 8'h25,
-    FP2_FPOINT_MULT = 8'h26,
-
-    ATE_PAIRING     = 8'h28
+    POINT_MULT      = 8'h20,
+    MILLER_LOOP     = 8'h21,
+    FINAL_EXP       = 8'h22,
+    ATE_PAIRING     = 8'h23
   } code_t;
 
   // Instruction format
@@ -663,7 +662,6 @@ package bls12_381_pkg;
   endtask
 
   task automatic ate_pairing(input af_point_t P, input fp2_af_point_t Q, ref fe12_t f);
-    fp2_jb_point_t R; // This is only used for point multiplication
     miller_loop(P, Q, f);
     final_exponent(f);
   endtask;
diff --git a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
index 777f32e..ee04004 100644
--- a/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
+++ b/zcash_fpga/src/rtl/bls12_381/bls12_381_top.sv
@@ -66,7 +66,7 @@ if_axi_stream #(.DAT_BYTS(3)) idx_in_if(i_clk);
 if_axi_stream #(.DAT_BYTS(3)) idx_out_if(i_clk);
 
 // Point multiplication
-logic pair_mode;
+logic [1:0] pair_mode;
 fe_t  pair_key;
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mult_pt_if (i_clk);
 
@@ -89,12 +89,13 @@ if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe_i_if
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe2_o_if     (i_clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   inv_fe2_i_if     (i_clk);
 
-logic pair_i_val, pair_o_rdy;
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE))) pair_o_res_if (i_clk); ;
-bls12_381_pkg::af_point_t pair_i_g1;
-bls12_381_pkg::fp2_af_point_t pair_i_g2;
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if     (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if     (i_clk);
 
 
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE))) pair_i_af_if  (i_clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE))) pair_o_res_if (i_clk);
+
 logic [31:0] new_inst_pt;
 logic        new_inst_pt_val, new_inst_pt_val_l;
 logic        reset_done_inst, reset_done_data;
@@ -146,14 +147,15 @@ always_ff @ (posedge i_clk) begin
     add_out_if.rdy <= 0;
     sub_out_if.rdy <= 0;
 
-    pair_i_val <= 0;
-    pair_i_g1 <= 0;
-    pair_i_g2 <= 0;
+    pair_i_af_if.reset_source();
 
     pair_mode <= 0;
     pair_key <= 0;
     mult_pt_if.rdy <= 0;
 
+    mul_fe12_o_if.reset_source();
+    mul_fe12_i_if.rdy <= 0;
+
   end else begin
 
     mul_in_if[1].sop <= 1;
@@ -179,9 +181,11 @@ always_ff @ (posedge i_clk) begin
     if (add_in_if.rdy) add_in_if.val <= 0;
     if (sub_in_if.rdy) sub_in_if.val <= 0;
     if (mul_in_if[1].rdy) mul_in_if[1].val <= 0;
-    if (pair_o_rdy) pair_i_val <= 0;
+    if (pair_i_af_if.rdy) pair_i_af_if.val <= 0;
+    if (mul_fe12_o_if.rdy) mul_fe12_o_if.val <= 0;
 
     mult_pt_if.rdy <= 1;
+    mul_fe12_i_if.rdy <= 1;
 
     if (idx_in_if.val && idx_in_if.rdy) idx_in_if.val <= 0;
     if (interrupt_in_if.val && interrupt_in_if.rdy) interrupt_in_if.val <= 0;
@@ -234,18 +238,19 @@ always_ff @ (posedge i_clk) begin
         if (cnt == 0) last_inst_cnt <= 0;
         task_point_mult();
       end
-      // We don't use precaculation for fixed point but could be used as optimizations
-      FP_FPOINT_MULT: begin
+      ATE_PAIRING: begin
         if (cnt == 0) last_inst_cnt <= 0;
-        task_fp_fpoint_mult();
+        pair_mode <= 0;
+        task_pairing();
       end
-      FP2_FPOINT_MULT: begin
+      MILLER_LOOP: begin
         if (cnt == 0) last_inst_cnt <= 0;
-        task_fp2_fpoint_mult();
+        pair_mode <= 2;
+        task_pairing();
       end
-      ATE_PAIRING: begin
+      FINAL_EXP: begin
         if (cnt == 0) last_inst_cnt <= 0;
-        task_pairing();
+        task_final_exp();
       end
       default: get_next_inst();
     endcase
@@ -302,20 +307,19 @@ bls12_381_pairing_wrapper #(
 bls12_381_pairing_wrapper (
   .i_clk ( i_clk ),
   .i_rst ( i_rst ),
-  .i_val ( pair_i_val ),
-  .o_rdy ( pair_o_rdy ),
-  .i_g1_af ( pair_i_g1 ),
-  .i_g2_af ( pair_i_g2 ),
+  .i_pair_af_if ( pair_i_af_if ),
   .i_mode  ( pair_mode ),
   .i_key   ( pair_key  ),
-  .o_fe12_if    ( pair_o_res_if ),
-  .o_p_jb_if    ( mult_pt_if    ),
-  .o_mul_fe_if  ( mul_in_if[0]  ),
-  .i_mul_fe_if  ( mul_out_if[0] ),
-  .o_inv_fe2_if ( inv_fe2_i_if  ),
-  .i_inv_fe2_if ( inv_fe2_o_if  ),
-  .o_inv_fe_if  ( inv_fe_i_if   ),
-  .i_inv_fe_if  ( inv_fe_o_if   )
+  .o_fe12_if     ( pair_o_res_if ),
+  .o_p_jb_if     ( mult_pt_if    ),
+  .o_mul_fe_if   ( mul_in_if[0]  ),
+  .i_mul_fe_if   ( mul_out_if[0] ),
+  .i_mul_fe12_if ( mul_fe12_o_if ),
+  .o_mul_fe12_if ( mul_fe12_i_if ),
+  .o_inv_fe2_if  ( inv_fe2_i_if  ),
+  .i_inv_fe2_if  ( inv_fe2_o_if  ),
+  .o_inv_fe_if   ( inv_fe_i_if   ),
+  .i_inv_fe_if   ( inv_fe_o_if   )
 );
 
 resource_share # (
@@ -532,6 +536,10 @@ task task_mul_element();
         data_ram_sys_if.a <=  curr_inst.b;
         data_ram_read[0] <= 1;
         cnt <= 2;
+        if (curr_data.pt == FE12) begin
+          cnt <= 8;
+          data_ram_sys_if.a <=  curr_inst.a;
+        end
       end
     end
     2: begin
@@ -552,7 +560,7 @@ task task_mul_element();
         new_data.dat <= mul_out_if[1].dat;
         new_data.pt <= pt_l;
         data_ram_sys_if.we <= 1;
-        cnt <= 8;
+        cnt <= 33; // Result written - go to state 33 which calls get_next_inst() (the case has no state 34, jumping there would stall)
       end
     end
     3: begin
@@ -617,10 +625,51 @@ task task_mul_element();
         new_data.pt <= pt_l;
         data_ram_sys_if.we <= 1;
         data_ram_sys_if.a <=  curr_inst.c + 1;
-        cnt <= 8;
+        cnt <= 33; // Second result word written - go to state 33 which calls get_next_inst() (the case has no state 34, jumping there would stall)
       end
     end
-    8: begin
+    // FE12 multiplication - states 8..31 read the 12 words of each operand from data RAM and stream them to the FE12 multiplier
+    8,9,10,11,12,13,14,15,16,17,18,19,
+    20,21,22,23,24,25,26,27,28,29,30,31: begin
+      mul_fe12_i_if.rdy <= 0; // Overrides the default rdy <= 1: do not accept multiplier results while operands are still being loaded
+      // Issue the next RAM read only when no read is in stages 1..READ_CYCLE and the multiplier input is not stalled
+
+      if (|data_ram_read[READ_CYCLE:1]== 0 && (~mul_fe12_o_if.val || (mul_fe12_o_if.val && mul_fe12_o_if.rdy))) begin
+        if (data_ram_read[0]) begin // A read has just been requested (presumably the operand a word - data_ram_read handling is outside this block), so follow it with operand b
+          data_ram_read[0] <= 1;
+          data_ram_sys_if.a <= curr_inst.b + ((cnt-8)/2); // (cnt-8)/2 is the word index 0..11
+        end else begin // Nothing pending: request the operand a word for this index
+          data_ram_read[0] <= 1;
+          data_ram_sys_if.a <= curr_inst.a + ((cnt-8)/2);
+        end
+      end
+
+      if (data_ram_read[READ_CYCLE]) begin // Read data is available in curr_data this cycle
+       cnt <= cnt + 1; // cnt advances once per word received, two words per index
+        if (cnt % 2 == 1) begin // Odd cnt: second word of the pair fills the upper half and the pair is presented to the multiplier
+          mul_fe12_o_if.sop <= cnt == 9;  // First pair
+          mul_fe12_o_if.eop <= cnt == 31; // Twelfth (last) pair, next state is 32
+          mul_fe12_o_if.val <= 1;
+          mul_fe12_o_if.dat[$bits(fe_t) +: $bits(fe_t)] <= curr_data.dat;
+        end else begin // Even cnt: first word of the pair fills the lower half, val is not raised yet
+          mul_fe12_o_if.dat[0 +: $bits(fe_t)] <= curr_data.dat;
+        end
+      end
+    end
+    32: begin
+      mul_fe12_i_if.rdy <= 1;
+      if (mul_fe12_i_if.val && mul_fe12_i_if.rdy) begin
+        if (mul_fe12_i_if.sop)
+          data_ram_sys_if.a <= curr_inst.c;
+        else
+          data_ram_sys_if.a <= data_ram_sys_if.a + 1;
+        data_ram_sys_if.we <= 1;
+        new_data.dat <= mul_fe12_i_if.dat;
+        new_data.pt <= pt_l;
+        if (mul_fe12_i_if.eop) cnt <= cnt + 1;
+      end
+    end
+    33: begin
       get_next_inst();
     end
   endcase
@@ -803,20 +852,19 @@ task task_point_mult();
       end
     end
     1,2,3,4: begin
-      if (data_ram_read[READ_CYCLE]) begin
+      if (|data_ram_read == 0 && (~pair_i_af_if.val || (pair_i_af_if.val && pair_i_af_if.rdy))) begin
         data_ram_read[0] <= 1;
         data_ram_sys_if.a <= data_ram_sys_if.a + 1;
         if (curr_data.pt == FP_AF && cnt % 2 == 0) data_ram_sys_if.a <= data_ram_sys_if.a;
-        case(cnt)
-          1: pair_i_g2.x[0] <= curr_data.dat;
-          2: pair_i_g2.x[1] <= curr_data.pt == FP_AF ? 0 : curr_data.dat;
-          3: pair_i_g2.y[0] <= curr_data.dat;
-          4: pair_i_g2.y[1] <= curr_data.pt == FP_AF ? 0 : curr_data.dat;
-        endcase
+      end
+      if (data_ram_read[READ_CYCLE]) begin
+        pair_i_af_if.val <= 1;
+        pair_i_af_if.sop <= cnt == 1;
+        pair_i_af_if.eop <= cnt == 4;
+        pair_i_af_if.dat <= (curr_data.pt == FP_AF && cnt % 2 == 0) ? 0 : curr_data.dat;
         cnt <= cnt + 1;
         if (cnt == 1) pt_l <= curr_data.pt;
         if (cnt == 4) begin
-          pair_i_val <= 1;
           data_ram_sys_if.a <= curr_inst.c;
         end
       end
@@ -843,136 +891,105 @@ task task_point_mult();
   endcase
 endtask
 
-task task_fp_fpoint_mult();
-  pair_mode <= 1;
+task task_pairing();
   case(cnt) inside
     0: begin
       data_ram_sys_if.a <= curr_inst.a;
       data_ram_read[0] <= 1;
       cnt <= cnt + 1;
     end
-    1: begin
+    // Load G1 affine point
+    1,2: begin
       if (data_ram_read[READ_CYCLE]) begin
-        data_ram_sys_if.a <= curr_inst.b;
-        pair_key <= curr_data.dat;
-        pair_i_g2 <= bls12_381_pkg::g_af_point_fp2;
-        pair_i_val <= 1;
+        data_ram_sys_if.a <= data_ram_sys_if.a + 1;
+        data_ram_read[0] <= 1;
+        pair_i_af_if.dat <= curr_data.dat;
+        pair_i_af_if.val <= 1;
+        pair_i_af_if.sop <= cnt == 1;
+        pair_i_af_if.eop <= cnt == 2;
         cnt <= cnt + 1;
+        if (cnt == 2) begin
+          data_ram_sys_if.a <= curr_inst.b;
+        end
       end
     end
-    // Wait for result
-    2,3,4,5,6,7: begin
-      mult_pt_if.rdy <= 1;
-      if (mult_pt_if.val) begin
-         new_data.pt <= FP_JB;
-         new_data.dat <= mult_pt_if.dat;
-         data_ram_sys_if.we <= 1;
-         if (cnt > 2) data_ram_sys_if.a <= data_ram_sys_if.a + 1;
-         if (cnt % 2 == 1) begin // Odd elements will be 0
-           data_ram_sys_if.a <= data_ram_sys_if.a;
-           data_ram_sys_if.we <= 0;
-         end
-         cnt <= cnt + 1;
-      end
-    end
-    8: begin
-      pair_mode <= 0;
-      get_next_inst();
-    end
-  endcase
-endtask
-
-task task_fp2_fpoint_mult();
-  pair_mode <= 1;
-  case(cnt) inside
-    0: begin
-      data_ram_sys_if.a <= curr_inst.a;
-      data_ram_read[0] <= 1;
-      cnt <= cnt + 1;
-    end
-    1: begin
+    // Load G2 affine point
+    3,4,5,6: begin
       if (data_ram_read[READ_CYCLE]) begin
-        data_ram_sys_if.a <= curr_inst.b;
-        pair_key <= curr_data.dat;
-        pair_i_g2 <= bls12_381_pkg::g2_af_point_fp2;
-        pair_i_val <= 1;
+        data_ram_sys_if.a <= data_ram_sys_if.a + 1;
+        data_ram_read[0] <= 1;
+        pair_i_af_if.dat <= curr_data.dat;
+        pair_i_af_if.val <= 1;
+        pair_i_af_if.sop <= cnt == 3;
+        pair_i_af_if.eop <= cnt == 6;
         cnt <= cnt + 1;
+        if (cnt == 6) begin
+          data_ram_sys_if.a <= curr_inst.c;
+          pair_o_res_if.rdy <= 1;
+          mult_pt_if.rdy <= 1;
+        end
       end
     end
     // Wait for result
-    2,3,4,5,6,7: begin
-      mult_pt_if.rdy <= 1;
-      if (mult_pt_if.val) begin
-         new_data.pt <= FP2_JB;
-         new_data.dat <= mult_pt_if.dat;
+    7: begin // Write the streamed result to data RAM: pair_o_res_if for the full pairing, mult_pt_if for miller loop only
+      if (pair_o_res_if.val || mult_pt_if.val) begin
+         new_data.pt <= FE12;
+         new_data.dat <= pair_o_res_if.val ? pair_o_res_if.dat : mult_pt_if.dat;
          data_ram_sys_if.we <= 1;
-         if (cnt > 2) data_ram_sys_if.a <= data_ram_sys_if.a + 1;
-         cnt <= cnt + 1;
+         if ((pair_o_res_if.val && ~pair_o_res_if.sop) ||
+              (mult_pt_if.val && ~mult_pt_if.sop)) // First word goes to curr_inst.c, every later word to the next address
+           data_ram_sys_if.a <= data_ram_sys_if.a + 1;
+         if ((pair_o_res_if.val && pair_o_res_if.eop) || (mult_pt_if.val && mult_pt_if.eop)) begin // Only honour eop from the interface that is actually valid
+           mult_pt_if.rdy <= 0;
+           pair_o_res_if.rdy <= 0;
+           cnt <= cnt + 1; // Last word written - state 8 fetches the next instruction
+         end
       end
     end
     8: begin
-      pair_mode <= 0;
       get_next_inst();
     end
   endcase
 endtask
 
-task task_pairing();
-  pair_mode <= 0;
+task task_final_exp();
+  pair_mode <= 3;
   case(cnt) inside
     0: begin
       data_ram_sys_if.a <= curr_inst.a;
       data_ram_read[0] <= 1;
       cnt <= cnt + 1;
     end
-    // Load G1 affine point
-    1,2: begin
+    // Load FE12
+    1,2,3,4,5,6,7,8,9,10,11,12: begin
       if (data_ram_read[READ_CYCLE]) begin
         data_ram_sys_if.a <= data_ram_sys_if.a + 1;
         data_ram_read[0] <= 1;
-        case(cnt)
-          1: pair_i_g1.x <= curr_data.dat;
-          2: pair_i_g1.y <= curr_data.dat;
-        endcase
+        pair_i_af_if.dat <= curr_data.dat;
+        pair_i_af_if.val <= 1;
+        pair_i_af_if.sop <= cnt == 1;
+        pair_i_af_if.eop <= cnt == 12;
         cnt <= cnt + 1;
-        if (cnt == 2) begin
+        if (cnt == 12) begin
           data_ram_sys_if.a <= curr_inst.b;
         end
       end
     end
-    // Load G2 affine point
-    3,4,5,6: begin
-      if (data_ram_read[READ_CYCLE]) begin
-        data_ram_sys_if.a <= data_ram_sys_if.a + 1;
-        data_ram_read[0] <= 1;
-        case(cnt)
-          3: pair_i_g2.x[0] <= curr_data.dat;
-          4: pair_i_g2.x[1] <= curr_data.dat;
-          5: pair_i_g2.y[0] <= curr_data.dat;
-          6: pair_i_g2.y[1] <= curr_data.dat;
-        endcase
-        cnt <= cnt + 1;
-        if (cnt == 6) begin
-          data_ram_sys_if.a <= curr_inst.c;
-          pair_i_val <= 1;
-          pair_o_res_if.rdy <= 1;
-        end
-      end
-    end
     // Wait for result
-    7,8,9,10,11,12,13,14,15,16,17,18: begin
-      if (pair_o_res_if.val) begin
+    13: begin
+      pair_o_res_if.rdy <= 1;
+      if (pair_o_res_if.val && pair_o_res_if.rdy) begin
          new_data.pt <= FE12;
          new_data.dat <= pair_o_res_if.dat;
          data_ram_sys_if.we <= 1;
-         if (cnt > 7) data_ram_sys_if.a <= data_ram_sys_if.a + 1;
-         cnt <= cnt + 1;
-         if (cnt == 18) begin
+         if (~pair_o_res_if.sop) data_ram_sys_if.a <= data_ram_sys_if.a + 1;
+         if (pair_o_res_if.eop) begin
+           cnt <= cnt + 1;
            pair_o_res_if.rdy <= 0;
          end
       end
     end
-    19: begin
+    14: begin
       get_next_inst();
     end
   endcase
diff --git a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
index c6881ba..7ab1ed2 100644
--- a/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
+++ b/zcash_fpga/src/rtl/top/zcash_fpga_pkg.sv
@@ -27,7 +27,7 @@ package zcash_fpga_pkg;
 
   import bls12_381_pkg::point_type_t;
 
-  parameter FPGA_VERSION = 32'h01_02_02;  //v1.2.2
+  parameter FPGA_VERSION = 32'h01_03_00;  //v1.3.0
 
   // What features are enabled in this build
   parameter bit ENB_VERIFY_SECP256K1_SIG = 1;
diff --git a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
index 1b7a3de..d74171e 100644
--- a/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_pairing_tb.sv
@@ -46,13 +46,18 @@ initial begin
   forever #(CLK_PERIOD/2) clk = ~clk;
 end
 
-if_axi_stream #(.DAT_BYTS(($bits(af_point_t) + $bits(fp2_af_point_t)+7)/8), .CTL_BITS(CTL_BITS)) in_if(clk);
+logic [1:0] mode;
 if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) out_if(clk);
 
 if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe_o_if(clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe_i_if(clk);
 
-if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   o_p_jb_if(clk);
+if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) mul_fe12_o_if(clk);
+if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS))   mul_fe12_i_if(clk);
+
+if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS))   o_p_jb_if(clk);
+
+if_axi_stream #(.DAT_BYTS(($bits(FE_TYPE)+7)/8), .CTL_BITS(CTL_BITS)) pair_af_if(clk);
 
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_o_if(clk);
 if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BITS)) inv_fe_i_if(clk);
@@ -78,20 +83,19 @@ bls12_381_pairing_wrapper #(
 bls12_381_pairing_wrapper (
   .i_clk ( clk ),
   .i_rst ( rst ),
-  .i_val ( in_if.val ),
-  .o_rdy ( in_if.rdy ),
-  .i_g1_af ( in_if.dat[0 +: $bits(af_point_t)] ),
-  .i_g2_af ( in_if.dat[$bits(af_point_t) +: $bits(fp2_af_point_t)] ),
-  .i_mode    ( 1'd0   ),
+  .i_pair_af_if ( pair_af_if ),
+  .i_mode    ( mode  ),
   .i_key     ( 381'd0 ),
   .o_p_jb_if ( o_p_jb_if ),
   .o_fe12_if ( out_if ),
   .o_mul_fe_if ( mul_fe_o_if ),
   .i_mul_fe_if ( mul_fe_i_if ),
+  .o_mul_fe12_if ( mul_fe12_i_if ),
+  .i_mul_fe12_if ( mul_fe12_o_if ),  
   .o_inv_fe2_if ( inv_fe2_i_if  ),
   .i_inv_fe2_if ( inv_fe2_o_if  ),
   .o_inv_fe_if  ( inv_fe_i_if   ),
-  .i_inv_fe_if  ( inv_fe_o_if   )  
+  .i_inv_fe_if  ( inv_fe_o_if   )
 );
 
 // This just tests our software model vs a known good result
@@ -126,16 +130,30 @@ endtask
 task test1(input af_point_t G1_p, fp2_af_point_t G2_p);
 begin
   integer signed get_len;
-  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in0, dat_in1, get_dat;
   integer start_time, finish_time;
   FE12_TYPE  f_out, f_exp;
   $display("Running test1 ...");
+  
+  dat_in0 = 0;
+  dat_in0[0*384 +: $bits(FE_TYPE)] = G1_p.x;
+  dat_in0[1*384 +: $bits(FE_TYPE)] = G1_p.y;
+  
+  dat_in1 = 0;
+  dat_in1[0*384 +: $bits(FE_TYPE)] = G2_p.x[0];
+  dat_in1[1*384 +: $bits(FE_TYPE)] = G2_p.x[1];
+  dat_in1[2*384 +: $bits(FE_TYPE)] = G2_p.y[0];
+  dat_in1[3*384 +: $bits(FE_TYPE)] = G2_p.y[1];
+  mode = 0;
 
   ate_pairing(G1_p, G2_p, f_exp);
 
   start_time = $time;
   fork
-    in_if.put_stream({G2_p, G1_p}, (($bits(af_point_t) + $bits(fp2_af_point_t))+7)/8);
+    begin
+      pair_af_if.put_stream(dat_in0, (($bits(af_point_t))+7)/8);
+      pair_af_if.put_stream(dat_in1, (($bits(fp2_af_point_t))+7)/8);
+    end
     out_if.get_stream(get_dat, get_len);
   join
   finish_time = $time;
@@ -145,7 +163,7 @@ begin
       for (int k = 0; k < 2; k++)
         f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)];
 
-  
+
 
   $display("Expected:");
   print_fe12(f_exp);
@@ -165,32 +183,46 @@ endtask;
 task test_linear();
 begin
   integer signed get_len;
-  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in0, dat_in1, get_dat;
   integer start_time, finish_time, n;
   FE12_TYPE  f_out, f_exp0, f_exp1;
   af_point_t G1_a, G1_a_n;
   fp2_af_point_t G1_j, G2_a_n;
   fp2_af_point_t G2_a;
   fp2_jb_point_t G2_j;
-  
+
   $display("Running test_linear ...");
-  
+
   G1_a = {Gy, Gx};
   G2_a = {G2y, G2x};
   G1_j = {381'd1, Gy, Gx};
-  G2_j = {381'd1, G2y, G2x};  
+  G2_j = {381'd1, G2y, G2x};
   n = 2;
   G1_a_n = to_affine(point_mult(n, G1_j));
   G2_a_n = fp2_to_affine(fp2_point_mult(n, G2_j));
-  
+
   ate_pairing(G1_a, G2_a_n, f_exp0);
   ate_pairing(G1_a_n, G2_a, f_exp1);
-  
+
   assert(f_exp0 == f_exp1) else $fatal(1, "Error in test_linear with sw model");
+
+  dat_in0 = 0;
+  dat_in0[0*384 +: $bits(FE_TYPE)] = G1_a_n.x;
+  dat_in0[1*384 +: $bits(FE_TYPE)] = G1_a_n.y;
+  
+  dat_in1 = 0;
+  dat_in1[0*384 +: $bits(FE_TYPE)] = G2_a.x[0];
+  dat_in1[1*384 +: $bits(FE_TYPE)] = G2_a.x[1];
+  dat_in1[2*384 +: $bits(FE_TYPE)] = G2_a.y[0];
+  dat_in1[3*384 +: $bits(FE_TYPE)] = G2_a.y[1];
+  mode = 0;
   
   start_time = $time;
   fork
-    in_if.put_stream({G2_a, G1_a_n}, (($bits(af_point_t) + $bits(fp2_af_point_t))+7)/8);
+    begin
+      pair_af_if.put_stream(dat_in0, (($bits(af_point_t))+7)/8);
+      pair_af_if.put_stream(dat_in1, (($bits(fp2_af_point_t))+7)/8);
+    end
     out_if.get_stream(get_dat, get_len);
   join
   finish_time = $time;
@@ -215,19 +247,80 @@ begin
 end
 endtask;
 
+task test_miller_only(); // Streams the G1/G2 generators in with mode = 2 and compares the o_p_jb_if output with miller_loop()
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] dat_in0, dat_in1, get_dat;
+  integer start_time, finish_time, n; // NOTE(review): n is never referenced in this task
+  FE12_TYPE  f_out, f_exp0;
+  af_point_t G1_a;
+  fp2_af_point_t G2_a;
+
+  $display("Running test_miller_only ...");
+
+  G1_a = {Gy, Gx};
+  G2_a = {G2y, G2x};
+  miller_loop(G1_a, G2_a, f_exp0); // f_exp0 is the expected value checked at the end
+  // Each coordinate is placed on its own 384-bit boundary: G1 in dat_in0, G2 in dat_in1
+  dat_in0 = 0;
+  dat_in0[0*384 +: $bits(FE_TYPE)] = G1_a.x;
+  dat_in0[1*384 +: $bits(FE_TYPE)] = G1_a.y;
+
+  dat_in1 = 0;
+  dat_in1[0*384 +: $bits(FE_TYPE)] = G2_a.x[0];
+  dat_in1[1*384 +: $bits(FE_TYPE)] = G2_a.x[1];
+  dat_in1[2*384 +: $bits(FE_TYPE)] = G2_a.y[0];
+  dat_in1[3*384 +: $bits(FE_TYPE)] = G2_a.y[1];
+  mode = 2; // presumably selects miller-loop-only operation in the wrapper - TODO confirm against its i_mode decoding
+
+  start_time = $time;
+  fork
+    begin
+      pair_af_if.put_stream(dat_in0, (($bits(af_point_t))+7)/8);     // G1 point is sent first
+      pair_af_if.put_stream(dat_in1, (($bits(fp2_af_point_t))+7)/8); // then the G2 point
+    end
+    o_p_jb_if.get_stream(get_dat, get_len); // in this test the result is collected from o_p_jb_if, not out_if
+  join
+  finish_time = $time;
+  // Unpack the 2x3x2 coefficients, one per 384-bit slot of the received stream
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 2; k++)
+        f_out[i][j][k] = get_dat[(i*6+j*2+k)*384 +: $bits(FE_TYPE)];
+
+  $display("Expected:");
+  print_fe12(f_exp0);
+  $display("Was:");
+  print_fe12(f_out);
+
+  $display("test_miller_only finished in %d clocks", (finish_time-start_time)/(CLK_PERIOD));
+
+  if (f_exp0 != f_out) begin
+    $fatal(1, "%m %t ERROR: output was wrong", $time);
+  end
+
+  $display("test_miller_only PASSED");
+end
+endtask;
+
 initial begin
-  in_if.reset_source();
+  mul_fe12_o_if.reset_source(); // idle the testbench-driven side of the fe12 multiplier port
+  mul_fe12_i_if.rdy = 0;
+  o_p_jb_if.rdy = 0;
+  pair_af_if.reset_source();    // point input stream starts idle
   inv_fe2_o_if.reset_source();
   inv_fe_o_if.reset_source();
   inv_fe2_i_if.rdy = 0;
   inv_fe_i_if.rdy = 0;
   out_if.rdy = 0;
+  mode = 0; // each test task sets mode again before streaming its points
   #100ns;
 
-  test0(); // Test SW model
+  test0();       // Test SW model
   test1(G1, G2); // Pairing of generators
   test_linear(); // test linear properties e(n*G1,G2) == e(G1, n*G2), ...
-  
+  test_miller_only(); // mode = 2 run, output compared with miller_loop()
+
   #1us $finish();
 end
 
 
diff --git a/zcash_fpga/src/tb/bls12_381_top_tb.sv b/zcash_fpga/src/tb/bls12_381_top_tb.sv
index 28b28c6..8076f88 100644
--- a/zcash_fpga/src/tb/bls12_381_top_tb.sv
+++ b/zcash_fpga/src/tb/bls12_381_top_tb.sv
@@ -51,175 +51,6 @@ bls12_381_top bls12_381_top (
   .axi_lite_if ( axi_lite_if )
 );
 
-
-task test_fp_fpoint_mult();
-begin
-  integer signed get_len;
-  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
-  inst_t inst;
-  logic failed;
-  data_t data;
-  logic [31:0] rdata;
-  jb_point_t out_p, exp_p;
-  logic [DAT_BITS-1:0] in_k;
-  bls12_381_interrupt_rpl_t interrupt_rpl;
-  
-  failed = 0;
-  in_k = 1 << 379;
-  exp_p =  point_mult(in_k, g_point);
-
-  $display("Running test_fp_fpoint_mult...");
-
-  axi_lite_if.peek(.addr(0), .data(rdata));
-  assert(rdata == INST_AXIL_START) else $fatal("ERROR: AXI lite register returned wrong value");
-
-  axi_lite_if.peek(.addr(4), .data(rdata));
-  assert(rdata == DATA_AXIL_START) else $fatal("ERROR: AXI lite register returned wrong value");
-
-  axi_lite_if.peek(.addr(8), .data(rdata));
-  assert(rdata == DATA_RAM_DEPTH) else $fatal("ERROR: AXI lite register returned wrong value");
-
-  axi_lite_if.peek(.addr(12), .data(rdata));
-  assert(rdata == INST_RAM_DEPTH) else $fatal("ERROR: AXI lite register returned wrong value");
-
-  data = '{dat:in_k, pt:SCALAR};
-  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START), .len(48));
-
-  inst = '{code:SEND_INTERRUPT, a:16'd1, b:16'hbeef, c:16'd0};
-  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + 8), .len(8));
-
-
-  // Write slot 0 to start
-  inst = '{code:FP_FPOINT_MULT, a:16'd0, b:16'd1, c:16'd0};
-  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START), .len(8));
-
-  fork
-    begin
-      out_if.get_stream(get_dat, get_len, 50);
-      interrupt_rpl = get_dat;
-
-      assert(interrupt_rpl.hdr.cmd == BLS12_381_INTERRUPT_RPL) else $fatal(1, "ERROR: Received non-interrupt message");
-      assert(interrupt_rpl.index == 16'hbeef) else $fatal(1, "ERROR: Received wrong index value in message");
-      assert(interrupt_rpl.data_type == FP_JB) else $fatal(1, "ERROR: Received wrong data type value in message");
-
-      get_dat = get_dat >> $bits(bls12_381_interrupt_rpl_t);
-
-      for (int i = 0; i < 3; i++)
-        out_p[i*381 +: 381] = get_dat[i*(48*8) +: 381];
-
-      if (to_affine(out_p) == to_affine(exp_p)) begin
-        $display("INFO: Output point matched expected:");
-        print_jb_point(out_p);
-      end else begin
-        $display("ERROR: Output point did NOT match expected:");
-        print_jb_point(out_p);
-        $display("Expected:");
-        print_jb_point(exp_p);
-        failed = 1;
-      end
-    end
-    begin
-      repeat(1000000) @(posedge out_if.i_clk);
-      $fatal("ERROR: Timeout while waiting for result");
-    end
-  join_any
-  disable fork;
-
-  axi_lite_if.peek(.addr(32'h14), .data(rdata));
-  $display("INFO: Last cycle count was %d", rdata);
-
-  if(failed)
-   $fatal(1, "ERROR: test_fp_fpoint_mult FAILED");
-  else
-   $display("INFO: test_fp_fpoint_mult PASSED");
-end
-endtask;
-
-task test_fp2_fpoint_mult();
-begin
-  integer signed get_len;
-  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
-  inst_t inst;
-  logic failed;
-  data_t data;
-  logic [31:0] rdata;
-  fp2_jb_point_t out_p, exp_p;
-  logic [DAT_BITS-1:0] in_k;
-  bls12_381_interrupt_rpl_t interrupt_rpl;
-
-  failed = 0;
-  in_k = 381'h33333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333;
-  exp_p =  fp2_point_mult(in_k, g2_point);
-  $display("Running test_fp2_fpoint_mult...");
-
-  // See what current instruction pointer is
-  axi_lite_if.peek(.addr(32'h10), .data(rdata));
-
-  data = '{dat:in_k, pt:SCALAR};
-  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + 64), .len(48));  // Scalar to multiply by goes in data slot 1
-
-  inst = '{code:SEND_INTERRUPT, a:16'd3, b:16'habcd, c:16'd0};
-  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+1)*8), .len(8));
-
-  // Write to current slot to start
-  inst = '{code:FP2_FPOINT_MULT, a:16'd1, b:16'd3, c:16'd0};
-  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata)*8), .len(8));
-
-  fork
-    begin
-      out_if.get_stream(get_dat, get_len, 0);
-      interrupt_rpl = get_dat;
-
-      assert(interrupt_rpl.hdr.cmd == BLS12_381_INTERRUPT_RPL) else $fatal(1, "ERROR: Received non-interrupt message");
-      assert(interrupt_rpl.index == 16'habcd) else $fatal(1, "ERROR: Received wrong index value in message");
-      assert(interrupt_rpl.data_type == FP2_JB) else $fatal(1, "ERROR: Received wrong data type value in message");
-
-      get_dat = get_dat >> $bits(bls12_381_interrupt_rpl_t);
-
-      for (int i = 0; i < 6; i++)
-        out_p[i*381 +: 381] = get_dat[i*(48*8) +: 381];
-
-      if (fp2_to_affine(out_p) == fp2_to_affine(exp_p)) begin
-        $display("INFO: Output point matched expected:");
-        print_fp2_jb_point(out_p);
-      end else begin
-        $display("ERROR: Output point did NOT match expected:");
-        print_fp2_jb_point(out_p);
-        $display("Expected:");
-        print_fp2_jb_point(exp_p);
-        failed = 1;
-      end
-    end
-    begin
-      repeat(1000000) @(posedge out_if.i_clk);
-      $fatal("ERROR: Timeout while waiting for result");
-    end
-  join_any
-  disable fork;
-
-  axi_lite_if.peek(.addr(32'h14), .data(rdata));
-  $display("INFO: Last cycle count was %d", rdata);
-
-  // See what current instruction pointer is
-  axi_lite_if.peek(.addr(32'h10), .data(rdata));
-
-  $display("INFO: Current instruction pointer is 0x%x, setting to 0 and writing NULL instruction", rdata);
-
-  inst = '{code:NOOP_WAIT, a:16'd0, b:16'h0, c:16'd0};
-  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START), .len(8));
-
-  axi_lite_if.poke(.addr(32'h10), .data(32'd0));
-  repeat(10) @(posedge clk);
-  axi_lite_if.peek(.addr(32'h10), .data(rdata));
-  assert(rdata == 32'd0) else $fatal(1, "ERROR: could not set instruction pointer");
-
-  if(failed)
-   $fatal(1, "ERROR: test_fp2_fpoint_mult FAILED");
-  else
-   $display("INFO: test_fp2_fpoint_mult PASSED");
-end
-endtask;
-
 task test_inv_element();
   integer signed get_len;
   logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
@@ -232,7 +63,7 @@ task test_inv_element();
   bls12_381_interrupt_rpl_t interrupt_rpl;
 
   failed = 0;
-  in = random_vector(384/8) % P;
+  in = (1 + random_vector(384/8)) % P;
   exp =  fe_inv(in);
   $display("Running test_inv_element...");
   $display("First trying FE element ...");
@@ -766,9 +597,9 @@ task test_control_logic();
   failed = 0;
   in = 0;
   in[7:0] = $random();
-  
+
   $display("Running test_control_logic...");
-  
+
   //Reset the RAM
   axi_lite_if.poke(.addr(32'h0), .data(2'b11));
 
@@ -780,7 +611,7 @@ task test_control_logic();
   axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + 1*64), .len(48));
   data = '{dat:381'd1, pt:SCALAR};
   axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + 2*64), .len(48));
-  
+
   inst = '{code:SEND_INTERRUPT, a:16'd3, b:16'h8888, c:16'd0};
   axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + 7*8), .len(8));
 
@@ -797,7 +628,7 @@ task test_control_logic();
   axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + 4*8), .len(8));
 
   axi_lite_if.poke(.addr(32'h10), .data(1));
-  
+
   fork
     begin
       out_if.get_stream(get_dat, get_len, 0);
@@ -839,19 +670,145 @@ task test_control_logic();
 
 endtask;
 
+task test_multi_pairing(); // Two MILLER_LOOPs, one MUL_ELEMENT and one FINAL_EXP, compared with the testbench reference functions
+begin
+  integer signed get_len;
+  logic [common_pkg::MAX_SIM_BYTS*8-1:0] get_dat;
+  inst_t inst;
+  logic failed;
+  data_t data;
+  logic [31:0] rdata;
+  // rdata receives AXI-lite read-backs: instruction pointer (0x10) and last cycle count (0x14)
+  bls12_381_interrupt_rpl_t interrupt_rpl;
+  fe12_t  f_out, f_exp0, f_exp1;
+  af_point_t G1_p;
+  fp2_af_point_t G2_p;
+  // failed is raised on a result mismatch and turned into $fatal at the end of the task
+  failed = 0;
+
+  G1_p = {Gy, Gx};
+  G2_p = {bls12_381_pkg::G2y, bls12_381_pkg::G2x};
+
+  miller_loop(G1_p, G2_p, f_exp0);
+  miller_loop(G1_p, G2_p, f_exp1);
+  f_exp0 = fe12_mul(f_exp0, f_exp1);
+  final_exponent(f_exp0);
+  // f_exp0 is the expected value the hardware output is compared against below
+  $display("Running test_multi_pairing...");
+
+  // See what current instruction pointer is
+  axi_lite_if.peek(.addr(32'h10), .data(rdata));
+
+  // First load generator points into memory
+  // G1 = ((1 << DATA_RAM_DEPTH) -1 -6)
+  // G2 = ((1 << DATA_RAM_DEPTH) -1 -4)
+
+  // G1 (x then y, one 64-byte-aligned slot each)
+  data = '{dat:G1_p.x, pt:FP_AF};
+  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -6)*64), .len(48));
+  data = '{dat:G1_p.y, pt:FP_AF};
+  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -5)*64), .len(48));
+
+  // G2 (x[0], x[1], y[0], y[1], one slot each)
+  data = '{dat:G2_p.x[0], pt:FP2_AF};
+  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -4)*64), .len(48));
+  data = '{dat:G2_p.x[1], pt:FP2_AF};
+  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -3)*64), .len(48));
+
+  data = '{dat:G2_p.y[0], pt:FP2_AF};
+  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -2)*64), .len(48));
+  data = '{dat:G2_p.y[1], pt:FP2_AF};
+  axi_lite_if.put_data_multiple(.data(data), .addr(DATA_AXIL_START + ((1 << DATA_RAM_DEPTH) -1 -1)*64), .len(48));
+
+  // Program instruction memory, starting one slot after the current instruction pointer
+
+  // Do two miller loops (c is 0 and 12 - presumably the result slots, since the multiply below reads a=0, b=12)
+  inst = '{code:MILLER_LOOP, a:((1 << DATA_RAM_DEPTH) -1 -6), b:((1 << DATA_RAM_DEPTH) -1 -4), c:16'd0};
+  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+1)*8), .len(8));
+  inst = '{code:MILLER_LOOP, a:((1 << DATA_RAM_DEPTH) -1 -6), b:((1 << DATA_RAM_DEPTH) -1 -4), c:16'd12};
+  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+2)*8), .len(8));
+
+  // Multiply result
+  inst = '{code:MUL_ELEMENT , a:16'd0, b:16'd12, c:16'd0};
+  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+3)*8), .len(8));
+
+  // Do final exp.
+  inst = '{code:FINAL_EXP , a:16'd0, b:16'd0, c:16'd0};
+  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+4)*8), .len(8));
+
+  inst = '{code:SEND_INTERRUPT, a:16'd0, b:16'h4321, c:16'd0};
+  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START + (rdata+5)*8), .len(8));
+  // Move the instruction pointer to the first instruction written above
+  axi_lite_if.poke(.addr(32'h10), .data(rdata+1));
+
+  fork
+    begin
+      out_if.get_stream(get_dat, get_len, 0);
+      interrupt_rpl = get_dat;
+
+      assert(interrupt_rpl.hdr.cmd == BLS12_381_INTERRUPT_RPL) else $fatal(1, "ERROR: Received non-interrupt message");
+      assert(interrupt_rpl.index == 16'h4321) else $fatal(1, "ERROR: Received wrong index value in message");
+      assert(interrupt_rpl.data_type == FE12) else $fatal(1, "ERROR: Received wrong data type value in message");
+
+      get_dat = get_dat >> $bits(bls12_381_interrupt_rpl_t); // drop the reply header, leaving the payload
+
+      for (int i = 0; i < 2; i++)
+        for (int j = 0; j < 3; j++)
+          for (int k = 0; k < 2; k++)
+            f_out[i][j][k] = get_dat[(i*6+j*2+k)*(48*8) +: 381]; // one 381-bit coefficient per 48-byte slot
+
+      if (f_out == f_exp0) begin
+        $display("INFO: Output matched expected:");
+        print_fe12(f_out);
+      end else begin
+        $display("ERROR: Output did NOT match expected:");
+        print_fe12(f_out);
+        $display("Expected:");
+        print_fe12(f_exp0);
+        failed = 1;
+      end
+    end
+    begin
+      repeat(1000000) @(posedge out_if.i_clk); // watchdog branch of the join_any
+      $fatal(1, "ERROR: Timeout while waiting for result"); // finish_number first, as in the other $fatal calls
+    end
+  join_any
+  disable fork;
+
+  axi_lite_if.peek(.addr(32'h14), .data(rdata));
+  $display("INFO: Last cycle count was %d", rdata);
+
+  // See what current instruction pointer is
+  axi_lite_if.peek(.addr(32'h10), .data(rdata));
+
+  $display("INFO: Current instruction pointer is 0x%x, setting to 0 and writing NULL instruction", rdata);
+
+  inst = '{code:NOOP_WAIT, a:16'd0, b:16'h0, c:16'd0};
+  axi_lite_if.put_data_multiple(.data(inst), .addr(INST_AXIL_START), .len(8));
+
+  axi_lite_if.poke(.addr(32'h10), .data(32'd0));
+  repeat(10) @(posedge clk);
+  axi_lite_if.peek(.addr(32'h10), .data(rdata));
+  assert(rdata == 32'd0) else $fatal(1, "ERROR: could not set instruction pointer");
+
+  if(failed)
+   $fatal(1, "ERROR: test_multi_pairing FAILED");
+  else
+   $display("INFO: test_multi_pairing PASSED");
+end
+endtask;
+
 initial begin
   axi_lite_if.reset_source();
   out_if.rdy = 0;
   #100ns;
 
-  test_fp_fpoint_mult();
-  test_fp2_fpoint_mult();
   test_inv_element();
   test_mul_add_sub_element();
   test_point_mult();
   test_pairing();
   test_control_logic();
-
+  test_multi_pairing(); // MILLER_LOOP x2 + MUL_ELEMENT + FINAL_EXP instruction sequence
 
   #1us $finish();
 end