From 2719f4e3d0a14900d35716b2709bddf0d26e3aa7 Mon Sep 17 00:00:00 2001 From: bsdevlin Date: Tue, 16 Jul 2019 17:42:41 +0800 Subject: [PATCH] Fix control and add multiplication to fp6 --- ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv | 24 +- ip_cores/ec/src/rtl/ec_fp6_arithmetic.sv | 447 ++++++++++++++++------- 2 files changed, 339 insertions(+), 132 deletions(-) diff --git a/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv b/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv index 1ae9b57..95254e3 100644 --- a/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv +++ b/ip_cores/ec/src/rtl/ec_fp2_arithmetic.sv @@ -24,7 +24,7 @@ module ec_fe2_arithmetic #( parameter type FE_TYPE, parameter type FE2_TYPE, - parameter CTL_BIT = 8 // From this bit 2 bits are used for control + parameter CTL_BIT = 8 // From this bit 4 bits are used for control )( input i_clk, i_rst, input i_fp_mode, // If this bit is high then we operate in fp mode @@ -48,11 +48,11 @@ module ec_fe2_arithmetic if_axi_stream.sink i_sub_fe2_if ); -if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) add_if_fe_i [1:0] (i_clk); -if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) add_if_fe_o [1:0] (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) add_if_fe_i [1:0] (i_clk); +if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) add_if_fe_o [1:0] (i_clk); -if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(16)) sub_if_fe_i [1:0] (i_clk); -if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(16)) sub_if_fe_o [1:0] (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) sub_if_fe_i [1:0] (i_clk); +if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) sub_if_fe_o [1:0] (i_clk); logic fp_mode_add, fp_mode_sub, fp_mode_mul; @@ -71,6 +71,9 @@ always_ff @ (posedge i_clk) begin fp_mode_add <= 0; end else begin fp_mode_add <= i_fp_mode; + + o_add_fe2_if.sop <= 1; + o_add_fe2_if.eop <= 1; if (add_if_fe_o[0].val && add_if_fe_o[0].rdy) add_if_fe_o[0].val <= 0; if (o_add_fe2_if.val && o_add_fe2_if.rdy) o_add_fe2_if.val <= 0; @@ -127,6 +130,9 @@ always_ff @ (posedge i_clk) begin fp_mode_sub <= 0; end else begin + o_sub_fe2_if.sop <= 1; + o_sub_fe2_if.eop <= 1; + fp_mode_sub <= i_fp_mode; if (sub_if_fe_o[0].val && sub_if_fe_o[0].rdy) sub_if_fe_o[0].val <= 0; @@ -295,8 +301,8 @@ end resource_share # ( .NUM_IN ( 2 ), .DAT_BITS ( 2*$bits(FE_TYPE) ), - .CTL_BITS ( 16 ), - .OVR_WRT_BIT ( 12 ), + .CTL_BITS ( CTL_BIT + 4 ), + .OVR_WRT_BIT ( CTL_BIT + 2 ), .PIPELINE_IN ( 0 ), .PIPELINE_OUT ( 0 ) ) @@ -312,8 +318,8 @@ resource_share_sub ( resource_share # ( .NUM_IN ( 2 ), .DAT_BITS ( 2*$bits(FE_TYPE) ), - .CTL_BITS ( 16 ), - .OVR_WRT_BIT ( 12 ), + .CTL_BITS ( CTL_BIT + 4 ), + .OVR_WRT_BIT ( CTL_BIT + 2 ), .PIPELINE_IN ( 0 ), .PIPELINE_OUT ( 0 ) ) diff --git a/ip_cores/ec/src/rtl/ec_fp6_arithmetic.sv b/ip_cores/ec/src/rtl/ec_fp6_arithmetic.sv index c7c2408..348111d 100644 --- a/ip_cores/ec/src/rtl/ec_fp6_arithmetic.sv +++ b/ip_cores/ec/src/rtl/ec_fp6_arithmetic.sv @@ -3,6 +3,7 @@ Fp^6 point logic (adding, subtracting, multiplication), over a Fp2 tower. Fq6 is constructed as Fq2(v) / (v3 - ξ) where ξ = u + 1 + TODO: Input control should be added to allow for sparse multiplication. Copyright (C) 2019 Benjamin Devlin and Zcash Foundation @@ -24,7 +25,7 @@ module ec_fe6_arithmetic #( parameter type FE2_TYPE, parameter type FE6_TYPE, - parameter CTL_BIT = 8 // From this bit 2 bits are used for internal control, 2 bits for resource sharing + parameter CTL_BIT = 8 // From this bit 4 bits are used for internal control, 2 bits for resource sharing - 6 total )( input i_clk, i_rst, // Interface to FE2_TYPE multiplier (mod P) @@ -36,6 +37,9 @@ module ec_fe6_arithmetic // Interface to FE2_TYPE subtractor (mod P) if_axi_stream.source o_sub_fe2_if, if_axi_stream.sink i_sub_fe2_if, + // Interface to FE2_TYPE multiply by non-residue + if_axi_stream.source o_mnr_fe2_if, + if_axi_stream.sink i_mnr_fe2_if, // Interface to FE6_TYPE multiplier (mod P) if_axi_stream.source o_mul_fe6_if, if_axi_stream.sink i_mul_fe6_if, @@ -47,12 +51,11 @@ module ec_fe6_arithmetic if_axi_stream.sink i_sub_fe6_if ); -if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) add_if_fe2_i [1:0] (i_clk); -if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) add_if_fe2_o [1:0] (i_clk); - -if_axi_stream #(.DAT_BITS($bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) sub_if_fe2_i [1:0] (i_clk); -if_axi_stream #(.DAT_BITS(2*$bits(FE_TYPE)), .CTL_BITS(CTL_BIT+4)) sub_if_fe2_o [1:0] (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BIT+6)) add_if_fe2_i [1:0] (i_clk); +if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BIT+6)) add_if_fe2_o [1:0] (i_clk); +if_axi_stream #(.DAT_BITS($bits(FE2_TYPE)), .CTL_BITS(CTL_BIT+6)) sub_if_fe2_i [1:0] (i_clk); +if_axi_stream #(.DAT_BITS(2*$bits(FE2_TYPE)), .CTL_BITS(CTL_BIT+6)) sub_if_fe2_o [1:0] (i_clk); // Point addtions are simple additions on each of the Fp2 elements logic [1:0] add_cnt; @@ -68,6 +71,9 @@ always_ff @ (posedge i_clk) begin add_if_fe2_o[0].reset_source(); end else begin + o_add_fe6_if.sop <= 1; + o_add_fe6_if.eop <= 1; + if (add_if_fe2_o[0].val && add_if_fe2_o[0].rdy) add_if_fe2_o[0].val <= 0; if (o_add_fe6_if.val && o_add_fe6_if.rdy) o_add_fe6_if.val <= 0; @@ -78,7 +84,7 @@ always_ff @ (posedge i_clk) begin add_if_fe2_o[0].copy_if({i_add_fe6_if.dat[0 +: $bits(FE2_TYPE)], i_add_fe6_if.dat[$bits(FE6_TYPE) +: $bits(FE2_TYPE)]}, i_add_fe6_if.val, 1, 1, i_add_fe6_if.err, i_add_fe6_if.mod, i_add_fe6_if.ctl); - add_if_fe2_o[0].ctl[CTL_BIT] <= add_cnt; + add_if_fe2_o[0].ctl[CTL_BIT +: 2] <= add_cnt; if (i_add_fe6_if.val) add_cnt <= 1; end end @@ -87,7 +93,7 @@ always_ff @ (posedge i_clk) begin add_if_fe2_o[0].copy_if({i_add_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)], i_add_fe6_if.dat[$bits(FE6_TYPE)+$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, i_add_fe6_if.val, 1, 1, i_add_fe6_if.err, i_add_fe6_if.mod, i_add_fe6_if.ctl); - add_if_fe2_o[0].ctl[CTL_BIT] <= add_cnt; + add_if_fe2_o[0].ctl[CTL_BIT +: 2] <= add_cnt; if (i_add_fe6_if.val) add_cnt <= 2; end end @@ -96,7 +102,7 @@ always_ff @ (posedge i_clk) begin add_if_fe2_o[0].copy_if({i_add_fe6_if.dat[2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)], i_add_fe6_if.dat[$bits(FE6_TYPE)+2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, i_add_fe6_if.val, 1, 1, i_add_fe6_if.err, i_add_fe6_if.mod, i_add_fe6_if.ctl); - add_if_fe2_o[0].ctl[CTL_BIT] <= add_cnt; + add_if_fe2_o[0].ctl[CTL_BIT +: 2] <= add_cnt; if (i_add_fe6_if.val) add_cnt <= 0; end end @@ -105,10 +111,10 @@ always_ff @ (posedge i_clk) begin // One process to assign outputs if (~o_add_fe6_if.val || (o_add_fe6_if.val && o_add_fe6_if.rdy)) begin o_add_fe6_if.ctl <= add_if_fe2_i[0].ctl; - if (add_if_fe2_i[0].ctl[CTL_BIT] == 0) begin + if (add_if_fe2_i[0].ctl[CTL_BIT +: 2] == 0) begin if (add_if_fe2_i[0].val) o_add_fe6_if.dat[0 +: $bits(FE2_TYPE)] <= add_if_fe2_i[0].dat; - end else if (add_if_fe2_i[0].ctl[CTL_BIT] == 1) begin + end else if (add_if_fe2_i[0].ctl[CTL_BIT +: 2] == 1) begin if (add_if_fe2_i[0].val) o_add_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= add_if_fe2_i[0].dat; end else begin @@ -133,17 +139,19 @@ always_ff @ (posedge i_clk) begin sub_if_fe2_o[0].reset_source(); end else begin + o_sub_fe6_if.sop <= 1; + o_sub_fe6_if.eop <= 1; + if (sub_if_fe2_o[0].val && sub_if_fe2_o[0].rdy) sub_if_fe2_o[0].val <= 0; if (o_sub_fe6_if.val && o_sub_fe6_if.rdy) o_sub_fe6_if.val <= 0; - // One process to parse inputs and send them to the adder case(sub_cnt) 0: begin if (~sub_if_fe2_o[0].val || (sub_if_fe2_o[0].val && sub_if_fe2_o[0].rdy)) begin sub_if_fe2_o[0].copy_if({i_sub_fe6_if.dat[0 +: $bits(FE2_TYPE)], i_sub_fe6_if.dat[$bits(FE6_TYPE) +: $bits(FE2_TYPE)]}, i_sub_fe6_if.val, 1, 1, i_sub_fe6_if.err, i_sub_fe6_if.mod, i_sub_fe6_if.ctl); - sub_if_fe2_o[0].ctl[CTL_BIT] <= sub_cnt; + sub_if_fe2_o[0].ctl[CTL_BIT +: 2] <= sub_cnt; if (i_sub_fe6_if.val) sub_cnt <= 1; end end @@ -152,7 +160,7 @@ always_ff @ (posedge i_clk) begin sub_if_fe2_o[0].copy_if({i_sub_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)], i_sub_fe6_if.dat[$bits(FE6_TYPE)+$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, i_sub_fe6_if.val, 1, 1, i_sub_fe6_if.err, i_sub_fe6_if.mod, i_sub_fe6_if.ctl); - sub_if_fe2_o[0].ctl[CTL_BIT] <= sub_cnt; + sub_if_fe2_o[0].ctl[CTL_BIT +: 2] <= sub_cnt; if (i_sub_fe6_if.val) sub_cnt <= 2; end end @@ -161,7 +169,7 @@ always_ff @ (posedge i_clk) begin sub_if_fe2_o[0].copy_if({i_sub_fe6_if.dat[2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)], i_sub_fe6_if.dat[$bits(FE6_TYPE)+2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, i_sub_fe6_if.val, 1, 1, i_sub_fe6_if.err, i_sub_fe6_if.mod, i_sub_fe6_if.ctl); - sub_if_fe2_o[0].ctl[CTL_BIT] <= sub_cnt; + sub_if_fe2_o[0].ctl[CTL_BIT +: 2] <= sub_cnt; if (i_sub_fe6_if.val) sub_cnt <= 0; end end @@ -170,10 +178,10 @@ always_ff @ (posedge i_clk) begin // One process to assign outputs if (~o_sub_fe6_if.val || (o_sub_fe6_if.val && o_sub_fe6_if.rdy)) begin o_sub_fe6_if.ctl <= sub_if_fe2_i[0].ctl; - if (sub_if_fe2_i[0].ctl[CTL_BIT] == 0) begin + if (sub_if_fe2_i[0].ctl[CTL_BIT +: 2] == 0) begin if (sub_if_fe2_i[0].val) o_sub_fe6_if.dat[0 +: $bits(FE2_TYPE)] <= sub_if_fe2_i[0].dat; - end else if (sub_if_fe2_i[0].ctl[CTL_BIT] == 1) begin + end else if (sub_if_fe2_i[0].ctl[CTL_BIT +: 2] == 1) begin if (sub_if_fe2_i[0].val) o_sub_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= sub_if_fe2_i[0].dat; end else begin @@ -185,149 +193,342 @@ always_ff @ (posedge i_clk) begin end // Multiplications are calculated using the formula in bls12_381.pkg::fe6_mul() -logic [2:0] mul_cnt; -logic [1:0] add_sub_val; -always_comb begin - i_mul_fe2_if.rdy = (fp_mode_mul || mul_state == MUL3) && (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)); - i_mul_fe_if.rdy = fp_mode_mul ? ~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy) : - (i_mul_fe_if.ctl[CTL_BIT +: 2] == 0 || i_mul_fe_if.ctl[CTL_BIT +: 2] == 1) ? - (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy)) : - (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy)); +logic [3:0] mul_cnt, add_mul_cnt, sub_mul_cnt, mnr_cnt; +logic [2:0] mul_val; +FE2_TYPE a_a, b_b, c_c; - o_mul_fe2_if.val = &add_sub_val; - sub_if_fe_i[1].rdy = ~add_sub_val[1] || (o_mul_fe2_if.val && o_mul_fe2_if.rdy); - add_if_fe_i[1].rdy = ~add_sub_val[0] || (o_mul_fe2_if.val && o_mul_fe2_if.rdy); +always_comb begin + + case(i_mul_fe2_if.ctl[CTL_BIT +: 4]) inside + 0, 1, 2: i_mul_fe2_if.rdy = 1; + 3: i_mul_fe2_if.rdy = sub_mul_cnt == 0 && (~sub_if_fe2_o[1].val || (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy)); + 4: i_mul_fe2_if.rdy = sub_mul_cnt == 2 && (~sub_if_fe2_o[1].val || (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy)); + 5: i_mul_fe2_if.rdy = sub_mul_cnt == 3 && (~sub_if_fe2_o[1].val || (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy)); + default: i_mul_fe2_if.rdy = 0; + endcase + + case(add_if_fe2_i[1].ctl[CTL_BIT +: 4]) inside + 0: add_if_fe2_i[1].rdy = mul_cnt == 3 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)); + 1: add_if_fe2_i[1].rdy = mul_cnt == 3 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)); + 2: add_if_fe2_i[1].rdy = mul_cnt == 4 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)); + 3: add_if_fe2_i[1].rdy = mul_cnt == 4 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)); + 4: add_if_fe2_i[1].rdy = mul_cnt == 5 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)); + 5: add_if_fe2_i[1].rdy = mul_cnt == 5 && (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)); + 6: add_if_fe2_i[1].rdy = sub_mul_cnt == 5 && (~sub_if_fe2_o[1].val || (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy)); + 7, 8: add_if_fe2_i[1].rdy = ~o_mul_fe6_if.val || (o_mul_fe6_if.val && o_mul_fe6_if.rdy); + default: add_if_fe2_i[1].rdy = 0; + endcase + + case(sub_if_fe2_i[1].ctl[CTL_BIT +: 4]) inside + 0: sub_if_fe2_i[1].rdy = sub_mul_cnt == 1 && (~sub_if_fe2_o[1].val || (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy)); + 1: sub_if_fe2_i[1].rdy = mnr_cnt == 0 && (~o_mnr_fe2_if.val || (o_mnr_fe2_if.val && o_mnr_fe2_if.rdy)); + 2: sub_if_fe2_i[1].rdy = add_mul_cnt == 6 && (~add_if_fe2_o[1].val || (add_if_fe2_o[1].val && add_if_fe2_o[1].rdy)); + 3: sub_if_fe2_i[1].rdy = sub_mul_cnt == 4 && (~sub_if_fe2_o[1].val || (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy)); + 4: sub_if_fe2_i[1].rdy = add_mul_cnt == 8 && i_mnr_fe2_if.val && i_mnr_fe2_if.ctl[CTL_BIT +: 4] == 1 && (~add_if_fe2_o[1].val || (add_if_fe2_o[1].val && add_if_fe2_o[1].rdy)); + 5: sub_if_fe2_i[1].rdy = ~mul_val[2]; + default: sub_if_fe2_i[1].rdy = 0; + endcase + + case(i_mnr_fe2_if.ctl[CTL_BIT +: 4]) inside + 0: i_mnr_fe2_if.rdy = add_mul_cnt == 7 && (~add_if_fe2_o[1].val || (add_if_fe2_o[1].val && add_if_fe2_o[1].rdy)); + 1: i_mnr_fe2_if.rdy = add_mul_cnt == 8 && sub_if_fe2_i[1].val && sub_if_fe2_i[1].ctl[CTL_BIT +: 4] == 4 && (~add_if_fe2_o[1].val || (add_if_fe2_o[1].val && add_if_fe2_o[1].rdy)); + default: i_mnr_fe2_if.rdy = 0; + endcase + + o_mul_fe6_if.val <= &mul_val; end always_ff @ (posedge i_clk) begin if (i_rst) begin - add_sub_val <= 0; - o_mul_fe2_if.sop <= 1; - o_mul_fe2_if.eop <= 1; - o_mul_fe2_if.err <= 0; - o_mul_fe2_if.ctl <= 0; - o_mul_fe2_if.dat <= 0; - o_mul_fe2_if.mod <= 0; - mul_state <= MUL0; - o_mul_fe_if.reset_source(); - sub_if_fe_o[1].copy_if(0, 0, 1, 1, 0, 0, 0); - add_if_fe_o[1].copy_if(0, 0, 1, 1, 0, 0, 0); - fp_mode_mul <= 0; + o_mul_fe6_if.ctl <= 0; + o_mul_fe6_if.mod <= 0; + o_mul_fe6_if.err <= 0; + o_mul_fe6_if.sop <= 1; + o_mul_fe6_if.eop <= 1; + o_mul_fe6_if.dat <= 0; + o_mnr_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0); + mul_cnt <= 0; + add_mul_cnt <= 0; + sub_mul_cnt <= 0; + mnr_cnt <= 0; + o_mul_fe2_if.copy_if(0, 0, 1, 1, 0, 0, 0); + sub_if_fe2_o[1].copy_if(0, 0, 1, 1, 0, 0, 0); + add_if_fe2_o[1].copy_if(0, 0, 1, 1, 0, 0, 0); + a_a <= 0; + b_b <= 0; + c_c <= 0; + i_mul_fe6_if.rdy <= 0; + mul_val <= 0; end else begin - fp_mode_mul <= i_fp_mode; - - if (o_mul_fe2_if.val && o_mul_fe2_if.rdy) begin - add_sub_val <= 0; - end - if (o_mul_fe_if.val && o_mul_fe_if.rdy) o_mul_fe_if.val <= 0; - if (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy) sub_if_fe_o[1].val <= 0; - if (add_if_fe_o[1].val && add_if_fe_o[1].rdy) add_if_fe_o[1].val <= 0; - - // One process to parse inputs and send them to the multiplier - if (~o_mul_fe_if.val || (o_mul_fe_if.val && o_mul_fe_if.rdy)) begin - case (mul_state) - MUL0: begin - o_mul_fe_if.copy_if({i_mul_fe2_if.dat[0 +: $bits(FE_TYPE)], - i_mul_fe2_if.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)]}, - i_mul_fe2_if.val, 1, 1, i_mul_fe2_if.err, i_mul_fe2_if.mod, i_mul_fe2_if.ctl); - o_mul_fe_if.ctl[CTL_BIT +: 2] <= 0; - if (i_mul_fe2_if.val && ~fp_mode_mul) mul_state <= MUL1; + i_mul_fe6_if.rdy <= 0; + + if (o_mul_fe6_if.val && o_mul_fe6_if.rdy) mul_val <= 0; + if (o_mul_fe2_if.val && o_mul_fe2_if.rdy) o_mul_fe2_if.val <= 0; + if (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy) sub_if_fe2_o[1].val <= 0; + if (add_if_fe2_o[1].val && add_if_fe2_o[1].rdy) add_if_fe2_o[1].val <= 0; + if (o_mnr_fe2_if.val && o_mnr_fe2_if.rdy) o_mnr_fe2_if.val <= 0; + + // Multiplications + if (~o_mul_fe2_if.val || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin + case(mul_cnt) + 0: begin + o_mul_fe2_if.copy_if({i_mul_fe6_if.dat[0 +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[$bits(FE6_TYPE) +: $bits(FE2_TYPE)]}, + i_mul_fe6_if.val, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + if (i_mul_fe6_if.val) mul_cnt <= mul_cnt + 1; + end + 1: begin + o_mul_fe2_if.copy_if({i_mul_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[$bits(FE6_TYPE) + $bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + 1, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + mul_cnt <= mul_cnt + 1; + end + 2: begin + o_mul_fe2_if.copy_if({i_mul_fe6_if.dat[2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[$bits(FE6_TYPE) + 2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + 1, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + mul_cnt <= mul_cnt + 1; end - MUL1: begin - o_mul_fe_if.copy_if({i_mul_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)], - i_mul_fe2_if.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]}, - i_mul_fe2_if.val, 1, 1, i_mul_fe2_if.err, i_mul_fe2_if.mod, i_mul_fe2_if.ctl); - o_mul_fe_if.ctl[CTL_BIT +: 2] <= 1; - if (i_mul_fe2_if.val) mul_state <= MUL2; + 3: begin + if (add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 0) + o_mul_fe2_if.dat[0 +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + if (add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 1) begin + o_mul_fe2_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + if (add_if_fe2_i[1].val) begin + o_mul_fe2_if.val <= 1; + mul_cnt <= mul_cnt + 1; + end + end end - MUL2: begin - o_mul_fe_if.copy_if({i_mul_fe2_if.dat[0 +: $bits(FE_TYPE)], - i_mul_fe2_if.dat[$bits(FE2_TYPE) + $bits(FE_TYPE) +: $bits(FE_TYPE)]}, - i_mul_fe2_if.val, 1, 1, i_mul_fe2_if.err, i_mul_fe2_if.mod, i_mul_fe2_if.ctl); - o_mul_fe_if.ctl[CTL_BIT +: 2] <= 2; - if (i_mul_fe2_if.val) mul_state <= MUL3; + 4: begin + if (add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 2) + o_mul_fe2_if.dat[0 +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + if (add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 3) begin + o_mul_fe2_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + if (add_if_fe2_i[1].val) begin + o_mul_fe2_if.val <= 1; + mul_cnt <= mul_cnt + 1; + end + end end - MUL3: begin - o_mul_fe_if.copy_if({i_mul_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)], - i_mul_fe2_if.dat[$bits(FE2_TYPE) +: $bits(FE_TYPE)]}, - i_mul_fe2_if.val, 1, 1, i_mul_fe2_if.err, i_mul_fe2_if.mod, i_mul_fe2_if.ctl); - o_mul_fe_if.ctl[CTL_BIT +: 2] <= 3; - if (i_mul_fe2_if.val) mul_state <= MUL0; + 5: begin + if (add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 4) + o_mul_fe2_if.dat[0 +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + + if (add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 5) begin + o_mul_fe2_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + if (add_if_fe2_i[1].val) begin + o_mul_fe2_if.val <= 1; + mul_cnt <= 0; + end + end end endcase + o_mul_fe2_if.ctl[CTL_BIT +: 4] <= mul_cnt; + end + + // Additions + if (~add_if_fe2_o[1].val || (add_if_fe2_o[1].val && add_if_fe2_o[1].rdy)) begin + case (add_mul_cnt) + 0: begin + add_if_fe2_o[1].copy_if({i_mul_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + i_mul_fe6_if.val, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + if (i_mul_fe6_if.val) add_mul_cnt <= add_mul_cnt + 1; + end + 1: begin + add_if_fe2_o[1].copy_if({i_mul_fe6_if.dat[$bits(FE6_TYPE) + $bits(FE2_TYPE) +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[$bits(FE6_TYPE) + 2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + 1, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + add_mul_cnt <= add_mul_cnt + 1; + end + 2: begin + add_if_fe2_o[1].copy_if({i_mul_fe6_if.dat[$bits(FE6_TYPE) +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[$bits(FE6_TYPE) + 2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + 1, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + add_mul_cnt <= add_mul_cnt + 1; + end + 3: begin + add_if_fe2_o[1].copy_if({i_mul_fe6_if.dat[0 +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + 1, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + add_mul_cnt <= add_mul_cnt + 1; + end + 4: begin + add_if_fe2_o[1].copy_if({i_mul_fe6_if.dat[$bits(FE6_TYPE) +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[$bits(FE6_TYPE) + $bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + 1, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + add_mul_cnt <= add_mul_cnt + 1; + end + 5: begin + add_if_fe2_o[1].copy_if({i_mul_fe6_if.dat[0 +: $bits(FE2_TYPE)], + i_mul_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)]}, + 1, 1, 1, i_mul_fe6_if.err, i_mul_fe6_if.mod, i_mul_fe6_if.ctl); + add_mul_cnt <= add_mul_cnt + 1; + i_mul_fe6_if.rdy <= 1; // Release input here + end + 6: begin + add_if_fe2_o[1].dat[0 +: $bits(FE2_TYPE)] <= b_b; + add_if_fe2_o[1].dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= sub_if_fe2_i[1].dat; + add_if_fe2_o[1].ctl <= sub_if_fe2_i[1].ctl; + if (sub_if_fe2_i[1].val && sub_if_fe2_i[1].ctl[CTL_BIT +: 4] == 2) begin + add_mul_cnt <= add_mul_cnt + 1; + add_if_fe2_o[1].val <= 1; + end + end + 7: begin + add_if_fe2_o[1].dat[0 +: $bits(FE2_TYPE)] <= a_a; + add_if_fe2_o[1].dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= i_mnr_fe2_if.dat; + add_if_fe2_o[1].ctl <= i_mnr_fe2_if.ctl; + if (i_mnr_fe2_if.val && i_mnr_fe2_if.ctl[CTL_BIT +: 4] == 0) begin + add_mul_cnt <= add_mul_cnt + 1; + add_if_fe2_o[1].val <= 1; + end + end + 8: begin + if (sub_if_fe2_i[1].val && sub_if_fe2_i[1].ctl[CTL_BIT +: 4] == 4 && + i_mnr_fe2_if.val && i_mnr_fe2_if.ctl[CTL_BIT +: 4] == 1) begin + add_if_fe2_o[1].dat[0 +: $bits(FE2_TYPE)] <= sub_if_fe2_i[1].dat; + add_if_fe2_o[1].ctl <= sub_if_fe2_i[1].ctl; + add_if_fe2_o[1].dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= i_mnr_fe2_if.dat; + add_mul_cnt <= 0; + add_if_fe2_o[1].val <= 1; + end + end + endcase + add_if_fe2_o[1].ctl[CTL_BIT +: 4] <= add_mul_cnt; end - // Process multiplications and do subtraction - if (~fp_mode_mul && (~sub_if_fe_o[1].val || (sub_if_fe_o[1].val && sub_if_fe_o[1].rdy))) begin - if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 0) begin - if (i_mul_fe_if.val) sub_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_fe_if.dat; - end - if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 1) begin - sub_if_fe_o[1].val <= i_mul_fe_if.val; - sub_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_fe_if.dat; - end - sub_if_fe_o[1].ctl <= i_mul_fe_if.ctl; + // Subtractions + if (~sub_if_fe2_o[1].val || (sub_if_fe2_o[1].val && sub_if_fe2_o[1].rdy)) begin + case (sub_mul_cnt) + 0: begin + if (i_mul_fe2_if.ctl[CTL_BIT +: 4] == 3) begin + sub_if_fe2_o[1].dat <= {b_b, i_mul_fe2_if.dat}; + sub_if_fe2_o[1].val <= i_mul_fe2_if.val; + sub_if_fe2_o[1].ctl <= i_mul_fe2_if.ctl; + if (i_mul_fe2_if.val) sub_mul_cnt <= sub_mul_cnt + 1; + end + end + 1: begin + if (sub_if_fe2_i[1].ctl[CTL_BIT +: 4] == 0) begin + sub_if_fe2_o[1].dat <= {c_c, sub_if_fe2_i[1].dat}; + sub_if_fe2_o[1].val <= sub_if_fe2_i[1].val; + sub_if_fe2_o[1].ctl <= sub_if_fe2_i[1].ctl; + if (sub_if_fe2_i[1].val) sub_mul_cnt <= sub_mul_cnt + 1; + end + end + 2: begin + if (i_mul_fe2_if.ctl[CTL_BIT +: 4] == 4) begin + sub_if_fe2_o[1].dat <= {a_a, i_mul_fe2_if.dat}; + sub_if_fe2_o[1].val <= i_mul_fe2_if.val; + sub_if_fe2_o[1].ctl <= i_mul_fe2_if.ctl; + if (i_mul_fe2_if.val) sub_mul_cnt <= sub_mul_cnt + 1; + end + end + 3: begin + if (i_mul_fe2_if.ctl[CTL_BIT +: 4] == 5) begin + sub_if_fe2_o[1].dat <= {a_a, i_mul_fe2_if.dat}; + sub_if_fe2_o[1].val <= i_mul_fe2_if.val; + sub_if_fe2_o[1].ctl <= i_mul_fe2_if.ctl; + if (i_mul_fe2_if.val) sub_mul_cnt <= sub_mul_cnt + 1; + end + end + 4: begin + if (sub_if_fe2_i[1].ctl[CTL_BIT +: 4] == 3) begin + sub_if_fe2_o[1].dat <= {b_b, sub_if_fe2_i[1].dat}; + sub_if_fe2_o[1].val <= sub_if_fe2_i[1].val; + sub_if_fe2_o[1].ctl <= sub_if_fe2_i[1].ctl; + if (sub_if_fe2_i[1].val) sub_mul_cnt <= sub_mul_cnt + 1; + end + end + 5: begin + if (add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 6) begin + sub_if_fe2_o[1].dat <= {c_c, add_if_fe2_i[1].dat}; + sub_if_fe2_o[1].val <= add_if_fe2_i[1].val; + sub_if_fe2_o[1].ctl <= add_if_fe2_i[1].ctl; + if (add_if_fe2_i[1].val) sub_mul_cnt <= 0; + end + end + endcase + sub_if_fe2_o[1].ctl[CTL_BIT +: 4] <= sub_mul_cnt; end - // Process multiplications and do addition - if (~fp_mode_mul && (~add_if_fe_o[1].val || (add_if_fe_o[1].val && add_if_fe_o[1].rdy))) begin - if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 2) begin - if (i_mul_fe_if.val) add_if_fe_o[1].dat[0 +: $bits(FE_TYPE)] <= i_mul_fe_if.dat; - end - if (i_mul_fe_if.ctl[CTL_BIT +: 2] == 3) begin - add_if_fe_o[1].val <= i_mul_fe_if.val; - add_if_fe_o[1].dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= i_mul_fe_if.dat; - end - add_if_fe_o[1].ctl <= i_mul_fe_if.ctl; + // Non-residue + if (~o_mnr_fe2_if.val || (o_mnr_fe2_if.val && o_mnr_fe2_if.rdy)) begin + case(mnr_cnt) + 0: begin + if (sub_if_fe2_i[1].ctl[CTL_BIT +: 4] == 1) begin + o_mnr_fe2_if.dat <= sub_if_fe2_i[1].dat; + o_mnr_fe2_if.val <= sub_if_fe2_i[1].val; + o_mnr_fe2_if.ctl <= 0; + o_mnr_fe2_if.ctl[CTL_BIT +: 4] <= mnr_cnt; + if (sub_if_fe2_i[1].val) mnr_cnt <= mnr_cnt + 1; + end + end + 1: begin + o_mnr_fe2_if.dat <= c_c; + o_mnr_fe2_if.val <= 1; + o_mnr_fe2_if.ctl <= 0; + o_mnr_fe2_if.ctl[CTL_BIT +: 4] <= mnr_cnt; + mnr_cnt <= 0; + end + endcase end - // One process to assign output - // If we are in fp_mode - if (fp_mode_mul) begin - if (~add_sub_val[0] || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin - o_mul_fe2_if.dat[0 +: $bits(FE_TYPE)] <= i_mul_fe_if.dat; - o_mul_fe2_if.ctl <= i_mul_fe_if.ctl; - add_sub_val <= {2{i_mul_fe_if.val}}; - end - end else begin - if (~add_sub_val[0] || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin - o_mul_fe2_if.ctl <= add_if_fe_i[1].ctl; - o_mul_fe2_if.dat[$bits(FE_TYPE) +: $bits(FE_TYPE)] <= add_if_fe_i[1].dat; - add_sub_val[0] <= add_if_fe_i[1].val; - end - - if (~add_sub_val[1] || (o_mul_fe2_if.val && o_mul_fe2_if.rdy)) begin - o_mul_fe2_if.dat[0 +: $bits(FE_TYPE)] <= sub_if_fe_i[1].dat; - add_sub_val[1] <= sub_if_fe_i[1].val; - end + // Take results from multiplications to save a_a, b_b, c_c + if (i_mul_fe2_if.val && i_mul_fe2_if.rdy) begin + case (i_mul_fe2_if.ctl[CTL_BIT +: 4]) + 0: a_a <= i_mul_fe2_if.dat; + 1: b_b <= i_mul_fe2_if.dat; + 2: c_c <= i_mul_fe2_if.dat; + endcase + end + + // Final output valid + if (sub_if_fe2_i[1].val && sub_if_fe2_i[1].ctl[CTL_BIT +: 4] == 5 && ~mul_val[2]) begin + o_mul_fe6_if.dat[2*$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= sub_if_fe2_i[1].dat; + o_mul_fe6_if.ctl <= sub_if_fe2_i[1].ctl; + mul_val[2] <= 1; + end + if (add_if_fe2_i[1].val && add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 8 && ~mul_val[1]) begin + o_mul_fe6_if.dat[$bits(FE2_TYPE) +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + o_mul_fe6_if.ctl <= add_if_fe2_i[1].ctl; + mul_val[1] <= 1; end + if (add_if_fe2_i[1].val && add_if_fe2_i[1].ctl[CTL_BIT +: 4] == 7 && ~mul_val[0]) begin + o_mul_fe6_if.dat[0 +: $bits(FE2_TYPE)] <= add_if_fe2_i[1].dat; + o_mul_fe6_if.ctl <= add_if_fe2_i[1].ctl; + mul_val[0] <= 1; + end + end end resource_share # ( .NUM_IN ( 2 ), .DAT_BITS ( 2*$bits(FE2_TYPE) ), - .CTL_BITS ( CTL_BIT+4 ), - .OVR_WRT_BIT ( CTL_BIT+2 ), + .CTL_BITS ( CTL_BIT+6 ), + .OVR_WRT_BIT ( CTL_BIT+4 ), .PIPELINE_IN ( 0 ), .PIPELINE_OUT ( 0 ) ) resource_share_sub ( .i_clk ( i_clk ), .i_rst ( i_rst ), - .i_axi ( sub_if_fe_o[1:0] ), - .o_res ( o_sub_fe_if ), - .i_res ( i_sub_fe_if ), - .o_axi ( sub_if_fe_i[1:0] ) + .i_axi ( sub_if_fe2_o[1:0] ), + .o_res ( o_sub_fe2_if ), + .i_res ( i_sub_fe2_if ), + .o_axi ( sub_if_fe2_i[1:0] ) ); resource_share # ( .NUM_IN ( 2 ), .DAT_BITS ( 2*$bits(FE2_TYPE) ), - .CTL_BITS ( CTL_BIT+4 ), - .OVR_WRT_BIT ( CTL_BIT+2 ), + .CTL_BITS ( CTL_BIT+6 ), + .OVR_WRT_BIT ( CTL_BIT+4 ), .PIPELINE_IN ( 0 ), .PIPELINE_OUT ( 0 ) )