From 2b37dfdd7605b70b5d6b1eab7648eabcb6a7d6fe Mon Sep 17 00:00:00 2001
From: Simon Peffers <simon@supranational.net>
Date: Sat, 3 Aug 2019 16:36:49 -0400
Subject: [PATCH] Update documentation, update python model and expected
 results for contest modulus, prepare for test portal.

---
 README.md                         | 22 +++++++++++++++++-----
 docs/aws_f1.md                    |  8 ++++----
 docs/test_portal.md               |  2 +-
 docs/verilator.md                 |  2 +-
 modular_square/model/vdf_basic.py |  5 ++---
 msu/Makefile                      |  8 ++++++--
 msu/rtl/sdaccel/Makefile.sdaccel  |  1 -
 msu/sw/MSU.cpp                    | 11 +++++++++++
 vdf_portal_config.json            |  3 +++
 9 files changed, 45 insertions(+), 17 deletions(-)
 create mode 100644 vdf_portal_config.json

diff --git a/README.md b/README.md
index c13a99a..6f0a3da 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 This repository contains the modular squaring multiplier baseline design for the VDF (Verifiable Delay Function) low latency multiplier FPGA competition. For more information about the research behind VDFs see <https://vdfresearch.org/>.
 
-The goal of the competition is to create the fastest (lowest latency) 1024 bit modular squaring circuit possible targeting the AWS F1 FPGA platform. Up to $100k in prizes is available across two rounds of the competition. For additional detail see [FPGA Contest](https://supranational.atlassian.net/wiki/spaces/VA/pages/36569208/FPGA+Contest) on the [VDF Alliance](https://supranational.atlassian.net/wiki/spaces/VA/overview) page.
+The goal of the competition is to create the fastest (lowest latency) 1024 bit modular squaring circuit possible targeting the AWS F1 FPGA platform. Up to $100k in prizes is available across two rounds of the competition. For additional detail see [FPGA Contest Wiki](https://supranational.atlassian.net/wiki/spaces/VA/pages/36569208/FPGA+Contest) on the [VDF Alliance](https://supranational.atlassian.net/wiki/spaces/VA/overview) page.
 
 Official competition rules can be found in [FPGA_Competition_Official_Rules_and_Disclosures.pdf](FPGA_Competition_Official_Rules_and_Disclosures.pdf).
 
@@ -20,13 +20,25 @@ x, N are 1024 bits
 t = 2^30
 
 x = random
+
+Decimal:
+N = 12406669568412474139879892740481443274469842712573568412813185506
+    49768953373091389100150712146576743094431494074574934345790638408
+    41220334555160125016331040933690674569571217337630239191517205721
+    31019760838723984636436085022089677296497856968322944926681990341
+    4117058030106528073928633017118689826625594484331
+
+Hex:
+N = 0xb0ad4555c1ee34c8cb0577d7105a475171760330d577a0777ddcb955b302ad0
+    803487d78ca267e8e9f5e3f46e35e10ca641a27e622b2d04bb09f3f5e3ad274b1
+    744f34aeaf90fd45129a02a298dbc430f404f9988c862d10b58c91faba2aa2922
+    f079229b0c8f88d86bfe6def7d026294ed9dee2504b5d30466f7b0488e2666b
 ```
 
 Here is a sample implementation in Python:
 ```
 #!/usr/bin/python3
 
-from Crypto.PublicKey import RSA
 from random import getrandbits
 
 # Competition is for 1024 bits
@@ -36,10 +48,10 @@ NUM_ITERATIONS = 1000
 
 # Rather than being random each time, we will provide randomly generated values
 x = getrandbits(NUM_BITS)
-N = RSA.generate(NUM_BITS).n
+N = 124066695684124741398798927404814432744698427125735684128131855064976895337309138910015071214657674309443149407457493434579063840841220334555160125016331040933690674569571217337630239191517205721310197608387239846364360850220896772964978569683229449266819903414117058030106528073928633017118689826625594484331
 
 # t should be small for testing purposes.  
-# For the final FPGA runs, t will be around 1 billion
+# For the final FPGA runs, t will be 2^30
 t = NUM_ITERATIONS
 
 # Iterative modular squaring t times
@@ -182,7 +194,7 @@ The following are some potential optimization paths.
 * Try other algorithms such as Chinese Remainder Theorem, Montgomery/Barrett, etc. 
 * Shorten the pipeline - we believe a 4-5 cycle pipeline is possible with this design
 * Lengthen the pipeline - insert more pipe stages, run with a faster clock
-* Change the partial product multiplier size. The DSPs are 26x17 bit multipliers and the modular squaring circuit supports using either by changing a define at the top.
+* Change the partial product multiplier size. The DSPs are 26x17 bit unsigned multipliers. The Ozturk modular squaring circuit supports using either 17x17 or 26x17 bit multipliers by changing a define at the top of the file.
 * This design uses lookup tables stored in BlockRAM for the reduction step. These are easy to change to distributed memory and there is support in the model to use UltraRAM. For an example using UltraRAM see https://github.com/supranational/vdf-fpga/tree/f72eb8c06eec94a09142f675cde8d1514fb72e60
 * Optimize the compression trees and accumulators to make the best use of FPGA LUTs and CARRY8 primitives.
 * Floorplan the design.
diff --git a/docs/aws_f1.md b/docs/aws_f1.md
index ac1be6e..0fb8e67 100644
--- a/docs/aws_f1.md
+++ b/docs/aws_f1.md
@@ -136,11 +136,11 @@ sudo su
 source $AWS_FPGA_REPO_DIR/sdaccel_runtime_setup.sh 
 
 # Run a short test and verify the result in software
-./host -e -u 0 -f 100
+./host -e -f 100
 
 # Run a billion iterations starting with an input of 2
-./host -u 0 -s 0x2 -f 1000000000
+./host -s 0x2 -f 1073741824
 ```
 
-The expected result of 2^2^1B using the default 1k (64 coefficient) modulus in the Makefile is:
-`305939394796769797811431929207587607176284037479412924905827147439718856946037842431593490055940763973150879770720223457997191020439404083394702653096083649807090448385799021330059496823106654989629199132438283594347957634468046231084628857389350823217443926925454895121571284954146032303555585511855910526`
+The expected result of 2^(2^(2^30)) mod N (the input 2 squared 2^30 times) using the default 1k (64 coefficient) modulus in the Makefile is:
+`9782776834334634490446343758704728706980122657033141222406929631982781114105293252444979173994924549755313289718816652420124314107449156688222852673024696927113240716169907514261823484008194829047317452425855361884165852504086556390349991640188347831084926001670580437428161157316196941905575574310934275893`
diff --git a/docs/test_portal.md b/docs/test_portal.md
index 4c962a8..11e9a61 100644
--- a/docs/test_portal.md
+++ b/docs/test_portal.md
@@ -2,7 +2,7 @@
 
 The online test portal dramatically lowers the bar to testing your design in AWS F1 environment. 
 
-Rather than go through the process of enabling AWS, the F1 environment, etc., you can design, test and tune your multiplier and Vivado and submit it to the portal to make sure the results are what you expect. 
+Rather than go through the process of enabling AWS, the F1 environment, etc., you can design, test and tune your multiplier in Vivado and submit it to the portal to make sure the results are what you expect. 
 
 Once you submit your design, the test portal will clone your repo, run simulation, hardware emulation, synthesis/place and route, and provide the results back to you in an encrypted file on S3. 
 
diff --git a/docs/verilator.md b/docs/verilator.md
index 156d436..8466365 100644
--- a/docs/verilator.md
+++ b/docs/verilator.md
@@ -2,7 +2,7 @@
 
 The Ozturk design supports verilator as a simulator. 
 
-While we're big fans of verilator, it unfortunately doesn't support 1024 bit modular squaring using * and %. As a result the default bitwidth for this design when using verilator is 128 bits. We found it can also be finicky with large bitwidths. Unpacked arrays of 
+While we're big fans of verilator, it unfortunately doesn't support 1024 bit modular squaring using * and %. As a result the default bitwidth for this design when using verilator is 128 bits. We found it can also be finicky with large bitwidths. Unpacked arrays of smaller words seem more stable.
 
 Enabling verilator takes just a few steps on Ubuntu 18 and AWS F1 CentOS. The setup script requires sudo access to install dependencies.
 
diff --git a/modular_square/model/vdf_basic.py b/modular_square/model/vdf_basic.py
index ecc5fab..d979300 100644
--- a/modular_square/model/vdf_basic.py
+++ b/modular_square/model/vdf_basic.py
@@ -1,6 +1,5 @@
 #!/usr/bin/python3
 
-from Crypto.PublicKey import RSA
 from random import getrandbits
 
 # Competition is for 1024 bits
@@ -10,10 +9,10 @@
 
 # Rather than being random each time, we will provide randomly generated values
 x = getrandbits(NUM_BITS)
-N = RSA.generate(NUM_BITS).n
+N = 124066695684124741398798927404814432744698427125735684128131855064976895337309138910015071214657674309443149407457493434579063840841220334555160125016331040933690674569571217337630239191517205721310197608387239846364360850220896772964978569683229449266819903414117058030106528073928633017118689826625594484331
 
 # t should be small for testing purposes.  
-# For the final FPGA runs, t will be around 1 billion
+# For the final FPGA runs, t will be 2^30
 t = NUM_ITERATIONS
 
 # Iterative modular squaring t times
diff --git a/msu/Makefile b/msu/Makefile
index 7d98e28..466b58f 100644
--- a/msu/Makefile
+++ b/msu/Makefile
@@ -42,15 +42,19 @@ judge:
 
 hw_emu:
 	make clean
-	MOD_LEN=1024 SIMPLE_SQ=0 $(MAKE) -C $(SDACCEL_DIR) hw_emu
+	OBJ=obj_hw_emu MOD_LEN=1024 SIMPLE_SQ=0 $(MAKE) -C $(SDACCEL_DIR) hw_emu
 
 hw_emu_simple:
 	make clean
-	MOD_LEN=1024 SIMPLE_SQ=1 $(MAKE) -C $(SDACCEL_DIR) hw_emu
+	OBJ=obj_hw_emu MOD_LEN=1024 SIMPLE_SQ=1 $(MAKE) -C $(SDACCEL_DIR) hw_emu
 
 hw:
 	MOD_LEN=1024 SIMPLE_SQ=0 $(MAKE) -C $(SDACCEL_DIR) hw
 
+synthesis:
+	make clean
+	OBJ=obj_hw_emu MOD_LEN=1024 SIMPLE_SQ=0 $(MAKE) -C $(SDACCEL_DIR) hw_emu
+	OBJ=obj_hw     MOD_LEN=1024 SIMPLE_SQ=0 $(MAKE) -C $(SDACCEL_DIR) hw
 
 
 # Additional, mostly verilator, targets
diff --git a/msu/rtl/sdaccel/Makefile.sdaccel b/msu/rtl/sdaccel/Makefile.sdaccel
index 108a000..edf965e 100755
--- a/msu/rtl/sdaccel/Makefile.sdaccel
+++ b/msu/rtl/sdaccel/Makefile.sdaccel
@@ -69,7 +69,6 @@ LDCLFLAGS += --xp "vivado_prop:run.impl_1.{STEPS.PLACE_DESIGN.TCL.PRE}=\
 
 LDCLFLAGS += --kernel_frequency 161
 
-
 ############################################################################
 # AWS/SDAccel configuration
 ############################################################################
diff --git a/msu/sw/MSU.cpp b/msu/sw/MSU.cpp
index e9e96eb..112cd49 100644
--- a/msu/sw/MSU.cpp
+++ b/msu/sw/MSU.cpp
@@ -110,7 +110,18 @@ void MSU::prepare_random_job(bool rrandom) {
 void MSU::compute_job() {
     struct timespec start_ts;
     start_ts = timer_start();
+    
+    //////////////////////////////////////////////////////////////////////
+    // PREPROCESSING goes below this line (Montgomery conversion, etc)
+    //
+
+    // Perform the computation
     device.compute_job(t_start, t_final, sq_in, sq_out);
+
+    //
+    // POSTPROCESSING goes above this line (Montgomery conversion, etc)
+    //////////////////////////////////////////////////////////////////////
+
     compute_time = timer_end(start_ts);
 
     if(!quiet) {
diff --git a/vdf_portal_config.json b/vdf_portal_config.json
new file mode 100644
index 0000000..bb89292
--- /dev/null
+++ b/vdf_portal_config.json
@@ -0,0 +1,3 @@
+{
+  "target": "liveness"
+}