diff --git a/core/crypto/_fiat/field_curve448/field.odin b/core/crypto/_fiat/field_curve448/field.odin new file mode 100644 index 00000000000..540d88f2813 --- /dev/null +++ b/core/crypto/_fiat/field_curve448/field.odin @@ -0,0 +1,235 @@ +package field_curve448 + +import "core:mem" + +fe_relax_cast :: #force_inline proc "contextless" ( + arg1: ^Tight_Field_Element, +) -> ^Loose_Field_Element { + return (^Loose_Field_Element)(arg1) +} + +fe_tighten_cast :: #force_inline proc "contextless" ( + arg1: ^Loose_Field_Element, +) -> ^Tight_Field_Element { + return (^Tight_Field_Element)(arg1) +} + +fe_clear :: proc "contextless" ( + arg1: $T, +) where T == ^Tight_Field_Element || T == ^Loose_Field_Element { + mem.zero_explicit(arg1, size_of(arg1^)) +} + +fe_clear_vec :: proc "contextless" ( + arg1: $T, +) where T == []^Tight_Field_Element || T == []^Loose_Field_Element { + for fe in arg1 { + fe_clear(fe) + } +} + +fe_carry_mul_small :: proc "contextless" ( + out1: ^Tight_Field_Element, + arg1: ^Loose_Field_Element, + arg2: u64, +) { + arg2_ := Loose_Field_Element{arg2, 0, 0, 0, 0, 0, 0, 0} + fe_carry_mul(out1, arg1, &arg2_) +} + +fe_carry_pow2k :: proc "contextless" ( + out1: ^Tight_Field_Element, + arg1: ^Loose_Field_Element, + arg2: uint, +) { + // Special case: `arg1^(2 * 0) = 1`, though this should never happen. + if arg2 == 0 { + fe_one(out1) + return + } + + fe_carry_square(out1, arg1) + for _ in 1 ..< arg2 { + fe_carry_square(out1, fe_relax_cast(out1)) + } +} + +fe_carry_inv :: proc "contextless" ( + out1: ^Tight_Field_Element, + arg1: ^Loose_Field_Element, +) { + // Inversion computation is derived from the addition chain: + // + // _10 = 2*1 + // _11 = 1 + _10 + // _110 = 2*_11 + // _111 = 1 + _110 + // _111000 = _111 << 3 + // _111111 = _111 + _111000 + // x12 = _111111 << 6 + _111111 + // x24 = x12 << 12 + x12 + // i34 = x24 << 6 + // x30 = _111111 + i34 + // x48 = i34 << 18 + x24 + // x96 = x48 << 48 + x48 + // x192 = x96 << 96 + x96 + // x222 = x192 << 30 + x30 + // x223 = 2*x222 + 1 + // return (x223 << 223 + x222) << 2 + 1 + // + // Operations: 447 squares 13 multiplies + // + // Generated by github.com/mmcloughlin/addchain v0.4.0. + + t0, t1, t2: Tight_Field_Element = ---, ---, --- + + // Step 1: t0 = x^0x2 + fe_carry_square(&t0, arg1) + + // Step 2: t0 = x^0x3 + fe_carry_mul(&t0, arg1, fe_relax_cast(&t0)) + + // t0.Sqr(t0) + fe_carry_square(&t0, fe_relax_cast(&t0)) + + // Step 4: t0 = x^0x7 + fe_carry_mul(&t0, arg1, fe_relax_cast(&t0)) + + // Step 7: t1 = x^0x38 + fe_carry_pow2k(&t1, fe_relax_cast(&t0), 3) + + // Step 8: t0 = x^0x3f + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 14: t1 = x^0xfc0 + fe_carry_pow2k(&t1, fe_relax_cast(&t0), 6) + + // Step 15: t1 = x^0xfff + fe_carry_mul(&t1, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 27: t2 = x^0xfff000 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 12) + + // Step 28: t1 = x^0xffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2)) + + // Step 34: t2 = x^0x3fffffc0 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 6) + + // Step 35: t0 = x^0x3fffffff + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t2)) + + // Step 53: t2 = x^0xffffff000000 + fe_carry_pow2k(&t2, fe_relax_cast(&t2), 18) + + // Step 54: t1 = x^0xffffffffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2)) + + // Step 102: t2 = x^0xffffffffffff000000000000 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 48) + + // Step 103: t1 = x^0xffffffffffffffffffffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2)) + + // Step 199: t2 = x^0xffffffffffffffffffffffff000000000000000000000000 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 96) + + // Step 200: t1 = x^0xffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2)) + + // Step 230: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffc0000000 + fe_carry_pow2k(&t1, fe_relax_cast(&t1), 30) + + // Step 231: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 232: t1 = x^0x7ffffffffffffffffffffffffffffffffffffffffffffffffffffffe + fe_carry_square(&t1, fe_relax_cast(&t0)) + + // Step 233: t1 = x^0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t1, arg1, fe_relax_cast(&t1)) + + // Step 456: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff80000000000000000000000000000000000000000000000000000000 + fe_carry_pow2k(&t1, fe_relax_cast(&t1), 223) + + // Step 457: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 459: t0 = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffc + fe_carry_pow2k(&t0, fe_relax_cast(&t0), 2) + + // Step 460: z = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffd + fe_carry_mul(out1, arg1, fe_relax_cast(&t0)) + + fe_clear_vec([]^Tight_Field_Element{&t0, &t1, &t2}) +} + +fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) { + out1[0] = 0 + out1[1] = 0 + out1[2] = 0 + out1[3] = 0 + out1[4] = 0 + out1[5] = 0 + out1[6] = 0 + out1[7] = 0 +} + +fe_one :: proc "contextless" (out1: ^Tight_Field_Element) { + out1[0] = 1 + out1[1] = 0 + out1[2] = 0 + out1[3] = 0 + out1[4] = 0 + out1[5] = 0 + out1[6] = 0 + out1[7] = 0 +} + +fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + x4 := arg1[3] + x5 := arg1[4] + x6 := arg1[5] + x7 := arg1[6] + x8 := arg1[7] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +@(optimization_mode = "none") +fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) { + mask := (u64(arg1) * 0xffffffffffffffff) + x := (out1[0] ~ out2[0]) & mask + x1, y1 := out1[0] ~ x, out2[0] ~ x + x = (out1[1] ~ out2[1]) & mask + x2, y2 := out1[1] ~ x, out2[1] ~ x + x = (out1[2] ~ out2[2]) & mask + x3, y3 := out1[2] ~ x, out2[2] ~ x + x = (out1[3] ~ out2[3]) & mask + x4, y4 := out1[3] ~ x, out2[3] ~ x + x = (out1[4] ~ out2[4]) & mask + x5, y5 := out1[4] ~ x, out2[4] ~ x + x = (out1[5] ~ out2[5]) & mask + x6, y6 := out1[5] ~ x, out2[5] ~ x + x = (out1[6] ~ out2[6]) & mask + x7, y7 := out1[6] ~ x, out2[6] ~ x + x = (out1[7] ~ out2[7]) & mask + x8, y8 := out1[7] ~ x, out2[7] ~ x + out1[0], out2[0] = x1, y1 + out1[1], out2[1] = x2, y2 + out1[2], out2[2] = x3, y3 + out1[3], out2[3] = x4, y4 + out1[4], out2[4] = x5, y5 + out1[5], out2[5] = x6, y6 + out1[6], out2[6] = x7, y7 + out1[7], out2[7] = x8, y8 +} \ No newline at end of file diff --git a/core/crypto/_fiat/field_curve448/field51.odin b/core/crypto/_fiat/field_curve448/field51.odin new file mode 100644 index 00000000000..d8e49e04d55 --- /dev/null +++ b/core/crypto/_fiat/field_curve448/field51.odin @@ -0,0 +1,1060 @@ +// The BSD 1-Clause License (BSD-1-Clause) +// +// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file) +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design, +// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package field_curve448 + +// The file provides arithmetic on the field Z/(2^448 - 2^224 - 1) using +// unsaturated 64-bit integer arithmetic. It is derived primarily +// from the machine generated Golang output from the fiat-crypto project. +// +// While the base implementation is provably correct, this implementation +// makes no such claims as the port and optimizations were done by hand. +// +// TODO: +// * When fiat-crypto supports it, using a saturated 64-bit limbs +// instead of 56-bit limbs will be faster, though the gains are +// minimal unless adcx/adox/mulx are used. + +import fiat "core:crypto/_fiat" +import "core:math/bits" + +Loose_Field_Element :: distinct [8]u64 +Tight_Field_Element :: distinct [8]u64 + +@(rodata) +FE_ZERO := Tight_Field_Element{0, 0, 0, 0, 0, 0, 0, 0} +@(rodata) +FE_ONE := Tight_Field_Element{1, 0, 0, 0, 0, 0, 0, 0} + +_addcarryx_u56 :: #force_inline proc "contextless" ( + arg1: fiat.u1, + arg2, arg3: u64, +) -> ( + out1: u64, + out2: fiat.u1, +) { + x1 := ((u64(arg1) + arg2) + arg3) + x2 := (x1 & 0xffffffffffffff) + x3 := fiat.u1((x1 >> 56)) + out1 = x2 + out2 = x3 + return +} + +_subborrowx_u56 :: #force_inline proc "contextless" ( + arg1: fiat.u1, + arg2, arg3: u64, +) -> ( + out1: u64, + out2: fiat.u1, +) { + x1 := ((i64(arg2) - i64(arg1)) - i64(arg3)) + x2 := fiat.u1((x1 >> 56)) + x3 := (u64(x1) & 0xffffffffffffff) + out1 = x3 + out2 = (0x0 - fiat.u1(x2)) + return +} + +fe_carry_mul :: proc "contextless" (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) { + x2, x1 := bits.mul_u64(arg1[7], arg2[7]) + x4, x3 := bits.mul_u64(arg1[7], arg2[6]) + x6, x5 := bits.mul_u64(arg1[7], arg2[5]) + x8, x7 := bits.mul_u64(arg1[6], arg2[7]) + x10, x9 := bits.mul_u64(arg1[6], arg2[6]) + x12, x11 := bits.mul_u64(arg1[5], arg2[7]) + x14, x13 := bits.mul_u64(arg1[7], arg2[7]) + x16, x15 := bits.mul_u64(arg1[7], arg2[6]) + x18, x17 := bits.mul_u64(arg1[7], arg2[5]) + x20, x19 := bits.mul_u64(arg1[6], arg2[7]) + x22, x21 := bits.mul_u64(arg1[6], arg2[6]) + x24, x23 := bits.mul_u64(arg1[5], arg2[7]) + x26, x25 := bits.mul_u64(arg1[7], arg2[7]) + x28, x27 := bits.mul_u64(arg1[7], arg2[6]) + x30, x29 := bits.mul_u64(arg1[7], arg2[5]) + x32, x31 := bits.mul_u64(arg1[7], arg2[4]) + x34, x33 := bits.mul_u64(arg1[7], arg2[3]) + x36, x35 := bits.mul_u64(arg1[7], arg2[2]) + x38, x37 := bits.mul_u64(arg1[7], arg2[1]) + x40, x39 := bits.mul_u64(arg1[6], arg2[7]) + x42, x41 := bits.mul_u64(arg1[6], arg2[6]) + x44, x43 := bits.mul_u64(arg1[6], arg2[5]) + x46, x45 := bits.mul_u64(arg1[6], arg2[4]) + x48, x47 := bits.mul_u64(arg1[6], arg2[3]) + x50, x49 := bits.mul_u64(arg1[6], arg2[2]) + x52, x51 := bits.mul_u64(arg1[5], arg2[7]) + x54, x53 := bits.mul_u64(arg1[5], arg2[6]) + x56, x55 := bits.mul_u64(arg1[5], arg2[5]) + x58, x57 := bits.mul_u64(arg1[5], arg2[4]) + x60, x59 := bits.mul_u64(arg1[5], arg2[3]) + x62, x61 := bits.mul_u64(arg1[4], arg2[7]) + x64, x63 := bits.mul_u64(arg1[4], arg2[6]) + x66, x65 := bits.mul_u64(arg1[4], arg2[5]) + x68, x67 := bits.mul_u64(arg1[4], arg2[4]) + x70, x69 := bits.mul_u64(arg1[3], arg2[7]) + x72, x71 := bits.mul_u64(arg1[3], arg2[6]) + x74, x73 := bits.mul_u64(arg1[3], arg2[5]) + x76, x75 := bits.mul_u64(arg1[2], arg2[7]) + x78, x77 := bits.mul_u64(arg1[2], arg2[6]) + x80, x79 := bits.mul_u64(arg1[1], arg2[7]) + x82, x81 := bits.mul_u64(arg1[7], arg2[4]) + x84, x83 := bits.mul_u64(arg1[7], arg2[3]) + x86, x85 := bits.mul_u64(arg1[7], arg2[2]) + x88, x87 := bits.mul_u64(arg1[7], arg2[1]) + x90, x89 := bits.mul_u64(arg1[6], arg2[5]) + x92, x91 := bits.mul_u64(arg1[6], arg2[4]) + x94, x93 := bits.mul_u64(arg1[6], arg2[3]) + x96, x95 := bits.mul_u64(arg1[6], arg2[2]) + x98, x97 := bits.mul_u64(arg1[5], arg2[6]) + x100, x99 := bits.mul_u64(arg1[5], arg2[5]) + x102, x101 := bits.mul_u64(arg1[5], arg2[4]) + x104, x103 := bits.mul_u64(arg1[5], arg2[3]) + x106, x105 := bits.mul_u64(arg1[4], arg2[7]) + x108, x107 := bits.mul_u64(arg1[4], arg2[6]) + x110, x109 := bits.mul_u64(arg1[4], arg2[5]) + x112, x111 := bits.mul_u64(arg1[4], arg2[4]) + x114, x113 := bits.mul_u64(arg1[3], arg2[7]) + x116, x115 := bits.mul_u64(arg1[3], arg2[6]) + x118, x117 := bits.mul_u64(arg1[3], arg2[5]) + x120, x119 := bits.mul_u64(arg1[2], arg2[7]) + x122, x121 := bits.mul_u64(arg1[2], arg2[6]) + x124, x123 := bits.mul_u64(arg1[1], arg2[7]) + x126, x125 := bits.mul_u64(arg1[7], arg2[0]) + x128, x127 := bits.mul_u64(arg1[6], arg2[1]) + x130, x129 := bits.mul_u64(arg1[6], arg2[0]) + x132, x131 := bits.mul_u64(arg1[5], arg2[2]) + x134, x133 := bits.mul_u64(arg1[5], arg2[1]) + x136, x135 := bits.mul_u64(arg1[5], arg2[0]) + x138, x137 := bits.mul_u64(arg1[4], arg2[3]) + x140, x139 := bits.mul_u64(arg1[4], arg2[2]) + x142, x141 := bits.mul_u64(arg1[4], arg2[1]) + x144, x143 := bits.mul_u64(arg1[4], arg2[0]) + x146, x145 := bits.mul_u64(arg1[3], arg2[4]) + x148, x147 := bits.mul_u64(arg1[3], arg2[3]) + x150, x149 := bits.mul_u64(arg1[3], arg2[2]) + x152, x151 := bits.mul_u64(arg1[3], arg2[1]) + x154, x153 := bits.mul_u64(arg1[3], arg2[0]) + x156, x155 := bits.mul_u64(arg1[2], arg2[5]) + x158, x157 := bits.mul_u64(arg1[2], arg2[4]) + x160, x159 := bits.mul_u64(arg1[2], arg2[3]) + x162, x161 := bits.mul_u64(arg1[2], arg2[2]) + x164, x163 := bits.mul_u64(arg1[2], arg2[1]) + x166, x165 := bits.mul_u64(arg1[2], arg2[0]) + x168, x167 := bits.mul_u64(arg1[1], arg2[6]) + x170, x169 := bits.mul_u64(arg1[1], arg2[5]) + x172, x171 := bits.mul_u64(arg1[1], arg2[4]) + x174, x173 := bits.mul_u64(arg1[1], arg2[3]) + x176, x175 := bits.mul_u64(arg1[1], arg2[2]) + x178, x177 := bits.mul_u64(arg1[1], arg2[1]) + x180, x179 := bits.mul_u64(arg1[1], arg2[0]) + x182, x181 := bits.mul_u64(arg1[0], arg2[7]) + x184, x183 := bits.mul_u64(arg1[0], arg2[6]) + x186, x185 := bits.mul_u64(arg1[0], arg2[5]) + x188, x187 := bits.mul_u64(arg1[0], arg2[4]) + x190, x189 := bits.mul_u64(arg1[0], arg2[3]) + x192, x191 := bits.mul_u64(arg1[0], arg2[2]) + x194, x193 := bits.mul_u64(arg1[0], arg2[1]) + x196, x195 := bits.mul_u64(arg1[0], arg2[0]) + x197, x198 := bits.add_u64(x43, x31, u64(0x0)) + x199, _ := bits.add_u64(x44, x32, u64(fiat.u1(x198))) + x201, x202 := bits.add_u64(x53, x197, u64(0x0)) + x203, _ := bits.add_u64(x54, x199, u64(fiat.u1(x202))) + x205, x206 := bits.add_u64(x61, x201, u64(0x0)) + x207, _ := bits.add_u64(x62, x203, u64(fiat.u1(x206))) + x209, x210 := bits.add_u64(x153, x205, u64(0x0)) + x211, _ := bits.add_u64(x154, x207, u64(fiat.u1(x210))) + x213, x214 := bits.add_u64(x163, x209, u64(0x0)) + x215, _ := bits.add_u64(x164, x211, u64(fiat.u1(x214))) + x217, x218 := bits.add_u64(x175, x213, u64(0x0)) + x219, _ := bits.add_u64(x176, x215, u64(fiat.u1(x218))) + x221, x222 := bits.add_u64(x189, x217, u64(0x0)) + x223, _ := bits.add_u64(x190, x219, u64(fiat.u1(x222))) + x225 := ((x221 >> 56) | ((x223 << 8) & 0xffffffffffffffff)) + x226 := (x221 & 0xffffffffffffff) + x227, x228 := bits.add_u64(x89, x81, u64(0x0)) + x229, _ := bits.add_u64(x90, x82, u64(fiat.u1(x228))) + x231, x232 := bits.add_u64(x97, x227, u64(0x0)) + x233, _ := bits.add_u64(x98, x229, u64(fiat.u1(x232))) + x235, x236 := bits.add_u64(x105, x231, u64(0x0)) + x237, _ := bits.add_u64(x106, x233, u64(fiat.u1(x236))) + x239, x240 := bits.add_u64(x125, x235, u64(0x0)) + x241, _ := bits.add_u64(x126, x237, u64(fiat.u1(x240))) + x243, x244 := bits.add_u64(x127, x239, u64(0x0)) + x245, _ := bits.add_u64(x128, x241, u64(fiat.u1(x244))) + x247, x248 := bits.add_u64(x131, x243, u64(0x0)) + x249, _ := bits.add_u64(x132, x245, u64(fiat.u1(x248))) + x251, x252 := bits.add_u64(x137, x247, u64(0x0)) + x253, _ := bits.add_u64(x138, x249, u64(fiat.u1(x252))) + x255, x256 := bits.add_u64(x145, x251, u64(0x0)) + x257, _ := bits.add_u64(x146, x253, u64(fiat.u1(x256))) + x259, x260 := bits.add_u64(x155, x255, u64(0x0)) + x261, _ := bits.add_u64(x156, x257, u64(fiat.u1(x260))) + x263, x264 := bits.add_u64(x167, x259, u64(0x0)) + x265, _ := bits.add_u64(x168, x261, u64(fiat.u1(x264))) + x267, x268 := bits.add_u64(x181, x263, u64(0x0)) + x269, _ := bits.add_u64(x182, x265, u64(fiat.u1(x268))) + x271, x272 := bits.add_u64(x25, x13, u64(0x0)) + x273, _ := bits.add_u64(x26, x14, u64(fiat.u1(x272))) + x275, x276 := bits.add_u64(x83, x271, u64(0x0)) + x277, _ := bits.add_u64(x84, x273, u64(fiat.u1(x276))) + x279, x280 := bits.add_u64(x91, x275, u64(0x0)) + x281, _ := bits.add_u64(x92, x277, u64(fiat.u1(x280))) + x283, x284 := bits.add_u64(x99, x279, u64(0x0)) + x285, _ := bits.add_u64(x100, x281, u64(fiat.u1(x284))) + x287, x288 := bits.add_u64(x107, x283, u64(0x0)) + x289, _ := bits.add_u64(x108, x285, u64(fiat.u1(x288))) + x291, x292 := bits.add_u64(x113, x287, u64(0x0)) + x293, _ := bits.add_u64(x114, x289, u64(fiat.u1(x292))) + x295, x296 := bits.add_u64(x129, x291, u64(0x0)) + x297, _ := bits.add_u64(x130, x293, u64(fiat.u1(x296))) + x299, x300 := bits.add_u64(x133, x295, u64(0x0)) + x301, _ := bits.add_u64(x134, x297, u64(fiat.u1(x300))) + x303, x304 := bits.add_u64(x139, x299, u64(0x0)) + x305, _ := bits.add_u64(x140, x301, u64(fiat.u1(x304))) + x307, x308 := bits.add_u64(x147, x303, u64(0x0)) + x309, _ := bits.add_u64(x148, x305, u64(fiat.u1(x308))) + x311, x312 := bits.add_u64(x157, x307, u64(0x0)) + x313, _ := bits.add_u64(x158, x309, u64(fiat.u1(x312))) + x315, x316 := bits.add_u64(x169, x311, u64(0x0)) + x317, _ := bits.add_u64(x170, x313, u64(fiat.u1(x316))) + x319, x320 := bits.add_u64(x183, x315, u64(0x0)) + x321, _ := bits.add_u64(x184, x317, u64(fiat.u1(x320))) + x323, x324 := bits.add_u64(x19, x15, u64(0x0)) + x325, _ := bits.add_u64(x20, x16, u64(fiat.u1(x324))) + x327, x328 := bits.add_u64(x27, x323, u64(0x0)) + x329, _ := bits.add_u64(x28, x325, u64(fiat.u1(x328))) + x331, x332 := bits.add_u64(x39, x327, u64(0x0)) + x333, _ := bits.add_u64(x40, x329, u64(fiat.u1(x332))) + x335, x336 := bits.add_u64(x85, x331, u64(0x0)) + x337, _ := bits.add_u64(x86, x333, u64(fiat.u1(x336))) + x339, x340 := bits.add_u64(x93, x335, u64(0x0)) + x341, _ := bits.add_u64(x94, x337, u64(fiat.u1(x340))) + x343, x344 := bits.add_u64(x101, x339, u64(0x0)) + x345, _ := bits.add_u64(x102, x341, u64(fiat.u1(x344))) + x347, x348 := bits.add_u64(x109, x343, u64(0x0)) + x349, _ := bits.add_u64(x110, x345, u64(fiat.u1(x348))) + x351, x352 := bits.add_u64(x115, x347, u64(0x0)) + x353, _ := bits.add_u64(x116, x349, u64(fiat.u1(x352))) + x355, x356 := bits.add_u64(x119, x351, u64(0x0)) + x357, _ := bits.add_u64(x120, x353, u64(fiat.u1(x356))) + x359, x360 := bits.add_u64(x135, x355, u64(0x0)) + x361, _ := bits.add_u64(x136, x357, u64(fiat.u1(x360))) + x363, x364 := bits.add_u64(x141, x359, u64(0x0)) + x365, _ := bits.add_u64(x142, x361, u64(fiat.u1(x364))) + x367, x368 := bits.add_u64(x149, x363, u64(0x0)) + x369, _ := bits.add_u64(x150, x365, u64(fiat.u1(x368))) + x371, x372 := bits.add_u64(x159, x367, u64(0x0)) + x373, _ := bits.add_u64(x160, x369, u64(fiat.u1(x372))) + x375, x376 := bits.add_u64(x171, x371, u64(0x0)) + x377, _ := bits.add_u64(x172, x373, u64(fiat.u1(x376))) + x379, x380 := bits.add_u64(x185, x375, u64(0x0)) + x381, _ := bits.add_u64(x186, x377, u64(fiat.u1(x380))) + x383, x384 := bits.add_u64(x21, x17, u64(0x0)) + x385, _ := bits.add_u64(x22, x18, u64(fiat.u1(x384))) + x387, x388 := bits.add_u64(x23, x383, u64(0x0)) + x389, _ := bits.add_u64(x24, x385, u64(fiat.u1(x388))) + x391, x392 := bits.add_u64(x29, x387, u64(0x0)) + x393, _ := bits.add_u64(x30, x389, u64(fiat.u1(x392))) + x395, x396 := bits.add_u64(x41, x391, u64(0x0)) + x397, _ := bits.add_u64(x42, x393, u64(fiat.u1(x396))) + x399, x400 := bits.add_u64(x51, x395, u64(0x0)) + x401, _ := bits.add_u64(x52, x397, u64(fiat.u1(x400))) + x403, x404 := bits.add_u64(x87, x399, u64(0x0)) + x405, _ := bits.add_u64(x88, x401, u64(fiat.u1(x404))) + x407, x408 := bits.add_u64(x95, x403, u64(0x0)) + x409, _ := bits.add_u64(x96, x405, u64(fiat.u1(x408))) + x411, x412 := bits.add_u64(x103, x407, u64(0x0)) + x413, _ := bits.add_u64(x104, x409, u64(fiat.u1(x412))) + x415, x416 := bits.add_u64(x111, x411, u64(0x0)) + x417, _ := bits.add_u64(x112, x413, u64(fiat.u1(x416))) + x419, x420 := bits.add_u64(x117, x415, u64(0x0)) + x421, _ := bits.add_u64(x118, x417, u64(fiat.u1(x420))) + x423, x424 := bits.add_u64(x121, x419, u64(0x0)) + x425, _ := bits.add_u64(x122, x421, u64(fiat.u1(x424))) + x427, x428 := bits.add_u64(x123, x423, u64(0x0)) + x429, _ := bits.add_u64(x124, x425, u64(fiat.u1(x428))) + x431, x432 := bits.add_u64(x143, x427, u64(0x0)) + x433, _ := bits.add_u64(x144, x429, u64(fiat.u1(x432))) + x435, x436 := bits.add_u64(x151, x431, u64(0x0)) + x437, _ := bits.add_u64(x152, x433, u64(fiat.u1(x436))) + x439, x440 := bits.add_u64(x161, x435, u64(0x0)) + x441, _ := bits.add_u64(x162, x437, u64(fiat.u1(x440))) + x443, x444 := bits.add_u64(x173, x439, u64(0x0)) + x445, _ := bits.add_u64(x174, x441, u64(fiat.u1(x444))) + x447, x448 := bits.add_u64(x187, x443, u64(0x0)) + x449, _ := bits.add_u64(x188, x445, u64(fiat.u1(x448))) + x451, x452 := bits.add_u64(x33, x1, u64(0x0)) + x453, _ := bits.add_u64(x34, x2, u64(fiat.u1(x452))) + x455, x456 := bits.add_u64(x45, x451, u64(0x0)) + x457, _ := bits.add_u64(x46, x453, u64(fiat.u1(x456))) + x459, x460 := bits.add_u64(x55, x455, u64(0x0)) + x461, _ := bits.add_u64(x56, x457, u64(fiat.u1(x460))) + x463, x464 := bits.add_u64(x63, x459, u64(0x0)) + x465, _ := bits.add_u64(x64, x461, u64(fiat.u1(x464))) + x467, x468 := bits.add_u64(x69, x463, u64(0x0)) + x469, _ := bits.add_u64(x70, x465, u64(fiat.u1(x468))) + x471, x472 := bits.add_u64(x165, x467, u64(0x0)) + x473, _ := bits.add_u64(x166, x469, u64(fiat.u1(x472))) + x475, x476 := bits.add_u64(x177, x471, u64(0x0)) + x477, _ := bits.add_u64(x178, x473, u64(fiat.u1(x476))) + x479, x480 := bits.add_u64(x191, x475, u64(0x0)) + x481, _ := bits.add_u64(x192, x477, u64(fiat.u1(x480))) + x483, x484 := bits.add_u64(x7, x3, u64(0x0)) + x485, _ := bits.add_u64(x8, x4, u64(fiat.u1(x484))) + x487, x488 := bits.add_u64(x35, x483, u64(0x0)) + x489, _ := bits.add_u64(x36, x485, u64(fiat.u1(x488))) + x491, x492 := bits.add_u64(x47, x487, u64(0x0)) + x493, _ := bits.add_u64(x48, x489, u64(fiat.u1(x492))) + x495, x496 := bits.add_u64(x57, x491, u64(0x0)) + x497, _ := bits.add_u64(x58, x493, u64(fiat.u1(x496))) + x499, x500 := bits.add_u64(x65, x495, u64(0x0)) + x501, _ := bits.add_u64(x66, x497, u64(fiat.u1(x500))) + x503, x504 := bits.add_u64(x71, x499, u64(0x0)) + x505, _ := bits.add_u64(x72, x501, u64(fiat.u1(x504))) + x507, x508 := bits.add_u64(x75, x503, u64(0x0)) + x509, _ := bits.add_u64(x76, x505, u64(fiat.u1(x508))) + x511, x512 := bits.add_u64(x179, x507, u64(0x0)) + x513, _ := bits.add_u64(x180, x509, u64(fiat.u1(x512))) + x515, x516 := bits.add_u64(x193, x511, u64(0x0)) + x517, _ := bits.add_u64(x194, x513, u64(fiat.u1(x516))) + x519, x520 := bits.add_u64(x9, x5, u64(0x0)) + x521, _ := bits.add_u64(x10, x6, u64(fiat.u1(x520))) + x523, x524 := bits.add_u64(x11, x519, u64(0x0)) + x525, _ := bits.add_u64(x12, x521, u64(fiat.u1(x524))) + x527, x528 := bits.add_u64(x37, x523, u64(0x0)) + x529, _ := bits.add_u64(x38, x525, u64(fiat.u1(x528))) + x531, x532 := bits.add_u64(x49, x527, u64(0x0)) + x533, _ := bits.add_u64(x50, x529, u64(fiat.u1(x532))) + x535, x536 := bits.add_u64(x59, x531, u64(0x0)) + x537, _ := bits.add_u64(x60, x533, u64(fiat.u1(x536))) + x539, x540 := bits.add_u64(x67, x535, u64(0x0)) + x541, _ := bits.add_u64(x68, x537, u64(fiat.u1(x540))) + x543, x544 := bits.add_u64(x73, x539, u64(0x0)) + x545, _ := bits.add_u64(x74, x541, u64(fiat.u1(x544))) + x547, x548 := bits.add_u64(x77, x543, u64(0x0)) + x549, _ := bits.add_u64(x78, x545, u64(fiat.u1(x548))) + x551, x552 := bits.add_u64(x79, x547, u64(0x0)) + x553, _ := bits.add_u64(x80, x549, u64(fiat.u1(x552))) + x555, x556 := bits.add_u64(x195, x551, u64(0x0)) + x557, _ := bits.add_u64(x196, x553, u64(fiat.u1(x556))) + x559, x560 := bits.add_u64(x225, x447, u64(0x0)) + x561 := (u64(fiat.u1(x560)) + x449) + x562 := ((x267 >> 56) | ((x269 << 8) & 0xffffffffffffffff)) + x563 := (x267 & 0xffffffffffffff) + x564, x565 := bits.add_u64(x559, x562, u64(0x0)) + x566 := (u64(fiat.u1(x565)) + x561) + x567 := ((x564 >> 56) | ((x566 << 8) & 0xffffffffffffffff)) + x568 := (x564 & 0xffffffffffffff) + x569, x570 := bits.add_u64(x555, x562, u64(0x0)) + x571 := (u64(fiat.u1(x570)) + x557) + x572, x573 := bits.add_u64(x567, x379, u64(0x0)) + x574 := (u64(fiat.u1(x573)) + x381) + x575 := ((x569 >> 56) | ((x571 << 8) & 0xffffffffffffffff)) + x576 := (x569 & 0xffffffffffffff) + x577, x578 := bits.add_u64(x575, x515, u64(0x0)) + x579 := (u64(fiat.u1(x578)) + x517) + x580 := ((x572 >> 56) | ((x574 << 8) & 0xffffffffffffffff)) + x581 := (x572 & 0xffffffffffffff) + x582, x583 := bits.add_u64(x580, x319, u64(0x0)) + x584 := (u64(fiat.u1(x583)) + x321) + x585 := ((x577 >> 56) | ((x579 << 8) & 0xffffffffffffffff)) + x586 := (x577 & 0xffffffffffffff) + x587, x588 := bits.add_u64(x585, x479, u64(0x0)) + x589 := (u64(fiat.u1(x588)) + x481) + x590 := ((x582 >> 56) | ((x584 << 8) & 0xffffffffffffffff)) + x591 := (x582 & 0xffffffffffffff) + x592 := (x590 + x563) + x593 := ((x587 >> 56) | ((x589 << 8) & 0xffffffffffffffff)) + x594 := (x587 & 0xffffffffffffff) + x595 := (x593 + x226) + x596 := (x592 >> 56) + x597 := (x592 & 0xffffffffffffff) + x598 := (x595 >> 56) + x599 := (x595 & 0xffffffffffffff) + x600 := (x568 + x596) + x601 := (x576 + x596) + x602 := (x598 + x600) + x603 := fiat.u1((x602 >> 56)) + x604 := (x602 & 0xffffffffffffff) + x605 := (u64(x603) + x581) + x606 := fiat.u1((x601 >> 56)) + x607 := (x601 & 0xffffffffffffff) + x608 := (u64(x606) + x586) + out1[0] = x607 + out1[1] = x608 + out1[2] = x594 + out1[3] = x599 + out1[4] = x604 + out1[5] = x605 + out1[6] = x591 + out1[7] = x597 +} + +fe_carry_square :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := arg1[7] + x2 := arg1[7] + x3 := (x1 * 0x2) + x4 := (x2 * 0x2) + x5 := (arg1[7] * 0x2) + x6 := arg1[6] + x7 := arg1[6] + x8 := (x6 * 0x2) + x9 := (x7 * 0x2) + x10 := (arg1[6] * 0x2) + x11 := arg1[5] + x12 := arg1[5] + x13 := (x11 * 0x2) + x14 := (x12 * 0x2) + x15 := (arg1[5] * 0x2) + x16 := arg1[4] + x17 := arg1[4] + x18 := (arg1[4] * 0x2) + x19 := (arg1[3] * 0x2) + x20 := (arg1[2] * 0x2) + x21 := (arg1[1] * 0x2) + x23, x22 := bits.mul_u64(arg1[7], x1) + x25, x24 := bits.mul_u64(arg1[6], x3) + x27, x26 := bits.mul_u64(arg1[6], x6) + x29, x28 := bits.mul_u64(arg1[5], x3) + x31, x30 := bits.mul_u64(arg1[7], x1) + x33, x32 := bits.mul_u64(arg1[6], x3) + x35, x34 := bits.mul_u64(arg1[6], x6) + x37, x36 := bits.mul_u64(arg1[5], x3) + x39, x38 := bits.mul_u64(arg1[7], x2) + x41, x40 := bits.mul_u64(arg1[6], x4) + x43, x42 := bits.mul_u64(arg1[6], x7) + x45, x44 := bits.mul_u64(arg1[5], x4) + x47, x46 := bits.mul_u64(arg1[5], x9) + x49, x48 := bits.mul_u64(arg1[5], x8) + x51, x50 := bits.mul_u64(arg1[5], x12) + x53, x52 := bits.mul_u64(arg1[5], x11) + x55, x54 := bits.mul_u64(arg1[4], x4) + x57, x56 := bits.mul_u64(arg1[4], x3) + x59, x58 := bits.mul_u64(arg1[4], x9) + x61, x60 := bits.mul_u64(arg1[4], x8) + x63, x62 := bits.mul_u64(arg1[4], x14) + x65, x64 := bits.mul_u64(arg1[4], x13) + x67, x66 := bits.mul_u64(arg1[4], x17) + x69, x68 := bits.mul_u64(arg1[4], x16) + x71, x70 := bits.mul_u64(arg1[3], x4) + x73, x72 := bits.mul_u64(arg1[3], x3) + x75, x74 := bits.mul_u64(arg1[3], x9) + x77, x76 := bits.mul_u64(arg1[3], x8) + x79, x78 := bits.mul_u64(arg1[3], x14) + x81, x80 := bits.mul_u64(arg1[3], x13) + x83, x82 := bits.mul_u64(arg1[3], x18) + x85, x84 := bits.mul_u64(arg1[3], arg1[3]) + x87, x86 := bits.mul_u64(arg1[2], x4) + x89, x88 := bits.mul_u64(arg1[2], x3) + x91, x90 := bits.mul_u64(arg1[2], x9) + x93, x92 := bits.mul_u64(arg1[2], x8) + x95, x94 := bits.mul_u64(arg1[2], x15) + x97, x96 := bits.mul_u64(arg1[2], x18) + x99, x98 := bits.mul_u64(arg1[2], x19) + x101, x100 := bits.mul_u64(arg1[2], arg1[2]) + x103, x102 := bits.mul_u64(arg1[1], x4) + x105, x104 := bits.mul_u64(arg1[1], x3) + x107, x106 := bits.mul_u64(arg1[1], x10) + x109, x108 := bits.mul_u64(arg1[1], x15) + x111, x110 := bits.mul_u64(arg1[1], x18) + x113, x112 := bits.mul_u64(arg1[1], x19) + x115, x114 := bits.mul_u64(arg1[1], x20) + x117, x116 := bits.mul_u64(arg1[1], arg1[1]) + x119, x118 := bits.mul_u64(arg1[0], x5) + x121, x120 := bits.mul_u64(arg1[0], x10) + x123, x122 := bits.mul_u64(arg1[0], x15) + x125, x124 := bits.mul_u64(arg1[0], x18) + x127, x126 := bits.mul_u64(arg1[0], x19) + x129, x128 := bits.mul_u64(arg1[0], x20) + x131, x130 := bits.mul_u64(arg1[0], x21) + x133, x132 := bits.mul_u64(arg1[0], arg1[0]) + x134, x135 := bits.add_u64(x54, x46, u64(0x0)) + x136, _ := bits.add_u64(x55, x47, u64(fiat.u1(x135))) + x138, x139 := bits.add_u64(x114, x134, u64(0x0)) + x140, _ := bits.add_u64(x115, x136, u64(fiat.u1(x139))) + x142, x143 := bits.add_u64(x126, x138, u64(0x0)) + x144, _ := bits.add_u64(x127, x140, u64(fiat.u1(x143))) + x146 := ((x142 >> 56) | ((x144 << 8) & 0xffffffffffffffff)) + x147 := (x142 & 0xffffffffffffff) + x148, x149 := bits.add_u64(x56, x48, u64(0x0)) + x150, _ := bits.add_u64(x57, x49, u64(fiat.u1(x149))) + x152, x153 := bits.add_u64(x82, x148, u64(0x0)) + x154, _ := bits.add_u64(x83, x150, u64(fiat.u1(x153))) + x156, x157 := bits.add_u64(x94, x152, u64(0x0)) + x158, _ := bits.add_u64(x95, x154, u64(fiat.u1(x157))) + x160, x161 := bits.add_u64(x106, x156, u64(0x0)) + x162, _ := bits.add_u64(x107, x158, u64(fiat.u1(x161))) + x164, x165 := bits.add_u64(x118, x160, u64(0x0)) + x166, _ := bits.add_u64(x119, x162, u64(fiat.u1(x165))) + x168, x169 := bits.add_u64(x38, x30, u64(0x0)) + x170, _ := bits.add_u64(x39, x31, u64(fiat.u1(x169))) + x172, x173 := bits.add_u64(x52, x168, u64(0x0)) + x174, _ := bits.add_u64(x53, x170, u64(fiat.u1(x173))) + x176, x177 := bits.add_u64(x60, x172, u64(0x0)) + x178, _ := bits.add_u64(x61, x174, u64(fiat.u1(x177))) + x180, x181 := bits.add_u64(x72, x176, u64(0x0)) + x182, _ := bits.add_u64(x73, x178, u64(fiat.u1(x181))) + x184, x185 := bits.add_u64(x84, x180, u64(0x0)) + x186, _ := bits.add_u64(x85, x182, u64(fiat.u1(x185))) + x188, x189 := bits.add_u64(x96, x184, u64(0x0)) + x190, _ := bits.add_u64(x97, x186, u64(fiat.u1(x189))) + x192, x193 := bits.add_u64(x108, x188, u64(0x0)) + x194, _ := bits.add_u64(x109, x190, u64(fiat.u1(x193))) + x196, x197 := bits.add_u64(x120, x192, u64(0x0)) + x198, _ := bits.add_u64(x121, x194, u64(fiat.u1(x197))) + x200, x201 := bits.add_u64(x40, x32, u64(0x0)) + x202, _ := bits.add_u64(x41, x33, u64(fiat.u1(x201))) + x204, x205 := bits.add_u64(x64, x200, u64(0x0)) + x206, _ := bits.add_u64(x65, x202, u64(fiat.u1(x205))) + x208, x209 := bits.add_u64(x76, x204, u64(0x0)) + x210, _ := bits.add_u64(x77, x206, u64(fiat.u1(x209))) + x212, x213 := bits.add_u64(x88, x208, u64(0x0)) + x214, _ := bits.add_u64(x89, x210, u64(fiat.u1(x213))) + x216, x217 := bits.add_u64(x98, x212, u64(0x0)) + x218, _ := bits.add_u64(x99, x214, u64(fiat.u1(x217))) + x220, x221 := bits.add_u64(x110, x216, u64(0x0)) + x222, _ := bits.add_u64(x111, x218, u64(fiat.u1(x221))) + x224, x225 := bits.add_u64(x122, x220, u64(0x0)) + x226, _ := bits.add_u64(x123, x222, u64(fiat.u1(x225))) + x228, x229 := bits.add_u64(x36, x34, u64(0x0)) + x230, _ := bits.add_u64(x37, x35, u64(fiat.u1(x229))) + x232, x233 := bits.add_u64(x42, x228, u64(0x0)) + x234, _ := bits.add_u64(x43, x230, u64(fiat.u1(x233))) + x236, x237 := bits.add_u64(x44, x232, u64(0x0)) + x238, _ := bits.add_u64(x45, x234, u64(fiat.u1(x237))) + x240, x241 := bits.add_u64(x68, x236, u64(0x0)) + x242, _ := bits.add_u64(x69, x238, u64(fiat.u1(x241))) + x244, x245 := bits.add_u64(x80, x240, u64(0x0)) + x246, _ := bits.add_u64(x81, x242, u64(fiat.u1(x245))) + x248, x249 := bits.add_u64(x92, x244, u64(0x0)) + x250, _ := bits.add_u64(x93, x246, u64(fiat.u1(x249))) + x252, x253 := bits.add_u64(x100, x248, u64(0x0)) + x254, _ := bits.add_u64(x101, x250, u64(fiat.u1(x253))) + x256, x257 := bits.add_u64(x104, x252, u64(0x0)) + x258, _ := bits.add_u64(x105, x254, u64(fiat.u1(x257))) + x260, x261 := bits.add_u64(x112, x256, u64(0x0)) + x262, _ := bits.add_u64(x113, x258, u64(fiat.u1(x261))) + x264, x265 := bits.add_u64(x124, x260, u64(0x0)) + x266, _ := bits.add_u64(x125, x262, u64(fiat.u1(x265))) + x268, x269 := bits.add_u64(x50, x22, u64(0x0)) + x270, _ := bits.add_u64(x51, x23, u64(fiat.u1(x269))) + x272, x273 := bits.add_u64(x58, x268, u64(0x0)) + x274, _ := bits.add_u64(x59, x270, u64(fiat.u1(x273))) + x276, x277 := bits.add_u64(x70, x272, u64(0x0)) + x278, _ := bits.add_u64(x71, x274, u64(fiat.u1(x277))) + x280, x281 := bits.add_u64(x116, x276, u64(0x0)) + x282, _ := bits.add_u64(x117, x278, u64(fiat.u1(x281))) + x284, x285 := bits.add_u64(x128, x280, u64(0x0)) + x286, _ := bits.add_u64(x129, x282, u64(fiat.u1(x285))) + x288, x289 := bits.add_u64(x62, x24, u64(0x0)) + x290, _ := bits.add_u64(x63, x25, u64(fiat.u1(x289))) + x292, x293 := bits.add_u64(x74, x288, u64(0x0)) + x294, _ := bits.add_u64(x75, x290, u64(fiat.u1(x293))) + x296, x297 := bits.add_u64(x86, x292, u64(0x0)) + x298, _ := bits.add_u64(x87, x294, u64(fiat.u1(x297))) + x300, x301 := bits.add_u64(x130, x296, u64(0x0)) + x302, _ := bits.add_u64(x131, x298, u64(fiat.u1(x301))) + x304, x305 := bits.add_u64(x28, x26, u64(0x0)) + x306, _ := bits.add_u64(x29, x27, u64(fiat.u1(x305))) + x308, x309 := bits.add_u64(x66, x304, u64(0x0)) + x310, _ := bits.add_u64(x67, x306, u64(fiat.u1(x309))) + x312, x313 := bits.add_u64(x78, x308, u64(0x0)) + x314, _ := bits.add_u64(x79, x310, u64(fiat.u1(x313))) + x316, x317 := bits.add_u64(x90, x312, u64(0x0)) + x318, _ := bits.add_u64(x91, x314, u64(fiat.u1(x317))) + x320, x321 := bits.add_u64(x102, x316, u64(0x0)) + x322, _ := bits.add_u64(x103, x318, u64(fiat.u1(x321))) + x324, x325 := bits.add_u64(x132, x320, u64(0x0)) + x326, _ := bits.add_u64(x133, x322, u64(fiat.u1(x325))) + x328, x329 := bits.add_u64(x146, x264, u64(0x0)) + x330 := (u64(fiat.u1(x329)) + x266) + x331 := ((x164 >> 56) | ((x166 << 8) & 0xffffffffffffffff)) + x332 := (x164 & 0xffffffffffffff) + x333, x334 := bits.add_u64(x328, x331, u64(0x0)) + x335 := (u64(fiat.u1(x334)) + x330) + x336 := ((x333 >> 56) | ((x335 << 8) & 0xffffffffffffffff)) + x337 := (x333 & 0xffffffffffffff) + x338, x339 := bits.add_u64(x324, x331, u64(0x0)) + x340 := (u64(fiat.u1(x339)) + x326) + x341, x342 := bits.add_u64(x336, x224, u64(0x0)) + x343 := (u64(fiat.u1(x342)) + x226) + x344 := ((x338 >> 56) | ((x340 << 8) & 0xffffffffffffffff)) + x345 := (x338 & 0xffffffffffffff) + x346, x347 := bits.add_u64(x344, x300, u64(0x0)) + x348 := (u64(fiat.u1(x347)) + x302) + x349 := ((x341 >> 56) | ((x343 << 8) & 0xffffffffffffffff)) + x350 := (x341 & 0xffffffffffffff) + x351, x352 := bits.add_u64(x349, x196, u64(0x0)) + x353 := (u64(fiat.u1(x352)) + x198) + x354 := ((x346 >> 56) | ((x348 << 8) & 0xffffffffffffffff)) + x355 := (x346 & 0xffffffffffffff) + x356, x357 := bits.add_u64(x354, x284, u64(0x0)) + x358 := (u64(fiat.u1(x357)) + x286) + x359 := ((x351 >> 56) | ((x353 << 8) & 0xffffffffffffffff)) + x360 := (x351 & 0xffffffffffffff) + x361 := (x359 + x332) + x362 := ((x356 >> 56) | ((x358 << 8) & 0xffffffffffffffff)) + x363 := (x356 & 0xffffffffffffff) + x364 := (x362 + x147) + x365 := (x361 >> 56) + x366 := (x361 & 0xffffffffffffff) + x367 := (x364 >> 56) + x368 := (x364 & 0xffffffffffffff) + x369 := (x337 + x365) + x370 := (x345 + x365) + x371 := (x367 + x369) + x372 := fiat.u1((x371 >> 56)) + x373 := (x371 & 0xffffffffffffff) + x374 := (u64(x372) + x350) + x375 := fiat.u1((x370 >> 56)) + x376 := (x370 & 0xffffffffffffff) + x377 := (u64(x375) + x355) + out1[0] = x376 + out1[1] = x377 + out1[2] = x363 + out1[3] = x368 + out1[4] = x373 + out1[5] = x374 + out1[6] = x360 + out1[7] = x366 +} + +fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := arg1[3] + x2 := arg1[7] + x3 := (x2 >> 56) + x4 := (((x1 >> 56) + arg1[4]) + x3) + x5 := (arg1[0] + x3) + x6 := ((x4 >> 56) + arg1[5]) + x7 := ((x5 >> 56) + arg1[1]) + x8 := ((x6 >> 56) + arg1[6]) + x9 := ((x7 >> 56) + arg1[2]) + x10 := ((x8 >> 56) + (x2 & 0xffffffffffffff)) + x11 := ((x9 >> 56) + (x1 & 0xffffffffffffff)) + x12 := fiat.u1((x10 >> 56)) + x13 := ((x5 & 0xffffffffffffff) + u64(x12)) + x14 := (u64(fiat.u1((x11 >> 56))) + ((x4 & 0xffffffffffffff) + u64(x12))) + x15 := (x13 & 0xffffffffffffff) + x16 := (u64(fiat.u1((x13 >> 56))) + (x7 & 0xffffffffffffff)) + x17 := (x9 & 0xffffffffffffff) + x18 := (x11 & 0xffffffffffffff) + x19 := (x14 & 0xffffffffffffff) + x20 := (u64(fiat.u1((x14 >> 56))) + (x6 & 0xffffffffffffff)) + x21 := (x8 & 0xffffffffffffff) + x22 := (x10 & 0xffffffffffffff) + out1[0] = x15 + out1[1] = x16 + out1[2] = x17 + out1[3] = x18 + out1[4] = x19 + out1[5] = x20 + out1[6] = x21 + out1[7] = x22 +} + +fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := (arg1[0] + arg2[0]) + x2 := (arg1[1] + arg2[1]) + x3 := (arg1[2] + arg2[2]) + x4 := (arg1[3] + arg2[3]) + x5 := (arg1[4] + arg2[4]) + x6 := (arg1[5] + arg2[5]) + x7 := (arg1[6] + arg2[6]) + x8 := (arg1[7] + arg2[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := ((0x1fffffffffffffe + arg1[0]) - arg2[0]) + x2 := ((0x1fffffffffffffe + arg1[1]) - arg2[1]) + x3 := ((0x1fffffffffffffe + arg1[2]) - arg2[2]) + x4 := ((0x1fffffffffffffe + arg1[3]) - arg2[3]) + x5 := ((0x1fffffffffffffc + arg1[4]) - arg2[4]) + x6 := ((0x1fffffffffffffe + arg1[5]) - arg2[5]) + x7 := ((0x1fffffffffffffe + arg1[6]) - arg2[6]) + x8 := ((0x1fffffffffffffe + arg1[7]) - arg2[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := (0x1fffffffffffffe - arg1[0]) + x2 := (0x1fffffffffffffe - arg1[1]) + x3 := (0x1fffffffffffffe - arg1[2]) + x4 := (0x1fffffffffffffe - arg1[3]) + x5 := (0x1fffffffffffffc - arg1[4]) + x6 := (0x1fffffffffffffe - arg1[5]) + x7 := (0x1fffffffffffffe - arg1[6]) + x8 := (0x1fffffffffffffe - arg1[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +@(optimization_mode = "none") +fe_cond_assign :: #force_no_inline proc "contextless" ( + out1, arg1: ^Tight_Field_Element, + arg2: int, +) { + x1 := fiat.cmovznz_u64(fiat.u1(arg2), out1[0], arg1[0]) + x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1]) + x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2]) + x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3]) + x5 := fiat.cmovznz_u64(fiat.u1(arg2), out1[4], arg1[4]) + x6 := fiat.cmovznz_u64(fiat.u1(arg2), out1[5], arg1[5]) + x7 := fiat.cmovznz_u64(fiat.u1(arg2), out1[6], arg1[6]) + x8 := fiat.cmovznz_u64(fiat.u1(arg2), out1[7], arg1[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +fe_to_bytes :: proc "contextless" (out1: ^[56]byte, arg1: ^Tight_Field_Element) { + x1, x2 := _subborrowx_u56(0x0, arg1[0], 0xffffffffffffff) + x3, x4 := _subborrowx_u56(x2, arg1[1], 0xffffffffffffff) + x5, x6 := _subborrowx_u56(x4, arg1[2], 0xffffffffffffff) + x7, x8 := _subborrowx_u56(x6, arg1[3], 0xffffffffffffff) + x9, x10 := _subborrowx_u56(x8, arg1[4], 0xfffffffffffffe) + x11, x12 := _subborrowx_u56(x10, arg1[5], 0xffffffffffffff) + x13, x14 := _subborrowx_u56(x12, arg1[6], 0xffffffffffffff) + x15, x16 := _subborrowx_u56(x14, arg1[7], 0xffffffffffffff) + x17 := fiat.cmovznz_u64(x16, u64(0x0), 0xffffffffffffffff) + x18, x19 := _addcarryx_u56(0x0, x1, (x17 & 0xffffffffffffff)) + x20, x21 := _addcarryx_u56(x19, x3, (x17 & 0xffffffffffffff)) + x22, x23 := _addcarryx_u56(x21, x5, (x17 & 0xffffffffffffff)) + x24, x25 := _addcarryx_u56(x23, x7, (x17 & 0xffffffffffffff)) + x26, x27 := _addcarryx_u56(x25, x9, (x17 & 0xfffffffffffffe)) + x28, x29 := _addcarryx_u56(x27, x11, (x17 & 0xffffffffffffff)) + x30, x31 := _addcarryx_u56(x29, x13, (x17 & 0xffffffffffffff)) + x32, _ := _addcarryx_u56(x31, x15, (x17 & 0xffffffffffffff)) + x34 := (u8(x18) & 0xff) + x35 := (x18 >> 8) + x36 := (u8(x35) & 0xff) + x37 := (x35 >> 8) + x38 := (u8(x37) & 0xff) + x39 := (x37 >> 8) + x40 := (u8(x39) & 0xff) + x41 := (x39 >> 8) + x42 := (u8(x41) & 0xff) + x43 := (x41 >> 8) + x44 := (u8(x43) & 0xff) + x45 := u8((x43 >> 8)) + x46 := (u8(x20) & 0xff) + x47 := (x20 >> 8) + x48 := (u8(x47) & 0xff) + x49 := (x47 >> 8) + x50 := (u8(x49) & 0xff) + x51 := (x49 >> 8) + x52 := (u8(x51) & 0xff) + x53 := (x51 >> 8) + x54 := (u8(x53) & 0xff) + x55 := (x53 >> 8) + x56 := (u8(x55) & 0xff) + x57 := u8((x55 >> 8)) + x58 := (u8(x22) & 0xff) + x59 := (x22 >> 8) + x60 := (u8(x59) & 0xff) + x61 := (x59 >> 8) + x62 := (u8(x61) & 0xff) + x63 := (x61 >> 8) + x64 := (u8(x63) & 0xff) + x65 := (x63 >> 8) + x66 := (u8(x65) & 0xff) + x67 := (x65 >> 8) + x68 := (u8(x67) & 0xff) + x69 := u8((x67 >> 8)) + x70 := (u8(x24) & 0xff) + x71 := (x24 >> 8) + x72 := (u8(x71) & 0xff) + x73 := (x71 >> 8) + x74 := (u8(x73) & 0xff) + x75 := (x73 >> 8) + x76 := (u8(x75) & 0xff) + x77 := (x75 >> 8) + x78 := (u8(x77) & 0xff) + x79 := (x77 >> 8) + x80 := (u8(x79) & 0xff) + x81 := u8((x79 >> 8)) + x82 := (u8(x26) & 0xff) + x83 := (x26 >> 8) + x84 := (u8(x83) & 0xff) + x85 := (x83 >> 8) + x86 := (u8(x85) & 0xff) + x87 := (x85 >> 8) + x88 := (u8(x87) & 0xff) + x89 := (x87 >> 8) + x90 := (u8(x89) & 0xff) + x91 := (x89 >> 8) + x92 := (u8(x91) & 0xff) + x93 := u8((x91 >> 8)) + x94 := (u8(x28) & 0xff) + x95 := (x28 >> 8) + x96 := (u8(x95) & 0xff) + x97 := (x95 >> 8) + x98 := (u8(x97) & 0xff) + x99 := (x97 >> 8) + x100 := (u8(x99) & 0xff) + x101 := (x99 >> 8) + x102 := (u8(x101) & 0xff) + x103 := (x101 >> 8) + x104 := (u8(x103) & 0xff) + x105 := u8((x103 >> 8)) + x106 := (u8(x30) & 0xff) + x107 := (x30 >> 8) + x108 := (u8(x107) & 0xff) + x109 := (x107 >> 8) + x110 := (u8(x109) & 0xff) + x111 := (x109 >> 8) + x112 := (u8(x111) & 0xff) + x113 := (x111 >> 8) + x114 := (u8(x113) & 0xff) + x115 := (x113 >> 8) + x116 := (u8(x115) & 0xff) + x117 := u8((x115 >> 8)) + x118 := (u8(x32) & 0xff) + x119 := (x32 >> 8) + x120 := (u8(x119) & 0xff) + x121 := (x119 >> 8) + x122 := (u8(x121) & 0xff) + x123 := (x121 >> 8) + x124 := (u8(x123) & 0xff) + x125 := (x123 >> 8) + x126 := (u8(x125) & 0xff) + x127 := (x125 >> 8) + x128 := (u8(x127) & 0xff) + x129 := u8((x127 >> 8)) + out1[0] = x34 + out1[1] = x36 + out1[2] = x38 + out1[3] = x40 + out1[4] = x42 + out1[5] = x44 + out1[6] = x45 + out1[7] = x46 + out1[8] = x48 + out1[9] = x50 + out1[10] = x52 + out1[11] = x54 + out1[12] = x56 + out1[13] = x57 + out1[14] = x58 + out1[15] = x60 + out1[16] = x62 + out1[17] = x64 + out1[18] = x66 + out1[19] = x68 + out1[20] = x69 + out1[21] = x70 + out1[22] = x72 + out1[23] = x74 + out1[24] = x76 + out1[25] = x78 + out1[26] = x80 + out1[27] = x81 + out1[28] = x82 + out1[29] = x84 + out1[30] = x86 + out1[31] = x88 + out1[32] = x90 + out1[33] = x92 + out1[34] = x93 + out1[35] = x94 + out1[36] = x96 + out1[37] = x98 + out1[38] = x100 + out1[39] = x102 + out1[40] = x104 + out1[41] = x105 + out1[42] = x106 + out1[43] = x108 + out1[44] = x110 + out1[45] = x112 + out1[46] = x114 + out1[47] = x116 + out1[48] = x117 + out1[49] = x118 + out1[50] = x120 + out1[51] = x122 + out1[52] = x124 + out1[53] = x126 + out1[54] = x128 + out1[55] = x129 +} + +fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[56]byte) { + x1 := (u64(arg1[55]) << 48) + x2 := (u64(arg1[54]) << 40) + x3 := (u64(arg1[53]) << 32) + x4 := (u64(arg1[52]) << 24) + x5 := (u64(arg1[51]) << 16) + x6 := (u64(arg1[50]) << 8) + x7 := arg1[49] + x8 := (u64(arg1[48]) << 48) + x9 := (u64(arg1[47]) << 40) + x10 := (u64(arg1[46]) << 32) + x11 := (u64(arg1[45]) << 24) + x12 := (u64(arg1[44]) << 16) + x13 := (u64(arg1[43]) << 8) + x14 := arg1[42] + x15 := (u64(arg1[41]) << 48) + x16 := (u64(arg1[40]) << 40) + x17 := (u64(arg1[39]) << 32) + x18 := (u64(arg1[38]) << 24) + x19 := (u64(arg1[37]) << 16) + x20 := (u64(arg1[36]) << 8) + x21 := arg1[35] + x22 := (u64(arg1[34]) << 48) + x23 := (u64(arg1[33]) << 40) + x24 := (u64(arg1[32]) << 32) + x25 := (u64(arg1[31]) << 24) + x26 := (u64(arg1[30]) << 16) + x27 := (u64(arg1[29]) << 8) + x28 := arg1[28] + x29 := (u64(arg1[27]) << 48) + x30 := (u64(arg1[26]) << 40) + x31 := (u64(arg1[25]) << 32) + x32 := (u64(arg1[24]) << 24) + x33 := (u64(arg1[23]) << 16) + x34 := (u64(arg1[22]) << 8) + x35 := arg1[21] + x36 := (u64(arg1[20]) << 48) + x37 := (u64(arg1[19]) << 40) + x38 := (u64(arg1[18]) << 32) + x39 := (u64(arg1[17]) << 24) + x40 := (u64(arg1[16]) << 16) + x41 := (u64(arg1[15]) << 8) + x42 := arg1[14] + x43 := (u64(arg1[13]) << 48) + x44 := (u64(arg1[12]) << 40) + x45 := (u64(arg1[11]) << 32) + x46 := (u64(arg1[10]) << 24) + x47 := (u64(arg1[9]) << 16) + x48 := (u64(arg1[8]) << 8) + x49 := arg1[7] + x50 := (u64(arg1[6]) << 48) + x51 := (u64(arg1[5]) << 40) + x52 := (u64(arg1[4]) << 32) + x53 := (u64(arg1[3]) << 24) + x54 := (u64(arg1[2]) << 16) + x55 := (u64(arg1[1]) << 8) + x56 := arg1[0] + x57 := (x55 + u64(x56)) + x58 := (x54 + x57) + x59 := (x53 + x58) + x60 := (x52 + x59) + x61 := (x51 + x60) + x62 := (x50 + x61) + x63 := (x48 + u64(x49)) + x64 := (x47 + x63) + x65 := (x46 + x64) + x66 := (x45 + x65) + x67 := (x44 + x66) + x68 := (x43 + x67) + x69 := (x41 + u64(x42)) + x70 := (x40 + x69) + x71 := (x39 + x70) + x72 := (x38 + x71) + x73 := (x37 + x72) + x74 := (x36 + x73) + x75 := (x34 + u64(x35)) + x76 := (x33 + x75) + x77 := (x32 + x76) + x78 := (x31 + x77) + x79 := (x30 + x78) + x80 := (x29 + x79) + x81 := (x27 + u64(x28)) + x82 := (x26 + x81) + x83 := (x25 + x82) + x84 := (x24 + x83) + x85 := (x23 + x84) + x86 := (x22 + x85) + x87 := (x20 + u64(x21)) + x88 := (x19 + x87) + x89 := (x18 + x88) + x90 := (x17 + x89) + x91 := (x16 + x90) + x92 := (x15 + x91) + x93 := (x13 + u64(x14)) + x94 := (x12 + x93) + x95 := (x11 + x94) + x96 := (x10 + x95) + x97 := (x9 + x96) + x98 := (x8 + x97) + x99 := (x6 + u64(x7)) + x100 := (x5 + x99) + x101 := (x4 + x100) + x102 := (x3 + x101) + x103 := (x2 + x102) + x104 := (x1 + x103) + out1[0] = x62 + out1[1] = x68 + out1[2] = x74 + out1[3] = x80 + out1[4] = x86 + out1[5] = x92 + out1[6] = x98 + out1[7] = x104 +} + +fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + x4 := arg1[3] + x5 := arg1[4] + x6 := arg1[5] + x7 := arg1[6] + x8 := arg1[7] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} diff --git a/core/crypto/x448/x448.odin b/core/crypto/x448/x448.odin new file mode 100644 index 00000000000..ebe0c28b9b0 --- /dev/null +++ b/core/crypto/x448/x448.odin @@ -0,0 +1,161 @@ +/* +package x448 implements the X448 (aka curve448) Elliptic-Curve +Diffie-Hellman key exchange protocol. + +See: +- [[ https://www.rfc-editor.org/rfc/rfc7748 ]] +*/ +package x448 + +import field "core:crypto/_fiat/field_curve448" +import "core:mem" + +// SCALAR_SIZE is the size of a X448 scalar (private key) in bytes. +SCALAR_SIZE :: 56 +// POINT_SIZE is the size of a X448 point (public key/shared secret) in bytes. +POINT_SIZE :: 56 + +@(private, rodata) +_BASE_POINT: [56]byte = { + 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} + +@(private) +_scalar_bit :: #force_inline proc "contextless" (s: ^[56]byte, i: int) -> u8 { + if i < 0 { + return 0 + } + return (s[i >> 3] >> uint(i & 7)) & 1 +} + +@(private) +_scalarmult :: proc "contextless" (out, scalar, point: ^[56]byte) { + // Montgomery pseudo-multiplication, using the RFC 7748 formula. + t1, t2: field.Loose_Field_Element = ---, --- + + // x_1 = u + // x_2 = 1 + // z_2 = 0 + // x_3 = u + // z_3 = 1 + x1: field.Tight_Field_Element = --- + field.fe_from_bytes(&x1, point) + + x2, x3, z2, z3: field.Tight_Field_Element = ---, ---, ---, --- + field.fe_one(&x2) + field.fe_zero(&z2) + field.fe_set(&x3, &x1) + field.fe_one(&z3) + + // swap = 0 + swap: int + + // For t = bits-1 down to 0:a + for t := 448 - 1; t >= 0; t -= 1 { + // k_t = (k >> t) & 1 + k_t := int(_scalar_bit(scalar, t)) + // swap ^= k_t + swap ~= k_t + // // Conditional swap; see text below. + // (x_2, x_3) = cswap(swap, x_2, x_3) + field.fe_cond_swap(&x2, &x3, swap) + // (z_2, z_3) = cswap(swap, z_2, z_3) + field.fe_cond_swap(&z2, &z3, swap) + // swap = k_t + swap = k_t + + // Note: This deliberately omits reductions after add/sub operations + // if the result is only ever used as the input to a mul/square since + // the implementations of those can deal with non-reduced inputs. + // + // fe_tighten_cast is only used to store a fully reduced + // output in a Loose_Field_Element, or to provide such a + // Loose_Field_Element as a Tight_Field_Element argument. + + // A = x_2 + z_2 + field.fe_add(&t1, &x2, &z2) + // B = x_2 - z_2 + field.fe_sub(&t2, &x2, &z2) + // D = x_3 - z_3 + field.fe_sub(field.fe_relax_cast(&z2), &x3, &z3) // (z2 unreduced) + // DA = D * A + field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1) + // C = x_3 + z_3 + field.fe_add(field.fe_relax_cast(&z3), &x3, &z3) // (z3 unreduced) + // CB = C * B + field.fe_carry_mul(&x3, &t2, field.fe_relax_cast(&z3)) + // z_3 = x_1 * (DA - CB)^2 + field.fe_sub(field.fe_relax_cast(&z3), &x2, &x3) // (z3 unreduced) + field.fe_carry_square(&z3, field.fe_relax_cast(&z3)) + field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z3)) + // x_3 = (DA + CB)^2 + field.fe_add(field.fe_relax_cast(&z2), &x2, &x3) // (z2 unreduced) + field.fe_carry_square(&x3, field.fe_relax_cast(&z2)) + + // AA = A^2 + field.fe_carry_square(&z2, &t1) + // BB = B^2 + field.fe_carry_square(field.fe_tighten_cast(&t1), &t2) // (t1 reduced) + // x_2 = AA * BB + field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1) + // E = AA - BB + field.fe_sub(&t2, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced) + // z_2 = E * (AA + a24 * E) + field.fe_carry_mul_small(field.fe_tighten_cast(&t1), &t2, 39081) // (t1 reduced) + field.fe_add(&t1, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced) + field.fe_carry_mul(&z2, &t2, &t1) + } + + // Conditional swap; see text below. + // (x_2, x_3) = cswap(swap, x_2, x_3) + field.fe_cond_swap(&x2, &x3, swap) + // (z_2, z_3) = cswap(swap, z_2, z_3) + field.fe_cond_swap(&z2, &z3, swap) + + // Return x_2 * (z_2^(p - 2)) + field.fe_carry_inv(&z2, field.fe_relax_cast(&z2)) + field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2)) + field.fe_to_bytes(out, &x2) + + field.fe_clear_vec([]^field.Tight_Field_Element{&x1, &x2, &x3, &z2, &z3}) + field.fe_clear_vec([]^field.Loose_Field_Element{&t1, &t2}) +} + +// scalarmult "multiplies" the provided scalar and point, and writes the +// resulting point to dst. +scalarmult :: proc(dst, scalar, point: []byte) { + if len(scalar) != SCALAR_SIZE { + panic("crypto/x448: invalid scalar size") + } + if len(point) != POINT_SIZE { + panic("crypto/x448: invalid point size") + } + if len(dst) != POINT_SIZE { + panic("crypto/x448: invalid destination point size") + } + + // "clamp" the scalar + e: [56]byte = --- + copy_slice(e[:], scalar) + e[0] &= 252 + e[55] |= 128 + + p: [56]byte = --- + copy_slice(p[:], point) + + d: [56]byte = --- + _scalarmult(&d, &e, &p) + copy_slice(dst, d[:]) + + mem.zero_explicit(&e, size_of(e)) + mem.zero_explicit(&d, size_of(d)) +} + +// scalarmult_basepoint "multiplies" the provided scalar with the X25519 +// base point and writes the resulting point to dst. +scalarmult_basepoint :: proc(dst, scalar: []byte) { + scalarmult(dst, scalar, _BASE_POINT[:]) +} diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index 5bd45fda461..482958a5f16 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -48,6 +48,7 @@ import shake "core:crypto/shake" import sm3 "core:crypto/sm3" import tuplehash "core:crypto/tuplehash" import x25519 "core:crypto/x25519" +import x448 "core:crypto/x448" import pe "core:debug/pe" import trace "core:debug/trace" @@ -189,6 +190,7 @@ _ :: shake _ :: sm3 _ :: tuplehash _ :: x25519 +_ :: x448 _ :: pe _ :: trace _ :: dynlib diff --git a/tests/benchmark/crypto/benchmark_crypto.odin b/tests/benchmark/crypto/benchmark_crypto.odin index b139ea6696f..72e6d09328d 100644 --- a/tests/benchmark/crypto/benchmark_crypto.odin +++ b/tests/benchmark/crypto/benchmark_crypto.odin @@ -14,6 +14,7 @@ import "core:crypto/chacha20poly1305" import "core:crypto/ed25519" import "core:crypto/poly1305" import "core:crypto/x25519" +import "core:crypto/x448" // Cryptographic primitive benchmarks. @@ -237,6 +238,26 @@ benchmark_crypto :: proc(t: ^testing.T) { time.duration_microseconds(elapsed) / iters, ) } + { + point_str := "deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef" + scalar_str := "cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe" + + point, _ := hex.decode(transmute([]byte)(point_str), context.temp_allocator) + scalar, _ := hex.decode(transmute([]byte)(scalar_str), context.temp_allocator) + out: [x448.POINT_SIZE]byte = --- + + iters :: 10000 + start := time.now() + for i := 0; i < iters; i = i + 1 { + x448.scalarmult(out[:], scalar[:], point[:]) + } + elapsed := time.since(start) + + fmt.sbprintfln(&str, + "x448.scalarmult: ~%f us/op", + time.duration_microseconds(elapsed) / iters, + ) + } } @(private) diff --git a/tests/core/crypto/test_core_crypto_ecc25519.odin b/tests/core/crypto/test_core_crypto_edwards.odin similarity index 91% rename from tests/core/crypto/test_core_crypto_ecc25519.odin rename to tests/core/crypto/test_core_crypto_edwards.odin index fec4fa38e20..61933c00fee 100644 --- a/tests/core/crypto/test_core_crypto_ecc25519.odin +++ b/tests/core/crypto/test_core_crypto_edwards.odin @@ -7,6 +7,7 @@ import field "core:crypto/_fiat/field_curve25519" import "core:crypto/ed25519" import "core:crypto/ristretto255" import "core:crypto/x25519" +import "core:crypto/x448" @(test) test_sqrt_ratio_m1 :: proc(t: ^testing.T) { @@ -684,6 +685,68 @@ test_x25519 :: proc(t: ^testing.T) { } } +@(test) +test_x448 :: proc(t: ^testing.T) { + // Local copy of this so that the base point doesn't need to be exported. + _BASE_POINT: [56]byte = { + 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + } + + test_vectors := []struct { + scalar: string, + point: string, + product: string, + } { + // Test vectors from RFC 7748 + { + "3d262fddf9ec8e88495266fea19a34d28882acef045104d0d1aae121700a779c984c24f8cdd78fbff44943eba368f54b29259a4f1c600ad3", + "06fce640fa3487bfda5f6cf2d5263f8aad88334cbd07437f020f08f9814dc031ddbdc38c19c6da2583fa5429db94ada18aa7a7fb4ef8a086", + "ce3e4ff95a60dc6697da1db1d85e6afbdf79b50a2412d7546d5f239fe14fbaadeb445fc66a01b0779d98223961111e21766282f73dd96b6f", + }, + { + "203d494428b8399352665ddca42f9de8fef600908e0d461cb021f8c538345dd77c3e4806e25f46d3315c44e0a5b4371282dd2c8d5be3095f", + "0fbcc2f993cd56d3305b0b7d9e55d4c1a8fb5dbb52f8e9a1e9b6201b165d015894e56c4d3570bee52fe205e28a78b91cdfbde71ce8d157db", + "884a02576239ff7a2f2f63b2db6a9ff37047ac13568e1e30fe63c4a7ad1b3ee3a5700df34321d62077e63633c575c1c954514e99da7c179d", + }, + } + for v, _ in test_vectors { + scalar, _ := hex.decode(transmute([]byte)(v.scalar), context.temp_allocator) + point, _ := hex.decode(transmute([]byte)(v.point), context.temp_allocator) + + derived_point: [x448.POINT_SIZE]byte + x448.scalarmult(derived_point[:], scalar[:], point[:]) + derived_point_str := string(hex.encode(derived_point[:], context.temp_allocator)) + + testing.expectf( + t, + derived_point_str == v.product, + "Expected %s for %s * %s, but got %s instead", + v.product, + v.scalar, + v.point, + derived_point_str, + ) + + // Abuse the test vectors to sanity-check the scalar-basepoint multiply. + p1, p2: [x448.POINT_SIZE]byte + x448.scalarmult_basepoint(p1[:], scalar[:]) + x448.scalarmult(p2[:], scalar[:], _BASE_POINT[:]) + p1_str := string(hex.encode(p1[:], context.temp_allocator)) + p2_str := string(hex.encode(p2[:], context.temp_allocator)) + testing.expectf( + t, + p1_str == p2_str, + "Expected %s for %s * basepoint, but got %s instead", + p2_str, + v.scalar, + p1_str, + ) + } +} + @(private) ge_str :: proc(ge: ^ristretto255.Group_Element) -> string { b: [ristretto255.ELEMENT_SIZE]byte