From c42d19514fb86ecb9ca676831de3e7e4b4b37ad9 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Fri, 18 Nov 2022 22:56:31 +0000 Subject: [PATCH 01/14] Feat add Blake2b intrinsics --- crypto/src/crypto/digests/Blake2bDigest.cs | 555 ++++++++++++++++++++- 1 file changed, 554 insertions(+), 1 deletion(-) diff --git a/crypto/src/crypto/digests/Blake2bDigest.cs b/crypto/src/crypto/digests/Blake2bDigest.cs index 953ac0062e..12a56e543f 100644 --- a/crypto/src/crypto/digests/Blake2bDigest.cs +++ b/crypto/src/crypto/digests/Blake2bDigest.cs @@ -2,6 +2,12 @@ #if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER using System.Runtime.CompilerServices; #endif +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER +using System.Runtime.InteropServices; +using System.Diagnostics; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; +#endif using Org.BouncyCastle.Crypto.Utilities; using Org.BouncyCastle.Utilities; @@ -87,7 +93,7 @@ public sealed class Blake2bDigest // Tree hashing parameters: // Because this class does not implement the Tree Hashing Mode, // these parameters can be treated as constants (see init() function) - /* + /* * private int fanout = 1; // 0-255 private int depth = 1; // 1 - 255 * private int leafLength= 0; private long nodeOffset = 0L; private int * nodeDepth = 0; private int innerHashLength = 0; @@ -515,6 +521,13 @@ public void Reset() #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER private void Compress(ReadOnlySpan message) { +#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + if (Avx2.IsSupported && BitConverter.IsLittleEndian) + { + Blake2bSimd.Compress(f0 == ulong.MaxValue, chainValue, message, t0, t1, blake2b_IV); + return; + } +#endif InitializeInternalState(); Span m = stackalloc ulong[16]; @@ -643,4 +656,544 @@ public void ClearSalt() } } } + + +#if NETCOREAPP3_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER + internal static class Blake2bSimd + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan blakeIV) + { + Debug.Assert(dataBuffer.Length >= 128); + Debug.Assert(hashBuffer.Length >= 8); + Debug.Assert(Avx2.IsSupported); + Debug.Assert(BitConverter.IsLittleEndian); + + unchecked + { + Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + + // TODO: moving this to the bottom causes a 2x slow down? + ref ulong m = ref Unsafe.As(ref MemoryMarshal.GetReference(dataBuffer)); + + ref ulong hash = ref MemoryMarshal.GetReference(hashBuffer); + ref ulong iv = ref MemoryMarshal.GetReference(blakeIV); + + var r_14 = isFinal ? 
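// r_14 below is the word xored into state word v[14]: the finalization flag f0,
// all ones for the last block and zero otherwise (RFC 7693, section 3.2). t_0
// packs (t0, t1, f0, f1) so it can be xored into row4 in one step; the fourth
// word (f1, used only by tree hashing) stays 0.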
ulong.MaxValue : 0; + var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); + + Vector256 row1 = VectorExtensions.LoadUnsafeVector256(ref hash); + Vector256 row2 = VectorExtensions.LoadUnsafeVector256(ref hash, (nuint)Vector256.Count); + Vector256 row3 = VectorExtensions.LoadUnsafeVector256(ref iv); + Vector256 row4 = VectorExtensions.LoadUnsafeVector256(ref iv, (nuint)Vector256.Count); + row4 = Avx2.Xor(row4, t_0); + + Vector256 orig_1 = row1; + Vector256 orig_2 = row2; + + #region Rounds + //ROUND 1 + var m0 = VectorExtensions.BroadcastVector128ToVector256(ref m); + var m1 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count)); + var m2 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 2)); + var m3 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 3)); + + var t0 = Avx2.UnpackLow(m0, m1); + var t1 = Avx2.UnpackLow(m2, m3); + var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + var m4 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 4)); + var m5 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 5)); + var m6 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 6)); + var m7 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 7)); + + t0 = Avx2.UnpackLow(m7, m4); + t1 = Avx2.UnpackLow(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.UnpackHigh(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 2 + t0 = Avx2.UnpackLow(m7, m2); + t1 = Avx2.UnpackHigh(m4, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.AlignRight(m3, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackHigh(m2, m0); + t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m6, m1, 8); + t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 3 + t0 = Avx2.AlignRight(m6, m5, 8); + t1 = Avx2.UnpackHigh(m2, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
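// Avx2.Blend copies element i from the second operand where control bit i is
// set, so the 0b_1111_0000 mask below takes the upper 128-bit lane (four uints)
// from t1 and the lower lane from t0. Each unpack/blend pair assembles the four
// message words the next G step needs into b0.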
0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m5, m4, 8); + t1 = Avx2.UnpackHigh(m1, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m2, m7); + t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 4 + t0 = Avx2.UnpackHigh(m3, m1); + t1 = Avx2.UnpackHigh(m6, m5); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m4, m0); + t1 = Avx2.UnpackLow(m6, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m1, m7, 8); + t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m4, m3); + t1 = Avx2.UnpackLow(m5, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 5 + t0 = Avx2.UnpackHigh(m4, m2); + t1 = Avx2.UnpackLow(m1, m5); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m7, m1, 8); + t1 = Avx2.AlignRight(m3, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m6, m0); + t1 = Avx2.UnpackLow(m6, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 6 + t0 = Avx2.UnpackLow(m1, m3); + t1 = Avx2.UnpackLow(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m6, m5); + t1 = Avx2.UnpackHigh(m5, m1); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m2, m0, 8); + t1 = Avx2.UnpackHigh(m3, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m4, m6); + t1 = Avx2.AlignRight(m7, m2, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 7 + t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.UnpackLow(m7, m2); + b0 = 
Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m2, m7); + t1 = Avx2.AlignRight(m5, m6, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m5, m3); + t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 8 + t0 = Avx2.UnpackHigh(m6, m3); + t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m7, m5, 8); + t1 = Avx2.UnpackHigh(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.AlignRight(m4, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m5, m0); + t1 = Avx2.UnpackLow(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 9 + t0 = Avx2.UnpackLow(m3, m7); + t1 = Avx2.AlignRight(m0, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.AlignRight(m4, m1, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackLow(m5, m6); + t1 = Avx2.UnpackHigh(m6, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m1, m2, 8); + t1 = Avx2.AlignRight(m2, m3, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 10 + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.UnpackHigh(m3, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m1, m2); + t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackHigh(m6, m7); + t1 = Avx2.UnpackHigh(m4, m1); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = 
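// These per-round unpack/align/blend sequences are the BLAKE2b sigma message
// permutations, one fixed pattern per round, computed in-register instead of
// through the sigma index table the scalar implementation uses.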
Avx2.UnpackLow(m7, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 11 + t0 = Avx2.UnpackLow(m0, m1); + t1 = Avx2.UnpackLow(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackLow(m7, m4); + t1 = Avx2.UnpackLow(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.UnpackHigh(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 12 + t0 = Avx2.UnpackLow(m7, m2); + t1 = Avx2.UnpackHigh(m4, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.AlignRight(m3, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackHigh(m2, m0); + t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m6, m1, 8); + t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + #endregion + + row1 = Avx2.Xor(row1, row3); + row2 = Avx2.Xor(row2, row4); + row1 = Avx2.Xor(row1, orig_1); + row2 = Avx2.Xor(row2, orig_2); + + row1.StoreUnsafe(ref hash); + row2.StoreUnsafe(ref hash, (nuint)Vector256.Count); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Diagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) + { + unchecked + { + // +-------------------+ + // | 0 | 1 | 2 | 3 | + // +-------------------+ + // | 8 | 9 | 10 | 11 | + // +-------------------+ + // | 12 | 13 | 14 | 15 | + // +-------------------+ + // ---> + // +-------------------+ + // | 3 | 0 | 1 | 2 | + // +-------------------+ + // | 9 | 10 | 11 | 8 | + // +-------------------+ + // | 14 | 15 | 12 | 13 | + // +-------------------+ + + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void G1(Vector256 r24, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) + { + unchecked + { + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] 
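// G2 below is the second half of the BLAKE2b G function. None of the four
// rotations uses a rotate instruction: rotr 32 is a 32-bit shuffle and rotr 24
// a byte shuffle through r24 (both in G1 above); rotr 16 is a byte shuffle
// through r16, and rotr 63 is (x >> 63) ^ (x + x), i.e. a rotate left by one.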
+ private static void G2(Vector256 r16, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) + { + unchecked + { + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Undiagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) + { + unchecked + { + // +-------------------+ + // | 3 | 0 | 1 | 2 | + // +-------------------+ + // | 9 | 10 | 11 | 8 | + // +-------------------+ + // | 14 | 15 | 12 | 13 | + // +-------------------+ + // ---> + // +-------------------+ + // | 0 | 1 | 2 | 3 | + // +-------------------+ + // | 8 | 9 | 10 | 11 | + // +-------------------+ + // | 12 | 13 | 14 | 15 | + // +-------------------+ + + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + } + } + } + + internal static class VectorExtensions + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 LoadUnsafeVector128(ref T source) + where T : struct + { + return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 LoadUnsafeVector256(ref T source) + where T : struct + { + return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 LoadUnsafeVector256(ref T source, nuint elementOffset) + where T : struct + { + source = ref Unsafe.Add(ref source, (nint)elementOffset); + return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void StoreUnsafe(this Vector256 source, ref T destination) + where T : struct + { + Unsafe.WriteUnaligned(ref Unsafe.As(ref destination), source); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void StoreUnsafe(this Vector256 source, ref T destination, nuint elementOffset) + where T : struct + { + destination = ref Unsafe.Add(ref destination, (nint)elementOffset); + Unsafe.WriteUnaligned(ref Unsafe.As(ref destination), source); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 BroadcastVector128ToVector256(ref T ptr) where T : struct + { + var vector = Unsafe.ReadUnaligned>(ref Unsafe.As(ref ptr)); + Vector256 result = vector.ToVector256Unsafe(); + return result.WithUpper(vector); + } + } + +#endif } From 4503b5c898e7c1b2ba9e0737f6aae3ed254ab199 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 19 Nov 2022 12:30:51 +0000 Subject: [PATCH 02/14] Move code, use MemoryMarshal --- crypto/src/crypto/digests/Blake2bDigest.cs | 552 +----------------- crypto/src/crypto/digests/Blake2b_X86.cs | 508 ++++++++++++++++ crypto/src/crypto/digests/VectorExtensions.cs | 37 ++ 3 files changed, 548 insertions(+), 549 deletions(-) create mode 100644 crypto/src/crypto/digests/Blake2b_X86.cs create mode 100644 crypto/src/crypto/digests/VectorExtensions.cs diff --git a/crypto/src/crypto/digests/Blake2bDigest.cs b/crypto/src/crypto/digests/Blake2bDigest.cs index 12a56e543f..5edf19aefb 100644 --- a/crypto/src/crypto/digests/Blake2bDigest.cs +++ b/crypto/src/crypto/digests/Blake2bDigest.cs @@ -2,12 +2,6 @@ #if NETSTANDARD1_0_OR_GREATER || 
NETCOREAPP1_0_OR_GREATER using System.Runtime.CompilerServices; #endif -#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER -using System.Runtime.InteropServices; -using System.Diagnostics; -using System.Runtime.Intrinsics.X86; -using System.Runtime.Intrinsics; -#endif using Org.BouncyCastle.Crypto.Utilities; using Org.BouncyCastle.Utilities; @@ -521,10 +515,10 @@ public void Reset() #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER private void Compress(ReadOnlySpan message) { -#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - if (Avx2.IsSupported && BitConverter.IsLittleEndian) +#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER + if (System.Runtime.Intrinsics.X86.Avx2.IsSupported && BitConverter.IsLittleEndian) { - Blake2bSimd.Compress(f0 == ulong.MaxValue, chainValue, message, t0, t1, blake2b_IV); + Blake2b_X86.Compress(f0 == ulong.MaxValue, chainValue, message, t0, t1, blake2b_IV); return; } #endif @@ -656,544 +650,4 @@ public void ClearSalt() } } } - - -#if NETCOREAPP3_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER - internal static class Blake2bSimd - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan blakeIV) - { - Debug.Assert(dataBuffer.Length >= 128); - Debug.Assert(hashBuffer.Length >= 8); - Debug.Assert(Avx2.IsSupported); - Debug.Assert(BitConverter.IsLittleEndian); - - unchecked - { - Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); - - // TODO: moving this to the bottom causes a 2x slow down? - ref ulong m = ref Unsafe.As(ref MemoryMarshal.GetReference(dataBuffer)); - - ref ulong hash = ref MemoryMarshal.GetReference(hashBuffer); - ref ulong iv = ref MemoryMarshal.GetReference(blakeIV); - - var r_14 = isFinal ? 
ulong.MaxValue : 0; - var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); - - Vector256 row1 = VectorExtensions.LoadUnsafeVector256(ref hash); - Vector256 row2 = VectorExtensions.LoadUnsafeVector256(ref hash, (nuint)Vector256.Count); - Vector256 row3 = VectorExtensions.LoadUnsafeVector256(ref iv); - Vector256 row4 = VectorExtensions.LoadUnsafeVector256(ref iv, (nuint)Vector256.Count); - row4 = Avx2.Xor(row4, t_0); - - Vector256 orig_1 = row1; - Vector256 orig_2 = row2; - - #region Rounds - //ROUND 1 - var m0 = VectorExtensions.BroadcastVector128ToVector256(ref m); - var m1 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count)); - var m2 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 2)); - var m3 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 3)); - - var t0 = Avx2.UnpackLow(m0, m1); - var t1 = Avx2.UnpackLow(m2, m3); - var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m0, m1); - t1 = Avx2.UnpackHigh(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - var m4 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 4)); - var m5 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 5)); - var m6 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 6)); - var m7 = VectorExtensions.BroadcastVector128ToVector256(ref Unsafe.Add(ref m, Vector128.Count * 7)); - - t0 = Avx2.UnpackLow(m7, m4); - t1 = Avx2.UnpackLow(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m7, m4); - t1 = Avx2.UnpackHigh(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 2 - t0 = Avx2.UnpackLow(m7, m2); - t1 = Avx2.UnpackHigh(m4, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m5, m4); - t1 = Avx2.AlignRight(m3, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.UnpackHigh(m2, m0); - t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.AlignRight(m6, m1, 8); - t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 3 - t0 = Avx2.AlignRight(m6, m5, 8); - t1 = Avx2.UnpackHigh(m2, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m4, m0); - t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.AlignRight(m5, m4, 8); - t1 = Avx2.UnpackHigh(m1, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m2, m7); - t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 4 - t0 = Avx2.UnpackHigh(m3, m1); - t1 = Avx2.UnpackHigh(m6, m5); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m4, m0); - t1 = Avx2.UnpackLow(m6, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.AlignRight(m1, m7, 8); - t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m4, m3); - t1 = Avx2.UnpackLow(m5, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 5 - t0 = Avx2.UnpackHigh(m4, m2); - t1 = Avx2.UnpackLow(m1, m5); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.AlignRight(m7, m1, 8); - t1 = Avx2.AlignRight(m3, m5, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m6, m0); - t1 = Avx2.UnpackLow(m6, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 6 - t0 = Avx2.UnpackLow(m1, m3); - t1 = Avx2.UnpackLow(m0, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m6, m5); - t1 = Avx2.UnpackHigh(m5, m1); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.AlignRight(m2, m0, 8); - t1 = Avx2.UnpackHigh(m3, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m4, m6); - t1 = Avx2.AlignRight(m7, m2, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 7 - t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = Avx2.UnpackLow(m7, m2); - b0 = 
Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m2, m7); - t1 = Avx2.AlignRight(m5, m6, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.UnpackLow(m4, m0); - t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m5, m3); - t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 8 - t0 = Avx2.UnpackHigh(m6, m3); - t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.AlignRight(m7, m5, 8); - t1 = Avx2.UnpackHigh(m0, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = Avx2.AlignRight(m4, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m5, m0); - t1 = Avx2.UnpackLow(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 9 - t0 = Avx2.UnpackLow(m3, m7); - t1 = Avx2.AlignRight(m0, m5, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m7, m4); - t1 = Avx2.AlignRight(m4, m1, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.UnpackLow(m5, m6); - t1 = Avx2.UnpackHigh(m6, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.AlignRight(m1, m2, 8); - t1 = Avx2.AlignRight(m2, m3, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 10 - t0 = Avx2.UnpackLow(m5, m4); - t1 = Avx2.UnpackHigh(m3, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m1, m2); - t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.UnpackHigh(m6, m7); - t1 = Avx2.UnpackHigh(m4, m1); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = 
Avx2.UnpackLow(m7, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 11 - t0 = Avx2.UnpackLow(m0, m1); - t1 = Avx2.UnpackLow(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m0, m1); - t1 = Avx2.UnpackHigh(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.UnpackLow(m7, m4); - t1 = Avx2.UnpackLow(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackHigh(m7, m4); - t1 = Avx2.UnpackHigh(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - - //ROUND 12 - t0 = Avx2.UnpackLow(m7, m2); - t1 = Avx2.UnpackHigh(m4, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.UnpackLow(m5, m4); - t1 = Avx2.AlignRight(m3, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); - - t0 = Avx2.UnpackHigh(m2, m0); - t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - - t0 = Avx2.AlignRight(m6, m1, 8); - t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); - #endregion - - row1 = Avx2.Xor(row1, row3); - row2 = Avx2.Xor(row2, row4); - row1 = Avx2.Xor(row1, orig_1); - row2 = Avx2.Xor(row2, orig_2); - - row1.StoreUnsafe(ref hash); - row2.StoreUnsafe(ref hash, (nuint)Vector256.Count); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Diagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) - { - unchecked - { - // +-------------------+ - // | 0 | 1 | 2 | 3 | - // +-------------------+ - // | 8 | 9 | 10 | 11 | - // +-------------------+ - // | 12 | 13 | 14 | 15 | - // +-------------------+ - // ---> - // +-------------------+ - // | 3 | 0 | 1 | 2 | - // +-------------------+ - // | 9 | 10 | 11 | 8 | - // +-------------------+ - // | 14 | 15 | 12 | 13 | - // +-------------------+ - - row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); - row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); - row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void G1(Vector256 r24, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) - { - unchecked - { - row1 = Avx2.Add(Avx2.Add(row1, b0), row2); - row4 = Avx2.Xor(row4, row1); - row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); - - row3 = Avx2.Add(row3, row4); - row2 = Avx2.Xor(row2, row3); - row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] 
- private static void G2(Vector256 r16, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) - { - unchecked - { - row1 = Avx2.Add(Avx2.Add(row1, b0), row2); - row4 = Avx2.Xor(row4, row1); - row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); - - row3 = Avx2.Add(row3, row4); - row2 = Avx2.Xor(row2, row3); - row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void Undiagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) - { - unchecked - { - // +-------------------+ - // | 3 | 0 | 1 | 2 | - // +-------------------+ - // | 9 | 10 | 11 | 8 | - // +-------------------+ - // | 14 | 15 | 12 | 13 | - // +-------------------+ - // ---> - // +-------------------+ - // | 0 | 1 | 2 | 3 | - // +-------------------+ - // | 8 | 9 | 10 | 11 | - // +-------------------+ - // | 12 | 13 | 14 | 15 | - // +-------------------+ - - row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); - row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); - row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); - } - } - } - - internal static class VectorExtensions - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 LoadUnsafeVector128(ref T source) - where T : struct - { - return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 LoadUnsafeVector256(ref T source) - where T : struct - { - return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 LoadUnsafeVector256(ref T source, nuint elementOffset) - where T : struct - { - source = ref Unsafe.Add(ref source, (nint)elementOffset); - return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void StoreUnsafe(this Vector256 source, ref T destination) - where T : struct - { - Unsafe.WriteUnaligned(ref Unsafe.As(ref destination), source); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void StoreUnsafe(this Vector256 source, ref T destination, nuint elementOffset) - where T : struct - { - destination = ref Unsafe.Add(ref destination, (nint)elementOffset); - Unsafe.WriteUnaligned(ref Unsafe.As(ref destination), source); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 BroadcastVector128ToVector256(ref T ptr) where T : struct - { - var vector = Unsafe.ReadUnaligned>(ref Unsafe.As(ref ptr)); - Vector256 result = vector.ToVector256Unsafe(); - return result.WithUpper(vector); - } - } - -#endif } diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs new file mode 100644 index 0000000000..a313041bb0 --- /dev/null +++ b/crypto/src/crypto/digests/Blake2b_X86.cs @@ -0,0 +1,508 @@ +#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Diagnostics; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace Org.BouncyCastle.Crypto.Digests +{ + internal static class Blake2b_X86 + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan blakeIV) + { + Debug.Assert(dataBuffer.Length >= 128); + 
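// One BLAKE2b block is 128 bytes of message and the chain value is eight
// 64-bit words, hence the two size requirements asserted here.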
Debug.Assert(hashBuffer.Length >= 8); + Debug.Assert(Avx2.IsSupported); + Debug.Assert(BitConverter.IsLittleEndian); + + unchecked + { + Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + + // TODO: moving this to the bottom causes a 2x slow down? + var hashBytes = MemoryMarshal.AsBytes(hashBuffer); + var ivBytes = MemoryMarshal.AsBytes(blakeIV); + + var r_14 = isFinal ? ulong.MaxValue : 0; + var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); + + Vector256 row1 = VectorExtensions.LoadVector256(hashBytes); + Vector256 row2 = VectorExtensions.LoadVector256(hashBytes[Vector256.Count..]); + Vector256 row3 = VectorExtensions.LoadVector256(ivBytes); + Vector256 row4 = VectorExtensions.LoadVector256(ivBytes[Vector256.Count..]); + row4 = Avx2.Xor(row4, t_0); + + Vector256 orig_1 = row1; + Vector256 orig_2 = row2; + + Perform12Rounds(r24, r16, dataBuffer, ref row1, ref row2, ref row3, ref row4); + + row1 = Avx2.Xor(row1, row3); + row2 = Avx2.Xor(row2, row4); + row1 = Avx2.Xor(row1, orig_1); + row2 = Avx2.Xor(row2, orig_2); + + row1.Store(hashBytes); + row2.Store(hashBytes[Vector256.Count..]); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Perform12Rounds(Vector256 r24, Vector256 r16, ReadOnlySpan m, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4) + { + Debug.Assert(m.Length >= 128); + + unchecked + { + #region Rounds + //ROUND 1 + var m0 = VectorExtensions.BroadcastVector128ToVector256(m); + var m1 = VectorExtensions.BroadcastVector128ToVector256(m[Unsafe.SizeOf>()..]); + var m2 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 2)..]); + var m3 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 3)..]); + + var t0 = Avx2.UnpackLow(m0, m1); + var t1 = Avx2.UnpackLow(m2, m3); + var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + var m4 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); + var m5 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 5)..]); + var m6 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 6)..]); + var m7 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 7)..]); + + t0 = Avx2.UnpackLow(m7, m4); + t1 = Avx2.UnpackLow(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.UnpackHigh(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 2 + t0 = Avx2.UnpackLow(m7, m2); + t1 = Avx2.UnpackHigh(m4, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.AlignRight(m3, m7, 8); + b0 = 
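// Layout of the loads above: row1/row2 hold the chain value h[0..7], row3/row4
// the BLAKE2b IV, with the counter/finalization vector t_0 xored into row4.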
Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackHigh(m2, m0); + t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m6, m1, 8); + t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 3 + t0 = Avx2.AlignRight(m6, m5, 8); + t1 = Avx2.UnpackHigh(m2, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m5, m4, 8); + t1 = Avx2.UnpackHigh(m1, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m2, m7); + t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 4 + t0 = Avx2.UnpackHigh(m3, m1); + t1 = Avx2.UnpackHigh(m6, m5); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m4, m0); + t1 = Avx2.UnpackLow(m6, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m1, m7, 8); + t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m4, m3); + t1 = Avx2.UnpackLow(m5, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 5 + t0 = Avx2.UnpackHigh(m4, m2); + t1 = Avx2.UnpackLow(m1, m5); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m7, m1, 8); + t1 = Avx2.AlignRight(m3, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m6, m0); + t1 = Avx2.UnpackLow(m6, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + 
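// Undiagonalize below undoes the lane rotation applied by Diagonalize,
// restoring column order; only rows 1, 3 and 4 are permuted, row 2 stays put.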
Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 6 + t0 = Avx2.UnpackLow(m1, m3); + t1 = Avx2.UnpackLow(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m6, m5); + t1 = Avx2.UnpackHigh(m5, m1); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.AlignRight(m2, m0, 8); + t1 = Avx2.UnpackHigh(m3, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m4, m6); + t1 = Avx2.AlignRight(m7, m2, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 7 + t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.UnpackLow(m7, m2); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m2, m7); + t1 = Avx2.AlignRight(m5, m6, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m5, m3); + t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 8 + t0 = Avx2.UnpackHigh(m6, m3); + t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m7, m5, 8); + t1 = Avx2.UnpackHigh(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.AlignRight(m4, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m5, m0); + t1 = Avx2.UnpackLow(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 9 + t0 = Avx2.UnpackLow(m3, m7); + t1 = Avx2.AlignRight(m0, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.AlignRight(m4, m1, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackLow(m5, m6); + t1 = Avx2.UnpackHigh(m6, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref 
row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m1, m2, 8); + t1 = Avx2.AlignRight(m2, m3, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 10 + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.UnpackHigh(m3, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m1, m2); + t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackHigh(m6, m7); + t1 = Avx2.UnpackHigh(m4, m1); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.UnpackLow(m7, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 11 + t0 = Avx2.UnpackLow(m0, m1); + t1 = Avx2.UnpackLow(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackLow(m7, m4); + t1 = Avx2.UnpackLow(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.UnpackHigh(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 12 + t0 = Avx2.UnpackLow(m7, m2); + t1 = Avx2.UnpackHigh(m4, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.AlignRight(m3, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Avx2.UnpackHigh(m2, m0); + t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Avx2.AlignRight(m6, m1, 8); + t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + #endregion + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Diagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) + { + unchecked + { + // +-------------------+ + // | 0 | 1 | 2 | 3 | + // +-------------------+ + // | 8 | 9 | 10 | 11 | + // +-------------------+ + // | 12 | 13 | 14 | 15 | + // +-------------------+ + // ---> + // 
+-------------------+ + // | 3 | 0 | 1 | 2 | + // +-------------------+ + // | 9 | 10 | 11 | 8 | + // +-------------------+ + // | 14 | 15 | 12 | 13 | + // +-------------------+ + + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void G1(Vector256 r24, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) + { + unchecked + { + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void G2(Vector256 r16, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) + { + unchecked + { + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Undiagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) + { + unchecked + { + // +-------------------+ + // | 3 | 0 | 1 | 2 | + // +-------------------+ + // | 9 | 10 | 11 | 8 | + // +-------------------+ + // | 14 | 15 | 12 | 13 | + // +-------------------+ + // ---> + // +-------------------+ + // | 0 | 1 | 2 | 3 | + // +-------------------+ + // | 8 | 9 | 10 | 11 | + // +-------------------+ + // | 12 | 13 | 14 | 15 | + // +-------------------+ + + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); + } + } + } +} +#endif diff --git a/crypto/src/crypto/digests/VectorExtensions.cs b/crypto/src/crypto/digests/VectorExtensions.cs new file mode 100644 index 0000000000..f57f8e3f40 --- /dev/null +++ b/crypto/src/crypto/digests/VectorExtensions.cs @@ -0,0 +1,37 @@ +#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; + +namespace Org.BouncyCastle.Crypto.Digests +{ + internal static class VectorExtensions + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 BroadcastVector128ToVector256(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length == Unsafe.SizeOf>()); + + var vector = MemoryMarshal.Read>(source); + Vector256 result = vector.ToVector256Unsafe(); + return result.WithUpper(vector); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 LoadVector256(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length == Unsafe.SizeOf>()); + return MemoryMarshal.Read>(source); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Store(this Vector256 vector, Span destination) where T : struct + { + Debug.Assert(destination.Length == Unsafe.SizeOf>()); + MemoryMarshal.Write(destination, ref vector); + } + } +} +#endif \ No newline at end of file From f4684c027f27bcfbd6983e7a75aaa352fb1ecb7d Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 19 Nov 2022 12:31:04 
+0000 Subject: [PATCH 03/14] Move VectorExtensions to util --- crypto/src/crypto/{digests => util}/VectorExtensions.cs | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename crypto/src/crypto/{digests => util}/VectorExtensions.cs (100%) diff --git a/crypto/src/crypto/digests/VectorExtensions.cs b/crypto/src/crypto/util/VectorExtensions.cs similarity index 100% rename from crypto/src/crypto/digests/VectorExtensions.cs rename to crypto/src/crypto/util/VectorExtensions.cs From a1082ae28aef54910ab7c8a99e10f52b3db3e9e2 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 19 Nov 2022 14:27:43 +0000 Subject: [PATCH 04/14] Add guard and license --- crypto/src/crypto/digests/Blake2b_X86.cs | 31 +++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs index a313041bb0..fe132f3afb 100644 --- a/crypto/src/crypto/digests/Blake2b_X86.cs +++ b/crypto/src/crypto/digests/Blake2b_X86.cs @@ -8,22 +8,47 @@ namespace Org.BouncyCastle.Crypto.Digests { + // License from the original code created by Clinton Ingram (saucecontrol) for Blake2Fast + // at https://github.com/saucecontrol/Blake2Fast. The code has been copied and modified. + + // The MIT License + + // Copyright(c) 2018-2021 Clinton Ingram + + // Permission is hereby granted, free of charge, to any person obtaining a copy + // of this software and associated documentation files (the "Software"), to deal + // in the Software without restriction, including without limitation the rights + // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + // copies of the Software, and to permit persons to whom the Software is + // furnished to do so, subject to the following conditions: + + // The above copyright notice and this permission notice shall be included in + // all copies or substantial portions of the Software. + + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + // THE SOFTWARE. + internal static class Blake2b_X86 { [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan blakeIV) { + if(!Avx2.IsSupported || !BitConverter.IsLittleEndian) + throw new PlatformNotSupportedException(nameof(Blake2b_X86)); + Debug.Assert(dataBuffer.Length >= 128); Debug.Assert(hashBuffer.Length >= 8); - Debug.Assert(Avx2.IsSupported); - Debug.Assert(BitConverter.IsLittleEndian); unchecked { Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); - // TODO: moving this to the bottom causes a 2x slow down? 
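// Note (editorial): the r24 and r16 masks above implement the BLAKE2b
// rotations rotr64(x, 24) and rotr64(x, 16) as a single byte shuffle each;
// the other two rotation amounts are cheaper still: rotr64(x, 32) is a
// 32-bit element shuffle in G1, and rotr64(x, 63) is the shift/add/xor
// sequence at the end of G2.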
var hashBytes = MemoryMarshal.AsBytes(hashBuffer);
var ivBytes = MemoryMarshal.AsBytes(blakeIV);

From d925cf1d2bb9b913d5b7e86ce2819acd619c5f31 Mon Sep 17 00:00:00 2001
From: Timothy Makkison
Date: Sat, 19 Nov 2022 19:34:46 +0000
Subject: [PATCH 05/14] Feat add Blake2s intrinsics

---
 crypto/src/crypto/digests/Blake2sDigest.cs |   7 +
 crypto/src/crypto/digests/Blake2s_X86.cs   | 482 +++++++++++++++++++++
 crypto/src/crypto/util/VectorExtensions.cs |  24 +
 3 files changed, 513 insertions(+)
 create mode 100644 crypto/src/crypto/digests/Blake2s_X86.cs

diff --git a/crypto/src/crypto/digests/Blake2sDigest.cs b/crypto/src/crypto/digests/Blake2sDigest.cs
index a6ee75af53..32a6b60d70 100644
--- a/crypto/src/crypto/digests/Blake2sDigest.cs
+++ b/crypto/src/crypto/digests/Blake2sDigest.cs
@@ -555,6 +555,13 @@ public void Reset()
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
 private void Compress(ReadOnlySpan message)
 {
+#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+ if(System.Runtime.Intrinsics.X86.Sse41.IsSupported && BitConverter.IsLittleEndian)
+ {
+ Blake2s_X86.Compress(f0 == uint.MaxValue, chainValue, message, t0, t1, blake2s_IV);
+ return;
+ }
+#endif
 InitializeInternalState();
 Span m = stackalloc uint[16];

diff --git a/crypto/src/crypto/digests/Blake2s_X86.cs b/crypto/src/crypto/digests/Blake2s_X86.cs
new file mode 100644
index 0000000000..2fa44d9f49
--- /dev/null
+++ b/crypto/src/crypto/digests/Blake2s_X86.cs
@@ -0,0 +1,482 @@
+#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Diagnostics;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics;
+
+namespace Org.BouncyCastle.Crypto.Digests
+{
+ // License from the original code created by Clinton Ingram (saucecontrol) for Blake2Fast
+ // at https://github.com/saucecontrol/Blake2Fast. The code has been copied and modified.
+
+ // The MIT License
+
+ // Copyright(c) 2018-2021 Clinton Ingram
+
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
+ // of this software and associated documentation files (the "Software"), to deal
+ // in the Software without restriction, including without limitation the rights
+ // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ // copies of the Software, and to permit persons to whom the Software is
+ // furnished to do so, subject to the following conditions:
+
+ // The above copyright notice and this permission notice shall be included in
+ // all copies or substantial portions of the Software.
+
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ // THE SOFTWARE.
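// A minimal scalar sketch for orientation, not part of the patch: this is the
// BLAKE2s mixing function G from RFC 7693 that Blake2s_X86 evaluates four
// columns (then four diagonals) at a time. GScalar and RotR are illustrative
// names; the rotation amounts 16, 12, 8 and 7 are the constants mirrored by
// the r16/r8 shuffle masks and the shift pairs in G1/G2 below.
static void GScalar(uint[] v, int a, int b, int c, int d, uint x, uint y)
{
    v[a] = v[a] + v[b] + x;          // first message word for this column
    v[d] = RotR(v[d] ^ v[a], 16);
    v[c] = v[c] + v[d];
    v[b] = RotR(v[b] ^ v[c], 12);
    v[a] = v[a] + v[b] + y;          // second message word for this column
    v[d] = RotR(v[d] ^ v[a], 8);
    v[c] = v[c] + v[d];
    v[b] = RotR(v[b] ^ v[c], 7);
}

static uint RotR(uint x, int n) => (x >> n) | (x << (32 - n));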
+
+ internal static class Blake2s_X86
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, uint totalSegmentsLow, uint totalSegmentsHigh, ReadOnlySpan blakeIV)
+ {
+ if(!Sse41.IsSupported || !BitConverter.IsLittleEndian)
+ throw new PlatformNotSupportedException(nameof(Blake2s_X86));
+
+ Debug.Assert(dataBuffer.Length >= 64); // one BLAKE2s block is 16 words = 64 bytes
+ Debug.Assert(hashBuffer.Length >= 8);
+
+ unchecked
+ {
+ Vector128 r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
+ Vector128 r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
+
+ var hashBytes = MemoryMarshal.AsBytes(hashBuffer);
+ var ivBytes = MemoryMarshal.AsBytes(blakeIV);
+
+ var r_14 = isFinal ? uint.MaxValue : 0;
+ var t_0 = Vector128.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0);
+
+ Vector128 row1 = VectorExtensions.LoadVector128(hashBytes);
+ Vector128 row2 = VectorExtensions.LoadVector128(hashBytes[Vector128.Count..]);
+ Vector128 row3 = VectorExtensions.LoadVector128(ivBytes);
+ Vector128 row4 = VectorExtensions.LoadVector128(ivBytes[Vector128.Count..]);
+ row4 = Sse2.Xor(row4, t_0);
+
+ Vector128 orig_1 = row1;
+ Vector128 orig_2 = row2;
+
+ Perform10Rounds(r8, r16, dataBuffer, ref row1, ref row2, ref row3, ref row4);
+
+ row1 = Sse2.Xor(row1, row3);
+ row2 = Sse2.Xor(row2, row4);
+ row1 = Sse2.Xor(row1, orig_1);
+ row2 = Sse2.Xor(row2, orig_2);
+
+ row1.Store(hashBytes);
+ row2.Store(hashBytes[Vector128.Count..]);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void Perform10Rounds(Vector128 r8, Vector128 r16, ReadOnlySpan m, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4)
+ {
+ Debug.Assert(m.Length >= 64); // 16 message words
+
+ unchecked
+ {
+ #region Rounds
+ //ROUND 1
+ var m0 = VectorExtensions.BroadcastVector64ToVector128(m);
+ var m1 = VectorExtensions.BroadcastVector64ToVector128(m[Unsafe.SizeOf>()..]);
+ var m2 = VectorExtensions.BroadcastVector64ToVector128(m[(Unsafe.SizeOf>() * 2)..]);
+ var m3 = VectorExtensions.BroadcastVector64ToVector128(m[(Unsafe.SizeOf>() * 3)..]);
+
+ var t0 = Sse2.UnpackLow(m0, m1);
+ var t1 = Sse2.UnpackLow(m2, m3);
+ var b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+
+ t0 = Sse2.UnpackHigh(m0, m1);
+ t1 = Sse2.UnpackHigh(m2, m3);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+
+ Diagonalize(ref row1, ref row3, ref row4);
+
+ var m4 = VectorExtensions.BroadcastVector64ToVector128(m[(Unsafe.SizeOf>() * 4)..]);
+ var m5 = VectorExtensions.BroadcastVector64ToVector128(m[(Unsafe.SizeOf>() * 5)..]);
+ var m6 = VectorExtensions.BroadcastVector64ToVector128(m[(Unsafe.SizeOf>() * 6)..]);
+ var m7 = VectorExtensions.BroadcastVector64ToVector128(m[(Unsafe.SizeOf>() * 7)..]);
+
+ t0 = Sse2.UnpackLow(m7, m4);
+ t1 = Sse2.UnpackLow(m5, m6);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+
+ t0 = Sse2.UnpackHigh(m7, m4);
+ t1 = Sse2.UnpackHigh(m5, m6);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+
+ Undiagonalize(ref row1, ref row3, ref row4);
+
+ //ROUND 2
+ t0 = Sse2.UnpackLow(m7, m2);
+ t1 = Sse2.UnpackHigh(m4, m6);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(),
0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m5, m4); + t1 = Ssse3.AlignRight(m3, m7, 8); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.UnpackHigh(m2, m0); + t1 = Sse41.Blend(m0.AsUInt16(), m5.AsUInt16(), 0b_1100_1100).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Ssse3.AlignRight(m6, m1, 8); + t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_1100_1100).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 3 + t0 = Ssse3.AlignRight(m6, m5, 8); + t1 = Sse2.UnpackHigh(m2, m7); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m4, m0); + t1 = Sse41.Blend(m1.AsUInt16(), m6.AsUInt16(), 0b_1100_1100).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Ssse3.AlignRight(m5, m4, 8); + t1 = Sse2.UnpackHigh(m1, m3); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m2, m7); + t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_1100_1100).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 4 + t0 = Sse2.UnpackHigh(m3, m1); + t1 = Sse2.UnpackHigh(m6, m5); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m4, m0); + t1 = Sse2.UnpackLow(m6, m7); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Ssse3.AlignRight(m1, m7, 8); + t1 = Ssse3.Shuffle(m2.AsByte(), r16).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m4, m3); + t1 = Sse2.UnpackLow(m5, m0); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 5 + t0 = Sse2.UnpackHigh(m4, m2); + t1 = Sse2.UnpackLow(m1, m5); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_1100_1100).AsUInt32(); + t1 = Sse41.Blend(m2.AsUInt16(), m7.AsUInt16(), 0b_1100_1100).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Ssse3.AlignRight(m7, m1, 8); + t1 = Ssse3.AlignRight(m3, m5, 8); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m6, m0); + 
t1 = Sse2.UnpackLow(m6, m4); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 6 + t0 = Sse2.UnpackLow(m1, m3); + t1 = Sse2.UnpackLow(m0, m4); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m6, m5); + t1 = Sse2.UnpackHigh(m5, m1); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Ssse3.AlignRight(m2, m0, 8); + t1 = Sse2.UnpackHigh(m3, m7); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m4, m6); + t1 = Ssse3.AlignRight(m7, m2, 8); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 7 + t0 = Sse41.Blend(m6.AsUInt16(), m0.AsUInt16(), 0b_1100_1100).AsUInt32(); + t1 = Sse2.UnpackLow(m7, m2); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m2, m7); + t1 = Ssse3.AlignRight(m5, m6, 8); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.UnpackLow(m4, m0); + t1 = Sse41.Blend(m3.AsUInt16(), m4.AsUInt16(), 0b_1100_1100).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m5, m3); + t1 = Ssse3.Shuffle(m1.AsByte(), r16).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 8 + t0 = Sse2.UnpackHigh(m6, m3); + t1 = Sse41.Blend(m6.AsUInt16(), m1.AsUInt16(), 0b_1100_1100).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Ssse3.AlignRight(m7, m5, 8); + t1 = Sse2.UnpackHigh(m0, m4); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_1100_1100).AsUInt32(); + t1 = Ssse3.AlignRight(m4, m7, 8); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m5, m0); + t1 = Sse2.UnpackLow(m2, m3); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 9 + t0 = Sse2.UnpackLow(m3, m7); + t1 = Ssse3.AlignRight(m0, m5, 8); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m7, m4); + t1 = Ssse3.AlignRight(m4, m1, 8); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref 
row3, ref row4);
+
+ t0 = Sse2.UnpackLow(m5, m6);
+ t1 = Sse2.UnpackHigh(m6, m0);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+
+ t0 = Ssse3.AlignRight(m1, m2, 8);
+ t1 = Ssse3.AlignRight(m2, m3, 8);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+
+ Undiagonalize(ref row1, ref row3, ref row4);
+
+ //ROUND 10
+ t0 = Sse2.UnpackLow(m5, m4);
+ t1 = Sse2.UnpackHigh(m3, m0);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+
+ t0 = Sse2.UnpackLow(m1, m2);
+ t1 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_1100_1100).AsUInt32();
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+
+ Diagonalize(ref row1, ref row3, ref row4);
+
+ t0 = Sse2.UnpackHigh(m6, m7);
+ t1 = Sse2.UnpackHigh(m4, m1);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+
+ t0 = Sse41.Blend(m0.AsUInt16(), m5.AsUInt16(), 0b_1100_1100).AsUInt32();
+ t1 = Sse2.UnpackLow(m7, m6);
+ b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_1111_0000).AsUInt32();
+
+ G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+
+ Undiagonalize(ref row1, ref row3, ref row4);
+ #endregion
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void Diagonalize(ref Vector128 row1, ref Vector128 row3, ref Vector128 row4)
+ {
+ unchecked
+ {
+ // +-------------------+
+ // | 0 | 1 | 2 | 3 |
+ // +-------------------+
+ // | 8 | 9 | 10 | 11 |
+ // +-------------------+
+ // | 12 | 13 | 14 | 15 |
+ // +-------------------+
+ // --->
+ // +-------------------+
+ // | 3 | 0 | 1 | 2 |
+ // +-------------------+
+ // | 9 | 10 | 11 | 8 |
+ // +-------------------+
+ // | 14 | 15 | 12 | 13 |
+ // +-------------------+
+
+ row1 = Sse2.Shuffle(row1, 0b_10_01_00_11);
+ row3 = Sse2.Shuffle(row3, 0b_00_11_10_01);
+ row4 = Sse2.Shuffle(row4, 0b_01_00_11_10);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void G1(Vector128 r16, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4, Vector128 b0)
+ {
+ unchecked
+ {
+ row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
+ row4 = Sse2.Xor(row4, row1);
+ row4 = Ssse3.Shuffle(row4.AsByte(), r16).AsUInt32();
+
+ row3 = Sse2.Add(row3, row4);
+ row2 = Sse2.Xor(row2, row3);
+ row2 = RotateElement(row2, 12);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void G2(Vector128 r8, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4, Vector128 b0)
+ {
+ unchecked
+ {
+ row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
+ row4 = Sse2.Xor(row4, row1);
+ row4 = Ssse3.Shuffle(row4.AsByte(), r8).AsUInt32();
+
+ row3 = Sse2.Add(row3, row4);
+ row2 = Sse2.Xor(row2, row3);
+ row2 = RotateElement(row2, 7);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ static Vector128 RotateElement(Vector128 vector, byte shift)
+ {
+ Debug.Assert(shift < sizeof(uint) * 8);
+ // Rotate each 32-bit element right by 'shift' bits, matching the
+ // rotations used by G1 (rotr 12) and G2 (rotr 7).
+ return Sse2.Or(Sse2.ShiftRightLogical(vector, shift), Sse2.ShiftLeftLogical(vector, (byte)(sizeof(uint) * 8 - shift)));
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void Undiagonalize(ref Vector128 row1, ref Vector128 row3, ref Vector128 row4)
+ {
+ unchecked
+ {
+ // +-------------------+
+ // | 3 | 0 | 1 | 2 |
+ // +-------------------+ + // | 9 | 10 | 11 | 8 | + // +-------------------+ + // | 14 | 15 | 12 | 13 | + // +-------------------+ + // ---> + // +-------------------+ + // | 0 | 1 | 2 | 3 | + // +-------------------+ + // | 8 | 9 | 10 | 11 | + // +-------------------+ + // | 12 | 13 | 14 | 15 | + // +-------------------+ + + row1 = Sse2.Shuffle(row1, 0b_00_11_10_01); + row3 = Sse2.Shuffle(row3, 0b_10_01_00_11); + row4 = Sse2.Shuffle(row4, 0b_01_00_11_10); + } + } + } +} +#endif diff --git a/crypto/src/crypto/util/VectorExtensions.cs b/crypto/src/crypto/util/VectorExtensions.cs index f57f8e3f40..8abac35047 100644 --- a/crypto/src/crypto/util/VectorExtensions.cs +++ b/crypto/src/crypto/util/VectorExtensions.cs @@ -9,6 +9,16 @@ namespace Org.BouncyCastle.Crypto.Digests { internal static class VectorExtensions { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 BroadcastVector64ToVector128(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length == Unsafe.SizeOf>()); + + var vector = MemoryMarshal.Read>(source); + Vector128 result = vector.ToVector128Unsafe(); + return result.WithUpper(vector); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 BroadcastVector128ToVector256(ReadOnlySpan source) where T : struct { @@ -19,6 +29,13 @@ public static Vector256 BroadcastVector128ToVector256(ReadOnlySpan s return result.WithUpper(vector); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 LoadVector128(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length == Unsafe.SizeOf>()); + return MemoryMarshal.Read>(source); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 LoadVector256(ReadOnlySpan source) where T : struct { @@ -26,6 +43,13 @@ public static Vector256 LoadVector256(ReadOnlySpan source) where T : return MemoryMarshal.Read>(source); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Store(this Vector128 vector, Span destination) where T : struct + { + Debug.Assert(destination.Length == Unsafe.SizeOf>()); + MemoryMarshal.Write(destination, ref vector); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Store(this Vector256 vector, Span destination) where T : struct { From 1eb1d8eca6e750dabb098d9afe177f771f60b8e7 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 19 Nov 2022 19:34:46 +0000 Subject: [PATCH 06/14] Feat add Blake2s intrinsics --- crypto/src/crypto/digests/Blake2sDigest.cs | 7 + crypto/src/crypto/digests/Blake2s_X86.cs | 474 +++++++++++++++++++++ crypto/src/crypto/util/VectorExtensions.cs | 30 +- 3 files changed, 508 insertions(+), 3 deletions(-) create mode 100644 crypto/src/crypto/digests/Blake2s_X86.cs diff --git a/crypto/src/crypto/digests/Blake2sDigest.cs b/crypto/src/crypto/digests/Blake2sDigest.cs index a6ee75af53..f1d332130e 100644 --- a/crypto/src/crypto/digests/Blake2sDigest.cs +++ b/crypto/src/crypto/digests/Blake2sDigest.cs @@ -555,6 +555,13 @@ public void Reset() #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER private void Compress(ReadOnlySpan message) { +#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER + if(System.Runtime.Intrinsics.X86.Sse41.IsSupported && BitConverter.IsLittleEndian) + { + Blake2s_X86.Compress(f0 == uint.MaxValue, chainValue, message, t0, t1, blake2s_IV); + return; + } +#endif InitializeInternalState(); Span m = stackalloc uint[16]; diff --git a/crypto/src/crypto/digests/Blake2s_X86.cs 
b/crypto/src/crypto/digests/Blake2s_X86.cs new file mode 100644 index 0000000000..da31e0872e --- /dev/null +++ b/crypto/src/crypto/digests/Blake2s_X86.cs @@ -0,0 +1,474 @@ +#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Diagnostics; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace Org.BouncyCastle.Crypto.Digests +{ + // License from the original code created by Clinton Ingram (saucecontrol) for Blake2Fast + // at https://github.com/saucecontrol/Blake2Fast. The code has been copied and modified. + + // The MIT License + + // Copyright(c) 2018-2021 Clinton Ingram + + // Permission is hereby granted, free of charge, to any person obtaining a copy + // of this software and associated documentation files (the "Software"), to deal + // in the Software without restriction, including without limitation the rights + // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + // copies of the Software, and to permit persons to whom the Software is + // furnished to do so, subject to the following conditions: + + // The above copyright notice and this permission notice shall be included in + // all copies or substantial portions of the Software. + + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + // THE SOFTWARE. + + internal static class Blake2s_X86 + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, uint totalSegmentsLow, uint totalSegmentsHigh, ReadOnlySpan blakeIV) + { + if(!Sse41.IsSupported || !BitConverter.IsLittleEndian) + throw new PlatformNotSupportedException(nameof(Blake2s_X86)); + + Debug.Assert(dataBuffer.Length >= Unsafe.SizeOf() * 16); + Debug.Assert(hashBuffer.Length >= 8); + + unchecked + { + Vector128 r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); + Vector128 r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + var hashBytes = MemoryMarshal.AsBytes(hashBuffer); + var ivBytes = MemoryMarshal.AsBytes(blakeIV); + + var r_14 = isFinal ? 
uint.MaxValue : 0; + var t_0 = Vector128.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); + + Vector128 row1 = VectorExtensions.LoadVector128(hashBytes); + Vector128 row2 = VectorExtensions.LoadVector128(hashBytes[Vector128.Count..]); + Vector128 row3 = VectorExtensions.LoadVector128(ivBytes); + Vector128 row4 = VectorExtensions.LoadVector128(ivBytes[Vector128.Count..]); + row4 = Sse2.Xor(row4, t_0); + + Vector128 orig_1 = row1; + Vector128 orig_2 = row2; + + Perform10Rounds(r8, r16, dataBuffer, ref row1, ref row2, ref row3, ref row4); + + row1 = Sse2.Xor(row1, row3); + row2 = Sse2.Xor(row2, row4); + row1 = Sse2.Xor(row1, orig_1); + row2 = Sse2.Xor(row2, orig_2); + + row1.Store(hashBytes); + row2.Store(hashBytes[Vector128.Count..]); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Perform10Rounds(Vector128 r8, Vector128 r16, ReadOnlySpan m, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4) + { + Debug.Assert(m.Length >= Unsafe.SizeOf() * 16); + + #region Rounds + var m0 = VectorExtensions.LoadVector128(m); + var m1 = VectorExtensions.LoadVector128(m[Vector128.Count..]); + var m2 = VectorExtensions.LoadVector128(m[(Vector128.Count * 2)..]); + var m3 = VectorExtensions.LoadVector128(m[(Vector128.Count * 3)..]); + + //ROUND 1 + var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32(); + + //G2 + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + var t0 = Sse2.Shuffle(m2, 0b_11_10_00_01); + var t1 = Sse2.Shuffle(m3, 0b_00_01_11_10); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_00_00_11).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32(); + b0 = Sse2.Shuffle(t0, 0b_10_11_00_01); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 2 + t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_00_11_00).AsUInt32(); + t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); + var t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.Shuffle(m2, 0b_00_00_10_00); + t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4); + t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); + t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_11_00_01_10); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m0, m1); + t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_11_00_01_10); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 3 + t0 = Sse2.UnpackHigh(m2, m3); + t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); 
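// Editorial note: each t0/t1/t2 chain in these rounds gathers the four
// message words that the BLAKE2s sigma permutation assigns to the current
// half-round; the closing Shuffle/Blend drops them into the lane order
// consumed by G1/G2.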
+ b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m2, m0); + t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); + t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8); + b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_11_11_00).AsUInt32(); + t1 = Sse2.ShiftRightLogical128BitLane(m1, 12); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_00_11_10_01); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4); + t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_01_10_11_00); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 4 + t0 = Sse2.UnpackHigh(m0, m1); + t1 = Sse2.UnpackHigh(t0, m2); + t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8); + t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); + t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); + t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t1, 0b_00_01_10_11); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Ssse3.AlignRight(m0, m1, 4); + b0 = Sse41.Blend(t0.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 5 + t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); + t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32(); + t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32(); + b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32(); + t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32(); + t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); + t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8); + t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_00_11_01); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 6 + t0 = Sse2.UnpackHigh(m0, m1); + t1 = Sse2.UnpackLow(m0, m2); + b0 = 
Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.ShiftRightLogical128BitLane(m2, 4); + t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_00_00_11).AsUInt32(); + b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); + t1 = Sse2.ShiftRightLogical128BitLane(m3, 4); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m2.AsUInt64(), m1.AsUInt64()).AsUInt32(); + t1 = Sse2.Shuffle(m3, 0b_10_00_01_00); + t2 = Sse2.ShiftRightLogical128BitLane(t0, 4); + b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 7 + t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12); + t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32(); + t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_01_11_00); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); + t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m1, m2); + t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32(); + b0 = Sse2.Shuffle(t1, 0b_00_01_10_11); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 8 + t0 = Sse2.UnpackHigh(m0, m1); + t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); + b0 = Sse2.Shuffle(t1, 0b_10_00_11_01); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_00).AsUInt32(); + t1 = Sse2.ShiftRightLogical128BitLane(m0, 4); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_01_00_10_11); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32(); + t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_11_01_00); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackLow(m0, m1); + t1 = Sse2.UnpackHigh(m1, m2); + t2 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 9 + t0 = Sse2.UnpackHigh(m1, m3); + t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32(); + t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + 
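// Editorial note: ShuffleHigh below reorders only the upper four 16-bit
// lanes, i.e. it swaps the top two 32-bit message words of t2 while leaving
// the low half in place.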
b0 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32(); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.UnpackHigh(m0, m3); + t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t1, 0b_00_10_01_11); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.UnpackLow(m0.AsUInt64(), m3.AsUInt64()).AsUInt32(); + t1 = Sse2.ShiftRightLogical128BitLane(m2, 8); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_01_11_10_00); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t0, 0b_00_11_10_01); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + + //ROUND 10 + t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); + t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32(); + t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_00_11_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_01_11_00_10); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4); + t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + b0 = Sse2.Shuffle(t1, 0b_01_10_00_11); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Diagonalize(ref row1, ref row3, ref row4); + + t0 = Sse2.UnpackHigh(m0, m3); + t1 = Sse2.UnpackLow(m2, m3); + t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_00_10_01_11); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + + t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); + t1 = Sse2.UnpackLow(m0, m3); + t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); + b0 = Sse2.Shuffle(t2, 0b_01_10_11_00); + + G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + + Undiagonalize(ref row1, ref row3, ref row4); + #endregion + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Diagonalize(ref Vector128 row1, ref Vector128 row3, ref Vector128 row4) + { + // +-------------------+ + // | 0 | 1 | 2 | 3 | + // +-------------------+ + // | 8 | 9 | 10 | 11 | + // +-------------------+ + // | 12 | 13 | 14 | 15 | + // +-------------------+ + // ---> + // +-------------------+ + // | 3 | 0 | 1 | 2 | + // +-------------------+ + // | 9 | 10 | 11 | 8 | + // +-------------------+ + // | 14 | 15 | 12 | 13 | + // +-------------------+ + + row1 = Sse2.Shuffle(row1, 0b_10_01_00_11); + row3 = Sse2.Shuffle(row3, 0b_00_11_10_01); + row4 = Sse2.Shuffle(row4, 0b_01_00_11_10); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void G1(Vector128 r16, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4, Vector128 b0) + { + row1 = Sse2.Add(Sse2.Add(row1, b0), row2); + row4 = Sse2.Xor(row4, row1); + row4 = Ssse3.Shuffle(row4.AsByte(), r16).AsUInt32(); + + row3 = Sse2.Add(row3, row4); + row2 = Sse2.Xor(row2, row3); + row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 32 - 12)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void G2(Vector128 r8, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4, Vector128 b0) + { + row1 = Sse2.Add(Sse2.Add(row1, b0), row2); + row4 = Sse2.Xor(row4, row1); + row4 = Ssse3.Shuffle(row4.AsByte(), 
r8).AsUInt32(); + + row3 = Sse2.Add(row3, row4); + row2 = Sse2.Xor(row2, row3); + row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 32 - 7)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Undiagonalize(ref Vector128 row1, ref Vector128 row3, ref Vector128 row4) + { + // +-------------------+ + // | 3 | 0 | 1 | 2 | + // +-------------------+ + // | 9 | 10 | 11 | 8 | + // +-------------------+ + // | 14 | 15 | 12 | 13 | + // +-------------------+ + // ---> + // +-------------------+ + // | 0 | 1 | 2 | 3 | + // +-------------------+ + // | 8 | 9 | 10 | 11 | + // +-------------------+ + // | 12 | 13 | 14 | 15 | + // +-------------------+ + + row1 = Sse2.Shuffle(row1, 0b_00_11_10_01); + row3 = Sse2.Shuffle(row3, 0b_10_01_00_11); + row4 = Sse2.Shuffle(row4, 0b_01_00_11_10); + } + } +} +#endif diff --git a/crypto/src/crypto/util/VectorExtensions.cs b/crypto/src/crypto/util/VectorExtensions.cs index f57f8e3f40..36dcce31be 100644 --- a/crypto/src/crypto/util/VectorExtensions.cs +++ b/crypto/src/crypto/util/VectorExtensions.cs @@ -9,27 +9,51 @@ namespace Org.BouncyCastle.Crypto.Digests { internal static class VectorExtensions { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 BroadcastVector64ToVector128(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length >= Unsafe.SizeOf>()); + + var vector = MemoryMarshal.Read>(source); + Vector128 result = vector.ToVector128Unsafe(); + return result.WithUpper(vector); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 BroadcastVector128ToVector256(ReadOnlySpan source) where T : struct { - Debug.Assert(source.Length == Unsafe.SizeOf>()); + Debug.Assert(source.Length >= Unsafe.SizeOf>()); var vector = MemoryMarshal.Read>(source); Vector256 result = vector.ToVector256Unsafe(); return result.WithUpper(vector); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 LoadVector128(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length >= Unsafe.SizeOf>()); + return MemoryMarshal.Read>(source); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 LoadVector256(ReadOnlySpan source) where T : struct { - Debug.Assert(source.Length == Unsafe.SizeOf>()); + Debug.Assert(source.Length >= Unsafe.SizeOf>()); return MemoryMarshal.Read>(source); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Store(this Vector128 vector, Span destination) where T : struct + { + Debug.Assert(destination.Length >= Unsafe.SizeOf>()); + MemoryMarshal.Write(destination, ref vector); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Store(this Vector256 vector, Span destination) where T : struct { - Debug.Assert(destination.Length == Unsafe.SizeOf>()); + Debug.Assert(destination.Length >= Unsafe.SizeOf>()); MemoryMarshal.Write(destination, ref vector); } } From 3d1cbd65d0ec5e675bac5383afda6783a3b62d61 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sat, 19 Nov 2022 23:13:30 +0000 Subject: [PATCH 07/14] Code cleanup --- crypto/src/crypto/digests/Blake2b_X86.cs | 632 ++++++++++----------- crypto/src/crypto/digests/Blake2sDigest.cs | 2 +- crypto/src/crypto/digests/Blake2s_X86.cs | 97 ++-- 3 files changed, 339 insertions(+), 392 deletions(-) diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs index fe132f3afb..5c0f10a581 100644 --- a/crypto/src/crypto/digests/Blake2b_X86.cs +++ 
b/crypto/src/crypto/digests/Blake2b_X86.cs @@ -1,10 +1,10 @@ #if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER using System; +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Diagnostics; -using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; namespace Org.BouncyCastle.Crypto.Digests { @@ -36,44 +36,41 @@ namespace Org.BouncyCastle.Crypto.Digests internal static class Blake2b_X86 { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan blakeIV) + public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan message, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan blakeIV) { - if(!Avx2.IsSupported || !BitConverter.IsLittleEndian) + if (!Avx2.IsSupported || !BitConverter.IsLittleEndian) throw new PlatformNotSupportedException(nameof(Blake2b_X86)); - Debug.Assert(dataBuffer.Length >= 128); + Debug.Assert(message.Length >= Unsafe.SizeOf() * 8); Debug.Assert(hashBuffer.Length >= 8); - unchecked - { - Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); - var hashBytes = MemoryMarshal.AsBytes(hashBuffer); - var ivBytes = MemoryMarshal.AsBytes(blakeIV); + var hashBytes = MemoryMarshal.AsBytes(hashBuffer); + var ivBytes = MemoryMarshal.AsBytes(blakeIV); - var r_14 = isFinal ? ulong.MaxValue : 0; - var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); + var r_14 = isFinal ? 
ulong.MaxValue : 0; + var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); - Vector256 row1 = VectorExtensions.LoadVector256(hashBytes); - Vector256 row2 = VectorExtensions.LoadVector256(hashBytes[Vector256.Count..]); - Vector256 row3 = VectorExtensions.LoadVector256(ivBytes); - Vector256 row4 = VectorExtensions.LoadVector256(ivBytes[Vector256.Count..]); - row4 = Avx2.Xor(row4, t_0); + Vector256 row1 = VectorExtensions.LoadVector256(hashBytes); + Vector256 row2 = VectorExtensions.LoadVector256(hashBytes[Vector256.Count..]); + Vector256 row3 = VectorExtensions.LoadVector256(ivBytes); + Vector256 row4 = VectorExtensions.LoadVector256(ivBytes[Vector256.Count..]); + row4 = Avx2.Xor(row4, t_0); - Vector256 orig_1 = row1; - Vector256 orig_2 = row2; + Vector256 orig_1 = row1; + Vector256 orig_2 = row2; - Perform12Rounds(r24, r16, dataBuffer, ref row1, ref row2, ref row3, ref row4); + Perform12Rounds(r24, r16, message, ref row1, ref row2, ref row3, ref row4); - row1 = Avx2.Xor(row1, row3); - row2 = Avx2.Xor(row2, row4); - row1 = Avx2.Xor(row1, orig_1); - row2 = Avx2.Xor(row2, orig_2); + row1 = Avx2.Xor(row1, row3); + row2 = Avx2.Xor(row2, row4); + row1 = Avx2.Xor(row1, orig_1); + row2 = Avx2.Xor(row2, orig_2); - row1.Store(hashBytes); - row2.Store(hashBytes[Vector256.Count..]); - } + row1.Store(hashBytes); + row2.Store(hashBytes[Vector256.Count..]); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -81,452 +78,421 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re { Debug.Assert(m.Length >= 128); - unchecked - { - #region Rounds - //ROUND 1 - var m0 = VectorExtensions.BroadcastVector128ToVector256(m); - var m1 = VectorExtensions.BroadcastVector128ToVector256(m[Unsafe.SizeOf>()..]); - var m2 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 2)..]); - var m3 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 3)..]); + #region Rounds + //ROUND 1 + var m0 = VectorExtensions.BroadcastVector128ToVector256(m); + var m1 = VectorExtensions.BroadcastVector128ToVector256(m[Unsafe.SizeOf>()..]); + var m2 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 2)..]); + var m3 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 3)..]); - var t0 = Avx2.UnpackLow(m0, m1); - var t1 = Avx2.UnpackLow(m2, m3); - var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + var t0 = Avx2.UnpackLow(m0, m1); + var t1 = Avx2.UnpackLow(m2, m3); + var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m0, m1); - t1 = Avx2.UnpackHigh(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - var m4 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); - var m5 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 5)..]); - var m6 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 6)..]); - var m7 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 7)..]); + var m4 = 
From d86680b375f06c8a8bea79392d2a61b5dd528b7d Mon Sep 17 00:00:00 2001
From: Timothy Makkison
Date: Sat, 19 Nov 2022 23:13:30 +0000
Subject: [PATCH 08/14] Code cleanup

---
 crypto/src/crypto/digests/Blake2b_X86.cs   | 632 ++++++++++-----------
 crypto/src/crypto/digests/Blake2sDigest.cs |   2 +-
 crypto/src/crypto/digests/Blake2s_X86.cs   |  97 ++--
 crypto/src/crypto/util/VectorExtensions.cs |  10 -
 4 files changed, 339 insertions(+), 402 deletions(-)

diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs
index fe132f3afb..5c0f10a581 100644
--- a/crypto/src/crypto/digests/Blake2b_X86.cs
+++ b/crypto/src/crypto/digests/Blake2b_X86.cs
@@ -1,10 +1,10 @@
 #if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER
 using System;
+using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-using System.Diagnostics;
-using System.Runtime.Intrinsics.X86;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 
 namespace Org.BouncyCastle.Crypto.Digests
 {
@@ -36,44 +36,41 @@ namespace Org.BouncyCastle.Crypto.Digests
     internal static class Blake2b_X86
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void Compress(bool isFinal, Span<ulong> hashBuffer, ReadOnlySpan<byte> dataBuffer, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan<ulong> blakeIV)
+        public static void Compress(bool isFinal, Span<ulong> hashBuffer, ReadOnlySpan<byte> message, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan<ulong> blakeIV)
         {
-            if(!Avx2.IsSupported || !BitConverter.IsLittleEndian)
+            if (!Avx2.IsSupported || !BitConverter.IsLittleEndian)
                 throw new PlatformNotSupportedException(nameof(Blake2b_X86));
 
-            Debug.Assert(dataBuffer.Length >= 128);
+            Debug.Assert(message.Length >= Unsafe.SizeOf() * 8);
             Debug.Assert(hashBuffer.Length >= 8);
 
-            unchecked
-            {
-                Vector256<byte> r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
-                Vector256<byte> r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
+            Vector256<byte> r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
+            Vector256<byte> r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
 
-                var hashBytes = MemoryMarshal.AsBytes(hashBuffer);
-                var ivBytes = MemoryMarshal.AsBytes(blakeIV);
+            var hashBytes = MemoryMarshal.AsBytes(hashBuffer);
+            var ivBytes = MemoryMarshal.AsBytes(blakeIV);
 
-                var r_14 = isFinal ? ulong.MaxValue : 0;
-                var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0);
+            var r_14 = isFinal ?
ulong.MaxValue : 0; + var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); - Vector256 row1 = VectorExtensions.LoadVector256(hashBytes); - Vector256 row2 = VectorExtensions.LoadVector256(hashBytes[Vector256.Count..]); - Vector256 row3 = VectorExtensions.LoadVector256(ivBytes); - Vector256 row4 = VectorExtensions.LoadVector256(ivBytes[Vector256.Count..]); - row4 = Avx2.Xor(row4, t_0); + Vector256 row1 = VectorExtensions.LoadVector256(hashBytes); + Vector256 row2 = VectorExtensions.LoadVector256(hashBytes[Vector256.Count..]); + Vector256 row3 = VectorExtensions.LoadVector256(ivBytes); + Vector256 row4 = VectorExtensions.LoadVector256(ivBytes[Vector256.Count..]); + row4 = Avx2.Xor(row4, t_0); - Vector256 orig_1 = row1; - Vector256 orig_2 = row2; + Vector256 orig_1 = row1; + Vector256 orig_2 = row2; - Perform12Rounds(r24, r16, dataBuffer, ref row1, ref row2, ref row3, ref row4); + Perform12Rounds(r24, r16, message, ref row1, ref row2, ref row3, ref row4); - row1 = Avx2.Xor(row1, row3); - row2 = Avx2.Xor(row2, row4); - row1 = Avx2.Xor(row1, orig_1); - row2 = Avx2.Xor(row2, orig_2); + row1 = Avx2.Xor(row1, row3); + row2 = Avx2.Xor(row2, row4); + row1 = Avx2.Xor(row1, orig_1); + row2 = Avx2.Xor(row2, orig_2); - row1.Store(hashBytes); - row2.Store(hashBytes[Vector256.Count..]); - } + row1.Store(hashBytes); + row2.Store(hashBytes[Vector256.Count..]); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -81,452 +78,421 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re { Debug.Assert(m.Length >= 128); - unchecked - { - #region Rounds - //ROUND 1 - var m0 = VectorExtensions.BroadcastVector128ToVector256(m); - var m1 = VectorExtensions.BroadcastVector128ToVector256(m[Unsafe.SizeOf>()..]); - var m2 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 2)..]); - var m3 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 3)..]); + #region Rounds + //ROUND 1 + var m0 = VectorExtensions.BroadcastVector128ToVector256(m); + var m1 = VectorExtensions.BroadcastVector128ToVector256(m[Unsafe.SizeOf>()..]); + var m2 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 2)..]); + var m3 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 3)..]); - var t0 = Avx2.UnpackLow(m0, m1); - var t1 = Avx2.UnpackLow(m2, m3); - var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + var t0 = Avx2.UnpackLow(m0, m1); + var t1 = Avx2.UnpackLow(m2, m3); + var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m0, m1); - t1 = Avx2.UnpackHigh(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - var m4 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); - var m5 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 5)..]); - var m6 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 6)..]); - var m7 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 7)..]); + var m4 = 
VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); + var m5 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 5)..]); + var m6 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 6)..]); + var m7 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 7)..]); - t0 = Avx2.UnpackLow(m7, m4); - t1 = Avx2.UnpackLow(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m7, m4); + t1 = Avx2.UnpackLow(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m7, m4); - t1 = Avx2.UnpackHigh(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.UnpackHigh(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 2 - t0 = Avx2.UnpackLow(m7, m2); - t1 = Avx2.UnpackHigh(m4, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 2 + t0 = Avx2.UnpackLow(m7, m2); + t1 = Avx2.UnpackHigh(m4, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackLow(m5, m4); - t1 = Avx2.AlignRight(m3, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.AlignRight(m3, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.UnpackHigh(m2, m0); - t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m2, m0); + t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.AlignRight(m6, m1, 8); - t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m6, m1, 8); + t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 3 - t0 = Avx2.AlignRight(m6, m5, 8); - t1 = Avx2.UnpackHigh(m2, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 3 + t0 = Avx2.AlignRight(m6, m5, 8); + t1 = Avx2.UnpackHigh(m2, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = 
Avx2.UnpackLow(m4, m0); - t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.AlignRight(m5, m4, 8); - t1 = Avx2.UnpackHigh(m1, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m5, m4, 8); + t1 = Avx2.UnpackHigh(m1, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackLow(m2, m7); - t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m2, m7); + t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 4 - t0 = Avx2.UnpackHigh(m3, m1); - t1 = Avx2.UnpackHigh(m6, m5); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 4 + t0 = Avx2.UnpackHigh(m3, m1); + t1 = Avx2.UnpackHigh(m6, m5); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m4, m0); - t1 = Avx2.UnpackLow(m6, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m4, m0); + t1 = Avx2.UnpackLow(m6, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.AlignRight(m1, m7, 8); - t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m1, m7, 8); + t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackLow(m4, m3); - t1 = Avx2.UnpackLow(m5, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m4, m3); + t1 = Avx2.UnpackLow(m5, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 5 - t0 = Avx2.UnpackHigh(m4, m2); - t1 = Avx2.UnpackLow(m1, m5); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 5 + t0 = Avx2.UnpackHigh(m4, m2); + t1 = Avx2.UnpackLow(m1, m5); + b0 = 
Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.AlignRight(m7, m1, 8); - t1 = Avx2.AlignRight(m3, m5, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m7, m1, 8); + t1 = Avx2.AlignRight(m3, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m6, m0); - t1 = Avx2.UnpackLow(m6, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m6, m0); + t1 = Avx2.UnpackLow(m6, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 6 - t0 = Avx2.UnpackLow(m1, m3); - t1 = Avx2.UnpackLow(m0, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 6 + t0 = Avx2.UnpackLow(m1, m3); + t1 = Avx2.UnpackLow(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackLow(m6, m5); - t1 = Avx2.UnpackHigh(m5, m1); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m6, m5); + t1 = Avx2.UnpackHigh(m5, m1); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.AlignRight(m2, m0, 8); - t1 = Avx2.UnpackHigh(m3, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m2, m0, 8); + t1 = Avx2.UnpackHigh(m3, m7); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m4, m6); - t1 = Avx2.AlignRight(m7, m2, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m4, m6); + t1 = Avx2.AlignRight(m7, m2, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 7 - t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = 
Avx2.UnpackLow(m7, m2); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 7 + t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.UnpackLow(m7, m2); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m2, m7); - t1 = Avx2.AlignRight(m5, m6, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m2, m7); + t1 = Avx2.AlignRight(m5, m6, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.UnpackLow(m4, m0); - t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m4, m0); + t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m5, m3); - t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m5, m3); + t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 8 - t0 = Avx2.UnpackHigh(m6, m3); - t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 8 + t0 = Avx2.UnpackHigh(m6, m3); + t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.AlignRight(m7, m5, 8); - t1 = Avx2.UnpackHigh(m0, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m7, m5, 8); + t1 = Avx2.UnpackHigh(m0, m4); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = Avx2.AlignRight(m4, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.AlignRight(m4, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackLow(m5, m0); - t1 = Avx2.UnpackLow(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m5, m0); + t1 = Avx2.UnpackLow(m2, m3); + b0 = 
Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 9 - t0 = Avx2.UnpackLow(m3, m7); - t1 = Avx2.AlignRight(m0, m5, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 9 + t0 = Avx2.UnpackLow(m3, m7); + t1 = Avx2.AlignRight(m0, m5, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m7, m4); - t1 = Avx2.AlignRight(m4, m1, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.AlignRight(m4, m1, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.UnpackLow(m5, m6); - t1 = Avx2.UnpackHigh(m6, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m5, m6); + t1 = Avx2.UnpackHigh(m6, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.AlignRight(m1, m2, 8); - t1 = Avx2.AlignRight(m2, m3, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m1, m2, 8); + t1 = Avx2.AlignRight(m2, m3, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 10 - t0 = Avx2.UnpackLow(m5, m4); - t1 = Avx2.UnpackHigh(m3, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 10 + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.UnpackHigh(m3, m0); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackLow(m1, m2); - t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m1, m2); + t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.UnpackHigh(m6, m7); - t1 = Avx2.UnpackHigh(m4, m1); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m6, m7); + t1 = Avx2.UnpackHigh(m4, m1); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - t1 = Avx2.UnpackLow(m7, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
0b_1111_0000).AsUInt64(); + t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + t1 = Avx2.UnpackLow(m7, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 11 - t0 = Avx2.UnpackLow(m0, m1); - t1 = Avx2.UnpackLow(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 11 + t0 = Avx2.UnpackLow(m0, m1); + t1 = Avx2.UnpackLow(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m0, m1); - t1 = Avx2.UnpackHigh(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m0, m1); + t1 = Avx2.UnpackHigh(m2, m3); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.UnpackLow(m7, m4); - t1 = Avx2.UnpackLow(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m7, m4); + t1 = Avx2.UnpackLow(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackHigh(m7, m4); - t1 = Avx2.UnpackHigh(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m7, m4); + t1 = Avx2.UnpackHigh(m5, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Undiagonalize(ref row1, ref row3, ref row4); - //ROUND 12 - t0 = Avx2.UnpackLow(m7, m2); - t1 = Avx2.UnpackHigh(m4, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + //ROUND 12 + t0 = Avx2.UnpackLow(m7, m2); + t1 = Avx2.UnpackHigh(m4, m6); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.UnpackLow(m5, m4); - t1 = Avx2.AlignRight(m3, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackLow(m5, m4); + t1 = Avx2.AlignRight(m3, m7, 8); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + Diagonalize(ref row1, ref row3, ref row4); - t0 = Avx2.UnpackHigh(m2, m0); - t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.UnpackHigh(m2, m0); + t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + G1(r24, ref row1, ref row2, ref row3, ref row4, b0); - t0 = Avx2.AlignRight(m6, m1, 8); - t1 
= Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + t0 = Avx2.AlignRight(m6, m1, 8); + t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); + b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); + G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); - #endregion - } + Undiagonalize(ref row1, ref row3, ref row4); + #endregion } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void Diagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) { - unchecked - { - // +-------------------+ - // | 0 | 1 | 2 | 3 | - // +-------------------+ - // | 8 | 9 | 10 | 11 | - // +-------------------+ - // | 12 | 13 | 14 | 15 | - // +-------------------+ - // ---> - // +-------------------+ - // | 3 | 0 | 1 | 2 | - // +-------------------+ - // | 9 | 10 | 11 | 8 | - // +-------------------+ - // | 14 | 15 | 12 | 13 | - // +-------------------+ - - row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); - row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); - row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); - } + // +-------------------+ +-------------------+ + // | 0 | 1 | 2 | 3 | | 3 | 0 | 1 | 2 | + // +-------------------+ +-------------------+ + // | 8 | 9 | 10 | 11 | ---> | 9 | 10 | 11 | 8 | + // +-------------------+ +-------------------+ + // | 12 | 13 | 14 | 15 | | 14 | 15 | 12 | 13 | + // +-------------------+ +-------------------+ + + row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11); + row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void G1(Vector256 r24, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) { - unchecked - { - row1 = Avx2.Add(Avx2.Add(row1, b0), row2); - row4 = Avx2.Xor(row4, row1); - row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); - - row3 = Avx2.Add(row3, row4); - row2 = Avx2.Xor(row2, row3); - row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); - } + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64(); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void G2(Vector256 r16, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b0) { - unchecked - { - row1 = Avx2.Add(Avx2.Add(row1, b0), row2); - row4 = Avx2.Xor(row4, row1); - row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); - - row3 = Avx2.Add(row3, row4); - row2 = Avx2.Xor(row2, row3); - row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); - } + row1 = Avx2.Add(Avx2.Add(row1, b0), row2); + row4 = Avx2.Xor(row4, row1); + row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64(); + + row3 = Avx2.Add(row3, row4); + row2 = Avx2.Xor(row2, row3); + row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void Undiagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) { - unchecked - { - // +-------------------+ - // | 3 | 0 | 1 | 2 | - // +-------------------+ - // | 9 | 10 | 11 | 8 | - // +-------------------+ - // | 14 | 15 | 12 | 13 | - // 
+-------------------+ - // ---> - // +-------------------+ - // | 0 | 1 | 2 | 3 | - // +-------------------+ - // | 8 | 9 | 10 | 11 | - // +-------------------+ - // | 12 | 13 | 14 | 15 | - // +-------------------+ - - row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); - row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); - row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); - } + // +-------------------+ +-------------------+ + // | 3 | 0 | 1 | 2 | | 0 | 1 | 2 | 3 | + // +-------------------+ +-------------------+ + // | 9 | 10 | 11 | 8 | ---> | 8 | 9 | 10 | 11 | + // +-------------------+ +-------------------+ + // | 14 | 15 | 12 | 13 | | 12 | 13 | 14 | 15 | + // +-------------------+ +-------------------+ + + row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01); + row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11); + row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10); } } } diff --git a/crypto/src/crypto/digests/Blake2sDigest.cs b/crypto/src/crypto/digests/Blake2sDigest.cs index f1d332130e..69d1ffc708 100644 --- a/crypto/src/crypto/digests/Blake2sDigest.cs +++ b/crypto/src/crypto/digests/Blake2sDigest.cs @@ -556,7 +556,7 @@ public void Reset() private void Compress(ReadOnlySpan message) { #if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER - if(System.Runtime.Intrinsics.X86.Sse41.IsSupported && BitConverter.IsLittleEndian) + if (System.Runtime.Intrinsics.X86.Sse41.IsSupported && BitConverter.IsLittleEndian) { Blake2s_X86.Compress(f0 == uint.MaxValue, chainValue, message, t0, t1, blake2s_IV); return; diff --git a/crypto/src/crypto/digests/Blake2s_X86.cs b/crypto/src/crypto/digests/Blake2s_X86.cs index da31e0872e..35d2a9101b 100644 --- a/crypto/src/crypto/digests/Blake2s_X86.cs +++ b/crypto/src/crypto/digests/Blake2s_X86.cs @@ -1,10 +1,10 @@ #if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER using System; +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Diagnostics; -using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; namespace Org.BouncyCastle.Crypto.Digests { @@ -36,44 +36,41 @@ namespace Org.BouncyCastle.Crypto.Digests internal static class Blake2s_X86 { [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan dataBuffer, uint totalSegmentsLow, uint totalSegmentsHigh, ReadOnlySpan blakeIV) + public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan message, uint totalSegmentsLow, uint totalSegmentsHigh, ReadOnlySpan blakeIV) { - if(!Sse41.IsSupported || !BitConverter.IsLittleEndian) + if (!Sse41.IsSupported || !BitConverter.IsLittleEndian) throw new PlatformNotSupportedException(nameof(Blake2s_X86)); - Debug.Assert(dataBuffer.Length >= Unsafe.SizeOf() * 16); + Debug.Assert(message.Length >= Unsafe.SizeOf() * 8); Debug.Assert(hashBuffer.Length >= 8); - unchecked - { - Vector128 r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); - Vector128 r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + Vector128 r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); + Vector128 r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); - var hashBytes = MemoryMarshal.AsBytes(hashBuffer); - var ivBytes = MemoryMarshal.AsBytes(blakeIV); + var hashBytes = MemoryMarshal.AsBytes(hashBuffer); + var ivBytes = MemoryMarshal.AsBytes(blakeIV); - var r_14 = isFinal ? 
uint.MaxValue : 0; - var t_0 = Vector128.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); + var r_14 = isFinal ? uint.MaxValue : 0; + var t_0 = Vector128.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0); - Vector128 row1 = VectorExtensions.LoadVector128(hashBytes); - Vector128 row2 = VectorExtensions.LoadVector128(hashBytes[Vector128.Count..]); - Vector128 row3 = VectorExtensions.LoadVector128(ivBytes); - Vector128 row4 = VectorExtensions.LoadVector128(ivBytes[Vector128.Count..]); - row4 = Sse2.Xor(row4, t_0); + Vector128 row1 = VectorExtensions.LoadVector128(hashBytes); + Vector128 row2 = VectorExtensions.LoadVector128(hashBytes[Vector128.Count..]); + Vector128 row3 = VectorExtensions.LoadVector128(ivBytes); + Vector128 row4 = VectorExtensions.LoadVector128(ivBytes[Vector128.Count..]); + row4 = Sse2.Xor(row4, t_0); - Vector128 orig_1 = row1; - Vector128 orig_2 = row2; + Vector128 orig_1 = row1; + Vector128 orig_2 = row2; - Perform10Rounds(r8, r16, dataBuffer, ref row1, ref row2, ref row3, ref row4); + Perform10Rounds(r8, r16, message, ref row1, ref row2, ref row3, ref row4); - row1 = Sse2.Xor(row1, row3); - row2 = Sse2.Xor(row2, row4); - row1 = Sse2.Xor(row1, orig_1); - row2 = Sse2.Xor(row2, orig_2); + row1 = Sse2.Xor(row1, row3); + row2 = Sse2.Xor(row2, row4); + row1 = Sse2.Xor(row1, orig_1); + row2 = Sse2.Xor(row2, orig_2); - row1.Store(hashBytes); - row2.Store(hashBytes[Vector128.Count..]); - } + row1.Store(hashBytes); + row2.Store(hashBytes[Vector128.Count..]); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -401,21 +398,13 @@ private static void Perform10Rounds(Vector128 r8, Vector128 r16, Rea [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void Diagonalize(ref Vector128 row1, ref Vector128 row3, ref Vector128 row4) { - // +-------------------+ - // | 0 | 1 | 2 | 3 | - // +-------------------+ - // | 8 | 9 | 10 | 11 | - // +-------------------+ - // | 12 | 13 | 14 | 15 | - // +-------------------+ - // ---> - // +-------------------+ - // | 3 | 0 | 1 | 2 | - // +-------------------+ - // | 9 | 10 | 11 | 8 | - // +-------------------+ - // | 14 | 15 | 12 | 13 | - // +-------------------+ + // +-------------------+ +-------------------+ + // | 0 | 1 | 2 | 3 | | 3 | 0 | 1 | 2 | + // +-------------------+ +-------------------+ + // | 8 | 9 | 10 | 11 | ---> | 9 | 10 | 11 | 8 | + // +-------------------+ +-------------------+ + // | 12 | 13 | 14 | 15 | | 14 | 15 | 12 | 13 | + // +-------------------+ +-------------------+ row1 = Sse2.Shuffle(row1, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_00_11_10_01); @@ -449,21 +438,13 @@ private static void G2(Vector128 r8, ref Vector128 row1, ref Vector1 [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void Undiagonalize(ref Vector128 row1, ref Vector128 row3, ref Vector128 row4) { - // +-------------------+ - // | 3 | 0 | 1 | 2 | - // +-------------------+ - // | 9 | 10 | 11 | 8 | - // +-------------------+ - // | 14 | 15 | 12 | 13 | - // +-------------------+ - // ---> - // +-------------------+ - // | 0 | 1 | 2 | 3 | - // +-------------------+ - // | 8 | 9 | 10 | 11 | - // +-------------------+ - // | 12 | 13 | 14 | 15 | - // +-------------------+ + // +-------------------+ +-------------------+ + // | 3 | 0 | 1 | 2 | | 0 | 1 | 2 | 3 | + // +-------------------+ +-------------------+ + // | 9 | 10 | 11 | 8 | ---> | 8 | 9 | 10 | 11 | + // +-------------------+ +-------------------+ + // | 14 | 15 | 12 | 13 | | 12 | 13 | 14 | 15 | + // +-------------------+ +-------------------+ row1 = 
Sse2.Shuffle(row1, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_10_01_00_11); diff --git a/crypto/src/crypto/util/VectorExtensions.cs b/crypto/src/crypto/util/VectorExtensions.cs index 36dcce31be..49006e95a0 100644 --- a/crypto/src/crypto/util/VectorExtensions.cs +++ b/crypto/src/crypto/util/VectorExtensions.cs @@ -9,16 +9,6 @@ namespace Org.BouncyCastle.Crypto.Digests { internal static class VectorExtensions { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 BroadcastVector64ToVector128(ReadOnlySpan source) where T : struct - { - Debug.Assert(source.Length >= Unsafe.SizeOf>()); - - var vector = MemoryMarshal.Read>(source); - Vector128 result = vector.ToVector128Unsafe(); - return result.WithUpper(vector); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 BroadcastVector128ToVector256(ReadOnlySpan source) where T : struct { From 6a597d3cfaae34be5b5d3e74212d6d81526aa758 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Sun, 20 Nov 2022 23:36:00 +0000 Subject: [PATCH 09/14] Delete VectorExtensions --- crypto/src/crypto/digests/Blake2b_X86.cs | 52 ++++++++++++++++------ crypto/src/crypto/digests/Blake2s_X86.cs | 34 +++++++++----- crypto/src/crypto/util/VectorExtensions.cs | 51 --------------------- 3 files changed, 62 insertions(+), 75 deletions(-) delete mode 100644 crypto/src/crypto/util/VectorExtensions.cs diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs index 5c0f10a581..798e0bd249 100644 --- a/crypto/src/crypto/digests/Blake2b_X86.cs +++ b/crypto/src/crypto/digests/Blake2b_X86.cs @@ -53,10 +53,10 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan row1 = VectorExtensions.LoadVector256(hashBytes); - Vector256 row2 = VectorExtensions.LoadVector256(hashBytes[Vector256.Count..]); - Vector256 row3 = VectorExtensions.LoadVector256(ivBytes); - Vector256 row4 = VectorExtensions.LoadVector256(ivBytes[Vector256.Count..]); + Vector256 row1 = LoadVector256(hashBytes); + Vector256 row2 = LoadVector256(hashBytes[Vector256.Count..]); + Vector256 row3 = LoadVector256(ivBytes); + Vector256 row4 = LoadVector256(ivBytes[Vector256.Count..]); row4 = Avx2.Xor(row4, t_0); Vector256 orig_1 = row1; @@ -69,8 +69,8 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan.Count..]); + Store(row1, hashBytes); + Store(row2, hashBytes[Vector256.Count..]); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -80,10 +80,10 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re #region Rounds //ROUND 1 - var m0 = VectorExtensions.BroadcastVector128ToVector256(m); - var m1 = VectorExtensions.BroadcastVector128ToVector256(m[Unsafe.SizeOf>()..]); - var m2 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 2)..]); - var m3 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 3)..]); + var m0 = BroadcastVector128ToVector256(m); + var m1 = BroadcastVector128ToVector256(m[Unsafe.SizeOf>()..]); + var m2 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 2)..]); + var m3 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 3)..]); var t0 = Avx2.UnpackLow(m0, m1); var t1 = Avx2.UnpackLow(m2, m3); @@ -99,10 +99,10 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re Diagonalize(ref row1, ref row3, ref row4); - var m4 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); - var m5 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 5)..]); - var m6 = 
VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 6)..]); - var m7 = VectorExtensions.BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 7)..]); + var m4 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); + var m5 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 5)..]); + var m6 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 6)..]); + var m7 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 7)..]); t0 = Avx2.UnpackLow(m7, m4); t1 = Avx2.UnpackLow(m5, m6); @@ -494,6 +494,30 @@ private static void Undiagonalize(ref Vector256 row1, ref Vector256 BroadcastVector128ToVector256(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length >= Unsafe.SizeOf>()); + + var vector = MemoryMarshal.Read>(source); + Vector256 result = vector.ToVector256Unsafe(); + return result.WithUpper(vector); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 LoadVector256(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length >= Unsafe.SizeOf>()); + return MemoryMarshal.Read>(source); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Store(Vector256 vector, Span destination) where T : struct + { + Debug.Assert(destination.Length >= Unsafe.SizeOf>()); + MemoryMarshal.Write(destination, ref vector); + } } } #endif diff --git a/crypto/src/crypto/digests/Blake2s_X86.cs b/crypto/src/crypto/digests/Blake2s_X86.cs index 35d2a9101b..81f9767d39 100644 --- a/crypto/src/crypto/digests/Blake2s_X86.cs +++ b/crypto/src/crypto/digests/Blake2s_X86.cs @@ -53,10 +53,10 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan row1 = VectorExtensions.LoadVector128(hashBytes); - Vector128 row2 = VectorExtensions.LoadVector128(hashBytes[Vector128.Count..]); - Vector128 row3 = VectorExtensions.LoadVector128(ivBytes); - Vector128 row4 = VectorExtensions.LoadVector128(ivBytes[Vector128.Count..]); + Vector128 row1 = LoadVector128(hashBytes); + Vector128 row2 = LoadVector128(hashBytes[Vector128.Count..]); + Vector128 row3 = LoadVector128(ivBytes); + Vector128 row4 = LoadVector128(ivBytes[Vector128.Count..]); row4 = Sse2.Xor(row4, t_0); Vector128 orig_1 = row1; @@ -69,8 +69,8 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan.Count..]); + Store(row1, hashBytes); + Store(row2, hashBytes[Vector128.Count..]); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -79,10 +79,10 @@ private static void Perform10Rounds(Vector128 r8, Vector128 r16, Rea Debug.Assert(m.Length >= Unsafe.SizeOf() * 16); #region Rounds - var m0 = VectorExtensions.LoadVector128(m); - var m1 = VectorExtensions.LoadVector128(m[Vector128.Count..]); - var m2 = VectorExtensions.LoadVector128(m[(Vector128.Count * 2)..]); - var m3 = VectorExtensions.LoadVector128(m[(Vector128.Count * 3)..]); + var m0 = LoadVector128(m); + var m1 = LoadVector128(m[Vector128.Count..]); + var m2 = LoadVector128(m[(Vector128.Count * 2)..]); + var m3 = LoadVector128(m[(Vector128.Count * 3)..]); //ROUND 1 var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32(); @@ -450,6 +450,20 @@ private static void Undiagonalize(ref Vector128 row1, ref Vector128 row3 = Sse2.Shuffle(row3, 0b_10_01_00_11); row4 = Sse2.Shuffle(row4, 0b_01_00_11_10); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 LoadVector128(ReadOnlySpan source) where T : struct + { + Debug.Assert(source.Length >= Unsafe.SizeOf>()); + return MemoryMarshal.Read>(source); + } + + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Store(Vector128 vector, Span destination) where T : struct + { + Debug.Assert(destination.Length >= Unsafe.SizeOf>()); + MemoryMarshal.Write(destination, ref vector); + } } } #endif diff --git a/crypto/src/crypto/util/VectorExtensions.cs b/crypto/src/crypto/util/VectorExtensions.cs deleted file mode 100644 index 49006e95a0..0000000000 --- a/crypto/src/crypto/util/VectorExtensions.cs +++ /dev/null @@ -1,51 +0,0 @@ -#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER -using System; -using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; -namespace Org.BouncyCastle.Crypto.Digests -{ - internal static class VectorExtensions - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 BroadcastVector128ToVector256(ReadOnlySpan source) where T : struct - { - Debug.Assert(source.Length >= Unsafe.SizeOf>()); - - var vector = MemoryMarshal.Read>(source); - Vector256 result = vector.ToVector256Unsafe(); - return result.WithUpper(vector); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 LoadVector128(ReadOnlySpan source) where T : struct - { - Debug.Assert(source.Length >= Unsafe.SizeOf>()); - return MemoryMarshal.Read>(source); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 LoadVector256(ReadOnlySpan source) where T : struct - { - Debug.Assert(source.Length >= Unsafe.SizeOf>()); - return MemoryMarshal.Read>(source); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void Store(this Vector128 vector, Span destination) where T : struct - { - Debug.Assert(destination.Length >= Unsafe.SizeOf>()); - MemoryMarshal.Write(destination, ref vector); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void Store(this Vector256 vector, Span destination) where T : struct - { - Debug.Assert(destination.Length >= Unsafe.SizeOf>()); - MemoryMarshal.Write(destination, ref vector); - } - } -} -#endif \ No newline at end of file From 5a53e5b53fd8283b2f9bd8f084c1e38e1c3caa34 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 21 Nov 2022 11:32:47 +0000 Subject: [PATCH 10/14] Update preprocessor and add IsSupported property.
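A note on this commit: dropping NETSTANDARD2_1_OR_GREATER from the guards is the right call, since System.Runtime.Intrinsics only ships with .NET Core 3.0 and later, and routing the capability test through a single IsSupported property gives every caller one gate that the JIT effectively treats as a constant (Avx2.IsSupported, Sse41.IsSupported and BitConverter.IsLittleEndian are all fixed at JIT time, so the losing branch is eliminated entirely). Blake2b_X86 gates on AVX2 because its 256-bit path needs it; the 128-bit Blake2s path only relies on SSE4.1/SSSE3, so its gate tests Sse41.IsSupported instead. A minimal sketch of the pattern follows, with illustrative names (Blake2GateSketch, CompressSimd and CompressScalar are mine, not the patch's):

#if NETCOREAPP3_0_OR_GREATER
using System;
using System.Runtime.Intrinsics.X86;

internal static class Blake2GateSketch
{
    // Both operands behave as JIT-time constants, so a caller branching on
    // IsSupported compiles down to exactly one of the two paths below.
    public static bool IsSupported => Sse41.IsSupported && BitConverter.IsLittleEndian;

    public static void Compress(Span<uint> state, ReadOnlySpan<byte> block)
    {
        if (IsSupported)
        {
            CompressSimd(state, block);    // vectorised path, SSE4.1+
            return;
        }
        CompressScalar(state, block);      // portable fallback
    }

    private static void CompressSimd(Span<uint> state, ReadOnlySpan<byte> block)
    {
        // elided: the intrinsics implementation
    }

    private static void CompressScalar(Span<uint> state, ReadOnlySpan<byte> block)
    {
        // elided: the portable implementation
    }
}
#endif

Keeping the PlatformNotSupportedException throw inside the intrinsics Compress is still worthwhile as a guard against callers that bypass the gate.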
--- crypto/src/crypto/digests/Blake2bDigest.cs | 4 +-- crypto/src/crypto/digests/Blake2b_X86.cs | 30 +++------------------- crypto/src/crypto/digests/Blake2sDigest.cs | 4 +-- crypto/src/crypto/digests/Blake2s_X86.cs | 6 +++-- 4 files changed, 12 insertions(+), 32 deletions(-) diff --git a/crypto/src/crypto/digests/Blake2bDigest.cs b/crypto/src/crypto/digests/Blake2bDigest.cs index 5edf19aefb..245a355df3 100644 --- a/crypto/src/crypto/digests/Blake2bDigest.cs +++ b/crypto/src/crypto/digests/Blake2bDigest.cs @@ -515,8 +515,8 @@ public void Reset() #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER private void Compress(ReadOnlySpan message) { -#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER - if (System.Runtime.Intrinsics.X86.Avx2.IsSupported && BitConverter.IsLittleEndian) +#if NETCOREAPP3_0_OR_GREATER + if (Blake2b_X86.IsSupported) { Blake2b_X86.Compress(f0 == ulong.MaxValue, chainValue, message, t0, t1, blake2b_IV); return; diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs index 798e0bd249..d9088b48a6 100644 --- a/crypto/src/crypto/digests/Blake2b_X86.cs +++ b/crypto/src/crypto/digests/Blake2b_X86.cs @@ -1,4 +1,4 @@ -#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER +#if NETCOREAPP3_0_OR_GREATER using System; using System.Diagnostics; using System.Runtime.CompilerServices; @@ -35,10 +35,12 @@ namespace Org.BouncyCastle.Crypto.Digests internal static class Blake2b_X86 { + public static bool IsSupported => Avx2.IsSupported && BitConverter.IsLittleEndian; + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan message, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan blakeIV) { - if (!Avx2.IsSupported || !BitConverter.IsLittleEndian) + if (!IsSupported) throw new PlatformNotSupportedException(nameof(Blake2b_X86)); Debug.Assert(message.Length >= Unsafe.SizeOf() * 8); @@ -96,7 +98,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); var m4 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); @@ -115,7 +116,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 2 @@ -130,7 +130,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.UnpackHigh(m2, m0); @@ -144,7 +143,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 3 @@ -159,7 +157,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.AlignRight(m5, m4, 8); @@ -173,7 +170,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 
0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 4 @@ -188,7 +184,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.AlignRight(m1, m7, 8); @@ -202,7 +197,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 5 @@ -217,7 +211,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.AlignRight(m7, m1, 8); @@ -231,7 +224,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 6 @@ -246,7 +238,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.AlignRight(m2, m0, 8); @@ -260,7 +251,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 7 @@ -275,7 +265,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.UnpackLow(m4, m0); @@ -289,7 +278,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 8 @@ -304,7 +292,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); @@ -318,7 +305,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 9 @@ -333,7 +319,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.UnpackLow(m5, m6); @@ -347,7 +332,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 10 @@ 
-362,7 +346,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.UnpackHigh(m6, m7); @@ -376,7 +359,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 11 @@ -391,7 +373,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.UnpackLow(m7, m4); @@ -405,7 +386,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); //ROUND 12 @@ -420,7 +400,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); t0 = Avx2.UnpackHigh(m2, m0); @@ -434,7 +413,6 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); #endregion } diff --git a/crypto/src/crypto/digests/Blake2sDigest.cs b/crypto/src/crypto/digests/Blake2sDigest.cs index 69d1ffc708..22fc2ceda8 100644 --- a/crypto/src/crypto/digests/Blake2sDigest.cs +++ b/crypto/src/crypto/digests/Blake2sDigest.cs @@ -555,8 +555,8 @@ public void Reset() #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER private void Compress(ReadOnlySpan message) { -#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER - if (System.Runtime.Intrinsics.X86.Sse41.IsSupported && BitConverter.IsLittleEndian) +#if NETCOREAPP3_0_OR_GREATER + if (Blake2s_X86.IsSupported) { Blake2s_X86.Compress(f0 == uint.MaxValue, chainValue, message, t0, t1, blake2s_IV); return; diff --git a/crypto/src/crypto/digests/Blake2s_X86.cs b/crypto/src/crypto/digests/Blake2s_X86.cs index 81f9767d39..251d7ed781 100644 --- a/crypto/src/crypto/digests/Blake2s_X86.cs +++ b/crypto/src/crypto/digests/Blake2s_X86.cs @@ -1,4 +1,4 @@ -#if NETCOREAPP3_0_OR_GREATER || NETSTANDARD2_1_OR_GREATER +#if NETCOREAPP3_0_OR_GREATER using System; using System.Diagnostics; using System.Runtime.CompilerServices; @@ -35,10 +35,12 @@ namespace Org.BouncyCastle.Crypto.Digests internal static class Blake2s_X86 { + public static bool IsSupported => Sse41.IsSupported && BitConverter.IsLittleEndian; + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan message, uint totalSegmentsLow, uint totalSegmentsHigh, ReadOnlySpan blakeIV) { - if (!Sse41.IsSupported || !BitConverter.IsLittleEndian) + if (!IsSupported) throw new PlatformNotSupportedException(nameof(Blake2s_X86)); Debug.Assert(message.Length >= Unsafe.SizeOf() * 8); From 67a0fb3cd3ea8bc38a0e741a336760c84bb5abe1 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 21 Nov 2022 14:40:02 +0000 Subject: [PATCH 11/14] Refactored rounds stage ---
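A note on the refactor below before the diff: hoisting the four message blends b1..b4 ahead of the mixing is behaviour-preserving, because the blends depend only on the already-loaded message vectors and never on the row state; the new Round helper then performs exactly the work of the unrolled sequence (G1/G2 across the columns, Diagonalize, G1/G2 across the diagonals, Undiagonalize). The rotation amounts are easiest to sanity-check against the scalar G of RFC 7693: the epi32 shuffle 0b_10_11_00_01 is the 32-bit rotate, the r24 and r16 byte shuffles are the 24- and 16-bit rotates, and xoring the right shift by 63 with add-self (a left shift by 1) is the 63-bit rotate. A scalar reference for cross-checking, as a sketch (the class and member names here are mine, not part of the patch):

internal static class Blake2bReferenceSketch
{
    private static ulong Ror(ulong x, int n) => (x >> n) | (x << (64 - n));

    // RFC 7693 mixing function G for BLAKE2b; the vectorised Round applies
    // this to the four columns and then the four diagonals, one per lane.
    internal static void G(ulong[] v, int a, int b, int c, int d, ulong x, ulong y)
    {
        v[a] = v[a] + v[b] + x; v[d] = Ror(v[d] ^ v[a], 32);
        v[c] = v[c] + v[d];     v[b] = Ror(v[b] ^ v[c], 24);
        v[a] = v[a] + v[b] + y; v[d] = Ror(v[d] ^ v[a], 16);
        v[c] = v[c] + v[d];     v[b] = Ror(v[b] ^ v[c], 63);
    }
}

Recreating the r24/r16 shuffle masks inside Round, rather than threading them through as parameters, should also be harmless: Vector256.Create with constant arguments is emitted as a constant vector load, so the masks are not rebuilt element by element on each call.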
crypto/src/crypto/digests/Blake2b_X86.cs | 240 ++++++++--------------- crypto/src/crypto/digests/Blake2s_X86.cs | 221 +++++++-------------- 2 files changed, 146 insertions(+), 315 deletions(-) diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs index d9088b48a6..f121d3c1a1 100644 --- a/crypto/src/crypto/digests/Blake2b_X86.cs +++ b/crypto/src/crypto/digests/Blake2b_X86.cs @@ -46,9 +46,6 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan= Unsafe.SizeOf() * 8); Debug.Assert(hashBuffer.Length >= 8); - Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); - var hashBytes = MemoryMarshal.AsBytes(hashBuffer); var ivBytes = MemoryMarshal.AsBytes(blakeIV); @@ -64,7 +61,7 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan orig_1 = row1; Vector256 orig_2 = row2; - Perform12Rounds(r24, r16, message, ref row1, ref row2, ref row3, ref row4); + Perform12Rounds(message, ref row1, ref row2, ref row3, ref row4); row1 = Avx2.Xor(row1, row3); row2 = Avx2.Xor(row2, row4); @@ -76,7 +73,7 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan r24, Vector256 r16, ReadOnlySpan m, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4) + private static void Perform12Rounds(ReadOnlySpan m, ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4) { Debug.Assert(m.Length >= 128); @@ -89,16 +86,11 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re var t0 = Avx2.UnpackLow(m0, m1); var t1 = Avx2.UnpackLow(m2, m3); - var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + var b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m0, m1); t1 = Avx2.UnpackHigh(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + var b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); var m4 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 4)..]); var m5 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf>() * 5)..]); @@ -107,316 +99,242 @@ private static void Perform12Rounds(Vector256 r24, Vector256 r16, Re t0 = Avx2.UnpackLow(m7, m4); t1 = Avx2.UnpackLow(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + var b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.UnpackHigh(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + var b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 2 t0 = Avx2.UnpackLow(m7, m2); t1 = Avx2.UnpackHigh(m4, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m5, m4); t1 = 
Avx2.AlignRight(m3, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m2, m0); t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m6, m1, 8); t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 3 t0 = Avx2.AlignRight(m6, m5, 8); t1 = Avx2.UnpackHigh(m2, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m4, m0); t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m5, m4, 8); t1 = Avx2.UnpackHigh(m1, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m2, m7); t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 4 t0 = Avx2.UnpackHigh(m3, m1); t1 = Avx2.UnpackHigh(m6, m5); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m4, m0); t1 = Avx2.UnpackLow(m6, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m1, m7, 8); t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m4, m3); t1 = Avx2.UnpackLow(m5, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref 
row4, b1, b2, b3, b4); //ROUND 5 t0 = Avx2.UnpackHigh(m4, m2); t1 = Avx2.UnpackLow(m1, m5); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m7, m1, 8); t1 = Avx2.AlignRight(m3, m5, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m6, m0); t1 = Avx2.UnpackLow(m6, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 6 t0 = Avx2.UnpackLow(m1, m3); t1 = Avx2.UnpackLow(m0, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m6, m5); t1 = Avx2.UnpackHigh(m5, m1); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m2, m0, 8); t1 = Avx2.UnpackHigh(m3, m7); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m4, m6); t1 = Avx2.AlignRight(m7, m2, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 7 t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.UnpackLow(m7, m2); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m2, m7); t1 = Avx2.AlignRight(m5, m6, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m4, m0); t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m5, m3); t1 = 
Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 8 t0 = Avx2.UnpackHigh(m6, m3); t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m7, m5, 8); t1 = Avx2.UnpackHigh(m0, m4); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.AlignRight(m4, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m5, m0); t1 = Avx2.UnpackLow(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 9 t0 = Avx2.UnpackLow(m3, m7); t1 = Avx2.AlignRight(m0, m5, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.AlignRight(m4, m1, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m5, m6); t1 = Avx2.UnpackHigh(m6, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m1, m2, 8); t1 = Avx2.AlignRight(m2, m3, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 10 t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.UnpackHigh(m3, m0); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m1, m2); t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = 
Avx2.UnpackHigh(m6, m7); t1 = Avx2.UnpackHigh(m4, m1); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); t1 = Avx2.UnpackLow(m7, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 11 t0 = Avx2.UnpackLow(m0, m1); t1 = Avx2.UnpackLow(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m0, m1); t1 = Avx2.UnpackHigh(m2, m3); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m7, m4); t1 = Avx2.UnpackLow(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m7, m4); t1 = Avx2.UnpackHigh(m5, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 12 t0 = Avx2.UnpackLow(m7, m2); t1 = Avx2.UnpackHigh(m4, m6); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackLow(m5, m4); t1 = Avx2.AlignRight(m3, m7, 8); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Diagonalize(ref row1, ref row3, ref row4); + b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.UnpackHigh(m2, m0); t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - - G1(r24, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); t0 = Avx2.AlignRight(m6, m1, 8); t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64(); - b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); + b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64(); - G2(r16, ref row1, ref row2, ref row3, ref row4, b0); - Undiagonalize(ref row1, ref row3, ref row4); + Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); #endregion } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Round(ref Vector256 row1, ref Vector256 row2, ref Vector256 row3, ref Vector256 row4, Vector256 b1, Vector256 b2, Vector256 b3, Vector256 b4) + { + Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 
7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b1); + G2(r16, ref row1, ref row2, ref row3, ref row4, b2); + + Diagonalize(ref row1, ref row3, ref row4); + + G1(r24, ref row1, ref row2, ref row3, ref row4, b3); + G2(r16, ref row1, ref row2, ref row3, ref row4, b4); + + Undiagonalize(ref row1, ref row3, ref row4); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void Diagonalize(ref Vector256 row1, ref Vector256 row3, ref Vector256 row4) { diff --git a/crypto/src/crypto/digests/Blake2s_X86.cs b/crypto/src/crypto/digests/Blake2s_X86.cs index 251d7ed781..720eefb11e 100644 --- a/crypto/src/crypto/digests/Blake2s_X86.cs +++ b/crypto/src/crypto/digests/Blake2s_X86.cs @@ -46,9 +46,6 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan= Unsafe.SizeOf() * 8); Debug.Assert(hashBuffer.Length >= 8); - Vector128 r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); - Vector128 r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); - var hashBytes = MemoryMarshal.AsBytes(hashBuffer); var ivBytes = MemoryMarshal.AsBytes(blakeIV); @@ -76,7 +73,7 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan r8, Vector128 r16, ReadOnlySpan m, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4) + private static void Perform10Rounds(ReadOnlySpan m, ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4) { Debug.Assert(m.Length >= Unsafe.SizeOf() * 16); @@ -87,314 +84,230 @@ private static void Perform10Rounds(Vector128 r8, Vector128 r16, Rea var m3 = LoadVector128(m[(Vector128.Count * 3)..]); //ROUND 1 - var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32(); + var b1 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32(); - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); - - b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32(); - - //G2 - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + var b2 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32(); var t0 = Sse2.Shuffle(m2, 0b_11_10_00_01); var t1 = Sse2.Shuffle(m3, 0b_00_01_11_10); - b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_00_00_11).AsUInt32(); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + var b3 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_00_00_11).AsUInt32(); t0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32(); - b0 = Sse2.Shuffle(t0, 0b_10_11_00_01); + var b4 = Sse2.Shuffle(t0, 0b_10_11_00_01); - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 2 t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); var t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.Shuffle(t2, 0b_10_01_00_11); t0 = Sse2.Shuffle(m2, 0b_00_00_10_00); t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); - b0 = 
Sse2.Shuffle(t2, 0b_10_11_00_01); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse2.Shuffle(t2, 0b_10_11_00_01); t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4); t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_11_00_01_10); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_11_00_01_10); t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_11_00_01_10); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + b4 = Sse2.Shuffle(t2, 0b_11_00_01_10); - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 3 t0 = Sse2.UnpackHigh(m2, m3); t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.Shuffle(t2, 0b_11_01_00_10); t0 = Sse2.UnpackLow(m2, m0); t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8); - b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_11_11_00).AsUInt32(); t1 = Sse2.ShiftRightLogical128BitLane(m1, 12); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_00_11_10_01); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_00_11_10_01); t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4); t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_01_10_11_00); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + b4 = Sse2.Shuffle(t2, 0b_01_10_11_00); - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 4 t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.UnpackHigh(t0, m2); t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.Shuffle(t2, 0b_11_01_00_10); t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8); t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse2.Shuffle(t2, 0b_10_00_01_11); t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t1, 0b_00_01_10_11); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t1, 0b_00_01_10_11); t0 = Ssse3.AlignRight(m0, m1, 4); - b0 = Sse41.Blend(t0.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_11).AsUInt32(); + b4 = Sse41.Blend(t0.AsUInt16(), 
m2.AsUInt16(), 0b_00_11_00_11).AsUInt32(); - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 5 t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.Shuffle(t2, 0b_10_00_01_11); t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32(); - b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_10_01_00_11); t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8); t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_00_11_01); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + b4 = Sse2.Shuffle(t2, 0b_10_00_11_01); - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 6 t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.UnpackLow(m0, m2); - b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); t0 = Sse2.ShiftRightLogical128BitLane(m2, 4); t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_00_00_11).AsUInt32(); - b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32(); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32(); t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t1 = Sse2.ShiftRightLogical128BitLane(m3, 4); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_10_11_00_01); t0 = Sse2.UnpackLow(m2.AsUInt64(), m1.AsUInt64()).AsUInt32(); t1 = Sse2.Shuffle(m3, 0b_10_00_01_00); t2 = Sse2.ShiftRightLogical128BitLane(t0, 4); - b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_00_11_00_11).AsUInt32(); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + b4 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_00_11_00_11).AsUInt32(); - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 7 t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12); t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_11).AsUInt32(); - b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); t0 = 
Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_01_11_00); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse2.Shuffle(t2, 0b_10_01_11_00); t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_11_01_00_10); t0 = Sse2.UnpackHigh(m1, m2); t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32(); - b0 = Sse2.Shuffle(t1, 0b_00_01_10_11); + b4 = Sse2.Shuffle(t1, 0b_00_01_10_11); - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 8 t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); - b0 = Sse2.Shuffle(t1, 0b_10_00_11_01); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.Shuffle(t1, 0b_10_00_11_01); t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t1 = Sse2.ShiftRightLogical128BitLane(m0, 4); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_01_00_10_11); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse2.Shuffle(t2, 0b_01_00_10_11); t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_11_01_00); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_10_11_01_00); t0 = Sse2.UnpackLow(m0, m1); t1 = Sse2.UnpackHigh(m1, m2); t2 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + b4 = Sse2.Shuffle(t2, 0b_10_01_00_11); - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 9 t0 = Sse2.UnpackHigh(m1, m3); t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); - b0 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32(); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32(); t0 = Sse2.UnpackHigh(m0, m3); t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t1, 0b_00_10_01_11); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse2.Shuffle(t1, 0b_00_10_01_11); t0 = Sse2.UnpackLow(m0.AsUInt64(), m3.AsUInt64()).AsUInt32(); t1 = Sse2.ShiftRightLogical128BitLane(m2, 8); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_01_11_10_00); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_01_11_10_00); t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t0, 0b_00_11_10_01); - - G2(r8, ref row1, ref row2, ref row3, ref row4, 
b0); + b4 = Sse2.Shuffle(t0, 0b_00_11_10_01); - Undiagonalize(ref row1, ref row3, ref row4); + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); //ROUND 10 t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_00_11_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_01_11_00_10); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b1 = Sse2.Shuffle(t2, 0b_01_11_00_10); t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4); t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); - b0 = Sse2.Shuffle(t1, 0b_01_10_00_11); - - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); - - Diagonalize(ref row1, ref row3, ref row4); + b2 = Sse2.Shuffle(t1, 0b_01_10_00_11); t0 = Sse2.UnpackHigh(m0, m3); t1 = Sse2.UnpackLow(m2, m3); t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_00_10_01_11); - - G1(r16, ref row1, ref row2, ref row3, ref row4, b0); + b3 = Sse2.Shuffle(t2, 0b_00_10_01_11); t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); t1 = Sse2.UnpackLow(m0, m3); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); - b0 = Sse2.Shuffle(t2, 0b_01_10_11_00); + b4 = Sse2.Shuffle(t2, 0b_01_10_11_00); + + Round(r8, r16, ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4); + #endregion + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void Round(ref Vector128 row1, ref Vector128 row2, ref Vector128 row3, ref Vector128 row4, Vector128 b1, Vector128 b2, Vector128 b3, Vector128 b4) + { + Vector128 r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); + Vector128 r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b1); + G2(r8, ref row1, ref row2, ref row3, ref row4, b2); - G2(r8, ref row1, ref row2, ref row3, ref row4, b0); + Diagonalize(ref row1, ref row3, ref row4); + + G1(r16, ref row1, ref row2, ref row3, ref row4, b3); + G2(r8, ref row1, ref row2, ref row3, ref row4, b4); Undiagonalize(ref row1, ref row3, ref row4); - #endregion } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 9022d019eb5f392919bca3e4dde484f86a2c41a0 Mon Sep 17 00:00:00 2001 From: Timothy Makkison Date: Mon, 21 Nov 2022 14:40:02 +0000 Subject: [PATCH 12/14] Refactored rounds stage --- crypto/src/crypto/digests/Blake2b_X86.cs | 240 ++++++++--------------- crypto/src/crypto/digests/Blake2s_X86.cs | 223 +++++++-------------- 2 files changed, 147 insertions(+), 316 deletions(-) diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs index d9088b48a6..f121d3c1a1 100644 --- a/crypto/src/crypto/digests/Blake2b_X86.cs +++ b/crypto/src/crypto/digests/Blake2b_X86.cs @@ -46,9 +46,6 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan= Unsafe.SizeOf() * 8); Debug.Assert(hashBuffer.Length >= 8); - Vector256 r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - Vector256 r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); - var hashBytes = MemoryMarshal.AsBytes(hashBuffer); var ivBytes = MemoryMarshal.AsBytes(blakeIV); @@ -64,7 +61,7 @@ public static void Compress(bool isFinal, Span hashBuffer, ReadOnlySpan 
From 9022d019eb5f392919bca3e4dde484f86a2c41a0 Mon Sep 17 00:00:00 2001
From: Timothy Makkison
Date: Mon, 21 Nov 2022 14:40:02 +0000
Subject: [PATCH 12/14] Refactored rounds stage

---
 crypto/src/crypto/digests/Blake2b_X86.cs | 240 ++++++++----------------
 crypto/src/crypto/digests/Blake2s_X86.cs | 223 +++++++--------------
 2 files changed, 147 insertions(+), 316 deletions(-)

diff --git a/crypto/src/crypto/digests/Blake2b_X86.cs b/crypto/src/crypto/digests/Blake2b_X86.cs
index d9088b48a6..f121d3c1a1 100644
--- a/crypto/src/crypto/digests/Blake2b_X86.cs
+++ b/crypto/src/crypto/digests/Blake2b_X86.cs
@@ -46,9 +46,6 @@ public static void Compress(bool isFinal, Span<ulong> hashBuffer, ReadOnlySpan<
         Debug.Assert(message.Length >= Unsafe.SizeOf<Vector128<ulong>>() * 8);
         Debug.Assert(hashBuffer.Length >= 8);
 
-        Vector256<byte> r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
-        Vector256<byte> r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
-
         var hashBytes = MemoryMarshal.AsBytes(hashBuffer);
         var ivBytes = MemoryMarshal.AsBytes(blakeIV);
 
@@ -64,7 +61,7 @@ public static void Compress(bool isFinal, Span<ulong> hashBuffer, ReadOnlySpan<
         Vector256<ulong> orig_1 = row1;
         Vector256<ulong> orig_2 = row2;
 
-        Perform12Rounds(r24, r16, message, ref row1, ref row2, ref row3, ref row4);
+        Perform12Rounds(message, ref row1, ref row2, ref row3, ref row4);
 
         row1 = Avx2.Xor(row1, row3);
         row2 = Avx2.Xor(row2, row4);
@@ -76,7 +73,7 @@ public static void Compress(bool isFinal, Span<ulong> hashBuffer, ReadOnlySpan<
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void Perform12Rounds(Vector256<byte> r24, Vector256<byte> r16, ReadOnlySpan<byte> m, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
+        private static void Perform12Rounds(ReadOnlySpan<byte> m, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
         {
             Debug.Assert(m.Length >= 128);
 
@@ -89,16 +86,11 @@ private static void Perform12Rounds(Vector256<byte> r24, Vector256<byte> r16, Re
             var t0 = Avx2.UnpackLow(m0, m1);
             var t1 = Avx2.UnpackLow(m2, m3);
-            var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            var b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m0, m1);
             t1 = Avx2.UnpackHigh(m2, m3);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            var b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             var m4 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf<Vector128<ulong>>() * 4)..]);
             var m5 = BroadcastVector128ToVector256(m[(Unsafe.SizeOf<Vector128<ulong>>() * 5)..]);
@@ -107,316 +99,242 @@ private static void Perform12Rounds(Vector256<byte> r24, Vector256<byte> r16, Re
             t0 = Avx2.UnpackLow(m7, m4);
             t1 = Avx2.UnpackLow(m5, m6);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            var b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m7, m4);
             t1 = Avx2.UnpackHigh(m5, m6);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            var b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 2
             t0 = Avx2.UnpackLow(m7, m2);
             t1 = Avx2.UnpackHigh(m4, m6);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m5, m4);
             t1 = Avx2.AlignRight(m3, m7, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m2, m0);
             t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m6, m1, 8);
             t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 3
             t0 = Avx2.AlignRight(m6, m5, 8);
             t1 = Avx2.UnpackHigh(m2, m7);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m4, m0);
             t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m5, m4, 8);
             t1 = Avx2.UnpackHigh(m1, m3);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m2, m7);
             t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 4
             t0 = Avx2.UnpackHigh(m3, m1);
             t1 = Avx2.UnpackHigh(m6, m5);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m4, m0);
             t1 = Avx2.UnpackLow(m6, m7);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m1, m7, 8);
             t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m4, m3);
             t1 = Avx2.UnpackLow(m5, m0);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 5
             t0 = Avx2.UnpackHigh(m4, m2);
             t1 = Avx2.UnpackLow(m1, m5);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
             t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m7, m1, 8);
             t1 = Avx2.AlignRight(m3, m5, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m6, m0);
             t1 = Avx2.UnpackLow(m6, m4);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 6
             t0 = Avx2.UnpackLow(m1, m3);
             t1 = Avx2.UnpackLow(m0, m4);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m6, m5);
             t1 = Avx2.UnpackHigh(m5, m1);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m2, m0, 8);
             t1 = Avx2.UnpackHigh(m3, m7);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m4, m6);
             t1 = Avx2.AlignRight(m7, m2, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 7
             t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
             t1 = Avx2.UnpackLow(m7, m2);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m2, m7);
             t1 = Avx2.AlignRight(m5, m6, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m4, m0);
             t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m5, m3);
             t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 8
             t0 = Avx2.UnpackHigh(m6, m3);
             t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m7, m5, 8);
             t1 = Avx2.UnpackHigh(m0, m4);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
             t1 = Avx2.AlignRight(m4, m7, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m5, m0);
             t1 = Avx2.UnpackLow(m2, m3);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 9
             t0 = Avx2.UnpackLow(m3, m7);
             t1 = Avx2.AlignRight(m0, m5, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m7, m4);
             t1 = Avx2.AlignRight(m4, m1, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m5, m6);
             t1 = Avx2.UnpackHigh(m6, m0);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m1, m2, 8);
             t1 = Avx2.AlignRight(m2, m3, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 10
             t0 = Avx2.UnpackLow(m5, m4);
             t1 = Avx2.UnpackHigh(m3, m0);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m1, m2);
             t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m6, m7);
             t1 = Avx2.UnpackHigh(m4, m1);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
             t1 = Avx2.UnpackLow(m7, m6);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 11
             t0 = Avx2.UnpackLow(m0, m1);
             t1 = Avx2.UnpackLow(m2, m3);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m0, m1);
             t1 = Avx2.UnpackHigh(m2, m3);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m7, m4);
             t1 = Avx2.UnpackLow(m5, m6);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m7, m4);
             t1 = Avx2.UnpackHigh(m5, m6);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 12
             t0 = Avx2.UnpackLow(m7, m2);
             t1 = Avx2.UnpackHigh(m4, m6);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackLow(m5, m4);
             t1 = Avx2.AlignRight(m3, m7, 8);
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.UnpackHigh(m2, m0);
             t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
-
-            G1(r24, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
             t0 = Avx2.AlignRight(m6, m1, 8);
             t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
-            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
+            b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
 
-            G2(r16, ref row1, ref row2, ref row3, ref row4, b0);
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
             #endregion
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void Round(ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b1, Vector256<ulong> b2, Vector256<ulong> b3, Vector256<ulong> b4)
+        {
+            Vector256<byte> r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
+            Vector256<byte> r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
+
+            G1(r24, ref row1, ref row2, ref row3, ref row4, b1);
+            G2(r16, ref row1, ref row2, ref row3, ref row4, b2);
+
+            Diagonalize(ref row1, ref row3, ref row4);
+
+            G1(r24, ref row1, ref row2, ref row3, ref row4, b3);
+            G2(r16, ref row1, ref row2, ref row3, ref row4, b4);
+
+            Undiagonalize(ref row1, ref row3, ref row4);
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static void Diagonalize(ref Vector256<ulong> row1, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
         {
diff --git a/crypto/src/crypto/digests/Blake2s_X86.cs b/crypto/src/crypto/digests/Blake2s_X86.cs
index 251d7ed781..06a24aabba 100644
--- a/crypto/src/crypto/digests/Blake2s_X86.cs
+++ b/crypto/src/crypto/digests/Blake2s_X86.cs
@@ -46,9 +46,6 @@ public static void Compress(bool isFinal, Span<uint> hashBuffer, ReadOnlySpan<
         Debug.Assert(message.Length >= Unsafe.SizeOf<ulong>() * 8);
         Debug.Assert(hashBuffer.Length >= 8);
 
-        Vector128<byte> r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
-        Vector128<byte> r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
-
         var hashBytes = MemoryMarshal.AsBytes(hashBuffer);
         var ivBytes = MemoryMarshal.AsBytes(blakeIV);
 
@@ -64,7 +61,7 @@ public static void Compress(bool isFinal, Span<uint> hashBuffer, ReadOnlySpan<
         Vector128<uint> orig_1 = row1;
         Vector128<uint> orig_2 = row2;
 
-        Perform10Rounds(r8, r16, message, ref row1, ref row2, ref row3, ref row4);
+        Perform10Rounds(message, ref row1, ref row2, ref row3, ref row4);
 
         row1 = Sse2.Xor(row1, row3);
         row2 = Sse2.Xor(row2, row4);
@@ -76,7 +73,7 @@ public static void Compress(bool isFinal, Span<uint> hashBuffer, ReadOnlySpan<
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static void Perform10Rounds(Vector128<byte> r8, Vector128<byte> r16, ReadOnlySpan<byte> m, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4)
+        private static void Perform10Rounds(ReadOnlySpan<byte> m, ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4)
         {
             Debug.Assert(m.Length >= Unsafe.SizeOf<uint>() * 16);
 
@@ -87,314 +84,230 @@ private static void Perform10Rounds(Vector128<byte> r8, Vector128<byte> r16, Rea
             var m3 = LoadVector128(m[(Vector128<byte>.Count * 3)..]);
 
             //ROUND 1
-            var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32();
+            var b1 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32();
 
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
-
-            b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32();
-
-            //G2
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            var b2 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32();
 
             var t0 = Sse2.Shuffle(m2, 0b_11_10_00_01);
             var t1 = Sse2.Shuffle(m3, 0b_00_01_11_10);
-            b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_00_00_11).AsUInt32();
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            var b3 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_00_00_11).AsUInt32();
 
             t0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32();
-            b0 = Sse2.Shuffle(t0, 0b_10_11_00_01);
+            var b4 = Sse2.Shuffle(t0, 0b_10_11_00_01);
 
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 2
             t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_00_11_00).AsUInt32();
             t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
             var t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_01_00_11);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.Shuffle(t2, 0b_10_01_00_11);
 
             t0 = Sse2.Shuffle(m2, 0b_00_00_10_00);
             t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32();
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse2.Shuffle(t2, 0b_10_11_00_01);
 
             t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4);
             t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_00).AsUInt32();
             t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_11_00_01_10);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_11_00_01_10);
 
             t0 = Sse2.UnpackHigh(m0, m1);
             t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_11_00_01_10);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+            b4 = Sse2.Shuffle(t2, 0b_11_00_01_10);
 
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 3
             t0 = Sse2.UnpackHigh(m2, m3);
             t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_00).AsUInt32();
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_11_01_00_10);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.Shuffle(t2, 0b_11_01_00_10);
 
             t0 = Sse2.UnpackLow(m2, m0);
             t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 0b_11_11_00_00).AsUInt32();
             t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8);
-            b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
 
             t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_11_11_00).AsUInt32();
             t1 = Sse2.ShiftRightLogical128BitLane(m1, 12);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_00_11_10_01);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_00_11_10_01);
 
             t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
             t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
             t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_01_10_11_00);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+            b4 = Sse2.Shuffle(t2, 0b_01_10_11_00);
 
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 4
             t0 = Sse2.UnpackHigh(m0, m1);
             t1 = Sse2.UnpackHigh(t0, m2);
             t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_11_01_00_10);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.Shuffle(t2, 0b_11_01_00_10);
 
             t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8);
             t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
             t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_00_01_11);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse2.Shuffle(t2, 0b_10_00_01_11);
 
             t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
             t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t1, 0b_00_01_10_11);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t1, 0b_00_01_10_11);
 
             t0 = Ssse3.AlignRight(m0, m1, 4);
-            b0 = Sse41.Blend(t0.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_11).AsUInt32();
+            b4 = Sse41.Blend(t0.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_11).AsUInt32();
 
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 5
             t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
             t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_00_01_11);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.Shuffle(t2, 0b_10_00_01_11);
 
             t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32();
             t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32();
-            b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
 
             t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32();
             t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32();
             t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_01_00_11);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_10_01_00_11);
 
             t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32();
             t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8);
             t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_00_11_01);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+            b4 = Sse2.Shuffle(t2, 0b_10_00_11_01);
 
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 6
             t0 = Sse2.UnpackHigh(m0, m1);
             t1 = Sse2.UnpackLow(m0, m2);
-            b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
 
             t0 = Sse2.ShiftRightLogical128BitLane(m2, 4);
             t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_00_00_11).AsUInt32();
-            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32();
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32();
 
             t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
             t1 = Sse2.ShiftRightLogical128BitLane(m3, 4);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_10_11_00_01);
 
             t0 = Sse2.UnpackLow(m2.AsUInt64(), m1.AsUInt64()).AsUInt32();
             t1 = Sse2.Shuffle(m3, 0b_10_00_01_00);
             t2 = Sse2.ShiftRightLogical128BitLane(t0, 4);
-            b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_00_11_00_11).AsUInt32();
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+            b4 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_00_11_00_11).AsUInt32();
 
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 7
             t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12);
             t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_11).AsUInt32();
-            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
 
             t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32();
             t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_01_11_00);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse2.Shuffle(t2, 0b_10_01_11_00);
 
             t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
             t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_11_01_00_10);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_11_01_00_10);
 
             t0 = Sse2.UnpackHigh(m1, m2);
             t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32();
-            b0 = Sse2.Shuffle(t1, 0b_00_01_10_11);
+            b4 = Sse2.Shuffle(t1, 0b_00_01_10_11);
 
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 8
             t0 = Sse2.UnpackHigh(m0, m1);
             t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
-            b0 = Sse2.Shuffle(t1, 0b_10_00_11_01);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.Shuffle(t1, 0b_10_00_11_01);
 
             t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_00).AsUInt32();
             t1 = Sse2.ShiftRightLogical128BitLane(m0, 4);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_01_00_10_11);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse2.Shuffle(t2, 0b_01_00_10_11);
 
             t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32();
             t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_11_01_00);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_10_11_01_00);
 
             t0 = Sse2.UnpackLow(m0, m1);
             t1 = Sse2.UnpackHigh(m1, m2);
             t2 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_10_01_00_11);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+            b4 = Sse2.Shuffle(t2, 0b_10_01_00_11);
 
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 9
             t0 = Sse2.UnpackHigh(m1, m3);
             t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32();
             t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
-            b0 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32();
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32();
 
             t0 = Sse2.UnpackHigh(m0, m3);
             t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_11_11_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t1, 0b_00_10_01_11);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse2.Shuffle(t1, 0b_00_10_01_11);
 
             t0 = Sse2.UnpackLow(m0.AsUInt64(), m3.AsUInt64()).AsUInt32();
             t1 = Sse2.ShiftRightLogical128BitLane(m2, 8);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_01_11_10_00);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_01_11_10_00);
 
             t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_11_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t0, 0b_00_11_10_01);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+            b4 = Sse2.Shuffle(t0, 0b_00_11_10_01);
 
-            Undiagonalize(ref row1, ref row3, ref row4);
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
 
             //ROUND 10
             t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32();
             t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32();
             t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_00_11_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_01_11_00_10);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b1 = Sse2.Shuffle(t2, 0b_01_11_00_10);
 
             t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4);
             t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
-            b0 = Sse2.Shuffle(t1, 0b_01_10_00_11);
-
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
-
-            Diagonalize(ref row1, ref row3, ref row4);
+            b2 = Sse2.Shuffle(t1, 0b_01_10_00_11);
 
             t0 = Sse2.UnpackHigh(m0, m3);
             t1 = Sse2.UnpackLow(m2, m3);
             t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_00_10_01_11);
-
-            G1(r16, ref row1, ref row2, ref row3, ref row4, b0);
+            b3 = Sse2.Shuffle(t2, 0b_00_10_01_11);
 
             t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
             t1 = Sse2.UnpackLow(m0, m3);
             t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
-            b0 = Sse2.Shuffle(t2, 0b_01_10_11_00);
+            b4 = Sse2.Shuffle(t2, 0b_01_10_11_00);
+
+            Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
+            #endregion
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static void Round(ref Vector128<uint> row1, ref Vector128<uint> row2, ref Vector128<uint> row3, ref Vector128<uint> row4, Vector128<uint> b1, Vector128<uint> b2, Vector128<uint> b3, Vector128<uint> b4)
+        {
+            Vector128<byte> r8 = Vector128.Create((byte)1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
+            Vector128<byte> r16 = Vector128.Create((byte)2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
+
+            G1(r16, ref row1, ref row2, ref row3, ref row4, b1);
+            G2(r8, ref row1, ref row2, ref row3, ref row4, b2);
 
-            G2(r8, ref row1, ref row2, ref row3, ref row4, b0);
+            Diagonalize(ref row1, ref row3, ref row4);
+
+            G1(r16, ref row1, ref row2, ref row3, ref row4, b3);
+            G2(r8, ref row1, ref row2, ref row3, ref row4, b4);
 
             Undiagonalize(ref row1, ref row3, ref row4);
-            #endregion
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
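For the AVX2 path refactored in the same patch, the corresponding half-step works on four 64-bit lanes at once. A sketch under the same caveat (the real G1/G2 live outside this excerpt): BLAKE2b rotates by 32, 24, 16 and 63; 32 maps to a dword shuffle, 24 and 16 to byte shuffles with the r24/r16 masks, and 63 to shifts. The 32-byte masks repeat one 16-byte pattern because vpshufb permutes bytes within each 128-bit lane:

    // Sketch only: first half-step of the BLAKE2b quarter round.
    private static void G1(Vector256<byte> r24, ref Vector256<ulong> row1, ref Vector256<ulong> row2,
        ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
    {
        row1 = Avx2.Add(Avx2.Add(row1, b0), row2);                                       // a = a + m + b
        row4 = Avx2.Shuffle(Avx2.Xor(row4, row1).AsUInt32(), 0b_10_11_00_01).AsUInt64(); // d = rotr64(d ^ a, 32)
        row3 = Avx2.Add(row3, row4);                                                     // c = c + d
        row2 = Avx2.Shuffle(Avx2.Xor(row2, row3).AsByte(), r24).AsUInt64();              // b = rotr64(b ^ c, 24)
    }

Hoisting the mask creation into Round, as this patch does, is expected to be free at run time: the JIT can fold Vector128.Create/Vector256.Create of constants into a single load from the data section.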
From 1619e8ac2d21572e47ba6eb21f7a5c6e185b33dc Mon Sep 17 00:00:00 2001
From: Timothy Makkison
Date: Sun, 27 Nov 2022 21:23:11 +0000
Subject: [PATCH 13/14] Capitalise parameter and return definitions.

---
 crypto/src/crypto/IDigest.cs | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/crypto/src/crypto/IDigest.cs b/crypto/src/crypto/IDigest.cs
index 36caf3728e..c705723531 100644
--- a/crypto/src/crypto/IDigest.cs
+++ b/crypto/src/crypto/IDigest.cs
@@ -9,41 +9,41 @@ public interface IDigest
         string AlgorithmName { get; }
 
         /// <summary>Return the size, in bytes, of the digest produced by this message digest.</summary>
-        /// <returns>the size, in bytes, of the digest produced by this message digest.</returns>
+        /// <returns>The size, in bytes, of the digest produced by this message digest.</returns>
         int GetDigestSize();
 
         /// <summary>Return the size, in bytes, of the internal buffer used by this digest.</summary>
-        /// <returns>the size, in bytes, of the internal buffer used by this digest.</returns>
+        /// <returns>The size, in bytes, of the internal buffer used by this digest.</returns>
         int GetByteLength();
 
         /// <summary>Update the message digest with a single byte.</summary>
-        /// <param name="input">the input byte to be entered.</param>
+        /// <param name="input">The input byte to be entered.</param>
         void Update(byte input);
 
         /// <summary>Update the message digest with a block of bytes.</summary>
-        /// <param name="input">the byte array containing the data.</param>
-        /// <param name="inOff">the offset into the byte array where the data starts.</param>
-        /// <param name="inLen">the length of the data.</param>
+        /// <param name="input">The byte array containing the data.</param>
+        /// <param name="inOff">The offset into the byte array where the data starts.</param>
+        /// <param name="inLen">The length of the data.</param>
         void BlockUpdate(byte[] input, int inOff, int inLen);
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
         /// <summary>Update the message digest with a span of bytes.</summary>
-        /// <param name="input">the span containing the data.</param>
+        /// <param name="input">The span containing the data.</param>
         void BlockUpdate(ReadOnlySpan<byte> input);
 #endif
 
         /// <summary>Close the digest, producing the final digest value.</summary>
         /// <remarks>This call leaves the digest reset.</remarks>
-        /// <param name="output">the byte array the digest is to be copied into.</param>
-        /// <param name="outOff">the offset into the byte array the digest is to start at.</param>
-        /// <returns>the number of bytes written</returns>
+        /// <param name="output">The byte array the digest is to be copied into.</param>
+        /// <param name="outOff">The offset into the byte array the digest is to start at.</param>
+        /// <returns>The number of bytes written.</returns>
         int DoFinal(byte[] output, int outOff);
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
         /// <summary>Close the digest, producing the final digest value.</summary>
         /// <remarks>This call leaves the digest reset.</remarks>
-        /// <param name="output">the span the digest is to be copied into.</param>
-        /// <returns>the number of bytes written</returns>
+        /// <param name="output">The span the digest is to be copied into.</param>
+        /// <returns>The number of bytes written.</returns>
         int DoFinal(Span<byte> output);
 #endif
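For orientation between these two documentation patches, this is how the IDigest surface above is driven; a minimal sketch assuming the usual bc-csharp namespaces (Org.BouncyCastle.Crypto for the interface, Org.BouncyCastle.Crypto.Digests for the implementations):

    using System.Text;
    using Org.BouncyCastle.Crypto;
    using Org.BouncyCastle.Crypto.Digests;

    IDigest digest = new Blake2bDigest(256);          // BLAKE2b-256
    byte[] data = Encoding.ASCII.GetBytes("abc");
    digest.BlockUpdate(data, 0, data.Length);         // absorb input
    byte[] hash = new byte[digest.GetDigestSize()];   // 32 bytes here
    int written = digest.DoFinal(hash, 0);            // finalize; digest is left reset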
From 17cace83f9cce1cdd99f2a2ee51090e14a7ecc6b Mon Sep 17 00:00:00 2001
From: Timothy Makkison
Date: Sun, 27 Nov 2022 21:24:09 +0000
Subject: [PATCH 14/14] Convert javadocs to xmldocs

---
 crypto/src/crypto/digests/Blake2bDigest.cs  | 177 +++++++++--------
 crypto/src/crypto/digests/Blake2sDigest.cs  | 178 ++++++++---------
 crypto/src/crypto/digests/Blake2xsDigest.cs | 209 +++++++++++---------
 3 files changed, 292 insertions(+), 272 deletions(-)

diff --git a/crypto/src/crypto/digests/Blake2bDigest.cs b/crypto/src/crypto/digests/Blake2bDigest.cs
index 245a355df3..9d2a0fb11c 100644
--- a/crypto/src/crypto/digests/Blake2bDigest.cs
+++ b/crypto/src/crypto/digests/Blake2bDigest.cs
@@ -30,18 +30,22 @@ This implementation does not support the Tree Hashing Mode.
   ---------------+--------+-----------+------+------------+
  */
 
-    /**
-     * Implementation of the cryptographic hash function Blake2b.
-     * <p>
-     * Blake2b offers a built-in keying mechanism to be used directly
-     * for authentication ("Prefix-MAC") rather than a HMAC construction.
-     * <p>
-     * Blake2b offers a built-in support for a salt for randomized hashing
-     * and a personal string for defining a unique hash function for each application.
-     * <p>
-     * BLAKE2b is optimized for 64-bit platforms and produces digests of any size
-     * between 1 and 64 bytes.
-     */
+    /// <summary>
+    /// Implementation of the cryptographic hash function Blake2b.
+    /// BLAKE2b is optimized for 64-bit platforms and produces digests of any size
+    /// between 1 and 64 bytes.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Blake2b offers a built-in keying mechanism to be used directly
+    /// for authentication ("Prefix-MAC") rather than a HMAC construction.
+    /// </para>
+    /// <para>
+    /// Blake2b offers a built-in support for a salt for randomized hashing
+    /// and a personal string for defining a unique hash function for each application.
+    /// </para>
+    /// </remarks>
     public sealed class Blake2bDigest
         : IDigest
     {
@@ -112,11 +116,18 @@ public sealed class Blake2bDigest
         // For Tree Hashing Mode, not used here:
         // private long f1 = 0L; // finalization flag, for last node: ~0L
 
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2bDigest"/>.
+        /// </summary>
         public Blake2bDigest()
             : this(512)
         {
         }
 
+        /// <summary>
+        /// Constructs a new instance of <see cref="Blake2bDigest"/> from another <see cref="Blake2bDigest"/>.
+        /// </summary>
+        /// <param name="digest">The original instance of <see cref="Blake2bDigest"/> that is copied.</param>
         public Blake2bDigest(Blake2bDigest digest)
         {
             this.bufferPos = digest.bufferPos;
@@ -132,11 +143,11 @@ public Blake2bDigest(Blake2bDigest digest)
             this.f0 = digest.f0;
         }
 
-        /**
-         * Basic sized constructor - size in bits.
-         *
-         * @param digestSize size of the digest in bits
-         */
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2bDigest"/> with a given digest size.
+        /// </summary>
+        /// <param name="digestSize">Digest size in bits.</param>
+        /// <exception cref="ArgumentException"></exception>
         public Blake2bDigest(int digestSize)
         {
             if (digestSize < 8 || digestSize > 512 || digestSize % 8 != 0)
@@ -148,15 +159,18 @@ public Blake2bDigest(int digestSize)
             Init();
         }
 
-        /**
-         * Blake2b for authentication ("Prefix-MAC mode").
-         * After calling the doFinal() method, the key will
-         * remain to be used for further computations of
-         * this instance.
-         * The key can be overwritten using the clearKey() method.
-         *
-         * @param key A key up to 64 bytes or null
-         */
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2bDigest"/> with a key.
+        /// </summary>
+        /// <remarks>
+        /// Blake2b for authentication ("Prefix-MAC mode").
+        /// After calling the <see cref="DoFinal(byte[], int)"/> method, the key will
+        /// remain to be used for further computations of this instance.
+        /// The key can be cleared using the <see cref="ClearKey"/> method.
+        /// </remarks>
+        /// <param name="key">A key up to 64 bytes or null.</param>
+        /// <exception cref="ArgumentException"></exception>
         public Blake2bDigest(byte[] key)
         {
             buffer = new byte[BLOCK_LENGTH_BYTES];
@@ -176,18 +190,21 @@ public Blake2bDigest(byte[] key)
             Init();
         }
 
-        /**
-         * Blake2b with key, required digest length (in bytes), salt and personalization.
-         * After calling the doFinal() method, the key, the salt and the personal string
-         * will remain and might be used for further computations with this instance.
-         * The key can be overwritten using the clearKey() method, the salt (pepper)
-         * can be overwritten using the clearSalt() method.
-         *
-         * @param key A key up to 64 bytes or null
-         * @param digestLength from 1 up to 64 bytes
-         * @param salt 16 bytes or null
-         * @param personalization 16 bytes or null
-         */
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2bDigest"/> with a key, required digest length (in bytes), salt and personalization.
+        /// </summary>
+        /// <remarks>
+        /// After calling the <see cref="DoFinal(byte[], int)"/> method, the key, the salt and the personalization
+        /// will remain and might be used for further computations with this instance.
+        /// The key can be overwritten using the <see cref="ClearKey"/> method, the salt (pepper)
+        /// can be overwritten using the <see cref="ClearSalt"/> method.
+        /// </remarks>
+        /// <param name="key">A key up to 64 bytes or null.</param>
+        /// <param name="digestLength">Digest length from 1 to 64 bytes.</param>
+        /// <param name="salt">A 16 bytes or null salt.</param>
+        /// <param name="personalization">A 16 bytes or null personalization.</param>
+        /// <exception cref="ArgumentException"></exception>
         public Blake2bDigest(byte[] key, int digestLength, byte[] salt, byte[] personalization)
         {
             if (digestLength < 1 || digestLength > 64)
@@ -274,11 +291,7 @@ private void InitializeInternalState()
             internalState[15] = blake2b_IV[7];// ^ f1 with f1 = 0
         }
 
-        /**
-         * update the message digest with a single byte.
-         *
-         * @param b the input byte to be entered.
-         */
+        /// <inheritdoc/>
         public void Update(byte b)
         {
             // process the buffer if full else add to buffer:
@@ -306,13 +319,7 @@ public void Update(byte b)
             }
         }
 
-        /**
-         * update the message digest with a block of bytes.
-         *
-         * @param message the byte array containing the data.
-         * @param offset the offset into the byte array where the data starts.
-         * @param len the length of the data.
-         */
+        /// <inheritdoc/>
        public void BlockUpdate(byte[] message, int offset, int len)
         {
             if (message == null || len == 0)
@@ -369,6 +376,7 @@ public void BlockUpdate(byte[] message, int offset, int len)
         }
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        /// <inheritdoc/>
         public void BlockUpdate(ReadOnlySpan<byte> input)
         {
             if (input.IsEmpty)
@@ -421,14 +429,14 @@ public void BlockUpdate(ReadOnlySpan<byte> input)
         }
 #endif
 
-        /**
-         * close the digest, producing the final digest value. The doFinal
-         * call leaves the digest reset.
-         * Key, salt and personal string remain.
-         *
-         * @param out the array the digest is to be copied into.
-         * @param outOffset the offset into the out array the digest is to start at.
-         */
+        /// <summary>Close the digest, producing the final digest value.</summary>
+        /// <remarks>
+        /// The <see cref="DoFinal(byte[], int)"/> call leaves the digest reset.
+        /// Key, salt and personal string remain.
+        /// </remarks>
+        /// <param name="output">The byte array the digest is to be copied into.</param>
+        /// <param name="outOffset">The offset into the byte array the digest is to start at.</param>
+        /// <returns>The number of bytes written.</returns>
         public int DoFinal(byte[] output, int outOffset)
         {
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
@@ -462,6 +470,13 @@ public int DoFinal(byte[] output, int outOffset)
         }
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        /// <summary>Close the digest, producing the final digest value.</summary>
+        /// <remarks>
+        /// The <see cref="DoFinal(Span{byte})"/> call leaves the digest reset.
+        /// Key, salt and personal string remain.
+        /// </remarks>
+        /// <param name="output">The span the digest is to be copied into.</param>
+        /// <returns>The number of bytes written.</returns>
         public int DoFinal(Span<byte> output)
         {
             f0 = 0xFFFFFFFFFFFFFFFFUL;
@@ -491,11 +506,10 @@ public int DoFinal(Span<byte> output)
         }
 #endif
 
-        /**
-         * Reset the digest back to it's initial state.
-         * The key, the salt and the personal string will
-         * remain for further computations.
-         */
+        /// <summary>
+        /// Reset the digest back to its initial state.
+        /// The key, the salt and the personalization will remain for further computations.
+        /// </summary>
         public void Reset()
         {
             bufferPos = 0;
@@ -597,38 +611,28 @@ private static ulong Rotr64(ulong x, int rot)
             return x >> rot | x << -rot;
         }
 
-        /**
-         * return the algorithm name
-         *
-         * @return the algorithm name
-         */
+        /// <inheritdoc/>
         public string AlgorithmName => "BLAKE2b";
 
-        /**
-         * return the size, in bytes, of the digest produced by this message digest.
-         *
-         * @return the size, in bytes, of the digest produced by this message digest.
-         */
+        /// <inheritdoc/>
         public int GetDigestSize()
         {
             return digestLength;
         }
 
-        /**
-         * Return the size in bytes of the internal buffer the digest applies it's compression
-         * function to.
-         *
-         * @return byte length of the digests internal buffer.
-         */
+        /// <summary>
+        /// Return the size in bytes of the internal buffer the digest applies its compression
+        /// function to.
+        /// </summary>
+        /// <returns>The byte length of the digest's internal buffer.</returns>
         public int GetByteLength()
        {
             return BLOCK_LENGTH_BYTES;
         }
 
-        /**
-         * Overwrite the key
-         * if it is no longer used (zeroization)
-         */
+        /// <summary>
+        /// Clears the key.
+        /// </summary>
         public void ClearKey()
         {
             if (key != null)
@@ -638,10 +642,9 @@ public void ClearKey()
             }
         }
 
-        /**
-         * Overwrite the salt (pepper) if it
-         * is secret and no longer used (zeroization)
-         */
+        /// <summary>
+        /// Clears the salt (pepper).
+        /// </summary>
         public void ClearSalt()
         {
             if (salt != null)
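Before the Blake2s changes, a usage sketch of the keyed constructor documented above (values are illustrative; the key may be up to 64 bytes, salt and personalization are exactly 16 bytes when present):

    byte[] key = new byte[32];                                     // fill from a CSPRNG in real code
    byte[] salt = new byte[16];
    byte[] personal = Encoding.ASCII.GetBytes("MyApp v1 hashing"); // exactly 16 bytes
    var mac = new Blake2bDigest(key, 64, salt, personal);
    mac.BlockUpdate(message, 0, message.Length);                   // message: caller-supplied byte[]
    byte[] tag = new byte[mac.GetDigestSize()];
    mac.DoFinal(tag, 0);                                           // key/salt/personalization are retained
    mac.ClearKey();                                                // zeroize the key when finished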
diff --git a/crypto/src/crypto/digests/Blake2sDigest.cs b/crypto/src/crypto/digests/Blake2sDigest.cs
index 22fc2ceda8..b04904a78b 100644
--- a/crypto/src/crypto/digests/Blake2sDigest.cs
+++ b/crypto/src/crypto/digests/Blake2sDigest.cs
@@ -31,18 +31,22 @@ This implementation does not support the Tree Hashing Mode.
   ---------------+--------+-----------+------+------------+
  */
 
-    /**
-     * Implementation of the cryptographic hash function BLAKE2s.
-     * <p>
-     * BLAKE2s offers a built-in keying mechanism to be used directly
-     * for authentication ("Prefix-MAC") rather than a HMAC construction.
-     * <p>
-     * BLAKE2s offers a built-in support for a salt for randomized hashing
-     * and a personal string for defining a unique hash function for each application.
-     * <p>
-     * BLAKE2s is optimized for 32-bit platforms and produces digests of any size
-     * between 1 and 32 bytes.
-     */
+    /// <summary>
+    /// Implementation of the cryptographic hash function BLAKE2s.
+    /// BLAKE2s is optimized for 32-bit platforms and produces digests of any size
+    /// between 1 and 32 bytes.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// BLAKE2s offers a built-in keying mechanism to be used directly
+    /// for authentication ("Prefix-MAC") rather than a HMAC construction.
+    /// </para>
+    /// <para>
+    /// BLAKE2s offers a built-in support for a salt for randomized hashing
+    /// and a personal string for defining a unique hash function for each application.
+    /// </para>
+    /// </remarks>
     public sealed class Blake2sDigest
         : IDigest
     {
@@ -130,14 +134,18 @@ public sealed class Blake2sDigest
         // For Tree Hashing Mode, not used here:
         // private long f1 = 0L; // finalization flag, for last node: ~0L
 
-        /**
-         * BLAKE2s-256 for hashing.
-         */
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2sDigest"/>.
+        /// </summary>
         public Blake2sDigest()
             : this(256)
         {
         }
 
+        /// <summary>
+        /// Constructs a new instance of <see cref="Blake2sDigest"/> from another <see cref="Blake2sDigest"/>.
+        /// </summary>
+        /// <param name="digest">The original instance of <see cref="Blake2sDigest"/> that is copied.</param>
         public Blake2sDigest(Blake2sDigest digest)
         {
             this.bufferPos = digest.bufferPos;
@@ -160,11 +168,11 @@ public Blake2sDigest(Blake2sDigest digest)
             this.innerHashLength = digest.innerHashLength;
         }
 
-        /**
-         * BLAKE2s for hashing.
-         *
-         * @param digestBits the desired digest length in bits. Must be a multiple of 8 and less than 256.
-         */
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2sDigest"/> with a given digest size.
+        /// </summary>
+        /// <param name="digestBits">Digest size in bits.</param>
+        /// <exception cref="ArgumentException"></exception>
         public Blake2sDigest(int digestBits)
         {
             if (digestBits < 8 || digestBits > 256 || digestBits % 8 != 0)
@@ -175,33 +183,38 @@ public Blake2sDigest(int digestBits)
             Init(null, null, null);
         }
 
-        /**
-         * BLAKE2s for authentication ("Prefix-MAC mode").
-         * <p>
-         * After calling the doFinal() method, the key will remain to be used for
-         * further computations of this instance. The key can be overwritten using
-         * the clearKey() method.
-         *
-         * @param key a key up to 32 bytes or null
-         */
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2sDigest"/> with a key.
+        /// </summary>
+        /// <remarks>
+        /// Blake2s for authentication ("Prefix-MAC mode").
+        /// After calling the <see cref="DoFinal(byte[], int)"/> method, the key will
+        /// remain to be used for further computations of this instance.
+        /// The key can be cleared using the <see cref="ClearKey"/> method.
+        /// </remarks>
+        /// <param name="key">A key up to 32 bytes or null.</param>
+        /// <exception cref="ArgumentException"></exception>
         public Blake2sDigest(byte[] key)
         {
             Init(null, null, key);
         }
 
-        /**
-         * BLAKE2s with key, required digest length, salt and personalization.
-         * <p>
-         * After calling the doFinal() method, the key, the salt and the personal
-         * string will remain and might be used for further computations with this
-         * instance. The key can be overwritten using the clearKey() method, the
-         * salt (pepper) can be overwritten using the clearSalt() method.
-         *
-         * @param key a key up to 32 bytes or null
-         * @param digestBytes from 1 up to 32 bytes
-         * @param salt 8 bytes or null
-         * @param personalization 8 bytes or null
-         */
+        /// <summary>
+        /// Initializes a new instance of <see cref="Blake2sDigest"/> with a key, required digest length (in bytes), salt and personalization.
+        /// </summary>
+        /// <remarks>
+        /// After calling the <see cref="DoFinal(byte[], int)"/> method, the key, the salt and the personalization
+        /// will remain and might be used for further computations with this instance.
+        /// The key can be overwritten using the <see cref="ClearKey"/> method, the salt (pepper)
+        /// can be overwritten using the <see cref="ClearSalt"/> method.
+        /// </remarks>
+        /// <param name="key">A key up to 32 bytes or null.</param>
+        /// <param name="digestBytes">Digest length from 1 to 32 bytes.</param>
+        /// <param name="salt">A 8 bytes or null salt.</param>
+        /// <param name="personalization">A 8 bytes or null personalization.</param>
+        /// <exception cref="ArgumentException"></exception>
         public Blake2sDigest(byte[] key, int digestBytes, byte[] salt, byte[] personalization)
         {
             if (digestBytes < 1 || digestBytes > 32)
@@ -306,11 +319,7 @@ private void InitializeInternalState()
             internalState[15] = blake2s_IV[7];// ^ f1 with f1 = 0
         }
 
-        /**
-         * Update the message digest with a single byte.
-         *
-         * @param b the input byte to be entered.
-         */
+        /// <inheritdoc/>
         public void Update(byte b)
         {
             // process the buffer if full else add to buffer:
@@ -338,13 +347,7 @@ public void Update(byte b)
             }
         }
 
-        /**
-         * Update the message digest with a block of bytes.
-         *
-         * @param message the byte array containing the data.
-         * @param offset the offset into the byte array where the data starts.
-         * @param len the length of the data.
-         */
+        /// <inheritdoc/>
         public void BlockUpdate(byte[] message, int offset, int len)
         {
             if (message == null || len == 0)
@@ -404,6 +407,7 @@ public void BlockUpdate(byte[] message, int offset, int len)
         }
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        /// <inheritdoc/>
         public void BlockUpdate(ReadOnlySpan<byte> input)
         {
             if (input.IsEmpty)
@@ -458,13 +462,14 @@ public void BlockUpdate(ReadOnlySpan<byte> input)
         }
 #endif
 
-        /**
-         * Close the digest, producing the final digest value. The doFinal() call
-         * leaves the digest reset. Key, salt and personal string remain.
-         *
-         * @param out the array the digest is to be copied into.
-         * @param outOffset the offset into the out array the digest is to start at.
-         */
+        /// <summary>Close the digest, producing the final digest value.</summary>
+        /// <remarks>
+        /// The <see cref="DoFinal(byte[], int)"/> call leaves the digest reset.
+        /// Key, salt and personal string remain.
+        /// </remarks>
+        /// <param name="output">The byte array the digest is to be copied into.</param>
+        /// <param name="outOffset">The offset into the byte array the digest is to start at.</param>
+        /// <returns>The number of bytes written.</returns>
         public int DoFinal(byte[] output, int outOffset)
         {
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
@@ -500,6 +505,13 @@ public int DoFinal(byte[] output, int outOffset)
         }
 
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+        /// <summary>Close the digest, producing the final digest value.</summary>
+        /// <remarks>
+        /// The <see cref="DoFinal(Span{byte})"/> call leaves the digest reset.
+        /// Key, salt and personal string remain.
+        /// </remarks>
+        /// <param name="output">The span the digest is to be copied into.</param>
+        /// <returns>The number of bytes written.</returns>
         public int DoFinal(Span<byte> output)
         {
             f0 = 0xFFFFFFFFU;
@@ -531,10 +543,10 @@ public int DoFinal(Span<byte> output)
         }
 #endif
 
-        /**
-         * Reset the digest back to its initial state. The key, the salt and the
-         * personal string will remain for further computations.
-         */
+        /// <summary>
+        /// Reset the digest back to its initial state.
+        /// The key, the salt and the personalization will remain for further computations.
+        /// </summary>
         public void Reset()
         {
             bufferPos = 0;
@@ -632,37 +644,28 @@ private void G(uint m1, uint m2, int posA, int posB, int posC, int posD)
             internalState[posB] = Integers.RotateRight(internalState[posB] ^ internalState[posC], 7);
         }
 
-        /**
-         * Return the algorithm name.
-         *
-         * @return the algorithm name
-         */
+        /// <inheritdoc/>
         public string AlgorithmName => "BLAKE2s";
 
-        /**
-         * Return the size in bytes of the digest produced by this message digest.
-         *
-         * @return the size in bytes of the digest produced by this message digest.
-         */
+        /// <inheritdoc/>
         public int GetDigestSize()
         {
             return digestLength;
         }
 
-        /**
-         * Return the size in bytes of the internal buffer the digest applies its
-         * compression function to.
-         *
-         * @return byte length of the digest's internal buffer.
-         */
+        /// <summary>
+        /// Return the size in bytes of the internal buffer the digest applies its compression
+        /// function to.
+        /// </summary>
+        /// <returns>The byte length of the digest's internal buffer.</returns>
         public int GetByteLength()
         {
             return BLOCK_LENGTH_BYTES;
         }
 
-        /**
-         * Overwrite the key if it is no longer used (zeroization).
-         */
+        /// <summary>
+        /// Clears the key.
+        /// </summary>
         public void ClearKey()
        {
             if (key != null)
@@ -672,10 +675,9 @@ public void ClearKey()
             }
         }
 
-        /**
-         * Overwrite the salt (pepper) if it is secret and no longer used
-         * (zeroization).
-         */
+        /// <summary>
+        /// Clears the salt (pepper).
+        /// </summary>
         public void ClearSalt()
         {
             if (salt != null)
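Ahead of the Blake2xs diff below, a sketch of how its XOF entry points are meant to be used (seed is a caller-supplied byte[]; the digest length may also be left unknown via the parameterless constructor):

    var xof = new Blake2xsDigest(1000);        // fixed 1000-byte output length
    xof.BlockUpdate(seed, 0, seed.Length);
    byte[] okm = new byte[1000];
    xof.OutputFinal(okm, 0, okm.Length);       // finalizes and resets
    // Or stream in chunks: call Output(...) repeatedly, then Reset() when done.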

- * BLAKE2xs offers a built-in keying mechanism to be used directly - * for authentication ("Prefix-MAC") rather than a HMAC construction. - *

- * BLAKE2xs offers a built-in support for a salt for randomized hashing - * and a personal string for defining a unique hash function for each application. - *

- * BLAKE2xs is optimized for 32-bit platforms and produces digests of any size - * between 1 and 2^16-2 bytes. The length can also be unknown and then the maximum - * length will be 2^32 blocks of 32 bytes. - */ + ///

+ /// Implementation of the eXtendable Output Function (XOF) BLAKE2xs. + /// BLAKE2xs is optimized for 32-bit platforms and produces digests of any size + /// between 1 and 2^16-2 bytes. The length can also be unknown and then the maximum + /// length will be 2^32 blocks of 32 bytes. + /// + /// + /// + /// + /// BLAKE2xs offers a built-in keying mechanism to be used directly + /// for authentication ("Prefix-MAC") rather than a HMAC construction. + /// + /// + /// BLAKE2xs offers a built-in support for a salt for randomized hashing + /// and a personal string for defining a unique hash function for each application. + /// + /// public sealed class Blake2xsDigest : IXof { @@ -78,43 +82,48 @@ public sealed class Blake2xsDigest */ private long nodeOffset; - /** - * BLAKE2xs for hashing with unknown digest length - */ + /// + /// Initializes a new instance of for hashing an unknown digest length. + /// public Blake2xsDigest() : this(UnknownDigestLength) { } - /** - * BLAKE2xs for hashing - * - * @param digestBytes The desired digest length in bytes. Must be above 1 and less than 2^16-1 - */ + /// + /// Initializes a new instance of with a given digest size. + /// + /// The desired digest length in bytes. Must be above 1 and less than 2^16-1. public Blake2xsDigest(int digestBytes) : this(digestBytes, null, null, null) { } - /** - * BLAKE2xs with key - * - * @param digestBytes The desired digest length in bytes. Must be above 1 and less than 2^16-1 - * @param key A key up to 32 bytes or null - */ + /// + /// + /// Initializes a new instance of with a key and given digest length. + /// + /// After calling the method, the key will + /// remain to be used for further computations of this instance. + /// + /// The desired digest length in bytes. Must be above 1 and less than 2^16-1. + /// A key up to 32 bytes or null. public Blake2xsDigest(int digestBytes, byte[] key) : this(digestBytes, key, null, null) { } - /** - * BLAKE2xs with key, salt and personalization - * - * @param digestBytes The desired digest length in bytes. Must be above 1 and less than 2^16-1 - * @param key A key up to 32 bytes or null - * @param salt 8 bytes or null - * @param personalization 8 bytes or null - */ + + /// + /// + /// Initializes a new instance of with a key, required digest length (in bytes), salt and personalization. + /// + /// + /// The desired digest length in bytes. Must be above 1 and less than 2^16-1. + /// A key up to 32 bytes or null. + /// A 8 bytes or null salt. + /// A 8 bytes or null personalization. + /// public Blake2xsDigest(int digestBytes, byte[] key, byte[] salt, byte[] personalization) { if (digestBytes < 1 || digestBytes > UnknownDigestLength) @@ -125,6 +134,10 @@ public Blake2xsDigest(int digestBytes, byte[] key, byte[] salt, byte[] personali hash = new Blake2sDigest(DigestLength, key, salt, personalization, nodeOffset); } + /// + /// Constructs a new instance of from another ./>. + /// + /// The original instance of that is copied. public Blake2xsDigest(Blake2xsDigest digest) { digestLength = digest.digestLength; @@ -137,72 +150,53 @@ public Blake2xsDigest(Blake2xsDigest digest) nodeOffset = digest.nodeOffset; } - /** - * Return the algorithm name. - * - * @return the algorithm name - */ + /// public string AlgorithmName => "BLAKE2xs"; - /** - * Return the size in bytes of the digest produced by this message digest. - * - * @return the size in bytes of the digest produced by this message digest. 
-    /**
-     * Return the size in bytes of the digest produced by this message digest.
-     *
-     * @return the size in bytes of the digest produced by this message digest.
-     */
+    /// <inheritdoc />
     public int GetDigestSize() => digestLength;

-    /**
-     * Return the size in bytes of the internal buffer the digest applies its
-     * compression function to.
-     *
-     * @return byte length of the digest's internal buffer.
-     */
+    /// <summary>
+    /// Return the size in bytes of the internal buffer the digest applies its compression
+    /// function to.
+    /// </summary>
+    /// <returns>The byte length of the digest's internal buffer.</returns>
     public int GetByteLength() => hash.GetByteLength();

-    /**
-     * Return the maximum size in bytes the digest can produce when the length
-     * is unknown
-     *
-     * @return byte length of the largest digest with unknown length
-     */
+    /// <summary>
+    /// Return the maximum size in bytes the digest can produce when the length
+    /// is unknown.
+    /// </summary>
+    /// <returns>The byte length of the largest digest with unknown length.</returns>
     public long GetUnknownMaxLength()
     {
         return MaxNumberBlocks * DigestLength;
     }

-    /**
-     * Update the message digest with a single byte.
-     *
-     * @param in the input byte to be entered.
-     */
+    /// <inheritdoc />
     public void Update(byte b)
     {
         hash.Update(b);
     }

-    /**
-     * Update the message digest with a block of bytes.
-     *
-     * @param in the byte array containing the data.
-     * @param inOff the offset into the byte array where the data starts.
-     * @param len the length of the data.
-     */
+    /// <inheritdoc />
     public void BlockUpdate(byte[] input, int inOff, int inLen)
     {
         hash.BlockUpdate(input, inOff, inLen);
     }

 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+    /// <inheritdoc />
     public void BlockUpdate(ReadOnlySpan<byte> input)
     {
         hash.BlockUpdate(input);
     }
 #endif

-    /**
-     * Reset the digest back to its initial state. The key, the salt and the
-     * personal string will remain for further computations.
-     */
+    /// <summary>
+    /// Reset the digest back to its initial state.
+    /// The key, the salt and the personalization will remain for further computations.
+    /// </summary>
     public void Reset()
     {
         hash.Reset();
@@ -214,26 +208,28 @@ public void Reset()
         nodeOffset = ComputeNodeOffset();
     }

-    /**
-     * Close the digest, producing the final digest value. The doFinal() call
-     * leaves the digest reset. Key, salt and personal string remain.
-     *
-     * @param out the array the digest is to be copied into.
-     * @param outOffset the offset into the out array the digest is to start at.
-     */
+    /// <summary>Close the digest, producing the final digest value.</summary>
+    /// <remarks>
+    /// The <see cref="DoFinal(byte[], int)"/> call leaves the digest reset.
+    /// Key, salt and personal string remain.
+    /// </remarks>
+    /// <param name="output">The byte array the digest is to be copied into.</param>
+    /// <param name="outOff">The offset into the byte array the digest is to start at.</param>
+    /// <returns>The number of bytes written.</returns>
     public int DoFinal(byte[] output, int outOff)
     {
         return OutputFinal(output, outOff, digestLength);
     }
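
A second, equally hypothetical sketch of the XOF semantics documented around DoFinal and the Output/OutputFinal members that follow: DoFinal produces exactly GetDigestSize() bytes and resets, while repeated Output calls keep streaming from the same output sequence until OutputFinal or Reset is called. The class name Blake2xsXofExample and the seed bytes are assumptions for illustration.

using System;
using Org.BouncyCastle.Crypto.Digests;

internal static class Blake2xsXofExample
{
    internal static void Run()
    {
        // Unknown digest length: output can be streamed in chunks.
        var xof = new Blake2xsDigest();

        byte[] seed = { 1, 2, 3, 4 };
        xof.BlockUpdate(seed, 0, seed.Length);

        // Each Output call continues the output stream rather than
        // restarting it, until the Xof is explicitly reset.
        byte[] chunk = new byte[32];
        xof.Output(chunk, 0, chunk.Length);
        Console.WriteLine("chunk 1: " + BitConverter.ToString(chunk));
        xof.Output(chunk, 0, chunk.Length);
        Console.WriteLine("chunk 2: " + BitConverter.ToString(chunk));

        // OutputFinal writes the requested bytes and then resets the
        // digest; key, salt and personalization are retained.
        xof.OutputFinal(chunk, 0, chunk.Length);
    }
}
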
-    /**
-     * Close the digest, producing the final digest value. The doFinal() call
-     * leaves the digest reset. Key, salt, personal string remain.
-     *
-     * @param out output array to write the output bytes to.
-     * @param outOff offset to start writing the bytes at.
-     * @param outLen the number of output bytes requested.
-     */
+    /// <summary>Close the digest, producing the final digest value.</summary>
+    /// <remarks>
+    /// The <see cref="OutputFinal(byte[], int, int)"/> call leaves the digest reset.
+    /// Key, salt and personal string remain.
+    /// </remarks>
+    /// <param name="output">The output array to write the output bytes to.</param>
+    /// <param name="outOff">The offset to start writing the bytes at.</param>
+    /// <param name="outLen">The number of output bytes requested.</param>
+    /// <returns>The number of bytes written.</returns>
     public int OutputFinal(byte[] output, int outOff, int outLen)
     {
         int ret = Output(output, outOff, outLen);
@@ -243,15 +239,14 @@ public int OutputFinal(byte[] output, int outOff, int outLen)
         return ret;
     }

-    /**
-     * Start outputting the results of the final calculation for this digest. Unlike doFinal, this method
-     * will continue producing output until the Xof is explicitly reset, or signals otherwise.
-     *
-     * @param out output array to write the output bytes to.
-     * @param outOff offset to start writing the bytes at.
-     * @param outLen the number of output bytes requested.
-     * @return the number of bytes written
-     */
+    /// <summary>
+    /// Start outputting the results of the final calculation for this digest. Unlike <see cref="DoFinal(byte[], int)"/>,
+    /// this method will continue producing output until the Xof is explicitly reset, or signals otherwise.
+    /// </summary>
+    /// <param name="output">The output array to write the output bytes to.</param>
+    /// <param name="outOff">The offset to start writing the bytes at.</param>
+    /// <param name="outLen">The number of output bytes requested.</param>
+    /// <returns>The number of bytes written.</returns>
     public int Output(byte[] output, int outOff, int outLen)
     {
 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
@@ -296,11 +291,25 @@ public int Output(byte[] output, int outOff, int outLen)
     }

 #if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
+    /// <summary>Close the digest, producing the final digest value.</summary>
+    /// <remarks>
+    /// The <see cref="DoFinal(Span{byte})"/> call leaves the digest reset.
+    /// Key, salt and personal string remain.
+    /// </remarks>
+    /// <param name="output">The output span to write the output bytes to.</param>
+    /// <returns>The number of bytes written.</returns>
     public int DoFinal(Span<byte> output)
     {
         return OutputFinal(output[..digestLength]);
     }

+    /// <summary>Close the digest, producing the final digest value.</summary>
+    /// <remarks>
+    /// The <see cref="OutputFinal(Span{byte})"/> call leaves the digest reset.
+    /// Key, salt and personal string remain.
+    /// </remarks>
+    /// <param name="output">The output span to write the output bytes to.</param>
+    /// <returns>The number of bytes written.</returns>
     public int OutputFinal(Span<byte> output)
     {
         int ret = Output(output);
@@ -310,6 +319,12 @@ public int OutputFinal(Span<byte> output)
         return ret;
     }

+    /// <summary>
+    /// Start outputting the results of the final calculation for this digest. Unlike <see cref="DoFinal(Span{byte})"/>,
+    /// this method will continue producing output until the Xof is explicitly reset, or signals otherwise.
+    /// </summary>
+    /// <param name="output">The output span to write the output bytes to.</param>
+    /// <returns>The number of bytes written.</returns>
     public int Output(Span<byte> output)
     {
         int outLen = output.Length;