Skip to content

Commit 9ad24ae

Browse files
Adding AVX512 path to Base64 encoding/Decoding (dotnet#92241)
* Adding AVX512 path to Base64 encoding/Decoding * Addressing review Comments. Signed-off-by: Deepak Rajendrakumaran <[email protected]> * Removing fallback path. * Updating Third Party Notice. * Addressing review comments --------- Signed-off-by: Deepak Rajendrakumaran <[email protected]>
1 parent 3cd6455 commit 9ad24ae

File tree

3 files changed

+202
-2
lines changed

3 files changed

+202
-2
lines changed

THIRD-PARTY-NOTICES.TXT

+34
Original file line numberDiff line numberDiff line change
@@ -1297,3 +1297,37 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
12971297
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
12981298
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
12991299
THE SOFTWARE.
1300+
1301+
License notice for Avx512Vbmi base64 encoding / decoding
1302+
--------------------------------------------------------
1303+
1304+
Copyright (c) 2015-2018, Wojciech Muła
1305+
All rights reserved.
1306+
1307+
Redistribution and use in source and binary forms, with or without
1308+
modification, are permitted provided that the following conditions are
1309+
met:
1310+
1311+
1. Redistributions of source code must retain the above copyright
1312+
notice, this list of conditions and the following disclaimer.
1313+
1314+
2. Redistributions in binary form must reproduce the above copyright
1315+
notice, this list of conditions and the following disclaimer in the
1316+
documentation and/or other materials provided with the distribution.
1317+
1318+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
1319+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
1320+
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
1321+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
1322+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
1323+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
1324+
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
1325+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
1326+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1327+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1328+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1329+
1330+
--------------------------------------------------------
1331+
1332+
Aspects of base64 encoding / decoding are based on algorithm described in "Base64 encoding and decoding at almost the speed of a memory
1333+
copy", Wojciech Muła and Daniel Lemire. https://arxiv.org/pdf/1910.05109.pdf

src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs

+84-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,18 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Sp
6868

6969
if (maxSrcLength >= 24)
7070
{
71-
byte* end = srcMax - 45;
71+
byte* end = srcMax - 88;
72+
if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && (end >= src))
73+
{
74+
Avx512Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
75+
76+
if (src == srcEnd)
77+
{
78+
goto DoneExit;
79+
}
80+
}
81+
82+
end = srcMax - 45;
7283
if (Avx2.IsSupported && (end >= src))
7384
{
7485
Avx2Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
@@ -616,6 +627,78 @@ private static OperationStatus DecodeWithWhiteSpaceFromUtf8InPlace(Span<byte> ut
616627
return status;
617628
}
618629

630+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
[CompExactlyDependsOn(typeof(Avx512Vbmi))]
private static unsafe void Avx512Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
    // Reference for VBMI implementation : https://github.com/WojciechMula/base64simd/tree/master/decode
    // If we have AVX512 support, pick off 64 bytes at a time for as long as we can,
    // but make sure that we quit before seeing any == markers at the end of the
    // string. Also, because we write 16 zeroes at the end of the output, ensure
    // that there are at least 22 valid bytes of input data remaining to close the
    // gap. 64 + 2 + 22 = 88 bytes.
    byte* src = srcBytes;
    byte* dest = destBytes;

    // The JIT won't hoist these "constants", so help it
    Vector512<sbyte> vbmiLookup0 = Vector512.Create(
        0x80808080, 0x80808080, 0x80808080, 0x80808080,
        0x80808080, 0x80808080, 0x80808080, 0x80808080,
        0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
        0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080).AsSByte();
    Vector512<sbyte> vbmiLookup1 = Vector512.Create(
        0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
        0x1211100f, 0x16151413, 0x80191817, 0x80808080,
        0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
        0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080).AsSByte();
    Vector512<byte> vbmiPackedLanesControl = Vector512.Create(
        0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
        0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
        0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
        0x00000000, 0x00000000, 0x00000000, 0x00000000).AsByte();

    Vector512<sbyte> mergeConstant0 = Vector512.Create(0x01400140).AsSByte();
    Vector512<short> mergeConstant1 = Vector512.Create(0x00011000).AsInt16();

    // This algorithm requires AVX512VBMI support.
    // Vbmi was first introduced in CannonLake and is available from IceLake on.
    do
    {
        AssertRead<Vector512<sbyte>>(src, srcStart, sourceLength);
        Vector512<sbyte> str = Vector512.Load(src).AsSByte();

        // Step 1: Translate encoded Base64 input to their original indices.
        // This step also checks for invalid inputs and exits.
        // After this, we have indices which are verified to have upper 2 bits set to 0 in each byte.
        // origIndex = [...|00dddddd|00cccccc|00bbbbbb|00aaaaaa]
        // Invalid input characters map to a lookup entry with the high bit set; OR-ing with the
        // raw input also catches bytes >= 0x80, so a single sign-bit scan detects both cases.
        Vector512<sbyte> origIndex = Avx512Vbmi.PermuteVar64x8x2(vbmiLookup0, str, vbmiLookup1);
        Vector512<sbyte> errorVec = (origIndex.AsInt32() | str.AsInt32()).AsSByte();
        if (errorVec.ExtractMostSignificantBits() != 0)
        {
            break;
        }

        // Step 2: Now we need to reshuffle bits to remove the 0 bits.
        // multiAdd1: [...|0000cccc|ccdddddd|0000aaaa|aabbbbbb]
        Vector512<short> multiAdd1 = Avx512BW.MultiplyAddAdjacent(origIndex.AsByte(), mergeConstant0);
        // multiAdd2: [...|00000000|aaaaaabb|bbbbcccc|ccdddddd]
        Vector512<int> multiAdd2 = Avx512BW.MultiplyAddAdjacent(multiAdd1, mergeConstant1);

        // Step 3: Pack the 48 payload bytes (discarding the zero byte of each dword).
        str = Avx512Vbmi.PermuteVar64x8(multiAdd2.AsByte(), vbmiPackedLanesControl).AsSByte();

        AssertWrite<Vector512<sbyte>>(dest, destStart, destLength);
        str.Store((sbyte*)dest);
        src += 64;
        dest += 48;
    }
    while (src <= srcEnd);

    srcBytes = src;
    destBytes = dest;
}
701+
619702
[MethodImpl(MethodImplOptions.AggressiveInlining)]
620703
[CompExactlyDependsOn(typeof(Avx2))]
621704
private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)

src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Encoder.cs

+84-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,16 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan<byte> bytes, Span
6767

6868
if (maxSrcLength >= 16)
6969
{
70-
byte* end = srcMax - 32;
70+
byte* end = srcMax - 64;
71+
if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && (end >= src))
72+
{
73+
Avx512Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
74+
75+
if (src == srcEnd)
76+
goto DoneExit;
77+
}
78+
79+
end = srcMax - 64;
7180
if (Avx2.IsSupported && (end >= src))
7281
{
7382
Avx2Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
@@ -226,6 +235,80 @@ public static unsafe OperationStatus EncodeToUtf8InPlace(Span<byte> buffer, int
226235
}
227236
}
228237

238+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(Avx512BW))]
[CompExactlyDependsOn(typeof(Avx512Vbmi))]
private static unsafe void Avx512Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
{
    // Reference for VBMI implementation : https://github.com/WojciechMula/base64simd/tree/master/encode
    // If we have AVX512 support, pick off 48 bytes at a time for as long as we can.
    // But because we read 64 bytes at a time, ensure we have enough room to do a
    // full 64-byte read without segfaulting.

    byte* src = srcBytes;
    byte* dest = destBytes;

    // The JIT won't hoist these "constants", so help it
    Vector512<sbyte> shuffleVecVbmi = Vector512.Create(
        0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
        0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
        0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
        0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e).AsSByte();
    Vector512<sbyte> vbmiLookup = Vector512.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"u8).AsSByte();

    Vector512<ushort> maskAC = Vector512.Create((uint)0x0fc0fc00).AsUInt16();
    Vector512<uint> maskBB = Vector512.Create((uint)0x3f003f00);
    Vector512<ushort> shiftAC = Vector512.Create((uint)0x0006000a).AsUInt16();
    Vector512<ushort> shiftBB = Vector512.Create((uint)0x00080004).AsUInt16();

    AssertRead<Vector256<sbyte>>(src, srcStart, sourceLength);

    // This algorithm requires AVX512VBMI support.
    // Vbmi was first introduced in CannonLake and is available from IceLake on.

    // str = [...|PONM|LKJI|HGFE|DCBA]
    Vector512<sbyte> str = Vector512.Load(src).AsSByte();

    while (true)
    {
        // Step 1 : Split 48 bytes into 64 bytes with each byte using 6-bits from input
        // str = [...|KLJK|HIGH|EFDE|BCAB]
        str = Avx512Vbmi.PermuteVar64x8(str, shuffleVecVbmi);

        // TODO: This can be achieved faster with multishift.
        // Consider the first 4 bytes - BCAB
        // temp1 = [...|0000cccc|cc000000|aaaaaa00|00000000]
        Vector512<ushort> temp1 = (str.AsUInt16() & maskAC);

        // temp2 = [...|00000000|00cccccc|00000000|00aaaaaa]
        Vector512<ushort> temp2 = Avx512BW.ShiftRightLogicalVariable(temp1, shiftAC).AsUInt16();

        // temp3 = [...|ccdddddd|00000000|aabbbbbb|cccc0000]
        Vector512<ushort> temp3 = Avx512BW.ShiftLeftLogicalVariable(str.AsUInt16(), shiftBB).AsUInt16();

        // str = [...|00dddddd|00cccccc|00bbbbbb|00aaaaaa]
        str = Vector512.ConditionalSelect(maskBB, temp3.AsUInt32(), temp2.AsUInt32()).AsSByte();

        // Step 2: Now we have the indices calculated. Next step is to use these indices to translate.
        str = Avx512Vbmi.PermuteVar64x8(vbmiLookup, str);

        AssertWrite<Vector512<sbyte>>(dest, destStart, destLength);
        str.Store((sbyte*)dest);

        src += 48;
        dest += 64;

        if (src > srcEnd)
            break;

        AssertRead<Vector512<sbyte>>(src, srcStart, sourceLength);
        str = Vector512.Load(src).AsSByte();
    }

    srcBytes = src;
    destBytes = dest;
}
311+
229312
[MethodImpl(MethodImplOptions.AggressiveInlining)]
230313
[CompExactlyDependsOn(typeof(Avx2))]
231314
private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)

0 commit comments

Comments
 (0)