@@ -68,7 +68,18 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Sp
68
68
69
69
if ( maxSrcLength >= 24 )
70
70
{
71
- byte * end = srcMax - 45 ;
71
+ byte * end = srcMax - 88 ;
72
+ if ( Vector512 . IsHardwareAccelerated && Avx512Vbmi . IsSupported && ( end >= src ) )
73
+ {
74
+ Avx512Decode ( ref src , ref dest , end , maxSrcLength , destLength , srcBytes , destBytes ) ;
75
+
76
+ if ( src == srcEnd )
77
+ {
78
+ goto DoneExit ;
79
+ }
80
+ }
81
+
82
+ end = srcMax - 45 ;
72
83
if ( Avx2 . IsSupported && ( end >= src ) )
73
84
{
74
85
Avx2Decode ( ref src , ref dest , end , maxSrcLength , destLength , srcBytes , destBytes ) ;
@@ -616,6 +627,78 @@ private static OperationStatus DecodeWithWhiteSpaceFromUtf8InPlace(Span<byte> ut
616
627
return status ;
617
628
}
618
629
630
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
631
+ [ CompExactlyDependsOn ( typeof ( Avx512BW ) ) ]
632
+ [ CompExactlyDependsOn ( typeof ( Avx512Vbmi ) ) ]
633
+ private static unsafe void Avx512Decode ( ref byte * srcBytes , ref byte * destBytes , byte * srcEnd , int sourceLength , int destLength , byte * srcStart , byte * destStart )
634
+ {
635
+ // Reference for the VBMI implementation: https://github.com/WojciechMula/base64simd/tree/master/decode
636
+ // If we have AVX512 support, pick off 64 bytes of input at a time for as long as we can,
637
+ // but make sure that we quit before seeing any == markers at the end of the
638
+ // string. Also, because each 64-byte store runs 16 bytes past the 48 decoded bytes, ensure
639
+ // that there are at least 22 valid bytes of input data remaining to close the
640
+ // gap. 64 + 2 + 22 = 88 bytes.
641
+ byte * src = srcBytes ;
642
+ byte * dest = destBytes ;
643
+
644
+ // The JIT won't hoist these "constants", so help it
645
+ Vector512 < sbyte > vbmiLookup0 = Vector512 . Create (
646
+ 0x80808080 , 0x80808080 , 0x80808080 , 0x80808080 ,
647
+ 0x80808080 , 0x80808080 , 0x80808080 , 0x80808080 ,
648
+ 0x80808080 , 0x80808080 , 0x3e808080 , 0x3f808080 ,
649
+ 0x37363534 , 0x3b3a3938 , 0x80803d3c , 0x80808080 ) . AsSByte ( ) ;
650
+ Vector512 < sbyte > vbmiLookup1 = Vector512 . Create (
651
+ 0x02010080 , 0x06050403 , 0x0a090807 , 0x0e0d0c0b ,
652
+ 0x1211100f , 0x16151413 , 0x80191817 , 0x80808080 ,
653
+ 0x1c1b1a80 , 0x201f1e1d , 0x24232221 , 0x28272625 ,
654
+ 0x2c2b2a29 , 0x302f2e2d , 0x80333231 , 0x80808080 ) . AsSByte ( ) ;
655
+ Vector512 < byte > vbmiPackedLanesControl = Vector512 . Create (
656
+ 0x06000102 , 0x090a0405 , 0x0c0d0e08 , 0x16101112 ,
657
+ 0x191a1415 , 0x1c1d1e18 , 0x26202122 , 0x292a2425 ,
658
+ 0x2c2d2e28 , 0x36303132 , 0x393a3435 , 0x3c3d3e38 ,
659
+ 0x00000000 , 0x00000000 , 0x00000000 , 0x00000000 ) . AsByte ( ) ;
660
+
661
+ Vector512 < sbyte > mergeConstant0 = Vector512 . Create ( 0x01400140 ) . AsSByte ( ) ;
662
+ Vector512 < short > mergeConstant1 = Vector512 . Create ( 0x00011000 ) . AsInt16 ( ) ;
663
+
664
+ // This algorithm requires AVX512VBMI support.
665
+ // Vbmi was first introduced in CannonLake and is available from IceLake on.
666
+ do
667
+ {
668
+ AssertRead < Vector512 < sbyte > > ( src , srcStart , sourceLength ) ;
669
+ Vector512 < sbyte > str = Vector512 . Load ( src ) . AsSByte ( ) ;
670
+
671
+ // Step 1: Translate encoded Base64 input to their original indices
672
+ // This step also checks for invalid inputs and exits the loop without consuming the offending 64-byte chunk.
673
+ // After this, we have indices which are verified to have upper 2 bits set to 0 in each byte.
674
+ // origIndex = [...|00dddddd|00cccccc|00bbbbbb|00aaaaaa]
675
+ Vector512 < sbyte > origIndex = Avx512Vbmi . PermuteVar64x8x2 ( vbmiLookup0 , str , vbmiLookup1 ) ;
676
+ Vector512 < sbyte > errorVec = ( origIndex . AsInt32 ( ) | str . AsInt32 ( ) ) . AsSByte ( ) ;
677
+ if ( errorVec . ExtractMostSignificantBits ( ) != 0 )
678
+ {
679
+ break ;
680
+ }
681
+
682
+ // Step 2: Now we need to reshuffle bits to remove the 0 bits.
683
+ // multiAdd1: [...|0000cccc|ccdddddd|0000aaaa|aabbbbbb]
684
+ Vector512 < short > multiAdd1 = Avx512BW . MultiplyAddAdjacent ( origIndex . AsByte ( ) , mergeConstant0 ) ;
685
+ // multiAdd2: [...|00000000|aaaaaabb|bbbbcccc|ccdddddd]
686
+ Vector512 < int > multiAdd2 = Avx512BW . MultiplyAddAdjacent ( multiAdd1 , mergeConstant1 ) ;
687
+
688
+ // Step 3: Pack the three data bytes of each 32-bit element into 48 contiguous bytes
689
+ str = Avx512Vbmi . PermuteVar64x8 ( multiAdd2 . AsByte ( ) , vbmiPackedLanesControl ) . AsSByte ( ) ;
690
+
691
+ AssertWrite < Vector512 < sbyte > > ( dest , destStart , destLength ) ;
692
+ str . Store ( ( sbyte * ) dest ) ;
693
+ src += 64 ;
694
+ dest += 48 ;
695
+ }
696
+ while ( src <= srcEnd ) ;
697
+
698
+ srcBytes = src ;
699
+ destBytes = dest ;
700
+ }
701
+
619
702
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
620
703
[ CompExactlyDependsOn ( typeof ( Avx2 ) ) ]
621
704
private static unsafe void Avx2Decode ( ref byte * srcBytes , ref byte * destBytes , byte * srcEnd , int sourceLength , int destLength , byte * srcStart , byte * destStart )
0 commit comments