@@ -60,12 +60,30 @@ where
60
60
#[ cfg( all( target_feature = "avx2" , not( target_feature = "avx512vbmi" ) ) ) ]
61
61
32 => transize ( avx2_pshufb, self , idxs) ,
62
62
#[ cfg( all( target_feature = "avx512vl" , target_feature = "avx512vbmi" ) ) ]
63
- 32 => transize ( x86:: _mm256_permutexvar_epi8, zeroing_idxs ( idxs) , self ) ,
64
- // Notable absence: avx512bw shuffle
65
- // If avx512bw is available, odds of avx512vbmi are good
66
- // FIXME: initial AVX512VBMI variant didn't actually pass muster
67
- // #[cfg(target_feature = "avx512vbmi")]
68
- // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
63
+ 32 => {
64
+ // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
65
+ let swizzler = |bytes, idxs| {
66
+ let mask = x86:: _mm256_cmp_epu8_mask :: < { x86:: _MM_CMPINT_LT } > (
67
+ idxs,
68
+ Simd :: < u8 , 32 > :: splat ( N as u8 ) . into ( ) ,
69
+ ) ;
70
+ x86:: _mm256_maskz_permutexvar_epi8 ( mask, idxs, bytes)
71
+ } ;
72
+ transize ( swizzler, self , idxs)
73
+ }
74
+ // Notable absence: avx512bw pshufb shuffle
75
+ #[ cfg( all( target_feature = "avx512vl" , target_feature = "avx512vbmi" ) ) ]
76
+ 64 => {
77
+ // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
78
+ let swizzler = |bytes, idxs| {
79
+ let mask = x86:: _mm512_cmp_epu8_mask :: < { x86:: _MM_CMPINT_LT } > (
80
+ idxs,
81
+ Simd :: < u8 , 64 > :: splat ( N as u8 ) . into ( ) ,
82
+ ) ;
83
+ x86:: _mm512_maskz_permutexvar_epi8 ( mask, idxs, bytes)
84
+ } ;
85
+ transize ( swizzler, self , idxs)
86
+ }
69
87
_ => {
70
88
let mut array = [ 0 ; N ] ;
71
89
for ( i, k) in idxs. to_array ( ) . into_iter ( ) . enumerate ( ) {
0 commit comments