Skip to content

Commit f6519c5

Browse files
Merge pull request #431 from cvijdea-bd/fix-swizzle-dyn-vbmi
Fix avx512vbmi swizzle_dyn implementation
2 parents 4697d39 + d5abbfa commit f6519c5

File tree

1 file changed

+24
-6
lines changed

1 file changed

+24
-6
lines changed

crates/core_simd/src/swizzle_dyn.rs

+24 -6
Original file line number | Diff line number | Diff line change
@@ -60,12 +60,30 @@ where
6060
#[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
6161
32 => transize(avx2_pshufb, self, idxs),
6262
#[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
63-
32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self),
64-
// Notable absence: avx512bw shuffle
65-
// If avx512bw is available, odds of avx512vbmi are good
66-
// FIXME: initial AVX512VBMI variant didn't actually pass muster
67-
// #[cfg(target_feature = "avx512vbmi")]
68-
// 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
63+
32 => {
64+
// Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
65+
let swizzler = |bytes, idxs| {
66+
let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
67+
idxs,
68+
Simd::<u8, 32>::splat(N as u8).into(),
69+
);
70+
x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
71+
};
72+
transize(swizzler, self, idxs)
73+
}
74+
// Notable absence: avx512bw pshufb shuffle
75+
#[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
76+
64 => {
77+
// Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
78+
let swizzler = |bytes, idxs| {
79+
let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
80+
idxs,
81+
Simd::<u8, 64>::splat(N as u8).into(),
82+
);
83+
x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
84+
};
85+
transize(swizzler, self, idxs)
86+
}
6987
_ => {
7088
let mut array = [0; N];
7189
for (i, k) in idxs.to_array().into_iter().enumerate() {

0 commit comments

Comments
 (0)