Commit cf01aba

AVX512FP16 Part 9: Remaining avx512fp16 and avxneconvert
1 parent 57641cc commit cf01aba


4 files changed: 377 additions (+), 43 deletions (-)

Diff for: crates/core_arch/missing-x86.md

18 deletions (-)
@@ -53,15 +53,8 @@
 
 <details><summary>["AVX512_FP16"]</summary><p>
 
-* [ ] [`_mm256_cvtsh_h`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
 * [ ] [`_mm256_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pch)
-* [ ] [`_mm512_cmp_round_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
-* [ ] [`_mm512_cvtsh_h`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
-* [ ] [`_mm512_mask_cmp_round_ph_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
 * [ ] [`_mm512_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_pch)
-* [ ] [`_mm_cvtsh_h`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
-* [ ] [`_mm_cvtsi128_si16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
-* [ ] [`_mm_cvtsi16_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
 * [ ] [`_mm_set1_pch`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pch)
 </p></details>
 
@@ -82,17 +75,6 @@
 </p></details>
 
 
-<details><summary>["AVX_NE_CONVERT"]</summary><p>
-
-* [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
-* [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
-* [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
-* [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
-* [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
-* [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
-</p></details>
-
-
 <details><summary>["CET_SS"]</summary><p>
 
 * [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy)

Diff for: crates/core_arch/src/x86/avx512fp16.rs

220 additions (+), 25 deletions (-)
@@ -596,6 +596,25 @@ pub unsafe fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
     )
 }
 
+/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
+/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
+/// any instructions.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
+    simd_shuffle!(
+        a,
+        _mm256_setzero_ph(),
+        [
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16
+        ]
+    )
+}
+
 /// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
 /// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
 /// any instructions.
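For readers of the hunk above: the shuffle builds the 32-lane result by taking lanes 0..16 from `a` and filling lanes 16..32 from index 16, i.e. the first lane of `_mm256_setzero_ph()`, which is what guarantees the zeroed upper half. A minimal usage sketch, not part of this commit, assuming a nightly toolchain with the `stdarch_x86_avx512_f16` feature and an `avx512fp16`-capable target:

```rust
#![feature(stdarch_x86_avx512_f16)]

use std::arch::x86_64::*;

/// Hypothetical helper: widen a 16-lane f16 vector to 32 lanes,
/// with lanes 16..32 guaranteed to be zero.
#[target_feature(enable = "avx512fp16")]
unsafe fn widen_ph(v: __m256h) -> __m512h {
    _mm512_zextph256_ph512(v)
}
```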
@@ -615,10 +634,10 @@ pub unsafe fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
     )
 }
 
-macro_rules! cmp_asm {
+macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
     ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
         let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
             "vcmpph {k}, {a}, {b}, {imm8}",
             k = lateout(kreg) dst,
             a = in($reg) $a,
@@ -630,7 +649,7 @@ macro_rules! cmp_asm {
     }};
     ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
         let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
             "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
             k = lateout(kreg) dst,
             mask = in(kreg) $mask,
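Both `cmp_asm!` arms lower to a single `vcmpph`, with the comparison predicate passed as the `imm8` immediate and the result written to a `k` mask register; the new FIXME records that this should eventually go through LLVM intrinsics rather than inline asm. A rough caller-side sketch of the mask intrinsics built on this macro, not from this commit, assuming nightly with `stdarch_x86_avx512_f16` and the `f16` type:

```rust
#![feature(stdarch_x86_avx512_f16, f16)]

use std::arch::x86_64::*;

/// Hypothetical helper: count how many lanes of `a` compare strictly
/// less-than the matching lane of `b` (ordered, non-signaling).
#[target_feature(enable = "avx512fp16,avx512bw")]
unsafe fn lanes_lt(a: __m512h, b: __m512h) -> u32 {
    // _CMP_LT_OQ uses the same imm8 predicate encoding as vcmpps/vcmppd.
    let k: __mmask32 = _mm512_cmp_ph_mask::<_CMP_LT_OQ>(a, b);
    k.count_ones()
}
```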
@@ -736,6 +755,73 @@ pub unsafe fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
     cmp_asm!(__mmask32, k1, zmm_reg, a, b)
 }
 
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    static_assert_sae!(SAE);
+    if SAE == _MM_FROUND_NO_EXC {
+        let dst: __mmask32;
+        asm!(
+            "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
+            k = lateout(kreg) dst,
+            a = in(zmm_reg) a,
+            b = in(zmm_reg) b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    } else {
+        cmp_asm!(__mmask32, zmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    static_assert_sae!(SAE);
+    if SAE == _MM_FROUND_NO_EXC {
+        let dst: __mmask32;
+        asm!(
+            "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
+            k = lateout(kreg) dst,
+            k1 = in(kreg) k1,
+            a = in(zmm_reg) a,
+            b = in(zmm_reg) b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    } else {
+        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
+    }
+}
+
 /// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
 /// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
 /// passing _MM_FROUND_NO_EXC in the sae parameter.
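When `SAE == _MM_FROUND_NO_EXC`, the two new functions emit the `{sae}` form of `vcmpph`, so NaN operands do not raise floating-point exceptions; any other `SAE` value falls back to the plain `cmp_asm!` expansion. A hedged usage sketch, not from this commit, assuming nightly with `stdarch_x86_avx512_f16`:

```rust
#![feature(stdarch_x86_avx512_f16)]

use std::arch::x86_64::*;

/// Hypothetical helper: equality mask with exceptions suppressed, further
/// filtered by the zeromask `k1` (lanes with a clear bit in `k1` stay 0).
#[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
unsafe fn eq_mask_no_exc(k1: __mmask32, a: __m512h, b: __m512h) -> __mmask32 {
    _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(k1, a, b)
}
```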
@@ -803,25 +889,6 @@ pub unsafe fn _mm_mask_cmp_sh_mask<const IMM5: i32>(
     _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
 }
 
-/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
-/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
-/// any instructions.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
-    simd_shuffle!(
-        a,
-        _mm256_setzero_ph(),
-        [
-            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
-            16, 16, 16, 16, 16, 16, 16, 16
-        ]
-    )
-}
-
 /// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
 /// operand specified by imm8, and return the boolean result (0 or 1).
 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
@@ -10942,10 +11009,10 @@ pub unsafe fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
     _mm256_reduce_max_ph(_mm256_max_ph(p, q))
 }
 
-macro_rules! fpclass_asm {
+macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
     ($mask_type: ty, $reg: ident, $a: expr) => {{
         let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
             "vfpclassph {k}, {src}, {imm8}",
             k = lateout(kreg) dst,
             src = in($reg) $a,
@@ -10956,7 +11023,7 @@ macro_rules! fpclass_asm {
     }};
     ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
        let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
             "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
             k = lateout(kreg) dst,
             mask = in(kreg) $mask,
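`fpclass_asm!` expands to a `vfpclassph` whose `imm8` selects which categories (QNaN, SNaN, positive/negative zero, positive/negative infinity, denormal, negative finite) set the corresponding mask bit. A rough sketch of using the fpclass intrinsics that sit on top of this macro, not from this commit; treating `0x81` as the QNaN|SNaN bit pair is an assumption based on the usual vfpclass encoding, and the snippet assumes nightly with `stdarch_x86_avx512_f16`:

```rust
#![feature(stdarch_x86_avx512_f16)]

use std::arch::x86_64::*;

/// Hypothetical helper: mask of lanes in `a` that are NaN of either kind.
/// Bit 0 of the vfpclass imm8 selects QNaN, bit 7 selects SNaN (assumed encoding).
#[target_feature(enable = "avx512fp16")]
unsafe fn nan_lanes(a: __m512h) -> __mmask32 {
    _mm512_fpclass_ph_mask::<0x81>(a)
}
```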
@@ -15873,6 +15940,56 @@ pub unsafe fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(
     _mm_mask_cvt_roundsh_sd::<SAE>(_mm_setzero_pd(), k, a, b)
 }
 
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_cvtsh_h(a: __m128h) -> f16 {
+    simd_extract!(a, 0)
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_cvtsh_h(a: __m256h) -> f16 {
+    simd_extract!(a, 0)
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_cvtsh_h(a: __m512h) -> f16 {
+    simd_extract!(a, 0)
+}
+
+/// Copy the lower 16-bit integer in `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
+    simd_extract!(a.as_i16x8(), 0)
+}
+
+/// Copy 16-bit integer `a` to the lower element of `dst`, and zero the upper elements of `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_cvtsi16_si128(a: i16) -> __m128i {
+    transmute(simd_insert!(i16x8::splat(0), 0, a))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
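The five additions above are thin `simd_extract!`/`simd_insert!` wrappers: the `cvtsh_h` family reads lane 0 as an `f16`, `_mm_cvtsi128_si16` reads lane 0 as an `i16`, and `_mm_cvtsi16_si128` writes lane 0 and zeroes lanes 1..8. A small round-trip sketch, not from this commit, assuming nightly with `stdarch_x86_avx512_f16` and the `f16` type:

```rust
#![feature(stdarch_x86_avx512_f16, f16)]

use std::arch::x86_64::*;

/// Hypothetical round trip through the new scalar-move intrinsics.
#[target_feature(enable = "avx512fp16")]
unsafe fn scalar_round_trip() {
    // i16 -> low lane of __m128i -> i16
    let v: __m128i = _mm_cvtsi16_si128(7); // lanes: [7, 0, 0, 0, 0, 0, 0, 0]
    assert_eq!(_mm_cvtsi128_si16(v), 7);

    // f16: read the low lane of a half-precision vector
    let h: __m128h = _mm_set_sh(2.5); // low lane 2.5, upper lanes zeroed
    let lo: f16 = _mm_cvtsh_h(h);
    assert!(lo == 2.5);
}
```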
@@ -16693,6 +16810,42 @@ mod tests {
         assert_eq!(r, 0b01010000010100000101000001010000);
     }
 
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_cmp_round_ph_mask() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let b = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
+            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
+            -29.0, -30.0, -31.0, -32.0,
+        );
+        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
+        assert_eq!(r, 0b11110000111100001111000011110000);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let b = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
+            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
+            -29.0, -30.0, -31.0, -32.0,
+        );
+        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
+            0b01010101010101010101010101010101,
+            a,
+            b,
+        );
+        assert_eq!(r, 0b01010000010100000101000001010000);
+    }
+
     #[simd_test(enable = "avx512fp16")]
     unsafe fn test_mm_cmp_round_sh_mask() {
         let a = _mm_set_sh(1.0);
@@ -26800,4 +26953,46 @@ mod tests {
         let e = _mm_setr_pd(1.0, 20.0);
         assert_eq_m128d(r, e);
     }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm_cvtsh_h() {
+        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
+        let r = _mm_cvtsh_h(a);
+        assert_eq!(r, 1.0);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm256_cvtsh_h() {
+        let a = _mm256_setr_ph(
+            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+        );
+        let r = _mm256_cvtsh_h(a);
+        assert_eq!(r, 1.0);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_cvtsh_h() {
+        let a = _mm512_setr_ph(
+            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let r = _mm512_cvtsh_h(a);
+        assert_eq!(r, 1.0);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm_cvtsi128_si16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let r = _mm_cvtsi128_si16(a);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm_cvtsi16_si128() {
+        let a = 1;
+        let r = _mm_cvtsi16_si128(a);
+        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(r, e);
+    }
 }
