@@ -596,6 +596,25 @@ pub unsafe fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
     )
 }
 
+/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
+/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
+/// any instructions.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
+    simd_shuffle!(
+        a,
+        _mm256_setzero_ph(),
+        [
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+            16, 16, 16, 16, 16, 16, 16, 16
+        ]
+    )
+}
+
 /// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
 /// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
 /// any instructions.
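The new cast is a pure lane shuffle against a zero vector, so it usually costs nothing at runtime. A minimal usage sketch (assumes a nightly toolchain with the unstable `f16` type and the `stdarch_x86_avx512_f16` feature, plus an AVX512-FP16 CPU; values are illustrative):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn zext_demo() {
    let small = _mm256_set1_ph(3.5); // 16 f16 lanes
    // Lanes 0..=15 are preserved; lanes 16..=31 become +0.0.
    let wide: __m512h = _mm512_zextph256_ph512(small);
    assert_eq!(_mm512_cvtsh_h(wide), 3.5); // lane 0 survives the cast
}
```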
@@ -615,10 +634,10 @@ pub unsafe fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
     )
 }
 
-macro_rules! cmp_asm {
+macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
     ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
         let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
             "vcmpph {k}, {a}, {b}, {imm8}",
             k = lateout(kreg) dst,
             a = in($reg) $a,
@@ -630,7 +649,7 @@ macro_rules! cmp_asm {
     }};
     ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
             "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
             k = lateout(kreg) dst,
             mask = in(kreg) $mask,
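Dropping the `crate::arch::` path qualifier changes nothing about codegen; it presumably just relies on `asm` already being in scope in this module. For orientation, a hedged sketch of reaching the masked arm of `cmp_asm!` through a public wrapper that appears in this diff:

```rust
// Only lanes whose bit is set in the zeromask participate; the rest read as 0.
#[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
unsafe fn eq_even_lanes(a: __m512h, b: __m512h) -> __mmask32 {
    // Internally expands through cmp_asm!(__mmask32, k1, zmm_reg, a, b).
    _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0x5555_5555, a, b)
}
```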
@@ -736,6 +755,73 @@ pub unsafe fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
     cmp_asm!(__mmask32, k1, zmm_reg, a, b)
 }
 
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
+#[rustc_legacy_const_generics(2, 3)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    static_assert_sae!(SAE);
+    if SAE == _MM_FROUND_NO_EXC {
+        let dst: __mmask32;
+        asm!(
+            "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
+            k = lateout(kreg) dst,
+            a = in(zmm_reg) a,
+            b = in(zmm_reg) b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    } else {
+        cmp_asm!(__mmask32, zmm_reg, a, b)
+    }
+}
+
+/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
+/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
+/// zeroed out when the corresponding mask bit is not set).
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
+#[inline]
+#[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
+#[rustc_legacy_const_generics(3, 4)]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
+    k1: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __mmask32 {
+    static_assert_uimm_bits!(IMM5, 5);
+    static_assert_sae!(SAE);
+    if SAE == _MM_FROUND_NO_EXC {
+        let dst: __mmask32;
+        asm!(
+            "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
+            k = lateout(kreg) dst,
+            k1 = in(kreg) k1,
+            a = in(zmm_reg) a,
+            b = in(zmm_reg) b,
+            imm8 = const IMM5,
+            options(pure, nomem, nostack)
+        );
+        dst
+    } else {
+        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
+    }
+}
+
 /// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
 /// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
 /// passing _MM_FROUND_NO_EXC in the sae parameter.
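The round variants only differ from the plain compares when exception suppression is requested; any other SAE value falls through to the same `cmp_asm!` path as `_mm512_cmp_ph_mask`. A minimal usage sketch, using predicate and rounding constants defined elsewhere in stdarch:

```rust
// Compare without raising floating-point exception flags.
#[target_feature(enable = "avx512fp16,avx512bw,avx512f")]
unsafe fn lt_quiet(a: __m512h, b: __m512h) -> __mmask32 {
    // _MM_FROUND_NO_EXC selects the `{sae}` encoding of vcmpph;
    // _MM_FROUND_CUR_DIRECTION would take the cmp_asm! fallback instead.
    _mm512_cmp_round_ph_mask::<_CMP_LT_OS, _MM_FROUND_NO_EXC>(a, b)
}
```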
@@ -803,25 +889,6 @@ pub unsafe fn _mm_mask_cmp_sh_mask<const IMM5: i32>(
     _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
 }
 
-/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
-/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
-/// any instructions.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
-#[inline]
-#[target_feature(enable = "avx512fp16")]
-#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
-    simd_shuffle!(
-        a,
-        _mm256_setzero_ph(),
-        [
-            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
-            16, 16, 16, 16, 16, 16, 16, 16
-        ]
-    )
-}
-
 /// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
 /// operand specified by imm8, and return the boolean result (0 or 1).
 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
@@ -10942,10 +11009,10 @@ pub unsafe fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
     _mm256_reduce_max_ph(_mm256_max_ph(p, q))
 }
 
-macro_rules! fpclass_asm {
+macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr) => {{
        let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
            "vfpclassph {k}, {src}, {imm8}",
            k = lateout(kreg) dst,
            src = in($reg) $a,
@@ -10956,7 +11023,7 @@ macro_rules! fpclass_asm {
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
        let dst: $mask_type;
-        crate::arch::asm!(
+        asm!(
            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
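Same mechanical change as in `cmp_asm!`. For context, `vfpclassph` tests each lane against the categories selected by imm8 (0x01 QNaN, 0x02 positive zero, 0x04 negative zero, 0x08 positive infinity, 0x10 negative infinity, 0x20 denormal, 0x40 finite negative, 0x80 SNaN). A hedged sketch, assuming the `_mm512_fpclass_ph_mask` wrapper that this macro backs takes a single imm8 const generic:

```rust
// Set a mask bit for every lane that is +Inf or -Inf (0x08 | 0x10 = 0x18).
#[target_feature(enable = "avx512fp16")]
unsafe fn find_infinities(a: __m512h) -> __mmask32 {
    _mm512_fpclass_ph_mask::<0x18>(a)
}
```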
@@ -15873,6 +15940,56 @@ pub unsafe fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(
     _mm_mask_cvt_roundsh_sd::<SAE>(_mm_setzero_pd(), k, a, b)
 }
 
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_cvtsh_h(a: __m128h) -> f16 {
+    simd_extract!(a, 0)
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm256_cvtsh_h(a: __m256h) -> f16 {
+    simd_extract!(a, 0)
+}
+
+/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm512_cvtsh_h(a: __m512h) -> f16 {
+    simd_extract!(a, 0)
+}
+
+/// Copy the lower 16-bit integer in a to dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
+    simd_extract!(a.as_i16x8(), 0)
+}
+
+/// Copy 16-bit integer a to the lower element of dst, and zero the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
+#[inline]
+#[target_feature(enable = "avx512fp16")]
+#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
+pub unsafe fn _mm_cvtsi16_si128(a: i16) -> __m128i {
+    transmute(simd_insert!(i16x8::splat(0), 0, a))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
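The five new copy intrinsics are plain lane extracts and inserts. A round-trip sketch (same nightly setup as above; `_mm_set_sh` is used by this file's tests as well):

```rust
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn copy_demo() {
    // Integer scalar: in via _mm_cvtsi16_si128, out via _mm_cvtsi128_si16.
    let v = _mm_cvtsi16_si128(7); // lanes: [7, 0, 0, 0, 0, 0, 0, 0]
    assert_eq!(_mm_cvtsi128_si16(v), 7);
    // f16 scalar: lane 0 of a __m128h, read back with _mm_cvtsh_h.
    let h = _mm_set_sh(2.5);
    assert_eq!(_mm_cvtsh_h(h), 2.5);
}
```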
@@ -16693,6 +16810,42 @@ mod tests {
         assert_eq!(r, 0b01010000010100000101000001010000);
     }
 
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_cmp_round_ph_mask() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let b = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
+            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
+            -29.0, -30.0, -31.0, -32.0,
+        );
+        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
+        assert_eq!(r, 0b11110000111100001111000011110000);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
+        let a = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let b = _mm512_set_ph(
+            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
+            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
+            -29.0, -30.0, -31.0, -32.0,
+        );
+        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
+            0b01010101010101010101010101010101,
+            a,
+            b,
+        );
+        assert_eq!(r, 0b01010000010100000101000001010000);
+    }
+
     #[simd_test(enable = "avx512fp16")]
     unsafe fn test_mm_cmp_round_sh_mask() {
         let a = _mm_set_sh(1.0);
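For anyone verifying the expected masks in the new `cmp_round_ph` tests: `_mm512_set_ph` assigns its first argument to the highest lane (31), and bit i of the result mask corresponds to lane i, so the four equal pairs at the start of each group of eight set the top four bits of each byte. A quick recomputation of the unmasked case:

```rust
// Rebuild the expected mask for test_mm512_cmp_round_ph_mask by hand.
let mut expected: u32 = 0;
for lane in 0..32u32 {
    // In the test vectors, lanes compare equal exactly when (31 - lane) % 8 < 4.
    if (31 - lane) % 8 < 4 {
        expected |= 1 << lane;
    }
}
assert_eq!(expected, 0b11110000111100001111000011110000);
```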
@@ -26800,4 +26953,46 @@ mod tests {
         let e = _mm_setr_pd(1.0, 20.0);
         assert_eq_m128d(r, e);
     }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm_cvtsh_h() {
+        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
+        let r = _mm_cvtsh_h(a);
+        assert_eq!(r, 1.0);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm256_cvtsh_h() {
+        let a = _mm256_setr_ph(
+            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+        );
+        let r = _mm256_cvtsh_h(a);
+        assert_eq!(r, 1.0);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm512_cvtsh_h() {
+        let a = _mm512_setr_ph(
+            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
+            31.0, 32.0,
+        );
+        let r = _mm512_cvtsh_h(a);
+        assert_eq!(r, 1.0);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm_cvtsi128_si16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let r = _mm_cvtsi128_si16(a);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test(enable = "avx512fp16")]
+    unsafe fn test_mm_cvtsi16_si128() {
+        let a = 1;
+        let r = _mm_cvtsi16_si128(a);
+        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+        assert_eq_m128i(r, e);
+    }
 }