-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extend SME2.1 intrinsics to mf8 (#375)
SME2.1 intrinsics were developed in parallel with FP8 and thus lacked support for the svmfloat8_t type. This patch adds support for consistency.
- Loading branch information
1 parent
0cecab2
commit 8e3af72
Showing
1 changed file
with
64 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ toc: true | |
--- | ||
|
||
<!-- | ||
SPDX-FileCopyrightText: Copyright 2011-2024 Arm Limited and/or its affiliates <[email protected]> | ||
SPDX-FileCopyrightText: Copyright 2011-2025 Arm Limited and/or its affiliates <[email protected]> | ||
SPDX-FileCopyrightText: Copyright 2022 Google LLC. | ||
CC-BY-SA-4.0 AND Apache-Patent-License | ||
See LICENSE.md file for details | ||
|
@@ -438,9 +438,9 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin | |
* Refined function versioning scope and signature rules to use the default | ||
version scope and signature. | ||
* Added `_n` forms of the SVE2p1 and SME2 `svdot` intrinsics. | ||
* Changed the status of the SME2p1 ACLE from Alpha to Beta. | ||
* Changed the status of the SVE2p1 ACLE from Alpha to Beta. | ||
|
||
* Changed the status of the SME2p1 from Alpha to Beta. | ||
* Changed the status of the SVE2p1 from Alpha to Beta. | ||
* Added mf8 variants of the SME2p1 intrinsics. | ||
|
||
### References | ||
|
||
|
@@ -12160,85 +12160,97 @@ Lookup table read with 2-bit and 4-bit indexes | |
Move multi-vectors to/from ZA | ||
|
||
``` c | ||
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x2_t svread_hor_za8_s8_vg2(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_in("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x4_t svread_hor_za8_s8_vg4(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_in("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x2_t svread_ver_za8_s8_vg2(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_in("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x4_t svread_ver_za8_s8_vg4(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_in("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x2_t svread_za8_s8_vg1x2(uint32_t slice) | ||
__arm_streaming __arm_in("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x4_t svread_za8_s8_vg1x4(uint32_t slice) | ||
__arm_streaming __arm_in("za"); | ||
|
||
|
||
// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16], | ||
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], | ||
// Variants are also available for _za8[_u8], _za8[_mf8], | ||
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16], | ||
// _za32[_s32], _za32[_u32], _za32[_f32], | ||
// _za64[_s64], _za64[_u64] and _za64[_f64] | ||
void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16], | ||
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], | ||
// Variants are also available for _za8[_u8], _za8[_mf8], | ||
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16], | ||
// _za32[_s32], _za32[_u32], _za32[_f32], | ||
// _za64[_s64], _za64[_u64] and _za64[_f64] | ||
void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16], | ||
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], | ||
// Variants are also available for _za8[_u8], _za8[_mf8], | ||
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16], | ||
// _za32[_s32], _za32[_u32], _za32[_f32], | ||
// _za64[_s64], _za64[_u64] and _za64[_f64] | ||
void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16], | ||
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], | ||
// Variants are also available for _za8[_u8], _za8[_mf8], | ||
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16], | ||
// _za32[_s32], _za32[_u32], _za32[_f32], | ||
// _za64[_s64], _za64[_u64] and _za64[_f64] | ||
void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16], | ||
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], | ||
// Variants are also available for _za8[_u8], _za8[_mf8], | ||
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16], | ||
// _za32[_s32], _za32[_u32], _za32[_f32], | ||
// _za64[_s64], _za64[_u64] and _za64[_f64] | ||
void svwrite_za8[_s8]_vg1x2(uint32_t slice, svint8x2_t zn) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16], | ||
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], | ||
// Variants are also available for _za8[_u8], _za8[_mf8], | |
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16], | ||
// _za32[_s32], _za32[_u32], _za32[_f32], | ||
// _za64[_s64], _za64[_u64] and _za64[_f64] | ||
void svwrite_za8[_s8]_vg1x4(uint32_t slice, svint8x4_t zn) | ||
__arm_streaming __arm_inout("za"); | ||
|
@@ -12513,7 +12525,7 @@ The intrinsics in this section are defined by the header file | |
Move and zero ZA tile slice to vector register. | ||
|
||
``` c | ||
// And similarly for u8. | ||
// And similarly for u8 and mf8. | ||
svint8_t svreadz_hor_za8_s8(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
@@ -12529,11 +12541,12 @@ Move and zero ZA tile slice to vector register. | |
svint64_t svreadz_hor_za64_s64(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64 | ||
// And similarly for s16, s32, s64, u8, u16, u32, u64, | ||
// mf8, bf16, f16, f32, f64 | ||
svint8_t svreadz_hor_za128_s8(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
// And similarly for u8. | ||
// And similarly for u8 and mf8. | ||
svint8_t svreadz_ver_za8_s8(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
@@ -12549,7 +12562,8 @@ Move and zero ZA tile slice to vector register. | |
svint64_t svreadz_ver_za64_s64(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64 | ||
// And similarly for s16, s32, s64, u8, u16, u32, u64, | ||
// mf8, bf16, f16, f32, f64 | ||
svint8_t svreadz_ver_za128_s8(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
``` | ||
|
@@ -12559,29 +12573,33 @@ Move and zero ZA tile slice to vector register. | |
Move and zero multiple ZA tile slices to vector registers | ||
|
||
``` c | ||
// Variants are also available for _za8_u8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t tile, uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
@@ -12592,15 +12610,17 @@ Move and zero multiple ZA tile slices to vector registers | |
Move and zero multiple ZA single-vector groups to vector registers | ||
|
||
``` c | ||
// Variants are also available for _za8_u8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x2_t svreadz_za8_s8_vg1x2(uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|
||
|
||
// Variants are also available for _za8_u8, _za16_s16, _za16_u16, | ||
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, | ||
// Variants are also available for _za8_u8, _za8_mf8, | ||
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16, | ||
// _za32_s32, _za32_u32, _za32_f32, | ||
// _za64_s64, _za64_u64 and _za64_f64 | ||
svint8x4_t svreadz_za8_s8_vg1x4(uint32_t slice) | ||
__arm_streaming __arm_inout("za"); | ||
|