Skip to content

Commit

Permalink
Extend SME2.1 intrinsics to mf8 (#375)
Browse files Browse the repository at this point in the history
SME2.1 intrinsics were developed in parallel with FP8 and thus lacked
support for the svmfloat8_t type. This patch adds support for
consistency.
  • Loading branch information
rockdreamer authored Jan 15, 2025
1 parent 0cecab2 commit 8e3af72
Showing 1 changed file with 64 additions and 44 deletions.
108 changes: 64 additions & 44 deletions main/acle.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ toc: true
---

<!--
SPDX-FileCopyrightText: Copyright 2011-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
SPDX-FileCopyrightText: Copyright 2011-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
SPDX-FileCopyrightText: Copyright 2022 Google LLC.
CC-BY-SA-4.0 AND Apache-Patent-License
See LICENSE.md file for details
Expand Down Expand Up @@ -438,9 +438,9 @@ Armv8.4-A [[ARMARMv84]](#ARMARMv84). Support is added for the Dot Product intrin
* Refined function versioning scope and signature rules to use the default
version scope and signature.
* Added `_n` forms of the SVE2p1 and SME2 `svdot` intrinsics.
* Changed the status of the SME2p1 ACLE from Alpha to Beta.
* Changed the status of the SVE2p1 ACLE from Alpha to Beta.

* Changed the status of the SME2p1 ACLE from Alpha to Beta.
* Changed the status of the SVE2p1 ACLE from Alpha to Beta.
* Added mf8 variants of SME 2.1 intrinsics.

### References

Expand Down Expand Up @@ -12160,85 +12160,97 @@ Lookup table read with 2-bit and 4-bit indexes
Move multi-vectors to/from ZA

``` c
// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_hor_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_hor_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_ver_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_ver_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_za8_s8_vg1x2(uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8_u8, _za8_mf8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_za8_s8_vg1x4(uint32_t slice)
__arm_streaming __arm_in("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], _za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_za8[_s8]_vg1x2(uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8[_u8], za8[_mf8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// Variants are also available for _za8[_u8], _za8[_mf8],
// _za16[_s16], _za16[_u16], _za16[_f16], _za16[_bf16],
// _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_za8[_s8]_vg1x4(uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_inout("za");
Expand Down Expand Up @@ -12513,7 +12525,7 @@ The intrinsics in this section are defined by the header file
Move and zero ZA tile slice to vector register.

``` c
// And similarly for u8.
// And similarly for u8 and mf8.
svint8_t svreadz_hor_za8_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

Expand All @@ -12529,11 +12541,12 @@ Move and zero ZA tile slice to vector register.
svint64_t svreadz_hor_za64_s64(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
// And similarly for s16, s32, s64, u8, u16, u32, u64,
// mf8, bf16, f16, f32, f64
svint8_t svreadz_hor_za128_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// And similarly for u8.
// And similarly for u8 and mf8.
svint8_t svreadz_ver_za8_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

Expand All @@ -12549,7 +12562,8 @@ Move and zero ZA tile slice to vector register.
svint64_t svreadz_ver_za64_s64(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");

// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
// And similarly for s16, s32, s64, u8, u16, u32, u64,
// mf8, bf16, f16, f32, f64
svint8_t svreadz_ver_za128_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
```
Expand All @@ -12559,29 +12573,33 @@ Move and zero ZA tile slice to vector register.
Move and zero multiple ZA tile slices to vector registers

``` c
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
Expand All @@ -12592,15 +12610,17 @@ Move and zero multiple ZA tile slices to vector registers
Move and zero multiple ZA single-vector groups to vector registers

``` c
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svreadz_za8_s8_vg1x2(uint32_t slice)
__arm_streaming __arm_inout("za");


// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// Variants are also available for _za8_u8, _za8_mf8,
// _za16_s16, _za16_u16, _za16_f16, _za16_bf16,
// _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svreadz_za8_s8_vg1x4(uint32_t slice)
__arm_streaming __arm_inout("za");
Expand Down

0 comments on commit 8e3af72

Please sign in to comment.