Skip to content

Commit a619a2e

Browse files
authored
[ARM] Fix lane ordering for AdvSIMD intrinsics on big-endian targets (#127068)
In arm-neon.h, we insert shufflevectors around each intrinsic when the target is big-endian, to compensate for the difference between the ABI-defined memory format of vectors (with the whole vector stored as one big-endian access) and LLVM's target-independent expectations (with the lowest-numbered lane in the lowest address). However, this code was written for the AArch64 ABI, and the AArch32 ABI differs slightly: it requires that vectors be stored in memory as if by VSTM, which performs a series of 64-bit accesses, rather than as if by the AArch64 STR, which performs a single 128-bit access. This means that for AArch32 we need to reverse the lanes within each 64-bit chunk of the vector, instead of across the whole vector. Since only a small number of distinct shufflevector orderings are needed, I've split them out into macros, so that this doesn't require a separate condition in each intrinsic definition.
1 parent c61c888 commit a619a2e

File tree

2 files changed

+153
-15
lines changed

2 files changed

+153
-15
lines changed
+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
3+
// REQUIRES: arm-registered-target
4+
5+
// RUN: %clang_cc1 -triple armv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
6+
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=LE
7+
// RUN: %clang_cc1 -triple armebv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
8+
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=BE
9+
10+
#include <arm_neon.h>
11+
12+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_0(
13+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
14+
// LE-NEXT: [[ENTRY:.*:]]
15+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
16+
// LE-NEXT: ret i32 [[VGET_LANE]]
17+
//
18+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_0(
19+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
20+
// BE-NEXT: [[ENTRY:.*:]]
21+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
22+
// BE-NEXT: ret i32 [[VGET_LANE]]
23+
//
24+
int int32x4_t_lane_0(int32x4_t a) { return vgetq_lane_s32(a, 0); }
25+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_1(
26+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
27+
// LE-NEXT: [[ENTRY:.*:]]
28+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
29+
// LE-NEXT: ret i32 [[VGET_LANE]]
30+
//
31+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_1(
32+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
33+
// BE-NEXT: [[ENTRY:.*:]]
34+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
35+
// BE-NEXT: ret i32 [[VGET_LANE]]
36+
//
37+
int int32x4_t_lane_1(int32x4_t a) { return vgetq_lane_s32(a, 1); }
38+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_2(
39+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
40+
// LE-NEXT: [[ENTRY:.*:]]
41+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
42+
// LE-NEXT: ret i32 [[VGET_LANE]]
43+
//
44+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_2(
45+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
46+
// BE-NEXT: [[ENTRY:.*:]]
47+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
48+
// BE-NEXT: ret i32 [[VGET_LANE]]
49+
//
50+
int int32x4_t_lane_2(int32x4_t a) { return vgetq_lane_s32(a, 2); }
51+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_3(
52+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
53+
// LE-NEXT: [[ENTRY:.*:]]
54+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
55+
// LE-NEXT: ret i32 [[VGET_LANE]]
56+
//
57+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_3(
58+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
59+
// BE-NEXT: [[ENTRY:.*:]]
60+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
61+
// BE-NEXT: ret i32 [[VGET_LANE]]
62+
//
63+
int int32x4_t_lane_3(int32x4_t a) { return vgetq_lane_s32(a, 3); }
64+
// LE-LABEL: define dso_local i32 @int32x2_t_lane_0(
65+
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
66+
// LE-NEXT: [[ENTRY:.*:]]
67+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
68+
// LE-NEXT: ret i32 [[VGET_LANE]]
69+
//
70+
// BE-LABEL: define dso_local i32 @int32x2_t_lane_0(
71+
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
72+
// BE-NEXT: [[ENTRY:.*:]]
73+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
74+
// BE-NEXT: ret i32 [[VGET_LANE]]
75+
//
76+
int int32x2_t_lane_0(int32x2_t a) { return vget_lane_s32(a, 0); }
77+
// LE-LABEL: define dso_local i32 @int32x2_t_lane_1(
78+
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
79+
// LE-NEXT: [[ENTRY:.*:]]
80+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
81+
// LE-NEXT: ret i32 [[VGET_LANE]]
82+
//
83+
// BE-LABEL: define dso_local i32 @int32x2_t_lane_1(
84+
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
85+
// BE-NEXT: [[ENTRY:.*:]]
86+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
87+
// BE-NEXT: ret i32 [[VGET_LANE]]
88+
//
89+
int int32x2_t_lane_1(int32x2_t a) { return vget_lane_s32(a, 1); }
90+
// LE-LABEL: define dso_local i64 @int64x2_t_lane_0(
91+
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
92+
// LE-NEXT: [[ENTRY:.*:]]
93+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
94+
// LE-NEXT: ret i64 [[VGET_LANE]]
95+
//
96+
// BE-LABEL: define dso_local i64 @int64x2_t_lane_0(
97+
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
98+
// BE-NEXT: [[ENTRY:.*:]]
99+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
100+
// BE-NEXT: ret i64 [[VGET_LANE]]
101+
//
102+
int64_t int64x2_t_lane_0(int64x2_t a) { return vgetq_lane_s64(a, 0); }
103+
// LE-LABEL: define dso_local i64 @int64x2_t_lane_1(
104+
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
105+
// LE-NEXT: [[ENTRY:.*:]]
106+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
107+
// LE-NEXT: ret i64 [[VGET_LANE]]
108+
//
109+
// BE-LABEL: define dso_local i64 @int64x2_t_lane_1(
110+
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
111+
// BE-NEXT: [[ENTRY:.*:]]
112+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
113+
// BE-NEXT: ret i64 [[VGET_LANE]]
114+
//
115+
int64_t int64x2_t_lane_1(int64x2_t a) { return vgetq_lane_s64(a, 1); }

clang/utils/TableGen/NeonEmitter.cpp

+38-15
Original file line numberDiff line numberDiff line change
@@ -1263,20 +1263,17 @@ void Intrinsic::emitReverseVariable(Variable &Dest, Variable &Src) {
12631263

12641264
for (unsigned K = 0; K < Dest.getType().getNumVectors(); ++K) {
12651265
OS << " " << Dest.getName() << ".val[" << K << "] = "
1266-
<< "__builtin_shufflevector("
1267-
<< Src.getName() << ".val[" << K << "], "
1268-
<< Src.getName() << ".val[" << K << "]";
1269-
for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
1270-
OS << ", " << J;
1271-
OS << ");";
1266+
<< "__builtin_shufflevector(" << Src.getName() << ".val[" << K << "], "
1267+
<< Src.getName() << ".val[" << K << "], __lane_reverse_"
1268+
<< Dest.getType().getSizeInBits() << "_"
1269+
<< Dest.getType().getElementSizeInBits() << ");";
12721270
emitNewLine();
12731271
}
12741272
} else {
1275-
OS << " " << Dest.getName()
1276-
<< " = __builtin_shufflevector(" << Src.getName() << ", " << Src.getName();
1277-
for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
1278-
OS << ", " << J;
1279-
OS << ");";
1273+
OS << " " << Dest.getName() << " = __builtin_shufflevector("
1274+
<< Src.getName() << ", " << Src.getName() << ", __lane_reverse_"
1275+
<< Dest.getType().getSizeInBits() << "_"
1276+
<< Dest.getType().getElementSizeInBits() << ");";
12801277
emitNewLine();
12811278
}
12821279
}
@@ -1877,10 +1874,11 @@ std::string Intrinsic::generate() {
18771874

18781875
OS << "#else\n";
18791876

1880-
// Big endian intrinsics are more complex. The user intended these
1881-
// intrinsics to operate on a vector "as-if" loaded by (V)LDR,
1882-
// but we load as-if (V)LD1. So we should swap all arguments and
1883-
// swap the return value too.
1877+
// Big endian intrinsics are more complex. The user intended these intrinsics
1878+
// to operate on a vector "as-if" loaded by LDR (for AArch64), VLDR (for
1879+
// 64-bit vectors on AArch32), or VLDM (for 128-bit vectors on AArch32) but
1880+
// we load as-if LD1 (for AArch64) or VLD1 (for AArch32). So we should swap
1881+
// all arguments and swap the return value too.
18841882
//
18851883
// If we call sub-intrinsics, we should call a version that does
18861884
// not re-swap the arguments!
@@ -2434,6 +2432,31 @@ void NeonEmitter::run(raw_ostream &OS) {
24342432
OS << "#define __ai static __inline__ __attribute__((__always_inline__, "
24352433
"__nodebug__))\n\n";
24362434

2435+
// Shufflevector arguments lists for endian-swapping vectors for big-endian
2436+
// targets. For AArch64, we need to reverse every lane in the vector, but for
2437+
// AArch32 we need to reverse the lanes within each 64-bit chunk of the
2438+
// vector. The naming convention here is __lane_reverse_<n>_<m>, where <n> is
2439+
// the length of the vector in bits, and <m> is length of each lane in bits.
2440+
OS << "#if !defined(__LITTLE_ENDIAN__)\n";
2441+
OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n";
2442+
OS << "#define __lane_reverse_64_32 1,0\n";
2443+
OS << "#define __lane_reverse_64_16 3,2,1,0\n";
2444+
OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
2445+
OS << "#define __lane_reverse_128_64 1,0\n";
2446+
OS << "#define __lane_reverse_128_32 3,2,1,0\n";
2447+
OS << "#define __lane_reverse_128_16 7,6,5,4,3,2,1,0\n";
2448+
OS << "#define __lane_reverse_128_8 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0\n";
2449+
OS << "#else\n";
2450+
OS << "#define __lane_reverse_64_32 1,0\n";
2451+
OS << "#define __lane_reverse_64_16 3,2,1,0\n";
2452+
OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
2453+
OS << "#define __lane_reverse_128_64 0,1\n";
2454+
OS << "#define __lane_reverse_128_32 1,0,3,2\n";
2455+
OS << "#define __lane_reverse_128_16 3,2,1,0,7,6,5,4\n";
2456+
OS << "#define __lane_reverse_128_8 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8\n";
2457+
OS << "#endif\n";
2458+
OS << "#endif\n";
2459+
24372460
SmallVector<Intrinsic *, 128> Defs;
24382461
for (const Record *R : Records.getAllDerivedDefinitions("Inst"))
24392462
createIntrinsic(R, Defs);

0 commit comments

Comments
 (0)