Skip to content

Commit a619a2e

Browse files
authored
[ARM] Fix lane ordering for AdvSIMD intrinsics on big-endian targets (#127068)
In arm-neon.h, we insert shufflevectors around each intrinsic when the target is big-endian, to compensate for the difference between the ABI-defined memory format of vectors (with the whole vector stored as one big-endian access) and LLVM's target-independent expectations (with the lowest-numbered lane in the lowest address). However, this code was written for the AArch64 ABI, and the AArch32 ABI differs slightly: it requires that vectors be stored in memory as if by VSTM, which performs a series of 64-bit accesses, rather than as if by the AArch64 STR, which performs a single 128-bit access. This means that for AArch32 we need to reverse the lanes within each 64-bit chunk of the vector, instead of across the whole vector. Since only a small number of distinct shufflevector orderings are needed, I've split them out into macros, so that this doesn't require a separate condition in each intrinsic definition.
1 parent c61c888 commit a619a2e

File tree

2 files changed

+153
-15
lines changed

2 files changed

+153
-15
lines changed
+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
2+
3+
// REQUIRES: arm-registered-target
4+
5+
// RUN: %clang_cc1 -triple armv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
6+
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=LE
7+
// RUN: %clang_cc1 -triple armebv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
8+
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=BE
9+
10+
#include <arm_neon.h>
11+
12+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_0(
13+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
14+
// LE-NEXT: [[ENTRY:.*:]]
15+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
16+
// LE-NEXT: ret i32 [[VGET_LANE]]
17+
//
18+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_0(
19+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
20+
// BE-NEXT: [[ENTRY:.*:]]
21+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
22+
// BE-NEXT: ret i32 [[VGET_LANE]]
23+
//
24+
int int32x4_t_lane_0(int32x4_t a) { return vgetq_lane_s32(a, 0); }
25+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_1(
26+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
27+
// LE-NEXT: [[ENTRY:.*:]]
28+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
29+
// LE-NEXT: ret i32 [[VGET_LANE]]
30+
//
31+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_1(
32+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
33+
// BE-NEXT: [[ENTRY:.*:]]
34+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
35+
// BE-NEXT: ret i32 [[VGET_LANE]]
36+
//
37+
int int32x4_t_lane_1(int32x4_t a) { return vgetq_lane_s32(a, 1); }
38+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_2(
39+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
40+
// LE-NEXT: [[ENTRY:.*:]]
41+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
42+
// LE-NEXT: ret i32 [[VGET_LANE]]
43+
//
44+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_2(
45+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
46+
// BE-NEXT: [[ENTRY:.*:]]
47+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
48+
// BE-NEXT: ret i32 [[VGET_LANE]]
49+
//
50+
int int32x4_t_lane_2(int32x4_t a) { return vgetq_lane_s32(a, 2); }
51+
// LE-LABEL: define dso_local i32 @int32x4_t_lane_3(
52+
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
53+
// LE-NEXT: [[ENTRY:.*:]]
54+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
55+
// LE-NEXT: ret i32 [[VGET_LANE]]
56+
//
57+
// BE-LABEL: define dso_local i32 @int32x4_t_lane_3(
58+
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
59+
// BE-NEXT: [[ENTRY:.*:]]
60+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
61+
// BE-NEXT: ret i32 [[VGET_LANE]]
62+
//
63+
int int32x4_t_lane_3(int32x4_t a) { return vgetq_lane_s32(a, 3); }
64+
// LE-LABEL: define dso_local i32 @int32x2_t_lane_0(
65+
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
66+
// LE-NEXT: [[ENTRY:.*:]]
67+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
68+
// LE-NEXT: ret i32 [[VGET_LANE]]
69+
//
70+
// BE-LABEL: define dso_local i32 @int32x2_t_lane_0(
71+
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
72+
// BE-NEXT: [[ENTRY:.*:]]
73+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
74+
// BE-NEXT: ret i32 [[VGET_LANE]]
75+
//
76+
int int32x2_t_lane_0(int32x2_t a) { return vget_lane_s32(a, 0); }
77+
// LE-LABEL: define dso_local i32 @int32x2_t_lane_1(
78+
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
79+
// LE-NEXT: [[ENTRY:.*:]]
80+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
81+
// LE-NEXT: ret i32 [[VGET_LANE]]
82+
//
83+
// BE-LABEL: define dso_local i32 @int32x2_t_lane_1(
84+
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
85+
// BE-NEXT: [[ENTRY:.*:]]
86+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
87+
// BE-NEXT: ret i32 [[VGET_LANE]]
88+
//
89+
int int32x2_t_lane_1(int32x2_t a) { return vget_lane_s32(a, 1); }
90+
// LE-LABEL: define dso_local i64 @int64x2_t_lane_0(
91+
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
92+
// LE-NEXT: [[ENTRY:.*:]]
93+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
94+
// LE-NEXT: ret i64 [[VGET_LANE]]
95+
//
96+
// BE-LABEL: define dso_local i64 @int64x2_t_lane_0(
97+
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
98+
// BE-NEXT: [[ENTRY:.*:]]
99+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
100+
// BE-NEXT: ret i64 [[VGET_LANE]]
101+
//
102+
int64_t int64x2_t_lane_0(int64x2_t a) { return vgetq_lane_s64(a, 0); }
103+
// LE-LABEL: define dso_local i64 @int64x2_t_lane_1(
104+
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
105+
// LE-NEXT: [[ENTRY:.*:]]
106+
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
107+
// LE-NEXT: ret i64 [[VGET_LANE]]
108+
//
109+
// BE-LABEL: define dso_local i64 @int64x2_t_lane_1(
110+
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
111+
// BE-NEXT: [[ENTRY:.*:]]
112+
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
113+
// BE-NEXT: ret i64 [[VGET_LANE]]
114+
//
115+
int64_t int64x2_t_lane_1(int64x2_t a) { return vgetq_lane_s64(a, 1); }

clang/utils/TableGen/NeonEmitter.cpp

+38-15
Original file line numberDiff line numberDiff line change
@@ -1263,20 +1263,17 @@ void Intrinsic::emitReverseVariable(Variable &Dest, Variable &Src) {
12631263

12641264
for (unsigned K = 0; K < Dest.getType().getNumVectors(); ++K) {
12651265
OS << " " << Dest.getName() << ".val[" << K << "] = "
1266-
<< "__builtin_shufflevector("
1267-
<< Src.getName() << ".val[" << K << "], "
1268-
<< Src.getName() << ".val[" << K << "]";
1269-
for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
1270-
OS << ", " << J;
1271-
OS << ");";
1266+
<< "__builtin_shufflevector(" << Src.getName() << ".val[" << K << "], "
1267+
<< Src.getName() << ".val[" << K << "], __lane_reverse_"
1268+
<< Dest.getType().getSizeInBits() << "_"
1269+
<< Dest.getType().getElementSizeInBits() << ");";
12721270
emitNewLine();
12731271
}
12741272
} else {
1275-
OS << " " << Dest.getName()
1276-
<< " = __builtin_shufflevector(" << Src.getName() << ", " << Src.getName();
1277-
for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
1278-
OS << ", " << J;
1279-
OS << ");";
1273+
OS << " " << Dest.getName() << " = __builtin_shufflevector("
1274+
<< Src.getName() << ", " << Src.getName() << ", __lane_reverse_"
1275+
<< Dest.getType().getSizeInBits() << "_"
1276+
<< Dest.getType().getElementSizeInBits() << ");";
12801277
emitNewLine();
12811278
}
12821279
}
@@ -1877,10 +1874,11 @@ std::string Intrinsic::generate() {
18771874

18781875
OS << "#else\n";
18791876

1880-
// Big endian intrinsics are more complex. The user intended these
1881-
// intrinsics to operate on a vector "as-if" loaded by (V)LDR,
1882-
// but we load as-if (V)LD1. So we should swap all arguments and
1883-
// swap the return value too.
1877+
// Big endian intrinsics are more complex. The user intended these intrinsics
1878+
// to operate on a vector "as-if" loaded by LDR (for AArch64), VLDR (for
1879+
// 64-bit vectors on AArch32), or VLDM (for 128-bit vectors on AArch32) but
1880+
// we load as-if LD1 (for AArch64) or VLD1 (for AArch32). So we should swap
1881+
// all arguments and swap the return value too.
18841882
//
18851883
// If we call sub-intrinsics, we should call a version that does
18861884
// not re-swap the arguments!
@@ -2434,6 +2432,31 @@ void NeonEmitter::run(raw_ostream &OS) {
24342432
OS << "#define __ai static __inline__ __attribute__((__always_inline__, "
24352433
"__nodebug__))\n\n";
24362434

2435+
// Shufflevector arguments lists for endian-swapping vectors for big-endian
2436+
// targets. For AArch64, we need to reverse every lane in the vector, but for
2437+
// AArch32 we need to reverse the lanes within each 64-bit chunk of the
2438+
// vector. The naming convention here is __lane_reverse_<n>_<m>, where <n> is
2439+
// the length of the vector in bits, and <m> is length of each lane in bits.
2440+
OS << "#if !defined(__LITTLE_ENDIAN__)\n";
2441+
OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n";
2442+
OS << "#define __lane_reverse_64_32 1,0\n";
2443+
OS << "#define __lane_reverse_64_16 3,2,1,0\n";
2444+
OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
2445+
OS << "#define __lane_reverse_128_64 1,0\n";
2446+
OS << "#define __lane_reverse_128_32 3,2,1,0\n";
2447+
OS << "#define __lane_reverse_128_16 7,6,5,4,3,2,1,0\n";
2448+
OS << "#define __lane_reverse_128_8 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0\n";
2449+
OS << "#else\n";
2450+
OS << "#define __lane_reverse_64_32 1,0\n";
2451+
OS << "#define __lane_reverse_64_16 3,2,1,0\n";
2452+
OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
2453+
OS << "#define __lane_reverse_128_64 0,1\n";
2454+
OS << "#define __lane_reverse_128_32 1,0,3,2\n";
2455+
OS << "#define __lane_reverse_128_16 3,2,1,0,7,6,5,4\n";
2456+
OS << "#define __lane_reverse_128_8 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8\n";
2457+
OS << "#endif\n";
2458+
OS << "#endif\n";
2459+
24372460
SmallVector<Intrinsic *, 128> Defs;
24382461
for (const Record *R : Records.getAllDerivedDefinitions("Inst"))
24392462
createIntrinsic(R, Defs);

0 commit comments

Comments
 (0)