Skip to content

Commit cf6a452

Browse files
[SLP]Fix same/alternate analysis in split node analysis for compares
getSameOpcode in some cases may consider 2 compares as having same opcode, even though previously they were considered as alternate. It may happen, because getSameOpcode looses info about previous instructions and their states. Need to use isAlternateInstruction function instead for the correct analysis. Reviewers: RKSimon, hiraditya Reviewed By: RKSimon Pull Request: #133769
1 parent 5b8d8bb commit cf6a452

File tree

4 files changed

+39
-29
lines changed

4 files changed

+39
-29
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -9013,8 +9013,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
90139013
Op1Indices.set(Idx);
90149014
continue;
90159015
}
9016-
InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI);
9017-
if (NewS && !NewS.isAltShuffle()) {
9016+
if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
9017+
I->getOpcode() == LocalState.getOpcode()) ||
9018+
(LocalState.getAltOpcode() == LocalState.getOpcode() &&
9019+
!isAlternateInstruction(I, LocalState.getMainOp(),
9020+
LocalState.getAltOp(), *TLI))) {
90189021
Op1.push_back(V);
90199022
Op1Indices.set(Idx);
90209023
continue;

llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll

+6-8
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,9 @@ return:
8686
define float @test_merge_anyof_v4sf(<4 x float> %t) {
8787
; CHECK-LABEL: @test_merge_anyof_v4sf(
8888
; CHECK-NEXT: entry:
89-
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
90-
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
91-
; CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
92-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
89+
; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T:%.*]], zeroinitializer
90+
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T]], splat (float 1.000000e+00)
91+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
9392
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
9493
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
9594
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0
@@ -401,10 +400,9 @@ return:
401400
define float @test_merge_anyof_v4si(<4 x i32> %t) {
402401
; CHECK-LABEL: @test_merge_anyof_v4si(
403402
; CHECK-NEXT: entry:
404-
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
405-
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
406-
; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP0]], <i32 255, i32 255, i32 255, i32 255, i32 1, i32 1, i32 1, i32 1>
407-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
403+
; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T:%.*]], splat (i32 1)
404+
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[T]], splat (i32 255)
405+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
408406
; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
409407
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
410408
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0

llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll

+9-9
Original file line numberDiff line numberDiff line change
@@ -207,10 +207,10 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) {
207207

208208
define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
209209
; CHECK-LABEL: @logical_and_icmp_clamp(
210-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
211-
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
212-
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
213-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
210+
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
211+
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
212+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
213+
; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4)
214214
; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
215215
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
216216
; CHECK-NEXT: ret i1 [[TMP6]]
@@ -239,12 +239,12 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
239239

240240
define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
241241
; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
242-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
243-
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
244-
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
245-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
246-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6
242+
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
243+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
247244
; CHECK-NEXT: call void @use1(i1 [[TMP5]])
245+
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
246+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
247+
; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP8]], <4 x i1> [[TMP1]], i64 4)
248248
; CHECK-NEXT: [[TMP6:%.*]] = freeze <8 x i1> [[TMP4]]
249249
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP6]])
250250
; CHECK-NEXT: ret i1 [[TMP7]]

llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll

+19-10
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s %}
3-
; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %}
2+
; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,X86 %}
3+
; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s --check-prefixes=CHECK,AARCH64 %}
44

55
define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
66
; CHECK-LABEL: @logical_and_icmp_diff_preds(
@@ -28,14 +28,23 @@ define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
2828
}
2929

3030
define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
31-
; CHECK-LABEL: @logical_and_icmp_clamp(
32-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
33-
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
34-
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
35-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
36-
; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
37-
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
38-
; CHECK-NEXT: ret i1 [[TMP6]]
31+
; X86-LABEL: @logical_and_icmp_clamp(
32+
; X86-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
33+
; X86-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
34+
; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
35+
; X86-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4)
36+
; X86-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
37+
; X86-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
38+
; X86-NEXT: ret i1 [[TMP6]]
39+
;
40+
; AARCH64-LABEL: @logical_and_icmp_clamp(
41+
; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
42+
; AARCH64-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
43+
; AARCH64-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], <i32 17, i32 17, i32 17, i32 17, i32 42, i32 42, i32 42, i32 42>
44+
; AARCH64-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
45+
; AARCH64-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
46+
; AARCH64-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
47+
; AARCH64-NEXT: ret i1 [[TMP6]]
3948
;
4049
%x0 = extractelement <4 x i32> %x, i32 0
4150
%x1 = extractelement <4 x i32> %x, i32 1

0 commit comments

Comments
 (0)