Skip to content

Commit 3a4b30e

Browse files
committed
[AArch64][GISel] Scalarize i128 ICmp and Select.
Similar to other i128 bit operations, we scalarize any icmps or selects larger than 64 bits.
1 parent 7582308 commit 3a4b30e

File tree

3 files changed

+116
-99
lines changed

3 files changed

+116
-99
lines changed

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
543543
.widenScalarOrEltToNextPow2(1)
544544
.clampScalar(1, s32, s64)
545545
.clampScalar(0, s32, s32)
546+
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
546547
.minScalarEltSameAsIf(
547548
[=](const LegalityQuery &Query) {
548549
const LLT &Ty = Query.Types[0];
@@ -785,6 +786,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
785786
.widenScalarToNextPow2(0)
786787
.clampScalar(0, s32, s64)
787788
.clampScalar(1, s32, s32)
789+
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
788790
.minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
789791
.lowerIf(isVector(0));
790792

llvm/test/CodeGen/AArch64/fcmp.ll

Lines changed: 57 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -465,49 +465,33 @@ define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d,
465465
; CHECK-GI-NEXT: .cfi_offset w30, -16
466466
; CHECK-GI-NEXT: stp q3, q1, [sp] // 32-byte Folded Spill
467467
; CHECK-GI-NEXT: mov v1.16b, v2.16b
468-
; CHECK-GI-NEXT: stp q4, q5, [sp, #32] // 32-byte Folded Spill
469-
; CHECK-GI-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill
468+
; CHECK-GI-NEXT: stp q6, q4, [sp, #32] // 32-byte Folded Spill
469+
; CHECK-GI-NEXT: stp q7, q5, [sp, #64] // 32-byte Folded Spill
470470
; CHECK-GI-NEXT: bl __lttf2
471471
; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
472-
; CHECK-GI-NEXT: cmp w0, #0
473-
; CHECK-GI-NEXT: cset w19, lt
472+
; CHECK-GI-NEXT: mov w19, w0
474473
; CHECK-GI-NEXT: bl __lttf2
475-
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
476-
; CHECK-GI-NEXT: cmp w0, #0
477-
; CHECK-GI-NEXT: bfi x19, x8, #32, #32
478-
; CHECK-GI-NEXT: cset w8, lt
479-
; CHECK-GI-NEXT: fmov x10, d0
480-
; CHECK-GI-NEXT: mov x11, v0.d[1]
481-
; CHECK-GI-NEXT: bfi x8, x8, #32, #32
482-
; CHECK-GI-NEXT: ldp q0, q1, [sp, #48] // 32-byte Folded Reload
483-
; CHECK-GI-NEXT: lsl x9, x19, #63
484-
; CHECK-GI-NEXT: lsl x8, x8, #63
474+
; CHECK-GI-NEXT: ldp q3, q2, [sp, #32] // 32-byte Folded Reload
475+
; CHECK-GI-NEXT: cmp w19, #0
485476
; CHECK-GI-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
486-
; CHECK-GI-NEXT: asr x9, x9, #63
487-
; CHECK-GI-NEXT: fmov x12, d0
488-
; CHECK-GI-NEXT: mov x13, v0.d[1]
489-
; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
490-
; CHECK-GI-NEXT: fmov x14, d1
491-
; CHECK-GI-NEXT: asr x8, x8, #63
492-
; CHECK-GI-NEXT: and x10, x10, x9
493-
; CHECK-GI-NEXT: fmov x15, d0
494-
; CHECK-GI-NEXT: mov x16, v1.d[1]
495-
; CHECK-GI-NEXT: mov x17, v0.d[1]
496-
; CHECK-GI-NEXT: and x12, x12, x8
497-
; CHECK-GI-NEXT: bic x14, x14, x9
498-
; CHECK-GI-NEXT: bic x15, x15, x8
499-
; CHECK-GI-NEXT: orr x10, x10, x14
500-
; CHECK-GI-NEXT: orr x12, x12, x15
501-
; CHECK-GI-NEXT: mov v0.d[0], x10
502-
; CHECK-GI-NEXT: and x10, x11, x9
503-
; CHECK-GI-NEXT: mov v1.d[0], x12
504-
; CHECK-GI-NEXT: and x11, x13, x8
505-
; CHECK-GI-NEXT: bic x9, x16, x9
506-
; CHECK-GI-NEXT: bic x8, x17, x8
507-
; CHECK-GI-NEXT: orr x9, x10, x9
508-
; CHECK-GI-NEXT: orr x8, x11, x8
509-
; CHECK-GI-NEXT: mov v0.d[1], x9
510-
; CHECK-GI-NEXT: mov v1.d[1], x8
477+
; CHECK-GI-NEXT: mov d0, v2.d[1]
478+
; CHECK-GI-NEXT: mov d1, v3.d[1]
479+
; CHECK-GI-NEXT: fcsel d2, d2, d3, lt
480+
; CHECK-GI-NEXT: fmov x8, d2
481+
; CHECK-GI-NEXT: fcsel d3, d0, d1, lt
482+
; CHECK-GI-NEXT: ldp q5, q0, [sp, #64] // 32-byte Folded Reload
483+
; CHECK-GI-NEXT: cmp w0, #0
484+
; CHECK-GI-NEXT: mov d1, v0.d[1]
485+
; CHECK-GI-NEXT: mov d4, v5.d[1]
486+
; CHECK-GI-NEXT: fcsel d0, d0, d5, lt
487+
; CHECK-GI-NEXT: fmov x9, d0
488+
; CHECK-GI-NEXT: mov v0.d[0], x8
489+
; CHECK-GI-NEXT: fmov x8, d3
490+
; CHECK-GI-NEXT: fcsel d2, d1, d4, lt
491+
; CHECK-GI-NEXT: mov v1.d[0], x9
492+
; CHECK-GI-NEXT: fmov x9, d2
493+
; CHECK-GI-NEXT: mov v0.d[1], x8
494+
; CHECK-GI-NEXT: mov v1.d[1], x9
511495
; CHECK-GI-NEXT: add sp, sp, #112
512496
; CHECK-GI-NEXT: ret
513497
entry:
@@ -567,77 +551,52 @@ define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d,
567551
; CHECK-GI-NEXT: mov v1.16b, v3.16b
568552
; CHECK-GI-NEXT: stp q5, q2, [sp, #32] // 32-byte Folded Spill
569553
; CHECK-GI-NEXT: ldr q2, [sp, #192]
570-
; CHECK-GI-NEXT: str q7, [sp, #64] // 16-byte Folded Spill
571-
; CHECK-GI-NEXT: stp q6, q2, [sp, #80] // 32-byte Folded Spill
554+
; CHECK-GI-NEXT: str q2, [sp, #144] // 16-byte Folded Spill
572555
; CHECK-GI-NEXT: ldr q2, [sp, #208]
573-
; CHECK-GI-NEXT: str q2, [sp, #112] // 16-byte Folded Spill
556+
; CHECK-GI-NEXT: stp q2, q6, [sp, #64] // 32-byte Folded Spill
574557
; CHECK-GI-NEXT: ldr q2, [sp, #224]
575-
; CHECK-GI-NEXT: str q2, [sp, #128] // 16-byte Folded Spill
558+
; CHECK-GI-NEXT: stp q7, q2, [sp, #96] // 32-byte Folded Spill
576559
; CHECK-GI-NEXT: ldr q2, [sp, #240]
577-
; CHECK-GI-NEXT: str q2, [sp, #144] // 16-byte Folded Spill
560+
; CHECK-GI-NEXT: str q2, [sp, #128] // 16-byte Folded Spill
578561
; CHECK-GI-NEXT: bl __lttf2
579562
; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
580-
; CHECK-GI-NEXT: cmp w0, #0
581-
; CHECK-GI-NEXT: cset w19, lt
563+
; CHECK-GI-NEXT: mov w19, w0
582564
; CHECK-GI-NEXT: bl __lttf2
583565
; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
584-
; CHECK-GI-NEXT: cmp w0, #0
585-
; CHECK-GI-NEXT: cset w20, lt
566+
; CHECK-GI-NEXT: mov w20, w0
586567
; CHECK-GI-NEXT: bl __lttf2
587-
; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
588-
; CHECK-GI-NEXT: bfi x19, x8, #32, #32
589-
; CHECK-GI-NEXT: bfi x20, x8, #32, #32
590-
; CHECK-GI-NEXT: cmp w0, #0
568+
; CHECK-GI-NEXT: ldp q5, q4, [sp, #64] // 32-byte Folded Reload
569+
; CHECK-GI-NEXT: cmp w19, #0
570+
; CHECK-GI-NEXT: ldp q7, q6, [sp, #96] // 32-byte Folded Reload
591571
; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload
592-
; CHECK-GI-NEXT: fmov x8, d0
593-
; CHECK-GI-NEXT: mov x10, v0.d[1]
594-
; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
595-
; CHECK-GI-NEXT: cset w9, lt
596-
; CHECK-GI-NEXT: lsl x13, x19, #63
597-
; CHECK-GI-NEXT: lsl x14, x20, #63
598-
; CHECK-GI-NEXT: fmov x11, d0
599-
; CHECK-GI-NEXT: mov x12, v0.d[1]
600-
; CHECK-GI-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload
601-
; CHECK-GI-NEXT: bfi x9, x8, #32, #32
602-
; CHECK-GI-NEXT: asr x13, x13, #63
603-
; CHECK-GI-NEXT: asr x14, x14, #63
604-
; CHECK-GI-NEXT: fmov x15, d0
605-
; CHECK-GI-NEXT: mov x16, v0.d[1]
606-
; CHECK-GI-NEXT: ldp q0, q1, [sp, #112] // 32-byte Folded Reload
607-
; CHECK-GI-NEXT: lsl x9, x9, #63
608-
; CHECK-GI-NEXT: and x8, x8, x13
609-
; CHECK-GI-NEXT: and x11, x11, x14
610-
; CHECK-GI-NEXT: asr x9, x9, #63
572+
; CHECK-GI-NEXT: mov d0, v4.d[1]
573+
; CHECK-GI-NEXT: mov d1, v5.d[1]
574+
; CHECK-GI-NEXT: fcsel d4, d4, d5, lt
575+
; CHECK-GI-NEXT: mov d2, v7.d[1]
576+
; CHECK-GI-NEXT: mov d3, v6.d[1]
577+
; CHECK-GI-NEXT: fmov x8, d4
578+
; CHECK-GI-NEXT: fcsel d5, d0, d1, lt
579+
; CHECK-GI-NEXT: cmp w20, #0
580+
; CHECK-GI-NEXT: fcsel d1, d7, d6, lt
581+
; CHECK-GI-NEXT: ldp q7, q0, [sp, #128] // 32-byte Folded Reload
582+
; CHECK-GI-NEXT: fcsel d3, d2, d3, lt
583+
; CHECK-GI-NEXT: cmp w0, #0
611584
; CHECK-GI-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload
612-
; CHECK-GI-NEXT: fmov x17, d0
613-
; CHECK-GI-NEXT: mov x18, v0.d[1]
614-
; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload
615-
; CHECK-GI-NEXT: fmov x0, d1
616-
; CHECK-GI-NEXT: and x15, x15, x9
617-
; CHECK-GI-NEXT: mov x2, v1.d[1]
618-
; CHECK-GI-NEXT: fmov x1, d0
619-
; CHECK-GI-NEXT: mov x3, v0.d[1]
620-
; CHECK-GI-NEXT: bic x17, x17, x13
621-
; CHECK-GI-NEXT: bic x0, x0, x14
622-
; CHECK-GI-NEXT: orr x8, x8, x17
623-
; CHECK-GI-NEXT: bic x1, x1, x9
624-
; CHECK-GI-NEXT: orr x11, x11, x0
585+
; CHECK-GI-NEXT: mov d2, v0.d[1]
586+
; CHECK-GI-NEXT: mov d6, v7.d[1]
587+
; CHECK-GI-NEXT: fcsel d7, d0, d7, lt
625588
; CHECK-GI-NEXT: mov v0.d[0], x8
626-
; CHECK-GI-NEXT: orr x15, x15, x1
627-
; CHECK-GI-NEXT: mov v1.d[0], x11
628-
; CHECK-GI-NEXT: and x8, x10, x13
629-
; CHECK-GI-NEXT: mov v2.d[0], x15
630-
; CHECK-GI-NEXT: and x10, x12, x14
631-
; CHECK-GI-NEXT: and x11, x16, x9
632-
; CHECK-GI-NEXT: bic x12, x18, x13
633-
; CHECK-GI-NEXT: bic x13, x2, x14
634-
; CHECK-GI-NEXT: bic x9, x3, x9
635-
; CHECK-GI-NEXT: orr x8, x8, x12
636-
; CHECK-GI-NEXT: orr x10, x10, x13
637-
; CHECK-GI-NEXT: orr x9, x11, x9
589+
; CHECK-GI-NEXT: fmov x8, d1
590+
; CHECK-GI-NEXT: fmov x9, d7
591+
; CHECK-GI-NEXT: fcsel d4, d2, d6, lt
592+
; CHECK-GI-NEXT: mov v1.d[0], x8
593+
; CHECK-GI-NEXT: fmov x8, d5
594+
; CHECK-GI-NEXT: mov v2.d[0], x9
595+
; CHECK-GI-NEXT: fmov x9, d3
596+
; CHECK-GI-NEXT: fmov x10, d4
638597
; CHECK-GI-NEXT: mov v0.d[1], x8
639-
; CHECK-GI-NEXT: mov v1.d[1], x10
640-
; CHECK-GI-NEXT: mov v2.d[1], x9
598+
; CHECK-GI-NEXT: mov v1.d[1], x9
599+
; CHECK-GI-NEXT: mov v2.d[1], x10
641600
; CHECK-GI-NEXT: add sp, sp, #192
642601
; CHECK-GI-NEXT: ret
643602
entry:

llvm/test/CodeGen/AArch64/icmp.ll

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3-
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
3+
; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
44

55
define i64 @i64_i64(i64 %a, i64 %b, i64 %d, i64 %e) {
66
; CHECK-LABEL: i64_i64:
@@ -1376,6 +1376,62 @@ entry:
13761376
ret <32 x i8> %s
13771377
}
13781378

1379+
define <2 x i128> @v2i128_i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %d, <2 x i128> %e) {
1380+
; CHECK-SD-LABEL: v2i128_i128:
1381+
; CHECK-SD: // %bb.0: // %entry
1382+
; CHECK-SD-NEXT: add x10, sp, #32
1383+
; CHECK-SD-NEXT: mov x11, sp
1384+
; CHECK-SD-NEXT: cmp x0, x4
1385+
; CHECK-SD-NEXT: orr x12, x10, #0x8
1386+
; CHECK-SD-NEXT: orr x13, x11, #0x8
1387+
; CHECK-SD-NEXT: sbcs xzr, x1, x5
1388+
; CHECK-SD-NEXT: add x8, sp, #48
1389+
; CHECK-SD-NEXT: add x9, sp, #16
1390+
; CHECK-SD-NEXT: csel x12, x13, x12, lt
1391+
; CHECK-SD-NEXT: csel x10, x11, x10, lt
1392+
; CHECK-SD-NEXT: cmp x2, x6
1393+
; CHECK-SD-NEXT: orr x11, x8, #0x8
1394+
; CHECK-SD-NEXT: orr x13, x9, #0x8
1395+
; CHECK-SD-NEXT: sbcs xzr, x3, x7
1396+
; CHECK-SD-NEXT: ldr x0, [x10]
1397+
; CHECK-SD-NEXT: csel x8, x9, x8, lt
1398+
; CHECK-SD-NEXT: csel x9, x13, x11, lt
1399+
; CHECK-SD-NEXT: ldr x1, [x12]
1400+
; CHECK-SD-NEXT: ldr x2, [x8]
1401+
; CHECK-SD-NEXT: ldr x3, [x9]
1402+
; CHECK-SD-NEXT: ret
1403+
;
1404+
; CHECK-GI-LABEL: v2i128_i128:
1405+
; CHECK-GI: // %bb.0: // %entry
1406+
; CHECK-GI-NEXT: cmp x1, x5
1407+
; CHECK-GI-NEXT: ldp x8, x9, [sp]
1408+
; CHECK-GI-NEXT: cset w10, lt
1409+
; CHECK-GI-NEXT: cmp x0, x4
1410+
; CHECK-GI-NEXT: cset w13, lo
1411+
; CHECK-GI-NEXT: cmp x1, x5
1412+
; CHECK-GI-NEXT: csel w10, w13, w10, eq
1413+
; CHECK-GI-NEXT: cmp x3, x7
1414+
; CHECK-GI-NEXT: ldp x13, x14, [sp, #32]
1415+
; CHECK-GI-NEXT: cset w15, lt
1416+
; CHECK-GI-NEXT: cmp x2, x6
1417+
; CHECK-GI-NEXT: ldp x11, x12, [sp, #16]
1418+
; CHECK-GI-NEXT: cset w16, lo
1419+
; CHECK-GI-NEXT: cmp x3, x7
1420+
; CHECK-GI-NEXT: ldp x17, x18, [sp, #48]
1421+
; CHECK-GI-NEXT: csel w15, w16, w15, eq
1422+
; CHECK-GI-NEXT: tst w10, #0x1
1423+
; CHECK-GI-NEXT: csel x0, x8, x13, ne
1424+
; CHECK-GI-NEXT: csel x1, x9, x14, ne
1425+
; CHECK-GI-NEXT: tst w15, #0x1
1426+
; CHECK-GI-NEXT: csel x2, x11, x17, ne
1427+
; CHECK-GI-NEXT: csel x3, x12, x18, ne
1428+
; CHECK-GI-NEXT: ret
1429+
entry:
1430+
%c = icmp slt <2 x i128> %a, %b
1431+
%s = select <2 x i1> %c, <2 x i128> %d, <2 x i128> %e
1432+
ret <2 x i128> %s
1433+
}
1434+
13791435
; ===== ICMP Zero RHS =====
13801436

13811437
define <8 x i1> @icmp_eq_v8i8_Zero_RHS(<8 x i8> %a) {

0 commit comments

Comments
 (0)