Skip to content

Commit e7f47e7

Browse files
authored
[AMDGPU] Enable XNACK on gfx1250 (#161457)
XNACK should always be on for gfx1250. Fixes SWDEV-555931.
1 parent 78739ff commit e7f47e7

30 files changed

+1258
-1169
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2126,6 +2126,8 @@ def FeatureISAVersion12_50 : FeatureSet<
21262126
FeatureLdsBarrierArriveAtomic,
21272127
FeatureSetPrioIncWgInst,
21282128
Feature45BitNumRecordsBufferResource,
2129+
FeatureSupportsXNACK,
2130+
FeatureXNACK,
21292131
]>;
21302132

21312133
def FeatureISAVersion12_51 : FeatureSet<

llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll

Lines changed: 162 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -90,26 +90,24 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
9090
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8
9191
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9
9292
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11
93-
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10
93+
; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v13, v[0:1], off offset:10
9494
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa
9595
; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
96-
; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
96+
; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2
9797
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8
98-
; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5
98+
; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v1, 24, v5
9999
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6
100-
; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6
100+
; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6
101101
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
102-
; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9
102+
; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v5, 16, v8 :: v_dual_lshlrev_b32 v4, 24, v9
103+
; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0
103104
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
104-
; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10
105-
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1
106-
; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
105+
; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10
107106
; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
108-
; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0
109-
; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1
110-
; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
111-
; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
112-
; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
107+
; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v7, 24, v12 :: v_dual_lshlrev_b32 v8, 16, v13
108+
; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v3
109+
; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_2)
110+
; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v6
113111
; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
114112
;
115113
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -942,7 +940,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
942940
;
943941
; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
944942
; GFX1250-NOUNALIGNED: ; %bb.0:
945-
; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa
943+
; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb
946944
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
947945
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
948946
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2
@@ -954,27 +952,26 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
954952
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0
955953
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4
956954
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa
957-
; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
958-
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8
955+
; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s13, s[0:1], 0x8
959956
; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
960957
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8
961-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24
962-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16
963-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8
964-
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3
965-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24
966-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16
967-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8
958+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 24
959+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16
960+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s5, 8
961+
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s2
962+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s6, 24
963+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s7, 16
964+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s8, 8
968965
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10
969-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24
970-
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0
971-
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16
972-
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11
973-
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6
966+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s9, 24
967+
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0
968+
; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s12, 16
969+
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s3, s11
970+
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s5
971+
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s6, s13
974972
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1
975-
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2
976-
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3
977-
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
973+
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2
974+
; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4
978975
; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
979976
;
980977
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
@@ -1351,11 +1348,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
13511348
}
13521349

13531350
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
1354-
; GFX12-LABEL: s_load_constant_v3i32_align4:
1355-
; GFX12: ; %bb.0:
1356-
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1357-
; GFX12-NEXT: s_wait_kmcnt 0x0
1358-
; GFX12-NEXT: ; return to shader part epilog
1351+
; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align4:
1352+
; GFX12-UNALIGNED: ; %bb.0:
1353+
; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1354+
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
1355+
; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
1356+
;
1357+
; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align4:
1358+
; GFX12-NOUNALIGNED: ; %bb.0:
1359+
; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1360+
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
1361+
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
1362+
;
1363+
; GFX1250-LABEL: s_load_constant_v3i32_align4:
1364+
; GFX1250: ; %bb.0:
1365+
; GFX1250-NEXT: s_mov_b32 s4, s0
1366+
; GFX1250-NEXT: s_mov_b32 s5, s1
1367+
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
1368+
; GFX1250-NEXT: s_wait_kmcnt 0x0
1369+
; GFX1250-NEXT: ; return to shader part epilog
13591370
;
13601371
; GFX9-LABEL: s_load_constant_v3i32_align4:
13611372
; GFX9: ; %bb.0:
@@ -1388,11 +1399,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
13881399
}
13891400

13901401
define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
1391-
; GFX12-LABEL: s_load_constant_i96_align8:
1392-
; GFX12: ; %bb.0:
1393-
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1394-
; GFX12-NEXT: s_wait_kmcnt 0x0
1395-
; GFX12-NEXT: ; return to shader part epilog
1402+
; GFX12-UNALIGNED-LABEL: s_load_constant_i96_align8:
1403+
; GFX12-UNALIGNED: ; %bb.0:
1404+
; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1405+
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
1406+
; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
1407+
;
1408+
; GFX12-NOUNALIGNED-LABEL: s_load_constant_i96_align8:
1409+
; GFX12-NOUNALIGNED: ; %bb.0:
1410+
; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1411+
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
1412+
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
1413+
;
1414+
; GFX1250-LABEL: s_load_constant_i96_align8:
1415+
; GFX1250: ; %bb.0:
1416+
; GFX1250-NEXT: s_mov_b32 s4, s0
1417+
; GFX1250-NEXT: s_mov_b32 s5, s1
1418+
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
1419+
; GFX1250-NEXT: s_wait_kmcnt 0x0
1420+
; GFX1250-NEXT: ; return to shader part epilog
13961421
;
13971422
; GFX9-LABEL: s_load_constant_i96_align8:
13981423
; GFX9: ; %bb.0:
@@ -1425,11 +1450,25 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
14251450
}
14261451

14271452
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
1428-
; GFX12-LABEL: s_load_constant_v3i32_align8:
1429-
; GFX12: ; %bb.0:
1430-
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1431-
; GFX12-NEXT: s_wait_kmcnt 0x0
1432-
; GFX12-NEXT: ; return to shader part epilog
1453+
; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align8:
1454+
; GFX12-UNALIGNED: ; %bb.0:
1455+
; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1456+
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
1457+
; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
1458+
;
1459+
; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align8:
1460+
; GFX12-NOUNALIGNED: ; %bb.0:
1461+
; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1462+
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
1463+
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
1464+
;
1465+
; GFX1250-LABEL: s_load_constant_v3i32_align8:
1466+
; GFX1250: ; %bb.0:
1467+
; GFX1250-NEXT: s_mov_b32 s4, s0
1468+
; GFX1250-NEXT: s_mov_b32 s5, s1
1469+
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
1470+
; GFX1250-NEXT: s_wait_kmcnt 0x0
1471+
; GFX1250-NEXT: ; return to shader part epilog
14331472
;
14341473
; GFX9-LABEL: s_load_constant_v3i32_align8:
14351474
; GFX9: ; %bb.0:
@@ -1462,11 +1501,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
14621501
}
14631502

14641503
define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
1465-
; GFX12-LABEL: s_load_constant_v6i16_align8:
1466-
; GFX12: ; %bb.0:
1467-
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1468-
; GFX12-NEXT: s_wait_kmcnt 0x0
1469-
; GFX12-NEXT: ; return to shader part epilog
1504+
; GFX12-UNALIGNED-LABEL: s_load_constant_v6i16_align8:
1505+
; GFX12-UNALIGNED: ; %bb.0:
1506+
; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1507+
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
1508+
; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
1509+
;
1510+
; GFX12-NOUNALIGNED-LABEL: s_load_constant_v6i16_align8:
1511+
; GFX12-NOUNALIGNED: ; %bb.0:
1512+
; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1513+
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
1514+
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
1515+
;
1516+
; GFX1250-LABEL: s_load_constant_v6i16_align8:
1517+
; GFX1250: ; %bb.0:
1518+
; GFX1250-NEXT: s_mov_b32 s4, s0
1519+
; GFX1250-NEXT: s_mov_b32 s5, s1
1520+
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
1521+
; GFX1250-NEXT: s_wait_kmcnt 0x0
1522+
; GFX1250-NEXT: ; return to shader part epilog
14701523
;
14711524
; GFX9-LABEL: s_load_constant_v6i16_align8:
14721525
; GFX9: ; %bb.0:
@@ -1500,24 +1553,64 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
15001553
}
15011554

15021555
define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
1503-
; GFX12-LABEL: s_load_constant_v12i8_align8:
1504-
; GFX12: ; %bb.0:
1505-
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1506-
; GFX12-NEXT: s_wait_kmcnt 0x0
1507-
; GFX12-NEXT: s_lshr_b32 s13, s0, 8
1508-
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
1509-
; GFX12-NEXT: s_lshr_b32 s3, s0, 24
1510-
; GFX12-NEXT: s_lshr_b32 s5, s1, 8
1511-
; GFX12-NEXT: s_lshr_b32 s6, s1, 16
1512-
; GFX12-NEXT: s_lshr_b32 s7, s1, 24
1513-
; GFX12-NEXT: s_lshr_b32 s9, s2, 8
1514-
; GFX12-NEXT: s_lshr_b32 s10, s2, 16
1515-
; GFX12-NEXT: s_lshr_b32 s11, s2, 24
1516-
; GFX12-NEXT: s_mov_b32 s4, s1
1517-
; GFX12-NEXT: s_mov_b32 s8, s2
1518-
; GFX12-NEXT: s_mov_b32 s1, s13
1519-
; GFX12-NEXT: s_mov_b32 s2, s12
1520-
; GFX12-NEXT: ; return to shader part epilog
1556+
; GFX12-UNALIGNED-LABEL: s_load_constant_v12i8_align8:
1557+
; GFX12-UNALIGNED: ; %bb.0:
1558+
; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1559+
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
1560+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s13, s0, 8
1561+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s12, s0, 16
1562+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s3, s0, 24
1563+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s5, s1, 8
1564+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s6, s1, 16
1565+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s7, s1, 24
1566+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s9, s2, 8
1567+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s10, s2, 16
1568+
; GFX12-UNALIGNED-NEXT: s_lshr_b32 s11, s2, 24
1569+
; GFX12-UNALIGNED-NEXT: s_mov_b32 s4, s1
1570+
; GFX12-UNALIGNED-NEXT: s_mov_b32 s8, s2
1571+
; GFX12-UNALIGNED-NEXT: s_mov_b32 s1, s13
1572+
; GFX12-UNALIGNED-NEXT: s_mov_b32 s2, s12
1573+
; GFX12-UNALIGNED-NEXT: ; return to shader part epilog
1574+
;
1575+
; GFX12-NOUNALIGNED-LABEL: s_load_constant_v12i8_align8:
1576+
; GFX12-NOUNALIGNED: ; %bb.0:
1577+
; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
1578+
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
1579+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s13, s0, 8
1580+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s12, s0, 16
1581+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s3, s0, 24
1582+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s5, s1, 8
1583+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s6, s1, 16
1584+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s7, s1, 24
1585+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s9, s2, 8
1586+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s10, s2, 16
1587+
; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s11, s2, 24
1588+
; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s4, s1
1589+
; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s8, s2
1590+
; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s1, s13
1591+
; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s2, s12
1592+
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
1593+
;
1594+
; GFX1250-LABEL: s_load_constant_v12i8_align8:
1595+
; GFX1250: ; %bb.0:
1596+
; GFX1250-NEXT: s_mov_b32 s4, s0
1597+
; GFX1250-NEXT: s_mov_b32 s5, s1
1598+
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
1599+
; GFX1250-NEXT: s_wait_kmcnt 0x0
1600+
; GFX1250-NEXT: s_lshr_b32 s13, s0, 8
1601+
; GFX1250-NEXT: s_lshr_b32 s12, s0, 16
1602+
; GFX1250-NEXT: s_lshr_b32 s3, s0, 24
1603+
; GFX1250-NEXT: s_lshr_b32 s5, s1, 8
1604+
; GFX1250-NEXT: s_lshr_b32 s6, s1, 16
1605+
; GFX1250-NEXT: s_lshr_b32 s7, s1, 24
1606+
; GFX1250-NEXT: s_lshr_b32 s9, s2, 8
1607+
; GFX1250-NEXT: s_lshr_b32 s10, s2, 16
1608+
; GFX1250-NEXT: s_lshr_b32 s11, s2, 24
1609+
; GFX1250-NEXT: s_mov_b32 s4, s1
1610+
; GFX1250-NEXT: s_mov_b32 s8, s2
1611+
; GFX1250-NEXT: s_mov_b32 s1, s13
1612+
; GFX1250-NEXT: s_mov_b32 s2, s12
1613+
; GFX1250-NEXT: ; return to shader part epilog
15211614
;
15221615
; GFX9-LABEL: s_load_constant_v12i8_align8:
15231616
; GFX9: ; %bb.0:

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -569,10 +569,10 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
569569
; GFX1250: ; %bb.0:
570570
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
571571
; GFX1250-NEXT: s_wait_kmcnt 0x0
572-
; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
572+
; GFX1250-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
573573
; GFX1250-NEXT: s_clause 0x1
574-
; GFX1250-NEXT: global_load_b128 v[0:3], v[4:5], off
575-
; GFX1250-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
574+
; GFX1250-NEXT: global_load_b128 v[0:3], v[8:9], off
575+
; GFX1250-NEXT: global_load_b128 v[4:7], v[8:9], off offset:16
576576
; GFX1250-NEXT: s_wait_loadcnt 0x0
577577
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
578578
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
@@ -752,12 +752,12 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
752752
; GFX1250: ; %bb.0:
753753
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
754754
; GFX1250-NEXT: s_wait_kmcnt 0x0
755-
; GFX1250-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
755+
; GFX1250-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_mov_b32 v16, v0
756756
; GFX1250-NEXT: s_clause 0x3
757-
; GFX1250-NEXT: global_load_b128 v[0:3], v[12:13], off
758-
; GFX1250-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
759-
; GFX1250-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
760-
; GFX1250-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
757+
; GFX1250-NEXT: global_load_b128 v[0:3], v[16:17], off
758+
; GFX1250-NEXT: global_load_b128 v[4:7], v[16:17], off offset:16
759+
; GFX1250-NEXT: global_load_b128 v[8:11], v[16:17], off offset:32
760+
; GFX1250-NEXT: global_load_b128 v[12:15], v[16:17], off offset:48
761761
; GFX1250-NEXT: s_wait_loadcnt 0x0
762762
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
763763
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
@@ -1055,16 +1055,16 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
10551055
; GFX1250: ; %bb.0:
10561056
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
10571057
; GFX1250-NEXT: s_wait_kmcnt 0x0
1058-
; GFX1250-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
1058+
; GFX1250-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
10591059
; GFX1250-NEXT: s_clause 0x7
1060-
; GFX1250-NEXT: global_load_b128 v[0:3], v[28:29], off
1061-
; GFX1250-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
1062-
; GFX1250-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
1063-
; GFX1250-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
1064-
; GFX1250-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
1065-
; GFX1250-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
1066-
; GFX1250-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
1067-
; GFX1250-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
1060+
; GFX1250-NEXT: global_load_b128 v[0:3], v[32:33], off
1061+
; GFX1250-NEXT: global_load_b128 v[4:7], v[32:33], off offset:16
1062+
; GFX1250-NEXT: global_load_b128 v[8:11], v[32:33], off offset:32
1063+
; GFX1250-NEXT: global_load_b128 v[12:15], v[32:33], off offset:48
1064+
; GFX1250-NEXT: global_load_b128 v[16:19], v[32:33], off offset:64
1065+
; GFX1250-NEXT: global_load_b128 v[20:23], v[32:33], off offset:80
1066+
; GFX1250-NEXT: global_load_b128 v[24:27], v[32:33], off offset:96
1067+
; GFX1250-NEXT: global_load_b128 v[28:31], v[32:33], off offset:112
10681068
; GFX1250-NEXT: s_wait_loadcnt 0x0
10691069
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
10701070
%load = load <64 x bfloat>, ptr addrspace(1) %ptr

0 commit comments

Comments
 (0)