Skip to content

Commit 8009839

Browse files
committed
[hipblaslt] add waitcnt instruction after SLoadB64 before updating the offset.
1 parent acc005a commit 8009839

1 file changed

Lines changed: 4 additions & 0 deletions

File tree

projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1884,6 +1884,7 @@ def loadBatchedAddress(self, kernel, Batch, tmpSgprResource: ContinuousRegister)
18841884
module.add(SLoadB64(dst=sgpr("AddressD", 2), base=sgpr("AddressD",2), soffset=sgpr(tmpSgpr), comment="load global buffer D address"))
18851885
# Apply batch offset to AddressD for general batched mode
18861886
if not kernel["ProblemType"]["StridedBatched"] and not kernel["ProblemType"]["GroupedGemm"]:
1887+
module.add(SWaitCnt(kmcnt=0, comment="Wait for the Matrix Address Load from the Pointer Array before updating offset"))
18871888
module.add(SAddU64(dst=sgpr("AddressD", 2), src0=sgpr("AddressD", 2), src1=sgpr("BatchOffsetD", 2), comment="add batch offset to D address"))
18881889

18891890
endCheckLabel = Label(self.labels.getName(f"label_skip_c_buffer_deref_{Batch}"), "")
@@ -1896,6 +1897,7 @@ def loadBatchedAddress(self, kernel, Batch, tmpSgprResource: ContinuousRegister)
18961897
module.add(SLoadB64(dst=sgpr("AddressC", 2), base=sgpr("AddressC",2), soffset=sgpr(tmpSgpr), comment="load global buffer C address"))
18971898
# Apply batch offset to AddressC for general batched mode
18981899
if not kernel["ProblemType"]["StridedBatched"] and not kernel["ProblemType"]["GroupedGemm"]:
1900+
module.add(SWaitCnt(kmcnt=0, comment="Wait for the Matrix Address Load from the Pointer Array before updating offset"))
18991901
module.add(SAddU64(dst=sgpr("AddressC", 2), src0=sgpr("AddressC", 2), src1=sgpr("BatchOffsetC", 2), comment="add batch offset to C address"))
19001902

19011903
module.add(endCheckLabel)
@@ -1917,7 +1919,9 @@ def loadBatchedAddress(self, kernel, Batch, tmpSgprResource: ContinuousRegister)
19171919
module.add(SLoadB64(dst=sgpr("AddressB", 2), base=sgpr("AddressB",2), soffset=sgpr(tmpSgpr), comment="load global buffer B address"))
19181920
# Apply batch offset to AddressA and AddressB for general batched mode
19191921
if not kernel["ProblemType"]["StridedBatched"] and not kernel["ProblemType"]["GroupedGemm"]:
1922+
module.add(SWaitCnt(kmcnt=1, comment="Wait for the AddressA Load from the Pointer Array before updating offset"))
19201923
module.add(SAddU64(dst=sgpr("AddressA", 2), src0=sgpr("AddressA", 2), src1=sgpr("BatchOffsetA", 2), comment="add batch offset to A address"))
1924+
module.add(SWaitCnt(kmcnt=0, comment="Wait for the AddressB Load from the Pointer Array before updating offset"))
19211925
module.add(SAddU64(dst=sgpr("AddressB", 2), src0=sgpr("AddressB", 2), src1=sgpr("BatchOffsetB", 2), comment="add batch offset to B address"))
19221926

19231927
module.add(endCheckLabel)

0 commit comments

Comments
 (0)