@@ -1884,6 +1884,7 @@ def loadBatchedAddress(self, kernel, Batch, tmpSgprResource: ContinuousRegister)
18841884 module.add(SLoadB64(dst=sgpr("AddressD", 2), base=sgpr("AddressD",2), soffset=sgpr(tmpSgpr), comment="load global buffer D address"))
18851885 # Apply batch offset to AddressD for general batched mode
18861886 if not kernel["ProblemType"]["StridedBatched"] and not kernel["ProblemType"]["GroupedGemm"]:
1887+ module.add(SWaitCnt(kmcnt=0, comment="Wait for the Matrix Address Load from the Pointer Array before updating offset"))
18871888 module.add(SAddU64(dst=sgpr("AddressD", 2), src0=sgpr("AddressD", 2), src1=sgpr("BatchOffsetD", 2), comment="add batch offset to D address"))
18881889
18891890 endCheckLabel = Label(self.labels.getName(f"label_skip_c_buffer_deref_{Batch}"), "")
@@ -1896,6 +1897,7 @@ def loadBatchedAddress(self, kernel, Batch, tmpSgprResource: ContinuousRegister)
18961897 module.add(SLoadB64(dst=sgpr("AddressC", 2), base=sgpr("AddressC",2), soffset=sgpr(tmpSgpr), comment="load global buffer C address"))
18971898 # Apply batch offset to AddressC for general batched mode
18981899 if not kernel["ProblemType"]["StridedBatched"] and not kernel["ProblemType"]["GroupedGemm"]:
1900+ module.add(SWaitCnt(kmcnt=0, comment="Wait for the Matrix Address Load from the Pointer Array before updating offset"))
18991901 module.add(SAddU64(dst=sgpr("AddressC", 2), src0=sgpr("AddressC", 2), src1=sgpr("BatchOffsetC", 2), comment="add batch offset to C address"))
19001902
19011903 module.add(endCheckLabel)
@@ -1917,7 +1919,9 @@ def loadBatchedAddress(self, kernel, Batch, tmpSgprResource: ContinuousRegister)
19171919 module.add(SLoadB64(dst=sgpr("AddressB", 2), base=sgpr("AddressB",2), soffset=sgpr(tmpSgpr), comment="load global buffer B address"))
19181920 # Apply batch offset to AddressA and AddressB for general batched mode
19191921 if not kernel["ProblemType"]["StridedBatched"] and not kernel["ProblemType"]["GroupedGemm"]:
1922+ module.add(SWaitCnt(kmcnt=1, comment="Wait for the AddressA Load from the Pointer Array before updating offset"))
19201923 module.add(SAddU64(dst=sgpr("AddressA", 2), src0=sgpr("AddressA", 2), src1=sgpr("BatchOffsetA", 2), comment="add batch offset to A address"))
1924+ module.add(SWaitCnt(kmcnt=0, comment="Wait for the AddressB Load from the Pointer Array before updating offset"))
19211925 module.add(SAddU64(dst=sgpr("AddressB", 2), src0=sgpr("AddressB", 2), src1=sgpr("BatchOffsetB", 2), comment="add batch offset to B address"))
19221926
19231927 module.add(endCheckLabel)
0 commit comments