Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/transform/lower_tile_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1247,6 +1247,20 @@ class LowerTileOpPass : arith::IRMutatorWithAnalyzer {
has_non_local_store = true;
}
}
} else if (call->op.same_as(builtin::call_extern())) {
// call_extern may pass address_of(non-local-buffer) pointers
for (const auto &arg : call->args) {
if (auto ic = arg.as<CallNode>()) {
if (ic->op.same_as(builtin::address_of()) &&
!ic->args.empty()) {
if (auto bl = ic->args[0].as<BufferLoadNode>()) {
if (!IsLocalBuffer(bl->buffer)) {
has_non_local_store = true;
}
}
}
}
}
}
}
});
Expand Down
46 changes: 46 additions & 0 deletions src/transform/producer_consumer_ws.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,52 @@ class ProducerConsumerWSRewriter : public StmtExprMutator {
++access_group_idx;
}

// --- Adjust wait positions for SIMT/cp.async producers ---
// SIMT producers (global→shared via cp.async) tie their completion to ALL
// forward barriers. The consumer must therefore wait on a forward barrier
// BEFORE it reads any SIMT-produced shared buffer. If the current
// wait_insert_pos is too late (i.e. the consumer reads the SIMT-produced
// buffer before the earliest TMA wait), we must pull ALL waits earlier.
if (has_simt_producer || has_cp_async_producer) {
int earliest_simt_read = static_cast<int>(consumer_compute_stmts.size());
for (size_t i = 0; i < flat_stmts.size(); ++i) {
if (kinds[i] != TileStmtKind::kSimtProducer &&
kinds[i] != TileStmtKind::kCpAsyncProducer) {
continue;
}
// Collect shared buffers written by this SIMT/cp.async producer.
std::unordered_set<const VarNode *> written_vars;
PostOrderVisit(flat_stmts[i], [&](const ObjectRef &obj) {
if (const auto *store = obj.as<BufferStoreNode>()) {
if (IsSharedBuffer(store->buffer)) {
written_vars.insert(store->buffer->data.get());
}
}
});
// Find first consumer read of any of these buffers.
for (size_t ci = 0; ci < consumer_compute_stmts.size(); ++ci) {
bool found = false;
PostOrderVisit(consumer_compute_stmts[ci], [&](const ObjectRef &obj) {
if (found) return;
if (const auto *load = obj.as<BufferLoadNode>()) {
if (written_vars.count(load->buffer->data.get())) {
found = true;
}
}
});
if (found) {
earliest_simt_read =
std::min(earliest_simt_read, static_cast<int>(ci));
break;
}
}
}
// Pull all wait positions earlier if needed.
for (int g = 0; g < num_producer_groups; ++g) {
wait_insert_pos[g] = std::min(wait_insert_pos[g], earliest_simt_read);
}
}

// --- Determine if TMA barriers can be merged ---
// When all pure-TMA producers wait at the same consumer position and
// release at the same position, forward and back-pressure barriers can
Expand Down
Loading