Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
659 changes: 659 additions & 0 deletions docs/designs/ptoas-graph-sync-solver-buf-id-design.md

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions include/PTO/IR/PTOSyncUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ PIPE mapSyncOpTypeToPipe(SyncOpType opType);
/// True if the pipe is a concrete endpoint pipe (not PIPE_ALL/UNASSIGNED).
bool isConcreteSyncPipe(PIPE pipe);

/// Pick a canonical SyncOpType endpoint for a concrete PIPE. This is the
/// inverse of mapSyncOpTypeToPipe up to the fact that mapSyncOpTypeToPipe is
/// many-to-one: e.g. both TVEC and TMOV_M2V map to PIPE_V. The canonical
/// SyncOpType returned here is sufficient for the EmitC pattern matchers,
/// which only care that the high-level attr maps back to the same concrete
/// PIPE.
///
/// Only PIPE_MTE2, PIPE_MTE3, PIPE_FIX, PIPE_MTE1, PIPE_V and PIPE_M have a
/// canonical endpoint. Returns failure() for every other pipe value, which
/// includes the non-concrete PIPE_ALL/UNASSIGNED.
FailureOr<SyncOpType> mapPipeToCanonicalSyncOpType(PIPE pipe);

} // namespace pto
} // namespace mlir

Expand Down
47 changes: 47 additions & 0 deletions include/PTO/Transforms/GraphSyncSolver/SyncSolverIR.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ enum struct OpType {
SET_FLAG_OP,
WAIT_FLAG_OP,
SW_FLAG_OP_END,
BUF_OP,
GET_BUF_OP,
RLS_BUF_OP,
BUF_OP_END,
SYNC_OP_END,
RW_OPERATION,
MMAD_OPERATION,
Expand Down Expand Up @@ -503,6 +507,49 @@ class BarrierOp : public SyncOp {
std::string str(int indent, bool recursive) const override;
};

// A5-only buffer-id bracket op (get_buf / rls_buf). Unlike SetWaitOp which
// names both src and dst pipes, a BufOp lives on a single concrete pipe — the
// shared bufId on producer and consumer brackets is what enforces ordering.
class BufOp : public SyncOp {
public:
pto::PIPE pipe{pto::PIPE::PIPE_UNASSIGNED};
int64_t bufId{-1};

BufOp(OpType opType, Operation *op, OperationBase *parentOp, pto::PIPE pipe,
int64_t bufId)
: SyncOp(opType, op, parentOp), pipe(pipe), bufId(bufId) {}

static bool classof(const OperationBase *e) {
return e->opType >= OpType::BUF_OP && e->opType < OpType::BUF_OP_END;
}
};

class GetBufOp : public BufOp {
public:
GetBufOp(Operation *op, OperationBase *parentOp, pto::PIPE pipe,
int64_t bufId)
: BufOp(OpType::GET_BUF_OP, op, parentOp, pipe, bufId) {}

static bool classof(const OperationBase *e) {
return e->opType == OpType::GET_BUF_OP;
}

std::string str(int indent, bool recursive) const override;
};

class RlsBufOp : public BufOp {
public:
RlsBufOp(Operation *op, OperationBase *parentOp, pto::PIPE pipe,
int64_t bufId)
: BufOp(OpType::RLS_BUF_OP, op, parentOp, pipe, bufId) {}

static bool classof(const OperationBase *e) {
return e->opType == OpType::RLS_BUF_OP;
}

std::string str(int indent, bool recursive) const override;
};

// Bool comparator for sync ops ordering (used for containers).
bool operator<(const SyncOp &op1, const SyncOp &op2);
} // namespace mlir::pto::syncsolver
Expand Down
13 changes: 13 additions & 0 deletions include/PTO/Transforms/GraphSyncSolver/Utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,14 @@ enum SyncMode {
TEST_CROSS_CORE_MODE,
};

// Emission shape for the sync solver output. SET_WAIT is the legacy
// pto.set_flag / pto.wait_flag pairing. BUF_ID is the A5-only buffer-id
// bracketing model (pto.get_buf / pto.rls_buf around each anchor op).
enum class SyncEmitStyle {
// Emit pto.set_flag / pto.wait_flag pairs (legacy style).
SET_WAIT,
// Emit pto.get_buf / pto.rls_buf brackets around each anchor op (A5 only).
BUF_ID,
};

struct SyncSolverOptions {
// Synchronization mode.
const SyncMode syncMode;
Expand All @@ -130,6 +138,9 @@ struct SyncSolverOptions {
// Architecture is register based (A5).
const bool isRegBasedArch;

// Sync emission style. BUF_ID requires A5 (isRegBasedArch).
SyncEmitStyle emitStyle{SyncEmitStyle::SET_WAIT};

// Decompose MMAD L1 ops into simpler ops for better sync handling.
bool decomposeMmadl1Op{false};

Expand Down Expand Up @@ -184,6 +195,8 @@ struct SyncSolverOptions {
return syncMode == SyncMode::TEST_INTRA_CORE_MODE ||
syncMode == SyncMode::TEST_CROSS_CORE_MODE;
}

// True when the configured emission style is the buffer-id bracketing model
// (SyncEmitStyle::BUF_ID) rather than set/wait flags.
bool isBufIdEmit() const {
  const bool usesBufIdBrackets = (SyncEmitStyle::BUF_ID == emitStyle);
  return usesBufIdBrackets;
}
};

struct Occurrence;
Expand Down
6 changes: 6 additions & 0 deletions include/PTO/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ def PTOGraphSyncSolver : Pass<"pto-graph-sync-solver", "func::FuncOp"> {
/*default=*/"8",
"Maximum EVENT_ID slots usable by coloring; this caps the hardware "
"available event-id budget used by the graph solver.">,
Option<"syncStyle", "sync-style", "std::string",
/*default=*/"\"set-wait\"",
"Sync emission style: 'set-wait' (default) or 'buf-id' (A5 only). "
"Under 'buf-id' the solver emits pto.get_buf/pto.rls_buf brackets "
"instead of pto.set_flag/pto.wait_flag, and skips pto.barrier "
"since A5 preserves same-pipe order in hardware.">,
];

let dependentDialects = [
Expand Down
19 changes: 19 additions & 0 deletions lib/PTO/IR/PTOSyncUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,22 @@ PIPE mlir::pto::mapSyncOpTypeToPipe(SyncOpType opType) {
bool mlir::pto::isConcreteSyncPipe(PIPE pipe) {
return pipe != PIPE::PIPE_UNASSIGNED && pipe != PIPE::PIPE_ALL;
}

// Returns one representative SyncOpType for the given pipe. Only the six
// pipes tested below have a canonical endpoint; every other PIPE value
// yields failure().
FailureOr<SyncOpType> mlir::pto::mapPipeToCanonicalSyncOpType(PIPE pipe) {
  if (pipe == PIPE::PIPE_MTE2)
    return SyncOpType::TLOAD;
  if (pipe == PIPE::PIPE_MTE3)
    return SyncOpType::TSTORE_VEC;
  if (pipe == PIPE::PIPE_FIX)
    return SyncOpType::TSTORE_ACC;
  if (pipe == PIPE::PIPE_MTE1)
    return SyncOpType::TMOV_M2L;
  if (pipe == PIPE::PIPE_V)
    return SyncOpType::TVEC;
  if (pipe == PIPE::PIPE_M)
    return SyncOpType::TMATMUL;

  // No canonical endpoint is defined for this pipe.
  return failure();
}
31 changes: 31 additions & 0 deletions lib/PTO/Transforms/GraphSyncSolver/PTOGraphSyncSolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,41 @@ struct PTOGraphSyncSolverPass
// handleBarrierConflict() drop the PIPE_V barrier that A5 hardware
// does not support.
const bool isA5 = pto::isTargetArchA5(func.getOperation());

SyncEmitStyle emitStyle;
if (syncStyle == "set-wait") {
emitStyle = SyncEmitStyle::SET_WAIT;
} else if (syncStyle == "buf-id") {
emitStyle = SyncEmitStyle::BUF_ID;
} else {
func.emitError("--graph-sync-solver-sync-style: unknown value '")
<< syncStyle << "', expected 'set-wait' or 'buf-id'";
return signalPassFailure();
}
if (emitStyle == SyncEmitStyle::BUF_ID && !isA5) {
func.emitError(
"--graph-sync-solver-sync-style=buf-id requires --pto-arch=a5; "
"get_buf/rls_buf are only available on A5");
return signalPassFailure();
}

SyncSolverOptions opts(SyncMode::INTRA_CORE_SYNC,
/*isMemBasedArch=*/!isA5,
/*isRegBasedArch=*/isA5);
opts.eventIdNumMax = eventIdNumMax;
opts.emitStyle = emitStyle;
if (emitStyle == SyncEmitStyle::BUF_ID) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Skip same-pipe barriers in buf-id mode

When buf-id is selected here, same-pipe hazards are still sent through the existing handleBarrierConflict path; that path only suppresses A5 barriers for PIPE_V/PIPE_M, so legal A5 cases such as two TLOAD/MTE2 operations on the same buffer can still produce pto.barrier even though this option and the pass docs promise that buf-id/A5 emits no barriers. This makes the new mode generate unsupported/unwanted sync for same-pipe MTE2/MTE3/FIX conflicts unless the option also disables barrier conflicts for all pipes in buf-id mode.

Useful? React with 👍 / 👎.

// Constraint 4 ("different pipe pairs should not share an id") — in
// buf-id mode we conservatively disable the cross-pipe-pair id reuse
// optimization that the set-wait flow runs by default for intra-core.
opts.reuseSyncPairToSaveEventIds = false;
// buf-id is a sequential programming model; the hw (get_cnt, rel_cnt)
// counters handle loop-carried sync natively, so the set/wait backward-
// sync hoisting / merging optimizations are unnecessary (and would
// break the in-loop bracket form).
opts.considerOuterBackwardSyncPairs = false;
opts.moveOutAndMergeBackwardSyncPairs = false;
}
auto translator = std::make_unique<IRTranslator>(func, opts);

// Trivial / empty function bodies have nothing to solve.
Expand Down
121 changes: 120 additions & 1 deletion lib/PTO/Transforms/GraphSyncSolver/SyncSolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,30 @@ bool Solver::checkIntersect(ConflictPair *conflictPair1,
if (options.isCrossCoreMode()) {
return checkSyncOpsConflicts(conflictPair1, conflictPair2);
}
if (options.isBufIdEmit()) {
// Two buf-id brackets must use different ids whenever they share any
// pipe — same-id same-pipe re-entry is illegal per the buf-id spec
// constraint 1 ("two consecutive get(P, #id) are illegal"). Unlike
// set-wait, having different dst pipes (or different src pipes) is NOT
// enough to share an id: e.g. (MTE2 -> V, id 0) and (V -> MTE3, id 0) both
// bracket the common V anchor with id 0, producing back-to-back
// `get_buf(V, #0)` before the V op, which the spec forbids. The range-based
// set-wait overlap check doesn't apply because buf-id brackets extend
// strictly around the anchor RWOperation (a single index), not across the
// [setOp.endIndex, waitOp.startIndex] gap.
auto pipes1 = std::array{conflictPair1->setCorePipeInfo.pipe,
conflictPair1->waitCorePipeInfo.pipe};
auto pipes2 = std::array{conflictPair2->setCorePipeInfo.pipe,
conflictPair2->waitCorePipeInfo.pipe};
for (auto p1 : pipes1) {
for (auto p2 : pipes2) {
if (p1 == p2) {
return true;
}
}
}
return false;
}
if (conflictPair1->setCorePipeInfo != conflictPair2->setCorePipeInfo ||
conflictPair1->waitCorePipeInfo != conflictPair2->waitCorePipeInfo) {
return false;
Expand Down Expand Up @@ -1521,7 +1545,13 @@ bool Solver::reuseConflictPair(ConflictPair *conflictPair,

std::unique_ptr<EventIdSolver> &
Solver::getEventIdSolverRef(pto::PIPE pipeSrc, pto::PIPE pipeDst) {
if (options.isCrossCoreMode()) {
if (options.isCrossCoreMode() || options.isBufIdEmit()) {
// Cross-core mode shares one event-id pool across all pipe pairs.
// Buf-id mode does the same: get_buf/rls_buf don't carry the "other"
// pipe in the opcode, so the same numeric id on two different pipe-
// pairs would alias to the same hw scoreboard and the resulting
// bracketing pattern would violate constraint 1 of the buf-id spec
// (two consecutive get(pipe, #id) before a single op is illegal).
pipeSrc = pto::PIPE::PIPE_UNASSIGNED;
pipeDst = pto::PIPE::PIPE_UNASSIGNED;
}
Expand Down Expand Up @@ -2224,20 +2254,101 @@ SyncBeforeAfterMap Solver::getBeforeAfterSyncMaps() {
conflictPairs.push_back(conflictPair.get());
}

// Buf-id mirror-image deduplication.
//
// The forward and backward memory hazards for the same (producer op,
// consumer op) pair across pipes generate two distinct ConflictPairs:
// F: setOp=A@Pa, waitOp=B@Pb (forward, intra-iter)
// B: setOp=B@Pb, waitOp=A@Pa (backward, loop-carried)
// In set-wait this is necessary — F's pair fires within an iteration,
// B's pair fires across the loop boundary. In buf-id the two pairs would
// bracket the same (op, pipe) anchor set with two different ids,
// duplicating brackets needlessly: the scoreboard counter ordering
// produced by one bracket pair already enforces both deps (per doc
// §1.2's canonical "for { load; vector }" example). Drop one of the
// mirror images so we emit a single bracket pair per anchor set.
//
// We keep the lower-id ConflictPair (deterministic, and tends to keep
// the forward pair which was usually allocated first).
llvm::DenseSet<ConflictPair *> bufIdRedundantMirror;
if (options.isBufIdEmit()) {
using AnchorEnd = std::pair<OperationBase *, pto::PIPE>;
using AnchorKey = std::pair<AnchorEnd, AnchorEnd>;
llvm::DenseMap<AnchorKey, ConflictPair *> seen;
for (auto *cp : conflictPairs) {
if (cp->isUseless || cp->replacedWithUnitFlag || cp->isBarrier())
continue;
AnchorEnd a{cp->setOp, cp->setCorePipeInfo.pipe};
AnchorEnd b{cp->waitOp, cp->waitCorePipeInfo.pipe};
if (b < a)
std::swap(a, b);
AnchorKey key{a, b};
auto [it, inserted] = seen.try_emplace(key, cp);
if (!inserted) {
// Pick the smaller id as the keeper so output is stable.
if (cp->id < it->second->id) {
bufIdRedundantMirror.insert(it->second);
it->second = cp;
} else {
bufIdRedundantMirror.insert(cp);
}
}
}
}

for (auto *conflictPair : conflictPairs) {
if (conflictPair->isUseless) {
continue;
}
if (conflictPair->replacedWithUnitFlag) {
continue;
}
if (bufIdRedundantMirror.contains(conflictPair)) {
continue;
}
assert(conflictPair->setOp != nullptr && conflictPair->waitOp != nullptr);
if (conflictPair->isBarrier()) {
// A5 hardware preserves same-pipe order, so buf-id mode drops barriers
// entirely. set-wait keeps the existing pto.barrier path.
if (options.isBufIdEmit()) {
continue;
}
auto barrierOp = std::make_unique<BarrierOp>(
conflictPair->waitOp->op, conflictPair->waitOp->parentOp,
conflictPair->waitCorePipeInfo.pipe);
LLVM_DEBUG(barrierOp->debugId = conflictPair->id);
syncMapBefore[conflictPair->waitOp].push_back(std::move(barrierOp));
} else if (options.isBufIdEmit()) {
assert(conflictPair->eventIdNode != nullptr);
auto srcPipe = conflictPair->setCorePipeInfo.pipe;
auto dstPipe = conflictPair->waitCorePipeInfo.pipe;
// Producer-side bracket on src pipe, consumer-side bracket on dst pipe.
// The shared bufId between both brackets is what enforces ordering — the
// hw scoreboard serializes same-id get/rel across pipes.
for (int64_t bufId : conflictPair->eventIdNode->getEventIds()) {
auto getProd = std::make_unique<GetBufOp>(
conflictPair->setOp->op, conflictPair->setOp->parentOp, srcPipe,
bufId);
auto rlsProd = std::make_unique<RlsBufOp>(
conflictPair->setOp->op, conflictPair->setOp->parentOp, srcPipe,
bufId);
auto getCons = std::make_unique<GetBufOp>(
conflictPair->waitOp->op, conflictPair->waitOp->parentOp, dstPipe,
bufId);
auto rlsCons = std::make_unique<RlsBufOp>(
conflictPair->waitOp->op, conflictPair->waitOp->parentOp, dstPipe,
bufId);
LLVM_DEBUG({
getProd->debugId = conflictPair->id;
rlsProd->debugId = conflictPair->id;
getCons->debugId = conflictPair->id;
rlsCons->debugId = conflictPair->id;
});
syncMapBefore[conflictPair->setOp].push_back(std::move(getProd));
syncMapAfter[conflictPair->setOp].push_back(std::move(rlsProd));
syncMapBefore[conflictPair->waitOp].push_back(std::move(getCons));
syncMapAfter[conflictPair->waitOp].push_back(std::move(rlsCons));
}
} else {
assert(conflictPair->eventIdNode != nullptr);
auto setOp = std::make_unique<SetFlagOp>(
Expand Down Expand Up @@ -2268,6 +2379,14 @@ SyncBeforeAfterMap Solver::getBeforeAfterSyncMaps() {
}
}

// In buf-id mode the producer/consumer brackets sit at their natural
// anchors and the hw scoreboard counters handle loop-carried (backward)
// sync without any out-of-loop compensation. Skip the backward-sync hoist
// pipeline that exists to manage set/wait strict pairing.
if (options.isBufIdEmit()) {
return std::make_pair(std::move(syncMapBefore), std::move(syncMapAfter));
}

collectBackwardSyncEventIds();
mergeBackwardSyncPairs(syncMapBefore, syncMapAfter);

Expand Down
Loading
Loading