Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
701 changes: 701 additions & 0 deletions docs/designs/ptoas-graph-sync-solver-buf-id-design.md

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions include/PTO/IR/PTOSyncUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ PIPE mapSyncOpTypeToPipe(SyncOpType opType);
/// True if the pipe is a concrete endpoint pipe (not PIPE_ALL/UNASSIGNED).
bool isConcreteSyncPipe(PIPE pipe);

/// Pick a canonical SyncOpType endpoint for a concrete PIPE. This is the
/// inverse of mapSyncOpTypeToPipe up to the fact that mapSyncOpTypeToPipe is
/// many-to-one: e.g. both TVEC and TMOV_M2V map to PIPE_V. The canonical
/// SyncOpType returned here is sufficient for the EmitC pattern matchers,
/// which only care that the high-level attr maps back to the same concrete
/// PIPE. Returns failure() for non-concrete pipes (PIPE_ALL/UNASSIGNED).
FailureOr<SyncOpType> mapPipeToCanonicalSyncOpType(PIPE pipe);

} // namespace pto
} // namespace mlir

Expand Down
42 changes: 37 additions & 5 deletions include/PTO/Transforms/GraphSyncSolver/SyncSolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ class Solver {
// pairs.
llvm::DenseMap<
std::pair<syncsolver::RWOperation *, syncsolver::RWOperation *>,
llvm::SmallVector<std::tuple<CorePipeInfo, CorePipeInfo>>>
llvm::SmallVector<std::tuple<CorePipeInfo, CorePipeInfo, mlir::Value>>>
checkMemoryConflictsMem;

// Set of pipe pairs that were forced to barrier-all (no event ids available).
Expand Down Expand Up @@ -221,9 +221,20 @@ class Solver {
RWOperation *rwOp2);

// Graph-based conflict checking and memory conflict detection helpers.
// `conflictBuffer` identifies the underlying memory the new pair would
// synchronize on. In buf-id mode it is used to filter out existing pairs
// that operate on a different buffer, because same-pipe transitive
// coverage via those pairs does not hold for independent buf-id counters.
// `candOp1` and `candOp2` are the candidate pair's anchor ops; they are
// used to drop existing pairs that live in a mutually exclusive scf.if
// branch (a different runtime path cannot transitively cover the
// candidate). Passing a null `conflictBuffer`, or nullptr for
// `candOp1`/`candOp2`, disables the corresponding filter and preserves
// the legacy set/wait behavior.
bool checkGraphConflict(
Occurrence *occ1, Occurrence *occ2, CorePipeInfo corePipeSrc,
CorePipeInfo corePipeDst, EventIdInfo eventIdInfo,
mlir::Value conflictBuffer = nullptr,
OperationBase *candOp1 = nullptr, OperationBase *candOp2 = nullptr,
std::optional<int> startIndex = {}, std::optional<int> endIndex = {},
const llvm::SmallVector<ConflictPair *> &extraConflictPairs = {},
const llvm::SmallVector<ConflictPair *> &ignoreConflictPairs = {});
Expand All @@ -242,7 +253,13 @@ class Solver {
std::optional<int64_t> lcmLen = {},
std::optional<int64_t> eventIdNum = {});

llvm::SmallVector<std::tuple<CorePipeInfo, CorePipeInfo>>
// Returns one entry per (corePipeSrc, corePipeDst, conflictingBuffer)
// triple. The buffer Value identifies the SSA value of the underlying
// memory that triggered the conflict; multiple buffers conflicting at
// the same pipe-pair yield multiple entries (one ConflictPair per
// buffer downstream). Buffer may be null when the conflict comes from
// a MemInfo lacking a backing SSA value.
llvm::SmallVector<std::tuple<CorePipeInfo, CorePipeInfo, mlir::Value>>
checkMemoryConflicts(RWOperation *rwOp1, RWOperation *rwOp2);

bool checkMemoryConflictBetweenOccExclusive(
Expand Down Expand Up @@ -271,6 +288,15 @@ class Solver {
// pipes/events).
bool checkIntersect(ConflictPair *conflictPair1, ConflictPair *conflictPair2);

// Returns true when two pairs of (op-anchor) endpoints sit in *mutually
// exclusive* branches of any common scf.if ancestor. Mutex pairs cannot
// execute together at runtime, so they don't conflict in coloring (can
// share an id) and they cannot transitively cover each other in
// checkGraphConflict (different runtime paths). `aOp1`/`aOp2` describe
// one pair's endpoints, `bOp1`/`bOp2` the other's.
bool opsMutuallyExclusive(OperationBase *aOp1, OperationBase *aOp2,
OperationBase *bOp1, OperationBase *bOp2);

// Event-id allocation and reuse helpers.
std::vector<ConflictPair *>
getIntersectingConflictPairs(ConflictPair *conflictPair);
Expand Down Expand Up @@ -336,18 +362,24 @@ class Solver {
bool checkReuseMultiBufferFlagId(ConflictPair *conflictPair);

// Primary handler invoked to register/record a found conflict.
// `conflictBuffer` is the SSA Value of the underlying memory shared
// between op1 and op2 (used by buf-id mode to keep ConflictPairs
// distinguishable per buffer; null in legacy callers).
void handleConflict(Occurrence *occ1, Occurrence *occ2, RWOperation *rwOp1,
RWOperation *rwOp2, CorePipeInfo corePipeSrc,
CorePipeInfo corePipeDst, EventIdInfo eventIdInfo,
bool isUseless);
bool isUseless,
mlir::Value conflictBuffer = nullptr);

void handleBarrierConflict(Occurrence *occ1, Occurrence *occ2,
CorePipeInfo corePipeSrc, CorePipeInfo corePipeDst,
bool isUseless);
bool isUseless,
mlir::Value conflictBuffer = nullptr);

void handleSetWaitConflict(Occurrence *occ1, Occurrence *occ2,
CorePipeInfo corePipeSrc, CorePipeInfo corePipeDst,
EventIdInfo eventIdInfo, bool isUseless);
EventIdInfo eventIdInfo, bool isUseless,
mlir::Value conflictBuffer = nullptr);

void handleUnitFlagConflict(Occurrence *occ1, Occurrence *occ2,
CorePipeInfo corePipeSrc,
Expand Down
47 changes: 47 additions & 0 deletions include/PTO/Transforms/GraphSyncSolver/SyncSolverIR.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ enum struct OpType {
SET_FLAG_OP,
WAIT_FLAG_OP,
SW_FLAG_OP_END,
BUF_OP,
GET_BUF_OP,
RLS_BUF_OP,
BUF_OP_END,
SYNC_OP_END,
RW_OPERATION,
MMAD_OPERATION,
Expand Down Expand Up @@ -503,6 +507,49 @@ class BarrierOp : public SyncOp {
std::string str(int indent, bool recursive) const override;
};

// A5-only buffer-id bracket op (get_buf / rls_buf). Unlike SetWaitOp which
// names both src and dst pipes, a BufOp lives on a single concrete pipe — the
// shared bufId on producer and consumer brackets is what enforces ordering.
class BufOp : public SyncOp {
public:
pto::PIPE pipe{pto::PIPE::PIPE_UNASSIGNED};
int64_t bufId{-1};

BufOp(OpType opType, Operation *op, OperationBase *parentOp, pto::PIPE pipe,
int64_t bufId)
: SyncOp(opType, op, parentOp), pipe(pipe), bufId(bufId) {}

static bool classof(const OperationBase *e) {
return e->opType >= OpType::BUF_OP && e->opType < OpType::BUF_OP_END;
}
};

class GetBufOp : public BufOp {
public:
GetBufOp(Operation *op, OperationBase *parentOp, pto::PIPE pipe,
int64_t bufId)
: BufOp(OpType::GET_BUF_OP, op, parentOp, pipe, bufId) {}

static bool classof(const OperationBase *e) {
return e->opType == OpType::GET_BUF_OP;
}

std::string str(int indent, bool recursive) const override;
};

class RlsBufOp : public BufOp {
public:
RlsBufOp(Operation *op, OperationBase *parentOp, pto::PIPE pipe,
int64_t bufId)
: BufOp(OpType::RLS_BUF_OP, op, parentOp, pipe, bufId) {}

static bool classof(const OperationBase *e) {
return e->opType == OpType::RLS_BUF_OP;
}

std::string str(int indent, bool recursive) const override;
};

// Bool comparator for sync ops ordering (used for containers).
bool operator<(const SyncOp &op1, const SyncOp &op2);
} // namespace mlir::pto::syncsolver
Expand Down
22 changes: 22 additions & 0 deletions include/PTO/Transforms/GraphSyncSolver/Utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,14 @@ enum SyncMode {
TEST_CROSS_CORE_MODE,
};

// Emission shape for the sync solver output.
enum class SyncEmitStyle {
// Legacy pto.set_flag / pto.wait_flag pairing.
SET_WAIT,
// A5-only buffer-id bracketing model (pto.get_buf / pto.rls_buf around
// each anchor op).
BUF_ID,
};

struct SyncSolverOptions {
// Synchronization mode.
const SyncMode syncMode;
Expand All @@ -130,6 +138,9 @@ struct SyncSolverOptions {
// Architecture is register based (A5).
const bool isRegBasedArch;

// Sync emission style. BUF_ID requires A5 (isRegBasedArch).
SyncEmitStyle emitStyle{SyncEmitStyle::SET_WAIT};

// Decompose MMAD L1 ops into simpler ops for better sync handling.
bool decomposeMmadl1Op{false};

Expand Down Expand Up @@ -184,6 +195,8 @@ struct SyncSolverOptions {
return syncMode == SyncMode::TEST_INTRA_CORE_MODE ||
syncMode == SyncMode::TEST_CROSS_CORE_MODE;
}

// True when emitStyle selects SyncEmitStyle::BUF_ID.
bool isBufIdEmit() const {
  return SyncEmitStyle::BUF_ID == emitStyle;
}
};

struct Occurrence;
Expand Down Expand Up @@ -319,6 +332,14 @@ struct ConflictPair {
EventIdInfo eventIdInfo;
EventIdNode *eventIdNode{nullptr};

// The underlying memory buffer that this sync edge protects (the SSA Value
// of the shared tile / memref / pointer that op1 and op2 both touch).
// Currently used by buf-id mode to make checkGraphConflict's transitive
// pruning buffer-aware: a covering pair only counts if it operates on the
// same buffer. Null for barriers and for pairs synthesized without a
// specific buffer (e.g. merged backward-sync compensation).
mlir::Value conflictBuffer{nullptr};

ConflictPair(RWOperation *op1, RWOperation *op2, OperationBase *setOp,
OperationBase *waitOp, Occurrence *setOcc, Occurrence *waitOcc,
CorePipeInfo setCorePipeInfo, CorePipeInfo waitCorePipeInfo,
Expand Down Expand Up @@ -363,6 +384,7 @@ struct ConflictPair {
clonedConflictPair->backwardSyncLoopOcc = backwardSyncLoopOcc;
clonedConflictPair->eventIdInfo = eventIdInfo;
clonedConflictPair->eventIdNode = eventIdNode;
clonedConflictPair->conflictBuffer = conflictBuffer;
return clonedConflictPair;
}

Expand Down
6 changes: 6 additions & 0 deletions include/PTO/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ def PTOGraphSyncSolver : Pass<"pto-graph-sync-solver", "func::FuncOp"> {
/*default=*/"8",
"Maximum EVENT_ID slots usable by coloring; this caps the hardware "
"available event-id budget used by the graph solver.">,
Option<"syncStyle", "sync-style", "std::string",
/*default=*/"\"set-wait\"",
"Sync emission style: 'set-wait' (default) or 'buf-id' (A5 only). "
"Under 'buf-id' the solver emits pto.get_buf/pto.rls_buf brackets "
"instead of pto.set_flag/pto.wait_flag, and skips pto.barrier "
"since A5 preserves same-pipe order in hardware.">,
];

let dependentDialects = [
Expand Down
19 changes: 19 additions & 0 deletions lib/PTO/IR/PTOSyncUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,22 @@ PIPE mlir::pto::mapSyncOpTypeToPipe(SyncOpType opType) {
bool mlir::pto::isConcreteSyncPipe(PIPE pipe) {
  // PIPE_UNASSIGNED and PIPE_ALL are the only non-concrete values; every
  // other pipe is reported as concrete.
  const bool isNonConcrete =
      pipe == PIPE::PIPE_UNASSIGNED || pipe == PIPE::PIPE_ALL;
  return !isNonConcrete;
}

FailureOr<SyncOpType> mlir::pto::mapPipeToCanonicalSyncOpType(PIPE pipe) {
  // One representative SyncOpType per supported pipe. Any pipe that has no
  // row in this table yields failure().
  struct CanonicalEndpoint {
    PIPE pipe;
    SyncOpType opType;
  };
  static constexpr CanonicalEndpoint kCanonicalEndpoints[] = {
      {PIPE::PIPE_MTE2, SyncOpType::TLOAD},
      {PIPE::PIPE_MTE3, SyncOpType::TSTORE_VEC},
      {PIPE::PIPE_FIX, SyncOpType::TSTORE_ACC},
      {PIPE::PIPE_MTE1, SyncOpType::TMOV_M2L},
      {PIPE::PIPE_V, SyncOpType::TVEC},
      {PIPE::PIPE_M, SyncOpType::TMATMUL},
  };

  for (const CanonicalEndpoint &endpoint : kCanonicalEndpoints) {
    if (endpoint.pipe == pipe)
      return endpoint.opType;
  }
  return failure();
}
31 changes: 31 additions & 0 deletions lib/PTO/Transforms/GraphSyncSolver/PTOGraphSyncSolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,41 @@ struct PTOGraphSyncSolverPass
// handleBarrierConflict() drop the PIPE_V barrier that A5 hardware
// does not support.
const bool isA5 = pto::isTargetArchA5(func.getOperation());

SyncEmitStyle emitStyle;
if (syncStyle == "set-wait") {
emitStyle = SyncEmitStyle::SET_WAIT;
} else if (syncStyle == "buf-id") {
emitStyle = SyncEmitStyle::BUF_ID;
} else {
func.emitError("--graph-sync-solver-sync-style: unknown value '")
<< syncStyle << "', expected 'set-wait' or 'buf-id'";
return signalPassFailure();
}
if (emitStyle == SyncEmitStyle::BUF_ID && !isA5) {
func.emitError(
"--graph-sync-solver-sync-style=buf-id requires --pto-arch=a5; "
"get_buf/rls_buf are only available on A5");
return signalPassFailure();
}

SyncSolverOptions opts(SyncMode::INTRA_CORE_SYNC,
/*isMemBasedArch=*/!isA5,
/*isRegBasedArch=*/isA5);
opts.eventIdNumMax = eventIdNumMax;
opts.emitStyle = emitStyle;
if (emitStyle == SyncEmitStyle::BUF_ID) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Skip same-pipe barriers in buf-id mode

When buf-id is selected here, same-pipe hazards are still sent through the existing handleBarrierConflict path. That path only suppresses A5 barriers for PIPE_V/PIPE_M, so legal A5 cases such as two TLOAD/MTE2 operations on the same buffer can still produce pto.barrier, even though this option and the pass docs promise that buf-id/A5 emits no barriers. As a result, the new mode generates unsupported/unwanted sync for same-pipe MTE2/MTE3/FIX conflicts unless the option also disables barrier conflicts for all pipes in buf-id mode.

Useful? React with 👍 / 👎.

// Constraint 4 ("different pipe pairs should not share an id") — in
// buf-id mode we conservatively disable the cross-pipe-pair id reuse
// optimization that the set-wait flow runs by default for intra-core.
opts.reuseSyncPairToSaveEventIds = false;
// buf-id is a sequential programming model; the hw (get_cnt, rel_cnt)
// counters handle loop-carried sync natively, so the set/wait backward-
// sync hoisting / merging optimizations are unnecessary (and would
// break the in-loop bracket form).
opts.considerOuterBackwardSyncPairs = false;
opts.moveOutAndMergeBackwardSyncPairs = false;
}
auto translator = std::make_unique<IRTranslator>(func, opts);

// Trivial / empty function bodies have nothing to solve.
Expand Down
Loading
Loading