Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions quadrants/program/compile_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ CompileConfig::CompileConfig() {
max_vector_width = 8;
debug = false;
cfg_optimization = true;
cfg_optimization_per_task = true;
check_out_of_bound = false;
serial_schedule = false;
simplify_before_lower_access = true;
Expand Down
9 changes: 9 additions & 0 deletions quadrants/program/compile_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ struct CompileConfig {
Arch arch;
bool debug;
bool cfg_optimization;
// When true (default), cfg_optimization runs only AFTER offloading and is scoped to each offloaded task
// independently (a separate control-flow graph per task) instead of one whole-kernel graph spanning all tasks;
// the expensive whole-kernel cfg pass is skipped in the pre-offload compile phases. Each offloaded task is a
// separate device launch, so cross-task store-to-load forwarding of registers is impossible anyway, and global
// memory is treated conservatively (live-in and live-out of every task) by the existing CFG boundary seeding
// -- so this is semantics-preserving (cfg_optimization is an optimization, not a correctness pass) while keeping
// the super-linear reaching-definition / forwarding analyses bounded by the size of each individual task rather
// than by the combined monolithic kernel IR. Set false to restore the whole-kernel pre+post-offload behaviour.
bool cfg_optimization_per_task{true};
bool check_out_of_bound;
bool validate_autodiff;
int simd_width;
Expand Down
1 change: 1 addition & 0 deletions quadrants/python/export_lang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ void export_lang(py::module &m) {
.def_readwrite("print_ir_dbg_info", &CompileConfig::print_ir_dbg_info)
.def_readwrite("debug", &CompileConfig::debug)
.def_readwrite("cfg_optimization", &CompileConfig::cfg_optimization)
.def_readwrite("cfg_optimization_per_task", &CompileConfig::cfg_optimization_per_task)
.def_readwrite("check_out_of_bound", &CompileConfig::check_out_of_bound)
.def_readwrite("print_accessor_ir", &CompileConfig::print_accessor_ir)
.def_readwrite("use_llvm", &CompileConfig::use_llvm)
Expand Down
118 changes: 116 additions & 2 deletions quadrants/transforms/cfg_optimization.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "quadrants/ir/ir.h"
#include "quadrants/ir/statements.h"
#include "quadrants/ir/control_flow_graph.h"
#include "quadrants/ir/transforms.h"
#include "quadrants/ir/analysis.h"
Expand All @@ -8,6 +9,76 @@
namespace quadrants::lang {

namespace irpass {

namespace {

// Collect the top-level offloaded tasks of |root| iff |root| is an already-offloaded kernel body, i.e. a Block
// whose statements are all OffloadedStmt. Returns an empty vector otherwise (pre-offload IR, function bodies,
// non-Block roots). This is what lets the caller tell "post-offload" (run per-task cfg) from "pre-offload /
// other" (ditch cfg, under cfg_optimization_per_task), since "an offloaded task" only exists post-offload.
std::vector<OffloadedStmt *> collect_offloaded_tasks(IRNode *root) {
  auto *body = root->cast<Block>();
  if (body == nullptr) {
    // A non-Block root has no top-level statement list to inspect.
    return {};
  }
  // Single pass: gather tasks as we go and bail out on the first statement that is not an OffloadedStmt. An
  // empty block simply falls out of the loop with nothing collected, which callers read as "no tasks".
  std::vector<OffloadedStmt *> offloads;
  for (auto &candidate : body->statements) {
    if (!candidate->is<OffloadedStmt>()) {
      // Mixed content means this is not a pure offloaded kernel body; discard anything gathered so far.
      return {};
    }
    offloads.push_back(candidate->as<OffloadedStmt>());
  }
  return offloads;
}

// Build and optimize a control-flow graph for a SINGLE offloaded task, scoped to that task alone.
//
// The task is temporarily moved into a throwaway wrapper block and run through the normal Block ->
// OffloadedStmt CFG construction, then moved back, leaving the IR shape unchanged. Building through a wrapper
// (instead of stitching together per-sub-block CFGs) is what makes this correct: the resulting CFG is
// byte-for-byte the slice that the whole-kernel CFG would build for this one task -- including the offloaded
// for-body's implicit-loop `continue` edges (which are wired by visit(OffloadedStmt), not by visit(Block)), the
// prologue/body/epilogue chaining, and the body's is_parallel_executed flag. Optimizing each sub-block in
// isolation would drop the `continue` loop-back edges and wrongly dead-store-eliminate a global store that
// precedes a `continue` (regression caught by test_cfg_continue).
//
// Scoping the analyses to one task is semantics-preserving because each offloaded task is a separate device
// launch and the existing CFG boundary seeding is conservative across the launch boundary:
// reaching_definition_analysis seeds the start node with all global pointers ("may already hold data") and
// live_variable_analysis seeds the final node with all global store destinations ("may be read later"). With
// the CFG spanning only one task, every global address -- fields, external tensors, and the global-temporary
// buffer that carries scalars between tasks -- is therefore treated as live-in and live-out of the task, so no
// store a sibling task may read is eliminated and no value is forwarded across a task (device-launch) boundary.
bool optimize_one_task(Block *parent,
                       OffloadedStmt *off,
                       bool after_lower_access,
                       bool autodiff_enabled,
                       const std::optional<ControlFlowGraph::LiveVarAnalysisConfig> &lva_config_opt) {
  // Record where the task sits so it can be re-inserted at exactly the same index afterwards.
  const int slot = parent->locate(off);
  QD_ASSERT(slot != -1);

  // Temporarily re-home the task in a scratch block so build_cfg sees a Block containing just this one task.
  Block scratch;
  scratch.insert(parent->extract(off));

  bool changed = false;
  {
    // The graph refers to |scratch| and to the task's sub-blocks by raw pointer, so it is confined to this
    // inner scope and destroyed before the task is handed back to |parent|.
    auto task_cfg = analysis::build_cfg(&scratch);
    task_cfg->simplify_graph();
    // Both passes always run, forwarding first; the task counts as modified if either reports a change.
    const bool forwarded = task_cfg->store_to_load_forwarding(after_lower_access, autodiff_enabled);
    const bool eliminated = task_cfg->dead_store_elimination(after_lower_access, lva_config_opt);
    changed = forwarded || eliminated;
  }

  // Put the task back before |scratch| goes out of scope, leaving the parent's statement order unchanged.
  parent->insert(scratch.extract(off), slot);
  return changed;
}

} // namespace

bool cfg_optimization(const CompileConfig &config,
IRNode *root,
bool after_lower_access,
Expand All @@ -17,10 +88,53 @@ bool cfg_optimization(const CompileConfig &config,
const std::string &kernel_name,
const std::string &phase) {
QD_AUTO_PROF;
auto cfg = analysis::build_cfg(root);

const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data());
bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1";
const bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1";

// Per-offloaded-task scoping. Once the kernel is offloaded we optimize each task's CFG independently; and we
// deliberately DITCH the expensive whole-kernel cfg_optimization in the pre-offload phase, relying on the
// post-offload per-task cfg below to do the store-to-load forwarding + dead-store elimination once tasks
// exist. The expensive (super-linear) reaching-definition / forwarding analyses otherwise run on the monolithic
// pre-offload kernel IR -- where there are no tasks to scope to -- and dominate compile time. cfg_optimization
// is an optimization, not a correctness pass, so dropping it pre-offload is safe; the only thing lost is
// cross-task forwarding/DSE on the monolithic IR, which is invalid across separate device launches anyway.
// QD_DUMP_CFG forces the whole-kernel path so the full graph can still be dumped for debugging.
if (config.cfg_optimization_per_task && !dump_cfg) {
auto tasks = collect_offloaded_tasks(root);
if (!tasks.empty()) {
// Post-offload: per-task store-to-load forwarding + dead-store elimination (skipped for the real-matrix
// path, matching the whole-kernel path which runs no analyses there).
bool result_modified = false;
if (!real_matrix_enabled) {
auto *block = root->as<Block>();
for (auto *off : tasks) {
result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt);
}
}
// TODO: implement cfg->dead_instruction_elimination()
die(root); // remove unused allocas across the whole kernel
return result_modified;
}
// No offloaded tasks yet. Within compile_to_offloads these are the pre-offload full_simplify calls on the
// monolithic kernel IR (the phases below, all *before* irpass::offload): their whole-kernel cfg is the
// (super-linear) reaching-definition / store-to-load analysis that dominates compile time, and it is
// redundant because the post-offload per-task cfg ("simplify_III" onward) redoes the intra-task
// store-to-load forwarding + dead-store elimination once tasks exist. So for exactly those phases we ditch
// cfg, keeping only the cheap dead-alloca cleanup. For ANY other caller of full_simplify on non-offloaded
// IR (unit tests, standalone blocks / function bodies that are never offloaded), we must still run the
// whole-kernel cfg below, or its forwarding/DSE would be silently lost -- so we fall through.
const bool pre_offload_compile_phase =
phase == "simplify_I" || phase == "simplify_II" || phase == "pre_autodiff" || phase == "post_autodiff";
if (pre_offload_compile_phase) {
die(root);
return false;
}
// else: fall through to the whole-kernel cfg path below.
}

auto cfg = analysis::build_cfg(root);

if (dump_cfg) {
std::string suffix = phase.empty() ? "_before_cfg_opt" : ("_" + phase + "_before_cfg_opt");
cfg->dump_graph_to_file(config, kernel_name, suffix);
Expand Down
Loading