From b68642cef91ecd29b5766a2a59c5b1cc26860b35 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 7 Jun 2026 17:33:45 -0700 Subject: [PATCH 1/4] Scope cfg_optimization per offloaded task (post-offload) Once a kernel is offloaded, cfg_optimization now builds a separate control-flow graph per offloaded task (per sub-block, with the correct parallel-execution flag) instead of one whole-kernel CFG spanning all tasks. Store-to-load forwarding and dead-store elimination then run scoped to each task. This is semantics-preserving: each offloaded task is a separate device launch, so cross-task register forwarding is impossible anyway, and global memory (fields, external tensors, and the global-temporary buffer that carries values between tasks) is treated conservatively as live-in/live-out of every task by the existing CFG boundary seeding (reaching-def start-node seed + live-var final-node seed). The win is compile time: the reaching-definition / live-variable dataflow becomes ~linear in total IR rather than super-linear in the combined whole-kernel IR, which blows up for kernels that pack many stages into one @qd.kernel. Gated by CompileConfig::cfg_optimization_per_task (default true; env QD_CFG_OPTIMIZATION_PER_TASK / qd.init kwarg). Pre-offload IR, function bodies, and the real-matrix path fall back to the whole-kernel CFG unchanged. 
--- quadrants/analysis/build_cfg.cpp | 7 +- quadrants/ir/analysis.h | 6 +- quadrants/program/compile_config.cpp | 1 + quadrants/program/compile_config.h | 8 +++ quadrants/python/export_lang.cpp | 1 + quadrants/transforms/cfg_optimization.cpp | 88 +++++++++++++++++++++++ 6 files changed, 107 insertions(+), 4 deletions(-) diff --git a/quadrants/analysis/build_cfg.cpp b/quadrants/analysis/build_cfg.cpp index fb906b7b96..b4f4a38f12 100644 --- a/quadrants/analysis/build_cfg.cpp +++ b/quadrants/analysis/build_cfg.cpp @@ -420,8 +420,9 @@ class CFGBuilder : public IRVisitor { current_stmt_id_ = backup_stmt_id; } - static std::unique_ptr run(IRNode *root) { + static std::unique_ptr run(IRNode *root, bool root_in_parallel_for) { CFGBuilder builder; + builder.in_parallel_for_ = root_in_parallel_for; root->accept(&builder); if (!builder.graph_->nodes[builder.graph_->final_node]->empty()) { // Make the final node empty (by adding an empty final node). @@ -448,8 +449,8 @@ class CFGBuilder : public IRVisitor { }; namespace irpass::analysis { -std::unique_ptr build_cfg(IRNode *root) { - return CFGBuilder::run(root); +std::unique_ptr build_cfg(IRNode *root, bool root_in_parallel_for) { + return CFGBuilder::run(root, root_in_parallel_for); } } // namespace irpass::analysis diff --git a/quadrants/ir/analysis.h b/quadrants/ir/analysis.h index 4fbff436e5..63bd5d50a0 100644 --- a/quadrants/ir/analysis.h +++ b/quadrants/ir/analysis.h @@ -72,7 +72,11 @@ namespace analysis { */ AliasResult alias_analysis(Stmt *var1, Stmt *var2); -std::unique_ptr build_cfg(IRNode *root); +// |root_in_parallel_for| seeds the builder's parallel-execution context. It is true when |root| is the body +// block of an offloaded range_for/struct_for/mesh_for task built in isolation (see per-task cfg_optimization), +// so that nodes inside it are correctly flagged is_parallel_executed even though they are not visited through +// the enclosing OffloadedStmt. 
Defaults to false (whole-kernel / serial builds), preserving prior behaviour. +std::unique_ptr build_cfg(IRNode *root, bool root_in_parallel_for = false); void check_fields_registered(IRNode *root); std::unique_ptr clone(IRNode *root); std::unique_ptr clone(Stmt *root); diff --git a/quadrants/program/compile_config.cpp b/quadrants/program/compile_config.cpp index 5034fd1822..71e44b378d 100644 --- a/quadrants/program/compile_config.cpp +++ b/quadrants/program/compile_config.cpp @@ -21,6 +21,7 @@ CompileConfig::CompileConfig() { max_vector_width = 8; debug = false; cfg_optimization = true; + cfg_optimization_per_task = true; check_out_of_bound = false; serial_schedule = false; simplify_before_lower_access = true; diff --git a/quadrants/program/compile_config.h b/quadrants/program/compile_config.h index 92c09a5fd8..2001a93331 100644 --- a/quadrants/program/compile_config.h +++ b/quadrants/program/compile_config.h @@ -9,6 +9,14 @@ struct CompileConfig { Arch arch; bool debug; bool cfg_optimization; + // When true (default), cfg_optimization scopes its store-to-load forwarding and dead-store elimination to + // each offloaded task independently once the kernel has been offloaded, instead of running one whole-kernel + // control-flow graph spanning all tasks. Each offloaded task is a separate device launch, so cross-task + // store-to-load forwarding of registers is impossible anyway, and global memory is treated conservatively + // (live-in and live-out of every task) by the existing CFG boundary seeding -- so the per-task scoping is + // semantics-preserving while making the dataflow analyses ~linear in total IR instead of super-linear in the + // combined whole-kernel IR. Set false to restore the whole-kernel behaviour. 
+ bool cfg_optimization_per_task{true}; bool check_out_of_bound; bool validate_autodiff; int simd_width; diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp index a440e2a798..38934dcdde 100644 --- a/quadrants/python/export_lang.cpp +++ b/quadrants/python/export_lang.cpp @@ -164,6 +164,7 @@ void export_lang(py::module &m) { .def_readwrite("print_ir_dbg_info", &CompileConfig::print_ir_dbg_info) .def_readwrite("debug", &CompileConfig::debug) .def_readwrite("cfg_optimization", &CompileConfig::cfg_optimization) + .def_readwrite("cfg_optimization_per_task", &CompileConfig::cfg_optimization_per_task) .def_readwrite("check_out_of_bound", &CompileConfig::check_out_of_bound) .def_readwrite("print_accessor_ir", &CompileConfig::print_accessor_ir) .def_readwrite("use_llvm", &CompileConfig::use_llvm) diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp index 66b4a14067..e47d249f39 100644 --- a/quadrants/transforms/cfg_optimization.cpp +++ b/quadrants/transforms/cfg_optimization.cpp @@ -1,4 +1,5 @@ #include "quadrants/ir/ir.h" +#include "quadrants/ir/statements.h" #include "quadrants/ir/control_flow_graph.h" #include "quadrants/ir/transforms.h" #include "quadrants/ir/analysis.h" @@ -8,6 +9,59 @@ namespace quadrants::lang { namespace irpass { + +namespace { + +// Collect the top-level offloaded tasks of |root| iff |root| is an already-offloaded kernel body, i.e. a Block +// whose statements are all OffloadedStmt. Returns an empty vector otherwise (pre-offload IR, function bodies, +// non-Block roots), in which case the caller falls back to the whole-kernel CFG. This is what makes the +// per-task path activate only post-offload, where the notion of "an offloaded task" exists. 
+std::vector collect_offloaded_tasks(IRNode *root) { + std::vector tasks; + auto *block = root->cast(); + if (block == nullptr || block->statements.empty()) { + return tasks; + } + for (auto &stmt : block->statements) { + if (!stmt->is()) { + return {}; // not a pure offloaded kernel body -> whole-kernel path + } + } + for (auto &stmt : block->statements) { + tasks.push_back(stmt->as()); + } + return tasks; +} + +// Run store-to-load forwarding + dead-store elimination over a single offloaded task's sub-block, scoped to +// that block alone. Correctness relies on the existing CFG boundary seeding: reaching_definition_analysis seeds +// the start node with all global pointers ("may contain data before this kernel") and live_variable_analysis +// seeds the final node with all global store destinations ("may be loaded after this kernel"). Because the CFG +// here spans only one task, every global address (fields, external tensors, global temporaries that carry data +// between tasks) is therefore conservatively treated as live-in and live-out of the task -- so no store that a +// sibling task may read is ever eliminated, and no value is forwarded across a task (device-launch) boundary. 
+bool optimize_offload_block(Block *block, + bool in_parallel_for, + bool after_lower_access, + bool autodiff_enabled, + const std::optional &lva_config_opt) { + if (block == nullptr || block->statements.empty()) { + return false; + } + auto cfg = analysis::build_cfg(block, in_parallel_for); + cfg->simplify_graph(); + bool modified = false; + if (cfg->store_to_load_forwarding(after_lower_access, autodiff_enabled)) { + modified = true; + } + if (cfg->dead_store_elimination(after_lower_access, lva_config_opt)) { + modified = true; + } + return modified; +} + +} // namespace + bool cfg_optimization(const CompileConfig &config, IRNode *root, bool after_lower_access, @@ -17,6 +71,40 @@ bool cfg_optimization(const CompileConfig &config, const std::string &kernel_name, const std::string &phase) { QD_AUTO_PROF; + + // Per-offloaded-task scoping: once the kernel is offloaded, optimize each task's CFG independently instead of + // building one whole-kernel CFG across all tasks. This keeps the (super-linear) dataflow analyses small per + // task without changing semantics -- see the comment on CompileConfig::cfg_optimization_per_task and + // optimize_offload_block above. Disabled for the real-matrix path (which skips the analyses entirely) and + // pre-offload IR (no tasks yet), both of which fall through to the whole-kernel path below. + if (config.cfg_optimization_per_task && !real_matrix_enabled) { + auto tasks = collect_offloaded_tasks(root); + if (!tasks.empty()) { + bool result_modified = false; + for (auto *off : tasks) { + const bool body_parallel = off->task_type == OffloadedStmt::TaskType::range_for || + off->task_type == OffloadedStmt::TaskType::struct_for || + off->task_type == OffloadedStmt::TaskType::mesh_for; + // Prologues/epilogues run serially; only the for-task body is parallel-executed. 
+ result_modified |= optimize_offload_block(off->tls_prologue.get(), false, after_lower_access, + autodiff_enabled, lva_config_opt); + result_modified |= optimize_offload_block(off->mesh_prologue.get(), false, after_lower_access, + autodiff_enabled, lva_config_opt); + result_modified |= optimize_offload_block(off->bls_prologue.get(), false, after_lower_access, + autodiff_enabled, lva_config_opt); + result_modified |= optimize_offload_block(off->body.get(), body_parallel, after_lower_access, + autodiff_enabled, lva_config_opt); + result_modified |= optimize_offload_block(off->bls_epilogue.get(), false, after_lower_access, + autodiff_enabled, lva_config_opt); + result_modified |= optimize_offload_block(off->tls_epilogue.get(), false, after_lower_access, + autodiff_enabled, lva_config_opt); + } + // TODO: implement cfg->dead_instruction_elimination() + die(root); // remove unused allocas across the whole kernel + return result_modified; + } + } + auto cfg = analysis::build_cfg(root); const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data()); From f656cb664c78f66e6608db893f8911adf2a2e5c2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 7 Jun 2026 18:02:36 -0700 Subject: [PATCH 2/4] Fix per-task cfg_optimization to reproduce exact per-task CFG The first cut built a separate CFG per offloaded sub-block (prologue/body/ epilogue). That dropped the offloaded for-body's implicit-loop `continue` edges -- which are wired by visit(OffloadedStmt), not visit(Block) -- and so wrongly dead-store-eliminated a global store preceding a `continue` (test_cfg_continue regressed). Instead, build one CFG per offloaded task by temporarily moving the single OffloadedStmt into a throwaway wrapper block and running it through the normal Block -> OffloadedStmt construction, then moving it back. 
The per-task CFG is then byte-for-byte the slice the whole-kernel CFG would build for that task (continue wiring, prologue/body/epilogue chaining, parallel-execution flag), so correctness is preserved while the dataflow analyses stay per-task. Also revert the build_cfg(root_in_parallel_for) signature change (no longer needed -- visit(OffloadedStmt) sets the body's parallel flag itself) and fall back to the whole-kernel CFG when QD_DUMP_CFG is requested so dumping still shows the full graph. --- quadrants/analysis/build_cfg.cpp | 7 +- quadrants/ir/analysis.h | 6 +- quadrants/transforms/cfg_optimization.cpp | 90 ++++++++++++----------- 3 files changed, 51 insertions(+), 52 deletions(-) diff --git a/quadrants/analysis/build_cfg.cpp b/quadrants/analysis/build_cfg.cpp index b4f4a38f12..fb906b7b96 100644 --- a/quadrants/analysis/build_cfg.cpp +++ b/quadrants/analysis/build_cfg.cpp @@ -420,9 +420,8 @@ class CFGBuilder : public IRVisitor { current_stmt_id_ = backup_stmt_id; } - static std::unique_ptr run(IRNode *root, bool root_in_parallel_for) { + static std::unique_ptr run(IRNode *root) { CFGBuilder builder; - builder.in_parallel_for_ = root_in_parallel_for; root->accept(&builder); if (!builder.graph_->nodes[builder.graph_->final_node]->empty()) { // Make the final node empty (by adding an empty final node). @@ -449,8 +448,8 @@ class CFGBuilder : public IRVisitor { }; namespace irpass::analysis { -std::unique_ptr build_cfg(IRNode *root, bool root_in_parallel_for) { - return CFGBuilder::run(root, root_in_parallel_for); +std::unique_ptr build_cfg(IRNode *root) { + return CFGBuilder::run(root); } } // namespace irpass::analysis diff --git a/quadrants/ir/analysis.h b/quadrants/ir/analysis.h index 63bd5d50a0..4fbff436e5 100644 --- a/quadrants/ir/analysis.h +++ b/quadrants/ir/analysis.h @@ -72,11 +72,7 @@ namespace analysis { */ AliasResult alias_analysis(Stmt *var1, Stmt *var2); -// |root_in_parallel_for| seeds the builder's parallel-execution context. 
It is true when |root| is the body -// block of an offloaded range_for/struct_for/mesh_for task built in isolation (see per-task cfg_optimization), -// so that nodes inside it are correctly flagged is_parallel_executed even though they are not visited through -// the enclosing OffloadedStmt. Defaults to false (whole-kernel / serial builds), preserving prior behaviour. -std::unique_ptr build_cfg(IRNode *root, bool root_in_parallel_for = false); +std::unique_ptr build_cfg(IRNode *root); void check_fields_registered(IRNode *root); std::unique_ptr clone(IRNode *root); std::unique_ptr clone(Stmt *root); diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp index e47d249f39..8e676621b3 100644 --- a/quadrants/transforms/cfg_optimization.cpp +++ b/quadrants/transforms/cfg_optimization.cpp @@ -33,30 +33,47 @@ std::vector collect_offloaded_tasks(IRNode *root) { return tasks; } -// Run store-to-load forwarding + dead-store elimination over a single offloaded task's sub-block, scoped to -// that block alone. Correctness relies on the existing CFG boundary seeding: reaching_definition_analysis seeds -// the start node with all global pointers ("may contain data before this kernel") and live_variable_analysis -// seeds the final node with all global store destinations ("may be loaded after this kernel"). Because the CFG -// here spans only one task, every global address (fields, external tensors, global temporaries that carry data -// between tasks) is therefore conservatively treated as live-in and live-out of the task -- so no store that a -// sibling task may read is ever eliminated, and no value is forwarded across a task (device-launch) boundary. 
-bool optimize_offload_block(Block *block, - bool in_parallel_for, - bool after_lower_access, - bool autodiff_enabled, - const std::optional &lva_config_opt) { - if (block == nullptr || block->statements.empty()) { - return false; - } - auto cfg = analysis::build_cfg(block, in_parallel_for); - cfg->simplify_graph(); +// Build and optimize a control-flow graph for a SINGLE offloaded task, scoped to that task alone. +// +// The task is temporarily moved into a throwaway wrapper block and run through the normal Block -> +// OffloadedStmt CFG construction, then moved back, leaving the IR shape unchanged. Building through a wrapper +// (instead of stitching together per-sub-block CFGs) is what makes this correct: the resulting CFG is +// byte-for-byte the slice that the whole-kernel CFG would build for this one task -- including the offloaded +// for-body's implicit-loop `continue` edges (which are wired by visit(OffloadedStmt), not by visit(Block)), the +// prologue/body/epilogue chaining, and the body's is_parallel_executed flag. Optimizing each sub-block in +// isolation would drop the `continue` loop-back edges and wrongly dead-store-eliminate a global store that +// precedes a `continue` (regression caught by test_cfg_continue). +// +// Scoping the analyses to one task is semantics-preserving because each offloaded task is a separate device +// launch and the existing CFG boundary seeding is conservative across the launch boundary: +// reaching_definition_analysis seeds the start node with all global pointers ("may already hold data") and +// live_variable_analysis seeds the final node with all global store destinations ("may be read later"). 
With +// the CFG spanning only one task, every global address -- fields, external tensors, and the global-temporary +// buffer that carries scalars between tasks -- is therefore treated as live-in and live-out of the task, so no +// store a sibling task may read is eliminated and no value is forwarded across a task (device-launch) boundary. +bool optimize_one_task(Block *parent, + OffloadedStmt *off, + bool after_lower_access, + bool autodiff_enabled, + const std::optional &lva_config_opt) { + const int location = parent->locate(off); + QD_ASSERT(location != -1); + Block wrapper; + wrapper.insert(parent->extract(off)); bool modified = false; - if (cfg->store_to_load_forwarding(after_lower_access, autodiff_enabled)) { - modified = true; - } - if (cfg->dead_store_elimination(after_lower_access, lva_config_opt)) { - modified = true; + { + // |cfg| holds raw pointers into |wrapper| (its container nodes) and into the task's own sub-blocks; keep + // both alive until the analyses are done, then move the task back before |wrapper| leaves scope. + auto cfg = analysis::build_cfg(&wrapper); + cfg->simplify_graph(); + if (cfg->store_to_load_forwarding(after_lower_access, autodiff_enabled)) { + modified = true; + } + if (cfg->dead_store_elimination(after_lower_access, lva_config_opt)) { + modified = true; + } } + parent->insert(wrapper.extract(off), location); return modified; } @@ -75,29 +92,18 @@ bool cfg_optimization(const CompileConfig &config, // Per-offloaded-task scoping: once the kernel is offloaded, optimize each task's CFG independently instead of // building one whole-kernel CFG across all tasks. This keeps the (super-linear) dataflow analyses small per // task without changing semantics -- see the comment on CompileConfig::cfg_optimization_per_task and - // optimize_offload_block above. Disabled for the real-matrix path (which skips the analyses entirely) and - // pre-offload IR (no tasks yet), both of which fall through to the whole-kernel path below. 
- if (config.cfg_optimization_per_task && !real_matrix_enabled) { + // optimize_one_task above. Skipped for the real-matrix path (which runs no analyses) and for pre-offload IR + // (no tasks yet); also skipped when CFG dumping is requested, so QD_DUMP_CFG keeps dumping the whole-kernel + // graph. All of these fall through to the whole-kernel path below. + const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data()); + const bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1"; + if (config.cfg_optimization_per_task && !real_matrix_enabled && !dump_cfg) { auto tasks = collect_offloaded_tasks(root); if (!tasks.empty()) { + auto *block = root->as(); bool result_modified = false; for (auto *off : tasks) { - const bool body_parallel = off->task_type == OffloadedStmt::TaskType::range_for || - off->task_type == OffloadedStmt::TaskType::struct_for || - off->task_type == OffloadedStmt::TaskType::mesh_for; - // Prologues/epilogues run serially; only the for-task body is parallel-executed. 
- result_modified |= optimize_offload_block(off->tls_prologue.get(), false, after_lower_access, - autodiff_enabled, lva_config_opt); - result_modified |= optimize_offload_block(off->mesh_prologue.get(), false, after_lower_access, - autodiff_enabled, lva_config_opt); - result_modified |= optimize_offload_block(off->bls_prologue.get(), false, after_lower_access, - autodiff_enabled, lva_config_opt); - result_modified |= optimize_offload_block(off->body.get(), body_parallel, after_lower_access, - autodiff_enabled, lva_config_opt); - result_modified |= optimize_offload_block(off->bls_epilogue.get(), false, after_lower_access, - autodiff_enabled, lva_config_opt); - result_modified |= optimize_offload_block(off->tls_epilogue.get(), false, after_lower_access, - autodiff_enabled, lva_config_opt); + result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt); } // TODO: implement cfg->dead_instruction_elimination() die(root); // remove unused allocas across the whole kernel @@ -107,8 +113,6 @@ bool cfg_optimization(const CompileConfig &config, auto cfg = analysis::build_cfg(root); - const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data()); - bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1"; if (dump_cfg) { std::string suffix = phase.empty() ? "_before_cfg_opt" : ("_" + phase + "_before_cfg_opt"); cfg->dump_graph_to_file(config, kernel_name, suffix); From c21e747e507f7ea563931b5e7abc2acdf85a6fd1 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 7 Jun 2026 18:48:49 -0700 Subject: [PATCH 3/4] Ditch pre-offload cfg_optimization; rely on post-offload per-task cfg Profiling showed the super-linear reaching-definition / store-to-load analyses run mostly in the pre-offload phase, on the monolithic kernel IR before any offloaded tasks exist -- so per-task scoping alone barely helped (the post-offload cfg was already tiny). 
Under cfg_optimization_per_task, skip the whole-kernel cfg_optimization entirely when the IR is not yet offloaded (no OffloadedStmt tasks), keeping only the cheap dead-alloca cleanup, and let the post-offload per-task cfg perform store-to-load forwarding + dead-store elimination once tasks exist. cfg_optimization is an optimization, not a correctness pass, so dropping it pre-offload is safe; the only thing lost is cross-task forwarding/DSE on the monolithic IR, which is invalid across separate device launches anyway. CFG dumping (QD_DUMP_CFG) still forces the whole-kernel path. --- quadrants/program/compile_config.h | 15 +++++----- quadrants/transforms/cfg_optimization.cpp | 35 +++++++++++++++-------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/quadrants/program/compile_config.h b/quadrants/program/compile_config.h index 2001a93331..aed9315491 100644 --- a/quadrants/program/compile_config.h +++ b/quadrants/program/compile_config.h @@ -9,13 +9,14 @@ struct CompileConfig { Arch arch; bool debug; bool cfg_optimization; - // When true (default), cfg_optimization scopes its store-to-load forwarding and dead-store elimination to - // each offloaded task independently once the kernel has been offloaded, instead of running one whole-kernel - // control-flow graph spanning all tasks. Each offloaded task is a separate device launch, so cross-task - // store-to-load forwarding of registers is impossible anyway, and global memory is treated conservatively - // (live-in and live-out of every task) by the existing CFG boundary seeding -- so the per-task scoping is - // semantics-preserving while making the dataflow analyses ~linear in total IR instead of super-linear in the - // combined whole-kernel IR. Set false to restore the whole-kernel behaviour. 
+ // When true (default), cfg_optimization runs only AFTER offloading and is scoped to each offloaded task + // independently (a separate control-flow graph per task) instead of one whole-kernel graph spanning all tasks; + // the expensive whole-kernel cfg in the pre-offload phase is ditched entirely. Each offloaded task is a + // separate device launch, so cross-task store-to-load forwarding of registers is impossible anyway, and global + // memory is treated conservatively (live-in and live-out of every task) by the existing CFG boundary seeding + // -- so this is semantics-preserving (cfg_optimization is an optimization, not a correctness pass) while making + // the super-linear reaching-definition / forwarding analyses ~linear in total IR instead of super-linear in + // the combined monolithic kernel IR. Set false to restore the whole-kernel pre+post-offload behaviour. bool cfg_optimization_per_task{true}; bool check_out_of_bound; bool validate_autodiff; diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp index 8e676621b3..0b2418fe39 100644 --- a/quadrants/transforms/cfg_optimization.cpp +++ b/quadrants/transforms/cfg_optimization.cpp @@ -14,8 +14,8 @@ namespace { // Collect the top-level offloaded tasks of |root| iff |root| is an already-offloaded kernel body, i.e. a Block // whose statements are all OffloadedStmt. Returns an empty vector otherwise (pre-offload IR, function bodies, -// non-Block roots), in which case the caller falls back to the whole-kernel CFG. This is what makes the -// per-task path activate only post-offload, where the notion of "an offloaded task" exists. +// non-Block roots). This is what lets the caller tell "post-offload" (run per-task cfg) from "pre-offload / +// other" (ditch cfg, under cfg_optimization_per_task), since "an offloaded task" only exists post-offload. 
std::vector collect_offloaded_tasks(IRNode *root) { std::vector tasks; auto *block = root->cast(); @@ -89,26 +89,37 @@ bool cfg_optimization(const CompileConfig &config, const std::string &phase) { QD_AUTO_PROF; - // Per-offloaded-task scoping: once the kernel is offloaded, optimize each task's CFG independently instead of - // building one whole-kernel CFG across all tasks. This keeps the (super-linear) dataflow analyses small per - // task without changing semantics -- see the comment on CompileConfig::cfg_optimization_per_task and - // optimize_one_task above. Skipped for the real-matrix path (which runs no analyses) and for pre-offload IR - // (no tasks yet); also skipped when CFG dumping is requested, so QD_DUMP_CFG keeps dumping the whole-kernel - // graph. All of these fall through to the whole-kernel path below. const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data()); const bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1"; - if (config.cfg_optimization_per_task && !real_matrix_enabled && !dump_cfg) { + + // Per-offloaded-task scoping. Once the kernel is offloaded we optimize each task's CFG independently; and we + // deliberately DITCH the expensive whole-kernel cfg_optimization in the pre-offload phase, relying on the + // post-offload per-task cfg below to do the store-to-load forwarding + dead-store elimination once tasks + // exist. The expensive (super-linear) reaching-definition / forwarding analyses otherwise run on the monolithic + // pre-offload kernel IR -- where there are no tasks to scope to -- and dominate compile time. cfg_optimization + // is an optimization, not a correctness pass, so dropping it pre-offload is safe; the only thing lost is + // cross-task forwarding/DSE on the monolithic IR, which is invalid across separate device launches anyway. + // QD_DUMP_CFG forces the whole-kernel path so the full graph can still be dumped for debugging. 
+ if (config.cfg_optimization_per_task && !dump_cfg) { auto tasks = collect_offloaded_tasks(root); if (!tasks.empty()) { - auto *block = root->as(); + // Post-offload: per-task store-to-load forwarding + dead-store elimination (skipped for the real-matrix + // path, matching the whole-kernel path which runs no analyses there). bool result_modified = false; - for (auto *off : tasks) { - result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt); + if (!real_matrix_enabled) { + auto *block = root->as(); + for (auto *off : tasks) { + result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt); + } } // TODO: implement cfg->dead_instruction_elimination() die(root); // remove unused allocas across the whole kernel return result_modified; } + // Pre-offload IR (no offloaded tasks yet) or a non-offloaded body: ditch the whole-kernel cfg analyses and + // keep only the cheap dead-alloca cleanup; the post-offload per-task path will optimize each task later. + die(root); + return false; } auto cfg = analysis::build_cfg(root); From 6cfa213f5dd830a2c79f5ec81974decaec0ee5fe Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Sun, 7 Jun 2026 21:22:15 -0700 Subject: [PATCH 4/4] Only ditch pre-offload cfg for compile-pipeline phases (fix Half2Vectorization tests) The previous commit ditched the whole-kernel cfg for ALL non-offloaded IR under cfg_optimization_per_task, which over-reached: full_simplify is also called on standalone, never-offloaded blocks (unit tests, function bodies), where its store-to-load forwarding + dead-store elimination must still run. die() cannot remove a dead store (side-effecting), so dropping cfg there left a stray store and regressed Half2Vectorization.{Ndarray,GlobalTemporary,Field} (each +1 statement). 
Scope the ditch to the compile_to_offloads pre-offload phases (simplify_I, simplify_II, pre/post_autodiff) -- the monolithic-kernel calls whose super-linear cfg dominates compile time and that are redundant because the post-offload per-task cfg (simplify_III onward) redoes intra-task forwarding/DSE once tasks exist. Any other non-offloaded caller falls through to the whole-kernel cfg, restoring the prior behavior. No-op for the qipc graph kernel (only hits simplify_I/II pre-offload), so the compile-time win is unchanged. --- quadrants/transforms/cfg_optimization.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp index 0b2418fe39..2113b7c523 100644 --- a/quadrants/transforms/cfg_optimization.cpp +++ b/quadrants/transforms/cfg_optimization.cpp @@ -116,10 +116,21 @@ bool cfg_optimization(const CompileConfig &config, die(root); // remove unused allocas across the whole kernel return result_modified; } - // Pre-offload IR (no offloaded tasks yet) or a non-offloaded body: ditch the whole-kernel cfg analyses and - // keep only the cheap dead-alloca cleanup; the post-offload per-task path will optimize each task later. - die(root); - return false; + // No offloaded tasks yet. Within compile_to_offloads these are the pre-offload full_simplify calls on the + // monolithic kernel IR (the phases below, all *before* irpass::offload): their whole-kernel cfg is the + // (super-linear) reaching-definition / store-to-load analysis that dominates compile time, and it is + // redundant because the post-offload per-task cfg ("simplify_III" onward) redoes the intra-task + // store-to-load forwarding + dead-store elimination once tasks exist. So for exactly those phases we ditch + // cfg, keeping only the cheap dead-alloca cleanup. 
For ANY other caller of full_simplify on non-offloaded + // IR (unit tests, standalone blocks / function bodies that are never offloaded), we must still run the + // whole-kernel cfg below, or its forwarding/DSE would be silently lost -- so we fall through. + const bool pre_offload_compile_phase = + phase == "simplify_I" || phase == "simplify_II" || phase == "pre_autodiff" || phase == "post_autodiff"; + if (pre_offload_compile_phase) { + die(root); + return false; + } + // else: fall through to the whole-kernel cfg path below. } auto cfg = analysis::build_cfg(root);