From b68642cef91ecd29b5766a2a59c5b1cc26860b35 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Sun, 7 Jun 2026 17:33:45 -0700
Subject: [PATCH 1/4] Scope cfg_optimization per offloaded task (post-offload)

Once a kernel is offloaded, cfg_optimization now builds a separate control-flow
graph per offloaded task (per sub-block, with the correct parallel-execution
flag) instead of one whole-kernel CFG spanning all tasks. Store-to-load
forwarding and dead-store elimination then run scoped to each task.

This is semantics-preserving: each offloaded task is a separate device launch,
so cross-task register forwarding is impossible anyway, and global memory
(fields, external tensors, and the global-temporary buffer that carries values
between tasks) is treated conservatively as live-in/live-out of every task by
the existing CFG boundary seeding (reaching-def start-node seed + live-var
final-node seed). The win is compile time: the reaching-definition / live-
variable dataflow becomes ~linear in total IR rather than super-linear in the
combined whole-kernel IR, which blows up for kernels that pack many stages into
one @qd.kernel.

Gated by CompileConfig::cfg_optimization_per_task (default true; settable via
the QD_CFG_OPTIMIZATION_PER_TASK environment variable or the matching qd.init
keyword argument). Pre-offload IR, function bodies, and the real-matrix path
fall back to the whole-kernel CFG, whose behaviour is unchanged.
---
 quadrants/analysis/build_cfg.cpp          |  7 +-
 quadrants/ir/analysis.h                   |  6 +-
 quadrants/program/compile_config.cpp      |  1 +
 quadrants/program/compile_config.h        |  8 +++
 quadrants/python/export_lang.cpp          |  1 +
 quadrants/transforms/cfg_optimization.cpp | 88 +++++++++++++++++++++++
 6 files changed, 107 insertions(+), 4 deletions(-)
diff --git a/quadrants/analysis/build_cfg.cpp b/quadrants/analysis/build_cfg.cpp
index fb906b7b96..b4f4a38f12 100644
--- a/quadrants/analysis/build_cfg.cpp
+++ b/quadrants/analysis/build_cfg.cpp
@@ -420,8 +420,9 @@ class CFGBuilder : public IRVisitor {
     current_stmt_id_ = backup_stmt_id;
   }
 
-  static std::unique_ptr<ControlFlowGraph> run(IRNode *root) {
+  static std::unique_ptr<ControlFlowGraph> run(IRNode *root, bool root_in_parallel_for) {
     CFGBuilder builder;
+    builder.in_parallel_for_ = root_in_parallel_for;
     root->accept(&builder);
     if (!builder.graph_->nodes[builder.graph_->final_node]->empty()) {
       // Make the final node empty (by adding an empty final node).
@@ -448,8 +449,8 @@ class CFGBuilder : public IRVisitor {
 };
 
 namespace irpass::analysis {
-std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root) {
-  return CFGBuilder::run(root);
+std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root, bool root_in_parallel_for) {
+  return CFGBuilder::run(root, root_in_parallel_for);
 }
 }  // namespace irpass::analysis
 
diff --git a/quadrants/ir/analysis.h b/quadrants/ir/analysis.h
index 4fbff436e5..63bd5d50a0 100644
--- a/quadrants/ir/analysis.h
+++ b/quadrants/ir/analysis.h
@@ -72,7 +72,11 @@ namespace analysis {
  */
 AliasResult alias_analysis(Stmt *var1, Stmt *var2);
 
-std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root);
+// |root_in_parallel_for| seeds the builder's parallel-execution context. It is true when |root| is the body
+// block of an offloaded range_for/struct_for/mesh_for task built in isolation (see per-task cfg_optimization),
+// so that nodes inside it are correctly flagged is_parallel_executed even though they are not visited through
+// the enclosing OffloadedStmt. Defaults to false (whole-kernel / serial builds), preserving prior behaviour.
+std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root, bool root_in_parallel_for = false);
 void check_fields_registered(IRNode *root);
 std::unique_ptr<IRNode> clone(IRNode *root);
 std::unique_ptr<Stmt> clone(Stmt *root);
diff --git a/quadrants/program/compile_config.cpp b/quadrants/program/compile_config.cpp
index 5034fd1822..71e44b378d 100644
--- a/quadrants/program/compile_config.cpp
+++ b/quadrants/program/compile_config.cpp
@@ -21,6 +21,7 @@ CompileConfig::CompileConfig() {
   max_vector_width = 8;
   debug = false;
   cfg_optimization = true;
+  cfg_optimization_per_task = true;
   check_out_of_bound = false;
   serial_schedule = false;
   simplify_before_lower_access = true;
diff --git a/quadrants/program/compile_config.h b/quadrants/program/compile_config.h
index 92c09a5fd8..2001a93331 100644
--- a/quadrants/program/compile_config.h
+++ b/quadrants/program/compile_config.h
@@ -9,6 +9,14 @@ struct CompileConfig {
   Arch arch;
   bool debug;
   bool cfg_optimization;
+  // When true (default), cfg_optimization scopes its store-to-load forwarding and dead-store elimination to
+  // each offloaded task independently once the kernel has been offloaded, instead of running one whole-kernel
+  // control-flow graph spanning all tasks. Each offloaded task is a separate device launch, so cross-task
+  // store-to-load forwarding of registers is impossible anyway, and global memory is treated conservatively
+  // (live-in and live-out of every task) by the existing CFG boundary seeding -- so the per-task scoping is
+  // semantics-preserving while making the dataflow analyses ~linear in total IR instead of super-linear in the
+  // combined whole-kernel IR. Set false to restore the whole-kernel behaviour.
+  bool cfg_optimization_per_task{true};
   bool check_out_of_bound;
   bool validate_autodiff;
   int simd_width;
diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp
index a440e2a798..38934dcdde 100644
--- a/quadrants/python/export_lang.cpp
+++ b/quadrants/python/export_lang.cpp
@@ -164,6 +164,7 @@ void export_lang(py::module &m) {
       .def_readwrite("print_ir_dbg_info", &CompileConfig::print_ir_dbg_info)
       .def_readwrite("debug", &CompileConfig::debug)
       .def_readwrite("cfg_optimization", &CompileConfig::cfg_optimization)
+      .def_readwrite("cfg_optimization_per_task", &CompileConfig::cfg_optimization_per_task)
       .def_readwrite("check_out_of_bound", &CompileConfig::check_out_of_bound)
       .def_readwrite("print_accessor_ir", &CompileConfig::print_accessor_ir)
       .def_readwrite("use_llvm", &CompileConfig::use_llvm)
diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp
index 66b4a14067..e47d249f39 100644
--- a/quadrants/transforms/cfg_optimization.cpp
+++ b/quadrants/transforms/cfg_optimization.cpp
@@ -1,4 +1,5 @@
 #include "quadrants/ir/ir.h"
+#include "quadrants/ir/statements.h"
 #include "quadrants/ir/control_flow_graph.h"
 #include "quadrants/ir/transforms.h"
 #include "quadrants/ir/analysis.h"
@@ -8,6 +9,59 @@
 namespace quadrants::lang {
 
 namespace irpass {
+
+namespace {
+
+// Collect the top-level offloaded tasks of |root| iff |root| is an already-offloaded kernel body, i.e. a Block
+// whose statements are all OffloadedStmt. Returns an empty vector otherwise (pre-offload IR, function bodies,
+// non-Block roots), in which case the caller falls back to the whole-kernel CFG. This is what makes the
+// per-task path activate only post-offload, where the notion of "an offloaded task" exists.
+std::vector<OffloadedStmt *> collect_offloaded_tasks(IRNode *root) {
+  std::vector<OffloadedStmt *> tasks;
+  auto *block = root->cast<Block>();
+  if (block == nullptr || block->statements.empty()) {
+    return tasks;
+  }
+  for (auto &stmt : block->statements) {
+    if (!stmt->is<OffloadedStmt>()) {
+      return {};  // not a pure offloaded kernel body -> whole-kernel path
+    }
+  }
+  for (auto &stmt : block->statements) {
+    tasks.push_back(stmt->as<OffloadedStmt>());
+  }
+  return tasks;
+}
+
+// Run store-to-load forwarding + dead-store elimination over a single offloaded task's sub-block, scoped to
+// that block alone. Correctness relies on the existing CFG boundary seeding: reaching_definition_analysis seeds
+// the start node with all global pointers ("may contain data before this kernel") and live_variable_analysis
+// seeds the final node with all global store destinations ("may be loaded after this kernel"). Because the CFG
+// here spans only one task, every global address (fields, external tensors, global temporaries that carry data
+// between tasks) is therefore conservatively treated as live-in and live-out of the task -- so no store that a
+// sibling task may read is ever eliminated, and no value is forwarded across a task (device-launch) boundary.
+bool optimize_offload_block(Block *block,
+                            bool in_parallel_for,
+                            bool after_lower_access,
+                            bool autodiff_enabled,
+                            const std::optional<ControlFlowGraph::LiveVarAnalysisConfig> &lva_config_opt) {
+  if (block == nullptr || block->statements.empty()) {
+    return false;
+  }
+  auto cfg = analysis::build_cfg(block, in_parallel_for);
+  cfg->simplify_graph();
+  bool modified = false;
+  if (cfg->store_to_load_forwarding(after_lower_access, autodiff_enabled)) {
+    modified = true;
+  }
+  if (cfg->dead_store_elimination(after_lower_access, lva_config_opt)) {
+    modified = true;
+  }
+  return modified;
+}
+
+}  // namespace
+
 bool cfg_optimization(const CompileConfig &config,
                       IRNode *root,
                       bool after_lower_access,
@@ -17,6 +71,40 @@ bool cfg_optimization(const CompileConfig &config,
                       const std::string &kernel_name,
                       const std::string &phase) {
   QD_AUTO_PROF;
+
+  // Per-offloaded-task scoping: once the kernel is offloaded, optimize each task's CFG independently instead of
+  // building one whole-kernel CFG across all tasks. This keeps the (super-linear) dataflow analyses small per
+  // task without changing semantics -- see the comment on CompileConfig::cfg_optimization_per_task and
+  // optimize_offload_block above. Disabled for the real-matrix path (which skips the analyses entirely) and
+  // pre-offload IR (no tasks yet), both of which fall through to the whole-kernel path below.
+  if (config.cfg_optimization_per_task && !real_matrix_enabled) {
+    auto tasks = collect_offloaded_tasks(root);
+    if (!tasks.empty()) {
+      bool result_modified = false;
+      for (auto *off : tasks) {
+        const bool body_parallel = off->task_type == OffloadedStmt::TaskType::range_for ||
+                                    off->task_type == OffloadedStmt::TaskType::struct_for ||
+                                    off->task_type == OffloadedStmt::TaskType::mesh_for;
+        // Prologues/epilogues run serially; only the for-task body is parallel-executed.
+        result_modified |= optimize_offload_block(off->tls_prologue.get(), false, after_lower_access,
+                                                  autodiff_enabled, lva_config_opt);
+        result_modified |= optimize_offload_block(off->mesh_prologue.get(), false, after_lower_access,
+                                                  autodiff_enabled, lva_config_opt);
+        result_modified |= optimize_offload_block(off->bls_prologue.get(), false, after_lower_access,
+                                                  autodiff_enabled, lva_config_opt);
+        result_modified |= optimize_offload_block(off->body.get(), body_parallel, after_lower_access,
+                                                  autodiff_enabled, lva_config_opt);
+        result_modified |= optimize_offload_block(off->bls_epilogue.get(), false, after_lower_access,
+                                                  autodiff_enabled, lva_config_opt);
+        result_modified |= optimize_offload_block(off->tls_epilogue.get(), false, after_lower_access,
+                                                  autodiff_enabled, lva_config_opt);
+      }
+      // TODO: implement cfg->dead_instruction_elimination()
+      die(root);  // remove unused allocas across the whole kernel
+      return result_modified;
+    }
+  }
+
   auto cfg = analysis::build_cfg(root);
 
   const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data());

From f656cb664c78f66e6608db893f8911adf2a2e5c2 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Sun, 7 Jun 2026 18:02:36 -0700
Subject: [PATCH 2/4] Fix per-task cfg_optimization to reproduce exact per-task
 CFG

The first cut built a separate CFG per offloaded sub-block (prologue/body/
epilogue). That dropped the offloaded for-body's implicit-loop `continue`
edges -- which are wired by visit(OffloadedStmt), not visit(Block) -- and so
wrongly dead-store-eliminated a global store preceding a `continue`
(test_cfg_continue regressed).

Instead, build one CFG per offloaded task by temporarily moving the single
OffloadedStmt into a throwaway wrapper block and running it through the normal
Block -> OffloadedStmt construction, then moving it back. The per-task CFG is
then byte-for-byte the slice the whole-kernel CFG would build for that task
(continue wiring, prologue/body/epilogue chaining, parallel-execution flag),
so correctness is preserved while the dataflow analyses stay per-task.

Also revert the build_cfg(root_in_parallel_for) signature change (no longer
needed -- visit(OffloadedStmt) sets the body's parallel flag itself) and fall
back to the whole-kernel CFG when QD_DUMP_CFG is requested so dumping still
shows the full graph.
---
 quadrants/analysis/build_cfg.cpp          |  7 +-
 quadrants/ir/analysis.h                   |  6 +-
 quadrants/transforms/cfg_optimization.cpp | 90 ++++++++++++-----------
 3 files changed, 51 insertions(+), 52 deletions(-)

diff --git a/quadrants/analysis/build_cfg.cpp b/quadrants/analysis/build_cfg.cpp
index b4f4a38f12..fb906b7b96 100644
--- a/quadrants/analysis/build_cfg.cpp
+++ b/quadrants/analysis/build_cfg.cpp
@@ -420,9 +420,8 @@ class CFGBuilder : public IRVisitor {
     current_stmt_id_ = backup_stmt_id;
   }
 
-  static std::unique_ptr<ControlFlowGraph> run(IRNode *root, bool root_in_parallel_for) {
+  static std::unique_ptr<ControlFlowGraph> run(IRNode *root) {
     CFGBuilder builder;
-    builder.in_parallel_for_ = root_in_parallel_for;
     root->accept(&builder);
     if (!builder.graph_->nodes[builder.graph_->final_node]->empty()) {
       // Make the final node empty (by adding an empty final node).
@@ -449,8 +448,8 @@ class CFGBuilder : public IRVisitor {
 };
 
 namespace irpass::analysis {
-std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root, bool root_in_parallel_for) {
-  return CFGBuilder::run(root, root_in_parallel_for);
+std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root) {
+  return CFGBuilder::run(root);
 }
 }  // namespace irpass::analysis
 
diff --git a/quadrants/ir/analysis.h b/quadrants/ir/analysis.h
index 63bd5d50a0..4fbff436e5 100644
--- a/quadrants/ir/analysis.h
+++ b/quadrants/ir/analysis.h
@@ -72,11 +72,7 @@ namespace analysis {
  */
 AliasResult alias_analysis(Stmt *var1, Stmt *var2);
 
-// |root_in_parallel_for| seeds the builder's parallel-execution context. It is true when |root| is the body
-// block of an offloaded range_for/struct_for/mesh_for task built in isolation (see per-task cfg_optimization),
-// so that nodes inside it are correctly flagged is_parallel_executed even though they are not visited through
-// the enclosing OffloadedStmt. Defaults to false (whole-kernel / serial builds), preserving prior behaviour.
-std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root, bool root_in_parallel_for = false);
+std::unique_ptr<ControlFlowGraph> build_cfg(IRNode *root);
 void check_fields_registered(IRNode *root);
 std::unique_ptr<IRNode> clone(IRNode *root);
 std::unique_ptr<Stmt> clone(Stmt *root);
diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp
index e47d249f39..8e676621b3 100644
--- a/quadrants/transforms/cfg_optimization.cpp
+++ b/quadrants/transforms/cfg_optimization.cpp
@@ -33,30 +33,47 @@ std::vector<OffloadedStmt *> collect_offloaded_tasks(IRNode *root) {
   return tasks;
 }
 
-// Run store-to-load forwarding + dead-store elimination over a single offloaded task's sub-block, scoped to
-// that block alone. Correctness relies on the existing CFG boundary seeding: reaching_definition_analysis seeds
-// the start node with all global pointers ("may contain data before this kernel") and live_variable_analysis
-// seeds the final node with all global store destinations ("may be loaded after this kernel"). Because the CFG
-// here spans only one task, every global address (fields, external tensors, global temporaries that carry data
-// between tasks) is therefore conservatively treated as live-in and live-out of the task -- so no store that a
-// sibling task may read is ever eliminated, and no value is forwarded across a task (device-launch) boundary.
-bool optimize_offload_block(Block *block,
-                            bool in_parallel_for,
-                            bool after_lower_access,
-                            bool autodiff_enabled,
-                            const std::optional<ControlFlowGraph::LiveVarAnalysisConfig> &lva_config_opt) {
-  if (block == nullptr || block->statements.empty()) {
-    return false;
-  }
-  auto cfg = analysis::build_cfg(block, in_parallel_for);
-  cfg->simplify_graph();
+// Build and optimize a control-flow graph for a SINGLE offloaded task, scoped to that task alone.
+//
+// The task is temporarily moved into a throwaway wrapper block and run through the normal Block ->
+// OffloadedStmt CFG construction, then moved back, leaving the IR shape unchanged. Building through a wrapper
+// (instead of stitching together per-sub-block CFGs) is what makes this correct: the resulting CFG is
+// byte-for-byte the slice that the whole-kernel CFG would build for this one task -- including the offloaded
+// for-body's implicit-loop `continue` edges (which are wired by visit(OffloadedStmt), not by visit(Block)), the
+// prologue/body/epilogue chaining, and the body's is_parallel_executed flag. Optimizing each sub-block in
+// isolation would drop the `continue` loop-back edges and wrongly dead-store-eliminate a global store that
+// precedes a `continue` (regression caught by test_cfg_continue).
+//
+// Scoping the analyses to one task is semantics-preserving because each offloaded task is a separate device
+// launch and the existing CFG boundary seeding is conservative across the launch boundary:
+// reaching_definition_analysis seeds the start node with all global pointers ("may already hold data") and
+// live_variable_analysis seeds the final node with all global store destinations ("may be read later"). With
+// the CFG spanning only one task, every global address -- fields, external tensors, and the global-temporary
+// buffer that carries scalars between tasks -- is therefore treated as live-in and live-out of the task, so no
+// store a sibling task may read is eliminated and no value is forwarded across a task (device-launch) boundary.
+bool optimize_one_task(Block *parent,
+                       OffloadedStmt *off,
+                       bool after_lower_access,
+                       bool autodiff_enabled,
+                       const std::optional<ControlFlowGraph::LiveVarAnalysisConfig> &lva_config_opt) {
+  const int location = parent->locate(off);
+  QD_ASSERT(location != -1);
+  Block wrapper;
+  wrapper.insert(parent->extract(off));
   bool modified = false;
-  if (cfg->store_to_load_forwarding(after_lower_access, autodiff_enabled)) {
-    modified = true;
-  }
-  if (cfg->dead_store_elimination(after_lower_access, lva_config_opt)) {
-    modified = true;
+  {
+    // |cfg| holds raw pointers into |wrapper| (its container nodes) and into the task's own sub-blocks; keep
+    // both alive until the analyses are done, then move the task back before |wrapper| leaves scope.
+    auto cfg = analysis::build_cfg(&wrapper);
+    cfg->simplify_graph();
+    if (cfg->store_to_load_forwarding(after_lower_access, autodiff_enabled)) {
+      modified = true;
+    }
+    if (cfg->dead_store_elimination(after_lower_access, lva_config_opt)) {
+      modified = true;
+    }
   }
+  parent->insert(wrapper.extract(off), location);
   return modified;
 }
 
@@ -75,29 +92,18 @@ bool cfg_optimization(const CompileConfig &config,
   // Per-offloaded-task scoping: once the kernel is offloaded, optimize each task's CFG independently instead of
   // building one whole-kernel CFG across all tasks. This keeps the (super-linear) dataflow analyses small per
   // task without changing semantics -- see the comment on CompileConfig::cfg_optimization_per_task and
-  // optimize_offload_block above. Disabled for the real-matrix path (which skips the analyses entirely) and
-  // pre-offload IR (no tasks yet), both of which fall through to the whole-kernel path below.
-  if (config.cfg_optimization_per_task && !real_matrix_enabled) {
+  // optimize_one_task above. Skipped for the real-matrix path (which runs no analyses) and for pre-offload IR
+  // (no tasks yet); also skipped when CFG dumping is requested, so QD_DUMP_CFG keeps dumping the whole-kernel
+  // graph. All of these fall through to the whole-kernel path below.
+  const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data());
+  const bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1";
+  if (config.cfg_optimization_per_task && !real_matrix_enabled && !dump_cfg) {
     auto tasks = collect_offloaded_tasks(root);
     if (!tasks.empty()) {
+      auto *block = root->as<Block>();
       bool result_modified = false;
       for (auto *off : tasks) {
-        const bool body_parallel = off->task_type == OffloadedStmt::TaskType::range_for ||
-                                    off->task_type == OffloadedStmt::TaskType::struct_for ||
-                                    off->task_type == OffloadedStmt::TaskType::mesh_for;
-        // Prologues/epilogues run serially; only the for-task body is parallel-executed.
-        result_modified |= optimize_offload_block(off->tls_prologue.get(), false, after_lower_access,
-                                                  autodiff_enabled, lva_config_opt);
-        result_modified |= optimize_offload_block(off->mesh_prologue.get(), false, after_lower_access,
-                                                  autodiff_enabled, lva_config_opt);
-        result_modified |= optimize_offload_block(off->bls_prologue.get(), false, after_lower_access,
-                                                  autodiff_enabled, lva_config_opt);
-        result_modified |= optimize_offload_block(off->body.get(), body_parallel, after_lower_access,
-                                                  autodiff_enabled, lva_config_opt);
-        result_modified |= optimize_offload_block(off->bls_epilogue.get(), false, after_lower_access,
-                                                  autodiff_enabled, lva_config_opt);
-        result_modified |= optimize_offload_block(off->tls_epilogue.get(), false, after_lower_access,
-                                                  autodiff_enabled, lva_config_opt);
+        result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt);
       }
       // TODO: implement cfg->dead_instruction_elimination()
       die(root);  // remove unused allocas across the whole kernel
@@ -107,8 +113,6 @@ bool cfg_optimization(const CompileConfig &config,
 
   auto cfg = analysis::build_cfg(root);
 
-  const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data());
-  bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1";
   if (dump_cfg) {
     std::string suffix = phase.empty() ? "_before_cfg_opt" : ("_" + phase + "_before_cfg_opt");
     cfg->dump_graph_to_file(config, kernel_name, suffix);

From c21e747e507f7ea563931b5e7abc2acdf85a6fd1 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Sun, 7 Jun 2026 18:48:49 -0700
Subject: [PATCH 3/4] Ditch pre-offload cfg_optimization; rely on post-offload
 per-task cfg

Profiling showed the super-linear reaching-definition / store-to-load analyses
run mostly in the pre-offload phase, on the monolithic kernel IR before any
offloaded tasks exist -- so per-task scoping alone barely helped (the
post-offload cfg was already tiny). Under cfg_optimization_per_task, skip the
whole-kernel cfg_optimization entirely when the IR is not yet offloaded (no
OffloadedStmt tasks), keeping only the cheap dead-alloca cleanup, and let the
post-offload per-task cfg perform store-to-load forwarding + dead-store
elimination once tasks exist.

cfg_optimization is an optimization, not a correctness pass, so dropping it
pre-offload is safe; the only thing lost is cross-task forwarding/DSE on the
monolithic IR, which is invalid across separate device launches anyway. CFG
dumping (QD_DUMP_CFG) still forces the whole-kernel path.
---
 quadrants/program/compile_config.h        | 15 +++++-----
 quadrants/transforms/cfg_optimization.cpp | 35 +++++++++++++++--------
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/quadrants/program/compile_config.h b/quadrants/program/compile_config.h
index 2001a93331..aed9315491 100644
--- a/quadrants/program/compile_config.h
+++ b/quadrants/program/compile_config.h
@@ -9,13 +9,14 @@ struct CompileConfig {
   Arch arch;
   bool debug;
   bool cfg_optimization;
-  // When true (default), cfg_optimization scopes its store-to-load forwarding and dead-store elimination to
-  // each offloaded task independently once the kernel has been offloaded, instead of running one whole-kernel
-  // control-flow graph spanning all tasks. Each offloaded task is a separate device launch, so cross-task
-  // store-to-load forwarding of registers is impossible anyway, and global memory is treated conservatively
-  // (live-in and live-out of every task) by the existing CFG boundary seeding -- so the per-task scoping is
-  // semantics-preserving while making the dataflow analyses ~linear in total IR instead of super-linear in the
-  // combined whole-kernel IR. Set false to restore the whole-kernel behaviour.
+  // When true (default), cfg_optimization runs only AFTER offloading and is scoped to each offloaded task
+  // independently (a separate control-flow graph per task) instead of one whole-kernel graph spanning all tasks;
+  // the expensive whole-kernel cfg in the pre-offload phase is ditched entirely. Each offloaded task is a
+  // separate device launch, so cross-task store-to-load forwarding of registers is impossible anyway, and global
+  // memory is treated conservatively (live-in and live-out of every task) by the existing CFG boundary seeding
+  // -- so this is semantics-preserving (cfg_optimization is an optimization, not a correctness pass) while making
+  // the super-linear reaching-definition / forwarding analyses ~linear in total IR instead of super-linear in
+  // the combined monolithic kernel IR. Set false to restore the whole-kernel pre+post-offload behaviour.
   bool cfg_optimization_per_task{true};
   bool check_out_of_bound;
   bool validate_autodiff;
diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp
index 8e676621b3..0b2418fe39 100644
--- a/quadrants/transforms/cfg_optimization.cpp
+++ b/quadrants/transforms/cfg_optimization.cpp
@@ -14,8 +14,8 @@ namespace {
 
 // Collect the top-level offloaded tasks of |root| iff |root| is an already-offloaded kernel body, i.e. a Block
 // whose statements are all OffloadedStmt. Returns an empty vector otherwise (pre-offload IR, function bodies,
-// non-Block roots), in which case the caller falls back to the whole-kernel CFG. This is what makes the
-// per-task path activate only post-offload, where the notion of "an offloaded task" exists.
+// non-Block roots). This is what lets the caller tell "post-offload" (run per-task cfg) from "pre-offload /
+// other" (ditch cfg, under cfg_optimization_per_task), since "an offloaded task" only exists post-offload.
 std::vector<OffloadedStmt *> collect_offloaded_tasks(IRNode *root) {
   std::vector<OffloadedStmt *> tasks;
   auto *block = root->cast<Block>();
@@ -89,26 +89,37 @@ bool cfg_optimization(const CompileConfig &config,
                       const std::string &phase) {
   QD_AUTO_PROF;
 
-  // Per-offloaded-task scoping: once the kernel is offloaded, optimize each task's CFG independently instead of
-  // building one whole-kernel CFG across all tasks. This keeps the (super-linear) dataflow analyses small per
-  // task without changing semantics -- see the comment on CompileConfig::cfg_optimization_per_task and
-  // optimize_one_task above. Skipped for the real-matrix path (which runs no analyses) and for pre-offload IR
-  // (no tasks yet); also skipped when CFG dumping is requested, so QD_DUMP_CFG keeps dumping the whole-kernel
-  // graph. All of these fall through to the whole-kernel path below.
   const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data());
   const bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1";
-  if (config.cfg_optimization_per_task && !real_matrix_enabled && !dump_cfg) {
+
+  // Per-offloaded-task scoping. Once the kernel is offloaded we optimize each task's CFG independently; and we
+  // deliberately DITCH the expensive whole-kernel cfg_optimization in the pre-offload phase, relying on the
+  // post-offload per-task cfg below to do the store-to-load forwarding + dead-store elimination once tasks
+  // exist. The expensive (super-linear) reaching-definition / forwarding analyses otherwise run on the monolithic
+  // pre-offload kernel IR -- where there are no tasks to scope to -- and dominate compile time. cfg_optimization
+  // is an optimization, not a correctness pass, so dropping it pre-offload is safe; the only thing lost is
+  // cross-task forwarding/DSE on the monolithic IR, which is invalid across separate device launches anyway.
+  // QD_DUMP_CFG forces the whole-kernel path so the full graph can still be dumped for debugging.
+  if (config.cfg_optimization_per_task && !dump_cfg) {
     auto tasks = collect_offloaded_tasks(root);
     if (!tasks.empty()) {
-      auto *block = root->as<Block>();
+      // Post-offload: per-task store-to-load forwarding + dead-store elimination (skipped for the real-matrix
+      // path, matching the whole-kernel path which runs no analyses there).
       bool result_modified = false;
-      for (auto *off : tasks) {
-        result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt);
+      if (!real_matrix_enabled) {
+        auto *block = root->as<Block>();
+        for (auto *off : tasks) {
+          result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt);
+        }
       }
       // TODO: implement cfg->dead_instruction_elimination()
       die(root);  // remove unused allocas across the whole kernel
       return result_modified;
     }
+    // Pre-offload IR (no offloaded tasks yet) or a non-offloaded body: ditch the whole-kernel cfg analyses and
+    // keep only the cheap dead-alloca cleanup; the post-offload per-task path will optimize each task later.
+    die(root);
+    return false;
   }
 
   auto cfg = analysis::build_cfg(root);

From 6cfa213f5dd830a2c79f5ec81974decaec0ee5fe Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Sun, 7 Jun 2026 21:22:15 -0700
Subject: [PATCH 4/4] Only ditch pre-offload cfg for compile-pipeline phases
 (fix Half2Vectorization tests)

The previous commit ditched the whole-kernel cfg for ALL non-offloaded IR under
cfg_optimization_per_task, which over-reached: full_simplify is also called on
standalone, never-offloaded blocks (unit tests, function bodies), where its
store-to-load forwarding + dead-store elimination must still run. die() cannot
remove a dead store (stores are side-effecting), so dropping cfg there left a
stray store and regressed Half2Vectorization.{Ndarray,GlobalTemporary,Field}
(each +1 statement).

Restrict the skip to the compile_to_offloads pre-offload phases (simplify_I,
simplify_II, pre_autodiff, post_autodiff) -- the monolithic-kernel calls whose
super-linear cfg dominates compile time and which are redundant because the
post-offload per-task cfg (simplify_III onward) redoes intra-task forwarding/DSE
once tasks exist. Any other caller on non-offloaded IR falls through to the
whole-kernel cfg, restoring the prior behavior. This is a no-op for the qipc
graph kernel (which only hits simplify_I/II pre-offload), so the compile-time
win is unchanged.
---
 quadrants/transforms/cfg_optimization.cpp | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp
index 0b2418fe39..2113b7c523 100644
--- a/quadrants/transforms/cfg_optimization.cpp
+++ b/quadrants/transforms/cfg_optimization.cpp
@@ -116,10 +116,21 @@ bool cfg_optimization(const CompileConfig &config,
       die(root);  // remove unused allocas across the whole kernel
       return result_modified;
     }
-    // Pre-offload IR (no offloaded tasks yet) or a non-offloaded body: ditch the whole-kernel cfg analyses and
-    // keep only the cheap dead-alloca cleanup; the post-offload per-task path will optimize each task later.
-    die(root);
-    return false;
+    // No offloaded tasks yet. Within compile_to_offloads these are the pre-offload full_simplify calls on the
+    // monolithic kernel IR (the phases below, all *before* irpass::offload): their whole-kernel cfg is the
+    // (super-linear) reaching-definition / store-to-load analysis that dominates compile time, and it is
+    // redundant because the post-offload per-task cfg ("simplify_III" onward) redoes the intra-task
+    // store-to-load forwarding + dead-store elimination once tasks exist. So for exactly those phases we ditch
+    // cfg, keeping only the cheap dead-alloca cleanup. For ANY other caller of full_simplify on non-offloaded
+    // IR (unit tests, standalone blocks / function bodies that are never offloaded), we must still run the
+    // whole-kernel cfg below, or its forwarding/DSE would be silently lost -- so we fall through.
+    const bool pre_offload_compile_phase =
+        phase == "simplify_I" || phase == "simplify_II" || phase == "pre_autodiff" || phase == "post_autodiff";
+    if (pre_offload_compile_phase) {
+      die(root);
+      return false;
+    }
+    // else: fall through to the whole-kernel cfg path below.
   }
 
   auto cfg = analysis::build_cfg(root);