diff --git a/quadrants/program/compile_config.cpp b/quadrants/program/compile_config.cpp
index 5034fd1822..71e44b378d 100644
--- a/quadrants/program/compile_config.cpp
+++ b/quadrants/program/compile_config.cpp
@@ -21,6 +21,7 @@ CompileConfig::CompileConfig() {
   max_vector_width = 8;
   debug = false;
   cfg_optimization = true;
+  cfg_optimization_per_task = true;
   check_out_of_bound = false;
   serial_schedule = false;
   simplify_before_lower_access = true;
diff --git a/quadrants/program/compile_config.h b/quadrants/program/compile_config.h
index 92c09a5fd8..aed9315491 100644
--- a/quadrants/program/compile_config.h
+++ b/quadrants/program/compile_config.h
@@ -9,6 +9,15 @@ struct CompileConfig {
   Arch arch;
   bool debug;
   bool cfg_optimization;
+  // When true (default), cfg_optimization on offloaded IR builds a separate control-flow graph per offloaded task
+  // instead of one whole-kernel graph, and the expensive whole-kernel cfg is skipped in the pre-offload compile
+  // phases (other non-offloaded callers and CFG-dump runs still take the whole-kernel path). Each offloaded task
+  // is a separate device launch, so cross-task store-to-load forwarding of registers is impossible anyway, and
+  // global memory is treated conservatively (live-in and live-out of every task) by the existing CFG boundary
+  // seeding -- so this is semantics-preserving (cfg_optimization is an optimization, not a correctness pass)
+  // while making the super-linear reaching-definition / forwarding analyses ~linear in total IR instead of
+  // super-linear in the monolithic kernel IR. Set false to restore whole-kernel pre+post-offload behaviour.
+  bool cfg_optimization_per_task{true};
   bool check_out_of_bound;
   bool validate_autodiff;
   int simd_width;
diff --git a/quadrants/python/export_lang.cpp b/quadrants/python/export_lang.cpp
index a440e2a798..38934dcdde 100644
--- a/quadrants/python/export_lang.cpp
+++ b/quadrants/python/export_lang.cpp
@@ -164,6 +164,7 @@ void export_lang(py::module &m) {
       .def_readwrite("print_ir_dbg_info", &CompileConfig::print_ir_dbg_info)
       .def_readwrite("debug", &CompileConfig::debug)
       .def_readwrite("cfg_optimization", &CompileConfig::cfg_optimization)
+      .def_readwrite("cfg_optimization_per_task", &CompileConfig::cfg_optimization_per_task)
       .def_readwrite("check_out_of_bound", &CompileConfig::check_out_of_bound)
       .def_readwrite("print_accessor_ir", &CompileConfig::print_accessor_ir)
       .def_readwrite("use_llvm", &CompileConfig::use_llvm)
diff --git a/quadrants/transforms/cfg_optimization.cpp b/quadrants/transforms/cfg_optimization.cpp
index 66b4a14067..2113b7c523 100644
--- a/quadrants/transforms/cfg_optimization.cpp
+++ b/quadrants/transforms/cfg_optimization.cpp
@@ -1,4 +1,5 @@
 #include "quadrants/ir/ir.h"
+#include "quadrants/ir/statements.h"
 #include "quadrants/ir/control_flow_graph.h"
 #include "quadrants/ir/transforms.h"
 #include "quadrants/ir/analysis.h"
@@ -8,6 +9,76 @@
 namespace quadrants::lang {
 
 namespace irpass {
+
+namespace {
+
+// Returns the top-level offloaded tasks of |root| when |root| is an already-offloaded kernel body, i.e. a
+// non-empty Block made up solely of OffloadedStmt. Any other root (pre-offload IR, function bodies, non-Block
+// nodes, empty blocks) yields an empty vector. A non-empty result is how the caller recognises "post-offload"
+// IR and picks the per-task cfg path, since offloaded tasks only exist once offloading has run.
+std::vector<OffloadedStmt *> collect_offloaded_tasks(IRNode *root) {
+  auto *body = root->cast<Block>();
+  if (body == nullptr) {
+    return {};
+  }
+  std::vector<OffloadedStmt *> offloads;
+  offloads.reserve(body->statements.size());
+  for (auto &entry : body->statements) {
+    if (!entry->is<OffloadedStmt>()) {
+      // A single non-offloaded statement disqualifies the whole body.
+      return {};
+    }
+    offloads.push_back(entry->as<OffloadedStmt>());
+  }
+  return offloads;
+}
+
+// Runs the CFG optimizations (store-to-load forwarding, dead-store elimination) on ONE offloaded task only.
+//
+// Mechanics: |off| is lifted out of |parent| into a temporary wrapper Block, a CFG is built from that wrapper
+// via the ordinary Block -> OffloadedStmt construction, and |off| is then put back at its original index, so
+// the shape of the IR is unchanged. Going through a wrapper rather than building one CFG per sub-block is
+// deliberate: the graph comes out as exactly the slice the whole-kernel CFG would contain for this task --
+// with the implicit-loop `continue` edges of the offloaded for-body (wired by visit(OffloadedStmt), not by
+// visit(Block)), the prologue/body/epilogue chaining, and the body's is_parallel_executed flag. Per-sub-block
+// graphs would lose the `continue` loop-back edges and incorrectly dead-store-eliminate a global store that
+// sits before a `continue` (the regression test_cfg_continue guards against).
+//
+// Why a single-task scope is safe: every offloaded task is its own device launch, and the CFG boundary seeding
+// is conservative at that boundary. reaching_definition_analysis seeds the start node with all global
+// pointers ("may already hold data"); live_variable_analysis seeds the final node with all global store
+// destinations ("may be read later"). Hence all global addresses -- fields, external tensors, and the
+// global-temporary buffer carrying scalars between tasks -- count as live-in and live-out of the task: no
+// store a sibling task may read is removed, and nothing is forwarded across a task (device-launch) boundary.
+//
+// Returns true iff at least one of the two analyses reported a modification.
+bool optimize_one_task(Block *parent,
+                       OffloadedStmt *off,
+                       bool after_lower_access,
+                       bool autodiff_enabled,
+                       const std::optional<ControlFlowGraph::LiveVarAnalysisConfig> &lva_config_opt) {
+  const int location = parent->locate(off);
+  QD_ASSERT(location != -1);
+  Block scratch;
+  scratch.insert(parent->extract(off));
+  bool changed = false;
+  {
+    // The graph stores raw pointers into |scratch| and into the task's sub-blocks, so it is confined to this
+    // scope and destroyed before the task is handed back to |parent|.
+    auto graph = analysis::build_cfg(&scratch);
+    graph->simplify_graph();
+    const bool forwarded = graph->store_to_load_forwarding(after_lower_access, autodiff_enabled);
+    const bool eliminated = graph->dead_store_elimination(after_lower_access, lva_config_opt);
+    changed = forwarded || eliminated;
+  }
+  // NOTE(review): should an analysis above throw, |off| would be destroyed together with |scratch| instead of
+  // being restored -- confirm that callers discard the IR on failure.
+  parent->insert(scratch.extract(off), location);
+  return changed;
+}
+
+}  // namespace
+
 bool cfg_optimization(const CompileConfig &config,
                       IRNode *root,
                       bool after_lower_access,
@@ -17,10 +88,53 @@ bool cfg_optimization(const CompileConfig &config,
                       const std::string &kernel_name,
                       const std::string &phase) {
   QD_AUTO_PROF;
-  auto cfg = analysis::build_cfg(root);
 
   const char *dump_cfg_env = std::getenv(DUMP_CFG_ENV.data());
-  bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1";
+  const bool dump_cfg = dump_cfg_env != nullptr && std::string(dump_cfg_env) == "1";
+
+  // Per-offloaded-task scoping. Once the kernel is offloaded we optimize each task's CFG independently; and we
+  // deliberately DITCH the expensive whole-kernel cfg_optimization in the pre-offload phase, relying on the
+  // post-offload per-task cfg below to do the store-to-load forwarding + dead-store elimination once tasks
+  // exist. The expensive (super-linear) reaching-definition / forwarding analyses otherwise run on the monolithic
+  // pre-offload kernel IR -- where there are no tasks to scope to -- and dominate compile time. cfg_optimization
+  // is an optimization, not a correctness pass, so dropping it pre-offload is safe; the only thing lost is
+  // cross-task forwarding/DSE on the monolithic IR, which is invalid across separate device launches anyway.
+  // QD_DUMP_CFG forces the whole-kernel path so the full graph can still be dumped for debugging.
+  if (config.cfg_optimization_per_task && !dump_cfg) {
+    auto tasks = collect_offloaded_tasks(root);
+    if (!tasks.empty()) {
+      // Post-offload: per-task store-to-load forwarding + dead-store elimination (skipped for the real-matrix
+      // path, matching the whole-kernel path which runs no analyses there).
+      bool result_modified = false;
+      if (!real_matrix_enabled) {
+        auto *block = root->as<Block>();
+        for (auto *off : tasks) {
+          result_modified |= optimize_one_task(block, off, after_lower_access, autodiff_enabled, lva_config_opt);
+        }
+      }
+      // TODO: implement cfg->dead_instruction_elimination()
+      die(root);  // remove unused allocas across the whole kernel
+      return result_modified;
+    }
+    // No offloaded tasks yet. Within compile_to_offloads these are the pre-offload full_simplify calls on the
+    // monolithic kernel IR (the phases below, all *before* irpass::offload): their whole-kernel cfg is the
+    // (super-linear) reaching-definition / store-to-load analysis that dominates compile time, and it is
+    // redundant because the post-offload per-task cfg ("simplify_III" onward) redoes the intra-task
+    // store-to-load forwarding + dead-store elimination once tasks exist. So for exactly those phases we ditch
+    // cfg, keeping only the cheap dead-alloca cleanup. For ANY other caller of full_simplify on non-offloaded
+    // IR (unit tests, standalone blocks / function bodies that are never offloaded), we must still run the
+    // whole-kernel cfg below, or its forwarding/DSE would be silently lost -- so we fall through.
+    const bool pre_offload_compile_phase =
+        phase == "simplify_I" || phase == "simplify_II" || phase == "pre_autodiff" || phase == "post_autodiff";
+    if (pre_offload_compile_phase) {
+      die(root);
+      return false;
+    }
+    // else: fall through to the whole-kernel cfg path below.
+  }
+
+  auto cfg = analysis::build_cfg(root);
+
   if (dump_cfg) {
     std::string suffix = phase.empty() ? "_before_cfg_opt" : ("_" + phase + "_before_cfg_opt");
     cfg->dump_graph_to_file(config, kernel_name, suffix);