
Commit 9ada615

Delay Inductor until we get real input tensors (#2689)
Co-authored-by: Masato Shinokawa <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f8648aa commit 9ada615

File tree

thunder/dynamo/compiler.py
thunder/dynamo/report.py
thunder/dynamo/splitter.py
thunder/dynamo/utils.py
thunder/tests/test_dynamo.py

5 files changed: +69 -64 lines


thunder/dynamo/compiler.py

Lines changed: 0 additions & 2 deletions
@@ -147,8 +147,6 @@ def __call__(self, gm: torch.fx.GraphModule, sample_args: list[torch.SymInt, tor
         split_module, subgraph_info = _splitter(
             gm,
             partial(jit, **thunder_options),
-            torch._inductor.compile,
-            sample_args,
             thunder_options,
         )
         self.subgraph_infos.append(subgraph_info)

thunder/dynamo/report.py

Lines changed: 1 addition & 1 deletion
@@ -1107,7 +1107,7 @@ def foo(x):
             thunder_options[k] = v

         thunder_jit = partial(jit, **thunder_options, nv_save_fake_inputs=True)
-        _, subgraph_info = _splitter(gm, thunder_jit, torch._inductor.compile, _unused_sample_args=None)
+        _, subgraph_info = _splitter(gm, thunder_jit)

         thunder_module_names = [f"{report.graph_name}_{name}" for name in get_thunder_module_names(subgraph_info)]
         original_modules_to_thunder_modules = (

thunder/dynamo/splitter.py

Lines changed: 5 additions & 31 deletions
@@ -3,10 +3,8 @@
 from typing import TYPE_CHECKING
 import copy
 from functools import partial
-import warnings

 import torch
-from torch._subclasses.fake_tensor import DynamicOutputShapeException
 from torch.fx.passes.split_module import split_module

 from thunder.core import baseutils
@@ -16,12 +14,12 @@
     CompilerType,
     SplitReason,
     SplitReasonType,
+    LazyInductorModule,
     is_node_supported_by_thunder,
     get_nodes_in_unsupported_ctx_regions,
     update_node_and_submodule,
     recompile_graph,
     checkpoint_converter,
-    make_fake_arguments,
     _get_example_inputs_from_placeholder,
     _ThunderSplitGraphModule,
     translate_dtensor_ops,
@@ -35,8 +33,6 @@
 def _splitter(
     gm: torch.fx.GraphModule,
     thunder_jit: Callable,
-    torch_inductor: Callable,
-    _unused_sample_args: list[torch.SymInt, torch.Tensor],
     thunder_options: dict[str, Any] | None = None,
 ) -> tuple[torch.fx.GraphModule, SubgraphInfo]:
     """
@@ -226,32 +222,10 @@ def is_thunder_supported_partition(node: torch.fx.Node) -> bool:
         elif node.name.startswith("submod"): # For inductor
             graph_module = getattr(split_gm, node.name)

-            class ModuleWrapper(torch.nn.Module):
-                def __init__(self, fn):
-                    super().__init__()
-                    self.fn = fn
-
-                def forward(self, *args, **kwargs):
-                    return self.fn(*args, **kwargs)
-
-            def fallback_eager(reason: str) -> torch.nn.Module:
-                warnings.warn(f"{reason} Falling back to eager.")
-                # TODO: Use torch.compile here. Investigate its behavior and ensure correctness.
-                return graph_module
-
-            fake_args = make_fake_arguments(graph_module)
-            if fake_args is None:
-                jit_fn = fallback_eager("Example values for arguments are not available.")
-            else:
-                try:
-                    # torch._inductor.compile returns a function, but update_node_and_submodule expects a Module
-                    jit_fn = ModuleWrapper(torch_inductor(graph_module, fake_args))
-                except DynamicOutputShapeException as e:
-                    # This exception is meant to be handled by Dynamo, which is responsible for graph break
-                    jit_fn = fallback_eager(f"Dynamic output shape operator encountered: {e}.")
-
-            # This is for ease of debugging. We add graph attribute so GraphModule.print_readable will print it
-            jit_fn.graph = graph_module.graph
+            fake_mode = torch._guards.detect_fake_mode()
+            # Delay Inductor compilation until invocation with real tensors,
+            # because we do not know the strides of tensors that Thunder-compiled submodules return.
+            jit_fn = LazyInductorModule(graph_module, fake_mode)

             # Update the node name from "submod_*" to "inductor_*" for more user-friendly names
             update_node_and_submodule(split_gm, node, node.name.replace("submod", "inductor"), jit_fn)
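The splitter no longer compiles Inductor submodules eagerly against fake arguments; it hands each one to LazyInductorModule (defined in thunder/dynamo/utils.py below). A minimal standalone sketch of the underlying idea, using illustrative names that are not part of this commit, is:

import torch


class _DeferredInductor(torch.nn.Module):  # hypothetical name, for illustration only
    def __init__(self, graph_module: torch.fx.GraphModule):
        super().__init__()
        self.graph_module = graph_module
        self.compiled_fn = None

    def forward(self, *args):
        if self.compiled_fn is None:
            # torch._inductor.compile returns a callable specialized to these inputs,
            # so compilation happens only once real tensors (and their strides) exist.
            self.compiled_fn = torch._inductor.compile(self.graph_module, list(args))
        return self.compiled_fn(*args)


def _f(x):
    return torch.relu(x) + 1


wrapper = _DeferredInductor(torch.fx.symbolic_trace(_f))
out = wrapper(torch.randn(4, 4))  # Inductor runs here, on the first call with real tensors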

thunder/dynamo/utils.py

Lines changed: 56 additions & 21 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 from collections.abc import Callable, Sequence
+from contextlib import contextmanager
 from enum import Enum, auto
 from typing import TYPE_CHECKING
 import dataclasses
@@ -8,12 +9,15 @@
 from types import NoneType
 from collections import defaultdict
 from collections import namedtuple
+import warnings
+
+from looseversion import LooseVersion

 import torch
 from torch.nn.modules.module import _addindent
 from torch.utils.weak import TensorWeakRef
-from torch._guards import detect_fake_mode
-from torch._subclasses.fake_tensor import FakeTensorMode, FakeTensor
+from torch._guards import tracing, TracingContext
+from torch._subclasses.fake_tensor import DynamicOutputShapeException

 if torch.distributed.is_available():
     from torch.distributed.tensor import DTensor
@@ -154,6 +158,56 @@ def is_thunder_supported_partition(self, node: torch.fx.Node) -> bool:
         return node.name.startswith("submod") and int(node.name.replace("submod_", "")) in self.supported_indexes


+class LazyInductorModule(torch.nn.Module):
+    def __init__(self, graph_module, fake_mode):
+        super().__init__()
+        self.graph_module = graph_module
+        self.compiled_fn = None
+        self.fake_mode = fake_mode
+
+        # For ease of debugging, we add graph attribute so GraphModule.print_readable will print it
+        self.graph = graph_module.graph
+
+    # TODO: Remove this once we drop support for PyTorch 2.7.x
+    @contextmanager
+    def _maybe_patch_increment_toplevel(self):
+        # In PyTorch before 2.8.0, FXGraphCache assumes that it is run behind Dynamo
+        # and tries to update metrics_context.
+        # See https://github.com/pytorch/pytorch/pull/150423
+        if LooseVersion(torch.__version__) >= LooseVersion("2.8.0"):
+            yield
+            return
+
+        from torch._dynamo.utils import CompileEventLogger
+
+        def fake_increment_toplevel(*args, **kwargs):
+            metrics_context = torch._dynamo.utils.get_metrics_context()
+            assert not metrics_context.in_progress()
+            return
+
+        original = CompileEventLogger.increment_toplevel
+        CompileEventLogger.increment_toplevel = fake_increment_toplevel
+        try:
+            yield
+        finally:
+            CompileEventLogger.increment_toplevel = original
+
+    def forward(self, *args):
+        if self.compiled_fn is None:
+            with self._maybe_patch_increment_toplevel():
+                # Inductor needs fake_mode, particularly its shape_env, to handle SymInts
+                with tracing(TracingContext(fake_mode=self.fake_mode)):
+                    try:
+                        self.compiled_fn = torch._inductor.compile(self.graph_module, args)
+                    except DynamicOutputShapeException as e:
+                        # This exception is meant to be handled by Dynamo, which is responsible for graph break
+                        # TODO: Use torch.compile for fallback. Ensure its correctness.
+                        warnings.warn(f"Dynamic output shape operator encountered: {e}. Falling back to eager.")
+                        self.compiled_fn = self.graph_module
+
+        return self.compiled_fn(*args)
+
+
 @dataclasses.dataclass()
 class ProfileStats:
     """
@@ -1064,25 +1118,6 @@ def get_compiled_fn_and_timing(report, compile_fn, timer_fn):
     return sorted_compiled_gm_to_measurement[0].compiled_fn


-def make_fake_arguments(gm: torch.fx.GraphModule) -> list[FakeTensor] | None:
-    fake_mode = detect_fake_mode()
-    if fake_mode is None:
-        fake_mode = FakeTensorMode()
-    args = []
-    for node in gm.graph.nodes:
-        if node.op == "placeholder":
-            meta_val = node.meta.get("example_value")
-            if meta_val is None:
-                # We observed Dynamo creating nodes without `example_value` on Tensor.tolist().
-                # This no longer happens in PyTorch 2.10 (see https://github.com/pytorch/pytorch/pull/163807).
-                return None
-            if isinstance(meta_val, torch.Tensor):
-                # Tie to the currently enabled fake mode
-                meta_val = fake_mode.fake_tensor_converter.from_real_tensor(fake_mode, meta_val)
-            args.append(meta_val)
-    return args
-
-
 def translate_dtensor_ops(gm: torch.fx.GraphModule) -> None:
     # We need this function because:
     #
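_maybe_patch_increment_toplevel is an instance of a common temporary-monkeypatch pattern: swap an attribute for the duration of a block, restore it in a finally clause, and skip the patch entirely on PyTorch versions that do not need the workaround. A generic sketch of that pattern, with hypothetical names, might look like:

from contextlib import contextmanager

import torch
from looseversion import LooseVersion


@contextmanager
def _maybe_patch(obj, attr_name, replacement, fixed_in="2.8.0"):
    # Skip the workaround entirely on versions where it is no longer needed.
    if LooseVersion(torch.__version__) >= LooseVersion(fixed_in):
        yield
        return

    original = getattr(obj, attr_name)
    setattr(obj, attr_name, replacement)
    try:
        yield
    finally:
        # Restore the original attribute even if the body raised.
        setattr(obj, attr_name, original)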

thunder/tests/test_dynamo.py

Lines changed: 7 additions & 9 deletions
@@ -998,7 +998,7 @@ def forward(self, x):
     n = gm.graph.find_nodes(op="output")
     gm.graph.erase_node(n[0])

-    _, subgraph_info = thunder.dynamo.splitter._splitter(gm, thunder.jit, torch._inductor.compile, [])
+    _, subgraph_info = thunder.dynamo.splitter._splitter(gm, thunder.jit)
     original_split_gm = subgraph_info.original_split_graph_module.split_graph_module
     assert original_split_gm.graph.find_nodes(op="output")
     for subm in original_split_gm.children():
@@ -1768,22 +1768,21 @@ def forward(self, x):
     torch.testing.assert_close(org_m.fc.weight.grad, thunder_m.fc.weight.grad)


-def test_thunderfx_node_with_no_example_value():
+def test_thunderfx_tolist():
     def test_fn(x):
         y = x + 10
         z = y.tolist()[0]
         return z + 2

     x = torch.tensor([1, 2, 3, 4, 5])
     # Without this patch, tolist() would cause graph break. See https://github.com/pytorch/pytorch/pull/163807
-    with patch("torch._dynamo.config.capture_scalar_outputs", True):
-        with pytest.warns(match="Example values for arguments are not available"):
-            actual = thunderfx(test_fn)(x)
+    with torch._dynamo.config.patch(capture_scalar_outputs=True):
+        actual = thunderfx(test_fn)(x)
     expected = test_fn(x)
     torch.testing.assert_close(actual, expected)


-def test_thunderfx_no_example_value_and_autocast():
+def test_thunderfx_tolist_autocast():
     def fn(x):
         with torch.autocast("cpu"):
             y = x + 10
@@ -1792,9 +1791,8 @@ def fn(x):

     x = torch.tensor([1, 2, 3, 4, 5])
     # Without this patch, tolist() would cause graph break. See https://github.com/pytorch/pytorch/pull/163807
-    with patch("torch._dynamo.config.capture_scalar_outputs", True):
-        with pytest.warns(match="Example values for arguments are not available"):
-            actual = thunderfx(fn)(x)
+    with torch._dynamo.config.patch(capture_scalar_outputs=True):
+        actual = thunderfx(fn)(x)
     expected = fn(x)
     torch.testing.assert_close(actual, expected)
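The tests now rely on torch._dynamo.config.patch, Dynamo's own context manager for temporarily overriding a config flag and restoring it on exit. A small self-contained sketch of that usage, independent of the tests above, is:

import torch


def fn(x):
    # .item() produces a scalar output; capturing it avoids a graph break under compile
    return x.sum().item() + 2.0


x = torch.ones(3)
with torch._dynamo.config.patch(capture_scalar_outputs=True):
    actual = torch.compile(fn)(x)
assert actual == fn(x)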
