Avoid PyTensor function overhead in OpFromGraph

ricardoV94 · ricardoV94 · commit 95b0cc40d44b · 2025-04-19T17:05:07.000+02:00
Also provide a pure C implementation when all inner Ops allow it.
diff --git a/pytensor/compile/builders.py b/pytensor/compile/builders.py
@@ -871,6 +871,52 @@ def clone(self):
         res.fgraph = res.fgraph.clone()
         return res
 
+    def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
+        from pytensor.link.c.basic import CLinker
+        from pytensor.link.vm import VMLinker
+
+        # FIXME: self.fn is accessed here only to obtain the optimized inner fgraph; avoid that.
+        fg = self.fn.maker.fgraph
+        # Possible alternative (not enabled): rewrite the inner graph directly, i.e.
+        #   fg = self.fgraph
+        #   get_default_mode().optimizer(fg)
+        fg_no_recycling = [  # inner outputs whose matching outer output is in no_recycling
+            new_o
+            for (new_o, old_o) in zip(fg.outputs, node.outputs, strict=True)
+            if old_o in no_recycling
+        ]
+
+        node_input_storage = [storage_map[r] for r in node.inputs]
+        node_output_storage = [storage_map[r] for r in node.outputs]
+
+        def create_thunk(linker):  # link the inner fgraph against the outer node's storage
+            linker.accept(fg, no_recycling=fg_no_recycling)
+            thunk, _, _ = linker.make_thunk(
+                input_storage=node_input_storage, output_storage=node_output_storage
+            )
+
+            if isinstance(linker, VMLinker):
+                # VMs complain if a non-lazy thunk returns anything,
+                # so wrap it in a function that discards the result and returns None.
+                def thunk_without_returns():
+                    thunk()
+
+                return thunk_without_returns
+
+            return thunk
+
+        if impl != "py":
+            try:
+                # Prefer CLinker: it generates code for the whole graph, which the compiler can reason about,
+                # whereas VMLinker compiles each node separately and calls them from a pre-defined VM.
+                # CLinker also has less overhead.
+                return create_thunk(linker=CLinker())
+            except NotImplementedError:
+                # Some Op has no C implementation: fall back to a VMLinker (C loop and C thunks enabled).
+                return create_thunk(linker=VMLinker(use_cloop=True, c_thunks=True))
+        else:  # impl == "py": VMLinker with C loop and C thunks disabled
+            return create_thunk(VMLinker(use_cloop=False, c_thunks=False))
+
     def perform(self, node, inputs, outputs):
         variables = self.fn(*inputs)
         assert len(variables) == len(outputs)
diff --git a/tests/compile/test_builders.py b/tests/compile/test_builders.py
@@ -4,6 +4,7 @@
 import pytest
 
 import pytensor.tensor as pt
+from pytensor import scan
 from pytensor.compile import shared
 from pytensor.compile.builders import OpFromGraph
 from pytensor.compile.function import function
@@ -740,3 +741,59 @@ def test_debugprint():
 
     for truth, out in zip(exp_res.split("\n"), lines, strict=True):
         assert truth.strip() == out.strip()
+
+
+@pytest.mark.parametrize("kind", ("ofg", "inlined", "scan"))
+@pytest.mark.parametrize("mode", ("fast_compile", "fast_run"))
+@pytest.mark.parametrize("c_op", (True, False), ids=lambda x: f"c_op={x}")
+def test_benchmark(c_op, mode, kind, benchmark):
+    n = 25  # number of chained applications of _f
+
+    if c_op:
+
+        def _f(x):  # exp followed by normalization; works on NumPy arrays and symbolic inputs
+            if isinstance(x, np.ndarray):
+                y = np.exp(x)
+            else:
+                y = pt.exp(x)
+            y /= y.sum()
+            return y
+    else:
+
+        def _f(x):  # sort; works on NumPy arrays and symbolic inputs
+            if isinstance(x, np.ndarray):
+                return np.sort(x)
+            else:
+                return pt.sort(x)
+
+    x = pt.vector("x")
+
+    if kind == "ofg":
+        f = OpFromGraph([x], [_f(x)])
+    else:
+        f = _f
+
+    if kind == "scan":
+        # Scan is included as a reference for how bad the overhead can be
+        outs, _ = scan(fn=f, outputs_info=[x], n_steps=n)
+        out = outs[-1]
+    else:
+        out = x
+        for i in range(n):
+            out = f(out)
+
+    compiled_fn = function([x], out, trust_input=True, mode=mode)
+    compiled_fn.dprint(print_memory_map=True)  # print the compiled graph for inspection
+    compiled_fn.vm.allow_gc = (
+        False  # Disable GC for fairness to the default VM: the OFG inner VM does not do GC
+    )
+
+    rng = np.random.default_rng(1)
+    x_test = rng.normal(size=(10,))
+
+    res = benchmark(compiled_fn, x_test)
+
+    expected_res = x_test  # NumPy reference: apply _f n times
+    for i in range(n):
+        expected_res = _f(expected_res)
+    np.testing.assert_allclose(res, expected_res)