Bug fix and optimisation for persistent reduction kernel tuning (#2596)

jataylo · web-flow · commit db3ba6678eac · 2025-09-04T14:13:32.000-05:00
Original PR (#2417) had incorrect indentation. Updated PR such that autotune will always add tiny configs, otherwise use the hinted configs only. Tested locally on test_torchinductor: Ran 894 tests in 952.242s FAILED (failures=1, skipped=28) And completed autotune runs for microbench models Microbenchmark for network : resnet152 Num devices: 1 Dtype: FP32 Mini batch size [img] : 64 Time per mini-batch : 0.09107530117034912 Throughput [img/sec] : 702.7152167226226
diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
@@ -2595,20 +2595,20 @@ def _persistent_reduction_configs(
         elif reduction_hint == ReductionHint.OUTER:
             configs = configs[-1:]
 
-    if reduction_hint == ReductionHint.OUTER_TINY:
-        tiny_configs = [
-            triton_config_reduction(
-                size_hints,
-                2 * (256 // rnumel) if rnumel <= 256 else 1,
-                rnumel,
-            )
-        ]
-        if max_autotune_enabled:
-            for tconfig in tiny_configs:
-                if tconfig not in configs:
-                    configs.append(tconfig)
-            else:
-                configs = tiny_configs
+    tiny_configs = [
+        triton_config_reduction(
+            size_hints,
+            2 * (256 // rnumel) if rnumel <= 256 else 1,
+            rnumel,
+        )
+    ]
+
+    if max_autotune_enabled:
+        for conf in tiny_configs:
+            if conf not in configs:
+                configs.append(conf)
+    elif reduction_hint == ReductionHint.OUTER_TINY:
+        configs = tiny_configs
 
     for c in configs:
         # we don't need Rn_BLOCK for persistent reduction