fix: correct output buffer sizing — use per-thread cap, not total

claude · claude · commit 68b61d99a5eb · 2026-03-08T02:51:10.000+08:00
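output_cap was set to n*64 (the total entry count across all threads), but
the kernel indexes the output buffer as tid*output_cap, i.e. it treats the
value as a per-thread cap. That made the buffer n*n*64*5 uint32s (~128GB at
n=10000). The cap is now 64 entries per thread, so the buffer is n*64*5
uint32s = 12.8MB at n=10000.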

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/emojiasm/gpu.py b/emojiasm/gpu.py
@@ -431,13 +431,11 @@ def gpu_run(
     # Conservative estimate: allow up to 64 output entries per thread.
     is_tier2 = tier == 2
     if is_tier2:
-        output_cap = n * 64
+        max_out_per_thread = 64  # max output entries per thread
     else:
-        output_cap = 0
+        max_out_per_thread = 0
 
-    # Max output entries per thread for Tier 2
-    max_out_per_thread = output_cap
-    output_cap_array = mx.array([output_cap], dtype=mx.uint32)
+    output_cap_array = mx.array([max_out_per_thread], dtype=mx.uint32)
 
     # Get (cached) kernel
     kernel = _get_kernel()