kavanaghpatrick · kavanaghpatrick · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/emojiasm/bytecode.py b/emojiasm/bytecode.py
@@ -16,7 +16,7 @@
     Tier 2: Numeric + output (PRINT/PRINTLN but no INPUT; GPU with output buffer)
     Tier 3: Full features (INPUT/strings require CPU fallback)
 
-Max stack depth capped at 128 entries in device memory (KB #147).
+Max stack depth capped at 256 entries in device memory (KB #147).
 """
 
 from __future__ import annotations
@@ -88,7 +88,7 @@
 _MAX_OPERAND = (1 << 24) - 1
 
 # Maximum stack depth allowed on GPU (KB #147)
-_GPU_MAX_STACK = 128
+_GPU_MAX_STACK = 256
 
 
 # ── Output dataclass ─────────────────────────────────────────────────────
@@ -299,7 +299,7 @@ def _analyze_max_stack_depth(program: Program) -> int:
     """Conservative max stack depth via instruction walk.
 
     Walks each function linearly (ignoring branches — conservative because
-    we take the max across all instruction positions). Caps at 128 per
+    we take the max across all instruction positions). Caps at 256 per
     KB #147 GPU memory budget.
     """
     max_depth = 0

diff --git a/emojiasm/gpu.py b/emojiasm/gpu.py
@@ -20,7 +20,7 @@
 
 # ── Constants ────────────────────────────────────────────────────────────
 
-DEFAULT_STACK_DEPTH = 128
+DEFAULT_STACK_DEPTH = 256
 DEFAULT_MAX_STEPS = 1_000_000
 DEFAULT_THREADGROUP_SIZE = 256
 

diff --git a/emojiasm/metal/vm.metal b/emojiasm/metal/vm.metal
@@ -91,10 +91,10 @@ constant uint32_t STATUS_TIMEOUT    = 3;
 // ── Fixed-size limits ───────────────────────────────────────────────────
 
 // Call stack depth (thread-local, small enough for registers per KB #146)
-constant int CALL_STACK_DEPTH = 16;
+constant int CALL_STACK_DEPTH = 32;
 
 // Memory cells per thread (thread-local array)
-constant int NUM_MEMORY_CELLS = 32;
+constant int NUM_MEMORY_CELLS = 128;
 
 // ── Output buffer entry (Tier 2 output capture) ────────────────────────
 

diff --git a/emojiasm/transpiler.py b/emojiasm/transpiler.py
@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 import ast
-from .opcodes import Op
+from .opcodes import Op, EMOJI_TO_OP
 from .parser import Program, Function, Instruction
 from .disasm import disassemble
 
@@ -31,21 +31,87 @@ def __init__(self, message: str, lineno: int = 0):
         super().__init__(f"TranspileError{loc}: {message}")
 
 
-# Emoji pool for variable memory cells (50 characters)
+# Emoji pool for variable memory cells (200+ characters)
 EMOJI_POOL = list(
+    # Original 50 (backward compatible, do not reorder)
     "🔢📊🎯⭐🌟💎🔥🌊🌈🍎"
     "🍊🍋🍇🍓🍒🥝🥑🌽🥕🍄"
     "🐱🐶🐸🦊🐻🐼🐨🐯🦁🐮"
     "🐷🐵🐔🐧🦅🦆🦉🐝🐛🦋"
     "🌻🌺🌸🌼🌹🍀🌿🌴🌵🎄"
+    # Animals (additional)
+    "🐭🐹🐰🐴🐗🐺🦄🐌🐞🐜"
+    "🐢🐍🐙🐠🐟🐬🐳🐋🐊🐅"
+    "🐆🐘🐪🐫🐃🐂🐄🐎🐖🐏"
+    "🐑🐐🐕🐩🐇🐁🐀🐿🦔🦇"
+    "🦎🦖🦕🦑🦞🦀🐡🦈🦍🦧"
+    "🦛🦏🦒🦘🦙🦌🦃🦚🦜🦢"
+    "🦩🦝🦨🦡🦫🦦🦥🐈🐓🦗"
+    # Food and drink
+    "🍐🍌🍉🍈🍑🥭🍍🥥🍅🍆"
+    "🥦🥬🥒🧄🧅🥔🍠🥐🥯🍞"
+    "🥖🥨🧀🥚🍳🧈🥞🧇🥓🥩"
+    "🍗🍖🍔🍟🍕🥪🥙🧆🥗🥘"
+    "🥫🍝🍜🍲🍛🍣🍱🥟🍤🍙"
+    "🍚🍘🍥🥠🥮🍢🍡🍧🍨🍦"
+    "🥧🧁🍰🎂🍮🍭🍬🍫🍿🍩"
+    "🍪🌰🥜🍯"
+    # Sports and activities
+    "⚽🏀🏈⚾🥎🎾🏐🏉🥏🎱"
+    "🏓🏸🏒🏑🥍🏏🥅🏹🎣🥊"
+    "🥋🎽🛹🛷🥌🎿🏂"
+    # Vehicles and transport
+    "🚗🚕🚙🚌🏎🚓🚑🚒🚐🛻"
+    "🚚🚛🚜🏍🛵🚲🛴🚔🚍🚘"
+    "🚖🚡🚠🚟🚃🚋🚞🚝🚄🚅"
+    "🚈🚂🚆🛶🚤🛥🚢"
 )
 
-# Emoji pool for function names
+# Emoji pool for function names (50+ characters)
 FUNC_EMOJI_POOL = list(
+    # Original entries (backward compatible, do not reorder)
     "🔲🔳🟥🟦🟩🟨🟧🟪🟫⬛"
     "⬜❤️💙💚💛🧡💜🤎🖤🤍"
+    # Colored circles and shapes
+    "🔴🟠🟡🟢🔵🟣🟤⚫⚪🔶"
+    "🔷🔸🔹🔺🔻💠🔘"
+    # Zodiac and symbols
+    "♈♉♊♋♌♍♎♏♐♑♒♓⛎"
 )
 
+def _validate_emoji_pools() -> None:
+    """Check emoji pools for duplicates and cross-pool collisions.
+
+    Called at module load time. Raises RuntimeError if:
+    - FUNC_EMOJI_POOL has duplicate entries
+    - EMOJI_POOL has duplicate entries
+    - FUNC_EMOJI_POOL and EMOJI_POOL overlap
+
+    Note: Collisions with opcode emoji (EMOJI_TO_OP) are acceptable because
+    the parser distinguishes opcode context from STORE/LOAD memory cell names.
+    """
+    func_dupes = len(FUNC_EMOJI_POOL) - len(set(FUNC_EMOJI_POOL))
+    if func_dupes:
+        raise RuntimeError(
+            f"FUNC_EMOJI_POOL has {func_dupes} duplicate entries"
+        )
+
+    var_dupes = len(EMOJI_POOL) - len(set(EMOJI_POOL))
+    if var_dupes:
+        raise RuntimeError(
+            f"EMOJI_POOL has {var_dupes} duplicate entries"
+        )
+
+    overlap = set(FUNC_EMOJI_POOL) & set(EMOJI_POOL)
+    if overlap:
+        raise RuntimeError(
+            f"FUNC_EMOJI_POOL and EMOJI_POOL overlap: {overlap}"
+        )
+
+
+_validate_emoji_pools()
+
+
 # Operator mappings
 _BINOP_MAP = {
     ast.Add: Op.ADD,

diff --git a/specs/tier4-capacity/.progress.md b/specs/tier4-capacity/.progress.md
@@ -0,0 +1,37 @@
+# tier4-capacity
+
+## Original Goal
+
+Implement EmojiASM issue #30: Tier 4 VM and GPU kernel capacity limits. Raise the hard limits that constrain what transpiled Python programs can do. Expand GPU memory cells (32->128+), variable pool (50->200+), function pool (20->50+), GPU call stack depth (16->32+), GPU stack depth (128->256+). These are hard walls that cause silent failures or crashes.
+
+## Completed Tasks
+- [x] 1.1 Increase GPU memory cells from 32 to 128
+- [x] 1.2 Increase GPU call stack depth from 16 to 32
+- [x] 1.3 Increase GPU stack depth from 128 to 256
+- [x] 1.4 Expand variable emoji pool from 50 to 200+
+- [x] 1.5 Expand function emoji pool from 20 to 50+
+- [x] 2.1 Update docstrings and comments for new limits
+- [x] 2.2 Add collision validation utility
+- [x] 3.1 Test expanded variable pool limits
+- [x] 3.2 Test expanded function pool and GPU kernel limits
+
+## Current Task
+Awaiting next task
+
+## Learnings
+
+- Existing `EMOJI_POOL` has collisions with reserved opcodes: `🔢` (MOD) and `📊` (DIRECTIVE_DATA). These work because transpiler uses them as STORE/LOAD args (memory cell names), not as instruction opcodes in source text. Parser context distinguishes them.
+- KB #147 confirms 256-entry stack feasible on Apple Silicon GPU (1KB device memory per thread).
+- KB #185 estimates ~6.5KB/thread at current sizes. With increases, ~9.7KB/thread -- still under 10KB budget.
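+- `FUNC_EMOJI_POOL` originally had 21 entries (not 20 as stated in the issue) because `❤️` is two code points (U+2764 followed by the variation selector U+FE0F), and `list()` splits it into two separate entries instead of one per visible glyph in the 10-per-line grouping.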
+- `stack_depth` is already parameterized as a kernel argument in `gpu.py` -- only the default needs changing. Memory cells and call stack are compile-time Metal constants.
+- The `_split_kernel_source()` function in `gpu.py` patches scalar reference parameters to pointer dereferences for MLX compatibility -- no changes needed for our constant-only changes.
+- `_build_memory_map()` in `bytecode.py` maps emoji cell names to integer indices 0..N-1 -- the 24-bit operand field supports up to 16M cells, so 128 is no concern.
+- Two existing tests (`test_capped_at_128` in test_bytecode.py and `test_default_stack_depth` in test_gpu_kernel.py) hardcoded old constant values and needed updating along with the constant changes.
+- Expanded EMOJI_POOL to 258 entries (animals, food, sports, vehicles). Only 1 opcode collision (`🔢` = MOD), which is expected and harmless. Python `list()` splits a string by code point, so each single-codepoint emoji becomes exactly one pool entry regardless of its UTF-8 byte length.
+- Expanded FUNC_EMOJI_POOL to 51 entries using colored circles/shapes (🔴🟠🟡🟢🔵🟣🟤⚫⚪🔶🔷🔸🔹🔺🔻💠🔘) and zodiac symbols (♈♉♊♋♌♍♎♏♐♑♒♓⛎). All are single-codepoint emoji safe for list() splitting. No collisions with EMOJI_POOL or opcodes.
+- Only bytecode.py had stale "128" references (module docstring and `_analyze_max_stack_depth` docstring). transpiler.py comments were already updated in tasks 1.4/1.5. vm.metal comments don't reference specific old numeric values.
+- `_validate_emoji_pools()` runs at module load time, checking for duplicates within each pool and cross-pool overlap. Opcode collisions are intentionally allowed since parser context distinguishes them.
+
+## Next
+Task 1.6: POC Checkpoint -- verify all existing tests pass
diff --git a/specs/tier4-capacity/design.md b/specs/tier4-capacity/design.md
@@ -0,0 +1,134 @@
+---
+spec: tier4-capacity
+phase: design
+created: 2026-03-08
+generated: auto
+---
+
+# Design: tier4-capacity
+
+## Overview
+
+Increase five hard-coded capacity limits across three layers: Metal kernel constants, Python bytecode compiler caps, and transpiler emoji pools. No architectural changes -- just constant adjustments and expanding string lists.
+
+## Architecture
+
+```
+Transpiler (transpiler.py)          Bytecode (bytecode.py)           GPU Kernel (vm.metal)
+  EMOJI_POOL: 50 -> 200+             _GPU_MAX_STACK: 128 -> 256       NUM_MEMORY_CELLS: 32 -> 128
+  FUNC_EMOJI_POOL: 20 -> 50+                                          CALL_STACK_DEPTH: 16 -> 32
+                                    GPU Interface (gpu.py)             stack_depth: 128 -> 256
+                                      DEFAULT_STACK_DEPTH: 128 -> 256
+```
+
+## Components
+
+### Component A: Metal Kernel Constants (`emojiasm/metal/vm.metal`)
+**Purpose**: Define per-thread resource sizes for GPU execution
+**Changes**:
+- L94: `CALL_STACK_DEPTH` 16 -> 32
+- L97: `NUM_MEMORY_CELLS` 32 -> 128
+
+**Impact on thread-local memory**:
+- `call_stack[32]`: 32 * 4B = 128B (was 64B)
+- `memory[128]`: 128 * 4B = 512B (was 128B)
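+- Net increase: ~448B of thread-local memory per thread (~9.7KB per thread in total including the device operand stack, per the Per-Thread Memory Budget table below; within budget per KB #185)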
+
+### Component B: Bytecode Compiler Cap (`emojiasm/bytecode.py`)
+**Purpose**: Cap static stack analysis for GPU programs
+**Changes**:
+- L91: `_GPU_MAX_STACK` 128 -> 256
+
+### Component C: GPU Interface Default (`emojiasm/gpu.py`)
+**Purpose**: Set default per-instance stack size passed to kernel
+**Changes**:
+- L23: `DEFAULT_STACK_DEPTH` 128 -> 256
+
+**Impact on device memory**:
+- Stacks buffer: `n * 256 * 4B` = 1KB per thread (was 512B)
+- For 10K threads: 10MB (was 5MB), well within GPU memory
+
+### Component D: Variable Emoji Pool (`emojiasm/transpiler.py`)
+**Purpose**: Map Python variable names to unique emoji memory cell identifiers
+**Changes**:
+- Expand `EMOJI_POOL` from 50 to 200+ characters
+- Use emoji from multiple categories: food, animals, objects, nature, sports, vehicles (not flags, which are multi-codepoint sequences that `list()` would split into separate entries)
+
+**Emoji selection criteria**:
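+1. Must NOT appear in `EMOJI_TO_OP` (opcodes.py); the pre-existing `🔢` (MOD) entry is the one exception, kept for backward compatibility
+2. Must NOT appear in directive constants (`DIRECTIVE_FUNC`, `DIRECTIVE_LABEL`, etc.); the pre-existing `📊` (`DIRECTIVE_DATA`) entry is likewise kept for backward compatibility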
+3. Must NOT appear in `FUNC_EMOJI_POOL`
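+4. Must be single-codepoint: the pools are built with `list()` on a string, which splits a multi-codepoint sequence (e.g. `❤️` = U+2764 + U+FE0F) into separate entries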
+5. Prefer visually distinct emoji
+
+### Component E: Function Emoji Pool (`emojiasm/transpiler.py`)
+**Purpose**: Map Python function names to unique emoji identifiers
+**Changes**:
+- Expand `FUNC_EMOJI_POOL` from 20 to 50+ characters
+- Add more colored shapes, symbols, and distinct emoji
+
+**Selection criteria**: Same collision avoidance as Component D, plus must not overlap with `EMOJI_POOL`.
+
+## Data Flow
+
+1. Python source -> Transpiler assigns variables from `EMOJI_POOL` (up to 200+)
+2. Transpiler assigns functions from `FUNC_EMOJI_POOL` (up to 50+)
+3. Program -> Bytecode compiler maps emoji cells to integer indices 0..N-1
+4. Bytecode `_analyze_max_stack_depth()` caps at 256 (was 128)
+5. `gpu_run()` allocates stacks buffer with `n * 256` entries
+6. Metal kernel uses `memory[128]`, `call_stack[32]`, dynamic `stack_depth=256`
+
+## Technical Decisions
+
+| Decision | Options | Choice | Rationale |
+|----------|---------|--------|-----------|
+| Memory cells count | 64, 128, 256 | 128 | Matches the previous stack depth (128); 512B thread-local fits register budget per KB #147 |
+| Call stack depth | 24, 32, 64 | 32 | Supports fib(20); 128B minimal overhead |
+| Stack depth | 192, 256, 512 | 256 | Per KB #147 max feasible; 1KB device memory per thread |
+| Variable pool size | 150, 200, 300 | 200+ | Covers complex programs; more emoji available if needed |
+| Function pool size | 40, 50, 80 | 50+ | Covers modular programs; 50 functions is generous |
+| Configurability | Constants vs params | Constants (except stack_depth) | Memory cells and call stack are compile-time Metal constants; stack_depth already parameterized |
+
+## File Structure
+
+| File | Action | Purpose |
+|------|--------|---------|
+| `emojiasm/metal/vm.metal` | Modify | Update `CALL_STACK_DEPTH` and `NUM_MEMORY_CELLS` |
+| `emojiasm/bytecode.py` | Modify | Update `_GPU_MAX_STACK` |
+| `emojiasm/gpu.py` | Modify | Update `DEFAULT_STACK_DEPTH` |
+| `emojiasm/transpiler.py` | Modify | Expand `EMOJI_POOL` and `FUNC_EMOJI_POOL` |
+| `tests/test_gpu_kernel.py` | Modify | Add/update tests for new limits |
+| `tests/test_transpiler.py` | Modify | Add tests for expanded pools |
+| `tests/test_emojiasm.py` | No change | Existing tests should pass as-is |
+
+## Error Handling
+
+| Error | Handling | User Impact |
+|-------|----------|-------------|
+| Variable pool exceeded (>200) | `TranspileError` with count | Same as before, higher limit |
+| Function pool exceeded (>50) | `TranspileError` with count | Same as before, higher limit |
+| GPU memory cell OOB (>128) | `STATUS_ERROR` in kernel | Same error path, higher limit |
+| GPU call stack overflow (>32) | `STATUS_ERROR` in kernel | Same error path, higher limit |
+| GPU stack overflow (>256) | `STATUS_ERROR` in kernel | Same error path, higher limit |
+
+## Existing Patterns to Follow
+
+- `vm.metal` L92-97: `constant int` declarations for limits
+- `transpiler.py` L35-47: Emoji pools as `list()` of concatenated string literals
+- `transpiler.py` L139-141: Pool exhaustion raises `TranspileError`
+- `bytecode.py` L91: `_GPU_MAX_STACK` caps analysis
+- `gpu.py` L23: `DEFAULT_STACK_DEPTH` constant
+
+## Per-Thread Memory Budget (Updated)
+
+| Resource | Old Size | New Size |
+|----------|----------|----------|
+| Operand stack (device) | 512B | 1024B |
+| Call stack (thread-local) | 64B | 128B |
+| Memory cells (thread-local) | 128B | 512B |
+| Arrays (thread-local) | 8KB | 8KB (unchanged) |
+| PRNG state (thread-local) | 24B | 24B |
+| **Total thread-local** | **~8.2KB** | **~8.7KB** |
+| **Total with device stack** | **~8.7KB** | **~9.7KB** |
+
+Within 10KB budget per NFR-2. At 9.7KB/thread, 64MB supports ~6,500 concurrent VMs.