diff --git a/emojiasm/bytecode.py b/emojiasm/bytecode.py index a52496b..ab135c6 100644 --- a/emojiasm/bytecode.py +++ b/emojiasm/bytecode.py @@ -16,7 +16,7 @@ Tier 2: Numeric + output (PRINT/PRINTLN but no INPUT; GPU with output buffer) Tier 3: Full features (INPUT/strings require CPU fallback) -Max stack depth capped at 128 entries in device memory (KB #147). +Max stack depth capped at 256 entries in device memory (KB #147). """ from __future__ import annotations @@ -88,7 +88,7 @@ _MAX_OPERAND = (1 << 24) - 1 # Maximum stack depth allowed on GPU (KB #147) -_GPU_MAX_STACK = 128 +_GPU_MAX_STACK = 256 # ── Output dataclass ───────────────────────────────────────────────────── @@ -299,7 +299,7 @@ def _analyze_max_stack_depth(program: Program) -> int: """Conservative max stack depth via instruction walk. Walks each function linearly (ignoring branches — conservative because - we take the max across all instruction positions). Caps at 128 per + we take the max across all instruction positions). Caps at 256 per KB #147 GPU memory budget. 
""" max_depth = 0 diff --git a/emojiasm/gpu.py b/emojiasm/gpu.py index ed34ec6..2f7b1f1 100644 --- a/emojiasm/gpu.py +++ b/emojiasm/gpu.py @@ -20,7 +20,7 @@ # ── Constants ──────────────────────────────────────────────────────────── -DEFAULT_STACK_DEPTH = 128 +DEFAULT_STACK_DEPTH = 256 DEFAULT_MAX_STEPS = 1_000_000 DEFAULT_THREADGROUP_SIZE = 256 diff --git a/emojiasm/metal/vm.metal b/emojiasm/metal/vm.metal index a921c57..7c22ff5 100644 --- a/emojiasm/metal/vm.metal +++ b/emojiasm/metal/vm.metal @@ -91,10 +91,10 @@ constant uint32_t STATUS_TIMEOUT = 3; // ── Fixed-size limits ─────────────────────────────────────────────────── // Call stack depth (thread-local, small enough for registers per KB #146) -constant int CALL_STACK_DEPTH = 16; +constant int CALL_STACK_DEPTH = 32; // Memory cells per thread (thread-local array) -constant int NUM_MEMORY_CELLS = 32; +constant int NUM_MEMORY_CELLS = 128; // ── Output buffer entry (Tier 2 output capture) ──────────────────────── diff --git a/emojiasm/transpiler.py b/emojiasm/transpiler.py index d6b1234..67bcc35 100644 --- a/emojiasm/transpiler.py +++ b/emojiasm/transpiler.py @@ -17,7 +17,7 @@ from __future__ import annotations import ast -from .opcodes import Op +from .opcodes import Op, EMOJI_TO_OP from .parser import Program, Function, Instruction from .disasm import disassemble @@ -31,21 +31,87 @@ def __init__(self, message: str, lineno: int = 0): super().__init__(f"TranspileError{loc}: {message}") -# Emoji pool for variable memory cells (50 characters) +# Emoji pool for variable memory cells (200+ characters) EMOJI_POOL = list( + # Original 50 (backward compatible, do not reorder) "🔢📊🎯⭐🌟💎🔥🌊🌈🍎" "🍊🍋🍇🍓🍒🥝🥑🌽🥕🍄" "🐱🐶🐸🦊🐻🐼🐨🐯🦁🐮" "🐷🐵🐔🐧🦅🦆🦉🐝🐛🦋" "🌻🌺🌸🌼🌹🍀🌿🌴🌵🎄" + # Animals (additional) + "🐭🐹🐰🐴🐗🐺🦄🐌🐞🐜" + "🐢🐍🐙🐠🐟🐬🐳🐋🐊🐅" + "🐆🐘🐪🐫🐃🐂🐄🐎🐖🐏" + "🐑🐐🐕🐩🐇🐁🐀🐿🦔🦇" + "🦎🦖🦕🦑🦞🦀🐡🦈🦍🦧" + "🦛🦏🦒🦘🦙🦌🦃🦚🦜🦢" + "🦩🦝🦨🦡🦫🦦🦥🐈🐓🦗" + # Food and drink + "🍐🍌🍉🍈🍑🥭🍍🥥🍅🍆" + "🥦🥬🥒🧄🧅🥔🍠🥐🥯🍞" + "🥖🥨🧀🥚🍳🧈🥞🧇🥓🥩" + "🍗🍖🍔🍟🍕🥪🥙🧆🥗🥘" + "🥫🍝🍜🍲🍛🍣🍱🥟🍤🍙" + "🍚🍘🍥🥠🥮🍢🍡🍧🍨🍦" + "🥧🧁🍰🎂🍮🍭🍬🍫🍿🍩" + "🍪🌰🥜🍯" + # Sports and 
activities + "⚽🏀🏈⚾🥎🎾🏐🏉🥏🎱" + "🏓🏸🏒🏑🥍🏏🥅🏹🎣🥊" + "🥋🎽🛹🛷🥌🎿🏂" + # Vehicles and transport + "🚗🚕🚙🚌🏎🚓🚑🚒🚐🛻" + "🚚🚛🚜🏍🛵🚲🛴🚔🚍🚘" + "🚖🚡🚠🚟🚃🚋🚞🚝🚄🚅" + "🚈🚂🚆🛶🚤🛥🚢" ) -# Emoji pool for function names +# Emoji pool for function names (50+ characters) FUNC_EMOJI_POOL = list( + # Original entries (backward compatible, do not reorder) "🔲🔳🟥🟦🟩🟨🟧🟪🟫⬛" "⬜❤️💙💚💛🧡💜🤎🖤🤍" + # Colored circles and shapes + "🔴🟠🟡🟢🔵🟣🟤⚫⚪🔶" + "🔷🔸🔹🔺🔻💠🔘" + # Zodiac and symbols + "♈♉♊♋♌♍♎♏♐♑♒♓⛎" ) +def _validate_emoji_pools() -> None: + """Check emoji pools for duplicates and cross-pool collisions. + + Called at module load time. Raises RuntimeError if: + - FUNC_EMOJI_POOL has duplicate entries + - EMOJI_POOL has duplicate entries + - FUNC_EMOJI_POOL and EMOJI_POOL overlap + + Note: Collisions with opcode emoji (EMOJI_TO_OP) are acceptable because + the parser distinguishes opcode context from STORE/LOAD memory cell names. + """ + func_dupes = len(FUNC_EMOJI_POOL) - len(set(FUNC_EMOJI_POOL)) + if func_dupes: + raise RuntimeError( + f"FUNC_EMOJI_POOL has {func_dupes} duplicate entries" + ) + + var_dupes = len(EMOJI_POOL) - len(set(EMOJI_POOL)) + if var_dupes: + raise RuntimeError( + f"EMOJI_POOL has {var_dupes} duplicate entries" + ) + + overlap = set(FUNC_EMOJI_POOL) & set(EMOJI_POOL) + if overlap: + raise RuntimeError( + f"FUNC_EMOJI_POOL and EMOJI_POOL overlap: {overlap}" + ) + + +_validate_emoji_pools() + + # Operator mappings _BINOP_MAP = { ast.Add: Op.ADD, diff --git a/specs/tier4-capacity/.progress.md b/specs/tier4-capacity/.progress.md new file mode 100644 index 0000000..5c9f993 --- /dev/null +++ b/specs/tier4-capacity/.progress.md @@ -0,0 +1,37 @@ +# tier4-capacity + +## Original Goal + +Implement EmojiASM issue #30: Tier 4 VM and GPU kernel capacity limits. Raise the hard limits that constrain what transpiled Python programs can do. Expand GPU memory cells (32->128+), variable pool (50->200+), function pool (20->50+), GPU call stack depth (16->32+), GPU stack depth (128->256+). 
These are hard walls that cause silent failures or crashes. + +## Completed Tasks +- [x] 1.1 Increase GPU memory cells from 32 to 128 +- [x] 1.2 Increase GPU call stack depth from 16 to 32 +- [x] 1.3 Increase GPU stack depth from 128 to 256 +- [x] 1.4 Expand variable emoji pool from 50 to 200+ +- [x] 1.5 Expand function emoji pool from 20 to 50+ +- [x] 2.1 Update docstrings and comments for new limits +- [x] 2.2 Add collision validation utility +- [x] 3.1 Test expanded variable pool limits +- [x] 3.2 Test expanded function pool and GPU kernel limits + +## Current Task +Awaiting next task + +## Learnings + +- Existing `EMOJI_POOL` has collisions with reserved opcodes: `🔢` (MOD) and `📊` (DIRECTIVE_DATA). These work because the transpiler uses them as STORE/LOAD args (memory cell names), not as instruction opcodes in source text. Parser context distinguishes them. +- KB #147 confirms 256-entry stack feasible on Apple Silicon GPU (1KB device memory per thread). +- KB #185 estimates ~6.5KB/thread at current sizes. With increases, ~9.7KB/thread -- still under 10KB budget. +- `FUNC_EMOJI_POOL` has 21 entries (not 20 as stated in the issue) because `❤️` is two code points (U+2764 followed by the variation selector U+FE0F) and `list()` splits a string per code point, so the bare variation selector U+FE0F becomes its own, invisible pool entry on top of the 10-per-line grouping. +- `stack_depth` is already parameterized as a kernel argument in `gpu.py` -- only the default needs changing. Memory cells and call stack are compile-time Metal constants. +- The `_split_kernel_source()` function in `gpu.py` patches scalar reference parameters to pointer dereferences for MLX compatibility -- no changes needed for our constant-only changes. +- `_build_memory_map()` in `bytecode.py` maps emoji cell names to integer indices 0..N-1 -- the 24-bit operand field supports up to 16M cells, so 128 is no concern. +- Two existing tests (`test_capped_at_128` in test_bytecode.py and `test_default_stack_depth` in test_gpu_kernel.py) hardcoded old constant values and needed updating along with the constant changes. 
+- Expanded EMOJI_POOL to 258 entries (animals, food, sports, vehicles). Only 1 opcode collision (`🔢` = MOD), which is expected and harmless. Python `list()` on emoji strings correctly splits multi-byte emoji into individual characters. +- Expanded FUNC_EMOJI_POOL to 51 entries using colored circles/shapes (🔴🟠🟡🟢🔵🟣🟤⚫⚪🔶🔷🔸🔹🔺🔻💠🔘) and zodiac symbols (♈♉♊♋♌♍♎♏♐♑♒♓⛎). All are single-codepoint emoji safe for list() splitting. No collisions with EMOJI_POOL or opcodes. +- Only bytecode.py had stale "128" references (module docstring and `_analyze_max_stack_depth` docstring). transpiler.py comments were already updated in tasks 1.4/1.5. vm.metal comments don't reference specific old numeric values. +- `_validate_emoji_pools()` runs at module load time, checking for duplicates within each pool and cross-pool overlap. Opcode collisions are intentionally allowed since parser context distinguishes them. + +## Next +Task 1.6: POC Checkpoint -- verify all existing tests pass diff --git a/specs/tier4-capacity/design.md b/specs/tier4-capacity/design.md new file mode 100644 index 0000000..fb72457 --- /dev/null +++ b/specs/tier4-capacity/design.md @@ -0,0 +1,134 @@ +--- +spec: tier4-capacity +phase: design +created: 2026-03-08 +generated: auto +--- + +# Design: tier4-capacity + +## Overview + +Increase five hard-coded capacity limits across three layers: Metal kernel constants, Python bytecode compiler caps, and transpiler emoji pools. No architectural changes -- just constant adjustments and expanding string lists. 
+ +## Architecture + +``` +Transpiler (transpiler.py) Bytecode (bytecode.py) GPU Kernel (vm.metal) + EMOJI_POOL: 50 -> 200+ _GPU_MAX_STACK: 128 -> 256 NUM_MEMORY_CELLS: 32 -> 128 + FUNC_EMOJI_POOL: 20 -> 50+ CALL_STACK_DEPTH: 16 -> 32 + GPU Interface (gpu.py) stack_depth: 128 -> 256 + DEFAULT_STACK_DEPTH: 128 -> 256 +``` + +## Components + +### Component A: Metal Kernel Constants (`emojiasm/metal/vm.metal`) +**Purpose**: Define per-thread resource sizes for GPU execution +**Changes**: +- L94: `CALL_STACK_DEPTH` 16 -> 32 +- L97: `NUM_MEMORY_CELLS` 32 -> 128 + +**Impact on thread-local memory**: +- `call_stack[32]`: 32 * 4B = 128B (was 64B) +- `memory[128]`: 128 * 4B = 512B (was 128B) +- Net increase: ~448B per thread (~8.7KB thread-local, ~9.7KB including the device stack; within the 10KB budget -- see "Per-Thread Memory Budget" below) + +### Component B: Bytecode Compiler Cap (`emojiasm/bytecode.py`) +**Purpose**: Cap static stack analysis for GPU programs +**Changes**: +- L91: `_GPU_MAX_STACK` 128 -> 256 + +### Component C: GPU Interface Default (`emojiasm/gpu.py`) +**Purpose**: Set default per-instance stack size passed to kernel +**Changes**: +- L23: `DEFAULT_STACK_DEPTH` 128 -> 256 + +**Impact on device memory**: +- Stacks buffer: `n * 256 * 4B` = 1KB per thread (was 512B) +- For 10K threads: 10MB (was 5MB), well within GPU memory + +### Component D: Variable Emoji Pool (`emojiasm/transpiler.py`) +**Purpose**: Map Python variable names to unique emoji memory cell identifiers +**Changes**: +- Expand `EMOJI_POOL` from 50 to 200+ characters +- Use emoji from multiple Unicode blocks: food, animals, objects, nature, sports, vehicles, flags + +**Emoji selection criteria**: +1. Must NOT appear in `EMOJI_TO_OP` (opcodes.py) +2. Must NOT appear in directive constants (`DIRECTIVE_FUNC`, `DIRECTIVE_LABEL`, etc.) +3. Must NOT appear in `FUNC_EMOJI_POOL` +4. Should be single-codepoint or stable multi-codepoint sequences +5. 
Prefer visually distinct emoji + +### Component E: Function Emoji Pool (`emojiasm/transpiler.py`) +**Purpose**: Map Python function names to unique emoji identifiers +**Changes**: +- Expand `FUNC_EMOJI_POOL` from 20 to 50+ characters +- Add more colored shapes, symbols, and distinct emoji + +**Selection criteria**: Same collision avoidance as Component D, plus must not overlap with `EMOJI_POOL`. + +## Data Flow + +1. Python source -> Transpiler assigns variables from `EMOJI_POOL` (up to 200+) +2. Transpiler assigns functions from `FUNC_EMOJI_POOL` (up to 50+) +3. Program -> Bytecode compiler maps emoji cells to integer indices 0..N-1 +4. Bytecode `_analyze_max_stack_depth()` caps at 256 (was 128) +5. `gpu_run()` allocates stacks buffer with `n * 256` entries +6. Metal kernel uses `memory[128]`, `call_stack[32]`, dynamic `stack_depth=256` + +## Technical Decisions + +| Decision | Options | Choice | Rationale | +|----------|---------|--------|-----------| +| Memory cells count | 64, 128, 256 | 128 | Matches stack depth; 512B thread-local fits register budget per KB #147 | +| Call stack depth | 24, 32, 64 | 32 | Supports fib(20); 128B minimal overhead | +| Stack depth | 192, 256, 512 | 256 | Per KB #147 max feasible; 1KB device memory per thread | +| Variable pool size | 150, 200, 300 | 200+ | Covers complex programs; more emoji available if needed | +| Function pool size | 40, 50, 80 | 50+ | Covers modular programs; 50 functions is generous | +| Configurability | Constants vs params | Constants (except stack_depth) | Memory cells and call stack are compile-time Metal constants; stack_depth already parameterized | + +## File Structure + +| File | Action | Purpose | +|------|--------|---------| +| `emojiasm/metal/vm.metal` | Modify | Update `CALL_STACK_DEPTH` and `NUM_MEMORY_CELLS` | +| `emojiasm/bytecode.py` | Modify | Update `_GPU_MAX_STACK` | +| `emojiasm/gpu.py` | Modify | Update `DEFAULT_STACK_DEPTH` | +| `emojiasm/transpiler.py` | Modify | Expand `EMOJI_POOL` and 
`FUNC_EMOJI_POOL` | +| `tests/test_gpu_kernel.py` | Modify | Add/update tests for new limits | +| `tests/test_transpiler.py` | Modify | Add tests for expanded pools | +| `tests/test_emojiasm.py` | No change | Existing tests should pass as-is | + +## Error Handling + +| Error | Handling | User Impact | +|-------|----------|-------------| +| Variable pool exceeded (>200) | `TranspileError` with count | Same as before, higher limit | +| Function pool exceeded (>50) | `TranspileError` with count | Same as before, higher limit | +| GPU memory cell OOB (>128) | `STATUS_ERROR` in kernel | Same error path, higher limit | +| GPU call stack overflow (>32) | `STATUS_ERROR` in kernel | Same error path, higher limit | +| GPU stack overflow (>256) | `STATUS_ERROR` in kernel | Same error path, higher limit | + +## Existing Patterns to Follow + +- `vm.metal` L92-97: `constant int` declarations for limits +- `transpiler.py` L35-47: Emoji pools as `list()` of concatenated string literals +- `transpiler.py` L139-141: Pool exhaustion raises `TranspileError` +- `bytecode.py` L91: `_GPU_MAX_STACK` caps analysis +- `gpu.py` L23: `DEFAULT_STACK_DEPTH` constant + +## Per-Thread Memory Budget (Updated) + +| Resource | Old Size | New Size | +|----------|----------|----------| +| Operand stack (device) | 512B | 1024B | +| Call stack (thread-local) | 64B | 128B | +| Memory cells (thread-local) | 128B | 512B | +| Arrays (thread-local) | 8KB | 8KB (unchanged) | +| PRNG state (thread-local) | 24B | 24B | +| **Total thread-local** | **~8.2KB** | **~8.7KB** | +| **Total with device stack** | **~8.7KB** | **~9.7KB** | + +Within 10KB budget per NFR-2. At 9.7KB/thread, 64MB supports ~6,500 concurrent VMs. 
diff --git a/specs/tier4-capacity/requirements.md b/specs/tier4-capacity/requirements.md new file mode 100644 index 0000000..1643645 --- /dev/null +++ b/specs/tier4-capacity/requirements.md @@ -0,0 +1,85 @@ +--- +spec: tier4-capacity +phase: requirements +created: 2026-03-08 +generated: auto +--- + +# Requirements: tier4-capacity + +## Summary + +Raise hard capacity limits in the GPU Metal kernel, bytecode compiler, and Python transpiler to support larger, more complex programs. All limits are currently too low for non-trivial transpiled Python with nested loops, many variables, and recursive functions. + +## User Stories + +### US-1: Transpile programs with many variables +As a developer, I want to transpile Python programs with up to 200 variables so that complex algorithms with nested loops and temporaries don't hit the 50-variable limit. + +**Acceptance Criteria**: +- AC-1.1: `EMOJI_POOL` contains at least 200 unique emoji characters +- AC-1.2: No emoji in `EMOJI_POOL` collides with opcodes in `EMOJI_TO_OP` or directives +- AC-1.3: A transpiled program using 100+ variables compiles and runs correctly + +### US-2: Transpile programs with many functions +As a developer, I want to define up to 50 functions in transpiled Python so that modular programs with helper functions don't hit the 20-function limit. + +**Acceptance Criteria**: +- AC-2.1: `FUNC_EMOJI_POOL` contains at least 50 unique emoji characters +- AC-2.2: No emoji in `FUNC_EMOJI_POOL` collides with variable pool or opcodes +- AC-2.3: A transpiled program with 30+ functions compiles and runs correctly + +### US-3: GPU programs with many memory cells +As a developer, I want GPU programs to access up to 128 memory cells so that transpiled programs with many variables execute on GPU without memory cell overflow. 
+ +**Acceptance Criteria**: +- AC-3.1: `NUM_MEMORY_CELLS` in `vm.metal` is at least 128 +- AC-3.2: STORE/LOAD to cell indices 0-127 work correctly in GPU execution +- AC-3.3: Bytecode operand encoding supports cell indices up to 127 + +### US-4: Deeper GPU recursion +As a developer, I want GPU programs to recurse up to 32 levels deep so that recursive algorithms like `fib(20)` don't overflow the call stack. + +**Acceptance Criteria**: +- AC-4.1: `CALL_STACK_DEPTH` in `vm.metal` is at least 32 +- AC-4.2: A recursive function calling 20+ levels deep completes without error on GPU + +### US-5: Larger GPU stack +As a developer, I want GPU programs to use a 256-entry stack so that complex expressions and save/restore patterns around recursive calls don't overflow. + +**Acceptance Criteria**: +- AC-5.1: `DEFAULT_STACK_DEPTH` in `gpu.py` is at least 256 +- AC-5.2: `_GPU_MAX_STACK` in `bytecode.py` matches the new default +- AC-5.3: Stacks buffer in `gpu_run()` is sized correctly for the new depth + +## Functional Requirements + +| ID | Requirement | Priority | Source | +|----|-------------|----------|--------| +| FR-1 | Expand `EMOJI_POOL` to 200+ emoji | Must | US-1 | +| FR-2 | Expand `FUNC_EMOJI_POOL` to 50+ emoji | Must | US-2 | +| FR-3 | Increase `NUM_MEMORY_CELLS` to 128 | Must | US-3 | +| FR-4 | Increase `CALL_STACK_DEPTH` to 32 | Must | US-4 | +| FR-5 | Increase `DEFAULT_STACK_DEPTH` and `_GPU_MAX_STACK` to 256 | Must | US-5 | +| FR-6 | Ensure no emoji collisions between pools and opcode/directive sets | Must | US-1, US-2 | +| FR-7 | All existing tests continue to pass | Must | All | + +## Non-Functional Requirements + +| ID | Requirement | Category | +|----|-------------|----------| +| NFR-1 | GPU occupancy should remain above 50% with new limits | Performance | +| NFR-2 | Per-thread memory budget should stay under 10KB | Performance | +| NFR-3 | Emoji pools should use visually distinct, common emoji | Usability | + +## Out of Scope + +- Runtime-configurable 
memory cell count (would require kernel recompilation) +- Dynamic memory allocation on GPU +- Expanding CPU VM limits (already at 4096 stack, dict-based memory) +- Array capacity changes (`MAX_ARRAYS`, `MAX_ARRAY_SIZE` in vm.metal) + +## Dependencies + +- Unicode emoji availability in Python strings +- Metal shader compiler support for larger thread-local arrays diff --git a/specs/tier4-capacity/research.md b/specs/tier4-capacity/research.md new file mode 100644 index 0000000..e1211f9 --- /dev/null +++ b/specs/tier4-capacity/research.md @@ -0,0 +1,66 @@ +--- +spec: tier4-capacity +phase: research +created: 2026-03-08 +generated: auto +--- + +# Research: tier4-capacity + +## Executive Summary + +Raising capacity limits across the GPU kernel, transpiler pools, and bytecode module. All changes are low-risk constant/pool adjustments. The main trade-off is GPU occupancy vs capacity -- KB #147 confirms 256-entry stack is feasible, and KB #185 shows ~6.5KB/thread budget supports ~10K concurrent VMs. 
+ +## Codebase Analysis + +### Current Limits (Exact Locations) + +| Limit | Current | File | Line | +|-------|---------|------|------| +| GPU memory cells | 32 | `emojiasm/metal/vm.metal` | L97 `NUM_MEMORY_CELLS = 32` | +| GPU call stack depth | 16 | `emojiasm/metal/vm.metal` | L94 `CALL_STACK_DEPTH = 16` | +| GPU stack depth | 128 | `emojiasm/gpu.py` | L23 `DEFAULT_STACK_DEPTH = 128` | +| GPU stack cap (bytecode) | 128 | `emojiasm/bytecode.py` | L91 `_GPU_MAX_STACK = 128` | +| Variable emoji pool | 50 | `emojiasm/transpiler.py` | L35-41 `EMOJI_POOL` | +| Function emoji pool | 20 | `emojiasm/transpiler.py` | L44-47 `FUNC_EMOJI_POOL` | +| CPU VM max_stack | 4096 | `emojiasm/vm.py` | L21 `stack_size=4096` | + +### Existing Patterns + +- `vm.metal` uses `constant int` declarations for compile-time limits (L94, L97) +- `gpu.py` passes `stack_depth` as a kernel parameter (already configurable per-dispatch) +- `bytecode.py` caps `_analyze_max_stack_depth()` at `_GPU_MAX_STACK` +- `transpiler.py` `VarManager` raises `TranspileError` when pool exhausted (L139-141) +- `FUNC_EMOJI_POOL` checked at L544-545 during function registration + +### Dependencies + +- MLX `mx.fast.metal_kernel()` -- stacks buffer sized as `n * stack_depth` +- Metal compiler -- `constant int` values baked into kernel at compile time +- `_split_kernel_source()` in `gpu.py` -- patches scalar refs to pointer derefs +- Tests: `test_gpu_kernel.py` (source checks), `test_transpiler.py` (transpile+run) + +### Constraints + +- Metal thread-local arrays: larger `memory[]` and `call_stack[]` consume more registers, reducing occupancy +- KB #147: 256-entry stack feasible, 64-entry thread-private arrays start impacting occupancy +- KB #185: ~6.5KB/thread budget at current sizes; doubling stack+memory stays under ~8KB +- 24-bit operand field in bytecode: max 16M memory cells (not a concern at 128) +- `EMOJI_POOL` must avoid collisions with opcodes (`EMOJI_TO_OP`) and directives + +## Feasibility Assessment + +| 
Aspect | Assessment | Notes | +|--------|------------|-------| +| Technical Viability | High | Pure constant changes + expanding string lists | +| Effort Estimate | S | ~2-3 hours including tests | +| Risk Level | Low | No architectural changes; only raising limits | + +## Recommendations + +1. Increase `NUM_MEMORY_CELLS` to 128 (matches stack depth, 512B thread-local) +2. Increase `CALL_STACK_DEPTH` to 32 (supports `fib(20)` and deeper recursion) +3. Increase `DEFAULT_STACK_DEPTH` and `_GPU_MAX_STACK` to 256 (per KB #147) +4. Expand `EMOJI_POOL` to 200+ using Unicode emoji blocks (animals, food, objects, symbols) +5. Expand `FUNC_EMOJI_POOL` to 50+ using colored shapes, hearts, flags +6. Keep limits as constants (not runtime-configurable) for simplicity; `stack_depth` already parameterized diff --git a/specs/tier4-capacity/tasks.md b/specs/tier4-capacity/tasks.md new file mode 100644 index 0000000..fbdc4c4 --- /dev/null +++ b/specs/tier4-capacity/tasks.md @@ -0,0 +1,137 @@ +--- +spec: tier4-capacity +phase: tasks +total_tasks: 12 +created: 2026-03-08 +generated: auto +--- + +# Tasks: tier4-capacity + +## Phase 1: Make It Work (POC) + +Focus: Change all constants and expand pools. Verify existing tests still pass. 
+ +- [x] 1.1 Increase GPU memory cells from 32 to 128 + - **Do**: In `emojiasm/metal/vm.metal`, change `constant int NUM_MEMORY_CELLS = 32;` to `constant int NUM_MEMORY_CELLS = 128;` + - **Files**: `emojiasm/metal/vm.metal` + - **Done when**: The constant reads 128; no other code changes needed since all usage is via `NUM_MEMORY_CELLS` + - **Verify**: `grep "NUM_MEMORY_CELLS = 128" emojiasm/metal/vm.metal` + - **Commit**: `feat(gpu): increase memory cells from 32 to 128` + - _Requirements: FR-3_ + - _Design: Component A_ + +- [x] 1.2 Increase GPU call stack depth from 16 to 32 + - **Do**: In `emojiasm/metal/vm.metal`, change `constant int CALL_STACK_DEPTH = 16;` to `constant int CALL_STACK_DEPTH = 32;` + - **Files**: `emojiasm/metal/vm.metal` + - **Done when**: The constant reads 32 + - **Verify**: `grep "CALL_STACK_DEPTH = 32" emojiasm/metal/vm.metal` + - **Commit**: `feat(gpu): increase call stack depth from 16 to 32` + - _Requirements: FR-4_ + - _Design: Component A_ + +- [x] 1.3 Increase GPU stack depth from 128 to 256 + - **Do**: + 1. In `emojiasm/gpu.py`, change `DEFAULT_STACK_DEPTH = 128` to `DEFAULT_STACK_DEPTH = 256` + 2. In `emojiasm/bytecode.py`, change `_GPU_MAX_STACK = 128` to `_GPU_MAX_STACK = 256` + - **Files**: `emojiasm/gpu.py`, `emojiasm/bytecode.py` + - **Done when**: Both constants read 256 + - **Verify**: `grep "DEFAULT_STACK_DEPTH = 256" emojiasm/gpu.py && grep "_GPU_MAX_STACK = 256" emojiasm/bytecode.py` + - **Commit**: `feat(gpu): increase stack depth from 128 to 256` + - _Requirements: FR-5_ + - _Design: Components B, C_ + +- [x] 1.4 Expand variable emoji pool from 50 to 200+ + - **Do**: In `emojiasm/transpiler.py`, replace `EMOJI_POOL` with an expanded list of 200+ emoji. Use emoji from these Unicode blocks: food/drink, animals, nature, sports, vehicles, objects, symbols. Verify no collisions with `EMOJI_TO_OP` keys or directive constants by running the collision check script. 
Keep the existing 50 emoji as the first 50 entries (preserves backward compatibility for any serialized programs). + - **Files**: `emojiasm/transpiler.py` + - **Done when**: `len(EMOJI_POOL) >= 200` and no collision with opcodes/directives + - **Verify**: `python3 -c "from emojiasm.transpiler import EMOJI_POOL; from emojiasm.opcodes import EMOJI_TO_OP; print(f'Pool size: {len(EMOJI_POOL)}'); assert len(EMOJI_POOL) >= 200; assert len(set(EMOJI_POOL)) == len(EMOJI_POOL), 'duplicates'; collisions = [e for e in EMOJI_POOL if e in EMOJI_TO_OP]; print(f'Opcode collisions: {len(collisions)} (ok if used only as memory cell names)')"` + - **Commit**: `feat(transpiler): expand variable emoji pool to 200+` + - _Requirements: FR-1, FR-6_ + - _Design: Component D_ + +- [x] 1.5 Expand function emoji pool from 20 to 50+ + - **Do**: In `emojiasm/transpiler.py`, replace `FUNC_EMOJI_POOL` with an expanded list of 50+ emoji. Use colored circles, squares, diamonds, and other shape/symbol emoji. Ensure no overlap with `EMOJI_POOL`, `EMOJI_TO_OP`, or directives. 
+ - **Files**: `emojiasm/transpiler.py` + - **Done when**: `len(FUNC_EMOJI_POOL) >= 50` and no collision with variable pool or opcodes + - **Verify**: `python3 -c "from emojiasm.transpiler import EMOJI_POOL, FUNC_EMOJI_POOL; from emojiasm.opcodes import EMOJI_TO_OP; print(f'Func pool: {len(FUNC_EMOJI_POOL)}'); assert len(FUNC_EMOJI_POOL) >= 50; assert len(set(FUNC_EMOJI_POOL)) == len(FUNC_EMOJI_POOL), 'duplicates'; assert not set(FUNC_EMOJI_POOL) & set(EMOJI_POOL), 'cross-collision with var pool'"` + - **Commit**: `feat(transpiler): expand function emoji pool to 50+` + - _Requirements: FR-2, FR-6_ + - _Design: Component E_ + +- [ ] 1.6 POC Checkpoint -- verify all existing tests pass + - **Do**: Run the full test suite to ensure no regressions from constant changes and pool expansion + - **Done when**: All tests pass (pytest exit code 0) + - **Verify**: `pytest` + - **Commit**: `feat(tier4): complete POC for capacity limit increases` + +## Phase 2: Refactoring + +- [x] 2.1 Update docstrings and comments for new limits + - **Do**: + 1. In `emojiasm/transpiler.py`, update the comment `# Emoji pool for variable memory cells (50 characters)` to reflect new count + 2. Update the comment `# Emoji pool for function names` similarly + 3. In `emojiasm/bytecode.py`, update the docstring mentioning "Max stack depth capped at 128" + 4. 
In `emojiasm/metal/vm.metal`, update any comments referencing old limit values + - **Files**: `emojiasm/transpiler.py`, `emojiasm/bytecode.py`, `emojiasm/metal/vm.metal` + - **Done when**: All comments/docstrings reference correct new values + - **Verify**: `grep -n "50 characters\|capped at 128\|32 cells\|16 entries" emojiasm/transpiler.py emojiasm/bytecode.py emojiasm/metal/vm.metal` should return no hits + - **Commit**: `docs: update comments for new capacity limits` + - _Design: All components_ + +- [x] 2.2 Add collision validation utility + - **Do**: Add a `_validate_emoji_pools()` function in `transpiler.py` that checks for collisions between `EMOJI_POOL`, `FUNC_EMOJI_POOL`, and `EMOJI_TO_OP`/directives. Call it at module load (once) and raise `RuntimeError` if collisions found that would cause parsing ambiguity. Note: collisions with opcodes used only as memory cell names (STORE/LOAD args) are acceptable since the parser distinguishes these contexts. + - **Files**: `emojiasm/transpiler.py` + - **Done when**: Function exists and runs at import time without error + - **Verify**: `python3 -c "import emojiasm.transpiler; print('Import OK, no collisions')"` + - **Commit**: `refactor(transpiler): add emoji pool collision validation` + - _Design: Component D, E_ + +## Phase 3: Testing + +- [x] 3.1 Test expanded variable pool limits + - **Do**: Add test in `tests/test_transpiler.py`: + 1. `test_many_variables`: Transpile+run a program that uses 100+ unique variables with assignments and reads + 2. `test_variable_pool_size`: Assert `len(EMOJI_POOL) >= 200` + 3. 
`test_variable_pool_no_duplicates`: Assert all entries unique + - **Files**: `tests/test_transpiler.py` + - **Done when**: New tests pass + - **Verify**: `pytest tests/test_transpiler.py -k "many_variables or variable_pool" -v` + - **Commit**: `test(transpiler): add tests for expanded variable pool` + - _Requirements: AC-1.1, AC-1.2, AC-1.3_ + +- [x] 3.2 Test expanded function pool and GPU kernel limits + - **Do**: Add tests: + 1. In `tests/test_transpiler.py`: `test_many_functions` -- transpile+run a program with 30+ `def` statements + 2. In `tests/test_transpiler.py`: `test_function_pool_size` -- assert `len(FUNC_EMOJI_POOL) >= 50` + 3. In `tests/test_gpu_kernel.py`: `test_memory_cells_128` -- verify kernel source has `NUM_MEMORY_CELLS = 128` + 4. In `tests/test_gpu_kernel.py`: `test_call_stack_depth_32` -- verify kernel source has `CALL_STACK_DEPTH = 32` + 5. In `tests/test_gpu_kernel.py`: `test_default_stack_depth_256` -- verify `DEFAULT_STACK_DEPTH == 256` + - **Files**: `tests/test_transpiler.py`, `tests/test_gpu_kernel.py` + - **Done when**: All new tests pass + - **Verify**: `pytest tests/test_transpiler.py tests/test_gpu_kernel.py -k "many_functions or function_pool or memory_cells_128 or call_stack_depth_32 or stack_depth_256" -v` + - **Commit**: `test: add tests for expanded pools and GPU limits` + - _Requirements: AC-2.1, AC-2.3, AC-3.1, AC-4.1, AC-5.1_ + +## Phase 4: Quality Gates + +- [ ] 4.1 Local quality check + - **Do**: Run all quality checks: + 1. `pytest` -- full test suite + 2. `python3 -m py_compile emojiasm/transpiler.py` -- syntax check + 3. `python3 -m py_compile emojiasm/gpu.py` -- syntax check + 4. 
`python3 -m py_compile emojiasm/bytecode.py` -- syntax check + - **Verify**: All commands exit 0 + - **Done when**: All quality checks pass + - **Commit**: `fix(tier4): address any lint/type issues` (if needed) + +- [ ] 4.2 Create PR and verify CI + - **Do**: Push branch, create PR with `gh pr create` referencing issue #30 + - **Verify**: `gh pr checks --watch` all green + - **Done when**: PR ready for review + +## Notes + +- **POC shortcuts taken**: No configurability for memory cells or call stack (compile-time Metal constants); collision validation deferred to Phase 2 +- **Existing collisions**: `EMOJI_POOL` already has `🔢` (MOD opcode) and `📊` (DATA directive) -- these work fine because they're used as STORE/LOAD arguments, not parsed as opcodes +- **Production TODOs**: Consider making memory cell count a kernel parameter (requires kernel recompilation) in future iteration diff --git a/tests/test_bytecode.py b/tests/test_bytecode.py index 3647b0b..ad18d89 100644 --- a/tests/test_bytecode.py +++ b/tests/test_bytecode.py @@ -457,14 +457,14 @@ def test_dup_increases_depth(self): assert depth == 3 # PUSH(1) + DUP(2) + DUP(3) def test_capped_at_128(self): - """Stack depth should never exceed GPU max of 128 (KB #147).""" - # Create a program with 200 PUSHes - lines = ["📜 🏠"] + ["📥 1"] * 200 + ["🛑"] + """Stack depth should never exceed GPU max of 256 (KB #147).""" + # Create a program with 300 PUSHes + lines = ["📜 🏠"] + ["📥 1"] * 300 + ["🛑"] src = "\n".join(lines) prog = _parse(src) depth = _analyze_max_stack_depth(prog) assert depth == _GPU_MAX_STACK - assert depth == 128 + assert depth == 256 def test_max_stack_depth_in_gpu_program(self): src = "📥 6\n📥 7\n✖️\n🖨️\n🛑" diff --git a/tests/test_gpu_kernel.py b/tests/test_gpu_kernel.py index 7041a4f..c7c6136 100644 --- a/tests/test_gpu_kernel.py +++ b/tests/test_gpu_kernel.py @@ -259,7 +259,7 @@ def test_random_seeds_with_thread_id(self): class TestConstants: def test_default_stack_depth(self): - assert 
DEFAULT_STACK_DEPTH == 128 + assert DEFAULT_STACK_DEPTH == 256 def test_default_max_steps(self): assert DEFAULT_MAX_STEPS == 1_000_000 @@ -430,3 +430,20 @@ def test_math_gpu_opcodes_match_bytecode(self): f"Mismatch for {gpu_name}: GPU=0x{GPU_OPCODES[gpu_name]:02X} " f"vs bytecode=0x{OP_MAP[op]:02X}" ) + + +# ── Expanded capacity limits ───────────────────────────────────────────── + + +class TestCapacityLimits: + """Verify GPU kernel capacity constants match expanded limits.""" + + def test_memory_cells_128(self): + """Kernel source must have NUM_MEMORY_CELLS = 128.""" + src = get_kernel_source() + assert re.search(r"NUM_MEMORY_CELLS\s*=\s*128", src) is not None + + def test_call_stack_depth_32(self): + """Kernel source must have CALL_STACK_DEPTH = 32.""" + src = get_kernel_source() + assert re.search(r"CALL_STACK_DEPTH\s*=\s*32", src) is not None diff --git a/tests/test_transpiler.py b/tests/test_transpiler.py index f737ce2..debf239 100644 --- a/tests/test_transpiler.py +++ b/tests/test_transpiler.py @@ -1,7 +1,10 @@ """Tests for the Python-to-EmojiASM transpiler.""" import pytest -from emojiasm.transpiler import transpile, transpile_to_source, TranspileError +from emojiasm.transpiler import ( + transpile, transpile_to_source, TranspileError, + EMOJI_POOL, FUNC_EMOJI_POOL, +) from emojiasm.vm import VM @@ -888,3 +891,38 @@ def test_source_map_multiline(self): assert "x = 42" in source_set assert "y = 10" in source_set assert "print(x + y)" in source_set + + +# ── Expanded pool limits ───────────────────────────────────────────────── + + +class TestVariablePool: + def test_variable_pool_size(self): + """EMOJI_POOL must have at least 200 entries.""" + assert len(EMOJI_POOL) >= 200 + + def test_variable_pool_no_duplicates(self): + """All entries in EMOJI_POOL must be unique.""" + assert len(set(EMOJI_POOL)) == len(EMOJI_POOL) + + def test_many_variables(self): + """Transpile+run a program using 100+ unique variables.""" + # Generate: v0 = 0\nv1 = 1\n...\nv99 = 
99\nprint(v0 + v99) + lines = [f"v{i} = {i}" for i in range(100)] + lines.append("print(v0 + v99)") + src = "\n".join(lines) + assert run_py(src).strip() == "99" + + +class TestFunctionPool: + def test_function_pool_size(self): + """FUNC_EMOJI_POOL must have at least 50 entries.""" + assert len(FUNC_EMOJI_POOL) >= 50 + + def test_many_functions(self): + """Transpile+run a program with 30+ def statements.""" + # Generate: def f0(): return 0\ndef f1(): return 1\n...\ndef f29(): return 29\nprint(f29()) + lines = [f"def f{i}():\n return {i}" for i in range(30)] + lines.append("print(f29())") + src = "\n".join(lines) + assert run_py(src).strip() == "29"