diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 67c2906..def3705 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -21,19 +21,28 @@ jobs: - name: Detect changed codecs id: changed run: | - # Get list of changed .rs files in src/ (excluding main.rs, codec.rs, lib.rs) - CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- 'src/*.rs' | grep -vE '(main|codec|lib)\.rs$' || true) - - if [ -z "$CHANGED_FILES" ]; then - echo "No codec files changed" - echo "codecs=" >> $GITHUB_OUTPUT - else - # Extract codec names (filename without .rs extension) - CODECS=$(echo "$CHANGED_FILES" | xargs -n1 basename | sed 's/\.rs$//' | tr '\n' ' ') - echo "Changed codecs: $CODECS" - echo "codecs=$CODECS" >> $GITHUB_OUTPUT + # Get list of changed Rust codec files (excluding main.rs, codec.rs, lib.rs) + RUST_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- 'src/*.rs' | grep -vE '(main|codec|lib)\.rs$' || true) + + # Get list of changed Docker codec directories + DOCKER_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- 'src/*/Dockerfile' 'src/*/*.py' 'src/*/*.java' 'src/*/*.c' 'src/*/*.go' | xargs -n1 dirname 2>/dev/null | sort -u | xargs -n1 basename 2>/dev/null || true) + + RUST_CODECS="" + DOCKER_CODECS="" + + if [ -n "$RUST_CHANGES" ]; then + RUST_CODECS=$(echo "$RUST_CHANGES" | xargs -n1 basename | sed 's/\.rs$//' | tr '\n' ' ') fi + if [ -n "$DOCKER_CHANGES" ]; then + DOCKER_CODECS=$(echo "$DOCKER_CHANGES" | tr '\n' ' ') + fi + + echo "Rust codecs: $RUST_CODECS" + echo "Docker codecs: $DOCKER_CODECS" + echo "rust_codecs=$RUST_CODECS" >> $GITHUB_OUTPUT + echo "docker_codecs=$DOCKER_CODECS" >> $GITHUB_OUTPUT + - name: Setup Rust uses: dtolnay/rust-toolchain@stable @@ -54,19 +63,27 @@ jobs: - name: Build run: cargo build --release - - name: Run codec tests - id: test + - name: Run Rust codec tests + if: steps.changed.outputs.rust_codecs != '' run: | - CODECS="${{ 
steps.changed.outputs.codecs }}" - if [ -z "$CODECS" ]; then - echo "No codec changes detected, running all codecs" - cargo run --release 2>&1 | tee results.txt - else - for codec in $CODECS; do - echo "Testing codec: $codec" - cargo run --release -- --codec "$codec" 2>&1 | tee -a results.txt - done - fi + for codec in ${{ steps.changed.outputs.rust_codecs }}; do + echo "Testing Rust codec: $codec" + cargo run --release -- --codec "$codec" 2>&1 | tee -a results.txt + done + + - name: Run Docker codec tests + if: steps.changed.outputs.docker_codecs != '' + run: | + for codec in ${{ steps.changed.outputs.docker_codecs }}; do + echo "Testing Docker codec: $codec" + cargo run --release -- --docker --codec "$codec" 2>&1 | tee -a results.txt + done + + - name: Run all codecs (no changes detected) + if: steps.changed.outputs.rust_codecs == '' && steps.changed.outputs.docker_codecs == '' + run: | + echo "No codec changes detected, running all codecs" + cargo run --release -- --docker 2>&1 | tee results.txt - name: Check formatting run: cargo fmt --check diff --git a/.gitignore b/.gitignore index 7b7ee2c..32a43b3 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,9 @@ Thumbs.db *.swo *~ +# Worktrees +.worktrees/ + # Data (only data.json.gz is tracked) *.json *.json.gz diff --git a/README.md b/README.md index a8abe63..425eac7 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,49 @@ let codecs: Vec<(Box, &[(EventKey, EventValue)])> = vec![ - PRs must add a single file: `src/.rs` - **Submission deadline: March 1st, 2025** — evaluation dataset revealed and winners announced +## External Codecs (Non-Rust) + +You can submit codecs in any language by creating a Docker container that implements the encode/decode ABI. 
+ +### Directory Structure + +``` +src/<name>-<lang>/ +├── Dockerfile +└── <your codec files> +``` + +### ABI Requirements + +Your container must accept `encode` or `decode` as the first argument: + +```bash +# Encode: JSON events in via stdin, compressed bytes out via stdout +docker run <image> encode < events.json > compressed.bin + +# Decode: Compressed bytes in via stdin, JSON events out via stdout +docker run <image> decode < compressed.bin > events.json +``` + +### Example Dockerfile (Python) + +```dockerfile +FROM python:3.11-slim +WORKDIR /app +COPY codec.py . +ENTRYPOINT ["python", "codec.py"] +``` + +### Testing Locally + +```bash +# Run with Docker support +cargo run --release -- --docker + +# Test specific external codec +cargo run --release -- --codec <name>-<lang> +``` + ## Generating Your Own Evaluation Dataset Want to test your codec against different data? You can generate your own dataset diff --git a/docs/plans/2026-01-30-multi-language-support-design.md b/docs/plans/2026-01-30-multi-language-support-design.md new file mode 100644 index 0000000..dcec2a1 --- /dev/null +++ b/docs/plans/2026-01-30-multi-language-support-design.md @@ -0,0 +1,245 @@ +# Multi-Language Codec Support + +## Goal + +Maximize participation in the compression-golf competition by allowing submissions in languages other than Rust (Python, Java, C, Go, etc.). + +## Overview + +External codecs are Docker containers that implement a standard ABI. The existing Rust harness orchestrates building and running these containers, then verifies correctness using the same round-trip validation as Rust codecs.
+ +## ABI Specification + +External codecs are Docker containers with an entrypoint that accepts two subcommands: + +```bash +# Encode: JSON events in via stdin, compressed bytes out via stdout +docker run <image> encode < events.json > compressed.bin + +# Decode: Compressed bytes in via stdin, JSON events out via stdout +docker run <image> decode < compressed.bin > events.json +``` + +### Input/Output Format + +- **Encode input**: Line-delimited JSON (same format as `data.json`) +- **Encode output**: Raw bytes (the codec's proprietary compressed format) +- **Decode input**: The exact bytes produced by encode +- **Decode output**: Line-delimited JSON (must match original input exactly) + +### Exit Codes + +- `0` = success +- Non-zero = error (harness captures stderr for diagnostics) + +### Verification + +The harness calls encode, then decode, then compares the decoded output to the original input. If they match byte-for-byte, the submission is valid. This is the same integrity guarantee as Rust submissions. + +## Directory Structure + +External codecs use a convention-based directory structure: + +``` +src/ +├── alice.rs # Rust codec (existing) +├── bob.rs # Rust codec (existing) +├── alice-python/ # External codec: name=alice, lang=python +│ ├── Dockerfile +│ ├── codec.py +│ └── requirements.txt +├── carol-java/ # External codec: name=carol, lang=java +│ ├── Dockerfile +│ └── Codec.java +└── dave-c/ # External codec: name=dave, lang=c + ├── Dockerfile + └── codec.c +``` + +### Naming Convention + +- Directory name format: `{name}-{lang}` +- `name` can match an existing Rust codec (e.g., `alice.rs` and `alice-python/` can coexist) +- `lang` should be from common identifiers: `python`, `java`, `c`, `cpp`, `go`, `rust`, etc. + +### Auto-Discovery + +The harness discovers external codecs by scanning for `src/*/Dockerfile`. No configuration file needed. + +## Dockerfile Requirements + +1. Must accept `encode` or `decode` as the command argument +2.
Must read from stdin, write to stdout +3. Must exit 0 on success, non-zero on failure + +### Example: Python + +```dockerfile +FROM python:3.11-slim +WORKDIR /app +COPY . . +RUN pip install -r requirements.txt +ENTRYPOINT ["python", "codec.py"] +``` + +### Example: Java + +```dockerfile +FROM eclipse-temurin:21 +WORKDIR /app +COPY . . +RUN javac Codec.java +ENTRYPOINT ["java", "Codec"] +``` + +### Example: C + +```dockerfile +FROM gcc:13 +WORKDIR /app +COPY . . +RUN gcc -O3 -o codec codec.c +ENTRYPOINT ["./codec"] +``` + +## CLI Interface + +### Flags + +```bash +# Default: Rust codecs only (current behavior) +cargo run --release + +# Include external Docker codecs +cargo run --release -- --docker + +# Target specific codec - harness auto-detects type +cargo run --release -- --codec alice # finds alice.rs (Rust) +cargo run --release -- --codec alice-python # finds src/alice-python/Dockerfile (Docker) +``` + +### Codec Resolution + +When `--codec <name>` is specified: + +1. Check if `name` is a registered Rust codec → run as Rust +2. Check if `src/{name}/Dockerfile` exists → run as Docker +3. Neither → error: "Unknown codec: {name}" + +Targeting a Docker codec with `--codec` implicitly enables Docker mode.
+ +## Harness Implementation + +The Rust harness handles the full lifecycle: + +```rust +fn build_external_codec(name: &str) -> Result<String> { + let path = format!("src/{}", name); + + Command::new("docker") + .args(["build", "-t", name, &path]) + .status()?; + + Ok(name.to_string()) +} + +fn run_encode(image: &str, input: &[u8]) -> Result<Vec<u8>> { + let mut child = Command::new("docker") + .args(["run", "-i", "--rm", image, "encode"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + + child.stdin.take().unwrap().write_all(input)?; + let output = child.wait_with_output()?; + + if !output.status.success() { + return Err(/* error with stderr */); + } + + Ok(output.stdout) +} + +fn run_decode(image: &str, input: &[u8]) -> Result<Vec<u8>> { + // Same pattern as run_encode but with "decode" argument +} +``` + +## CI/CD Integration + +### PR Workflow Updates + +The workflow detects both Rust and Docker codec changes: + +```yaml +# Detect Rust codec changes (existing) +rust_changes=$(git diff --name-only origin/main...HEAD -- 'src/*.rs') + +# Detect Docker codec changes (new) +docker_changes=$(git diff --name-only origin/main...HEAD -- 'src/*/Dockerfile' 'src/*/**') +``` + +### Workflow Behavior + +| Change detected | Action | +|-----------------|--------| +| `src/alice.rs` | `cargo run --release -- --codec alice` | +| `src/alice-python/*` | `cargo run --release -- --codec alice-python` | +| Both | Run both targeted tests | + +GitHub Actions runners have Docker pre-installed, so no special setup is needed.
+ +### Constraints + +- Timeout: Uses existing CI timeout (harness inherits runner limits) +- Build on every PR (can optimize with caching later) + +## Error Handling + +| Error | Harness response | +|-------|------------------| +| `docker build` fails | Report build error, show stderr, skip codec | +| `encode` times out | Report timeout, skip codec | +| `encode` exits non-zero | Report error, show stderr, skip codec | +| `decode` fails | Report decode error, skip codec | +| Round-trip mismatch | Report verification failure, show sample mismatches | + +### Output Format + +External codecs appear in the same results table as Rust codecs: + +``` +┌────────────────────────┬────────────────┬────────────┐ +│ Codec │ Size │ vs Naive │ +├────────────────────────┼────────────────┼────────────┤ +│ Naive │ 210,727,389 │ baseline │ +│ XiangpengHao │ 6,847,283 │ -96.7% │ +│ alice-python │ 7,102,445 │ -96.6% │ +│ bob-java │ 8,234,112 │ -96.1% │ +│ carol-c [BUILD FAILED] │ - │ - │ +└────────────────────────┴────────────────┴────────────┘ +``` + +### Debugging + +- `--verbose` flag shows full Docker build output and stderr +- Failed codecs don't block other codecs from running +- Temporary files preserved on failure for inspection + +## Submission Process + +To submit an external codec: + +1. Create `src/<name>-<lang>/` directory +2. Add a `Dockerfile` implementing the ABI +3. Add your codec implementation +4. Test locally: `cargo run --release -- --codec <name>-<lang>` +5.
Submit PR with the new directory + +## Future Optimizations + +- Docker layer caching in CI +- Pre-built base images for common languages +- Optional memory limits via `docker run --memory` +- Image registry for faster CI (authors push pre-built images) diff --git a/docs/plans/2026-01-30-multi-language-support.md b/docs/plans/2026-01-30-multi-language-support.md new file mode 100644 index 0000000..e27310c --- /dev/null +++ b/docs/plans/2026-01-30-multi-language-support.md @@ -0,0 +1,712 @@ +# Multi-Language Codec Support Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Enable non-Rust codec submissions via Docker containers that implement a standard encode/decode ABI. + +**Architecture:** The Rust harness discovers external codecs by scanning for `src/*/Dockerfile`, builds each container, then invokes encode/decode via stdin/stdout. Results integrate into the existing output table. + +**Tech Stack:** Rust (std::process::Command for Docker orchestration), Docker containers for external codecs. 
+ +--- + +## Task 1: Add Docker CLI Flag and External Codec Discovery + +**Files:** +- Modify: `src/main.rs:147-168` (argument parsing) + +**Step 1: Add --docker flag parsing** + +In the argument parsing section of main.rs, add a `docker_enabled` flag: + +```rust +fn main() -> Result<(), Box> { + let args: Vec = std::env::args().collect(); + + let mut path = "data.json".to_string(); + let mut codec_filter: Option = None; + let mut docker_enabled = false; + + let mut i = 1; + while i < args.len() { + match args[i].as_str() { + "--codec" => { + if i + 1 < args.len() { + codec_filter = Some(args[i + 1].to_lowercase()); + i += 1; + } + } + "--docker" => { + docker_enabled = true; + } + arg if !arg.starts_with('-') => { + path = arg.to_string(); + } + _ => {} + } + i += 1; + } +``` + +**Step 2: Run harness to verify flag is accepted** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo run --release -- --docker 2>&1 | head -5` +Expected: Compiles and runs without error (flag parsed but not used yet) + +**Step 3: Commit** + +```bash +git add src/main.rs +git commit -m "feat: add --docker CLI flag for external codec support" +``` + +--- + +## Task 2: Add External Codec Discovery Function + +**Files:** +- Modify: `src/main.rs` (add function before main) + +**Step 1: Add discover_external_codecs function** + +Add this function before the `main()` function: + +```rust +fn discover_external_codecs() -> Vec { + let src_path = std::path::Path::new("src"); + let mut codecs = Vec::new(); + + if let Ok(entries) = std::fs::read_dir(src_path) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + let dockerfile = path.join("Dockerfile"); + if dockerfile.exists() { + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + codecs.push(name.to_string()); + } + } + } + } + } + + codecs.sort(); + codecs +} +``` + +**Step 2: Run build to verify it compiles** + +Run: `cd 
/Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo build --release 2>&1 | tail -3` +Expected: `Finished` with no errors + +**Step 3: Commit** + +```bash +git add src/main.rs +git commit -m "feat: add external codec discovery via Dockerfile scanning" +``` + +--- + +## Task 3: Add Docker Build Function + +**Files:** +- Modify: `src/main.rs` (add function) + +**Step 1: Add build_docker_codec function** + +Add this function after `discover_external_codecs`: + +```rust +fn build_docker_codec(name: &str) -> Result<(), Box> { + let path = format!("src/{}", name); + + let status = std::process::Command::new("docker") + .args(["build", "-t", name, &path]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::piped()) + .status()?; + + if !status.success() { + return Err(format!("Docker build failed for {}", name).into()); + } + + Ok(()) +} +``` + +**Step 2: Run build to verify it compiles** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo build --release 2>&1 | tail -3` +Expected: `Finished` with no errors + +**Step 3: Commit** + +```bash +git add src/main.rs +git commit -m "feat: add docker build function for external codecs" +``` + +--- + +## Task 4: Add Docker Encode/Decode Functions + +**Files:** +- Modify: `src/main.rs` (add functions) + +**Step 1: Add run_docker_encode function** + +Add this function after `build_docker_codec`: + +```rust +fn run_docker_encode(image: &str, events: &[(EventKey, EventValue)]) -> Result, Box> { + use std::io::Write; + + let mut child = std::process::Command::new("docker") + .args(["run", "-i", "--rm", image, "encode"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn()?; + + // Write events as line-delimited JSON to stdin + { + let stdin = child.stdin.as_mut().ok_or("Failed to open stdin")?; + for (key, value) in events { + let event = serde_json::json!({ + "id": 
key.id, + "type": key.event_type, + "repo": value.repo, + "created_at": value.created_at + }); + writeln!(stdin, "{}", serde_json::to_string(&event)?)?; + } + } + + let output = child.wait_with_output()?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("Encode failed: {}", stderr).into()); + } + + Ok(output.stdout) +} +``` + +**Step 2: Add run_docker_decode function** + +Add this function after `run_docker_encode`: + +```rust +fn run_docker_decode(image: &str, data: &[u8]) -> Result, Box> { + use std::io::Write; + + let mut child = std::process::Command::new("docker") + .args(["run", "-i", "--rm", image, "decode"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn()?; + + { + let stdin = child.stdin.as_mut().ok_or("Failed to open stdin")?; + stdin.write_all(data)?; + } + + let output = child.wait_with_output()?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("Decode failed: {}", stderr).into()); + } + + // Parse line-delimited JSON from stdout + let stdout = String::from_utf8(output.stdout)?; + let mut events = Vec::new(); + + for line in stdout.lines() { + if line.is_empty() { + continue; + } + let raw: RawGitHubEvent = serde_json::from_str(line)?; + let key = EventKey { + event_type: raw.event_type, + id: raw.id, + }; + let value = EventValue { + repo: raw.repo, + created_at: raw.created_at, + }; + events.push((key, value)); + } + + Ok(events) +} +``` + +**Step 3: Run build to verify it compiles** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo build --release 2>&1 | tail -3` +Expected: `Finished` with no errors + +**Step 4: Commit** + +```bash +git add src/main.rs +git commit -m "feat: add docker encode/decode execution functions" +``` + +--- + +## Task 5: Integrate External Codecs into Main Loop + +**Files:** +- 
Modify: `src/main.rs:209-215` (after Rust codec loop, before table close) + +**Step 1: Add external codec execution after Rust codecs** + +Replace the table closing and success message (lines 211-214) with: + +```rust + // Run external Docker codecs if --docker flag is set + if docker_enabled { + let external_codecs = discover_external_codecs(); + for codec_name in external_codecs { + // Skip if filter is set and doesn't match + if let Some(ref filter) = codec_filter { + if !codec_name.to_lowercase().contains(filter) { + continue; + } + } + + // Build the Docker image + if let Err(e) = build_docker_codec(&codec_name) { + println!( + "│ {:<22} │ {:>14} │ {:>10} │", + format!("{} [BUILD FAILED]", codec_name), + "-", + "-" + ); + eprintln!("Build error for {}: {}", codec_name, e); + continue; + } + + // Run encode + let encoded = match run_docker_encode(&codec_name, &events) { + Ok(data) => data, + Err(e) => { + println!( + "│ {:<22} │ {:>14} │ {:>10} │", + format!("{} [ENCODE FAILED]", codec_name), + "-", + "-" + ); + eprintln!("Encode error for {}: {}", codec_name, e); + continue; + } + }; + + print_row(&codec_name, encoded.len(), baseline); + + // Run decode and verify + let decoded = match run_docker_decode(&codec_name, &encoded) { + Ok(data) => data, + Err(e) => { + eprintln!("Decode error for {}: {}", codec_name, e); + continue; + } + }; + + // Sort decoded events for comparison (external codecs may not preserve order) + let mut sorted_decoded = decoded.clone(); + sorted_decoded.sort_by(|a, b| a.0.cmp(&b.0)); + assert_events_eq(&codec_name, &sorted_events, &sorted_decoded); + } + } + + println!("└────────────────────────┴────────────────┴────────────┘"); + println!("\nAll verifications passed"); + + Ok(()) +} +``` + +**Step 2: Run build to verify it compiles** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo build --release 2>&1 | tail -3` +Expected: `Finished` with no errors + +**Step 3: Run harness without 
--docker to verify existing behavior unchanged** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo run --release -- --codec naive 2>&1` +Expected: Shows Naive codec results, "All verifications passed" + +**Step 4: Run harness with --docker (no external codecs yet)** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo run --release -- --docker --codec naive 2>&1` +Expected: Same as above (no external codecs discovered) + +**Step 5: Commit** + +```bash +git add src/main.rs +git commit -m "feat: integrate external Docker codecs into main evaluation loop" +``` + +--- + +## Task 6: Auto-Enable Docker Mode for External Codec Targets + +**Files:** +- Modify: `src/main.rs` (after argument parsing, before loading events) + +**Step 1: Add auto-detection for external codec targets** + +After the argument parsing loop but before `let events = load_events(&path)?;`, add: + +```rust + // Auto-enable docker mode if targeting an external codec + if let Some(ref filter) = codec_filter { + let potential_path = format!("src/{}/Dockerfile", filter); + if std::path::Path::new(&potential_path).exists() { + docker_enabled = true; + } + } +``` + +**Step 2: Run build to verify it compiles** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo build --release 2>&1 | tail -3` +Expected: `Finished` with no errors + +**Step 3: Commit** + +```bash +git add src/main.rs +git commit -m "feat: auto-enable docker mode when targeting external codec" +``` + +--- + +## Task 7: Create Example Python Codec for Testing + +**Files:** +- Create: `src/example-python/Dockerfile` +- Create: `src/example-python/codec.py` + +**Step 1: Create the directory** + +```bash +mkdir -p src/example-python +``` + +**Step 2: Create the Dockerfile** + +Create `src/example-python/Dockerfile`: + +```dockerfile +FROM python:3.11-slim +WORKDIR /app +COPY codec.py . 
+ENTRYPOINT ["python", "codec.py"] +``` + +**Step 3: Create a minimal Python codec** + +Create `src/example-python/codec.py`: + +```python +#!/usr/bin/env python3 +""" +Example external codec for compression-golf. +This is a naive JSON + zlib implementation for testing the harness. +""" +import sys +import json +import zlib + +def encode(): + """Read JSON events from stdin, write compressed bytes to stdout.""" + lines = sys.stdin.read() + compressed = zlib.compress(lines.encode('utf-8'), level=9) + sys.stdout.buffer.write(compressed) + +def decode(): + """Read compressed bytes from stdin, write JSON events to stdout.""" + compressed = sys.stdin.buffer.read() + decompressed = zlib.decompress(compressed) + sys.stdout.write(decompressed.decode('utf-8')) + +if __name__ == '__main__': + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + sys.exit(1) + + command = sys.argv[1] + if command == 'encode': + encode() + elif command == 'decode': + decode() + else: + print(f"Unknown command: {command}", file=sys.stderr) + sys.exit(1) +``` + +**Step 4: Verify Docker is available** + +Run: `docker --version` +Expected: Docker version info (e.g., "Docker version 24.x.x") + +**Step 5: Test the example codec manually** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && docker build -t example-python src/example-python/` +Expected: Successfully builds the image + +**Step 6: Test encode/decode manually with small input** + +Run: `echo '{"id":"1","type":"PushEvent","repo":{"id":123,"name":"test/repo","url":"https://api.github.com/repos/test/repo"},"created_at":"2024-01-01T00:00:00Z"}' | docker run -i --rm example-python encode | docker run -i --rm example-python decode` +Expected: Same JSON line output + +**Step 7: Commit** + +```bash +git add src/example-python/ +git commit -m "feat: add example Python codec for testing external codec support" +``` + +--- + +## Task 8: End-to-End Test with Harness + +**Files:** +- None 
(testing only) + +**Step 1: Run harness with --docker targeting example-python** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo run --release -- --docker --codec example-python 2>&1` +Expected: Shows example-python in results table with size and vs Naive percentage, "All verifications passed" + +**Step 2: Run harness with --docker to show all codecs** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo run --release -- --docker 2>&1` +Expected: Shows all Rust codecs plus example-python, all verifications pass + +**Step 3: Verify Naive still works standalone** + +Run: `cd /Users/jedgington/github/jakedgy/compression-golf/.worktrees/multi-language && cargo run --release -- --codec naive 2>&1` +Expected: Shows Naive results only, no Docker codecs run + +--- + +## Task 9: Update CI Workflow for Docker Support + +**Files:** +- Modify: `.github/workflows/pr.yml` + +**Step 1: Update detection to include Docker codecs** + +Replace the entire pr.yml with: + +```yaml +name: PR Validation + +on: + pull_request: + branches: [main] + +env: + CARGO_TERM_COLOR: always + +jobs: + test: + name: Test Submissions + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Detect changed codecs + id: changed + run: | + # Get list of changed Rust codec files (excluding main.rs, codec.rs, lib.rs) + RUST_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- 'src/*.rs' | grep -vE '(main|codec|lib)\.rs$' || true) + + # Get list of changed Docker codec directories + DOCKER_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- 'src/*/Dockerfile' 'src/*/*.py' 'src/*/*.java' 'src/*/*.c' 'src/*/*.go' | xargs -n1 dirname 2>/dev/null | sort -u | xargs -n1 basename 2>/dev/null || true) + + RUST_CODECS="" + DOCKER_CODECS="" + + if [ -n "$RUST_CHANGES" ]; then + RUST_CODECS=$(echo "$RUST_CHANGES" | xargs -n1 
basename | sed 's/\.rs$//' | tr '\n' ' ') + fi + + if [ -n "$DOCKER_CHANGES" ]; then + DOCKER_CODECS=$(echo "$DOCKER_CHANGES" | tr '\n' ' ') + fi + + echo "Rust codecs: $RUST_CODECS" + echo "Docker codecs: $DOCKER_CODECS" + echo "rust_codecs=$RUST_CODECS" >> $GITHUB_OUTPUT + echo "docker_codecs=$DOCKER_CODECS" >> $GITHUB_OUTPUT + + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + + - name: Decompress dataset + run: gunzip -k data.json.gz + + - name: Build + run: cargo build --release + + - name: Run Rust codec tests + if: steps.changed.outputs.rust_codecs != '' + run: | + for codec in ${{ steps.changed.outputs.rust_codecs }}; do + echo "Testing Rust codec: $codec" + cargo run --release -- --codec "$codec" 2>&1 | tee -a results.txt + done + + - name: Run Docker codec tests + if: steps.changed.outputs.docker_codecs != '' + run: | + for codec in ${{ steps.changed.outputs.docker_codecs }}; do + echo "Testing Docker codec: $codec" + cargo run --release -- --docker --codec "$codec" 2>&1 | tee -a results.txt + done + + - name: Run all codecs (no changes detected) + if: steps.changed.outputs.rust_codecs == '' && steps.changed.outputs.docker_codecs == '' + run: | + echo "No codec changes detected, running all codecs" + cargo run --release -- --docker 2>&1 | tee results.txt + + - name: Check formatting + run: cargo fmt --check +``` + +**Step 2: Commit** + +```bash +git add .github/workflows/pr.yml +git commit -m "feat: update CI workflow to support Docker codec testing" +``` + +--- + +## Task 10: Update README with External Codec Instructions + +**Files:** +- Modify: `README.md` (add section for external codecs) + +**Step 1: Read current README** + +Read the README.md to find the appropriate place to add external codec documentation. 
+ +**Step 2: Add external codec section** + +Add a new section after the existing submission instructions: + +```markdown +## External Codecs (Non-Rust) + +You can submit codecs in any language by creating a Docker container that implements the encode/decode ABI. + +### Directory Structure + +``` +src/<name>-<lang>/ +├── Dockerfile +└── <your codec files> +``` + +### ABI Requirements + +Your container must accept `encode` or `decode` as the first argument: + +```bash +# Encode: JSON events in via stdin, compressed bytes out via stdout +docker run <image> encode < events.json > compressed.bin + +# Decode: Compressed bytes in via stdin, JSON events out via stdout +docker run <image> decode < compressed.bin > events.json +``` + +### Example Dockerfile (Python) + +```dockerfile +FROM python:3.11-slim +WORKDIR /app +COPY codec.py . +ENTRYPOINT ["python", "codec.py"] +``` + +### Testing Locally + +```bash +# Run with Docker support +cargo run --release -- --docker + +# Test specific external codec +cargo run --release -- --codec <name>-<lang> +``` +``` + +**Step 3: Commit** + +```bash +git add README.md +git commit -m "docs: add external codec submission instructions to README" +``` + +--- + +## Task 11: Remove Example Codec (Optional Cleanup) + +**Files:** +- Delete: `src/example-python/` (if you don't want it in the final PR) + +**Step 1: Decide whether to keep example codec** + +The example-python codec is useful for: +- Demonstrating the ABI to contributors +- Testing CI workflow + +If you want to remove it before merging: + +```bash +rm -rf src/example-python +git add -A +git commit -m "chore: remove example codec (served its testing purpose)" +``` + +If you want to keep it as a reference implementation, skip this task. + +--- + +## Summary + +After completing all tasks, the harness will support: + +1. **CLI**: `--docker` flag to enable external codecs +2. **Discovery**: Auto-scans `src/*/Dockerfile` for external codecs +3. **Execution**: Builds and runs containers via stdin/stdout +4.
**Verification**: Same round-trip validation as Rust codecs +5. **CI**: Detects and tests both Rust and Docker codec changes +6. **Documentation**: README explains how to submit external codecs diff --git a/src/example-python/Dockerfile b/src/example-python/Dockerfile new file mode 100644 index 0000000..8f4cbbe --- /dev/null +++ b/src/example-python/Dockerfile @@ -0,0 +1,4 @@ +FROM python:3.11-slim +WORKDIR /app +COPY codec.py . +ENTRYPOINT ["python", "codec.py"] diff --git a/src/example-python/codec.py b/src/example-python/codec.py new file mode 100644 index 0000000..3300d7a --- /dev/null +++ b/src/example-python/codec.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +""" +Example external codec for compression-golf. +This is a naive JSON + zlib implementation for testing the harness. +""" +import sys +import json +import zlib + +def encode(): + """Read JSON events from stdin, write compressed bytes to stdout.""" + lines = sys.stdin.read() + compressed = zlib.compress(lines.encode('utf-8'), level=9) + sys.stdout.buffer.write(compressed) + +def decode(): + """Read compressed bytes from stdin, write JSON events to stdout.""" + compressed = sys.stdin.buffer.read() + decompressed = zlib.decompress(compressed) + sys.stdout.write(decompressed.decode('utf-8')) + +if __name__ == '__main__': + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} <encode|decode>", file=sys.stderr) + sys.exit(1) + + command = sys.argv[1] + if command == 'encode': + encode() + elif command == 'decode': + decode() + else: + print(f"Unknown command: {command}", file=sys.stderr) + sys.exit(1) diff --git a/src/main.rs b/src/main.rs index 814fc5c..154a6c4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -144,11 +144,135 @@ fn assert_events_eq( ); } +/// Returns the sorted names of the directories directly under `src/` that contain a `Dockerfile`. +/// +/// An unreadable `src/` directory (or unreadable entries) yields fewer or no names rather than an error. +fn discover_external_codecs() -> Vec<String> { + let src_path = std::path::Path::new("src"); + let mut codecs = Vec::new(); + + if let Ok(entries) = std::fs::read_dir(src_path) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + let dockerfile = 
path.join("Dockerfile"); + if dockerfile.exists() { + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + codecs.push(name.to_string()); + } + } + } + } + } + + codecs.sort(); + codecs +} + +/// Builds the Docker image for the external codec in `src/<name>`, tagging it `<name>`. +/// +/// Returns an error if `docker` cannot be started or the build exits unsuccessfully; +/// the build's stderr is included in the error message. +fn build_docker_codec(name: &str) -> Result<(), Box<dyn std::error::Error>> { + let path = format!("src/{}", name); + + // `output()` drains the stderr pipe while waiting, so a chatty build cannot + // block on a full pipe and the captured text can be reported on failure. + let output = std::process::Command::new("docker") + .args(["build", "-t", name, &path]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::piped()) + .output()?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("Docker build failed for {}: {}", name, stderr).into()); + } + + Ok(()) +} + +/// Writes `input` to the child's stdin on a helper thread while collecting its output. +/// +/// Writing and reading concurrently avoids a deadlock when the container emits output +/// before it has consumed all of its input. A stdin write error is only reported when +/// the child exited successfully; otherwise the caller reports the child's stderr. +fn feed_docker_stdin( + mut child: std::process::Child, + input: Vec<u8>, +) -> Result<std::process::Output, Box<dyn std::error::Error>> { + use std::io::Write; + + let mut stdin = child.stdin.take().ok_or("Failed to open stdin")?; + // `stdin` is dropped when the thread finishes, which signals EOF to the container. + let writer = std::thread::spawn(move || stdin.write_all(&input)); + + let output = child.wait_with_output()?; + let written = writer.join().map_err(|_| "stdin writer thread panicked")?; + if output.status.success() { + written?; + } + + Ok(output) +} + +/// Runs `docker run -i --rm <image> encode`, sending `events` as line-delimited JSON on stdin. +/// +/// Returns the bytes the container wrote to stdout, or an error carrying the +/// container's stderr if it exits unsuccessfully. +fn run_docker_encode( + image: &str, + events: &[(EventKey, EventValue)], +) -> Result<Vec<u8>, Box<dyn std::error::Error>> { + use std::io::Write; + + // Serialize events as line-delimited JSON before starting the container, + // so a serialization error cannot leave a container running. + let mut input = Vec::new(); + for (key, value) in events { + let event = serde_json::json!({ + "id": key.id, + "type": key.event_type, + "repo": value.repo, + "created_at": value.created_at + }); + writeln!(input, "{}", serde_json::to_string(&event)?)?; + } + + let child = std::process::Command::new("docker") + .args(["run", "-i", "--rm", image, "encode"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn()?; + + let output = feed_docker_stdin(child, input)?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("Encode failed: {}", stderr).into()); + } + + Ok(output.stdout) +} + +/// Runs `docker run -i --rm <image> decode`, sending `data` on stdin. +/// +/// Parses the container's stdout as line-delimited JSON events (empty lines are skipped) +/// and returns them in output order; errors carry the container's stderr on failure. +fn run_docker_decode( + image: &str, + data: &[u8], +) -> Result<Vec<(EventKey, EventValue)>, Box<dyn std::error::Error>> { + let child = std::process::Command::new("docker") + .args(["run", "-i", "--rm", image, "decode"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn()?; + + let output = 
feed_docker_stdin(child, data.to_vec())?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("Decode failed: {}", stderr).into()); + } + + // Parse line-delimited JSON from stdout + let stdout = String::from_utf8(output.stdout)?; + let mut events = Vec::new(); + + for line in stdout.lines() { + if line.is_empty() { + continue; + } + let raw: RawGitHubEvent = serde_json::from_str(line)?; + let key = EventKey { + event_type: raw.event_type, + id: raw.id, + }; + let value = EventValue { + repo: raw.repo, + created_at: raw.created_at, + }; + events.push((key, value)); + } + + Ok(events) +} + fn main() -> Result<(), Box<dyn std::error::Error>> { let args: Vec<String> = std::env::args().collect(); let mut path = "data.json".to_string(); let mut codec_filter: Option<String> = None; + let mut docker_enabled = false; let mut i = 1; while i < args.len() { @@ -159,6 +283,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { i += 1; } } + "--docker" => { + docker_enabled = true; + } arg if !arg.starts_with('-') => { path = arg.to_string(); } @@ -166,6 +293,15 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { } i += 1; } + + // Auto-enable docker mode if targeting an external codec + if let Some(ref filter) = codec_filter { + let potential_path = format!("src/{}/Dockerfile", filter); + if std::path::Path::new(&potential_path).exists() { + docker_enabled = true; + } + } + let events = load_events(&path)?; println!("Loaded {} events from {}\n", events.len(), path); @@ -208,6 +344,62 @@ fn main() -> Result<(), Box<dyn std::error::Error>> { assert_events_eq(codec.name(), expected, &decoded); } + // Count external codec failures so the run cannot report success after one + let mut external_failures = 0usize; + + // Run external Docker codecs if --docker flag is set + if docker_enabled { + let external_codecs = discover_external_codecs(); + for codec_name in external_codecs { + // Skip if filter is set and doesn't match (case-insensitive on both sides) + if let Some(ref filter) = codec_filter { + if !codec_name.to_lowercase().contains(&filter.to_lowercase()) { + continue; + } + } + + // Build the Docker image + if let Err(e) = build_docker_codec(&codec_name) { + println!( + "│ {:<22} │ {:>14} │ 
{:>10} │", + format!("{} [BUILD FAILED]", codec_name), + "-", + "-" + ); + eprintln!("Build error for {}: {}", codec_name, e); + external_failures += 1; + continue; + } + + // Run encode + let encoded = match run_docker_encode(&codec_name, &events) { + Ok(data) => data, + Err(e) => { + println!( + "│ {:<22} │ {:>14} │ {:>10} │", + format!("{} [ENCODE FAILED]", codec_name), + "-", + "-" + ); + eprintln!("Encode error for {}: {}", codec_name, e); + external_failures += 1; + continue; + } + }; + + print_row(&codec_name, encoded.len(), baseline); + + // Run decode and verify + let decoded = match run_docker_decode(&codec_name, &encoded) { + Ok(data) => data, + Err(e) => { + eprintln!("Decode error for {}: {}", codec_name, e); + external_failures += 1; + continue; + } + }; + + // Sort decoded events for comparison (external codecs may not preserve order) + let mut sorted_decoded = decoded; + sorted_decoded.sort_by(|a, b| a.0.cmp(&b.0)); + assert_events_eq(&codec_name, &sorted_events, &sorted_decoded); + } + } + println!("└────────────────────────┴────────────────┴────────────┘"); + + // Keep testing the remaining codecs after a failure, but fail the run as a whole + if external_failures > 0 { + return Err(format!("{} external codec(s) failed", external_failures).into()); + } println!("\nAll verifications passed");