diff --git a/README.md b/README.md
index 13ce77d..dacc549 100644
--- a/README.md
+++ b/README.md
@@ -1035,7 +1035,7 @@ Options:
   --complex-model TEXT    Model for complex prompts
   --models TEXT           Comma-separated model list (legacy)
   --token TEXT            Auth token
-  --optimize [off|safe|aggressive]  Context optimization mode (default: off)
+  --optimize [off|safe|aggressive|progressive]  Context compression: off | safe | aggressive | progressive (default: off)
   --verbose               Enable debug logging
   --log-raw               Log full raw requests and responses to JSONL
 ```
@@ -1420,8 +1420,14 @@ Auth is disabled by default (local-only). Set `NADIRCLAW_AUTH_TOKEN` to require
 | `NADIRCLAW_CONFIDENCE_THRESHOLD` | `0.06` | Classification threshold (lower = more complex) |
 | `NADIRCLAW_PORT` | `8856` | Server port |
 | `NADIRCLAW_LOG_DIR` | `~/.nadirclaw/logs` | Log directory |
-| `NADIRCLAW_OPTIMIZE` | `off` | Context optimization mode: `off`, `safe` (lossless), `aggressive` (future) |
+| `NADIRCLAW_OPTIMIZE` | `off` | Context compression: `off` (disabled), `safe` (lossless), `aggressive`, or `progressive` (staged ladder that escalates to Headroom). `off` is the master on/off switch |
 | `NADIRCLAW_OPTIMIZE_MAX_TURNS` | `40` | Max conversation turns to keep when trimming history |
+| `NADIRCLAW_OPTIMIZE_BACKEND` | `native` | Optimizer backend: `native` (built-in) or `headroom` (needs `pip install nadirclaw[headroom]`; falls back to native if absent). See [savings analysis](docs/context-optimize-savings.md#backends-native-default-vs-headroom) |
+| `NADIRCLAW_HEADROOM_KOMPRESS` | `off` | When backend is `headroom`, enable Kompress ML text compression (downloads a HuggingFace model on first use) |
+| `NADIRCLAW_OPTIMIZE_PROGRESSIVE` | `off` | Legacy alias for `NADIRCLAW_OPTIMIZE=progressive` — forces the [progressive ladder](docs/context-optimize-savings.md#progressive-staged-compression) regardless of mode. Prefer setting `NADIRCLAW_OPTIMIZE=progressive` |
+| `NADIRCLAW_OPTIMIZE_TARGET_TOKENS` | _(unset)_ | Token budget for progressive compression (e.g. the model's context window). Unset → native stages only |
+| `NADIRCLAW_OPTIMIZE_MAX_STAGE` | `headroom_structural` | Cap on the progressive ladder: `native_safe`, `native_aggressive`, `headroom_structural`, or `headroom_ml` |
+| `NADIRCLAW_OPTIMIZE_ALLOW_LOSSY` | `off` | Permit the lossy ML prose stage (`headroom_ml`) in progressive compression |
 | `NADIRCLAW_LOG_RAW` | `false` | Log full raw requests and responses (`true`/`false`) |
 | `NADIRCLAW_MODELS` | `openai-codex/gpt-5.3-codex,gemini-3-flash-preview` | Legacy model list (fallback if tier vars not set) |
 | `OTEL_EXPORTER_OTLP_ENDPOINT` | *(empty — disabled)* | OpenTelemetry collector endpoint (enables tracing) |
diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md
new file mode 100644
index 0000000..1caac76
--- /dev/null
+++ b/THIRD_PARTY_NOTICES.md
@@ -0,0 +1,19 @@
+# Third-Party Notices
+
+NadirClaw is MIT-licensed. It can optionally use the following third-party
+components, declared as opt-in extras. Their licenses and attributions are
+reproduced here.
+
+## headroom-ai
+
+- **Used by:** the optional `headroom` optimizer backend
+  (`NADIRCLAW_OPTIMIZE_BACKEND=headroom`), installed via `pip install nadirclaw[headroom]`.
+- **Project:** Headroom — https://github.com/chopratejas/headroom
+- **License:** Apache License 2.0
+- **NOTICE:** Headroom, Copyright 2025 Headroom Contributors.
+
+NadirClaw integrates Headroom only through its public Python API
+(`headroom.compress`); no Headroom source code is copied or vendored into this
+project. A full copy of the Apache License 2.0 is available at
+https://www.apache.org/licenses/LICENSE-2.0 and is distributed with the
+`headroom-ai` package when installed.
diff --git a/benchmarks/optimize_real_data.py b/benchmarks/optimize_real_data.py
new file mode 100644
index 0000000..a6910d7
--- /dev/null
+++ b/benchmarks/optimize_real_data.py
@@ -0,0 +1,121 @@
+"""Real-data benchmark: optimizer backends on public coding + chat datasets.
+
+- Chat: allenai/WildChat-1M (real multi-turn user<->assistant conversations)
+- Coding/tools: glaiveai/glaive-function-calling-v2 (tool schemas + function calls + JSON)
+
+Compares: native-safe (lossless, ships today), Pro-aggressive (native ceiling),
+headroom (new opt-in backend). Single tiktoken estimator for all => fair.
+"""
+import json, os, re, sys, time, collections, urllib.request
+
+# Resolve the NadirClaw repo root from this file, and the sibling Nadir package.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_NADIRCLAW = os.path.dirname(_HERE)
+sys.path.insert(0, _NADIRCLAW)
+_NADIR = os.path.join(os.path.dirname(_NADIRCLAW), "Nadir")
+if os.path.isdir(_NADIR):
+    sys.path.insert(0, _NADIR)
+
+import nadirclaw.optimize as claw
+try:
+    import nadir.optimize as pro
+except Exception:                       # Nadir Pro not on path — fall back to native
+    pro = claw
+
+est = claw._estimate_tokens_messages
+
+N = 200          # conversations per dataset
+CACHE = os.environ.get("BENCH_CACHE_DIR", "/tmp")
+
+
+def _fetch(dataset, config, split, dest, total=N):
+    """Fetch rows from the HF datasets-server (no full dataset download). Cached to disk."""
+    if os.path.exists(dest):
+        return
+    rows = []
+    for off in range(0, total, 100):
+        url = (f"https://datasets-server.huggingface.co/rows?dataset={dataset}"
+               f"&config={config}&split={split}&offset={off}&length=100")
+        for _ in range(3):
+            try:
+                with urllib.request.urlopen(url, timeout=40) as r:
+                    rows += [x["row"] for x in json.load(r).get("rows", [])]
+                break
+            except Exception:
+                time.sleep(2)
+    json.dump(rows, open(dest, "w"))
+
+
+# Cache file locations under CACHE, followed by the import-time dataset fetch.
+# `_fetch` returns immediately when its cache file already exists, so only the
+# first run touches the network.
+_WILDCHAT = os.path.join(CACHE, "ds_wildchat.json")
+_GLAIVE = os.path.join(CACHE, "ds_glaive.json")
+_fetch("allenai/WildChat-1M", "default", "train", _WILDCHAT)
+_fetch("glaiveai/glaive-function-calling-v2", "default", "train", _GLAIVE)
+
+
+def load_wildchat():
+    rows = json.load(open(_WILDCHAT))[:N]
+    convs = []
+    for r in rows:
+        msgs = [{"role": t.get("role", "user"), "content": t.get("content") or ""}
+                for t in (r.get("conversation") or []) if isinstance(t, dict)]
+        msgs = [m for m in msgs if isinstance(m["content"], str) and m["content"]]
+        if len(msgs) >= 2:
+            convs.append(msgs)
+    return convs
+
+
+def load_glaive():
+    rows = json.load(open(_GLAIVE))[:N]
+    convs = []
+    marker = re.compile(r"(USER:|ASSISTANT:|FUNCTION RESPONSE:)", re.I)
+    rolemap = {"USER": "user", "ASSISTANT": "assistant", "FUNCTION RESPONSE": "tool"}
+    for r in rows:
+        sysm = (r.get("system") or "").strip()
+        if sysm.upper().startswith("SYSTEM:"):
+            sysm = sysm[7:].strip()
+        msgs = [{"role": "system", "content": sysm}] if sysm else []
+        chat = r.get("chat") or ""
+        parts = marker.split(chat)
+        # parts: ['', 'USER:', ' ...', 'ASSISTANT:', ' ...', ...]
+        i = 1
+        while i < len(parts) - 0:
+            lab = parts[i].rstrip(":").upper()
+            content = parts[i + 1].strip() if i + 1 < len(parts) else ""
+            if lab in rolemap and content:
+                msgs.append({"role": rolemap[lab], "content": content})
+            i += 2
+        if len(msgs) >= 2:
+            convs.append(msgs)
+    return convs
+
+
+def bench(convs, runners):
+    out = {name: [0, 0] for name in runners}          # name -> [orig, after]
+    transforms = {name: collections.Counter() for name in runners}
+    for msgs in convs:
+        for name, fn in runners.items():
+            r = fn([{**m} for m in msgs])
+            out[name][0] += r.original_tokens
+            out[name][1] += r.optimized_tokens
+            for t in r.optimizations_applied:
+                transforms[name][t.split(":")[1] if t.startswith("headroom:") else t] += 1
+    return out, transforms
+
+
+# Backends under comparison. Each runner takes a message list and returns that
+# backend's optimizer result. "pro-aggressive" goes through `pro`, which is the
+# plain NadirClaw optimizer whenever Nadir Pro could not be imported above.
+RUNNERS = {
+    "native-safe":    lambda m: claw.optimize_messages(m, mode="safe", backend="native"),
+    "pro-aggressive": lambda m: pro.optimize_messages(m, mode="aggressive", backend="native"),
+    "headroom":       lambda m: claw.optimize_messages(m, mode="safe", backend="headroom"),
+}
+
+# Run every backend over each dataset and print one summary table per dataset.
+for label, loader in [("CHAT — WildChat-1M", load_wildchat), ("CODING/TOOLS — glaive-function-calling-v2", load_glaive)]:
+    convs = loader()
+    t0 = time.time()
+    res, tf = bench(convs, RUNNERS)
+    # The raw-token figure in the header is native-safe's pre-optimization total.
+    base = res["native-safe"][0]
+    print(f"\n### {label}  ({len(convs)} conversations, {base:,} raw tokens, {time.time()-t0:.0f}s)")
+    print(f"{'backend':<18}{'after':>10}{'saved':>9}{'%':>7}   top transforms")
+    for name in RUNNERS:
+        o, a = res[name]
+        top = ", ".join(f"{k}:{v}" for k, v in tf[name].most_common(4))
+        # max(1, o) guards the percentage against a zero-token dataset.
+        print(f"{name:<18}{a:>10,}{o-a:>9,}{100*(o-a)/max(1,o):>6.1f}%   {top}")
diff --git a/docs/context-optimize-savings.md b/docs/context-optimize-savings.md
index 26bc710..13447ff 100644
--- a/docs/context-optimize-savings.md
+++ b/docs/context-optimize-savings.md
@@ -37,6 +37,7 @@ Combined with smart routing, NadirClaw now saves in two ways:
 - **Tool schema deduplication** — Agent frameworks often re-send the full tool schema with every turn. NadirClaw keeps the first occurrence and replaces repeats with a short reference.
 - **Chat history trimming** — Long conversations accumulate tokens that are far from the current task. Trimming to recent turns (default: 40) keeps context relevant and cheap.
 - **Whitespace normalization** — Log dumps, stack traces, and verbose output contain runs of blank lines and spaces that carry no semantic value.
+- **Columnar JSON-array packing** (`json_array_pack`, aggressive mode) — Large arrays of same-keyed objects (DB query results, API list responses, large tool outputs) repeat every key on every row. Packing them into a header (`⟦cols=[...]⟧`) plus one value-array per row emits each key once. Information-lossless and deterministically reversible, but not byte-identical JSON, so it runs in **aggressive** mode only. On a 100-row homogeneous array this reaches ~68% vs pretty-printed JSON (vs ~45% for `json_minify` alone).
 
 ## Projected Monthly Savings (Opus 4.6)
 
@@ -56,6 +57,9 @@ All safe-mode transforms are deterministic and lossless:
 
 - JSON values roundtrip exactly (parse + compact re-serialize)
 - Code blocks inside fences (```) are never modified
+- **Leading indentation is preserved**, so raw (unfenced) source code — e.g. file-read
+  tool outputs — stays syntactically valid. Whitespace normalization only collapses
+  *interior* multi-spaces and excess blank lines, never indentation.
 - URLs are preserved character-for-character
 - Unicode and emoji roundtrip correctly
 - Deeply nested structures are handled without data loss
@@ -76,3 +80,100 @@ NADIRCLAW_OPTIMIZE=safe nadirclaw serve
 # Dry-run on a file
 nadirclaw optimize payload.json --mode safe --format json
 ```
+
+## Backends: native (default) vs headroom
+
+The optimizer has a pluggable backend, selected independently of the `off|safe|aggressive`
+mode. The mode still decides *how hard* to compress; the backend decides *who* runs it.
+
+| Backend | Default | Engine | Extra capabilities |
+|---|---|---|---|
+| `native` | ✅ | Built-in stdlib pipeline (this document) | None — pure Python, no extra deps |
+| `headroom` | opt-in | [Headroom](https://github.com/chopratejas/headroom) (Apache-2.0) | Statistical JSON-array crushing (SmartCrusher), AST-aware code compression, content-type routing |
+
+`headroom` delegates to the optional [`headroom-ai`](https://pypi.org/project/headroom-ai/)
+package. It ships **installed by default with Nadir Pro** but stays **inactive** until you
+select it. In open-source NadirClaw it is an opt-in extra:
+
+```bash
+pip install "nadirclaw[headroom]"
+```
+
+Activate it:
+
+```bash
+# Server-wide
+NADIRCLAW_OPTIMIZE=safe NADIRCLAW_OPTIMIZE_BACKEND=headroom nadirclaw serve
+
+# Per-request override (in the request body)
+{"model": "auto", "optimize": "safe", "optimize_backend": "headroom", "messages": [...]}
+```
+
+Safety and fallback:
+
+- If `headroom-ai` is not installed (or raises), the optimizer **transparently falls back
+  to `native`** and logs a one-time warning. Requests never fail because of the backend.
+- Token-savings metrics are always recomputed with NadirClaw's own estimator, so reported
+  numbers stay consistent across backends (Savings/Billing math is unaffected).
+- Headroom's ML text compressor (Kompress) downloads a HuggingFace model on first use, so
+  it is kept **disabled** by default. Opt in with `NADIRCLAW_HEADROOM_KOMPRESS=on`.
+- The fastest Headroom compressors (SmartCrusher etc.) are a compiled Rust extension bundled
+  in the prebuilt wheels. On source installs without the wheel they simply don't run, and
+  Headroom fails open — output is still correct, just less compressed.
+
+Attribution for the Apache-2.0 dependency lives in
+[`THIRD_PARTY_NOTICES.md`](../THIRD_PARTY_NOTICES.md).
+
+## Progressive (staged) compression
+
+`compress_progressive()` escalates through compression stages and **stops as soon as a
+token budget is met** — so you only pay the cost (and fidelity risk) of heavier compression
+when lighter stages aren't enough. Headroom is wired in as the middle/late tiers.
+
+The ladder, cheapest/safest first:
+
+| Stage | What runs | Loss | Needs |
+|---|---|---|---|
+| 1. `native_safe` | system/tool dedup, json minify, whitespace | lossless | — |
+| 2. `native_aggressive` | + columnar packing, semantic dedup, Pro transforms | lossless-to-semantic | — |
+| 3. `headroom_structural` | Headroom content compressors (SmartCrusher, LogCompressor, …) | high-fidelity | `headroom-ai` |
+| 4. `headroom_ml` | Headroom Kompress (ML token-dropping on prose) | lossy | `headroom-ai` + `allow_lossy` |
+
+Rules:
+
+- With **no `target_tokens`**, the ladder stops after `native_aggressive` — Headroom and the
+  lossy ML stage are never reached. Default behaviour stays dependency-free and lossless.
+- The Headroom stages are **skipped silently** when `headroom-ai` is not installed.
+- `headroom_ml` (lossy) only runs when `allow_lossy=True`.
+- Chat-history trimming always runs last as a final backstop.
+
+```python
+from nadirclaw.optimize import compress_progressive   # or nadir.optimize for Pro
+
+result = compress_progressive(
+    messages,
+    target_tokens=180_000,     # e.g. the model's context window
+    allow_lossy=False,         # set True to permit the lossy ML stage
+    max_stage="headroom_structural",
+)
+# result.optimizations_applied is prefixed with stage:<name> markers that ran
+```
+
+Enable it on the server — `progressive` is just a value of the single `optimize`
+control, alongside `off` / `safe` / `aggressive`:
+
+```bash
+# off | safe | aggressive | progressive  (off = compression disabled)
+NADIRCLAW_OPTIMIZE=progressive \
+NADIRCLAW_OPTIMIZE_TARGET_TOKENS=180000 \
+NADIRCLAW_OPTIMIZE_MAX_STAGE=headroom_structural \
+nadirclaw serve
+
+# equivalently: nadirclaw serve --optimize progressive
+# per-request:  {"optimize": "progressive", "messages": [...]}
+# turn compression off:  {"optimize": "off", ...}
+```
+
+On a logs+prose payload where native compression yields ~0%, escalating to
+`headroom_structural` reached ~90% — the escalation only spends the Headroom budget when
+native genuinely can't deliver.
diff --git a/docs/optimizer-benchmark-2026-06.md b/docs/optimizer-benchmark-2026-06.md
new file mode 100644
index 0000000..7c05f02
--- /dev/null
+++ b/docs/optimizer-benchmark-2026-06.md
@@ -0,0 +1,166 @@
+# Context Optimizer Benchmark — native vs Headroom, synthetic vs real data
+
+**Date:** 2026-06-06
+**Scope:** Evaluate the context optimizer (`nadirclaw.optimize` / `nadir.optimize`) across
+backends (`native`, `headroom`) and modes (`safe`, `aggressive`) on both synthetic
+payloads and real public coding + chat datasets. Establish whether Headroom improves
+on the native pipeline, and where the realistic savings ceiling is.
+
+## TL;DR
+
+- **On real conversational traffic, lossless savings are 1–10%, not the 30–60% synthetic
+  payloads suggest.** Real traffic is prose-dominated; structural optimizers (JSON minify,
+  schema dedup, whitespace) have little to grab.
+- **The `headroom` *library `compress()` wrapper* underdelivers** (it routes conservatively and
+  protects messages). But **Headroom's transforms, called directly "as is", do reproduce their
+  published numbers** — LogCompressor ~80% on repetitive logs, SmartCrusher ~73% on homogeneous
+  JSON arrays. The earlier "Headroom underperforms" verdict was about the wrapper, not the engine.
+- **One real capability gap was found and closed natively:** SmartCrusher packs homogeneous JSON
+  arrays into a columnar table (~50% beyond our `json_minify`). We now do this losslessly in
+  `aggressive` mode via the new `json_array_pack` transform (68% vs pretty JSON, vs Headroom's 73%),
+  with no dependency and no CCR machinery.
+- **Pro-aggressive is the best performer** everywhere — all native, no new dependencies.
+- **Decision:** keep `native` the default, `headroom` a safe opt-in (already shipped this way).
+  Remaining gains require prose compression (lossy ML + a CCR recovery path that is not built).
+
+## Method
+
+- **Backends compared:** `native-safe` (lossless, ships today), `pro-aggressive` (native
+  ceiling: + secret-mask, tool-schema compaction, log/stack compression, semantic dedup),
+  `headroom` (optional `headroom-ai` backend, Kompress disabled unless noted).
+- **Token metric:** the optimizer's own tiktoken `cl100k_base` estimator, applied identically
+  to every backend's output, so comparisons are fair regardless of each engine's internal count.
+- **Datasets (public, no PII):**
+  - Chat: `allenai/WildChat-1M` — real multi-turn user↔assistant conversations.
+  - Coding/tools: `glaiveai/glaive-function-calling-v2` — tool schemas + function calls + JSON.
+  - 200 conversations each, fetched via the HF datasets-server `/rows` API (no full download).
+- **Environment note:** `headroom-ai` (Rust/PyO3 ≤ 3.13) **cannot build on Python 3.14**.
+  Benchmarks ran on a Python 3.13 venv with the prebuilt wheel. getnadir/Nadir target 3.12, so
+  this is fine in production, but 3.13 is the current ceiling for the Headroom dependency.
+
+## Results — synthetic payloads
+
+Hand-built "bloated" payloads (repeated tool schemas, pretty-printed JSON arrays, log dumps):
+
+| Backend | total reduction | notes |
+|---|--:|---|
+| native-safe | **30.4%** | lossless; strong on repeated tool schemas |
+| headroom | 25.2% | worse — no cross-message schema dedup; lossy crush never fired |
+| pro-aggressive | **60.3%** | `pattern_compression` took 200 log lines 0% → 87% |
+
+## Results — real data (the important part)
+
+| | raw tokens | native-safe | pro-aggressive | headroom |
+|---|--:|--:|--:|--:|
+| **Chat** (WildChat-1M, 195 convs) | 191,101 | **1.0%** | **4.7%** | 0.1% |
+| **Coding/tools** (glaive, 200 convs) | 111,697 | **8.2%** | **9.6%** | 2.6% |
+
+Transform frequency (real data):
+- Chat: `whitespace_normalize` 37×, `semantic_dedup` 20× (the only real lift), `json_minify` 7×.
+- Coding: `json_minify` 135× (the workhorse), `tool_schema_compact` 53×, `tool_schema_dedup` **0×**
+  (real requests carry each schema once, not repeated across turns).
+
+## Diagnosis — why real savings are low
+
+Token mass is **prose-dominated**, and structural optimizers cannot compress prose:
+
+- **Chat is ~100% natural language.** Native has almost nothing to grab (1%). `semantic_dedup`
+  is the only lever that moved it (→ 4.7%).
+- **Coding token distribution (glaive, by role):** assistant **67.7k (60%)**, system 22.3k (20%,
+  the tool schemas), user 17.3k, tool 4.8k. The compressible part is the 20% of JSON schema;
+  the 60% assistant prose is untouchable by structural methods.
+- **Lossless levers are exhausted:**
+  - Fenced JSON (the minifier skips code fences): **0** minifiable blocks in chat, **1** in glaive.
+    The 66 chat code-fences are *code*, not JSON.
+  - Verbatim block repetition across turns (≥80-char lines repeated in an earlier message):
+    **2.5%** chat / **0.5%** coding — the largest remaining lossless lever, and still small.
+
+## Headroom findings (tested two ways)
+
+**Via the library `compress()` wrapper (what our backend integrates):** underdelivers.
+SmartCrusher and CodeCompressor never engaged at any `target_ratio` — the wrapper routes
+conservatively and protects user/recent messages, so the heavy transforms rarely fire on real
+message content. This is why our backend benchmark was low.
+
+**Calling the transforms directly ("as is"):** the published per-type numbers reproduce.
+
+| Transform (direct call) | content | result | notes |
+|---|---|--:|---|
+| `LogCompressor.compress` | 200 repetitive log lines | **79.7%** (lossy) | matches their 80–95% claim; our Pro `pattern_compression` does 87% on the same logs |
+| `SmartCrusher.crush` | 100-row homogeneous JSON array | **73% vs pretty** (lossless table) | columnar format; this is the one capability we lacked |
+| `SmartCrusher.crush` | 50 *unique* (non-redundant) objects | ~47% (lossless) | falls back to ≈ our `json_minify` when rows aren't homogeneous |
+| `CodeAwareCompressor.compress` | Python source | **broke** (AST bytes bug → `syntax_valid=False`) | not usable in this build |
+
+Why their headline % looks bigger than ours: it is measured against **pretty-printed** JSON, and
+on **ideal redundant content** (homogeneous arrays, repetitive logs). Real conversational traffic
+is prose-dominated and we already minify, so the marginal win is smaller — except the columnar
+table, which is genuinely additive (see below).
+
+**Kompress (ML token-dropping)** is the one place Headroom wins on *prose*: ~12% on unique prose
+(native: 0%), ~60% on repetitive boilerplate. But it is lossy (drops function words, fuses
+sentences) and emits a `[... Retrieve more: hash=...]` marker that is **unrecoverable in our
+wiring** (no `headroom_retrieve` endpoint). It stays `disabled` by default.
+
+## Native columnar packing (`json_array_pack`) — the capability we adopted
+
+The only reproducible Headroom win we lacked was SmartCrusher's columnar table for homogeneous
+JSON arrays. We now do it natively in `aggressive` mode:
+
+| 100-row homogeneous array | tokens | vs pretty |
+|---|--:|--:|
+| pretty JSON | 4,202 | — |
+| `json_minify` (safe) | 2,302 | 45% |
+| **`json_array_pack` (aggressive)** | **1,323** | **68%** |
+| Headroom SmartCrusher | 1,119 | 73% |
+
+It rewrites an array of same-keyed objects into a header (`⟦cols=[...]⟧`) plus one JSON
+value-array per row, emitting each key once instead of N times. It is **information-lossless and
+deterministically reversible** (`_unpack_table`), runs only when the array is strictly homogeneous
+(≥ 5 rows, identical key sets) and only when it saves tokens, and **never runs in `safe` mode**
+(it is not byte-identical JSON). The 5pp gap to SmartCrusher comes down to format: SmartCrusher
+emits bare CSV rows, whereas we keep JSON-array rows so reversibility is robust across
+nested/special values.
+
+**Caveat:** the public chat/coding datasets above barely contain homogeneous arrays
+(`json_array_pack` fired once on glaive), so this does not move those totals. It targets
+tool-output traffic — DB query results, API list responses, large `get_*` tool returns — which
+production agent loops carry but these datasets do not.
+
+## Code safety (correctness fix)
+
+Testing the optimizer on raw source code surfaced a real bug: `whitespace_normalize`
+collapsed the **leading indentation** of unfenced code (file-read tool outputs), flattening
+nested Python into **invalid syntax** while reporting ~12–14% "savings". That apparent
+code compression was the corruption — not real savings.
+
+Fixed across all three optimizer copies (NadirClaw; Nadir Pro, which inherits it; and
+getnadir): the normalizer now preserves leading whitespace and only collapses interior
+multi-spaces.
+Regression tests assert raw code stays `ast.parse`-valid in both safe and aggressive modes.
+
+Corrected takeaway: honest lossless savings on clean source code are **~0%** — structural
+optimization has nothing safe to remove from well-formatted code. (Headroom's CodeCompressor
+is the only engine that targets code, and it errored to invalid output in this build.)
+
+## Recommendations
+
+1. **Keep the shipped posture.** Native default, headroom opt-in, Kompress off. Validated correct.
+2. **For chat-heavy traffic, `aggressive` mode is the lever** — `semantic_dedup` is the only thing
+   that moves prose, lossless-ish, already available in NadirClaw and Pro.
+3. **Optional small lossless win (deferred):** a verbatim block-dedup transform would add ~2.5% on
+   chat. Modest; weigh against the readability cost of inline reference markers.
+4. **Large prose savings require investment:** ML token compression (Kompress) behind a real
+   CCR `headroom_retrieve` recovery endpoint. Only worth it if prose-heavy traffic dominates the
+   bill. Not built; lossy without it.
+
+## Reproduction
+
+```bash
+# Headroom needs Python <= 3.13 (Rust/PyO3). Build a 3.13 venv:
+python3.13 -m venv /tmp/hr-bench
+/tmp/hr-bench/bin/pip install "headroom-ai>=0.23.0" tiktoken sentence-transformers
+# Real-data benchmark (fetches 200 convs each from WildChat + glaive via HF datasets-server):
+/tmp/hr-bench/bin/python NadirClaw/benchmarks/optimize_real_data.py
+```
+
+The benchmark script lives at [`benchmarks/optimize_real_data.py`](../benchmarks/optimize_real_data.py).
+See also [context-optimize-savings.md](context-optimize-savings.md) for the transform-level
+savings analysis and the `native` vs `headroom` backend reference.
diff --git a/nadirclaw/__init__.py b/nadirclaw/__init__.py
index 9a92fda..f887770 100644
--- a/nadirclaw/__init__.py
+++ b/nadirclaw/__init__.py
@@ -1,3 +1,3 @@
 """NadirClaw — Open-source LLM router."""
 
-__version__ = "0.19.1"
+__version__ = "0.19.2"
diff --git a/nadirclaw/ccr.py b/nadirclaw/ccr.py
new file mode 100644
index 0000000..c510a6b
--- /dev/null
+++ b/nadirclaw/ccr.py
@@ -0,0 +1,161 @@
+"""CCR — Compress-Cache-Retrieve fetch-back loop (native, deterministic).
+
+The biggest prompt-token win is *offloading* large message content out of the
+prompt and letting the model pull it back on demand. Headroom does this, but its
+store is driven by a ContextVar written from a worker thread with a
+non-deterministic key — reliable only inside their own proxy. So NadirClaw owns
+the loop natively:
+
+  1. ``offload_messages`` moves oversized content into a ``{hash: original}`` map
+     we control, leaving a short preview + a ``nadir_retrieve(hash=...)`` marker.
+  2. ``retrieve_tool_def`` is injected so the model knows it can fetch the rest.
+  3. ``resolve_loop`` intercepts the model's retrieve calls, serves the exact
+     original from the map, and continues — so nothing is ever lost.
+
+This composes with the optimizer's lossless transforms (the inline content still
+gets compressed); offload is the last, most aggressive tier and is fully
+reversible because we keep every original byte.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+
+from nadirclaw.optimize import _estimate_tokens_str
+
+RETRIEVE_TOOL_NAME = "nadir_retrieve"
+
+# Default: offload non-user messages whose content exceeds this many tokens.
+DEFAULT_MIN_OFFLOAD_TOKENS = 400
+_OFFLOAD_ROLES = ("tool", "system", "assistant", "function")
+_MARKER_RE = re.compile(r'hash=[\'"]?([a-f0-9]{8,})')
+
+
+def retrieve_tool_def() -> dict:
+    """OpenAI-format function tool the model calls to fetch offloaded content."""
+    return {
+        "type": "function",
+        "function": {
+            "name": RETRIEVE_TOOL_NAME,
+            "description": (
+                "Retrieve the full original content that was offloaded from the "
+                "prompt to save tokens. Pass the hash from an offload marker like "
+                "'[... offloaded ... retrieve full content with nadir_retrieve(hash=\"abc123\")]'."
+            ),
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "hash": {"type": "string", "description": "Hash from the offload marker."}
+                },
+                "required": ["hash"],
+            },
+        },
+    }
+
+
+def _hash(content: str) -> str:
+    return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
+
+
+def offload_messages(
+    messages: list[dict],
+    *,
+    min_tokens: int = DEFAULT_MIN_OFFLOAD_TOKENS,
+    roles: tuple = _OFFLOAD_ROLES,
+):
+    """Move oversized message content into a captured map, leaving a retrieve marker.
+
+    The user's turns are never offloaded. Returns ``(messages, captured, hashes)``
+    where ``captured[hash]`` is the exact original content.
+    """
+    captured: dict[str, str] = {}
+    hashes: list[str] = []
+    out: list[dict] = []
+    for m in messages:
+        content = m.get("content")
+        if (
+            isinstance(content, str)
+            and m.get("role") in roles
+            and _estimate_tokens_str(content) >= min_tokens
+        ):
+            h = _hash(content)
+            captured[h] = content
+            preview = re.sub(r"\s+", " ", content[:140]).strip()
+            tokens = _estimate_tokens_str(content)
+            out.append({
+                **m,
+                "content": (
+                    f"[{tokens} tokens offloaded to save context — preview: {preview}… — "
+                    f'retrieve the full content with {RETRIEVE_TOOL_NAME}(hash="{h}")]'
+                ),
+            })
+            if h not in hashes:
+                hashes.append(h)
+        else:
+            out.append(m)
+    return out, captured, hashes
+
+
+def resolve(captured: dict, hash_key: str) -> "str | None":
+    """Resolve an offload hash to its exact original content."""
+    if not captured or not hash_key:
+        return None
+    return captured.get(hash_key)
+
+
+def marker_hashes(messages: list[dict]) -> list[str]:
+    """Return offload hashes referenced by markers in *messages* (order-preserving)."""
+    seen: list[str] = []
+    for h in _MARKER_RE.findall(json.dumps(messages)):
+        if h not in seen:
+            seen.append(h)
+    return seen
+
+
+def extract_retrieve_calls(response: dict) -> list[tuple[str, str]]:
+    """Parse an OpenAI-format response for ``nadir_retrieve`` tool calls -> [(id, hash)]."""
+    calls: list[tuple[str, str]] = []
+    for choice in (response or {}).get("choices", []):
+        message = choice.get("message", {}) or {}
+        for tc in message.get("tool_calls", []) or []:
+            fn = tc.get("function", {}) or {}
+            if fn.get("name") != RETRIEVE_TOOL_NAME:
+                continue
+            args = fn.get("arguments")
+            if isinstance(args, str):
+                try:
+                    args = json.loads(args)
+                except Exception:
+                    args = {}
+            calls.append((tc.get("id", ""), (args or {}).get("hash", "")))
+    return calls
+
+
+def resolve_loop(messages, first_response, captured, call_llm, *, max_rounds: int = 3):
+    """Server-side fetch-back loop.
+
+    While the model asks for offloaded content, resolve each hash from *captured*,
+    append the originals as tool messages, and re-invoke ``call_llm(messages)``
+    until the model returns a final (non-retrieval) answer or *max_rounds* is hit.
+    ``call_llm`` is injected so this is testable without a live provider.
+    Returns ``(final_response, full_conversation)``.
+    """
+    convo = list(messages)
+    response = first_response
+    for _ in range(max_rounds):
+        calls = extract_retrieve_calls(response)
+        if not calls:
+            return response, convo
+        convo = convo + [response["choices"][0]["message"]]
+        for tool_call_id, h in calls:
+            original = resolve(captured, h)
+            convo.append({
+                "role": "tool",
+                "tool_call_id": tool_call_id,
+                "name": RETRIEVE_TOOL_NAME,
+                "content": original if original is not None else f"[retrieve failed: unknown hash {h}]",
+            })
+        response = call_llm(convo)
+    return response, convo
diff --git a/nadirclaw/cli.py b/nadirclaw/cli.py
index 9181fa9..25ddced 100644
--- a/nadirclaw/cli.py
+++ b/nadirclaw/cli.py
@@ -39,8 +39,8 @@ def setup(reconfigure):
 @click.option("--token", default=None, help="Auth token")
 @click.option("--verbose", is_flag=True, help="Enable verbose logging")
 @click.option("--log-raw", is_flag=True, help="Log full raw requests and responses to JSONL")
-@click.option("--optimize", default=None, type=click.Choice(["off", "safe", "aggressive"]),
-              help="Context optimization mode (default: off)")
+@click.option("--optimize", default=None, type=click.Choice(["off", "safe", "aggressive", "progressive"]),
+              help="Context compression: off | safe | aggressive | progressive (default: off)")
 def serve(port, simple_model, complex_model, models, token, verbose, log_raw, optimize):
     """Start the NadirClaw router server."""
     import logging
diff --git a/nadirclaw/optimize.py b/nadirclaw/optimize.py
index 1467af0..a54899a 100644
--- a/nadirclaw/optimize.py
+++ b/nadirclaw/optimize.py
@@ -13,6 +13,8 @@
 from __future__ import annotations
 
 import json
+import logging
+import os
 import re
 from dataclasses import dataclass, field
 
@@ -29,6 +31,9 @@ class OptimizeResult:
     tokens_saved: int
     mode: str
     optimizations_applied: list[str] = field(default_factory=list)
+    # When progressive offload runs, maps offload-hash -> original content so the
+    # caller can inject the retrieve tool and serve the fetch-back loop (see ccr.py).
+    offload_captured: dict = field(default_factory=dict)
 
 
 # ---------------------------------------------------------------------------
@@ -248,8 +253,12 @@ def _normalize_whitespace(content: str) -> tuple[str, bool]:
         if in_code_block:
             out_lines.append(line)
             continue
-        # Collapse multi-spaces outside code blocks
-        out_lines.append(_MULTI_SPACES.sub(" ", line))
+        # Collapse interior multi-spaces but PRESERVE leading indentation —
+        # otherwise raw (unfenced) source code has its indentation flattened
+        # into invalid syntax. Leading whitespace is semantically significant
+        # (Python, YAML, diffs), so it must survive even in "safe" mode.
+        n_lead = len(line) - len(line.lstrip(" \t"))
+        out_lines.append(line[:n_lead] + _MULTI_SPACES.sub(" ", line[n_lead:]))
 
     result = "\n".join(out_lines)
     # Collapse 3+ consecutive blank lines → 2
@@ -444,11 +453,117 @@ def _semantic_dedup(
     return result, changed
 
 
+# ---------------------------------------------------------------------------
+# Transform — Homogeneous JSON-array packing (aggressive, columnar)
+# ---------------------------------------------------------------------------
+#
+# Large arrays of objects that share the same keys (DB query results, API list
+# responses, tool outputs) repeat every key on every row. Packing them into a
+# columnar table — a single header of keys plus one JSON value-array per row —
+# emits each key once instead of N times. This is *information-lossless*
+# (deterministically reversible via _unpack_table) but not byte-identical JSON,
+# so it runs in aggressive mode only, never in safe mode.
+
+_TABLE_OPEN = "⟦cols="   # ⟦cols=[...]⟧
+_TABLE_CLOSE = "⟧"       # ⟧
+_TABLE_END = "⟦end⟧"  # ⟦end⟧
+_MIN_TABLE_ROWS = 5
+
+
+def _pack_array(arr: list) -> "str | None":
+    """Render a uniform list-of-dicts as a columnar table, or ``None`` if unfit.
+
+    Fit means: at least ``_MIN_TABLE_ROWS`` dicts that all share one key set of 2+ keys.
+    """
+    too_short = len(arr) < _MIN_TABLE_ROWS
+    if too_short or any(not isinstance(row, dict) for row in arr):
+        return None
+    header_keys = list(arr[0].keys())
+    if len(header_keys) < 2:
+        return None
+    expected = set(header_keys)
+    if any(set(row.keys()) != expected for row in arr):
+        return None  # not strictly homogeneous — leave for json_minify
+
+    def compact(value) -> str:
+        return json.dumps(value, separators=(",", ":"), ensure_ascii=False)
+
+    body = [compact([row[key] for key in header_keys]) for row in arr]
+    return "\n".join([f"{_TABLE_OPEN}{compact(header_keys)}{_TABLE_CLOSE}", *body, _TABLE_END])
+
+
+def _unpack_table(packed: str) -> list:
+    """Rebuild the list-of-dicts that :func:`_pack_array` encoded as *packed*."""
+    header, *body = packed.split("\n")
+    keys = json.loads(header[len(_TABLE_OPEN):-len(_TABLE_CLOSE)])
+    payload = [json.loads(line) for line in body if line and line != _TABLE_END]
+    # zip pairs each row's values with the header keys positionally.
+    return [dict(zip(keys, values)) for values in payload]
+
+
+def _pack_homogeneous_arrays(content: str) -> tuple[str, bool]:
+    """Swap homogeneous JSON arrays embedded in *content* for columnar tables.
+
+    Text under 80 characters or without a ``[`` is returned untouched, fenced code
+    blocks are copied through verbatim, and every other stretch of text goes to
+    :func:`_pack_segment`. Returns ``(new_content, changed)``.
+    """
+    if not content or len(content) < 80 or "[" not in content:
+        return content, False
+
+    any_packed = False
+    pieces: list[str] = []
+    # The capture group keeps the fenced blocks in the split result.
+    segments = re.split(r"(```[^\n]*\n.*?```)", content, flags=re.DOTALL)
+    for chunk in segments:
+        if not chunk.startswith("```"):
+            chunk, packed_here = _pack_segment(chunk)
+            any_packed = any_packed or packed_here
+        pieces.append(chunk)
+    return "".join(pieces), any_packed
+
+
+def _pack_segment(text: str) -> tuple[str, bool]:
+    """Pack each qualifying JSON array found in *text*; return ``(new_text, changed)``."""
+    decoder = json.JSONDecoder()
+    result: list[str] = []
+    pos = 0
+    changed = False
+    while pos < len(text):
+        idx = text.find("[", pos)
+        if idx == -1:
+            result.append(text[pos:])
+            break
+        result.append(text[pos:idx])
+        try:
+            obj, end = decoder.raw_decode(text, idx)
+        except RecursionError:
+            # Pathologically nested input: copy the rest verbatim instead of crashing.
+            result.append(text[idx:])
+            break
+        except ValueError:  # not JSON here (covers JSONDecodeError): emit the "[" as-is
+            obj, end = None, idx + 1
+        replacement = text[idx:end]
+        packed = _pack_array(obj) if isinstance(obj, list) else None
+        if packed is not None:
+            minified = json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
+            if _estimate_tokens_str(packed) < _estimate_tokens_str(minified):
+                replacement, changed = packed, True
+        result.append(replacement)
+        pos = end
+    return "".join(result), changed
+
+
 _SAFE_TRANSFORMS = [
     ("system_prompt_dedup", lambda msgs, **_: _dedup_system_prompts(msgs)),
     ("tool_schema_dedup", lambda msgs, **_: _dedup_tool_schemas(msgs)),
 ]
 
+# Core aggressive content-level transforms (run before caller-supplied hooks).
+_AGGRESSIVE_CONTENT_TRANSFORMS = [
+    ("json_array_pack", _pack_homogeneous_arrays),
+]
+
 # Content-level transforms (operate on individual message content strings)
 _SAFE_CONTENT_TRANSFORMS = [
     ("json_minify", _minify_json_in_content),
@@ -456,10 +571,175 @@ def _semantic_dedup(
 ]
 
 
+# ---------------------------------------------------------------------------
+# Backend selection — native (default) or headroom (opt-in)
+# ---------------------------------------------------------------------------
+
+_headroom_warned = False
+
+
+def _resolve_backend(backend: str | None) -> str:
+    """Pick the backend: *backend* if truthy, else the env var; unknown -> ``native``."""
+    chosen = backend if backend else os.getenv("NADIRCLAW_OPTIMIZE_BACKEND", "native")
+    return "headroom" if chosen.lower() == "headroom" else "native"
+
+
+def _warn_headroom_once(msg: str) -> None:
+    """Log *msg* as a fallback warning the first time only; later calls are no-ops."""
+    global _headroom_warned
+    if _headroom_warned:
+        return
+    _headroom_warned = True
+    logging.getLogger(__name__).warning("context-optimize: %s — falling back to native backend.", msg)
+
+
+def _headroom_optimize(
+    messages: list[dict],
+    mode: str,
+    max_turns: int,
+    original_tokens: int,
+) -> "OptimizeResult | None":
+    """Run the whole optimization through the optional ``headroom-ai`` package.
+
+    Yields ``None`` — the caller's cue to use the native pipeline instead —
+    whenever ``headroom`` cannot be imported or its ``compress`` call raises.
+    Savings are measured with this module's own token estimator, so numbers
+    stay comparable whichever backend produced the messages.
+    """
+    try:
+        from headroom import compress, CompressConfig
+    except Exception:
+        _warn_headroom_once("headroom-ai not installed (pip install nadirclaw[headroom])")
+        return None
+
+    try:
+        # Kompress fetches a HuggingFace model the first time it runs, hence the
+        # explicit env opt-in instead of enabling it by default.
+        kompress_flag = os.getenv("NADIRCLAW_HEADROOM_KOMPRESS", "off").lower()
+        kompress_on = kompress_flag in ("on", "1", "true", "yes")
+        cfg = CompressConfig(
+            compress_user_messages=(mode == "aggressive"),
+            kompress_model=None if kompress_on else "disabled",
+        )
+        result = compress([{**m} for m in messages], config=cfg)
+        msgs = list(result.messages)
+        # headroom does no turn trimming; reuse ours so both backends match.
+        msgs, _ = _trim_chat_history(msgs, max_turns=max_turns)
+    except Exception as exc:  # pragma: no cover — defensive; headroom itself fails open
+        _warn_headroom_once(f"headroom compress failed: {exc}")
+        return None
+
+    optimized_tokens = _estimate_tokens_messages(msgs)
+    tags = getattr(result, "transforms_applied", None) or []
+    applied = [f"headroom:{t}" for t in tags]
+    return OptimizeResult(
+        messages=msgs,
+        original_tokens=original_tokens,
+        optimized_tokens=optimized_tokens,
+        tokens_saved=max(0, original_tokens - optimized_tokens),
+        mode=mode,
+        optimizations_applied=applied or ["headroom"],
+    )
+
+
+# ---------------------------------------------------------------------------
+# Reusable pipeline stages (shared by optimize_messages + compress_progressive)
+# ---------------------------------------------------------------------------
+
+def _apply_content_transforms(msgs: list[dict], transforms) -> tuple[list[dict], list[str]]:
+    """Apply ``[(name, fn)]`` to every string ``content`` of 10+ chars, replacing changed
+    entries of *msgs* in place; return ``(msgs, names_that_changed_something)``."""
+    fired: list[str] = []
+    for label, transform in transforms:
+        hits = 0
+        for pos, message in enumerate(msgs):
+            text = message.get("content")
+            if isinstance(text, str) and len(text) >= 10:
+                rewritten, did_change = transform(text)
+                if did_change:
+                    msgs[pos] = {**message, "content": rewritten}
+                    hits += 1
+        if hits:
+            fired.append(label)
+    return msgs, fired
+
+
+def _apply_safe_transforms(msgs: list[dict], extra_safe_content=None) -> tuple[list[dict], list[str]]:
+    """Run the message-level safe transforms, then the safe content transforms plus any extras."""
+    names: list[str] = []
+    for label, transform in _SAFE_TRANSFORMS:
+        msgs, touched = transform(msgs)
+        if touched:
+            names.append(label)
+    content_stage = [*_SAFE_CONTENT_TRANSFORMS, *(extra_safe_content or [])]
+    msgs, content_names = _apply_content_transforms(msgs, content_stage)
+    names.extend(content_names)
+    return msgs, names
+
+
+def _apply_aggressive_transforms(
+    msgs: list[dict], extra_aggressive_message=None, extra_aggressive_content=None
+) -> tuple[list[dict], list[str]]:
+    """Aggressive stage: core content packing, caller message hooks, caller content hooks, semantic dedup."""
+    msgs, names = _apply_content_transforms(msgs, list(_AGGRESSIVE_CONTENT_TRANSFORMS))
+    message_hooks = extra_aggressive_message or []
+    for label, hook in message_hooks:
+        msgs, touched = hook(msgs)
+        if touched:
+            names.append(label)
+    content_hooks = list(extra_aggressive_content or [])
+    msgs, hook_names = _apply_content_transforms(msgs, content_hooks)
+    names.extend(hook_names)
+    msgs, deduped = _semantic_dedup(msgs)
+    if deduped:
+        names.append("semantic_dedup")
+    return msgs, names
+
+
+def _headroom_stage(msgs: list[dict], *, kompress: bool) -> "tuple[list[dict], list[str]] | None":
+    """One progressive-ladder pass through Headroom's compressors (optional dep).
+
+    Every protection knob is switched off so the structural compressors
+    (SmartCrusher, LogCompressor, ...) get to act on all messages, while the
+    lossy ML prose compressor (Kompress) runs only when *kompress* is true.
+    ``None`` means ``headroom-ai`` could not be imported or raised — skip the stage.
+    """
+    try:
+        from headroom import compress, CompressConfig
+    except Exception:
+        _warn_headroom_once("headroom-ai not installed (pip install nadirclaw[headroom])")
+        return None
+    try:
+        kompress_model = None if kompress else "disabled"
+        cfg = CompressConfig(
+            compress_user_messages=True,
+            compress_system_messages=True,
+            protect_recent=0,
+            protect_analysis_context=False,
+            kompress_model=kompress_model,
+        )
+        result = compress([{**m} for m in msgs], config=cfg)
+    except Exception as exc:  # pragma: no cover — headroom itself fails open
+        _warn_headroom_once(f"headroom compress failed: {exc}")
+        return None
+
+    # Keep only the part of each tag before its first ":" and drop repeats,
+    # preserving first-seen order (dict keys are insertion-ordered).
+    raw_tags = getattr(result, "transforms_applied", None) or []
+    labels = ["headroom:" + tag.split(":", 1)[0] for tag in raw_tags]
+    applied = list(dict.fromkeys(labels))
+    return list(result.messages), applied or ["headroom"]
+
+
 def optimize_messages(
     messages: list[dict],
     mode: str = "off",
     max_turns: int = 40,
+    *,
+    backend: str | None = None,
+    extra_safe_content: "list | None" = None,
+    extra_aggressive_message: "list | None" = None,
+    extra_aggressive_content: "list | None" = None,
 ) -> OptimizeResult:
     """Optimize a list of message dicts for token reduction.
 
@@ -472,6 +752,14 @@ def optimize_messages(
         (safe + semantic deduplication via sentence embeddings).
     max_turns
         Maximum conversation turns to keep when trimming history.
+    backend
+        ``"native"`` (default — the stdlib transform pipeline) or
+        ``"headroom"`` (delegates to the optional ``headroom-ai`` package and
+        falls back to native if it is unavailable).  When ``None`` the
+        ``NADIRCLAW_OPTIMIZE_BACKEND`` env var is consulted (default native).
+    extra_safe_content, extra_aggressive_message, extra_aggressive_content
+        Optional ``[(name, fn)]`` hooks letting a superset package (Nadir Pro)
+        register additional transforms without forking this pipeline.
 
     Returns
     -------
@@ -489,36 +777,28 @@ def optimize_messages(
             mode="off",
         )
 
+    # --- Backend selection (headroom is opt-in; native is the default) ---
+    if _resolve_backend(backend) == "headroom":
+        hr = _headroom_optimize(messages, mode, max_turns, original_tokens)
+        if hr is not None:
+            return hr
+        # else: fall through to the native pipeline below
+
     applied: list[str] = []
 
     # Deep copy messages to avoid mutating input
     msgs = [{**m} for m in messages]
 
-    # --- Message-level transforms (safe) ---
-    for name, fn in _SAFE_TRANSFORMS:
-        msgs, did_change = fn(msgs)
-        if did_change:
-            applied.append(name)
-
-    # --- Content-level transforms (safe) ---
-    for name, fn in _SAFE_CONTENT_TRANSFORMS:
-        content_changed = False
-        for i, m in enumerate(msgs):
-            content = m.get("content")
-            if not isinstance(content, str) or len(content) < 10:
-                continue
-            new_content, changed = fn(content)
-            if changed:
-                msgs[i] = {**m, "content": new_content}
-                content_changed = True
-        if content_changed:
-            applied.append(name)
+    # --- Safe (lossless) transforms ---
+    msgs, a = _apply_safe_transforms(msgs, extra_safe_content)
+    applied += a
 
     # --- Aggressive-only transforms ---
     if mode == "aggressive":
-        msgs, did_semantic = _semantic_dedup(msgs)
-        if did_semantic:
-            applied.append("semantic_dedup")
+        msgs, a = _apply_aggressive_transforms(
+            msgs, extra_aggressive_message, extra_aggressive_content
+        )
+        applied += a
 
     # --- Chat history trimming ---
     msgs, did_trim = _trim_chat_history(msgs, max_turns=max_turns)
@@ -535,3 +815,135 @@ def optimize_messages(
         mode=mode,
         optimizations_applied=applied,
     )
+
+
+# ---------------------------------------------------------------------------
+# Progressive (staged) compression
+# ---------------------------------------------------------------------------
+
+# Escalation ladder, cheapest/safest first. Headroom is the middle tier (its
+# structural compressors fill gaps native cannot — ragged JSON, big logs — and
+# its lossy ML prose compressor is later). The final tier is native CCR offload:
+# move oversized content out of the prompt behind a retrieve handle, fully
+# reversible because the caller serves it back on demand (see ccr.py).
+_PROGRESSIVE_STAGES = (
+    "native_safe", "native_aggressive", "headroom_structural", "headroom_ml", "offload",
+)
+
+
+def compress_progressive(
+    messages: list[dict],
+    *,
+    target_tokens: "int | None" = None,
+    max_turns: int = 40,
+    allow_lossy: bool = False,
+    allow_offload: bool = False,
+    max_stage: str = "native_aggressive",
+    extra_safe_content: "list | None" = None,
+    extra_aggressive_message: "list | None" = None,
+    extra_aggressive_content: "list | None" = None,
+) -> OptimizeResult:
+    """Apply escalating compression stages, stopping once a budget is met.
+
+    Stages run in order and the loop stops as soon as the message set fits
+    ``target_tokens``:
+
+      1. ``native_safe``        — lossless structural transforms
+      2. ``native_aggressive``  — + columnar packing, semantic dedup, caller hooks
+      3. ``headroom_structural``— Headroom content compressors (optional dep)
+      4. ``headroom_ml``        — Headroom Kompress (lossy ML prose)
+      5. ``offload``            — native CCR: move oversized content out of the
+                                  prompt behind a ``nadir_retrieve`` handle
+
+    Escalation rules:
+
+    - The ladder never goes past ``max_stage``. An unrecognised ``max_stage`` is
+      logged and replaced by ``native_aggressive`` rather than lifting the cap.
+    - When ``target_tokens`` is ``None`` only the native stages can run —
+      Headroom, lossy ML and offload engage solely while an explicit budget is
+      unmet. This keeps the default behaviour dependency-free.
+    - The Headroom stages require the optional ``headroom-ai`` package and are
+      skipped when it is absent or errors (a one-time warning is logged).
+    - ``headroom_ml`` (lossy) only runs when ``allow_lossy=True``.
+    - ``offload`` only runs when ``allow_offload=True``. It is fully reversible —
+      the originals are returned in ``OptimizeResult.offload_captured`` so the
+      caller MUST inject the retrieve tool (:func:`nadirclaw.ccr.retrieve_tool_def`)
+      and serve the fetch-back loop (:func:`nadirclaw.ccr.resolve_loop`), or the
+      model cannot recover the offloaded content.
+    - Chat-history trimming always runs last as a final backstop.
+
+    Returns an :class:`OptimizeResult` whose ``mode`` is ``"progressive"`` and
+    whose ``optimizations_applied`` is prefixed with the ``stage:<name>`` markers
+    that actually ran.
+    """
+    original_tokens = _estimate_tokens_messages(messages)
+    msgs = [{**m} for m in messages]
+    applied: list[str] = []
+    stages_run: list[str] = []
+
+    if max_stage not in _PROGRESSIVE_STAGES:
+        # Fail closed: a typo must not silently unlock every stage of the ladder.
+        logging.getLogger(__name__).warning(
+            "context-optimize: unknown max_stage %r — capping at 'native_aggressive'.", max_stage
+        )
+        max_stage = "native_aggressive"
+    ladder = list(_PROGRESSIVE_STAGES[: _PROGRESSIVE_STAGES.index(max_stage) + 1])
+    if not allow_lossy and "headroom_ml" in ladder:
+        ladder.remove("headroom_ml")
+    if not allow_offload and "offload" in ladder:
+        ladder.remove("offload")
+
+    offload_captured: dict = {}
+
+    def _fits() -> bool:
+        return target_tokens is not None and _estimate_tokens_messages(msgs) <= target_tokens
+
+    for stage in ladder:
+        if _fits():
+            break
+        # Headroom and offload stages only engage when a budget is set and unmet.
+        if stage in ("headroom_structural", "headroom_ml", "offload") and target_tokens is None:
+            break
+
+        if stage == "native_safe":
+            msgs, a = _apply_safe_transforms(msgs, extra_safe_content)
+        elif stage == "native_aggressive":
+            msgs, a = _apply_aggressive_transforms(
+                msgs, extra_aggressive_message, extra_aggressive_content
+            )
+        elif stage in ("headroom_structural", "headroom_ml"):
+            hr = _headroom_stage(msgs, kompress=(stage == "headroom_ml"))
+            if hr is None:
+                continue
+            msgs, a = hr
+        elif stage == "offload":
+            # Native CCR offload: move oversized content behind a retrieve handle.
+            from nadirclaw import ccr
+
+            msgs, captured, hashes = ccr.offload_messages(msgs)
+            if not hashes:
+                continue
+            offload_captured.update(captured)
+            a = ["offload"]
+        else:  # pragma: no cover
+            continue
+
+        applied += a
+        stages_run.append(stage)
+
+    # Final backstop — history trimming is always attempted, as in
+    # optimize_messages; the helper reports whether it dropped anything.
+    msgs, did_trim = _trim_chat_history(msgs, max_turns=max_turns)
+    if did_trim:
+        applied.append("chat_history_trim")
+
+    optimized_tokens = _estimate_tokens_messages(msgs)
+    return OptimizeResult(
+        messages=msgs,
+        original_tokens=original_tokens,
+        optimized_tokens=optimized_tokens,
+        tokens_saved=max(0, original_tokens - optimized_tokens),
+        mode="progressive",
+        optimizations_applied=[f"stage:{s}" for s in stages_run] + applied,
+        offload_captured=offload_captured,
+    )
diff --git a/nadirclaw/server.py b/nadirclaw/server.py
index e16cb5a..e36e67b 100644
--- a/nadirclaw/server.py
+++ b/nadirclaw/server.py
@@ -1418,19 +1418,36 @@ async def chat_completions(
         # Context optimization — compact messages before dispatch
         # ------------------------------------------------------------------
         optimize_mode = (request.model_extra or {}).get("optimize") or settings.OPTIMIZE
+        optimize_backend = (request.model_extra or {}).get("optimize_backend") or settings.OPTIMIZE_BACKEND
         optimization_info = None
         if optimize_mode != "off":
-            from nadirclaw.optimize import optimize_messages
-
             raw_msgs = [
                 {"role": m.role, "content": m.text_content()}
                 for m in request.messages
             ]
-            opt_result = optimize_messages(
-                raw_msgs,
-                mode=optimize_mode,
-                max_turns=settings.OPTIMIZE_MAX_TURNS,
-            )
+            # `optimize=progressive` (or the legacy NADIRCLAW_OPTIMIZE_PROGRESSIVE
+            # flag) selects the staged ladder that escalates native → headroom →
+            # lossy ML only until the token budget is met. Headroom stages are
+            # skipped if headroom-ai is not installed.
+            if optimize_mode == "progressive" or settings.OPTIMIZE_PROGRESSIVE:
+                from nadirclaw.optimize import compress_progressive
+
+                opt_result = compress_progressive(
+                    raw_msgs,
+                    target_tokens=settings.OPTIMIZE_TARGET_TOKENS,
+                    max_turns=settings.OPTIMIZE_MAX_TURNS,
+                    allow_lossy=settings.OPTIMIZE_ALLOW_LOSSY,
+                    max_stage=settings.OPTIMIZE_MAX_STAGE,
+                )
+            else:
+                from nadirclaw.optimize import optimize_messages
+
+                opt_result = optimize_messages(
+                    raw_msgs,
+                    mode=optimize_mode,
+                    max_turns=settings.OPTIMIZE_MAX_TURNS,
+                    backend=optimize_backend,
+                )
             if opt_result.tokens_saved > 0:
                 optimized_msgs = [
                     ChatMessage(role=m["role"], content=m["content"])
diff --git a/nadirclaw/settings.py b/nadirclaw/settings.py
index 5def506..cccc89b 100644
--- a/nadirclaw/settings.py
+++ b/nadirclaw/settings.py
@@ -248,11 +248,16 @@ def PROVIDER_HEALTH_FAILURE_THRESHOLD(self) -> int:
 
     @property
     def OPTIMIZE(self) -> str:
-        """Context optimization mode: off, safe, aggressive. Default: off."""
+        """Context optimization mode: off, safe, aggressive, progressive. Default: off.
+
+        ``off`` disables compression entirely. ``safe``/``aggressive`` run the
+        single-pass pipeline; ``progressive`` runs the staged ladder that
+        escalates to Headroom only until the token budget is met.
+        """
         val = os.getenv("NADIRCLAW_OPTIMIZE", "off").lower()
-        if val not in ("off", "safe", "aggressive"):
+        if val not in ("off", "safe", "aggressive", "progressive"):
             _settings_logger.warning(
-                "Invalid NADIRCLAW_OPTIMIZE=%r — expected off|safe|aggressive. "
+                "Invalid NADIRCLAW_OPTIMIZE=%r — expected off|safe|aggressive|progressive. "
                 "Falling back to 'off'.",
                 val,
             )
@@ -267,6 +272,76 @@ def OPTIMIZE_MAX_TURNS(self) -> int:
         except ValueError:
             return 40
 
+    @property
+    def OPTIMIZE_BACKEND(self) -> str:
+        """Which optimizer backend to use, read from ``NADIRCLAW_OPTIMIZE_BACKEND``.
+
+        Accepts ``native`` (the default, built-in pipeline) or ``headroom`` (the
+        optional ``headroom-ai`` package), compared case-insensitively.
+        Any other value is reported with a warning and treated as ``native``.
+        """
+        choice = os.getenv("NADIRCLAW_OPTIMIZE_BACKEND", "native").lower()
+        if choice in ("native", "headroom"):
+            return choice
+        _settings_logger.warning(
+            "Invalid NADIRCLAW_OPTIMIZE_BACKEND=%r — expected native|headroom. "
+            "Falling back to 'native'.",
+            choice,
+        )
+        return "native"
+
+    @property
+    def OPTIMIZE_PROGRESSIVE(self) -> bool:
+        """Whether ``NADIRCLAW_OPTIMIZE_PROGRESSIVE`` is set to true/1/yes/on (default: off)."""
+        flag = os.getenv("NADIRCLAW_OPTIMIZE_PROGRESSIVE", "false")
+        return flag.lower() in ("true", "1", "yes", "on")
+
+    @property
+    def OPTIMIZE_TARGET_TOKENS(self) -> "int | None":
+        """Token budget for progressive compression (clamped to at least 1).
+        ``None`` when unset or not an integer; escalation then stays on the native
+        stages. A non-integer value is logged instead of being dropped silently."""
+        raw = os.getenv("NADIRCLAW_OPTIMIZE_TARGET_TOKENS", "").strip()
+        try:
+            return max(1, int(raw)) if raw else None
+        except ValueError:
+            _settings_logger.warning("Invalid NADIRCLAW_OPTIMIZE_TARGET_TOKENS=%r — ignoring it.", raw)
+            return None
+
+    @property
+    def OPTIMIZE_ALLOW_LOSSY(self) -> bool:
+        """Whether progressive compression may reach the lossy Headroom Kompress stage (default: off)."""
+        flag = os.getenv("NADIRCLAW_OPTIMIZE_ALLOW_LOSSY", "false")
+        return flag.lower() in ("true", "1", "yes", "on")
+
+    @property
+    def OPTIMIZE_ALLOW_OFFLOAD(self) -> bool:
+        """Whether progressive compression may use the native CCR offload stage,
+        which moves oversized content behind a ``nadir_retrieve`` handle.
+        Read from ``NADIRCLAW_OPTIMIZE_ALLOW_OFFLOAD``; default: off.
+
+        NOTE: offload needs a caller that injects the retrieve tool
+        (``nadirclaw.ccr.retrieve_tool_def``) and runs the fetch-back loop
+        (``nadirclaw.ccr.resolve_loop``). ``nadirclaw serve`` does NOT do that
+        yet, so only direct ``compress_progressive(allow_offload=True)`` callers
+        consume this; without a retrieve path the model sees an unresolvable marker."""
+        flag = os.getenv("NADIRCLAW_OPTIMIZE_ALLOW_OFFLOAD", "false")
+        return flag.lower() in ("true", "1", "yes", "on")
+
+    @property
+    def OPTIMIZE_MAX_STAGE(self) -> str:
+        """Highest progressive stage allowed (``NADIRCLAW_OPTIMIZE_MAX_STAGE``). Default: headroom_structural."""
+        stages = ("native_safe", "native_aggressive", "headroom_structural", "headroom_ml", "offload")
+        requested = os.getenv("NADIRCLAW_OPTIMIZE_MAX_STAGE", "headroom_structural").lower()
+        if requested in stages:
+            return requested
+        _settings_logger.warning(
+            "Invalid NADIRCLAW_OPTIMIZE_MAX_STAGE=%r — expected one of %s. "
+            "Falling back to 'headroom_structural'.",
+            requested, "|".join(stages),
+        )
+        return "headroom_structural"
+
     @property
     def CORS_ORIGINS(self) -> list[str]:
         """Allowed CORS origins (comma-separated). Empty = local-only regex default."""
diff --git a/nadirclaw/trained_verifier.py b/nadirclaw/trained_verifier.py
index 4c83a12..644c205 100644
--- a/nadirclaw/trained_verifier.py
+++ b/nadirclaw/trained_verifier.py
@@ -32,9 +32,12 @@
     >>> result = v.score(prompt, cheap_answer)
     >>> result.score, result.accepted        # float in [0, 1], bool
 
-The ``reference_answer`` and ``expect_json`` arguments are accepted for
-parity with ``HeuristicVerifier`` but are currently ignored — the
-trained model scores ``(prompt, cheap_answer)`` only.
+The ``reference_answer`` argument, when provided, is folded into the
+structured ``text_pair`` the cross-encoder was trained on (see
+``score()``). When ``None``, an empty ``EXPENSIVE:`` block is
+substituted, matching the production backend's behaviour. The
+``expect_json`` argument is accepted for parity with
+``HeuristicVerifier`` but is currently ignored by the trained model.
 
 Dependencies
 ------------
@@ -219,10 +222,16 @@ def score(
     ) -> TrainedScore:
         """Score how acceptable ``cheap_answer`` is for ``prompt``.
 
-        ``reference_answer`` and ``expect_json`` are accepted for
-        interface parity with ``HeuristicVerifier`` and are currently
-        ignored by the trained model. The cross-encoder was trained
-        on ``(prompt, cheap_answer)`` pairs only.
+        The cross-encoder was trained on inputs of the form
+        ``(prompt, "CHEAP:\\n{cheap}\\n\\nEXPENSIVE:\\n{reference}")``.
+        ``reference_answer`` is folded into the structured ``text_pair``
+        when provided; when ``None`` an empty ``EXPENSIVE:`` block is
+        substituted, matching the production backend at
+        ``getnadir.dev/backend/app/services/verifier_model.py``.
+
+        ``expect_json`` is accepted for interface parity with
+        ``HeuristicVerifier`` and is currently ignored by the trained
+        model.
         """
         self._ensure_loaded()
 
@@ -240,9 +249,17 @@ def score(
 
         import torch
 
+        # Match the training format used by the production backend
+        # (getnadir.dev/backend/app/services/verifier_model.py) and
+        # documented on the HuggingFace model card. Without the
+        # ``CHEAP:``/``EXPENSIVE:`` wrapper the scores drift against
+        # the calibrated tau=0.80 acceptance threshold.
+        text_pair = (
+            f"CHEAP:\n{cheap}\n\nEXPENSIVE:\n{(reference_answer or '').strip()}"
+        )
         enc = self._tokenizer(
             prompt or "",
-            cheap,
+            text_pair,
             truncation=True,
             max_length=_MAX_SEQ_LEN,
             padding=False,
diff --git a/pyproject.toml b/pyproject.toml
index 847a4a0..4392139 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,6 +70,13 @@ cascade-rules = [
     # (`load_profile`). `load_inline` works without it, so this is opt-in.
     "pyyaml>=6.0",
 ]
+headroom = [
+    # Optional context-compression backend (Apache-2.0, github.com/chopratejas/headroom).
+    # Activated with NADIRCLAW_OPTIMIZE_BACKEND=headroom. When absent, the optimizer
+    # transparently falls back to the built-in native pipeline, so this stays opt-in.
+    # Prebuilt wheels bundle the compiled Rust `_core` extension (SmartCrusher etc.).
+    "headroom-ai>=0.23.0",
+]
 telemetry = [
     "opentelemetry-api>=1.20.0",
     "opentelemetry-sdk>=1.20.0",
diff --git a/tests/test_ccr.py b/tests/test_ccr.py
new file mode 100644
index 0000000..cad4cfb
--- /dev/null
+++ b/tests/test_ccr.py
@@ -0,0 +1,138 @@
+"""Tests for the native CCR (Compress-Cache-Retrieve) fetch-back loop.
+
+Offload moves oversized non-user content out of the prompt behind a retrieve
+handle, keeping the exact original in a map. The loop resolves the model's
+retrieve calls so nothing is ever lost. Fully deterministic; Headroom is not required.
+"""
+import json
+
+import pytest
+
+import nadirclaw.ccr as ccr
+import nadirclaw.optimize as o
+
+
+def _big_tool_msgs():
+    rows = [{"id": 1000 + i, "user": f"user{i}", "status": "active" if i % 4 else "suspended",
+             "plan": "pro" if i % 5 == 0 else "free"} for i in range(60)]
+    return [
+        {"role": "system", "content": "You are a support assistant."},
+        {"role": "user", "content": "How many users are suspended?"},
+        {"role": "tool", "content": "get_users() ->\n" + json.dumps(rows, indent=2)},
+    ]
+
+
+# ---------------------------------------------------------------------------
+# offload + resolve
+# ---------------------------------------------------------------------------
+
+def test_offload_shrinks_and_captures():
+    msgs = _big_tool_msgs()
+    before = o._estimate_tokens_messages(msgs)
+    out, captured, hashes = ccr.offload_messages(msgs)
+    after = o._estimate_tokens_messages(out)
+    assert after < before * 0.5            # big reduction
+    assert len(hashes) == 1                # the one big tool message
+    assert hashes[0] in captured
+
+
+def test_offload_is_byte_exact_recoverable():
+    msgs = _big_tool_msgs()
+    original = msgs[2]["content"]
+    out, captured, hashes = ccr.offload_messages(msgs)
+    assert ccr.resolve(captured, hashes[0]) == original   # exact bytes back
+
+
+def test_user_message_never_offloaded():
+    msgs = [{"role": "user", "content": "x" * 5000}]   # huge, but it's the user's turn
+    out, captured, hashes = ccr.offload_messages(msgs)
+    assert hashes == [] and out == msgs
+
+
+def test_small_messages_not_offloaded():
+    msgs = [{"role": "tool", "content": "short result"}]
+    out, captured, hashes = ccr.offload_messages(msgs, min_tokens=400)
+    assert hashes == []
+
+
+def test_marker_carries_the_hash():
+    out, captured, hashes = ccr.offload_messages(_big_tool_msgs())
+    marker = out[2]["content"]
+    assert f'hash="{hashes[0]}"' in marker
+    assert ccr.RETRIEVE_TOOL_NAME in marker
+
+
+# ---------------------------------------------------------------------------
+# retrieve tool + response parsing
+# ---------------------------------------------------------------------------
+
+def test_retrieve_tool_def_shape():
+    tool = ccr.retrieve_tool_def()
+    assert tool["function"]["name"] == ccr.RETRIEVE_TOOL_NAME
+    assert "hash" in tool["function"]["parameters"]["properties"]
+
+
+def test_extract_retrieve_calls():
+    resp = {"choices": [{"message": {"tool_calls": [
+        {"id": "c1", "type": "function",
+         "function": {"name": "nadir_retrieve", "arguments": json.dumps({"hash": "deadbeef"})}},
+        {"id": "c2", "type": "function",
+         "function": {"name": "other_tool", "arguments": "{}"}},
+    ]}}]}
+    assert ccr.extract_retrieve_calls(resp) == [("c1", "deadbeef")]
+
+
+# ---------------------------------------------------------------------------
+# full fetch-back loop (mock LLM, no provider)
+# ---------------------------------------------------------------------------
+
+def test_resolve_loop_recovers_data_and_answers():
+    msgs = _big_tool_msgs()
+    rows_suspended = sum(1 for i in range(60) if not (i % 4))  # status logic in _big_tool_msgs
+    out, captured, hashes = ccr.offload_messages(msgs)
+
+    def mock_llm(convo):
+        tool_msgs = [m for m in convo if m.get("role") == "tool" and m.get("name") == "nadir_retrieve"]
+        if not tool_msgs:  # round 1: ask to retrieve
+            return {"choices": [{"message": {"role": "assistant", "content": None, "tool_calls": [
+                {"id": "c1", "type": "function",
+                 "function": {"name": "nadir_retrieve", "arguments": json.dumps({"hash": hashes[0]})}}]}}]}
+        data = tool_msgs[-1]["content"]      # round 2: answer from the REAL data
+        return {"choices": [{"message": {"role": "assistant",
+                                         "content": f'{data.count(chr(34) + "suspended" + chr(34))} suspended'}}]}
+
+    final, convo = ccr.resolve_loop(out, mock_llm(out), captured, mock_llm)
+    answer = final["choices"][0]["message"]["content"]
+    assert str(rows_suspended) in answer            # model answered correctly from recovered data
+    # the resolved tool message in the conversation is the exact original
+    assert any(m.get("role") == "tool" and m.get("content") == msgs[2]["content"] for m in convo)
+
+
+def test_resolve_loop_handles_unknown_hash():
+    resp = {"choices": [{"message": {"tool_calls": [
+        {"id": "c1", "type": "function",
+         "function": {"name": "nadir_retrieve", "arguments": json.dumps({"hash": "nope"})}}]}}]}
+    final, convo = ccr.resolve_loop([], resp, {}, lambda convo: {"choices": [{"message": {"content": "done"}}]})
+    assert any("retrieve failed" in (m.get("content") or "") for m in convo)
+
+
+# ---------------------------------------------------------------------------
+# progressive offload stage
+# ---------------------------------------------------------------------------
+
+def test_progressive_offload_gated_off_by_default():
+    r = o.compress_progressive(_big_tool_msgs(), target_tokens=200, max_stage="offload")
+    assert r.offload_captured == {}
+    assert "stage:offload" not in r.optimizations_applied
+
+
+def test_progressive_offload_engages_and_is_recoverable():
+    import re
+    msgs = _big_tool_msgs()
+    r = o.compress_progressive(msgs, target_tokens=200, max_stage="offload", allow_offload=True)
+    assert "stage:offload" in r.optimizations_applied
+    assert r.offload_captured
+    # Offload captures the (losslessly) compressed content — not byte-identical to
+    # the pretty original, but still complete: every user must be recoverable.
+    recovered = "".join(r.offload_captured.values())
+    assert len(set(re.findall(r"user\d+", recovered))) == 60
diff --git a/tests/test_code_safety.py b/tests/test_code_safety.py
new file mode 100644
index 0000000..d3886e7
--- /dev/null
+++ b/tests/test_code_safety.py
@@ -0,0 +1,70 @@
+"""Regression tests: compression must not corrupt source code.
+
+Raw (unfenced) code arrives in coding-agent traffic as file-read tool outputs.
+Whitespace normalization must preserve leading indentation, or it flattens
+Python/YAML/diffs into invalid syntax. Fenced code must stay byte-identical.
+"""
+import ast
+import textwrap
+
+import pytest
+
+from nadirclaw.optimize import optimize_messages
+
+PY_SRC = textwrap.dedent('''\
+    import json
+
+
+    def process(record, config):
+        result = {}
+        for key, spec in config.items():
+            value = record.get(key)
+            if value is None:
+                if spec.get("required"):
+                    raise ValueError(key)
+                continue
+            result[key] = value
+        return result
+
+
+    class Validator:
+        def __init__(self, schema):
+            self.schema = schema
+
+        def check(self, data):
+            for field in self.schema:
+                if field not in data:
+                    return False
+            return True
+''')
+
+
+@pytest.mark.parametrize("mode", ["safe", "aggressive"])
+def test_raw_code_stays_valid_python(mode):
+    """Unfenced source code in a tool message must remain parseable."""
+    msgs = [{"role": "tool", "content": PY_SRC}]
+    out = optimize_messages(msgs, mode=mode).messages[0]["content"]
+    ast.parse(out)  # raises SyntaxError if indentation was flattened
+
+
+@pytest.mark.parametrize("mode", ["safe", "aggressive"])
+def test_leading_indentation_preserved(mode):
+    msgs = [{"role": "tool", "content": PY_SRC}]
+    out = optimize_messages(msgs, mode=mode).messages[0]["content"]
+    # The deepest line is indented 16 spaces; it must keep its indentation.
+    line = next(ln for ln in out.split("\n") if "raise ValueError" in ln)
+    assert line.startswith("                raise ValueError")
+
+
+def test_fenced_code_is_byte_identical():
+    snippet = "def f(x):\n    if x:\n        return  x  +  1\n    return 0"
+    content = "Here:\n```python\n" + snippet + "\n```"
+    out = optimize_messages([{"role": "assistant", "content": content}], mode="safe").messages[0]["content"]
+    assert snippet in out  # fenced block untouched, including its interior spacing
+
+
+def test_interior_spaces_still_collapse_in_prose():
+    # The fix only protects leading indentation; prose double-spaces still collapse.
+    content = "this   sentence   has   wide   gaps and is long enough to process"
+    out = optimize_messages([{"role": "user", "content": content}], mode="safe").messages[0]["content"]
+    assert "   " not in out
diff --git a/tests/test_json_array_pack.py b/tests/test_json_array_pack.py
new file mode 100644
index 0000000..b00ccdb
--- /dev/null
+++ b/tests/test_json_array_pack.py
@@ -0,0 +1,110 @@
+"""Tests for columnar JSON-array packing (aggressive-mode transform).
+
+Packing rewrites homogeneous arrays-of-objects into a header + one value-array
+per row. It must be information-lossless (deterministically reversible), must
+never run in safe mode, and must skip arrays it cannot pack unambiguously.
+"""
+import json
+
+import pytest
+
+from nadirclaw.optimize import (
+    _pack_array,
+    _pack_homogeneous_arrays,
+    _unpack_table,
+    optimize_messages,
+)
+
+
+def _roundtrip(arr):
+    packed = _pack_array(arr)
+    assert packed is not None
+    return _unpack_table(packed)
+
+
+# ---------------------------------------------------------------------------
+# Losslessness across value types
+# ---------------------------------------------------------------------------
+
+def test_roundtrip_scalars():
+    arr = [{"id": i, "name": f"u{i}", "active": bool(i % 2), "score": i / 3} for i in range(8)]
+    assert _roundtrip(arr) == arr
+
+
+def test_roundtrip_nested_and_null_and_tricky_strings():
+    arr = [
+        {"id": 1, "meta": {"a": [1, 2], "b": None}, "note": 'has "quotes", commas, [brackets]'},
+        {"id": 2, "meta": {"a": [], "b": 5}, "note": "tab\tand\nnewline"},
+        {"id": 3, "meta": {"a": [9], "b": None}, "note": "unicode ✓ é 中"},
+        {"id": 4, "meta": {"a": [1], "b": 0}, "note": ""},
+        {"id": 5, "meta": {"a": [2, 3], "b": 1}, "note": "⟦cols= looks like a marker"},
+    ]
+    assert _roundtrip(arr) == arr
+
+
+def test_roundtrip_preserves_row_and_key_order():
+    arr = [{"z": i, "a": i + 1, "m": i + 2} for i in range(6)]
+    out = _roundtrip(arr)
+    assert out == arr
+    assert list(out[0].keys()) == ["z", "a", "m"]
+
+
+# ---------------------------------------------------------------------------
+# Skip conditions (fall back to json_minify)
+# ---------------------------------------------------------------------------
+
+def test_skip_too_few_rows():
+    assert _pack_array([{"a": 1, "b": 2}] * 4) is None  # < 5 rows
+
+
+def test_skip_non_homogeneous_keys():
+    assert _pack_array([{"a": 1, "b": 2}, {"a": 1}, {"c": 3}] * 3) is None
+
+
+def test_skip_single_column():
+    assert _pack_array([{"a": i} for i in range(10)]) is None
+
+
+def test_skip_non_dict_elements():
+    assert _pack_array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) is None
+
+
+# ---------------------------------------------------------------------------
+# Pipeline integration
+# ---------------------------------------------------------------------------
+
+def _msgs():
+    rows = [{"id": 1000 + i, "user": f"user{i}", "status": "active" if i % 3 else "inactive",
+             "plan": "pro" if i % 5 == 0 else "free"} for i in range(40)]
+    return [{"role": "user", "content": "list users"},
+            {"role": "tool", "content": "result:\n" + json.dumps(rows, indent=2)}]
+
+
+def test_aggressive_packs_and_saves():
+    r = optimize_messages(_msgs(), mode="aggressive")
+    assert "json_array_pack" in r.optimizations_applied
+    assert r.tokens_saved > 0
+
+
+def test_safe_mode_never_packs():
+    r = optimize_messages(_msgs(), mode="safe")
+    assert "json_array_pack" not in r.optimizations_applied
+    assert "⟦cols=" not in r.messages[1]["content"]
+
+
+def test_only_packs_when_smaller():
+    # A short homogeneous array whose table form isn't worth it stays as-is.
+    arr = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6},
+           {"a": 7, "b": 8}, {"a": 9, "b": 10}]
+    content = "data " + json.dumps(arr)
+    out, changed = _pack_homogeneous_arrays(content)
+    # tiny arrays may not beat minified form; if unchanged, output is untouched
+    if not changed:
+        assert out == content
+
+
+def test_fenced_code_is_left_untouched():
+    arr = [{"id": i, "v": i * 2} for i in range(10)]
+    content = "```json\n" + json.dumps(arr, indent=2) + "\n```"
+    out, changed = _pack_homogeneous_arrays(content)
+    assert not changed and out == content
diff --git a/tests/test_optimize_backends.py b/tests/test_optimize_backends.py
new file mode 100644
index 0000000..25fb444
--- /dev/null
+++ b/tests/test_optimize_backends.py
@@ -0,0 +1,113 @@
+"""Tests for the optimizer backend selection (native vs headroom).
+
+These cover the contract that matters for safety:
+- ``off`` mode is a zero-cost no-op regardless of backend.
+- The ``headroom`` backend transparently falls back to ``native`` when the
+  optional ``headroom-ai`` package is absent (the common case), producing
+  byte-identical output and never reporting headroom transforms.
+- Backend selection resolves from arg → env → ``native`` default.
+- The extension hooks used by Nadir Pro fire in the expected places.
+"""
+
+import importlib.util  # submodule must be imported explicitly; ``import importlib`` alone may not bind ``.util``
+
+import pytest
+
+from nadirclaw.optimize import (
+    OptimizeResult,
+    _resolve_backend,
+    optimize_messages,
+)
+
+HEADROOM_INSTALLED = importlib.util.find_spec("headroom") is not None
+
+
+def _sample():
+    return [
+        {"role": "system", "content": "You are a helpful assistant. " * 3},
+        {"role": "user", "content": 'Parse {"a":   1,    "b":   2}  with   extra   spaces.'},
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Backend resolution
+# ---------------------------------------------------------------------------
+
+def test_resolve_backend_default_native(monkeypatch):
+    monkeypatch.delenv("NADIRCLAW_OPTIMIZE_BACKEND", raising=False)
+    assert _resolve_backend(None) == "native"
+
+
+def test_resolve_backend_explicit_arg_wins(monkeypatch):
+    monkeypatch.setenv("NADIRCLAW_OPTIMIZE_BACKEND", "native")
+    assert _resolve_backend("headroom") == "headroom"
+
+
+def test_resolve_backend_env(monkeypatch):
+    monkeypatch.setenv("NADIRCLAW_OPTIMIZE_BACKEND", "headroom")
+    assert _resolve_backend(None) == "headroom"
+
+
+def test_resolve_backend_invalid_falls_back(monkeypatch):
+    monkeypatch.delenv("NADIRCLAW_OPTIMIZE_BACKEND", raising=False)
+    assert _resolve_backend("nonsense") == "native"
+
+
+# ---------------------------------------------------------------------------
+# off-mode short-circuit
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize("backend", ["native", "headroom"])
+def test_off_is_noop_for_any_backend(backend):
+    msgs = _sample()
+    result = optimize_messages(msgs, mode="off", backend=backend)
+    assert isinstance(result, OptimizeResult)
+    assert result.tokens_saved == 0
+    assert result.mode == "off"
+    # off returns the original list object untouched (zero overhead)
+    assert result.messages is msgs
+
+
+# ---------------------------------------------------------------------------
+# headroom backend fallback (when headroom-ai is not installed)
+# ---------------------------------------------------------------------------
+
+@pytest.mark.skipif(HEADROOM_INSTALLED, reason="headroom-ai installed; fallback path not exercised")
+@pytest.mark.parametrize("mode", ["safe", "aggressive"])
+def test_headroom_falls_back_to_native_when_absent(mode):
+    msgs = _sample()
+    native = optimize_messages([{**m} for m in msgs], mode=mode, backend="native")
+    headroom = optimize_messages([{**m} for m in msgs], mode=mode, backend="headroom")
+
+    # Output must be identical to native...
+    assert headroom.messages == native.messages
+    # ...and must never claim a headroom transform ran.
+    assert not any(t.startswith("headroom") for t in headroom.optimizations_applied)
+
+
+# ---------------------------------------------------------------------------
+# extension hooks (the mechanism Nadir Pro builds on)
+# ---------------------------------------------------------------------------
+
+def test_extra_safe_content_hook_runs():
+    def shout(content):
+        new = content.replace("hello", "HELLO")
+        return new, new != content
+
+    msgs = [{"role": "user", "content": "hello there, this is a reasonably long message"}]
+    result = optimize_messages(
+        msgs, mode="safe", extra_safe_content=[("shout", shout)]
+    )
+    assert "shout" in result.optimizations_applied
+    assert "HELLO" in result.messages[0]["content"]
+
+
+def test_extra_aggressive_hooks_skipped_in_safe_mode():
+    def boom(_content):
+        raise AssertionError("aggressive hook must not run in safe mode")
+
+    msgs = [{"role": "user", "content": "a fairly long user message to exceed the length floor"}]
+    # Should not raise — aggressive hooks only fire in aggressive mode.
+    optimize_messages(
+        msgs, mode="safe", extra_aggressive_content=[("boom", boom)]
+    )
diff --git a/tests/test_progressive.py b/tests/test_progressive.py
new file mode 100644
index 0000000..b172042
--- /dev/null
+++ b/tests/test_progressive.py
@@ -0,0 +1,79 @@
+"""Tests for progressive (staged) compression.
+
+Headroom stages require the optional ``headroom-ai`` package (Python <= 3.13).
+These tests cover the escalation logic, early-stop, stage capping, lossy gating,
+and graceful skip when Headroom is absent — all observable on the native stages.
+Actual engagement of the Headroom stages is exercised manually in a 3.13 venv.
+"""
+import json
+
+import pytest
+
+from nadirclaw.optimize import compress_progressive
+
+
+def _stages(result):
+    return [x.split(":", 1)[1] for x in result.optimizations_applied if x.startswith("stage:")]
+
+
+def _big_msgs():
+    rows = [{"id": 1000 + i, "user": f"user{i}", "status": "active" if i % 3 else "off",
+             "plan": "pro" if i % 5 == 0 else "free"} for i in range(60)]
+    return [
+        {"role": "system", "content": "You are a helpful assistant. " * 6},
+        {"role": "user", "content": "summarize the users"},
+        {"role": "tool", "content": "get_users():\n" + json.dumps(rows, indent=2)},
+    ]
+
+
+def test_mode_is_progressive():
+    r = compress_progressive(_big_msgs())
+    assert r.mode == "progressive"
+
+
+def test_no_target_stops_at_max_stage_native():
+    # Without a budget, escalation stops after native_aggressive — never reaches Headroom.
+    r = compress_progressive(_big_msgs())
+    assert _stages(r) == ["native_safe", "native_aggressive"]
+    assert not any(s.startswith("headroom") for s in _stages(r))
+    assert r.tokens_saved > 0
+
+
+def test_generous_target_early_stops_after_safe():
+    msgs = _big_msgs()
+    from nadirclaw.optimize import _estimate_tokens_messages
+    orig = _estimate_tokens_messages(msgs)
+    r = compress_progressive(msgs, target_tokens=int(orig * 0.9), max_stage="headroom_ml", allow_lossy=True)
+    # Safe alone gets under 90% here, so it must stop immediately.
+    assert _stages(r) == ["native_safe"]
+
+
+def test_max_stage_caps_ladder():
+    # Cap at native_safe → aggressive never runs even though target is unmet.
+    r = compress_progressive(_big_msgs(), target_tokens=1, max_stage="native_safe")
+    assert _stages(r) == ["native_safe"]
+
+
+def test_headroom_skipped_gracefully_when_absent():
+    # Unmeetable target + headroom requested: on a host without headroom-ai the
+    # headroom stages are skipped (not recorded), output stays valid native.
+    import importlib.util
+    r = compress_progressive(_big_msgs(), target_tokens=1, max_stage="headroom_ml", allow_lossy=True)
+    if importlib.util.find_spec("headroom") is None:
+        assert _stages(r) == ["native_safe", "native_aggressive"]
+    # Either way the result is well-formed and never larger than the input.
+    assert r.optimized_tokens <= r.original_tokens
+    assert all("content" in m for m in r.messages)
+
+
+def test_lossy_gated_off_by_default():
+    # allow_lossy=False must drop headroom_ml from the ladder entirely.
+    r = compress_progressive(_big_msgs(), target_tokens=1, max_stage="headroom_ml", allow_lossy=False)
+    assert "headroom_ml" not in _stages(r)
+
+
+def test_already_small_is_noop():
+    small = [{"role": "user", "content": "hi"}]
+    r = compress_progressive(small, target_tokens=10_000)
+    assert _stages(r) == []  # already under budget, nothing runs
+    assert r.tokens_saved == 0
diff --git a/tests/test_trained_verifier.py b/tests/test_trained_verifier.py
index 606d0db..3c3c7da 100644
--- a/tests/test_trained_verifier.py
+++ b/tests/test_trained_verifier.py
@@ -148,6 +148,85 @@ def test_trained_verifier_interface_matches_heuristic():
         assert {"score", "accepted", "threshold", "reasons", "verifier"} <= d.keys()
 
 
+def test_trained_verifier_wraps_input_in_production_format():
+    """The tokenizer must receive ``text_pair`` wrapped in the
+    ``CHEAP:\\n...\\n\\nEXPENSIVE:\\n...`` format the cross-encoder was
+    trained on. Without this wrapper, scores drift against the
+    calibrated tau=0.80 threshold.
+
+    Production reference:
+      ``getnadir.dev/backend/app/services/verifier_model.py:195``
+    """
+    from nadirclaw.trained_verifier import TrainedVerifier
+
+    captured: dict = {}
+
+    class _FakeEncoding(dict):
+        def __init__(self):
+            super().__init__()
+            # Minimal tensor-like values so the .to(device) loop works.
+            class _T:
+                def to(self, _device):
+                    return self
+
+            self["input_ids"] = _T()
+            self["attention_mask"] = _T()
+
+    class _FakeTokenizer:
+        def __call__(self, prompt, text_pair, **kwargs):
+            captured["prompt"] = prompt
+            captured["text_pair"] = text_pair
+            captured["kwargs"] = kwargs
+            return _FakeEncoding()
+
+    class _FakeLogits:
+        # Two-class head; softmax([0, 0]) => probs[..., 1] == 0.5
+        shape = (1, 2)
+
+        def __init__(self):
+            import torch
+            self._t = torch.tensor([[0.0, 0.0]])
+
+        def __getattr__(self, name):
+            return getattr(self._t, name)
+
+    class _FakeModelOut:
+        def __init__(self):
+            import torch
+            self.logits = torch.tensor([[0.0, 0.0]])
+
+    class _FakeModel:
+        def __call__(self, **kwargs):
+            return _FakeModelOut()
+
+        def eval(self):
+            return self
+
+        def to(self, _device):
+            return self
+
+    v = TrainedVerifier(threshold=0.8, device="cpu")
+    v._tokenizer = _FakeTokenizer()
+    v._model = _FakeModel()
+    v._resolved_device = "cpu"
+
+    # Case 1: reference_answer provided.
+    out = v.score("What is 2+2?", "4", reference_answer="four")
+    assert captured["prompt"] == "What is 2+2?"
+    assert captured["text_pair"] == "CHEAP:\n4\n\nEXPENSIVE:\nfour"
+    assert 0.0 <= out.score <= 1.0
+
+    # Case 2: reference_answer=None -> empty EXPENSIVE: block.
+    captured.clear()
+    v.score("What is 2+2?", "4")
+    assert captured["text_pair"] == "CHEAP:\n4\n\nEXPENSIVE:\n"
+
+    # Case 3: reference_answer is whitespace-only -> stripped to empty.
+    captured.clear()
+    v.score("What is 2+2?", "4", reference_answer="   \n  ")
+    assert captured["text_pair"] == "CHEAP:\n4\n\nEXPENSIVE:\n"
+
+
 def test_trained_verifier_get_singleton_caches():
     """The module-level singleton accessor should cache same-threshold calls
     and return fresh instances for mismatched thresholds. Construction