diff --git a/README.md b/README.md index 13ce77d..dacc549 100644 --- a/README.md +++ b/README.md @@ -1035,7 +1035,7 @@ Options: --complex-model TEXT Model for complex prompts --models TEXT Comma-separated model list (legacy) --token TEXT Auth token - --optimize [off|safe|aggressive] Context optimization mode (default: off) + --optimize [off|safe|aggressive|progressive] Context compression: off | safe | aggressive | progressive (default: off) --verbose Enable debug logging --log-raw Log full raw requests and responses to JSONL ``` @@ -1420,8 +1420,14 @@ Auth is disabled by default (local-only). Set `NADIRCLAW_AUTH_TOKEN` to require | `NADIRCLAW_CONFIDENCE_THRESHOLD` | `0.06` | Classification threshold (lower = more complex) | | `NADIRCLAW_PORT` | `8856` | Server port | | `NADIRCLAW_LOG_DIR` | `~/.nadirclaw/logs` | Log directory | -| `NADIRCLAW_OPTIMIZE` | `off` | Context optimization mode: `off`, `safe` (lossless), `aggressive` (future) | +| `NADIRCLAW_OPTIMIZE` | `off` | Context compression: `off` (disabled), `safe` (lossless), `aggressive`, or `progressive` (staged ladder that escalates to Headroom). `off` is the master on/off switch | | `NADIRCLAW_OPTIMIZE_MAX_TURNS` | `40` | Max conversation turns to keep when trimming history | +| `NADIRCLAW_OPTIMIZE_BACKEND` | `native` | Optimizer backend: `native` (built-in) or `headroom` (needs `pip install nadirclaw[headroom]`; falls back to native if absent). See [savings analysis](docs/context-optimize-savings.md#backends-native-default-vs-headroom) | +| `NADIRCLAW_HEADROOM_KOMPRESS` | `off` | When backend is `headroom`, enable Kompress ML text compression (downloads a HuggingFace model on first use) | +| `NADIRCLAW_OPTIMIZE_PROGRESSIVE` | `off` | Legacy alias for `NADIRCLAW_OPTIMIZE=progressive` — forces the [progressive ladder](docs/context-optimize-savings.md#progressive-staged-compression) regardless of mode. 
Prefer setting `NADIRCLAW_OPTIMIZE=progressive` | +| `NADIRCLAW_OPTIMIZE_TARGET_TOKENS` | _(unset)_ | Token budget for progressive compression (e.g. the model's context window). Unset → native stages only | +| `NADIRCLAW_OPTIMIZE_MAX_STAGE` | `headroom_structural` | Cap on the progressive ladder: `native_safe`, `native_aggressive`, `headroom_structural`, or `headroom_ml` | +| `NADIRCLAW_OPTIMIZE_ALLOW_LOSSY` | `off` | Permit the lossy ML prose stage (`headroom_ml`) in progressive compression | | `NADIRCLAW_LOG_RAW` | `false` | Log full raw requests and responses (`true`/`false`) | | `NADIRCLAW_MODELS` | `openai-codex/gpt-5.3-codex,gemini-3-flash-preview` | Legacy model list (fallback if tier vars not set) | | `OTEL_EXPORTER_OTLP_ENDPOINT` | *(empty — disabled)* | OpenTelemetry collector endpoint (enables tracing) | diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md new file mode 100644 index 0000000..1caac76 --- /dev/null +++ b/THIRD_PARTY_NOTICES.md @@ -0,0 +1,19 @@ +# Third-Party Notices + +NadirClaw is MIT-licensed. It can optionally use the following third-party +components, declared as opt-in extras. Their licenses and attributions are +reproduced here. + +## headroom-ai + +- **Used by:** the optional `headroom` optimizer backend + (`NADIRCLAW_OPTIMIZE_BACKEND=headroom`), installed via `pip install nadirclaw[headroom]`. +- **Project:** Headroom — https://github.com/chopratejas/headroom +- **License:** Apache License 2.0 +- **NOTICE:** Headroom, Copyright 2025 Headroom Contributors. + +NadirClaw integrates Headroom only through its public Python API +(`headroom.compress`); no Headroom source code is copied or vendored into this +project. A full copy of the Apache License 2.0 is available at +https://www.apache.org/licenses/LICENSE-2.0 and is distributed with the +`headroom-ai` package when installed. 
"""Real-data benchmark: optimizer backends on public coding + chat datasets.

- Chat: allenai/WildChat-1M (real multi-turn user<->assistant conversations)
- Coding/tools: glaiveai/glaive-function-calling-v2 (tool schemas + function calls + JSON)

Compares: native-safe (lossless, ships today), Pro-aggressive (native ceiling),
headroom (new opt-in backend). Single tiktoken estimator for all => fair.

Running this file fetches (and caches) the datasets, then prints one report
table per dataset.
"""
import collections
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request

# Resolve the NadirClaw repo root from this file, and the sibling Nadir package.
_HERE = os.path.dirname(os.path.abspath(__file__))
_NADIRCLAW = os.path.dirname(_HERE)
sys.path.insert(0, _NADIRCLAW)
_NADIR = os.path.join(os.path.dirname(_NADIRCLAW), "Nadir")
if os.path.isdir(_NADIR):
    sys.path.insert(0, _NADIR)

import nadirclaw.optimize as claw
try:
    import nadir.optimize as pro
except Exception:  # Nadir Pro not on path — fall back to native
    pro = claw

est = claw._estimate_tokens_messages

N = 200  # conversations per dataset
CACHE = os.environ.get("BENCH_CACHE_DIR", "/tmp")

_ROWS_API = "https://datasets-server.huggingface.co/rows"
_PAGE_SIZE = 100  # rows requested per call
_FETCH_ATTEMPTS = 3


def _fetch(dataset, config, split, dest, total=N):
    """Fetch rows from the HF datasets-server (no full dataset download).

    The first *total* rows are downloaded page by page and cached at *dest* as
    a JSON list. If *dest* already exists nothing is fetched.

    Raises:
        RuntimeError: if a page still fails after ``_FETCH_ATTEMPTS`` tries.
            Nothing is cached in that case, so a later run retries instead of
            silently benchmarking a truncated (or empty) dataset.
    """
    if os.path.exists(dest):
        return
    rows = []
    for off in range(0, total, _PAGE_SIZE):
        # urlencode escapes the "/" in dataset ids such as "allenai/WildChat-1M".
        query = urllib.parse.urlencode({
            "dataset": dataset,
            "config": config,
            "split": split,
            "offset": off,
            "length": min(_PAGE_SIZE, total - off),
        })
        url = f"{_ROWS_API}?{query}"
        last_error = None
        for attempt in range(1, _FETCH_ATTEMPTS + 1):
            try:
                with urllib.request.urlopen(url, timeout=40) as r:
                    page = [x["row"] for x in json.load(r).get("rows", [])]
            except Exception as exc:  # retry boundary: network, HTTP and JSON errors alike
                last_error = exc
                print(
                    f"[bench] {dataset} offset={off}: attempt "
                    f"{attempt}/{_FETCH_ATTEMPTS} failed: {exc}",
                    file=sys.stderr,
                )
                if attempt < _FETCH_ATTEMPTS:
                    time.sleep(2)
            else:
                rows += page
                break
        else:
            raise RuntimeError(
                f"could not fetch {dataset} rows at offset {off} "
                f"after {_FETCH_ATTEMPTS} attempts"
            ) from last_error
    # Write to a temp file and rename, so an interrupted run never leaves a
    # half-written cache that later runs would trust.
    tmp = f"{dest}.tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(rows, f)
    os.replace(tmp, dest)


_WILDCHAT = os.path.join(CACHE, "ds_wildchat.json")
_GLAIVE = os.path.join(CACHE, "ds_glaive.json")
_fetch("allenai/WildChat-1M", "default", "train", _WILDCHAT)
_fetch("glaiveai/glaive-function-calling-v2", "default", "train", _GLAIVE)


def load_wildchat():
    """Load cached WildChat rows as lists of ``{"role", "content"}`` messages.

    Turns with empty or non-string content are dropped; conversations left with
    fewer than two messages are skipped.
    """
    with open(_WILDCHAT, encoding="utf-8") as f:
        rows = json.load(f)[:N]
    convs = []
    for r in rows:
        msgs = [{"role": t.get("role", "user"), "content": t.get("content") or ""}
                for t in (r.get("conversation") or []) if isinstance(t, dict)]
        msgs = [m for m in msgs if isinstance(m["content"], str) and m["content"]]
        if len(msgs) >= 2:
            convs.append(msgs)
    return convs


def load_glaive():
    """Load cached glaive rows, splitting each flat transcript into messages.

    The ``chat`` field is one string with ``USER:`` / ``ASSISTANT:`` /
    ``FUNCTION RESPONSE:`` markers; each marker starts a new message. The
    ``system`` field (minus a leading ``SYSTEM:``) becomes a system message.
    Conversations with fewer than two messages are skipped.
    """
    with open(_GLAIVE, encoding="utf-8") as f:
        rows = json.load(f)[:N]
    convs = []
    marker = re.compile(r"(USER:|ASSISTANT:|FUNCTION RESPONSE:)", re.I)
    rolemap = {"USER": "user", "ASSISTANT": "assistant", "FUNCTION RESPONSE": "tool"}
    for r in rows:
        sysm = (r.get("system") or "").strip()
        if sysm.upper().startswith("SYSTEM:"):
            sysm = sysm[7:].strip()
        msgs = [{"role": "system", "content": sysm}] if sysm else []
        parts = marker.split(r.get("chat") or "")
        # parts: ['', 'USER:', ' ...', 'ASSISTANT:', ' ...', ...] — the capture
        # group keeps the markers, so labels sit at odd indexes and their text
        # at the following even index.
        for label, body in zip(parts[1::2], parts[2::2]):
            role = rolemap.get(label.rstrip(":").upper())
            content = body.strip()
            if role and content:
                msgs.append({"role": role, "content": content})
        if len(msgs) >= 2:
            convs.append(msgs)
    return convs


def bench(convs, runners):
    """Run every runner over every conversation and total the token counts.

    Returns ``(out, transforms)``: ``out[name]`` is ``[original, optimized]``
    token totals and ``transforms[name]`` counts how often each transform was
    reported (``headroom:<x>[:...]`` entries are counted under ``<x>``).
    """
    out = {name: [0, 0] for name in runners}  # name -> [orig, after]
    transforms = {name: collections.Counter() for name in runners}
    for msgs in convs:
        for name, fn in runners.items():
            # Shallow-copy each message so one runner cannot affect the next.
            r = fn([{**m} for m in msgs])
            out[name][0] += r.original_tokens
            out[name][1] += r.optimized_tokens
            for t in r.optimizations_applied:
                transforms[name][t.split(":")[1] if t.startswith("headroom:") else t] += 1
    return out, transforms


RUNNERS = {
    "native-safe": lambda m: claw.optimize_messages(m, mode="safe", backend="native"),
    "pro-aggressive": lambda m: pro.optimize_messages(m, mode="aggressive", backend="native"),
    "headroom": lambda m: claw.optimize_messages(m, mode="safe", backend="headroom"),
}

for label, loader in [("CHAT — WildChat-1M", load_wildchat), ("CODING/TOOLS — glaive-function-calling-v2", load_glaive)]:
    convs = loader()
    t0 = time.time()
    res, tf = bench(convs, RUNNERS)
    base = res["native-safe"][0]
    print(f"\n### {label} ({len(convs)} conversations, {base:,} raw tokens, {time.time()-t0:.0f}s)")
    print(f"{'backend':<18}{'after':>10}{'saved':>9}{'%':>7} top transforms")
    for name in RUNNERS:
        o, a = res[name]
        top = ", ".join(f"{k}:{v}" for k, v in tf[name].most_common(4))
        print(f"{name:<18}{a:>10,}{o-a:>9,}{100*(o-a)/max(1,o):>6.1f}% {top}")
NadirClaw keeps the first occurrence and replaces repeats with a short reference. - **Chat history trimming** — Long conversations accumulate tokens that are far from the current task. Trimming to recent turns (default: 40) keeps context relevant and cheap. - **Whitespace normalization** — Log dumps, stack traces, and verbose output contain runs of blank lines and spaces that carry no semantic value. +- **Columnar JSON-array packing** (`json_array_pack`, aggressive mode) — Large arrays of same-keyed objects (DB query results, API list responses, large tool outputs) repeat every key on every row. Packing them into a header (`⟦cols=[...]⟧`) plus one value-array per row emits each key once. Information-lossless and deterministically reversible, but not byte-identical JSON, so it runs in **aggressive** mode only. On a 100-row homogeneous array this reaches ~68% vs pretty-printed JSON (vs ~45% for `json_minify` alone). ## Projected Monthly Savings (Opus 4.6) @@ -56,6 +57,9 @@ All safe-mode transforms are deterministic and lossless: - JSON values roundtrip exactly (parse + compact re-serialize) - Code blocks inside fences (```) are never modified +- **Leading indentation is preserved**, so raw (unfenced) source code — e.g. file-read + tool outputs — stays syntactically valid. Whitespace normalization only collapses + *interior* multi-spaces and excess blank lines, never indentation. - URLs are preserved character-for-character - Unicode and emoji roundtrip correctly - Deeply nested structures are handled without data loss @@ -76,3 +80,100 @@ NADIRCLAW_OPTIMIZE=safe nadirclaw serve # Dry-run on a file nadirclaw optimize payload.json --mode safe --format json ``` + +## Backends: native (default) vs headroom + +The optimizer has a pluggable backend, selected independently of the `off|safe|aggressive` +mode. The mode still decides *how hard* to compress; the backend decides *who* runs it. 
+ +| Backend | Default | Engine | Extra capabilities | +|---|---|---|---| +| `native` | ✅ | Built-in stdlib pipeline (this document) | None — pure Python, no extra deps | +| `headroom` | opt-in | [Headroom](https://github.com/chopratejas/headroom) (Apache-2.0) | Statistical JSON-array crushing (SmartCrusher), AST-aware code compression, content-type routing | + +`headroom` delegates to the optional [`headroom-ai`](https://pypi.org/project/headroom-ai/) +package. It ships **installed by default with Nadir Pro** but stays **inactive** until you +select it. In open-source NadirClaw it is an opt-in extra: + +```bash +pip install "nadirclaw[headroom]" +``` + +Activate it: + +```bash +# Server-wide +NADIRCLAW_OPTIMIZE=safe NADIRCLAW_OPTIMIZE_BACKEND=headroom nadirclaw serve + +# Per-request override (in the request body) +{"model": "auto", "optimize": "safe", "optimize_backend": "headroom", "messages": [...]} +``` + +Safety and fallback: + +- If `headroom-ai` is not installed (or raises), the optimizer **transparently falls back + to `native`** and logs a one-time warning. Requests never fail because of the backend. +- Token-savings metrics are always recomputed with NadirClaw's own estimator, so reported + numbers stay consistent across backends (Savings/Billing math is unaffected). +- Headroom's ML text compressor (Kompress) downloads a HuggingFace model on first use, so + it is kept **disabled** by default. Opt in with `NADIRCLAW_HEADROOM_KOMPRESS=on`. +- The fastest Headroom compressors (SmartCrusher etc.) are a compiled Rust extension bundled + in the prebuilt wheels. On source installs without the wheel they simply don't run, and + Headroom fails open — output is still correct, just less compressed. + +Attribution for the Apache-2.0 dependency lives in +[`THIRD_PARTY_NOTICES.md`](../THIRD_PARTY_NOTICES.md). 
+ +## Progressive (staged) compression + +`compress_progressive()` escalates through compression stages and **stops as soon as a +token budget is met** — so you only pay the cost (and fidelity risk) of heavier compression +when lighter stages aren't enough. Headroom is wired in as the middle/late tiers. + +The ladder, cheapest/safest first: + +| Stage | What runs | Loss | Needs | +|---|---|---|---| +| 1. `native_safe` | system/tool dedup, json minify, whitespace | lossless | — | +| 2. `native_aggressive` | + columnar packing, semantic dedup, Pro transforms | lossless-to-semantic | — | +| 3. `headroom_structural` | Headroom content compressors (SmartCrusher, LogCompressor, …) | high-fidelity | `headroom-ai` | +| 4. `headroom_ml` | Headroom Kompress (ML token-dropping on prose) | lossy | `headroom-ai` + `allow_lossy` | + +Rules: + +- With **no `target_tokens`**, the ladder stops after `native_aggressive` — Headroom and the + lossy ML stage are never reached. Default behaviour stays dependency-free and lossless. +- The Headroom stages are **skipped silently** when `headroom-ai` is not installed. +- `headroom_ml` (lossy) only runs when `allow_lossy=True`. +- Chat-history trimming always runs last as a final backstop. + +```python +from nadirclaw.optimize import compress_progressive # or nadir.optimize for Pro + +result = compress_progressive( + messages, + target_tokens=180_000, # e.g. 
the model's context window + allow_lossy=False, # set True to permit the lossy ML stage + max_stage="headroom_structural", +) +# result.optimizations_applied is prefixed with stage: markers that ran +``` + +Enable it on the server — `progressive` is just a value of the single `optimize` +control, alongside `off` / `safe` / `aggressive`: + +```bash +# off | safe | aggressive | progressive (off = compression disabled) +NADIRCLAW_OPTIMIZE=progressive \ +NADIRCLAW_OPTIMIZE_TARGET_TOKENS=180000 \ +NADIRCLAW_OPTIMIZE_MAX_STAGE=headroom_structural \ +nadirclaw serve + +# equivalently: nadirclaw serve --optimize progressive +# per-request: {"optimize": "progressive", "messages": [...]} +# turn compression off: {"optimize": "off", ...} +``` + +On a logs+prose payload where native compression yields ~0%, escalating to +`headroom_structural` reached ~90% — the escalation only spends the Headroom budget when +native genuinely can't deliver. diff --git a/docs/optimizer-benchmark-2026-06.md b/docs/optimizer-benchmark-2026-06.md new file mode 100644 index 0000000..7c05f02 --- /dev/null +++ b/docs/optimizer-benchmark-2026-06.md @@ -0,0 +1,166 @@ +# Context Optimizer Benchmark — native vs Headroom, synthetic vs real data + +**Date:** 2026-06-06 +**Scope:** Evaluate the context optimizer (`nadirclaw.optimize` / `nadir.optimize`) across +backends (`native`, `headroom`) and modes (`safe`, `aggressive`) on both synthetic +payloads and real public coding + chat datasets. Establish whether Headroom improves +on the native pipeline, and where the realistic savings ceiling is. + +## TL;DR + +- **On real conversational traffic, lossless savings are 1–10%, not the 30–60% synthetic + payloads suggest.** Real traffic is prose-dominated; structural optimizers (JSON minify, + schema dedup, whitespace) have little to grab. +- **The `headroom` *library `compress()` wrapper* underdelivers** (it routes conservatively and + protects messages). 
But **Headroom's transforms, called directly "as is", do reproduce their + published numbers** — LogCompressor ~80% on repetitive logs, SmartCrusher ~73% on homogeneous + JSON arrays. The earlier "Headroom underperforms" verdict was about the wrapper, not the engine. +- **One real capability gap was found and closed natively:** SmartCrusher packs homogeneous JSON + arrays into a columnar table (~50% beyond our `json_minify`). We now do this losslessly in + `aggressive` mode via the new `json_array_pack` transform (68% vs pretty JSON, vs Headroom's 73%), + with no dependency and no CCR machinery. +- **Pro-aggressive is the best performer** everywhere — all native, no new dependencies. +- **Decision:** keep `native` the default, `headroom` a safe opt-in (already shipped this way). + Remaining gains require prose compression (lossy ML + a CCR recovery path that is not built). + +## Method + +- **Backends compared:** `native-safe` (lossless, ships today), `pro-aggressive` (native + ceiling: + secret-mask, tool-schema compaction, log/stack compression, semantic dedup), + `headroom` (optional `headroom-ai` backend, Kompress disabled unless noted). +- **Token metric:** the optimizer's own tiktoken `cl100k_base` estimator, applied identically + to every backend's output, so comparisons are fair regardless of each engine's internal count. +- **Datasets (public, no PII):** + - Chat: `allenai/WildChat-1M` — real multi-turn user↔assistant conversations. + - Coding/tools: `glaiveai/glaive-function-calling-v2` — tool schemas + function calls + JSON. + - 200 conversations each, fetched via the HF datasets-server `/rows` API (no full download). +- **Environment note:** `headroom-ai` (Rust/PyO3 ≤ 3.13) **cannot build on Python 3.14**. + Benchmarks ran on a Python 3.13 venv with the prebuilt wheel. getnadir/Nadir target 3.12, so + this is fine in production, but 3.13 is the current ceiling for the Headroom dependency. 
+ +## Results — synthetic payloads + +Hand-built "bloated" payloads (repeated tool schemas, pretty-printed JSON arrays, log dumps): + +| Backend | total reduction | notes | +|---|--:|---| +| native-safe | **30.4%** | lossless; strong on repeated tool schemas | +| headroom | 25.2% | worse — no cross-message schema dedup; lossy crush never fired | +| pro-aggressive | **60.3%** | `pattern_compression` took 200 log lines 0% → 87% | + +## Results — real data (the important part) + +| | raw tokens | native-safe | pro-aggressive | headroom | +|---|--:|--:|--:|--:| +| **Chat** (WildChat-1M, 195 convs) | 191,101 | **1.0%** | **4.7%** | 0.1% | +| **Coding/tools** (glaive, 200 convs) | 111,697 | **8.2%** | **9.6%** | 2.6% | + +Transform frequency (real data): +- Chat: `whitespace_normalize` 37×, `semantic_dedup` 20× (the only real lift), `json_minify` 7×. +- Coding: `json_minify` 135× (the workhorse), `tool_schema_compact` 53×, `tool_schema_dedup` **0×** + (real requests carry each schema once, not repeated across turns). + +## Diagnosis — why real savings are low + +Token mass is **prose-dominated**, and structural optimizers cannot compress prose: + +- **Chat is ~100% natural language.** Native has almost nothing to grab (1%). `semantic_dedup` + is the only lever that moved it (→ 4.7%). +- **Coding token distribution (glaive, by role):** assistant **67.7k (60%)**, system 22.3k (20%, + the tool schemas), user 17.3k, tool 4.8k. The compressible part is the 20% of JSON schema; + the 60% assistant prose is untouchable by structural methods. +- **Lossless levers are exhausted:** + - Fenced JSON (the minifier skips code fences): **0** minifiable blocks in chat, **1** in glaive. + The 66 chat code-fences are *code*, not JSON. + - Verbatim block repetition across turns (≥80-char lines repeated in an earlier message): + **2.5%** chat / **0.5%** coding — the largest remaining lossless lever, and still small. 
+ +## Headroom findings (tested two ways) + +**Via the library `compress()` wrapper (what our backend integrates):** underdelivers. +SmartCrusher and CodeCompressor never engaged at any `target_ratio` — the wrapper routes +conservatively and protects user/recent messages, so the heavy transforms rarely fire on real +message content. This is why our backend benchmark was low. + +**Calling the transforms directly ("as is"):** the published per-type numbers reproduce. + +| Transform (direct call) | content | result | notes | +|---|---|--:|---| +| `LogCompressor.compress` | 200 repetitive log lines | **79.7%** (lossy) | matches their 80–95% claim; our Pro `pattern_compression` does 87% on the same logs | +| `SmartCrusher.crush` | 100-row homogeneous JSON array | **73% vs pretty** (lossless table) | columnar format; this is the one capability we lacked | +| `SmartCrusher.crush` | 50 *unique* (non-redundant) objects | ~47% (lossless) | falls back to ≈ our `json_minify` when rows aren't homogeneous | +| `CodeAwareCompressor.compress` | Python source | **broke** (AST bytes bug → `syntax_valid=False`) | not usable in this build | + +Why their headline % looks bigger than ours: it is measured against **pretty-printed** JSON, and +on **ideal redundant content** (homogeneous arrays, repetitive logs). Real conversational traffic +is prose-dominated and we already minify, so the marginal win is smaller — except the columnar +table, which is genuinely additive (see below). + +**Kompress (ML token-dropping)** is the one place Headroom wins on *prose*: ~12% on unique prose +(native: 0%), ~60% on repetitive boilerplate. But it is lossy (drops function words, fuses +sentences) and emits a `[... Retrieve more: hash=...]` marker that is **unrecoverable in our +wiring** (no `headroom_retrieve` endpoint). It stays `disabled` by default. 
+ +## Native columnar packing (`json_array_pack`) — the capability we adopted + +The only reproducible Headroom win we lacked was SmartCrusher's columnar table for homogeneous +JSON arrays. We now do it natively in `aggressive` mode: + +| 100-row homogeneous array | tokens | vs pretty | +|---|--:|--:| +| pretty JSON | 4,202 | — | +| `json_minify` (safe) | 2,302 | 45% | +| **`json_array_pack` (aggressive)** | **1,323** | **68%** | +| Headroom SmartCrusher | 1,119 | 73% | + +It rewrites an array of same-keyed objects into a header (`⟦cols=[...]⟧`) plus one JSON +value-array per row, emitting each key once instead of N times. It is **information-lossless and +deterministically reversible** (`_unpack_table`), runs only when the array is strictly homogeneous +(≥ 5 rows, identical key sets) and only when it saves tokens, and **never runs in `safe` mode** +(it is not byte-identical JSON). The 5pp gap to SmartCrusher is format: it uses bare CSV rows; we +keep JSON-array rows so reversibility is robust across nested/special values. + +**Caveat:** the public chat/coding datasets above barely contain homogeneous arrays +(`json_array_pack` fired once on glaive), so this does not move those totals. It targets +tool-output traffic — DB query results, API list responses, large `get_*` tool returns — which +production agent loops carry but these datasets do not. + +## Code safety (correctness fix) + +Testing the optimizer on raw source code surfaced a real bug: `whitespace_normalize` +collapsed the **leading indentation** of unfenced code (file-read tool outputs), flattening +nested Python into **invalid syntax** while reporting ~12–14% "savings". That apparent +code compression was the corruption — not real savings. + +Fixed across all three optimizer copies (NadirClaw, Nadir Pro inherits it, getnadir): the +normalizer now preserves leading whitespace and only collapses interior multi-spaces. 
+Regression tests assert raw code stays `ast.parse`-valid in both safe and aggressive modes. + +Corrected takeaway: honest lossless savings on clean source code are **~0%** — structural +optimization has nothing safe to remove from well-formatted code. (Headroom's CodeCompressor +is the only engine that targets code, and it errored to invalid output in this build.) + +## Recommendations + +1. **Keep the shipped posture.** Native default, headroom opt-in, Kompress off. Validated correct. +2. **For chat-heavy traffic, `aggressive` mode is the lever** — `semantic_dedup` is the only thing + that moves prose, lossless-ish, already available in NadirClaw and Pro. +3. **Optional small lossless win (deferred):** a verbatim block-dedup transform would add ~2.5% on + chat. Modest; weigh against the readability cost of inline reference markers. +4. **Large prose savings require investment:** ML token compression (Kompress) behind a real + CCR `headroom_retrieve` recovery endpoint. Only worth it if prose-heavy traffic dominates the + bill. Not built; lossy without it. + +## Reproduction + +```bash +# Headroom needs Python <= 3.13 (Rust/PyO3). Build a 3.13 venv: +python3.13 -m venv /tmp/hr-bench +/tmp/hr-bench/bin/pip install "headroom-ai>=0.23.0" tiktoken sentence-transformers +# Real-data benchmark (fetches 200 convs each from WildChat + glaive via HF datasets-server): +/tmp/hr-bench/bin/python NadirClaw/benchmarks/optimize_real_data.py +``` + +The benchmark script lives at [`benchmarks/optimize_real_data.py`](../benchmarks/optimize_real_data.py). +See also [context-optimize-savings.md](context-optimize-savings.md) for the transform-level +savings analysis and the `native` vs `headroom` backend reference. 
diff --git a/nadirclaw/__init__.py b/nadirclaw/__init__.py index 9a92fda..f887770 100644 --- a/nadirclaw/__init__.py +++ b/nadirclaw/__init__.py @@ -1,3 +1,3 @@ """NadirClaw — Open-source LLM router.""" -__version__ = "0.19.1" +__version__ = "0.19.2" diff --git a/nadirclaw/ccr.py b/nadirclaw/ccr.py new file mode 100644 index 0000000..c510a6b --- /dev/null +++ b/nadirclaw/ccr.py @@ -0,0 +1,161 @@ +"""CCR — Compress-Cache-Retrieve fetch-back loop (native, deterministic). + +The biggest prompt-token win is *offloading* large message content out of the +prompt and letting the model pull it back on demand. Headroom does this, but its +store is driven by a ContextVar written from a worker thread with a +non-deterministic key — reliable only inside their own proxy. So NadirClaw owns +the loop natively: + + 1. ``offload_messages`` moves oversized content into a ``{hash: original}`` map + we control, leaving a short preview + a ``nadir_retrieve(hash=...)`` marker. + 2. ``retrieve_tool_def`` is injected so the model knows it can fetch the rest. + 3. ``resolve_loop`` intercepts the model's retrieve calls, serves the exact + original from the map, and continues — so nothing is ever lost. + +This composes with the optimizer's lossless transforms (the inline content still +gets compressed); offload is the last, most aggressive tier and is fully +reversible because we keep every original byte. +""" + +from __future__ import annotations + +import hashlib +import json +import re + +from nadirclaw.optimize import _estimate_tokens_str + +RETRIEVE_TOOL_NAME = "nadir_retrieve" + +# Default: offload non-user messages whose content exceeds this many tokens. 
DEFAULT_MIN_OFFLOAD_TOKENS = 400
_OFFLOAD_ROLES = ("tool", "system", "assistant", "function")
# Captures the hash of an offload marker: hash="…", hash='…' or bare hash=….
_MARKER_RE = re.compile(r'hash=[\'"]?([a-f0-9]{8,})')


def retrieve_tool_def() -> dict:
    """OpenAI-format function tool the model calls to fetch offloaded content."""
    return {
        "type": "function",
        "function": {
            "name": RETRIEVE_TOOL_NAME,
            "description": (
                "Retrieve the full original content that was offloaded from the "
                "prompt to save tokens. Pass the hash from an offload marker like "
                "'[... offloaded ... retrieve full content with nadir_retrieve(hash=\"abc123\")]'."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "hash": {"type": "string", "description": "Hash from the offload marker."}
                },
                "required": ["hash"],
            },
        },
    }


def _hash(content: str) -> str:
    """Return the first 16 hex digits of the SHA-256 of *content* (UTF-8)."""
    return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]


def offload_messages(
    messages: list[dict],
    *,
    min_tokens: int = DEFAULT_MIN_OFFLOAD_TOKENS,
    roles: tuple = _OFFLOAD_ROLES,
):
    """Move oversized message content into a captured map, leaving a retrieve marker.

    A message is offloaded when its ``content`` is a string, its ``role`` is in
    *roles*, and the content is estimated at *min_tokens* tokens or more. With
    the default *roles*, user turns are never offloaded. Offloaded messages are
    replaced by copies; all other messages are passed through as the same objects.

    Returns ``(messages, captured, hashes)`` where ``captured[hash]`` is the
    exact original content and ``hashes`` lists the distinct hashes in
    first-seen order.
    """
    captured: dict[str, str] = {}
    out: list[dict] = []
    for m in messages:
        content = m.get("content")
        if not isinstance(content, str) or m.get("role") not in roles:
            out.append(m)
            continue
        # Estimate once and reuse the figure for both the threshold and the marker.
        tokens = _estimate_tokens_str(content)
        if tokens < min_tokens:
            out.append(m)
            continue
        h = _hash(content)
        captured[h] = content
        # One-line preview: collapse all whitespace runs in the first 140 chars.
        preview = re.sub(r"\s+", " ", content[:140]).strip()
        out.append({
            **m,
            "content": (
                f"[{tokens} tokens offloaded to save context — preview: {preview}… — "
                f'retrieve the full content with {RETRIEVE_TOOL_NAME}(hash="{h}")]'
            ),
        })
    # dict keys keep first-insertion order, which is exactly the hash order.
    return out, captured, list(captured)


def resolve(captured: dict, hash_key: str) -> "str | None":
    """Resolve an offload hash to its exact original content.

    Returns ``None`` when *captured* or *hash_key* is empty, or the hash is unknown.
    """
    if not captured or not hash_key:
        return None
    return captured.get(hash_key)


def marker_hashes(messages: list[dict]) -> list[str]:
    """Return offload hashes referenced by markers in *messages* (order-preserving).

    Every string in the structure is scanned (dict keys and values, list
    items), so markers inside multi-part content are found too. The raw strings
    are searched rather than a ``json.dumps`` rendering: JSON escapes ``"`` as
    ``\\"``, which would hide the ``hash="…"`` form written by
    :func:`offload_messages` from ``_MARKER_RE``.
    """
    found: dict[str, None] = {}  # used as an insertion-ordered set

    def _scan(node) -> None:
        if isinstance(node, str):
            for h in _MARKER_RE.findall(node):
                found.setdefault(h, None)
        elif isinstance(node, dict):
            for key, value in node.items():
                _scan(key)
                _scan(value)
        elif isinstance(node, (list, tuple)):
            for item in node:
                _scan(item)

    _scan(messages)
    return list(found)


def extract_retrieve_calls(response: dict) -> list[tuple[str, str]]:
    """Parse an OpenAI-format response for ``nadir_retrieve`` tool calls -> [(id, hash)].

    ``arguments`` may be a JSON string or an already-decoded object. Anything
    that does not decode to a JSON object is treated as "no arguments" (empty
    hash) instead of raising, since the arguments are model-generated.
    """
    calls: list[tuple[str, str]] = []
    for choice in (response or {}).get("choices", []):
        message = choice.get("message", {}) or {}
        for tc in message.get("tool_calls", []) or []:
            fn = tc.get("function", {}) or {}
            if fn.get("name") != RETRIEVE_TOOL_NAME:
                continue
            args = fn.get("arguments")
            if isinstance(args, str):
                try:
                    args = json.loads(args)
                except (ValueError, RecursionError):
                    args = {}
            if not isinstance(args, dict):
                # e.g. arguments == '"abc"' or '[...]' — valid JSON, wrong shape.
                args = {}
            hash_key = args.get("hash", "")
            if not isinstance(hash_key, str):
                # Keep the value hashable so the later lookup cannot raise.
                hash_key = str(hash_key)
            calls.append((tc.get("id", ""), hash_key))
    return calls


def resolve_loop(messages, first_response, captured, call_llm, *, max_rounds: int = 3):
    """Server-side fetch-back loop.

    While the model asks for offloaded content, resolve each hash from *captured*,
    append the originals as tool messages, and re-invoke ``call_llm(messages)``
    until the model returns a final (non-retrieval) answer or *max_rounds* is hit.
    ``call_llm`` is injected so this is testable without a live provider.
    Returns ``(final_response, full_conversation)``. When *max_rounds* is
    exhausted the returned response may itself still be a retrieval request.
    """
    convo = list(messages)
    response = first_response
    for _ in range(max_rounds):
        calls = extract_retrieve_calls(response)
        if not calls:
            return response, convo
        # Replay the assistant turn that issued the tool calls, then answer each one.
        convo = convo + [response["choices"][0]["message"]]
        for tool_call_id, h in calls:
            original = resolve(captured, h)
            convo.append({
                "role": "tool",
                "tool_call_id": tool_call_id,
                "name": RETRIEVE_TOOL_NAME,
                "content": original if original is not None else f"[retrieve failed: unknown hash {h}]",
            })
        response = call_llm(convo)
    return response, convo
logging +import os import re from dataclasses import dataclass, field @@ -29,6 +31,9 @@ class OptimizeResult: tokens_saved: int mode: str optimizations_applied: list[str] = field(default_factory=list) + # When progressive offload runs, maps offload-hash -> original content so the + # caller can inject the retrieve tool and serve the fetch-back loop (see ccr.py). + offload_captured: dict = field(default_factory=dict) # --------------------------------------------------------------------------- @@ -248,8 +253,12 @@ def _normalize_whitespace(content: str) -> tuple[str, bool]: if in_code_block: out_lines.append(line) continue - # Collapse multi-spaces outside code blocks - out_lines.append(_MULTI_SPACES.sub(" ", line)) + # Collapse interior multi-spaces but PRESERVE leading indentation — + # otherwise raw (unfenced) source code has its indentation flattened + # into invalid syntax. Leading whitespace is semantically significant + # (Python, YAML, diffs), so it must survive even in "safe" mode. + n_lead = len(line) - len(line.lstrip(" \t")) + out_lines.append(line[:n_lead] + _MULTI_SPACES.sub(" ", line[n_lead:])) result = "\n".join(out_lines) # Collapse 3+ consecutive blank lines → 2 @@ -444,11 +453,117 @@ def _semantic_dedup( return result, changed +# --------------------------------------------------------------------------- +# Transform — Homogeneous JSON-array packing (aggressive, columnar) +# --------------------------------------------------------------------------- +# +# Large arrays of objects that share the same keys (DB query results, API list +# responses, tool outputs) repeat every key on every row. Packing them into a +# columnar table — a single header of keys plus one JSON value-array per row — +# emits each key once instead of N times. This is *information-lossless* +# (deterministically reversible via _unpack_table) but not byte-identical JSON, +# so it runs in aggressive mode only, never in safe mode. 
+ +_TABLE_OPEN = "⟦cols=" # ⟦cols=[...]⟧ +_TABLE_CLOSE = "⟧" # ⟧ +_TABLE_END = "⟦end⟧" # ⟦end⟧ +_MIN_TABLE_ROWS = 5 + + +def _pack_array(arr: list) -> "str | None": + """Pack a homogeneous list-of-dicts into a columnar table, or None if unfit. + + Only packs when every element is a dict with the *identical* key set (so the + reverse is unambiguous) and there are at least ``_MIN_TABLE_ROWS`` rows. + """ + if len(arr) < _MIN_TABLE_ROWS or not all(isinstance(x, dict) for x in arr): + return None + cols = list(arr[0].keys()) + if len(cols) < 2: + return None + colset = set(cols) + for d in arr: + if set(d.keys()) != colset: + return None # not strictly homogeneous — leave for json_minify + lines = [f"{_TABLE_OPEN}{json.dumps(cols, separators=(',', ':'), ensure_ascii=False)}{_TABLE_CLOSE}"] + for d in arr: + lines.append(json.dumps([d[c] for c in cols], separators=(",", ":"), ensure_ascii=False)) + lines.append(_TABLE_END) + return "\n".join(lines) + + +def _unpack_table(packed: str) -> list: + """Inverse of :func:`_pack_array` — reconstruct the exact list-of-dicts.""" + lines = packed.split("\n") + header = lines[0] + cols = json.loads(header[len(_TABLE_OPEN):-len(_TABLE_CLOSE)]) + rows = [json.loads(ln) for ln in lines[1:] if ln and ln != _TABLE_END] + return [dict(zip(cols, r)) for r in rows] + + +def _pack_homogeneous_arrays(content: str) -> tuple[str, bool]: + """Replace embedded homogeneous JSON arrays with a compact columnar table. + + Skips fenced code blocks and only replaces when the packed form is strictly + smaller (in tokens) than the minified array. 
+ """ + if not content or len(content) < 80 or "[" not in content: + return content, False + + parts = re.split(r"(```[^\n]*\n.*?```)", content, flags=re.DOTALL) + changed = False + out_segments: list[str] = [] + for seg in parts: + if seg.startswith("```"): + out_segments.append(seg) + continue + new_seg, seg_changed = _pack_segment(seg) + out_segments.append(new_seg) + changed = changed or seg_changed + return "".join(out_segments), changed + + +def _pack_segment(text: str) -> tuple[str, bool]: + decoder = json.JSONDecoder() + result: list[str] = [] + pos = 0 + changed = False + while pos < len(text): + idx = text.find("[", pos) + if idx == -1: + result.append(text[pos:]) + break + result.append(text[pos:idx]) + try: + obj, end = decoder.raw_decode(text, idx) + except (json.JSONDecodeError, ValueError): + result.append("[") + pos = idx + 1 + continue + packed = _pack_array(obj) if isinstance(obj, list) else None + if packed is not None: + minified = json.dumps(obj, separators=(",", ":"), ensure_ascii=False) + if _estimate_tokens_str(packed) < _estimate_tokens_str(minified): + result.append(packed) + changed = True + else: + result.append(text[idx:end]) + else: + result.append(text[idx:end]) + pos = end + return "".join(result), changed + + _SAFE_TRANSFORMS = [ ("system_prompt_dedup", lambda msgs, **_: _dedup_system_prompts(msgs)), ("tool_schema_dedup", lambda msgs, **_: _dedup_tool_schemas(msgs)), ] +# Core aggressive content-level transforms (run before caller-supplied hooks). 
+_AGGRESSIVE_CONTENT_TRANSFORMS = [ + ("json_array_pack", _pack_homogeneous_arrays), +] + # Content-level transforms (operate on individual message content strings) _SAFE_CONTENT_TRANSFORMS = [ ("json_minify", _minify_json_in_content), @@ -456,10 +571,175 @@ def _semantic_dedup( ] +# --------------------------------------------------------------------------- +# Backend selection — native (default) or headroom (opt-in) +# --------------------------------------------------------------------------- + +_headroom_warned = False + + +def _resolve_backend(backend: str | None) -> str: + """Resolve the optimizer backend: explicit arg, else env, else ``native``.""" + val = (backend or os.getenv("NADIRCLAW_OPTIMIZE_BACKEND", "native")).lower() + return val if val in ("native", "headroom") else "native" + + +def _warn_headroom_once(msg: str) -> None: + global _headroom_warned + if not _headroom_warned: + _headroom_warned = True + logging.getLogger(__name__).warning( + "context-optimize: %s — falling back to native backend.", msg + ) + + +def _headroom_optimize( + messages: list[dict], + mode: str, + max_turns: int, + original_tokens: int, +) -> "OptimizeResult | None": + """Compress via the optional ``headroom-ai`` package (Apache-2.0). + + Returns ``None`` so the caller transparently falls back to the native + pipeline when ``headroom-ai`` is not installed or raises. Token metrics + are recomputed with our own estimator so reported savings stay consistent + across backends (Savings/Billing math depends on a single estimator). + """ + try: + from headroom import compress, CompressConfig + except Exception: + _warn_headroom_once("headroom-ai not installed (pip install nadirclaw[headroom])") + return None + + try: + # Kompress (ML token compression) downloads a HuggingFace model on + # first use, so it stays opt-in behind an explicit env flag. 
+ kompress_on = os.getenv("NADIRCLAW_HEADROOM_KOMPRESS", "off").lower() in ( + "on", "1", "true", "yes", + ) + cfg = CompressConfig( + compress_user_messages=(mode == "aggressive"), + kompress_model=None if kompress_on else "disabled", + ) + result = compress([{**m} for m in messages], config=cfg) + msgs = list(result.messages) + # Conversation-turn trimming is ours, not headroom's — apply for parity. + msgs, _ = _trim_chat_history(msgs, max_turns=max_turns) + except Exception as exc: # pragma: no cover — defensive; headroom itself fails open + _warn_headroom_once(f"headroom compress failed: {exc}") + return None + + optimized_tokens = _estimate_tokens_messages(msgs) + applied = [f"headroom:{t}" for t in (getattr(result, "transforms_applied", None) or [])] + return OptimizeResult( + messages=msgs, + original_tokens=original_tokens, + optimized_tokens=optimized_tokens, + tokens_saved=max(0, original_tokens - optimized_tokens), + mode=mode, + optimizations_applied=applied or ["headroom"], + ) + + +# --------------------------------------------------------------------------- +# Reusable pipeline stages (shared by optimize_messages + compress_progressive) +# --------------------------------------------------------------------------- + +def _apply_content_transforms(msgs: list[dict], transforms) -> tuple[list[dict], list[str]]: + """Apply ``[(name, fn)]`` content-level transforms; return (msgs, applied).""" + applied: list[str] = [] + for name, fn in transforms: + content_changed = False + for i, m in enumerate(msgs): + content = m.get("content") + if not isinstance(content, str) or len(content) < 10: + continue + new_content, changed = fn(content) + if changed: + msgs[i] = {**m, "content": new_content} + content_changed = True + if content_changed: + applied.append(name) + return msgs, applied + + +def _apply_safe_transforms(msgs: list[dict], extra_safe_content=None) -> tuple[list[dict], list[str]]: + """Lossless structural transforms (system/tool-schema dedup, json 
minify, whitespace).""" + applied: list[str] = [] + for name, fn in _SAFE_TRANSFORMS: + msgs, did_change = fn(msgs) + if did_change: + applied.append(name) + msgs, content_applied = _apply_content_transforms( + msgs, list(_SAFE_CONTENT_TRANSFORMS) + list(extra_safe_content or []) + ) + return msgs, applied + content_applied + + +def _apply_aggressive_transforms( + msgs: list[dict], extra_aggressive_message=None, extra_aggressive_content=None +) -> tuple[list[dict], list[str]]: + """Columnar packing + caller hooks + semantic dedup (lossless-to-semantic).""" + applied: list[str] = [] + msgs, a = _apply_content_transforms(msgs, list(_AGGRESSIVE_CONTENT_TRANSFORMS)) + applied += a + for name, fn in (extra_aggressive_message or []): + msgs, did_change = fn(msgs) + if did_change: + applied.append(name) + msgs, a = _apply_content_transforms(msgs, list(extra_aggressive_content or [])) + applied += a + msgs, did_semantic = _semantic_dedup(msgs) + if did_semantic: + applied.append("semantic_dedup") + return msgs, applied + + +def _headroom_stage(msgs: list[dict], *, kompress: bool) -> "tuple[list[dict], list[str]] | None": + """Run Headroom's content compressors over the messages (optional dep). + + Protections are disabled so the structural compressors (SmartCrusher, + LogCompressor, ...) actually engage; the lossy ML prose compressor + (Kompress) is gated behind *kompress*. Returns ``None`` when ``headroom-ai`` + is unavailable or raises, so the caller can skip this stage cleanly. 
+ """ + try: + from headroom import compress, CompressConfig + except Exception: + _warn_headroom_once("headroom-ai not installed (pip install nadirclaw[headroom])") + return None + try: + cfg = CompressConfig( + compress_user_messages=True, + compress_system_messages=True, + protect_recent=0, + protect_analysis_context=False, + kompress_model=None if kompress else "disabled", + ) + result = compress([{**m} for m in msgs], config=cfg) + except Exception as exc: # pragma: no cover — headroom itself fails open + _warn_headroom_once(f"headroom compress failed: {exc}") + return None + seen: set[str] = set() + applied: list[str] = [] + for t in (getattr(result, "transforms_applied", None) or []): + name = "headroom:" + (t.split(":")[0] if ":" in t else t) + if name not in seen: + seen.add(name) + applied.append(name) + return list(result.messages), applied or ["headroom"] + + def optimize_messages( messages: list[dict], mode: str = "off", max_turns: int = 40, + *, + backend: str | None = None, + extra_safe_content: "list | None" = None, + extra_aggressive_message: "list | None" = None, + extra_aggressive_content: "list | None" = None, ) -> OptimizeResult: """Optimize a list of message dicts for token reduction. @@ -472,6 +752,14 @@ def optimize_messages( (safe + semantic deduplication via sentence embeddings). max_turns Maximum conversation turns to keep when trimming history. + backend + ``"native"`` (default — the stdlib transform pipeline) or + ``"headroom"`` (delegates to the optional ``headroom-ai`` package and + falls back to native if it is unavailable). When ``None`` the + ``NADIRCLAW_OPTIMIZE_BACKEND`` env var is consulted (default native). + extra_safe_content, extra_aggressive_message, extra_aggressive_content + Optional ``[(name, fn)]`` hooks letting a superset package (Nadir Pro) + register additional transforms without forking this pipeline. 
Returns ------- @@ -489,36 +777,28 @@ def optimize_messages( mode="off", ) + # --- Backend selection (headroom is opt-in; native is the default) --- + if _resolve_backend(backend) == "headroom": + hr = _headroom_optimize(messages, mode, max_turns, original_tokens) + if hr is not None: + return hr + # else: fall through to the native pipeline below + applied: list[str] = [] # Deep copy messages to avoid mutating input msgs = [{**m} for m in messages] - # --- Message-level transforms (safe) --- - for name, fn in _SAFE_TRANSFORMS: - msgs, did_change = fn(msgs) - if did_change: - applied.append(name) - - # --- Content-level transforms (safe) --- - for name, fn in _SAFE_CONTENT_TRANSFORMS: - content_changed = False - for i, m in enumerate(msgs): - content = m.get("content") - if not isinstance(content, str) or len(content) < 10: - continue - new_content, changed = fn(content) - if changed: - msgs[i] = {**m, "content": new_content} - content_changed = True - if content_changed: - applied.append(name) + # --- Safe (lossless) transforms --- + msgs, a = _apply_safe_transforms(msgs, extra_safe_content) + applied += a # --- Aggressive-only transforms --- if mode == "aggressive": - msgs, did_semantic = _semantic_dedup(msgs) - if did_semantic: - applied.append("semantic_dedup") + msgs, a = _apply_aggressive_transforms( + msgs, extra_aggressive_message, extra_aggressive_content + ) + applied += a # --- Chat history trimming --- msgs, did_trim = _trim_chat_history(msgs, max_turns=max_turns) @@ -535,3 +815,135 @@ def optimize_messages( mode=mode, optimizations_applied=applied, ) + + +# --------------------------------------------------------------------------- +# Progressive (staged) compression +# --------------------------------------------------------------------------- + +# Escalation ladder, cheapest/safest first. Headroom is the middle tier (its +# structural compressors fill gaps native cannot — ragged JSON, big logs — and +# its lossy ML prose compressor is later). 
The final tier is native CCR offload: +# move oversized content out of the prompt behind a retrieve handle, fully +# reversible because the caller serves it back on demand (see ccr.py). +_PROGRESSIVE_STAGES = ( + "native_safe", "native_aggressive", "headroom_structural", "headroom_ml", "offload", +) + + +def compress_progressive( + messages: list[dict], + *, + target_tokens: "int | None" = None, + max_turns: int = 40, + allow_lossy: bool = False, + allow_offload: bool = False, + max_stage: str = "native_aggressive", + extra_safe_content: "list | None" = None, + extra_aggressive_message: "list | None" = None, + extra_aggressive_content: "list | None" = None, +) -> OptimizeResult: + """Apply escalating compression stages, stopping once a budget is met. + + Stages run in order and the loop stops as soon as the message set fits + ``target_tokens``: + + 1. ``native_safe`` — lossless structural transforms + 2. ``native_aggressive`` — + columnar packing, semantic dedup, caller hooks + 3. ``headroom_structural``— Headroom content compressors (optional dep) + 4. ``headroom_ml`` — Headroom Kompress (lossy ML prose) + 5. ``offload`` — native CCR: move oversized content out of the + prompt behind a ``nadir_retrieve`` handle + + Escalation rules: + + - When ``target_tokens`` is ``None`` the ladder stops after ``max_stage`` + (default ``native_aggressive``) — Headroom, lossy ML and offload are never + reached unless an explicit budget still isn't met. This keeps the default + behaviour dependency-free and lossless-to-semantic. + - The Headroom stages require the optional ``headroom-ai`` package and are + skipped silently when it is absent or errors. + - ``headroom_ml`` (lossy) only runs when ``allow_lossy=True``. + - ``offload`` only runs when ``allow_offload=True``. 
It is fully reversible — + the originals are returned in ``OptimizeResult.offload_captured`` so the + caller MUST inject the retrieve tool (:func:`nadirclaw.ccr.retrieve_tool_def`) + and serve the fetch-back loop (:func:`nadirclaw.ccr.resolve_loop`), or the + model cannot recover the offloaded content. + - Chat-history trimming always runs last as a final backstop. + + Returns an :class:`OptimizeResult` whose ``mode`` is ``"progressive"`` and + whose ``optimizations_applied`` is prefixed with the ``stage:`` markers + that actually ran. + """ + original_tokens = _estimate_tokens_messages(messages) + msgs = [{**m} for m in messages] + applied: list[str] = [] + stages_run: list[str] = [] + + ladder = list(_PROGRESSIVE_STAGES) + if max_stage in ladder: + ladder = ladder[: ladder.index(max_stage) + 1] + if not allow_lossy and "headroom_ml" in ladder: + ladder.remove("headroom_ml") + if not allow_offload and "offload" in ladder: + ladder.remove("offload") + + offload_captured: dict = {} + + def _fits() -> bool: + return target_tokens is not None and _estimate_tokens_messages(msgs) <= target_tokens + + for stage in ladder: + if _fits(): + break + # Headroom and offload stages only engage when a budget is set and unmet. + if stage in ("headroom_structural", "headroom_ml", "offload") and target_tokens is None: + break + + if stage == "native_safe": + msgs, a = _apply_safe_transforms(msgs, extra_safe_content) + elif stage == "native_aggressive": + msgs, a = _apply_aggressive_transforms( + msgs, extra_aggressive_message, extra_aggressive_content + ) + elif stage == "headroom_structural": + hr = _headroom_stage(msgs, kompress=False) + if hr is None: + continue + msgs, a = hr + elif stage == "headroom_ml": + hr = _headroom_stage(msgs, kompress=True) + if hr is None: + continue + msgs, a = hr + elif stage == "offload": + # Native CCR offload: move oversized content behind a retrieve handle. 
+ from nadirclaw import ccr + + msgs, captured, hashes = ccr.offload_messages(msgs) + if not hashes: + continue + offload_captured.update(captured) + a = ["offload"] + else: # pragma: no cover + continue + + applied += a + stages_run.append(stage) + + # Final backstop — trim history if still over budget (or unconditionally + # when over max_turns, matching optimize_messages). + msgs, did_trim = _trim_chat_history(msgs, max_turns=max_turns) + if did_trim: + applied.append("chat_history_trim") + + optimized_tokens = _estimate_tokens_messages(msgs) + return OptimizeResult( + messages=msgs, + original_tokens=original_tokens, + optimized_tokens=optimized_tokens, + tokens_saved=max(0, original_tokens - optimized_tokens), + mode="progressive", + optimizations_applied=[f"stage:{s}" for s in stages_run] + applied, + offload_captured=offload_captured, + ) diff --git a/nadirclaw/server.py b/nadirclaw/server.py index e16cb5a..e36e67b 100644 --- a/nadirclaw/server.py +++ b/nadirclaw/server.py @@ -1418,19 +1418,36 @@ async def chat_completions( # Context optimization — compact messages before dispatch # ------------------------------------------------------------------ optimize_mode = (request.model_extra or {}).get("optimize") or settings.OPTIMIZE + optimize_backend = (request.model_extra or {}).get("optimize_backend") or settings.OPTIMIZE_BACKEND optimization_info = None if optimize_mode != "off": - from nadirclaw.optimize import optimize_messages - raw_msgs = [ {"role": m.role, "content": m.text_content()} for m in request.messages ] - opt_result = optimize_messages( - raw_msgs, - mode=optimize_mode, - max_turns=settings.OPTIMIZE_MAX_TURNS, - ) + # `optimize=progressive` (or the legacy NADIRCLAW_OPTIMIZE_PROGRESSIVE + # flag) selects the staged ladder that escalates native → headroom → + # lossy ML only until the token budget is met. Headroom stages are + # skipped if headroom-ai is not installed. 
+ if optimize_mode == "progressive" or settings.OPTIMIZE_PROGRESSIVE: + from nadirclaw.optimize import compress_progressive + + opt_result = compress_progressive( + raw_msgs, + target_tokens=settings.OPTIMIZE_TARGET_TOKENS, + max_turns=settings.OPTIMIZE_MAX_TURNS, + allow_lossy=settings.OPTIMIZE_ALLOW_LOSSY, + max_stage=settings.OPTIMIZE_MAX_STAGE, + ) + else: + from nadirclaw.optimize import optimize_messages + + opt_result = optimize_messages( + raw_msgs, + mode=optimize_mode, + max_turns=settings.OPTIMIZE_MAX_TURNS, + backend=optimize_backend, + ) if opt_result.tokens_saved > 0: optimized_msgs = [ ChatMessage(role=m["role"], content=m["content"]) diff --git a/nadirclaw/settings.py b/nadirclaw/settings.py index 5def506..cccc89b 100644 --- a/nadirclaw/settings.py +++ b/nadirclaw/settings.py @@ -248,11 +248,16 @@ def PROVIDER_HEALTH_FAILURE_THRESHOLD(self) -> int: @property def OPTIMIZE(self) -> str: - """Context optimization mode: off, safe, aggressive. Default: off.""" + """Context optimization mode: off, safe, aggressive, progressive. Default: off. + + ``off`` disables compression entirely. ``safe``/``aggressive`` run the + single-pass pipeline; ``progressive`` runs the staged ladder that + escalates to Headroom only until the token budget is met. + """ val = os.getenv("NADIRCLAW_OPTIMIZE", "off").lower() - if val not in ("off", "safe", "aggressive"): + if val not in ("off", "safe", "aggressive", "progressive"): _settings_logger.warning( - "Invalid NADIRCLAW_OPTIMIZE=%r — expected off|safe|aggressive. " + "Invalid NADIRCLAW_OPTIMIZE=%r — expected off|safe|aggressive|progressive. " "Falling back to 'off'.", val, ) @@ -267,6 +272,76 @@ def OPTIMIZE_MAX_TURNS(self) -> int: except ValueError: return 40 + @property + def OPTIMIZE_BACKEND(self) -> str: + """Optimizer backend: native (default) or headroom (opt-in). + + ``native`` runs the built-in stdlib transform pipeline. 
``headroom`` + delegates to the optional ``headroom-ai`` package and transparently + falls back to native when it is not installed. Default: native. + """ + val = os.getenv("NADIRCLAW_OPTIMIZE_BACKEND", "native").lower() + if val not in ("native", "headroom"): + _settings_logger.warning( + "Invalid NADIRCLAW_OPTIMIZE_BACKEND=%r — expected native|headroom. " + "Falling back to 'native'.", + val, + ) + return "native" + return val + + @property + def OPTIMIZE_PROGRESSIVE(self) -> bool: + """Use progressive (staged) compression that escalates to Headroom only + when a token budget is still unmet after native transforms. Default: off.""" + return os.getenv("NADIRCLAW_OPTIMIZE_PROGRESSIVE", "false").lower() in ("true", "1", "yes", "on") + + @property + def OPTIMIZE_TARGET_TOKENS(self): + """Token budget for progressive compression. When set, escalation stops + as soon as the message set fits. Unset (default) → native stages only.""" + raw = os.getenv("NADIRCLAW_OPTIMIZE_TARGET_TOKENS", "").strip() + if not raw: + return None + try: + return max(1, int(raw)) + except ValueError: + return None + + @property + def OPTIMIZE_ALLOW_LOSSY(self) -> bool: + """Allow the lossy ML prose stage (Headroom Kompress) in progressive + compression. Default: off (escalation stops at lossless/structural).""" + return os.getenv("NADIRCLAW_OPTIMIZE_ALLOW_LOSSY", "false").lower() in ("true", "1", "yes", "on") + + @property + def OPTIMIZE_ALLOW_OFFLOAD(self) -> bool: + """Allow the native CCR offload stage (move oversized content behind a + ``nadir_retrieve`` handle) in progressive compression. Default: off. + + NOTE: offload is only safe when the caller injects the retrieve tool + (``nadirclaw.ccr.retrieve_tool_def``) and serves the fetch-back loop + (``nadirclaw.ccr.resolve_loop``). The built-in server does NOT yet run + that loop, so this setting is currently consumed only by direct library + callers of ``compress_progressive(allow_offload=True)`` — not by + ``nadirclaw serve``. 
Enabling it without a retrieve path means the model + sees a marker it cannot resolve.""" + return os.getenv("NADIRCLAW_OPTIMIZE_ALLOW_OFFLOAD", "false").lower() in ("true", "1", "yes", "on") + + @property + def OPTIMIZE_MAX_STAGE(self) -> str: + """Cap on the progressive escalation ladder. Default: headroom_structural.""" + val = os.getenv("NADIRCLAW_OPTIMIZE_MAX_STAGE", "headroom_structural").lower() + allowed = ("native_safe", "native_aggressive", "headroom_structural", "headroom_ml", "offload") + if val not in allowed: + _settings_logger.warning( + "Invalid NADIRCLAW_OPTIMIZE_MAX_STAGE=%r — expected one of %s. " + "Falling back to 'headroom_structural'.", + val, "|".join(allowed), + ) + return "headroom_structural" + return val + @property def CORS_ORIGINS(self) -> list[str]: """Allowed CORS origins (comma-separated). Empty = local-only regex default.""" diff --git a/nadirclaw/trained_verifier.py b/nadirclaw/trained_verifier.py index 4c83a12..644c205 100644 --- a/nadirclaw/trained_verifier.py +++ b/nadirclaw/trained_verifier.py @@ -32,9 +32,12 @@ >>> result = v.score(prompt, cheap_answer) >>> result.score, result.accepted # float in [0, 1], bool -The ``reference_answer`` and ``expect_json`` arguments are accepted for -parity with ``HeuristicVerifier`` but are currently ignored — the -trained model scores ``(prompt, cheap_answer)`` only. +The ``reference_answer`` argument, when provided, is folded into the +structured ``text_pair`` the cross-encoder was trained on (see +``score()``). When ``None``, an empty ``EXPENSIVE:`` block is +substituted, matching the production backend's behaviour. The +``expect_json`` argument is accepted for parity with +``HeuristicVerifier`` but is currently ignored by the trained model. Dependencies ------------ @@ -219,10 +222,16 @@ def score( ) -> TrainedScore: """Score how acceptable ``cheap_answer`` is for ``prompt``. 
- ``reference_answer`` and ``expect_json`` are accepted for - interface parity with ``HeuristicVerifier`` and are currently - ignored by the trained model. The cross-encoder was trained - on ``(prompt, cheap_answer)`` pairs only. + The cross-encoder was trained on inputs of the form + ``(prompt, "CHEAP:\\n{cheap}\\n\\nEXPENSIVE:\\n{reference}")``. + ``reference_answer`` is folded into the structured ``text_pair`` + when provided; when ``None`` an empty ``EXPENSIVE:`` block is + substituted, matching the production backend at + ``getnadir.dev/backend/app/services/verifier_model.py``. + + ``expect_json`` is accepted for interface parity with + ``HeuristicVerifier`` and is currently ignored by the trained + model. """ self._ensure_loaded() @@ -240,9 +249,17 @@ def score( import torch + # Match the training format used by the production backend + # (getnadir.dev/backend/app/services/verifier_model.py) and + # documented on the HuggingFace model card. Without the + # ``CHEAP:``/``EXPENSIVE:`` wrapper the scores drift against + # the calibrated tau=0.80 acceptance threshold. + text_pair = ( + f"CHEAP:\n{cheap}\n\nEXPENSIVE:\n{(reference_answer or '').strip()}" + ) enc = self._tokenizer( prompt or "", - cheap, + text_pair, truncation=True, max_length=_MAX_SEQ_LEN, padding=False, diff --git a/pyproject.toml b/pyproject.toml index 847a4a0..4392139 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,13 @@ cascade-rules = [ # (`load_profile`). `load_inline` works without it, so this is opt-in. "pyyaml>=6.0", ] +headroom = [ + # Optional context-compression backend (Apache-2.0, github.com/chopratejas/headroom). + # Activated with NADIRCLAW_OPTIMIZE_BACKEND=headroom. When absent, the optimizer + # transparently falls back to the built-in native pipeline, so this stays opt-in. + # Prebuilt wheels bundle the compiled Rust `_core` extension (SmartCrusher etc.). 
+ "headroom-ai>=0.23.0", +] telemetry = [ "opentelemetry-api>=1.20.0", "opentelemetry-sdk>=1.20.0", diff --git a/tests/test_ccr.py b/tests/test_ccr.py new file mode 100644 index 0000000..cad4cfb --- /dev/null +++ b/tests/test_ccr.py @@ -0,0 +1,138 @@ +"""Tests for the native CCR (Compress-Cache-Retrieve) fetch-back loop. + +Offload moves oversized non-user content out of the prompt behind a retrieve +handle, keeping the exact original in a map. The loop resolves the model's +retrieve calls so nothing is ever lost. All deterministic, no Headroom needed. +""" +import json + +import pytest + +import nadirclaw.ccr as ccr +import nadirclaw.optimize as o + + +def _big_tool_msgs(): + rows = [{"id": 1000 + i, "user": f"user{i}", "status": "active" if i % 4 else "suspended", + "plan": "pro" if i % 5 == 0 else "free"} for i in range(60)] + return [ + {"role": "system", "content": "You are a support assistant."}, + {"role": "user", "content": "How many users are suspended?"}, + {"role": "tool", "content": "get_users() ->\n" + json.dumps(rows, indent=2)}, + ] + + +# --------------------------------------------------------------------------- +# offload + resolve +# --------------------------------------------------------------------------- + +def test_offload_shrinks_and_captures(): + msgs = _big_tool_msgs() + before = o._estimate_tokens_messages(msgs) + out, captured, hashes = ccr.offload_messages(msgs) + after = o._estimate_tokens_messages(out) + assert after < before * 0.5 # big reduction + assert len(hashes) == 1 # the one big tool message + assert hashes[0] in captured + + +def test_offload_is_byte_exact_recoverable(): + msgs = _big_tool_msgs() + original = msgs[2]["content"] + out, captured, hashes = ccr.offload_messages(msgs) + assert ccr.resolve(captured, hashes[0]) == original # exact bytes back + + +def test_user_message_never_offloaded(): + msgs = [{"role": "user", "content": "x" * 5000}] # huge, but it's the user's turn + out, captured, hashes = 
ccr.offload_messages(msgs) + assert hashes == [] and out == msgs + + +def test_small_messages_not_offloaded(): + msgs = [{"role": "tool", "content": "short result"}] + out, captured, hashes = ccr.offload_messages(msgs, min_tokens=400) + assert hashes == [] + + +def test_marker_carries_the_hash(): + out, captured, hashes = ccr.offload_messages(_big_tool_msgs()) + marker = out[2]["content"] + assert f'hash="{hashes[0]}"' in marker + assert ccr.RETRIEVE_TOOL_NAME in marker + + +# --------------------------------------------------------------------------- +# retrieve tool + response parsing +# --------------------------------------------------------------------------- + +def test_retrieve_tool_def_shape(): + tool = ccr.retrieve_tool_def() + assert tool["function"]["name"] == ccr.RETRIEVE_TOOL_NAME + assert "hash" in tool["function"]["parameters"]["properties"] + + +def test_extract_retrieve_calls(): + resp = {"choices": [{"message": {"tool_calls": [ + {"id": "c1", "type": "function", + "function": {"name": "nadir_retrieve", "arguments": json.dumps({"hash": "deadbeef"})}}, + {"id": "c2", "type": "function", + "function": {"name": "other_tool", "arguments": "{}"}}, + ]}}]} + assert ccr.extract_retrieve_calls(resp) == [("c1", "deadbeef")] + + +# --------------------------------------------------------------------------- +# full fetch-back loop (mock LLM, no provider) +# --------------------------------------------------------------------------- + +def test_resolve_loop_recovers_data_and_answers(): + msgs = _big_tool_msgs() + rows_suspended = sum(1 for i in range(60) if not (i % 4)) # status logic in _big_tool_msgs + out, captured, hashes = ccr.offload_messages(msgs) + + def mock_llm(convo): + tool_msgs = [m for m in convo if m.get("role") == "tool" and m.get("name") == "nadir_retrieve"] + if not tool_msgs: # round 1: ask to retrieve + return {"choices": [{"message": {"role": "assistant", "content": None, "tool_calls": [ + {"id": "c1", "type": "function", + "function": 
{"name": "nadir_retrieve", "arguments": json.dumps({"hash": hashes[0]})}}]}}]} + data = tool_msgs[-1]["content"] # round 2: answer from the REAL data + return {"choices": [{"message": {"role": "assistant", + "content": f'{data.count(chr(34) + "suspended" + chr(34))} suspended'}}]} + + final, convo = ccr.resolve_loop(out, mock_llm(out), captured, mock_llm) + answer = final["choices"][0]["message"]["content"] + assert str(rows_suspended) in answer # model answered correctly from recovered data + # the resolved tool message in the conversation is the exact original + assert any(m.get("role") == "tool" and m.get("content") == msgs[2]["content"] for m in convo) + + +def test_resolve_loop_handles_unknown_hash(): + resp = {"choices": [{"message": {"tool_calls": [ + {"id": "c1", "type": "function", + "function": {"name": "nadir_retrieve", "arguments": json.dumps({"hash": "nope"})}}]}}]} + final, convo = ccr.resolve_loop([], resp, {}, lambda convo: {"choices": [{"message": {"content": "done"}}]}) + assert any("retrieve failed" in (m.get("content") or "") for m in convo) + + +# --------------------------------------------------------------------------- +# progressive offload stage +# --------------------------------------------------------------------------- + +def test_progressive_offload_gated_off_by_default(): + r = o.compress_progressive(_big_tool_msgs(), target_tokens=200, max_stage="offload") + assert r.offload_captured == {} + assert "stage:offload" not in r.optimizations_applied + + +def test_progressive_offload_engages_and_is_recoverable(): + import re + msgs = _big_tool_msgs() + r = o.compress_progressive(msgs, target_tokens=200, max_stage="offload", allow_offload=True) + assert "stage:offload" in r.optimizations_applied + assert r.offload_captured + # Offload captures the (losslessly) compressed content — not byte-identical to + # the pretty original, but still complete: every user must be recoverable. 
+ recovered = "".join(r.offload_captured.values()) + assert len(set(re.findall(r"user\d+", recovered))) == 60 diff --git a/tests/test_code_safety.py b/tests/test_code_safety.py new file mode 100644 index 0000000..d3886e7 --- /dev/null +++ b/tests/test_code_safety.py @@ -0,0 +1,70 @@ +"""Regression tests: compression must not corrupt source code. + +Raw (unfenced) code arrives in coding-agent traffic as file-read tool outputs. +Whitespace normalization must preserve leading indentation, or it flattens +Python/YAML/diffs into invalid syntax. Fenced code must stay byte-identical. +""" +import ast +import textwrap + +import pytest + +from nadirclaw.optimize import optimize_messages + +PY_SRC = textwrap.dedent('''\ + import json + + + def process(record, config): + result = {} + for key, spec in config.items(): + value = record.get(key) + if value is None: + if spec.get("required"): + raise ValueError(key) + continue + result[key] = value + return result + + + class Validator: + def __init__(self, schema): + self.schema = schema + + def check(self, data): + for field in self.schema: + if field not in data: + return False + return True +''') + + +@pytest.mark.parametrize("mode", ["safe", "aggressive"]) +def test_raw_code_stays_valid_python(mode): + """Unfenced source code in a tool message must remain parseable.""" + msgs = [{"role": "tool", "content": PY_SRC}] + out = optimize_messages(msgs, mode=mode).messages[0]["content"] + ast.parse(out) # raises SyntaxError if indentation was flattened + + +@pytest.mark.parametrize("mode", ["safe", "aggressive"]) +def test_leading_indentation_preserved(mode): + msgs = [{"role": "tool", "content": PY_SRC}] + out = optimize_messages(msgs, mode=mode).messages[0]["content"] + # The deepest line is indented 16 spaces; it must keep its indentation. 
+ line = next(ln for ln in out.split("\n") if "raise ValueError" in ln) + assert line.startswith(" raise ValueError") + + +def test_fenced_code_is_byte_identical(): + snippet = "def f(x):\n if x:\n return x + 1\n return 0" + content = "Here:\n```python\n" + snippet + "\n```" + out = optimize_messages([{"role": "assistant", "content": content}], mode="safe").messages[0]["content"] + assert snippet in out # fenced block untouched, including its interior spacing + + +def test_interior_spaces_still_collapse_in_prose(): + # The fix only protects leading indentation; prose double-spaces still collapse. + content = "this sentence has wide gaps and is long enough to process" + out = optimize_messages([{"role": "user", "content": content}], mode="safe").messages[0]["content"] + assert " " not in out diff --git a/tests/test_json_array_pack.py b/tests/test_json_array_pack.py new file mode 100644 index 0000000..b00ccdb --- /dev/null +++ b/tests/test_json_array_pack.py @@ -0,0 +1,110 @@ +"""Tests for columnar JSON-array packing (aggressive-mode transform). + +Packing rewrites homogeneous arrays-of-objects into a header + one value-array +per row. It must be information-lossless (deterministically reversible), must +never run in safe mode, and must skip arrays it cannot pack unambiguously. 
+""" +import json + +import pytest + +from nadirclaw.optimize import ( + _pack_array, + _pack_homogeneous_arrays, + _unpack_table, + optimize_messages, +) + + +def _roundtrip(arr): + packed = _pack_array(arr) + assert packed is not None + return _unpack_table(packed) + + +# --------------------------------------------------------------------------- +# Losslessness across value types +# --------------------------------------------------------------------------- + +def test_roundtrip_scalars(): + arr = [{"id": i, "name": f"u{i}", "active": bool(i % 2), "score": i / 3} for i in range(8)] + assert _roundtrip(arr) == arr + + +def test_roundtrip_nested_and_null_and_tricky_strings(): + arr = [ + {"id": 1, "meta": {"a": [1, 2], "b": None}, "note": 'has "quotes", commas, [brackets]'}, + {"id": 2, "meta": {"a": [], "b": 5}, "note": "tab\tand\nnewline"}, + {"id": 3, "meta": {"a": [9], "b": None}, "note": "unicode ✓ é 中"}, + {"id": 4, "meta": {"a": [1], "b": 0}, "note": ""}, + {"id": 5, "meta": {"a": [2, 3], "b": 1}, "note": "⟦cols= looks like a marker"}, + ] + assert _roundtrip(arr) == arr + + +def test_roundtrip_preserves_row_and_key_order(): + arr = [{"z": i, "a": i + 1, "m": i + 2} for i in range(6)] + out = _roundtrip(arr) + assert out == arr + assert list(out[0].keys()) == ["z", "a", "m"] + + +# --------------------------------------------------------------------------- +# Skip conditions (fall back to json_minify) +# --------------------------------------------------------------------------- + +def test_skip_too_few_rows(): + assert _pack_array([{"a": 1, "b": 2}] * 4) is None # < 5 rows + + +def test_skip_non_homogeneous_keys(): + assert _pack_array([{"a": 1, "b": 2}, {"a": 1}, {"c": 3}] * 3) is None + + +def test_skip_single_column(): + assert _pack_array([{"a": i} for i in range(10)]) is None + + +def test_skip_non_dict_elements(): + assert _pack_array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) is None + + +# 
--------------------------------------------------------------------------- +# Pipeline integration +# --------------------------------------------------------------------------- + +def _msgs(): + rows = [{"id": 1000 + i, "user": f"user{i}", "status": "active" if i % 3 else "inactive", + "plan": "pro" if i % 5 == 0 else "free"} for i in range(40)] + return [{"role": "user", "content": "list users"}, + {"role": "tool", "content": "result:\n" + json.dumps(rows, indent=2)}] + + +def test_aggressive_packs_and_saves(): + r = optimize_messages(_msgs(), mode="aggressive") + assert "json_array_pack" in r.optimizations_applied + assert r.tokens_saved > 0 + + +def test_safe_mode_never_packs(): + r = optimize_messages(_msgs(), mode="safe") + assert "json_array_pack" not in r.optimizations_applied + assert "⟦cols=" not in r.messages[1]["content"] + + +def test_only_packs_when_smaller(): + # A short homogeneous array whose table form isn't worth it stays as-is. + arr = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}, + {"a": 7, "b": 8}, {"a": 9, "b": 10}] + content = "data " + json.dumps(arr) + out, changed = _pack_homogeneous_arrays(content) + # tiny arrays may not beat minified form; if unchanged, output is untouched + if not changed: + assert out == content + + +def test_fenced_code_is_left_untouched(): + arr = [{"id": i, "v": i * 2} for i in range(10)] + content = "```json\n" + json.dumps(arr, indent=2) + "\n```" + out, changed = _pack_homogeneous_arrays(content) + assert not changed and out == content diff --git a/tests/test_optimize_backends.py b/tests/test_optimize_backends.py new file mode 100644 index 0000000..25fb444 --- /dev/null +++ b/tests/test_optimize_backends.py @@ -0,0 +1,113 @@ +"""Tests for the optimizer backend selection (native vs headroom). + +These cover the contract that matters for safety: +- ``off`` mode is a zero-cost no-op regardless of backend. 
+- The ``headroom`` backend transparently falls back to ``native`` when the + optional ``headroom-ai`` package is absent (the common case), producing + byte-identical output and never reporting headroom transforms. +- Backend selection resolves from arg → env → ``native`` default. +- The extension hooks used by Nadir Pro fire in the expected places. +""" + +import importlib + +import pytest + +from nadirclaw.optimize import ( + OptimizeResult, + _resolve_backend, + optimize_messages, +) + +HEADROOM_INSTALLED = importlib.util.find_spec("headroom") is not None + + +def _sample(): + return [ + {"role": "system", "content": "You are a helpful assistant. " * 3}, + {"role": "user", "content": 'Parse {"a": 1, "b": 2} with extra spaces.'}, + ] + + +# --------------------------------------------------------------------------- +# Backend resolution +# --------------------------------------------------------------------------- + +def test_resolve_backend_default_native(monkeypatch): + monkeypatch.delenv("NADIRCLAW_OPTIMIZE_BACKEND", raising=False) + assert _resolve_backend(None) == "native" + + +def test_resolve_backend_explicit_arg_wins(monkeypatch): + monkeypatch.setenv("NADIRCLAW_OPTIMIZE_BACKEND", "native") + assert _resolve_backend("headroom") == "headroom" + + +def test_resolve_backend_env(monkeypatch): + monkeypatch.setenv("NADIRCLAW_OPTIMIZE_BACKEND", "headroom") + assert _resolve_backend(None) == "headroom" + + +def test_resolve_backend_invalid_falls_back(monkeypatch): + monkeypatch.delenv("NADIRCLAW_OPTIMIZE_BACKEND", raising=False) + assert _resolve_backend("nonsense") == "native" + + +# --------------------------------------------------------------------------- +# off-mode short-circuit +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize("backend", ["native", "headroom"]) +def test_off_is_noop_for_any_backend(backend): + msgs = _sample() + result = optimize_messages(msgs, mode="off", backend=backend) + assert 
isinstance(result, OptimizeResult) + assert result.tokens_saved == 0 + assert result.mode == "off" + # off returns the original list object untouched (zero overhead) + assert result.messages is msgs + + +# --------------------------------------------------------------------------- +# headroom backend fallback (when headroom-ai is not installed) +# --------------------------------------------------------------------------- + +@pytest.mark.skipif(HEADROOM_INSTALLED, reason="headroom-ai installed; fallback path not exercised") +@pytest.mark.parametrize("mode", ["safe", "aggressive"]) +def test_headroom_falls_back_to_native_when_absent(mode): + msgs = _sample() + native = optimize_messages([{**m} for m in msgs], mode=mode, backend="native") + headroom = optimize_messages([{**m} for m in msgs], mode=mode, backend="headroom") + + # Output must be identical to native... + assert headroom.messages == native.messages + # ...and must never claim a headroom transform ran. + assert not any(t.startswith("headroom") for t in headroom.optimizations_applied) + + +# --------------------------------------------------------------------------- +# extension hooks (the mechanism Nadir Pro builds on) +# --------------------------------------------------------------------------- + +def test_extra_safe_content_hook_runs(): + def shout(content): + new = content.replace("hello", "HELLO") + return new, new != content + + msgs = [{"role": "user", "content": "hello there, this is a reasonably long message"}] + result = optimize_messages( + msgs, mode="safe", extra_safe_content=[("shout", shout)] + ) + assert "shout" in result.optimizations_applied + assert "HELLO" in result.messages[0]["content"] + + +def test_extra_aggressive_hooks_skipped_in_safe_mode(): + def boom(_content): + raise AssertionError("aggressive hook must not run in safe mode") + + msgs = [{"role": "user", "content": "a fairly long user message to exceed the length floor"}] + # Should not raise — aggressive hooks only fire in 
aggressive mode. + optimize_messages( + msgs, mode="safe", extra_aggressive_content=[("boom", boom)] + ) diff --git a/tests/test_progressive.py b/tests/test_progressive.py new file mode 100644 index 0000000..b172042 --- /dev/null +++ b/tests/test_progressive.py @@ -0,0 +1,79 @@ +"""Tests for progressive (staged) compression. + +Headroom stages require the optional ``headroom-ai`` package (Python <= 3.13). +These tests cover the escalation logic, early-stop, stage capping, lossy gating, +and graceful skip when Headroom is absent — all observable on the native stages. +The Headroom stages engaging is exercised manually in a 3.13 venv. +""" +import json + +import pytest + +from nadirclaw.optimize import compress_progressive + + +def _stages(result): + return [x.split(":", 1)[1] for x in result.optimizations_applied if x.startswith("stage:")] + + +def _big_msgs(): + rows = [{"id": 1000 + i, "user": f"user{i}", "status": "active" if i % 3 else "off", + "plan": "pro" if i % 5 == 0 else "free"} for i in range(60)] + return [ + {"role": "system", "content": "You are a helpful assistant. " * 6}, + {"role": "user", "content": "summarize the users"}, + {"role": "tool", "content": "get_users():\n" + json.dumps(rows, indent=2)}, + ] + + +def test_mode_is_progressive(): + r = compress_progressive(_big_msgs()) + assert r.mode == "progressive" + + +def test_no_target_stops_at_max_stage_native(): + # Without a budget, escalation stops after native_aggressive — never reaches Headroom. 
+ r = compress_progressive(_big_msgs()) + assert _stages(r) == ["native_safe", "native_aggressive"] + assert not any(s.startswith("headroom") for s in _stages(r)) + assert r.tokens_saved > 0 + + +def test_generous_target_early_stops_after_safe(): + msgs = _big_msgs() + from nadirclaw.optimize import _estimate_tokens_messages + orig = _estimate_tokens_messages(msgs) + r = compress_progressive(msgs, target_tokens=int(orig * 0.9), max_stage="headroom_ml", allow_lossy=True) + # Safe alone gets under 90% here, so it must stop immediately. + assert _stages(r) == ["native_safe"] + + +def test_max_stage_caps_ladder(): + # Cap at native_safe → aggressive never runs even though target is unmet. + r = compress_progressive(_big_msgs(), target_tokens=1, max_stage="native_safe") + assert _stages(r) == ["native_safe"] + + +def test_headroom_skipped_gracefully_when_absent(): + # Unmeetable target + headroom requested: on a host without headroom-ai the + # headroom stages are skipped (not recorded), output stays valid native. + import importlib.util + r = compress_progressive(_big_msgs(), target_tokens=1, max_stage="headroom_ml", allow_lossy=True) + if importlib.util.find_spec("headroom") is None: + assert _stages(r) == ["native_safe", "native_aggressive"] + # Either way the result is well-formed and never larger than the input. + assert r.optimized_tokens <= r.original_tokens + assert all("content" in m for m in r.messages) + + +def test_lossy_gated_off_by_default(): + # allow_lossy=False must drop headroom_ml from the ladder entirely. 
+ r = compress_progressive(_big_msgs(), target_tokens=1, max_stage="headroom_ml", allow_lossy=False) + assert "headroom_ml" not in _stages(r) + + +def test_already_small_is_noop(): + small = [{"role": "user", "content": "hi"}] + r = compress_progressive(small, target_tokens=10_000) + assert _stages(r) == [] # already under budget, nothing runs + assert r.tokens_saved == 0 diff --git a/tests/test_trained_verifier.py b/tests/test_trained_verifier.py index 606d0db..3c3c7da 100644 --- a/tests/test_trained_verifier.py +++ b/tests/test_trained_verifier.py @@ -148,6 +148,85 @@ def test_trained_verifier_interface_matches_heuristic(): assert {"score", "accepted", "threshold", "reasons", "verifier"} <= d.keys() +def test_trained_verifier_wraps_input_in_production_format(): + """The tokenizer must receive ``text_pair`` wrapped in the + ``CHEAP:\\n...\\n\\nEXPENSIVE:\\n...`` format the cross-encoder was + trained on. Without this wrapper, scores drift against the + calibrated tau=0.80 threshold. + + Production reference: + ``getnadir.dev/backend/app/services/verifier_model.py:195`` + """ + from nadirclaw.trained_verifier import TrainedVerifier + + captured: dict = {} + + class _FakeEncoding(dict): + def __init__(self): + super().__init__() + # Minimal tensor-like values so the .to(device) loop works. 
+ class _T: + def to(self, _device): + return self + + self["input_ids"] = _T() + self["attention_mask"] = _T() + + class _FakeTokenizer: + def __call__(self, prompt, text_pair, **kwargs): + captured["prompt"] = prompt + captured["text_pair"] = text_pair + captured["kwargs"] = kwargs + return _FakeEncoding() + + class _FakeLogits: + # Two-class head; softmax([0, 0]) => probs[..., 1] == 0.5 + shape = (1, 2) + + def __init__(self): + import torch + self._t = torch.tensor([[0.0, 0.0]]) + + def __getattr__(self, name): + return getattr(self._t, name) + + class _FakeModelOut: + def __init__(self): + import torch + self.logits = torch.tensor([[0.0, 0.0]]) + + class _FakeModel: + def __call__(self, **kwargs): + return _FakeModelOut() + + def eval(self): + return self + + def to(self, _device): + return self + + v = TrainedVerifier(threshold=0.8, device="cpu") + v._tokenizer = _FakeTokenizer() + v._model = _FakeModel() + v._resolved_device = "cpu" + + # Case 1: reference_answer provided. + out = v.score("What is 2+2?", "4", reference_answer="four") + assert captured["prompt"] == "What is 2+2?" + assert captured["text_pair"] == "CHEAP:\n4\n\nEXPENSIVE:\nfour" + assert 0.0 <= out.score <= 1.0 + + # Case 2: reference_answer=None -> empty EXPENSIVE: block. + captured.clear() + v.score("What is 2+2?", "4") + assert captured["text_pair"] == "CHEAP:\n4\n\nEXPENSIVE:\n" + + # Case 3: reference_answer is whitespace-only -> stripped to empty. + captured.clear() + v.score("What is 2+2?", "4", reference_answer=" \n ") + assert captured["text_pair"] == "CHEAP:\n4\n\nEXPENSIVE:\n" + + def test_trained_verifier_get_singleton_caches(): """The module-level singleton accessor should cache same-threshold calls and return fresh instances for mismatched thresholds. Construction