diff --git a/.github/workflows/benchmark-scale.yml b/.github/workflows/benchmark-scale.yml
new file mode 100644
index 00000000..56d3baaa
--- /dev/null
+++ b/.github/workflows/benchmark-scale.yml
@@ -0,0 +1,52 @@
+name: Routing-scale smoke benchmark
+
+# Non-gating, scheduled routing-scale benchmark for drift detection (issue #688,
+# child of #444). Runs the deterministic routing-scale profiler on a fixed seed
+# and stores its JSON as a per-run trend artifact, so scaling regressions are
+# visible over time without blocking any PR. This never gates: PR-time quality
+# gating lives in the main CI job (benchmarks/gating.yaml + benchmark_gate.py, #491).
+
+on:
+  schedule:
+    # Tuesday 06:30 UTC — off-hours, staggered from the weekly scorecard job.
+    - cron: "30 6 * * 2"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  routing-scale:
+    name: Routing-scale profile (non-gating)
+    runs-on: ubuntu-latest
+    # Hard cap: without it a hung profile would hold a runner for the 360-minute default.
+    timeout-minutes: 30
+    # The profiler is informational; a failure must never page anyone or block work.
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Today
+        id: date
+        run: echo "today=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+
+      - name: Install dependencies
+        run: pip install -e ".[dev]"
+
+      - name: Run routing-scale profile
+        # Reduced sizes keep the scheduled run well under the runner timeout;
+        # the local `make benchmark-routing-scale` default sweeps up to 10k.
+ run: python benchmarks/routing_scale.py --sizes 100,1000,5000 + + - name: Upload trend artifact + uses: actions/upload-artifact@v4 + with: + name: routing-scale-${{ steps.date.outputs.today }} + path: | + benchmarks/results/routing_scale.json + docs/benchmarks/routing-scale.md + retention-days: 90 + if-no-files-found: error diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0302d4f..b6463250 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -277,6 +277,49 @@ jobs: body-path: benchmarks/results/delta.md edit-mode: replace + benchmark-gate: + # Quality-regression gate (issue #491). Where benchmark-comment *describes* + # head-vs-base movement, this job *enforces* it: a PR that regresses a gated + # quality metric (recall@k / MRR / precision@k / token-savings) beyond its + # band in benchmarks/gating.yaml fails CI. Latency is never gated. The + # deterministic quality metrics are environment-independent, so head equals + # the committed base unless a code change moved them. + name: Benchmark quality gate + needs: test + runs-on: ubuntu-latest + timeout-minutes: 20 + if: ${{ github.event_name == 'pull_request' }} + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install dependencies + run: pip install -e ".[dev]" + - name: Generate head + base snapshots + # base = the committed baseline; head = this PR's numbers (matrix run so + # the per-backend cells are populated alongside the routing summary). + run: | + cp benchmarks/results/latest.json benchmarks/results/base.json + python benchmarks/benchmark.py --matrix --output benchmarks/results/head.json + - name: Gate on quality regressions + # The `benchmark-accepted` label downgrades a failure to a warning for + # intentional trade-offs; the rationale must be in the PR description. 
+ # NOTE: this label string is the GitHub-Actions mirror of `override_label` + # in benchmarks/gating.yaml. A `${{ }}` expression cannot read that file, + # so the two must be kept in sync by hand if the label is ever renamed. + run: | + OVERRIDE="" + if ${{ contains(github.event.pull_request.labels.*.name, 'benchmark-accepted') }}; then + OVERRIDE="--override" + fi + python scripts/benchmark_gate.py \ + --base benchmarks/results/base.json \ + --head benchmarks/results/head.json \ + --gating-config benchmarks/gating.yaml $OVERRIDE + docs-build: # Gate the docs build on PRs (issue #474). docs.yml only builds+deploys on # push to main, so a malformed docstring or broken nav could land on main diff --git a/AGENTS.md b/AGENTS.md index b9b72da5..fe6fdb6e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -218,6 +218,10 @@ make docs # mkdocs build --clean (docs site) make docs-serve # mkdocs serve (live preview) make benchmark # run benchmark harness (non-gating; writes benchmarks/results/latest.json) make benchmark-matrix # benchmark + per-backend × per-size matrix (#208) and per-namespace breakdown (#209) +make benchmark-large-catalog # 300+ tool routing benchmark + scorecard (#369); -check gates drift +make benchmark-scenario # naive all-tools vs ChoiceCard routing report (#418); -check gates drift +make trend # render benchmarks/trend.md from per-release history snapshots (#554) +make trend-check # verify benchmarks/trend.md is up to date (exits non-zero on drift) make gateway-scorecard-check # verify gateway scorecard matches its committed JSON (gating CI; #391) make record-demos-check # verify committed demo casts match current output (gating CI; #390) make smoke-eval # non-gating CI smoke-evaluation over fixed fixtures (#331/#392); deterministic, credential-free diff --git a/CHANGELOG.md b/CHANGELOG.md index 62871d08..79b3c899 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,44 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### 
Added +- **Benchmark-suite maturation: scaling, scenarios, CI gating, and trend + (#369, #418, #491, #554, #687, #688).** A coordinated pass on the benchmark + subsystem, all deterministic and offline: + - **Large-catalog benchmark (#369).** `make benchmark-large-catalog` + (`benchmarks/large_catalog.py`) routes over 300+ tools across 8 namespaces + with near-duplicate distractor variants and destructive (side-effecting) + tools, reporting recall@1/3/5, MRR, ChoiceCard-vs-naive prompt-token + reduction, and allow/deny filtering of destructive tools. Writes a + committed scorecard (`benchmarks/large_catalog_scorecard.md`, latency + excluded for determinism) plus `benchmarks/results/large_catalog.json`; + `--check` gates scorecard drift and `--strict` gates regression-guard + thresholds. + - **Scenario benchmark (#418).** `make benchmark-scenario` + (`benchmarks/scenario_routing.py`) contrasts naive all-tools prompting + against bounded `ChoiceCard` routing across tool-heavy scenarios + (`benchmarks/scenarios/routing_choicecard.json`), reporting + correct-in-top-k, rank, cards shown, and token reduction to a committed + report (`benchmarks/scenario_routing.md`). + - **Quality-regression gate (#491).** `scripts/benchmark_gate.py` + + `benchmarks/gating.yaml` turn the informational benchmark delta into a + gating CI check: a PR that regresses recall@k / MRR / precision@k / + token-savings beyond its tolerance band fails the new `benchmark-gate` CI + job. Latency is never gated; the `benchmark-accepted` PR label downgrades a + failure to a warning for intentional trade-offs. + - **Release trend (#554).** `scripts/render_trend.py` captures a + deterministic, latency-free metric snapshot per release under + `benchmarks/results/history/<release>.json` and renders the + release-over-release view to `benchmarks/trend.md` (`make trend` / + `make trend-check`). 
+ - **Scaling matrix docs (#687).** `docs/benchmarks/scaling-matrix.md` + documents the 10k-tool scaling methodology, reproducible commands, and + result interpretation, tying together the routing-scale, large-catalog, + and per-backend matrix benchmarks. + - **Scheduled routing-scale smoke (#688).** A non-gating + `.github/workflows/benchmark-scale.yml` runs the routing-scale profiler on + a weekly schedule and uploads its JSON + report as a per-run trend + artifact. + - **Multi-client MCP config-pack generator (#659).** Added `contextweaver mcp generate-configs` to render client recipe files (`copilot_mcp.json`, `cursor_mcp.json`, `claude_desktop_config.json`, diff --git a/Makefile b/Makefile index 97ec1420..c27da5cb 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: fmt lint type test example demo ci ci-full floor-deps tool-smoke docs docs-serve benchmark benchmark-matrix benchmark-routing-scale benchmark-gateway benchmark-primitives sidecar-smoke token-calibration smoke-eval e2e-quality scorecard scorecard-check sweep-scoring architectures llms llms-check weaver-conformance schemas schemas-check context-rot context-rot-check readme-version-check security-policy-check drift drift-check api api-check module-size-check module-size-update doc-snippets-check +.PHONY: fmt lint type test example demo ci ci-full floor-deps tool-smoke docs docs-serve benchmark benchmark-matrix benchmark-routing-scale benchmark-gateway benchmark-primitives benchmark-large-catalog benchmark-large-catalog-check benchmark-scenario benchmark-scenario-check trend trend-check sidecar-smoke token-calibration smoke-eval e2e-quality scorecard scorecard-check sweep-scoring architectures llms llms-check weaver-conformance schemas schemas-check context-rot context-rot-check readme-version-check security-policy-check drift drift-check api api-check module-size-check module-size-update doc-snippets-check # Interpreter and pip front-end (issue #712). 
Default to `python3`, which is what # many modern environments ship (some have no bare `python` on PATH at all). @@ -84,6 +84,34 @@ benchmark-matrix: benchmark-routing-scale: $(PYTHON) benchmarks/routing_scale.py +# Large-catalog routing benchmark (issue #369; non-gating). 300+ tools across 8 +# namespaces with near-duplicate distractors and destructive tools; writes +# benchmarks/large_catalog_scorecard.md + benchmarks/results/large_catalog.json. +# `-check` verifies the committed scorecard is in sync (deterministic accuracy). +benchmark-large-catalog: + $(PYTHON) benchmarks/large_catalog.py + +benchmark-large-catalog-check: + $(PYTHON) benchmarks/large_catalog.py --check + +# Scenario benchmark (issue #418; non-gating): naive all-tools prompt vs bounded +# ChoiceCard routing. Writes benchmarks/scenario_routing.md; `-check` gates drift. +benchmark-scenario: + $(PYTHON) benchmarks/scenario_routing.py + +benchmark-scenario-check: + $(PYTHON) benchmarks/scenario_routing.py --check + +# Release-over-release benchmark trend (issue #554). `trend` re-renders +# benchmarks/trend.md from benchmarks/results/history/*.json; `trend-check` gates +# drift. Capture a release snapshot with: +# python scripts/render_trend.py --snapshot --from benchmarks/results/latest.json +trend: + $(PYTHON) scripts/render_trend.py + +trend-check: + $(PYTHON) scripts/render_trend.py --check + benchmark-gateway: $(PYTHON) benchmarks/gateway_benchmark.py diff --git a/benchmarks/gating.yaml b/benchmarks/gating.yaml new file mode 100644 index 00000000..c7cc6eee --- /dev/null +++ b/benchmarks/gating.yaml @@ -0,0 +1,25 @@ +# Benchmark quality-regression gate configuration (issue #491). +# +# Turns the informational benchmark-delta PR comment (scripts/benchmark_delta.py) +# into a gating CI check: a PR that regresses a *quality* metric beyond its band +# (vs the committed benchmarks/results/latest.json baseline) fails CI. Latency +# stays informational — runner variance makes it unreliable as a gate. 
+# +# Band semantics (see scripts/benchmark_gate.py): +# - fraction metrics (recall_at_k / mrr / precision_at_k, range 0..1) regress +# when head < base - max_regression_pp / 100. +# - percent metrics (token_savings_pct, already 0..100) regress when +# head < base - max_regression_pp. +# +# The ~0.5pp noise floor at 200 gold queries (.github/prompts/add-eval.prompt.md) +# informs the 1.0pp quality band; token savings get a looser 2.0pp band. +quality: + recall_at_k: { max_regression_pp: 1.0 } + mrr: { max_regression_pp: 1.0 } + precision_at_k: { max_regression_pp: 1.0 } + token_savings_pct: { max_regression_pp: 2.0 } +latency: + gating: false +# A maintainer-applied PR label that downgrades a gate failure to a warning for +# intentional trade-offs (the rationale belongs in the PR description). +override_label: benchmark-accepted diff --git a/benchmarks/large_catalog.py b/benchmarks/large_catalog.py new file mode 100644 index 00000000..2bf47236 --- /dev/null +++ b/benchmarks/large_catalog.py @@ -0,0 +1,338 @@ +"""Large-catalog routing benchmark: 300+ tools across many namespaces (issue #369). + +The headline adoption case is a coding-agent setup with many MCP servers and +hundreds of tools. This deterministic, offline benchmark simulates that shape — +300+ tools across 8 namespaces, with near-duplicate *distractor* variants and +*destructive* (write/side-effecting) tools — and measures whether routing keeps +the right tool reachable while collapsing the prompt: + +- recall@1/3/5 and MRR for expected-tool selection (`tool_browse`); +- prompt-token reduction of bounded ``ChoiceCard``s vs the naive all-tools prompt; +- allow/deny filtering of destructive tools (none reach the shortlist when denied). + +It reuses the installed package only (no import from sibling benchmark scripts), +mirroring ``benchmarks/smoke_eval.py``. 
Accuracy and token figures use +``CharDivFourEstimator`` so they are environment-independent; only latency varies +with hardware and is reported to stdout / JSON, never to the committed scorecard. + +Usage:: + + python benchmarks/large_catalog.py # write JSON + scorecard + python benchmarks/large_catalog.py --check # exit non-zero on scorecard drift + python benchmarks/large_catalog.py --strict # exit non-zero if below thresholds + +Exit codes: 0 on success; 1 on drift (``--check``) or threshold breach (``--strict``). +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(_ROOT / "src")) + +from contextweaver.eval.dataset import EvalCase, EvalDataset # noqa: E402 +from contextweaver.eval.routing import evaluate_routing # noqa: E402 +from contextweaver.protocols import CharDivFourEstimator # noqa: E402 +from contextweaver.routing.cards import make_choice_cards, render_cards_text # noqa: E402 +from contextweaver.routing.catalog import ( # noqa: E402 + generate_sample_catalog, + load_catalog_dicts, +) +from contextweaver.routing.router import Router # noqa: E402 +from contextweaver.routing.tree import TreeBuilder # noqa: E402 +from contextweaver.types import SelectableItem # noqa: E402 + +DEFAULT_CATALOG_SIZE = 320 +DEFAULT_SEED = 42 +TOP_K = 5 +BEAM_WIDTH = 3 + +DEFAULT_JSON = _ROOT / "benchmarks" / "results" / "large_catalog.json" +DEFAULT_SCORECARD = _ROOT / "benchmarks" / "large_catalog_scorecard.md" + +# Warn/gate thresholds (issue #369 acceptance: "fails or warns when ... regress"). 
+# These are *regression guards*, set below the deterministic baseline (recall@5 +# ≈ 0.71 against the distractor-heavy catalog, token reduction ≈ 97%) with margin +# so a real quality drop trips the warning while the synthetic near-duplicate +# variants deliberately competing for rank do not. +RECALL_AT_5_FLOOR = 0.65 +TOKEN_REDUCTION_FLOOR_PCT = 80.0 + +_EST = CharDivFourEstimator() + + +def _count(text: str) -> int: + return _EST.estimate(text) + + +# --------------------------------------------------------------------------- +# Catalog construction +# --------------------------------------------------------------------------- + + +def build_large_catalog(n: int, seed: int) -> list[SelectableItem]: + """Return *n* deterministic tools, extending the 83-item pool with variants. + + Synthetic variants share their original's namespace and tags (preserving + routing signal density) but carry distinct IDs, so they act as near-duplicate + *distractors* without ever matching a gold query. Variants are always + non-destructive (``side_effects`` defaults to ``False`` and is not copied + from the original), so ``destructive_tools`` in the result reflects the base + 83-item pool only — the deny test exercises exactly that base set. 
+    """
+    base = load_catalog_dicts(generate_sample_catalog(n=83, seed=seed))
+    items: list[SelectableItem] = list(base)
+    version = 2
+    while len(items) < n:
+        for orig in list(base):
+            items.append(
+                SelectableItem(
+                    f"{orig.id}.v{version}",
+                    orig.kind,
+                    f"{orig.name}_v{version}",
+                    f"{orig.description} (variant {version})",
+                    tags=orig.tags,
+                    namespace=orig.namespace,
+                )
+            )
+            if len(items) >= n:
+                break
+        version += 1
+    return sorted(items, key=lambda i: i.id)[:n]
+
+
+def _is_variant(item: SelectableItem) -> bool:
+    """Return True when *item* carries the ``.v<N>`` suffix added by build_large_catalog.
+
+    Only a trailing ``.v<digits>`` counts. A plain ``".v" in item.id`` test would
+    also match a base tool whose id merely contains ``.v`` (for example a segment
+    starting with ``v``), silently moving it from the gold set to the distractors.
+    """
+    _, sep, suffix = item.id.rpartition(".v")
+    return bool(sep) and suffix.isdigit()
+
+
+def _gold_dataset(base_items: list[SelectableItem]) -> EvalDataset:
+    """Derive a deterministic gold set: each base tool's description -> its id."""
+    cases = [EvalCase(query=it.description, expected=[it.id]) for it in base_items]
+    return EvalDataset(cases=sorted(cases, key=lambda c: c.query))
+
+
+# ---------------------------------------------------------------------------
+# Measurement
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LargeCatalogResult:
+    """Deterministic + latency results of one large-catalog run."""
+
+    catalog_size: int
+    namespaces: int
+    distractor_tools: int
+    destructive_tools: int
+    queries: int
+    recall_at_1: float
+    recall_at_3: float
+    recall_at_5: float
+    mrr: float
+    mean_naive_tokens: int
+    mean_card_tokens: int
+    token_reduction_pct: float
+    destructive_in_shortlist_denied: int
+    latency_ms_p50: float
+    latency_ms_p99: float
+    # Seed the catalog was built with. Defaulted and placed last so existing
+    # constructor calls stay valid; to_json() reports it via ``__dict__``.
+    seed: int = DEFAULT_SEED
+
+
+def _percentile(values: list[float], pct: float) -> float:
+    """Return the element at the rounded *pct* position of sorted *values* (0.0 if empty)."""
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    idx = min(len(ordered) - 1, int(round((pct / 100.0) * (len(ordered) - 1))))
+    return ordered[idx]
+
+
+def run_benchmark(n: int = DEFAULT_CATALOG_SIZE, seed: int = DEFAULT_SEED) -> LargeCatalogResult:
+    """Run the large-catalog benchmark and return its results.
+
+    Args:
+        n: Number of tools in the generated catalog.
+        seed: Seed forwarded to the catalog generator; recorded on the result.
+
+    Returns:
+        Accuracy, token and deny-filter figures plus p50/p99 routing latency.
+    """
+    items = build_large_catalog(n, seed)
+    # Suffix match, not substring match: see _is_variant.
+    # NOTE(review): if any base id contains ".v", this changes the base/distractor
+    # split — regenerate the committed scorecard after applying.
+    base_items = [it for it in items if not _is_variant(it)]
+    distractors = len(items) - len(base_items)
+    destructive = [it for it in items if it.side_effects]
+    namespaces = {it.namespace for it in items if it.namespace}
+
+    router = Router(TreeBuilder().build(items), items=items, top_k=TOP_K, beam_width=BEAM_WIDTH)
+    catalog_ids = {it.id for it in items}
+    dataset = _gold_dataset(base_items)
+    report = evaluate_routing(router, dataset, catalog_ids=catalog_ids)
+
+    # Naive prompt = every tool's name + description. Stable across queries.
+    naive_text = "\n".join(f"{it.name}: {it.description}" for it in items)
+    naive_tokens = _count(naive_text)
+
+    card_token_samples: list[int] = []
+    latencies: list[float] = []
+    for case in dataset:
+        # Only the route() call is timed; card rendering is excluded from latency.
+        start = time.perf_counter()
+        result = router.route(case.query)
+        latencies.append((time.perf_counter() - start) * 1000.0)
+        cards = make_choice_cards(result.candidate_items)
+        card_token_samples.append(_count(render_cards_text(cards)))
+    mean_card = (
+        round(sum(card_token_samples) / len(card_token_samples)) if card_token_samples else 0
+    )
+    reduction = round((1 - mean_card / naive_tokens) * 100.0, 2) if naive_tokens else 0.0
+
+    # Allow/deny filtering: deny every destructive tool and confirm none survive.
+    deny_ids = {it.id for it in destructive}
+    leaked = 0
+    if deny_ids:
+        for case in dataset:
+            shortlist = set(router.route(case.query, exclude_ids=deny_ids).candidate_ids)
+            leaked += len(shortlist & deny_ids)
+
+    return LargeCatalogResult(
+        catalog_size=len(items),
+        namespaces=len(namespaces),
+        distractor_tools=distractors,
+        destructive_tools=len(destructive),
+        queries=report.queries_evaluated,
+        recall_at_1=round(report.top_1_recall, 4),
+        recall_at_3=round(report.top_3_recall, 4),
+        recall_at_5=round(report.top_5_recall, 4),
+        mrr=round(report.mrr, 4),
+        mean_naive_tokens=naive_tokens,
+        mean_card_tokens=mean_card,
+        token_reduction_pct=reduction,
+        destructive_in_shortlist_denied=leaked,
+        latency_ms_p50=round(_percentile(latencies, 50), 3),
+        latency_ms_p99=round(_percentile(latencies, 99), 3),
+        seed=seed,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Rendering (deterministic — no latency, no environment)
+# ---------------------------------------------------------------------------
+
+
+def to_json(result: LargeCatalogResult) -> dict[str, Any]:
+    """Full result payload, including latency (for the JSON artifact only).
+
+    ``seed`` comes from ``result.seed`` (via ``__dict__``) rather than the module
+    default, so a ``--seed`` override is recorded as the seed actually used.
+    """
+    return {
+        "benchmark": "large_catalog",
+        "k": TOP_K,
+        **result.__dict__,
+    }
+
+
+def render_scorecard(result: LargeCatalogResult) -> str:
+    """Render the deterministic, latency-free scorecard markdown."""
+    breaches = _threshold_breaches(result)
+    status = "✅ within thresholds" if not breaches else "⚠️ " + "; ".join(breaches)
+    return "\n".join(
+        [
+            "# contextweaver — Large-Catalog Routing Scorecard",
+            "",
+            "> Auto-generated by `make benchmark-large-catalog`. Do not edit by hand.",
+            "> Source: `benchmarks/large_catalog.py` (issue #369). 
Offline, deterministic.", + "", + f"- Catalog size: `{result.catalog_size}` tools across " + f"`{result.namespaces}` namespaces", + f"- Near-duplicate distractor tools: `{result.distractor_tools}`", + f"- Destructive (side-effecting) tools: `{result.destructive_tools}`", + f"- Gold queries: `{result.queries}`", + "- Token estimator: `CharDivFourEstimator` (no model dependency)", + "", + "## Routing accuracy", + "", + "| recall@1 | recall@3 | recall@5 | MRR |", + "|---:|---:|---:|---:|", + f"| {result.recall_at_1:.4f} | {result.recall_at_3:.4f} " + f"| {result.recall_at_5:.4f} | {result.mrr:.4f} |", + "", + "## Prompt-token reduction (ChoiceCards vs naive all-tools prompt)", + "", + "| naive tokens | mean card tokens | reduction |", + "|---:|---:|---:|", + f"| {result.mean_naive_tokens} | {result.mean_card_tokens} " + f"| {result.token_reduction_pct:.2f}% |", + "", + "## Destructive-tool filtering", + "", + f"- Destructive tools reaching the shortlist when denied: " + f"`{result.destructive_in_shortlist_denied}` (expected `0`).", + "", + "## Thresholds", + "", + f"- recall@5 floor: `{RECALL_AT_5_FLOOR:.2f}` · " + f"token-reduction floor: `{TOKEN_REDUCTION_FLOOR_PCT:.0f}%`", + f"- Status: {status}", + "", + "Latency is hardware-dependent and intentionally excluded from this", + "committed scorecard; see `benchmarks/results/large_catalog.json` for the", + "p50/p99 measured on the producing host.", + "", + ] + ) + + +def _threshold_breaches(result: LargeCatalogResult) -> list[str]: + breaches: list[str] = [] + if result.recall_at_5 < RECALL_AT_5_FLOOR: + breaches.append(f"recall@5 {result.recall_at_5:.4f} < {RECALL_AT_5_FLOOR:.2f}") + if result.token_reduction_pct < TOKEN_REDUCTION_FLOOR_PCT: + breaches.append( + f"token reduction {result.token_reduction_pct:.2f}% < {TOKEN_REDUCTION_FLOOR_PCT:.0f}%" + ) + if result.destructive_in_shortlist_denied: + breaches.append( + f"{result.destructive_in_shortlist_denied} denied destructive tool(s) leaked" + ) + return breaches + 
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point; returns the process exit code.
+
+    Default mode writes the JSON artifact and the scorecard. ``--check`` only
+    compares the rendered scorecard with the committed file and writes nothing.
+    ``--strict`` turns a threshold breach into exit code 1 in either mode, so
+    ``--check --strict`` no longer drops the threshold gate.
+
+    Args:
+        argv: Argument list; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        0 on success; 1 on scorecard drift (``--check``) or on a threshold
+        breach when ``--strict`` is given.
+    """
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0] if __doc__ else None)
+    parser.add_argument("--size", type=int, default=DEFAULT_CATALOG_SIZE)
+    parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
+    parser.add_argument("--check", action="store_true", help="Exit non-zero on scorecard drift.")
+    parser.add_argument("--strict", action="store_true", help="Exit non-zero if below thresholds.")
+    args = parser.parse_args(argv)
+
+    result = run_benchmark(args.size, args.seed)
+    scorecard = render_scorecard(result)
+
+    if args.check:
+        # A missing scorecard compares as "" and therefore reports drift.
+        current = (
+            DEFAULT_SCORECARD.read_text(encoding="utf-8") if DEFAULT_SCORECARD.exists() else ""
+        )
+        if current != scorecard:
+            print(
+                "large-catalog scorecard drift — run `make benchmark-large-catalog` and commit.",
+                file=sys.stderr,
+            )
+            return 1
+        print("large-catalog scorecard: up to date")
+    else:
+        DEFAULT_JSON.parent.mkdir(parents=True, exist_ok=True)
+        DEFAULT_JSON.write_text(
+            json.dumps(to_json(result), indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+            newline="\n",
+        )
+        DEFAULT_SCORECARD.write_text(scorecard, encoding="utf-8", newline="\n")
+        print(f"Wrote {DEFAULT_SCORECARD} and {DEFAULT_JSON}")
+        print(
+            f"recall@5={result.recall_at_5:.4f} reduction={result.token_reduction_pct:.2f}% "
+            f"p99={result.latency_ms_p99:.3f}ms"
+        )
+
+    # Threshold handling is shared by both modes: previously the --check branch
+    # returned before this point, so `--check --strict` never saw a breach.
+    breaches = _threshold_breaches(result)
+    if breaches:
+        print("WARNING: " + "; ".join(breaches), file=sys.stderr)
+        if args.strict:
+            return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/large_catalog_scorecard.md b/benchmarks/large_catalog_scorecard.md
new file mode 100644
index 00000000..e10d249f
--- /dev/null
+++ b/benchmarks/large_catalog_scorecard.md
@@ -0,0 +1,35 @@
+# contextweaver — 
Large-Catalog Routing Scorecard + +> Auto-generated by `make benchmark-large-catalog`. Do not edit by hand. +> Source: `benchmarks/large_catalog.py` (issue #369). Offline, deterministic. + +- Catalog size: `320` tools across `8` namespaces +- Near-duplicate distractor tools: `238` +- Destructive (side-effecting) tools: `31` +- Gold queries: `82` +- Token estimator: `CharDivFourEstimator` (no model dependency) + +## Routing accuracy + +| recall@1 | recall@3 | recall@5 | MRR | +|---:|---:|---:|---:| +| 0.7073 | 0.7073 | 0.7073 | 0.7073 | + +## Prompt-token reduction (ChoiceCards vs naive all-tools prompt) + +| naive tokens | mean card tokens | reduction | +|---:|---:|---:| +| 4368 | 113 | 97.41% | + +## Destructive-tool filtering + +- Destructive tools reaching the shortlist when denied: `0` (expected `0`). + +## Thresholds + +- recall@5 floor: `0.65` · token-reduction floor: `80%` +- Status: ✅ within thresholds + +Latency is hardware-dependent and intentionally excluded from this +committed scorecard; see `benchmarks/results/large_catalog.json` for the +p50/p99 measured on the producing host. 
diff --git a/benchmarks/results/history/0.16.0.json b/benchmarks/results/history/0.16.0.json new file mode 100644 index 00000000..865a0beb --- /dev/null +++ b/benchmarks/results/history/0.16.0.json @@ -0,0 +1,24 @@ +{ + "metrics": { + "mean_token_reduction_pct": 64.31, + "routing_mrr": { + "1000": 0.1456, + "50": 0.4978, + "83": 0.3242 + }, + "routing_precision_at_k": { + "1000": 0.031, + "50": 0.1191, + "83": 0.08 + }, + "routing_recall_at_k": { + "1000": 0.1475, + "50": 0.5649, + "83": 0.3825 + }, + "total_dedup_removed": 4, + "total_items_dropped": 7 + }, + "release": "0.16.0", + "schema_version": 1 +} diff --git a/benchmarks/scenario_routing.md b/benchmarks/scenario_routing.md new file mode 100644 index 00000000..e2e85681 --- /dev/null +++ b/benchmarks/scenario_routing.md @@ -0,0 +1,26 @@ +# contextweaver — Scenario Routing Benchmark + +> Auto-generated by `make benchmark-scenario`. Do not edit by hand. +> Source: `benchmarks/scenario_routing.py` (issue #418). Offline, deterministic. + +Naive all-tools prompting vs bounded `ChoiceCard` routing across tool-heavy +scenarios. Token counts use `CharDivFourEstimator` (no model dependency). + +- Scenarios: `6` · correct tool in top-5: `4/6` · mean token reduction: `96.54%` + +| scenario | catalog | correct@top-k | rank | cards | naive tokens | card tokens | reduction | +|---|---:|:---:|---:|---:|---:|---:|---:| +| draft_email | 300 | ✅ | 1 | 5 | 4083 | 111 | 97.28% | +| find_contact | 300 | ✅ | 1 | 5 | 4083 | 111 | 97.28% | +| find_unpaid_invoices | 200 | ❌ | — | 5 | 2618 | 110 | 95.80% | +| refund_a_payment | 200 | ❌ | — | 5 | 2618 | 108 | 95.87% | +| revenue_report | 300 | ✅ | 1 | 5 | 4083 | 117 | 97.13% | +| send_slack_update | 200 | ✅ | 1 | 5 | 2618 | 108 | 95.87% | + +Reading the table: + +- `correct@top-k` is whether the expected tool survived into the bounded + shortlist — the property naive prompting trivially satisfies (every tool + is present) but at the token cost in the `naive tokens` column. 
+- `reduction` is how much smaller the ChoiceCard prompt is than listing + every tool's name + description — the headline routing benefit at scale. diff --git a/benchmarks/scenario_routing.py b/benchmarks/scenario_routing.py new file mode 100644 index 00000000..a8c63c62 --- /dev/null +++ b/benchmarks/scenario_routing.py @@ -0,0 +1,200 @@ +"""Scenario benchmark: naive all-tools prompt vs bounded ChoiceCard routing (#418). + +A scenario-style benchmark that makes contextweaver's routing value concrete: +for each tool-heavy task it contrasts the two prompt-construction strategies a +tool-using agent can pick from — + +1. **naive** — expose *every* tool's name + description to the model; +2. **contextweaver** — route the query and expose only the bounded ``ChoiceCard`` + shortlist. + +For each scenario it reports whether the expected tool stays reachable +(correct-in-top-k + its rank), how many cards are shown, and the prompt-token +cost of each strategy. Deterministic and offline: catalogs are seeded and token +counts use ``CharDivFourEstimator``, so the report is environment-independent. + +It does not depend on LangWatch (the inspiration) or any hosted workspace, and +reuses only the installed package, mirroring ``benchmarks/smoke_eval.py``. + +Usage:: + + python benchmarks/scenario_routing.py # write the markdown report + python benchmarks/scenario_routing.py --check # exit non-zero on drift + +Exit codes: 0 on success; 1 on report drift (``--check``). 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from dataclasses import dataclass +from pathlib import Path + +_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(_ROOT / "src")) + +from contextweaver.protocols import CharDivFourEstimator # noqa: E402 +from contextweaver.routing.cards import make_choice_cards, render_cards_text # noqa: E402 +from contextweaver.routing.catalog import ( # noqa: E402 + generate_sample_catalog, + load_catalog_dicts, +) +from contextweaver.routing.router import Router # noqa: E402 +from contextweaver.routing.tree import TreeBuilder # noqa: E402 +from contextweaver.types import SelectableItem # noqa: E402 + +DEFAULT_DATASET = _ROOT / "benchmarks" / "scenarios" / "routing_choicecard.json" +DEFAULT_OUTPUT = _ROOT / "benchmarks" / "scenario_routing.md" +SEED = 42 +TOP_K = 5 +BEAM_WIDTH = 3 + +_EST = CharDivFourEstimator() + + +def _count(text: str) -> int: + return _EST.estimate(text) + + +def _make_catalog(n: int, seed: int = SEED) -> list[SelectableItem]: + """Deterministic catalog of *n* tools, extending the 83-item pool with variants.""" + base = load_catalog_dicts(generate_sample_catalog(n=83, seed=seed)) + items: list[SelectableItem] = list(base) + version = 2 + while len(items) < n: + for orig in list(base): + items.append( + SelectableItem( + f"{orig.id}.v{version}", + orig.kind, + f"{orig.name}_v{version}", + f"{orig.description} (variant {version})", + tags=orig.tags, + namespace=orig.namespace, + ) + ) + if len(items) >= n: + break + version += 1 + return sorted(items, key=lambda i: i.id)[:n] + + +@dataclass +class ScenarioRow: + """One scenario's naive-vs-ChoiceCard comparison.""" + + name: str + catalog_size: int + correct_in_top_k: bool + correct_rank: int # 1-based; 0 = not in shortlist + cards_shown: int + naive_tokens: int + card_tokens: int + token_reduction_pct: float + + +def run_scenario(scenario: dict[str, object]) -> ScenarioRow: + """Route one scenario and 
return its comparison row.""" + size = int(scenario["catalog_size"]) # type: ignore[arg-type] + query = str(scenario["query"]) + expected = set(scenario.get("expected", [])) # type: ignore[arg-type] + items = _make_catalog(size) + router = Router(TreeBuilder().build(items), items=items, top_k=TOP_K, beam_width=BEAM_WIDTH) + result = router.route(query) + + candidate_ids = list(result.candidate_ids) + rank = next((i + 1 for i, cid in enumerate(candidate_ids) if cid in expected), 0) + cards = make_choice_cards(result.candidate_items) + naive_tokens = _count("\n".join(f"{it.name}: {it.description}" for it in items)) + card_tokens = _count(render_cards_text(cards)) + reduction = round((1 - card_tokens / naive_tokens) * 100.0, 2) if naive_tokens else 0.0 + return ScenarioRow( + name=str(scenario["name"]), + catalog_size=len(items), + correct_in_top_k=rank > 0, + correct_rank=rank, + cards_shown=len(cards), + naive_tokens=naive_tokens, + card_tokens=card_tokens, + token_reduction_pct=reduction, + ) + + +def run_all(dataset_path: Path = DEFAULT_DATASET) -> list[ScenarioRow]: + """Run every scenario in *dataset_path*, ordered by scenario name.""" + scenarios = json.loads(dataset_path.read_text(encoding="utf-8")) + rows = [run_scenario(s) for s in scenarios] + return sorted(rows, key=lambda r: r.name) + + +def render_report(rows: list[ScenarioRow]) -> str: + """Render the deterministic scenario comparison report.""" + hits = sum(1 for r in rows if r.correct_in_top_k) + mean_reduction = round(sum(r.token_reduction_pct for r in rows) / len(rows), 2) if rows else 0.0 + lines = [ + "# contextweaver — Scenario Routing Benchmark", + "", + "> Auto-generated by `make benchmark-scenario`. Do not edit by hand.", + "> Source: `benchmarks/scenario_routing.py` (issue #418). Offline, deterministic.", + "", + "Naive all-tools prompting vs bounded `ChoiceCard` routing across tool-heavy", + "scenarios. 
def main(argv: list[str] | None = None) -> int:
    """Render the scenario report, then write it or check it for drift.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after writing the report, or when ``--check`` finds the file at
        ``--output`` identical to the freshly rendered report; ``1`` when
        ``--check`` finds a difference (a missing file counts as different).
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0] if __doc__ else None)
    parser.add_argument("--dataset", default=str(DEFAULT_DATASET))
    parser.add_argument("--output", default=str(DEFAULT_OUTPUT))
    parser.add_argument("--check", action="store_true", help="Exit non-zero on report drift.")
    args = parser.parse_args(argv)

    report = render_report(run_all(Path(args.dataset)))
    output = Path(args.output)
    if args.check:
        # Check mode never writes: compare against what is on disk.
        current = output.read_text(encoding="utf-8") if output.exists() else ""
        if current != report:
            print(
                "scenario report drift — run `make benchmark-scenario` and commit.",
                file=sys.stderr,
            )
            return 1
        print("scenario report: up to date")
        return 0
    # A custom --output may point into a directory that does not exist yet;
    # write_text() does not create parents on its own.
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(report, encoding="utf-8", newline="\n")
    print(f"Wrote {output}")
    return 0
+ +## Routing recall@k by catalog size + +| release | size=50 | size=83 | size=1000 | +|---|---:|---:|---:| +| `0.16.0` | 0.5649 | 0.3825 | 0.1475 | + +## Routing MRR by catalog size + +| release | size=50 | size=83 | size=1000 | +|---|---:|---:|---:| +| `0.16.0` | 0.4978 | 0.3242 | 0.1456 | + +## Routing precision@k by catalog size + +| release | size=50 | size=83 | size=1000 | +|---|---:|---:|---:| +| `0.16.0` | 0.1191 | 0.0800 | 0.0310 | + +## Context pipeline quality + +| release | mean token reduction | items dropped | dedup removed | +|---|---:|---:|---:| +| `0.16.0` | 64.31% | 7 | 4 | + +--- + +## Capturing a release snapshot + +```bash +make benchmark # refresh benchmarks/results/latest.json +python scripts/render_trend.py --snapshot \ + --from benchmarks/results/latest.json +make trend # re-render benchmarks/trend.md +``` diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 83d8655f..5de7a573 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -24,6 +24,24 @@ git diff --quiet benchmarks/scorecard.md # passes on clean re-run with same se The check `git diff --quiet benchmarks/scorecard.md` is the determinism gate: identical inputs must produce byte-identical scorecard output. 
+## Scaling and trend + +Beyond the headline scorecard, three companion benchmarks and a release trend +page track behaviour at scale and over time: + +| What | Command | Output | +|---|---|---| +| Latency to 10k tools + cache speedup | `make benchmark-routing-scale` | [`scaling matrix`](benchmarks/scaling-matrix.md) | +| Recall + token reduction at 300+ tools | `make benchmark-large-catalog` | `benchmarks/large_catalog_scorecard.md` | +| Naive all-tools vs ChoiceCard routing | `make benchmark-scenario` | `benchmarks/scenario_routing.md` | +| Release-over-release trend | `make trend` | [`benchmarks/trend.md`](https://github.com/dgenio/contextweaver/blob/main/benchmarks/trend.md) | + +PR-time regressions are caught by the gating check (`benchmarks/gating.yaml` + +`scripts/benchmark_gate.py`): a PR that drops a gated quality metric beyond its +tolerance band fails CI unless the `benchmark-accepted` label is applied with a +rationale. See the [scaling matrix](benchmarks/scaling-matrix.md) page for the +full methodology. + ## What is measured **Routing.** Precision\@k, recall\@k, MRR, and p50/p95/p99 latency at catalog diff --git a/docs/benchmarks/scaling-matrix.md b/docs/benchmarks/scaling-matrix.md new file mode 100644 index 00000000..c619cf16 --- /dev/null +++ b/docs/benchmarks/scaling-matrix.md @@ -0,0 +1,65 @@ +# Scaling benchmark matrix + +How contextweaver's routing behaves as the tool catalog grows — the +methodology, the reproducible commands, and how to read the numbers (issue +#687). This page ties together three deterministic, offline benchmarks that +each measure a different slice of "does this still work at scale?" + +| Benchmark | Question it answers | Command | Output | +|---|---|---|---| +| Routing-scale profile | How does build/route **latency** scale to 10k tools, and how much does the persistent cache save?
| `make benchmark-routing-scale` | [`routing-scale.md`](routing-scale.md) · `benchmarks/results/routing_scale.json` | +| Large-catalog quality | At 300+ tools across many namespaces, does routing keep the right tool **reachable** while collapsing the prompt? | `make benchmark-large-catalog` | `benchmarks/large_catalog_scorecard.md` · `benchmarks/results/large_catalog.json` | +| Per-backend matrix | How do `tfidf` / `bm25` / embedding backends compare across catalog sizes? | `make benchmark-matrix` | `benchmarks/scorecard.md` (matrix section) | + +## Methodology + +- **Deterministic and offline.** Catalogs are generated from a seeded pool + (`generate_sample_catalog`) and extended with near-duplicate variants for + larger sizes. No network and no model calls; token counts use the + `CharDivFourEstimator` so accuracy and token figures are + environment-independent. +- **Latency is host-dependent.** Treat latency columns as *ordering*, not + absolutes — the relative cost between catalog sizes is portable, the + absolute millisecond count is not. Quality metrics (recall@k, MRR, token + reduction) are environment-independent and should be byte-identical on a + clean re-run. +- **Scale points.** The routing-scale profile sweeps `100 → 1000 → 5000 → + 10000` tools. The large-catalog quality benchmark runs at 320 tools across + 8 namespaces with ~240 near-duplicate distractor tools and ~30 destructive + (side-effecting) tools. + +## Reproducing the full matrix + +```bash +make benchmark-routing-scale # latency + cache speedup up to 10k tools +make benchmark-large-catalog # recall/MRR + token reduction at 300+ tools +make benchmark-matrix # per-backend × per-size accuracy matrix +``` + +Each command writes a committed Markdown scorecard plus a machine-readable +JSON artifact under `benchmarks/results/`. 
+ +## Interpreting the results + +- **Cold start dominates at scale.** In the routing-scale profile, graph + construction (`TreeBuilder.build`) grows super-linearly and dominates cold + start. Deployments that recreate a router per request over the same catalog + should persist the graph and fitted index (`save_graph`/`load_graph` + + `RoutingIndexCache`); the `cold speedup` column quantifies the win. +- **Recall degrades predictably with catalog size.** As distractors multiply, + near-duplicate tools compete with the true match. The large-catalog + scorecard reports recall@1/3/5 against this pressure; a drop below the + scorecard's threshold floor is flagged as a regression. +- **Token reduction is the headline benefit.** Bounded `ChoiceCard`s shrink + the routing prompt by ~95–97% versus listing every tool's name + description + (the naive baseline these benchmarks measure; full JSON schemas would make + the gap larger still) — and the gap widens as the catalog grows, which is + exactly when naive all-tools prompting becomes untenable. + +## Trend over releases + +Per-release snapshots of the deterministic metrics are captured under +`benchmarks/results/history/` and rendered to +[`benchmarks/trend.md`](https://github.com/dgenio/contextweaver/blob/main/benchmarks/trend.md) +(`make trend`), so scaling regressions that creep in across releases stay +visible. 
diff --git a/mkdocs.yml b/mkdocs.yml index 48a76faf..b038198f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -115,7 +115,10 @@ nav: - Persistent & Remote Stores: persistent_stores.md - Puppetmaster: integration_puppetmaster.md - Architecture: architecture.md - - Benchmarks: benchmarks.md + - Benchmarks: + - Overview: benchmarks.md + - Scaling matrix: benchmarks/scaling-matrix.md + - Routing-scale profile: benchmarks/routing-scale.md - Adopter Benchmark Report: benchmark_report.md - Token Calibration: token_calibration.md - Context Rot Demo: context_rot.md diff --git a/scripts/benchmark_gate.py b/scripts/benchmark_gate.py new file mode 100644 index 00000000..f9d5c5bb --- /dev/null +++ b/scripts/benchmark_gate.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +"""Gate benchmark quality regressions with tolerance bands (issue #491). + +Companion to ``scripts/benchmark_delta.py`` (the informational sticky PR +comment). Where the delta script *describes* head-vs-base movement, this +script *enforces* it: a PR that regresses a gated quality metric beyond its +band — recall@k, MRR, precision@k, token-savings — exits non-zero so CI can +block the merge. Latency cells are never gated (runner variance). + +The gate compares a head ``latest.json`` against the committed base +``latest.json`` cell-by-cell, keyed by identity (catalog size, backend×size, +or scenario), so a regression in any one cell is surfaced with its location. + +The script is **stdlib-only on the hot path**; the YAML config is parsed with a +lazy ``yaml`` import (a core dependency) only when ``--gating-config`` is read, +so the import is paid in CI after ``pip install -e .`` runs. 
+ +Usage:: + + python scripts/benchmark_gate.py --base base.json --head head.json + python scripts/benchmark_gate.py --base base.json --head head.json \\ + --gating-config benchmarks/gating.yaml + python scripts/benchmark_gate.py --base base.json --head head.json --override + +Exit codes: ``0`` when every gated cell is within band (or ``--override`` is +set), ``1`` when any gated cell regresses beyond its band, ``2`` on bad input. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections.abc import Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parent.parent +DEFAULT_GATING_CONFIG = REPO_ROOT / "benchmarks" / "gating.yaml" + +# Fraction metrics live on a 0..1 scale, so a band expressed in percentage +# points (pp) is applied as ``band / 100``. Percent metrics are already 0..100 +# and the band is applied directly. Anything not listed here is informational. +_FRACTION_METRICS = ("recall_at_k", "mrr", "precision_at_k") +_PERCENT_METRICS = ("token_savings_pct",) + +# Mirrors benchmarks/gating.yaml so the gate has a safe default when no config +# file is present (e.g. a partial checkout). Kept in sync with that file. 
def load_gating_config(path: Path | None) -> GatingConfig:
    """Load per-metric bands and the override label from the YAML file at *path*.

    Falls back to :data:`DEFAULT_BANDS` when *path* is ``None``, does not
    exist, does not parse to a mapping, or has no ``quality`` key. Otherwise
    the ``quality`` block is authoritative, even when it resolves to no bands
    at all (the gate then deliberately gates nothing). A metric is gated only
    when its entry is a mapping that

    * is not switched off with ``gating: false``, and
    * carries a ``max_regression_pp`` that is a real number ``>= 0``
      (``0`` means "no regression tolerated").

    Args:
        path: Location of ``gating.yaml``, or ``None`` to use the defaults.

    Returns:
        The resolved :class:`GatingConfig`.
    """
    if path is None or not path.exists():
        return GatingConfig(bands=dict(DEFAULT_BANDS))
    import yaml  # lazy: keeps the import off the no-config path

    raw = yaml.safe_load(path.read_text(encoding="utf-8"))
    if not isinstance(raw, dict):
        return GatingConfig(bands=dict(DEFAULT_BANDS))
    override = str(raw.get("override_label", "benchmark-accepted"))
    if "quality" not in raw:
        return GatingConfig(bands=dict(DEFAULT_BANDS), override_label=override)
    bands: dict[str, float] = {}
    for metric, spec in (raw.get("quality") or {}).items():
        if not isinstance(spec, dict):
            continue
        # An explicit off-switch wins even if a band is still listed next to it.
        if spec.get("gating") is False:
            continue
        band = spec.get("max_regression_pp")
        # bool subclasses int, so `true` would otherwise be read as a 1.0pp band.
        if isinstance(band, bool) or not isinstance(band, (int, float)):
            continue
        if band >= 0:
            bands[str(metric)] = float(band)
    return GatingConfig(bands=bands, override_label=override)
float(row[metric]) + return cells + + +def _token_savings_cells(payload: dict[str, Any]) -> dict[tuple[str, str], float]: + """Token-savings cells from each context row's ``naive_delta`` block.""" + cells: dict[tuple[str, str], float] = {} + for row in payload.get("context", []): + if not isinstance(row, dict): + continue + nd = row.get("naive_delta") + if isinstance(nd, dict) and "pct_reduction" in nd: + scenario = str(row.get("scenario", "")) + cells[("token_savings_pct", f"context/{scenario}")] = float(nd["pct_reduction"]) + return cells + + +def _all_cells(payload: dict[str, Any]) -> dict[tuple[str, str], float]: + cells: dict[tuple[str, str], float] = {} + cells.update(_routing_cells(payload)) + cells.update(_matrix_cells(payload)) + cells.update(_token_savings_cells(payload)) + return cells + + +def _regression_pp(metric: str, base: float, head: float) -> float: + """Return the regression in percentage points (positive = got worse).""" + drop = base - head + if metric in _PERCENT_METRICS: + return drop # already on a 0..100 scale + return drop * 100.0 # fraction (0..1) -> pp + + +# --------------------------------------------------------------------------- +# Gate +# --------------------------------------------------------------------------- + + +def evaluate_gate( + base: dict[str, Any], head: dict[str, Any], config: GatingConfig +) -> list[GateViolation]: + """Return every gated cell in *head* that regressed beyond its band vs *base*. + + Cells absent from *base* (new cells) cannot regress and are skipped. Cells + present in *base* but absent from *head* are also skipped — a removed cell is + a structural change, not a quality regression, and is surfaced by the delta + comment, not the gate. Results are sorted for deterministic output. 
+ """ + base_cells = _all_cells(base) + head_cells = _all_cells(head) + violations: list[GateViolation] = [] + for key, head_value in head_cells.items(): + metric, cell = key + band = config.bands.get(metric) + if band is None or key not in base_cells: + continue + base_value = base_cells[key] + regression = _regression_pp(metric, base_value, head_value) + if regression > band: + violations.append( + GateViolation( + metric=metric, + cell=cell, + base=base_value, + head=head_value, + regression_pp=regression, + band_pp=band, + ) + ) + return sorted(violations, key=lambda v: (v.metric, v.cell)) + + +def render_report( + violations: list[GateViolation], + *, + overridden: bool, + override_label: str = "benchmark-accepted", +) -> str: + """Render a deterministic plain-text gate report. + + *override_label* names the configured downgrade label (from the gating + config) so the report points at the exact label CI checks for. + """ + if not violations: + return "benchmark gate: PASS — all gated quality metrics within band." + lines = [f"benchmark gate: {len(violations)} metric(s) regressed beyond band:"] + lines.extend(f" - {v.describe()}" for v in violations) + lines.append("") + if overridden: + lines.append( + f"Override label '{override_label}' present — downgrading failure to a warning." + ) + else: + lines.append( + f"Fix the regression, or apply the '{override_label}' label with a rationale in " + "the PR description to accept it." 
def main(argv: Sequence[str] | None = None) -> int:
    """Run the gate from the command line and return its exit code.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when no gated cell regressed beyond its band or ``--override``
        is set, ``1`` when at least one did, and ``2`` when either snapshot
        cannot be read, is not valid JSON, or is not a JSON object.
    """
    args = _parse_args(argv)
    try:
        base = json.loads(Path(args.base).read_text(encoding="utf-8"))
        head = json.loads(Path(args.head).read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        print(f"error: malformed JSON: {exc}", file=sys.stderr)
        return 2
    except (OSError, UnicodeDecodeError) as exc:
        # OSError covers a missing file as well as a directory or an
        # unreadable path, so none of them is mistaken for a gate failure.
        print(f"error: {exc}", file=sys.stderr)
        return 2
    for label, payload in (("base", base), ("head", head)):
        # The cell extractors call .get() on the payload; reject anything
        # that is not a mapping before they run.
        if not isinstance(payload, dict):
            print(
                f"error: {label} snapshot must be a JSON object, "
                f"got {type(payload).__name__}",
                file=sys.stderr,
            )
            return 2

    config = load_gating_config(Path(args.gating_config) if args.gating_config else None)
    violations = evaluate_gate(base, head, config)
    print(render_report(violations, overridden=args.override, override_label=config.override_label))
    if violations and not args.override:
        return 1
    return 0
+(``benchmarks/results/latest.json`` + ``scorecard.md``). A regression that +creeps in over several releases is invisible in any single snapshot. This +script keeps a small, deterministic-only metric snapshot per release under +``benchmarks/results/history/.json`` and renders the longitudinal +view to ``benchmarks/trend.md`` so quality trajectories stay visible. + +Latency is deliberately excluded from snapshots — it is environment-dependent +and not comparable across release machines. This page is *visibility*, not a +gate; PR-time gating with tolerance bands is owned by ``benchmark_gate.py`` (#491). + +The script is stdlib-only (no contextweaver import) so it can run before the +package is installed, matching ``scripts/render_scorecard.py``. + +Usage:: + + # Capture a release snapshot from the current latest.json: + python scripts/render_trend.py --snapshot 0.16.0 --from benchmarks/results/latest.json + python scripts/render_trend.py # render benchmarks/trend.md + python scripts/render_trend.py --check # exit non-zero on drift +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections.abc import Sequence +from pathlib import Path +from typing import Any + +from _golden import check_text_artifacts, write_text_artifacts + +REPO_ROOT = Path(__file__).resolve().parent.parent +DEFAULT_HISTORY_DIR = REPO_ROOT / "benchmarks" / "results" / "history" +DEFAULT_OUTPUT = REPO_ROOT / "benchmarks" / "trend.md" +DEFAULT_LATEST = REPO_ROOT / "benchmarks" / "results" / "latest.json" + +SNAPSHOT_SCHEMA_VERSION = 1 + + +# --------------------------------------------------------------------------- +# Snapshot extraction (latest.json -> deterministic metric subset) +# --------------------------------------------------------------------------- + + +def extract_snapshot(release: str, latest: dict[str, Any]) -> dict[str, Any]: + """Return the deterministic-only metric subset for one release. + + Latency fields are intentionally omitted. 
The shape carries a + ``schema_version`` so future metric changes do not orphan old entries. + """ + recall: dict[str, float] = {} + mrr: dict[str, float] = {} + precision: dict[str, float] = {} + for row in latest.get("routing", []): + if not isinstance(row, dict): + continue + size = str(int(row.get("catalog_size", 0))) + recall[size] = round(float(row.get("recall_at_k", 0.0)), 4) + mrr[size] = round(float(row.get("mrr", 0.0)), 4) + precision[size] = round(float(row.get("precision_at_k", 0.0)), 4) + + reductions: list[float] = [] + dropped = 0 + dedup = 0 + for row in latest.get("context", []): + if not isinstance(row, dict): + continue + dropped += int(row.get("items_dropped", 0)) + dedup += int(row.get("dedup_removed", 0)) + nd = row.get("naive_delta") + if isinstance(nd, dict) and "pct_reduction" in nd: + reductions.append(float(nd["pct_reduction"])) + mean_reduction = round(sum(reductions) / len(reductions), 2) if reductions else 0.0 + + return { + "schema_version": SNAPSHOT_SCHEMA_VERSION, + "release": release, + "metrics": { + "routing_recall_at_k": recall, + "routing_mrr": mrr, + "routing_precision_at_k": precision, + "mean_token_reduction_pct": mean_reduction, + "total_items_dropped": dropped, + "total_dedup_removed": dedup, + }, + } + + +def write_snapshot(snapshot: dict[str, Any], history_dir: Path) -> Path: + """Write *snapshot* deterministically to ``/.json``.""" + history_dir.mkdir(parents=True, exist_ok=True) + out = history_dir / f"{snapshot['release']}.json" + text = json.dumps(snapshot, indent=2, sort_keys=True) + "\n" + out.write_text(text, encoding="utf-8", newline="\n") + return out + + +# --------------------------------------------------------------------------- +# History loading + ordering +# --------------------------------------------------------------------------- + + +def _version_key(release: str) -> tuple[Any, ...]: + """Best-effort semantic ordering: numeric tuple, with a string fallback.""" + parts: list[Any] = [] + for chunk in 
def render(snapshots: list[dict[str, Any]]) -> str:
    """Return the deterministic trend markdown for *snapshots*.

    With no snapshots the page is just the header plus a hint on how to
    capture the first one. Otherwise it lists the first and last release of
    *snapshots* as given, the three per-size routing tables, the context
    table, and the capture instructions.

    The capture commands carry a literal ``<version>`` placeholder because
    ``--snapshot`` takes the release version as its value; without one,
    argparse rejects ``--snapshot --from ...``.
    """
    parts = [
        "# contextweaver — Benchmark Trend",
        "",
        "> Auto-generated by `make trend`. Do not edit by hand.",
        "> Source: `benchmarks/results/history/*.json` (one snapshot per release).",
        "",
        "Release-over-release view of the deterministic benchmark metrics. Latency",
        "is excluded — it is environment-dependent and not comparable across release",
        "machines. This page is visibility only; PR-time regression gating lives in",
        "`benchmarks/gating.yaml` + `scripts/benchmark_gate.py` (#491).",
        "",
    ]
    if not snapshots:
        parts.extend(
            [
                "_No release snapshots recorded yet. Capture one with_",
                "`python scripts/render_trend.py --snapshot <version> "
                "--from benchmarks/results/latest.json`.",
                "",
            ]
        )
        return "\n".join(parts)

    parts.extend(
        [
            f"Releases recorded: {len(snapshots)} "
            f"(`{snapshots[0].get('release', '?')}` … `{snapshots[-1].get('release', '?')}`).",
            "",
            "## Routing recall@k by catalog size",
            "",
            _per_size_table(snapshots, "routing_recall_at_k"),
            "",
            "## Routing MRR by catalog size",
            "",
            _per_size_table(snapshots, "routing_mrr"),
            "",
            "## Routing precision@k by catalog size",
            "",
            _per_size_table(snapshots, "routing_precision_at_k"),
            "",
            "## Context pipeline quality",
            "",
            _context_table(snapshots),
            "",
            "---",
            "",
            "## Capturing a release snapshot",
            "",
            "```bash",
            "make benchmark # refresh benchmarks/results/latest.json",
            "python scripts/render_trend.py --snapshot <version> \\",
            " --from benchmarks/results/latest.json",
            "make trend # re-render benchmarks/trend.md",
            "```",
            "",
        ]
    )
    return "\n".join(parts)
def main(argv: Sequence[str] | None = None) -> int:
    """Capture a release snapshot, or render/check the trend page.

    With ``--snapshot`` the JSON file named by ``--from`` is reduced with
    :func:`extract_snapshot` and written into the history directory; a missing
    source file prints an error and returns ``1``. Without it the trend page is
    rendered from the history directory and either written to ``--output`` or,
    under ``--check``, handed to ``check_text_artifacts``, whose return value
    becomes the exit code.
    """
    opts = _parse_args(argv)
    history = Path(opts.history_dir)

    if not opts.snapshot:
        artifacts = {Path(opts.output): render(load_history(history))}
        if opts.check:
            return check_text_artifacts(artifacts, label="trend", regen="make trend")
        write_text_artifacts(artifacts)
        print(f"Wrote {opts.output}")
        return 0

    source = Path(opts.from_path)
    if not source.exists():
        print(f"error: {source} not found — run `make benchmark` first.", file=sys.stderr)
        return 1
    payload = json.loads(source.read_text(encoding="utf-8"))
    written = write_snapshot(extract_snapshot(opts.snapshot, payload), history)
    print(f"Wrote {written}")
    return 0
+""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) + +from benchmark_gate import ( # noqa: E402 + GatingConfig, + evaluate_gate, + load_gating_config, + main, +) + +_CONFIG = GatingConfig( + bands={ + "recall_at_k": 1.0, + "mrr": 1.0, + "precision_at_k": 1.0, + "token_savings_pct": 2.0, + } +) + + +def _routing(recall: float, mrr: float = 0.3, size: int = 100) -> dict[str, object]: + return { + "routing": [ + { + "catalog_size": size, + "recall_at_k": recall, + "mrr": mrr, + "precision_at_k": 0.08, + "latency_ms_p99": 1.0, + } + ] + } + + +def test_recall_regression_beyond_band_fails() -> None: + base = _routing(0.5000) + head = _routing(0.4800) # -2.0pp, band is 1.0pp + violations = evaluate_gate(base, head, _CONFIG) + assert len(violations) == 1 + v = violations[0] + assert v.metric == "recall_at_k" + assert v.cell == "routing/size=100" + assert round(v.regression_pp, 2) == 2.0 + + +def test_recall_within_band_passes() -> None: + base = _routing(0.5000) + head = _routing(0.4950) # -0.5pp, inside the 1.0pp band + assert evaluate_gate(base, head, _CONFIG) == [] + + +def test_improvement_passes() -> None: + base = _routing(0.5000) + head = _routing(0.6000) + assert evaluate_gate(base, head, _CONFIG) == [] + + +def test_latency_never_gates() -> None: + base = {"routing": [{"catalog_size": 100, "latency_ms_p99": 1.0}]} + head = {"routing": [{"catalog_size": 100, "latency_ms_p99": 1000.0}]} + assert evaluate_gate(base, head, _CONFIG) == [] + + +def test_new_cell_does_not_fire() -> None: + base = _routing(0.50, size=100) + head = { + "routing": [ + {"catalog_size": 100, "recall_at_k": 0.50, "mrr": 0.3, "precision_at_k": 0.08}, + {"catalog_size": 500, "recall_at_k": 0.01, "mrr": 0.01, "precision_at_k": 0.0}, + ] + } + # The size=500 cell is new (absent in base) — it cannot "regress". 
+ assert evaluate_gate(base, head, _CONFIG) == [] + + +def test_skipped_matrix_cell_ignored() -> None: + base = { + "routing_matrix": [ + {"backend": "fuzzy", "catalog_size": 100, "status": "skipped: missing rapidfuzz"} + ] + } + head = { + "routing_matrix": [ + {"backend": "fuzzy", "catalog_size": 100, "status": "skipped: missing rapidfuzz"} + ] + } + assert evaluate_gate(base, head, _CONFIG) == [] + + +def test_token_savings_band_is_percent_points() -> None: + base = {"context": [{"scenario": "s", "naive_delta": {"pct_reduction": 60.0}}]} + within = {"context": [{"scenario": "s", "naive_delta": {"pct_reduction": 58.5}}]} # -1.5pp + beyond = {"context": [{"scenario": "s", "naive_delta": {"pct_reduction": 57.0}}]} # -3.0pp + assert evaluate_gate(base, within, _CONFIG) == [] + assert len(evaluate_gate(base, beyond, _CONFIG)) == 1 + + +def test_committed_gating_config_loads() -> None: + cfg = load_gating_config(Path(__file__).parent.parent / "benchmarks" / "gating.yaml") + assert cfg.bands["recall_at_k"] == 1.0 + assert cfg.bands["token_savings_pct"] == 2.0 + assert cfg.override_label == "benchmark-accepted" + + +def test_config_disabling_all_gates_gates_nothing(tmp_path: Path) -> None: + # A present config that disables every metric must be honored (gate nothing) + # rather than silently reverting to DEFAULT_BANDS. + cfg_path = tmp_path / "gating.yaml" + cfg_path.write_text( + "quality:\n" + " recall_at_k: { gating: false }\n" + " mrr: { gating: false }\n" + "override_label: custom-accept\n", + encoding="utf-8", + ) + cfg = load_gating_config(cfg_path) + assert cfg.bands == {} + assert cfg.override_label == "custom-accept" + # With no gated metrics, even a large drop cannot produce a violation. + assert evaluate_gate(_routing(0.50), _routing(0.10), cfg) == [] + + +def test_config_without_quality_block_keeps_defaults(tmp_path: Path) -> None: + # An incomplete config (no ``quality`` block at all) keeps the safe defaults. 
+ cfg_path = tmp_path / "gating.yaml" + cfg_path.write_text("latency:\n gating: false\n", encoding="utf-8") + cfg = load_gating_config(cfg_path) + assert cfg.bands["recall_at_k"] == 1.0 + assert cfg.override_label == "benchmark-accepted" + + +def test_cli_exit_codes(tmp_path: Path) -> None: + base = tmp_path / "base.json" + head = tmp_path / "head.json" + base.write_text(json.dumps(_routing(0.50)), encoding="utf-8") + head.write_text(json.dumps(_routing(0.45)), encoding="utf-8") # -5pp + config = str(Path(__file__).parent.parent / "benchmarks" / "gating.yaml") + assert main(["--base", str(base), "--head", str(head), "--gating-config", config]) == 1 + # Override downgrades the failure to a warning. + assert ( + main(["--base", str(base), "--head", str(head), "--gating-config", config, "--override"]) + == 0 + ) + # Clean head passes. + head.write_text(json.dumps(_routing(0.50)), encoding="utf-8") + assert main(["--base", str(base), "--head", str(head), "--gating-config", config]) == 0 diff --git a/tests/test_large_catalog_benchmark.py b/tests/test_large_catalog_benchmark.py new file mode 100644 index 00000000..3f226f5f --- /dev/null +++ b/tests/test_large_catalog_benchmark.py @@ -0,0 +1,63 @@ +"""Tests for benchmarks/large_catalog.py — the 300+ tool benchmark (#369). + +CI-safe: runs a reduced catalog so the suite stays fast, and asserts the +structural contract (size, namespaces, distractors, deny filtering) plus +accuracy/token determinism. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks")) + +from large_catalog import ( # noqa: E402 + build_large_catalog, + render_scorecard, + run_benchmark, +) + + +def test_catalog_has_requested_size_and_many_namespaces() -> None: + items = build_large_catalog(n=320, seed=42) + assert len(items) == 320 + namespaces = {it.namespace for it in items if it.namespace} + assert len(namespaces) >= 5 # acceptance: >= 5 upstream namespaces + # Distractor variants exist and carry distinct ids from the base pool. + assert any(".v" in it.id for it in items) + + +def test_results_are_deterministic() -> None: + a = run_benchmark(n=160, seed=42) + b = run_benchmark(n=160, seed=42) + assert a.recall_at_5 == b.recall_at_5 + assert a.mrr == b.mrr + assert a.mean_card_tokens == b.mean_card_tokens + assert a.token_reduction_pct == b.token_reduction_pct + + +def test_choicecards_collapse_the_prompt() -> None: + result = run_benchmark(n=160, seed=42) + assert result.mean_card_tokens < result.mean_naive_tokens + assert result.token_reduction_pct > 50.0 + + +def test_denied_destructive_tools_never_reach_shortlist() -> None: + result = run_benchmark(n=160, seed=42) + assert result.destructive_tools > 0 + assert result.destructive_in_shortlist_denied == 0 + + +def test_scorecard_render_is_deterministic() -> None: + result = run_benchmark(n=160, seed=42) + assert render_scorecard(result) == render_scorecard(result) + + +def test_committed_scorecard_matches_full_run() -> None: + """The committed 320-tool scorecard must match a fresh full run.""" + result = run_benchmark() # defaults: 320 tools, seed 42 + committed = ( + Path(__file__).parent.parent / "benchmarks" / "large_catalog_scorecard.md" + ).read_text(encoding="utf-8") + assert render_scorecard(result) == committed diff --git a/tests/test_render_trend.py b/tests/test_render_trend.py new file mode 100644 index 00000000..1d05129a --- 
/dev/null +++ b/tests/test_render_trend.py @@ -0,0 +1,104 @@ +"""Tests for scripts/render_trend.py — the benchmark trend page (#554). + +Covers snapshot extraction (latency excluded), deterministic rendering, the +``--check`` drift gate, and version ordering across releases. +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) + +from render_trend import ( # noqa: E402 + extract_snapshot, + load_history, + render, + write_snapshot, +) + +_LATEST = { + "routing": [ + { + "catalog_size": 50, + "recall_at_k": 0.5649, + "mrr": 0.4978, + "precision_at_k": 0.1191, + "latency_ms_p99": 0.759, + }, + { + "catalog_size": 1000, + "recall_at_k": 0.1475, + "mrr": 0.1456, + "precision_at_k": 0.031, + "latency_ms_p99": 41.7, + }, + ], + "context": [ + { + "scenario": "a", + "items_dropped": 7, + "dedup_removed": 4, + "naive_delta": {"pct_reduction": 60.0}, + }, + { + "scenario": "b", + "items_dropped": 0, + "dedup_removed": 0, + "naive_delta": {"pct_reduction": 80.0}, + }, + ], +} + + +def test_snapshot_excludes_latency_and_averages_reduction() -> None: + snap = extract_snapshot("1.0.0", _LATEST) + assert snap["schema_version"] == 1 + assert snap["release"] == "1.0.0" + metrics = snap["metrics"] + # Latency must never leak into the snapshot. 
+ assert "latency" not in json.dumps(metrics) + assert metrics["routing_recall_at_k"]["50"] == 0.5649 + assert metrics["mean_token_reduction_pct"] == 70.0 # mean(60, 80) + assert metrics["total_items_dropped"] == 7 + assert metrics["total_dedup_removed"] == 4 + + +def test_snapshot_roundtrip_is_byte_stable(tmp_path: Path) -> None: + snap = extract_snapshot("1.0.0", _LATEST) + p1 = write_snapshot(snap, tmp_path) + first = p1.read_text(encoding="utf-8") + second_snap = extract_snapshot("1.0.0", _LATEST) + write_snapshot(second_snap, tmp_path) + assert p1.read_text(encoding="utf-8") == first # deterministic, sorted keys + + +def test_render_is_deterministic(tmp_path: Path) -> None: + write_snapshot(extract_snapshot("0.16.0", _LATEST), tmp_path) + snapshots = load_history(tmp_path) + assert render(snapshots) == render(snapshots) + assert "0.16.0" in render(snapshots) + + +def test_releases_ordered_oldest_first(tmp_path: Path) -> None: + write_snapshot(extract_snapshot("0.16.0", _LATEST), tmp_path) + write_snapshot(extract_snapshot("0.9.0", _LATEST), tmp_path) + write_snapshot(extract_snapshot("0.10.0", _LATEST), tmp_path) + releases = [s["release"] for s in load_history(tmp_path)] + # Numeric ordering: 0.9.0 < 0.10.0 < 0.16.0 (not lexicographic). 
+ assert releases == ["0.9.0", "0.10.0", "0.16.0"] + + +def test_empty_history_renders_placeholder() -> None: + out = render([]) + assert "No release snapshots recorded yet" in out + + +def test_committed_trend_is_in_sync() -> None: + """The committed benchmarks/trend.md must match a fresh render of history.""" + root = Path(__file__).parent.parent + snapshots = load_history(root / "benchmarks" / "results" / "history") + committed = (root / "benchmarks" / "trend.md").read_text(encoding="utf-8") + assert render(snapshots) == committed diff --git a/tests/test_scenario_routing.py b/tests/test_scenario_routing.py new file mode 100644 index 00000000..f9bff2c1 --- /dev/null +++ b/tests/test_scenario_routing.py @@ -0,0 +1,59 @@ +"""Tests for benchmarks/scenario_routing.py — naive vs ChoiceCard (#418).""" + +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks")) + +from scenario_routing import ( # noqa: E402 + DEFAULT_DATASET, + render_report, + run_all, + run_scenario, +) + + +def test_scenario_rows_are_deterministic() -> None: + a = run_all() + b = run_all() + assert [r.__dict__ for r in a] == [r.__dict__ for r in b] + + +def test_choicecards_reduce_tokens_vs_naive() -> None: + for row in run_all(): + assert row.card_tokens < row.naive_tokens + assert row.token_reduction_pct > 0.0 + assert row.cards_shown <= 5 # bounded by TOP_K + + +def test_rank_consistent_with_top_k_flag() -> None: + for row in run_all(): + # rank > 0 iff the expected tool is in the shortlist. + assert (row.correct_rank > 0) == row.correct_in_top_k + + +def test_dataset_covers_multiple_namespaces() -> None: + rows = run_all() + assert len(rows) >= 4 + # At least one scenario must keep the expected tool reachable. 
+ assert any(r.correct_in_top_k for r in rows) + + +def test_render_is_deterministic_and_matches_commit() -> None: + rows = run_all() + assert render_report(rows) == render_report(rows) + committed = (Path(__file__).parent.parent / "benchmarks" / "scenario_routing.md").read_text( + encoding="utf-8" + ) + assert render_report(rows) == committed + + +def test_single_scenario_shape() -> None: + import json + + first = json.loads(DEFAULT_DATASET.read_text(encoding="utf-8"))[0] + row = run_scenario(first) + assert row.name == first["name"] + assert row.catalog_size == first["catalog_size"]