diff --git a/.github/workflows/benchmark-scale.yml b/.github/workflows/benchmark-scale.yml
new file mode 100644
index 00000000..56d3baaa
--- /dev/null
+++ b/.github/workflows/benchmark-scale.yml
@@ -0,0 +1,53 @@
+name: Routing-scale smoke benchmark
+
+# Non-gating, scheduled routing-scale benchmark for drift detection (issue #688,
+# child of #444). Runs the deterministic routing-scale profiler on a fixed seed
+# and stores its JSON as a per-run trend artifact, so scaling regressions are
+# visible over time without blocking any PR. This never gates: PR-time quality
+# gating lives in the main CI job (benchmarks/gating.yaml + benchmark_gate.py, #491).
+
+on:
+  schedule:
+    # Tuesday 06:30 UTC — off-hours, staggered from the weekly scorecard job.
+    - cron: "30 6 * * 2"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  routing-scale:
+    name: Routing-scale profile (non-gating)
+    runs-on: ubuntu-latest
+    # The profiler is informational; a failure must never page anyone or block work.
+    continue-on-error: true
+    # Explicit cap so a hung profiler is cancelled instead of holding a runner
+    # for GitHub's 360-minute default. Raise it if the size sweep grows.
+    timeout-minutes: 60
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Today
+        id: date
+        run: echo "today=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+
+      - name: Install dependencies
+        run: pip install -e ".[dev]"
+
+      - name: Run routing-scale profile
+        # Reduced sizes keep the scheduled run well under the job timeout above;
+        # the local `make benchmark-routing-scale` default sweeps up to 10k.
+        run: python benchmarks/routing_scale.py --sizes 100,1000,5000
+
+      - name: Upload trend artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: routing-scale-${{ steps.date.outputs.today }}
+          path: |
+            benchmarks/results/routing_scale.json
+            docs/benchmarks/routing-scale.md
+          retention-days: 90
+          if-no-files-found: error
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c0302d4f..b6463250 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -277,6 +277,56 @@ jobs:
           body-path: benchmarks/results/delta.md
           edit-mode: replace
 
+  benchmark-gate:
+    # Quality-regression gate (issue #491). Where benchmark-comment *describes*
+    # head-vs-base movement, this job *enforces* it: a PR that regresses a gated
+    # quality metric (recall@k / MRR / precision@k / token-savings) beyond its
+    # band in benchmarks/gating.yaml fails CI. Latency is never gated. The
+    # deterministic quality metrics are environment-independent, so head equals
+    # the committed base unless a code change moved them.
+    name: Benchmark quality gate
+    needs: test
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    if: ${{ github.event_name == 'pull_request' }}
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install dependencies
+        run: pip install -e ".[dev]"
+      - name: Generate head + base snapshots
+        # base = the baseline committed on the PR's target branch; head = this
+        # PR's numbers (matrix run so the per-backend cells are populated
+        # alongside the routing summary). base is read from the target branch,
+        # not from this checkout: a pull_request checkout is the merge commit, so
+        # a PR that regenerated latest.json would otherwise be gated against its
+        # own numbers and a regression could pass unnoticed.
+        env:
+          BASE_REF: ${{ github.base_ref }}
+        run: |
+          git fetch --no-tags --depth=1 origin "$BASE_REF"
+          git show FETCH_HEAD:benchmarks/results/latest.json > benchmarks/results/base.json
+          python benchmarks/benchmark.py --matrix --output benchmarks/results/head.json
+      - name: Gate on quality regressions
+        # The `benchmark-accepted` label downgrades a failure to a warning for
+        # intentional trade-offs; the rationale must be in the PR description.
+        # NOTE: this label string is the GitHub-Actions mirror of `override_label`
+        # in benchmarks/gating.yaml. A `${{ }}` expression cannot read that file,
+        # so the two must be kept in sync by hand if the label is ever renamed.
+        run: |
+          OVERRIDE=""
+          if ${{ contains(github.event.pull_request.labels.*.name, 'benchmark-accepted') }}; then
+            OVERRIDE="--override"
+          fi
+          python scripts/benchmark_gate.py \
+            --base benchmarks/results/base.json \
+            --head benchmarks/results/head.json \
+            --gating-config benchmarks/gating.yaml $OVERRIDE
   docs-build:
     # Gate the docs build on PRs (issue #474). docs.yml only builds+deploys on
     # push to main, so a malformed docstring or broken nav could land on main
diff --git a/AGENTS.md b/AGENTS.md
index b9b72da5..fe6fdb6e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -218,6 +218,10 @@ make docs     # mkdocs build --clean (docs site)
 make docs-serve  # mkdocs serve (live preview)
 make benchmark        # run benchmark harness (non-gating; writes benchmarks/results/latest.json)
 make benchmark-matrix # benchmark + per-backend × per-size matrix (#208) and per-namespace breakdown (#209)
+make benchmark-large-catalog  # 300+ tool routing benchmark + scorecard (#369); -check gates drift
+make benchmark-scenario       # naive all-tools vs ChoiceCard routing report (#418); -check gates drift
+make trend            # render benchmarks/trend.md from per-release history snapshots (#554)
+make trend-check      # verify benchmarks/trend.md is up to date (exits non-zero on drift)
 make gateway-scorecard-check  # verify gateway scorecard matches its committed JSON (gating CI; #391)
 make record-demos-check       # verify committed demo casts match current output (gating CI; #390)
 make smoke-eval       # non-gating CI smoke-evaluation over fixed fixtures (#331/#392); deterministic, credential-free
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62871d08..79b3c899 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,44 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- **Benchmark-suite maturation: scaling, scenarios, CI gating, and trend
+  (#369, #418, #491, #554, #687, #688).** A coordinated pass on the benchmark
+  subsystem, all deterministic and offline:
+  - **Large-catalog benchmark (#369).** `make benchmark-large-catalog`
+    (`benchmarks/large_catalog.py`) routes over 300+ tools across 8 namespaces
+    with near-duplicate distractor variants and destructive (side-effecting)
+    tools, reporting recall@1/3/5, MRR, ChoiceCard-vs-naive prompt-token
+    reduction, and allow/deny filtering of destructive tools. Writes a
+    committed scorecard (`benchmarks/large_catalog_scorecard.md`, latency
+    excluded for determinism) plus `benchmarks/results/large_catalog.json`;
+    `--check` gates scorecard drift and `--strict` gates regression-guard
+    thresholds.
+  - **Scenario benchmark (#418).** `make benchmark-scenario`
+    (`benchmarks/scenario_routing.py`) contrasts naive all-tools prompting
+    against bounded `ChoiceCard` routing across tool-heavy scenarios
+    (`benchmarks/scenarios/routing_choicecard.json`), reporting
+    correct-in-top-k, rank, cards shown, and token reduction to a committed
+    report (`benchmarks/scenario_routing.md`).
+  - **Quality-regression gate (#491).** `scripts/benchmark_gate.py` +
+    `benchmarks/gating.yaml` turn the informational benchmark delta into a
+    gating CI check: a PR that regresses recall@k / MRR / precision@k /
+    token-savings beyond its tolerance band fails the new `benchmark-gate` CI
+    job. Latency is never gated; the `benchmark-accepted` PR label downgrades a
+    failure to a warning for intentional trade-offs.
+  - **Release trend (#554).** `scripts/render_trend.py` captures a
+    deterministic, latency-free metric snapshot per release under
+    `benchmarks/results/history/<version>.json` and renders the
+    release-over-release view to `benchmarks/trend.md` (`make trend` /
+    `make trend-check`).
+  - **Scaling matrix docs (#687).** `docs/benchmarks/scaling-matrix.md`
+    documents the 10k-tool scaling methodology, reproducible commands, and
+    result interpretation, tying together the routing-scale, large-catalog,
+    and per-backend matrix benchmarks.
+  - **Scheduled routing-scale smoke (#688).** A non-gating
+    `.github/workflows/benchmark-scale.yml` runs the routing-scale profiler on
+    a weekly schedule and uploads its JSON + report as a per-run trend
+    artifact.
+
 - **Multi-client MCP config-pack generator (#659).**
   Added `contextweaver mcp generate-configs` to render client recipe files
   (`copilot_mcp.json`, `cursor_mcp.json`, `claude_desktop_config.json`,
diff --git a/Makefile b/Makefile
index 97ec1420..c27da5cb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: fmt lint type test example demo ci ci-full floor-deps tool-smoke docs docs-serve benchmark benchmark-matrix benchmark-routing-scale benchmark-gateway benchmark-primitives sidecar-smoke token-calibration smoke-eval e2e-quality scorecard scorecard-check sweep-scoring architectures llms llms-check weaver-conformance schemas schemas-check context-rot context-rot-check readme-version-check security-policy-check drift drift-check api api-check module-size-check module-size-update doc-snippets-check
+.PHONY: fmt lint type test example demo ci ci-full floor-deps tool-smoke docs docs-serve benchmark benchmark-matrix benchmark-routing-scale benchmark-gateway benchmark-primitives benchmark-large-catalog benchmark-large-catalog-check benchmark-scenario benchmark-scenario-check trend trend-check sidecar-smoke token-calibration smoke-eval e2e-quality scorecard scorecard-check sweep-scoring architectures llms llms-check weaver-conformance schemas schemas-check context-rot context-rot-check readme-version-check security-policy-check drift drift-check api api-check module-size-check module-size-update doc-snippets-check
 
 # Interpreter and pip front-end (issue #712). Default to `python3`, which is what
 # many modern environments ship (some have no bare `python` on PATH at all).
@@ -84,6 +84,34 @@ benchmark-matrix:
 benchmark-routing-scale:
 	$(PYTHON) benchmarks/routing_scale.py
 
+# Large-catalog routing benchmark (issue #369; non-gating). 300+ tools across 8
+# namespaces with near-duplicate distractors and destructive tools; writes
+# benchmarks/large_catalog_scorecard.md + benchmarks/results/large_catalog.json.
+# `-check` verifies the committed scorecard is in sync (deterministic accuracy).
+benchmark-large-catalog:
+	$(PYTHON) benchmarks/large_catalog.py
+
+benchmark-large-catalog-check:
+	$(PYTHON) benchmarks/large_catalog.py --check
+
+# Scenario benchmark (issue #418; non-gating): naive all-tools prompt vs bounded
+# ChoiceCard routing. Writes benchmarks/scenario_routing.md; `-check` gates drift.
+benchmark-scenario:
+	$(PYTHON) benchmarks/scenario_routing.py
+
+benchmark-scenario-check:
+	$(PYTHON) benchmarks/scenario_routing.py --check
+
+# Release-over-release benchmark trend (issue #554). `trend` re-renders
+# benchmarks/trend.md from benchmarks/results/history/*.json; `trend-check` gates
+# drift. Capture a release snapshot with:
+#   python scripts/render_trend.py --snapshot <version> --from benchmarks/results/latest.json
+trend:
+	$(PYTHON) scripts/render_trend.py
+
+trend-check:
+	$(PYTHON) scripts/render_trend.py --check
+
 benchmark-gateway:
 	$(PYTHON) benchmarks/gateway_benchmark.py
 
diff --git a/benchmarks/gating.yaml b/benchmarks/gating.yaml
new file mode 100644
index 00000000..c7cc6eee
--- /dev/null
+++ b/benchmarks/gating.yaml
@@ -0,0 +1,25 @@
+# Benchmark quality-regression gate configuration (issue #491).
+#
+# Turns the informational benchmark-delta PR comment (scripts/benchmark_delta.py)
+# into a gating CI check: a PR that regresses a *quality* metric beyond its band
+# (vs the committed benchmarks/results/latest.json baseline) fails CI. Latency
+# stays informational — runner variance makes it unreliable as a gate.
+#
+# Band semantics (see scripts/benchmark_gate.py):
+#   - fraction metrics (recall_at_k / mrr / precision_at_k, range 0..1) regress
+#     when  head < base - max_regression_pp / 100.
+#   - percent metrics (token_savings_pct, already 0..100) regress when
+#     head < base - max_regression_pp.
+#
+# The ~0.5pp noise floor at 200 gold queries (.github/prompts/add-eval.prompt.md)
+# informs the 1.0pp quality band; token savings get a looser 2.0pp band.
+quality:
+  recall_at_k: { max_regression_pp: 1.0 }
+  mrr: { max_regression_pp: 1.0 }
+  precision_at_k: { max_regression_pp: 1.0 }
+  token_savings_pct: { max_regression_pp: 2.0 }
+latency:
+  gating: false
+# A maintainer-applied PR label that downgrades a gate failure to a warning for
+# intentional trade-offs (the rationale belongs in the PR description).
+override_label: benchmark-accepted
diff --git a/benchmarks/large_catalog.py b/benchmarks/large_catalog.py
new file mode 100644
index 00000000..2bf47236
--- /dev/null
+++ b/benchmarks/large_catalog.py
@@ -0,0 +1,338 @@
+"""Large-catalog routing benchmark: 300+ tools across many namespaces (issue #369).
+
+The headline adoption case is a coding-agent setup with many MCP servers and
+hundreds of tools. This deterministic, offline benchmark simulates that shape —
+300+ tools across 8 namespaces, with near-duplicate *distractor* variants and
+*destructive* (write/side-effecting) tools — and measures whether routing keeps
+the right tool reachable while collapsing the prompt:
+
+- recall@1/3/5 and MRR for expected-tool selection (`tool_browse`);
+- prompt-token reduction of bounded ``ChoiceCard``s vs the naive all-tools prompt;
+- allow/deny filtering of destructive tools (none reach the shortlist when denied).
+
+It reuses the installed package only (no import from sibling benchmark scripts),
+mirroring ``benchmarks/smoke_eval.py``. Accuracy and token figures use
+``CharDivFourEstimator`` so they are environment-independent; only latency varies
+with hardware and is reported to stdout / JSON, never to the committed scorecard.
+
+Usage::
+
+    python benchmarks/large_catalog.py            # write JSON + scorecard
+    python benchmarks/large_catalog.py --check     # exit non-zero on scorecard drift
+    python benchmarks/large_catalog.py --strict    # exit non-zero if below thresholds
+
+Exit codes: 0 on success; 1 on drift (``--check``) or threshold breach (``--strict``).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(_ROOT / "src"))
+
+from contextweaver.eval.dataset import EvalCase, EvalDataset  # noqa: E402
+from contextweaver.eval.routing import evaluate_routing  # noqa: E402
+from contextweaver.protocols import CharDivFourEstimator  # noqa: E402
+from contextweaver.routing.cards import make_choice_cards, render_cards_text  # noqa: E402
+from contextweaver.routing.catalog import (  # noqa: E402
+    generate_sample_catalog,
+    load_catalog_dicts,
+)
+from contextweaver.routing.router import Router  # noqa: E402
+from contextweaver.routing.tree import TreeBuilder  # noqa: E402
+from contextweaver.types import SelectableItem  # noqa: E402
+
+DEFAULT_CATALOG_SIZE = 320
+DEFAULT_SEED = 42
+TOP_K = 5
+BEAM_WIDTH = 3
+
+DEFAULT_JSON = _ROOT / "benchmarks" / "results" / "large_catalog.json"
+DEFAULT_SCORECARD = _ROOT / "benchmarks" / "large_catalog_scorecard.md"
+
+# Warn/gate thresholds (issue #369 acceptance: "fails or warns when ... regress").
+# These are *regression guards*, set below the deterministic baseline (recall@5
+# ≈ 0.71 against the distractor-heavy catalog, token reduction ≈ 97%) with margin
+# so a real quality drop trips the warning while the synthetic near-duplicate
+# variants deliberately competing for rank do not.
+RECALL_AT_5_FLOOR = 0.65
+TOKEN_REDUCTION_FLOOR_PCT = 80.0
+
+_EST = CharDivFourEstimator()
+
+
+def _count(text: str) -> int:  # token estimate from the shared CharDivFourEstimator
+    return _EST.estimate(text)
+
+
+# ---------------------------------------------------------------------------
+# Catalog construction
+# ---------------------------------------------------------------------------
+
+
+def build_large_catalog(n: int, seed: int) -> list[SelectableItem]:
+    """Return *n* deterministic tools, extending the 83-item pool with variants.
+
+    Synthetic variants share their original's namespace and tags (preserving
+    routing signal density) but carry distinct IDs, so they act as near-duplicate
+    *distractors* without ever matching a gold query. Variants are always
+    non-destructive (``side_effects`` defaults to ``False`` and is not copied
+    from the original), so ``destructive_tools`` in the result reflects the base
+    83-item pool only — the deny test exercises exactly that base set.
+    """
+    base = load_catalog_dicts(generate_sample_catalog(n=83, seed=seed))
+    items: list[SelectableItem] = list(base)
+    version = 2  # the first round of variants is suffixed .v2
+    while len(items) < n:  # each round adds one variant per base tool
+        for orig in list(base):
+            items.append(
+                SelectableItem(
+                    f"{orig.id}.v{version}",
+                    orig.kind,
+                    f"{orig.name}_v{version}",
+                    f"{orig.description} (variant {version})",
+                    tags=orig.tags,
+                    namespace=orig.namespace,
+                )
+            )
+            if len(items) >= n:  # stop mid-round once the target size is reached
+                break
+        version += 1
+    return sorted(items, key=lambda i: i.id)[:n]  # id order; [:n] trims when n < len(base)
+
+
+def _gold_dataset(base_items: list[SelectableItem]) -> EvalDataset:
+    """Derive a deterministic gold set: each base tool's description -> its id."""
+    cases = [EvalCase(query=it.description, expected=[it.id]) for it in base_items]
+    return EvalDataset(cases=sorted(cases, key=lambda c: c.query))  # ordered by query text
+
+
+# ---------------------------------------------------------------------------
+# Measurement
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LargeCatalogResult:
+    """Deterministic + latency results of one large-catalog run."""
+
+    catalog_size: int  # total tools routed over (base pool + variants)
+    namespaces: int  # distinct non-empty namespaces in the catalog
+    distractor_tools: int  # catalog tools that are not gold targets (the variants)
+    destructive_tools: int  # tools whose side_effects flag is truthy
+    queries: int  # gold queries scored by the routing evaluation
+    recall_at_1: float  # run_benchmark rounds recall@1/3/5 and MRR to 4 decimals
+    recall_at_3: float
+    recall_at_5: float
+    mrr: float
+    mean_naive_tokens: int  # tokens of the one all-tools prompt (same for every query)
+    mean_card_tokens: int  # rounded mean ChoiceCard prompt tokens per query
+    token_reduction_pct: float  # (1 - card / naive) * 100, rounded to 2 decimals
+    destructive_in_shortlist_denied: int  # denied tools that still reached a shortlist
+    latency_ms_p50: float  # route() wall-clock latency in ms; hardware-dependent
+    latency_ms_p99: float
+
+
+def _percentile(values: list[float], pct: float) -> float:
+    """Value at rounded rank ``pct/100 * (len - 1)`` of sorted *values*; 0.0 if empty."""
+    ranked = sorted(values)
+    last = len(ranked) - 1
+    # round() picks the closest index; min() clamps it when pct exceeds 100.
+    return ranked[min(last, int(round((pct / 100.0) * last)))] if ranked else 0.0
+
+
+def run_benchmark(n: int = DEFAULT_CATALOG_SIZE, seed: int = DEFAULT_SEED) -> LargeCatalogResult:
+    """Route every gold query over the large catalog and collect its metrics."""
+    tools = build_large_catalog(n, seed)
+    # Variants are recognised by the ".v<N>" suffix build_large_catalog appends.
+    # NOTE(review): this is a substring test, so a base id that itself contains
+    # ".v" would be filed as a distractor and lose its gold query — confirm the
+    # sample catalog has no such id (committed scorecard: 82 gold queries, 83-tool pool).
+    originals = [tool for tool in tools if ".v" not in tool.id]
+    distractor_count = len(tools) - len(originals)
+    side_effecting = [tool for tool in tools if tool.side_effects]
+    distinct_namespaces = {tool.namespace for tool in tools if tool.namespace}
+
+    router = Router(TreeBuilder().build(tools), items=tools, top_k=TOP_K, beam_width=BEAM_WIDTH)
+    gold = _gold_dataset(originals)
+    report = evaluate_routing(router, gold, catalog_ids={tool.id for tool in tools})
+
+    # Naive baseline: one "name: description" line per tool, identical for every query.
+    naive_tokens = _count("\n".join(f"{tool.name}: {tool.description}" for tool in tools))
+
+    card_tokens: list[int] = []
+    latencies: list[float] = []
+    for case in gold:
+        began = time.perf_counter()
+        routed = router.route(case.query)
+        latencies.append((time.perf_counter() - began) * 1000.0)  # only route() is timed
+        card_tokens.append(_count(render_cards_text(make_choice_cards(routed.candidate_items))))
+    mean_card = round(sum(card_tokens) / len(card_tokens)) if card_tokens else 0
+    reduction = round((1 - mean_card / naive_tokens) * 100.0, 2) if naive_tokens else 0.0
+
+    # Deny every side-effecting tool, re-route each query, and count denied ids
+    # that still reach a shortlist (skipped when nothing is side-effecting).
+    deny_ids = {tool.id for tool in side_effecting}
+    leaked = 0
+    if deny_ids:
+        for case in gold:
+            denied_run = router.route(case.query, exclude_ids=deny_ids)
+            leaked += len(deny_ids.intersection(denied_run.candidate_ids))
+
+    return LargeCatalogResult(
+        catalog_size=len(tools),
+        namespaces=len(distinct_namespaces),
+        distractor_tools=distractor_count,
+        destructive_tools=len(side_effecting),
+        queries=report.queries_evaluated,
+        recall_at_1=round(report.top_1_recall, 4),
+        recall_at_3=round(report.top_3_recall, 4),
+        recall_at_5=round(report.top_5_recall, 4),
+        mrr=round(report.mrr, 4),
+        mean_naive_tokens=naive_tokens,
+        mean_card_tokens=mean_card,
+        token_reduction_pct=reduction,
+        destructive_in_shortlist_denied=leaked,
+        latency_ms_p50=round(_percentile(latencies, 50), 3),
+        latency_ms_p99=round(_percentile(latencies, 99), 3),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Rendering (deterministic — no latency, no environment)
+# ---------------------------------------------------------------------------
+
+
+def to_json(result: LargeCatalogResult) -> dict[str, Any]:
+    """Full result payload, including latency (for the JSON artifact only)."""
+    return {
+        "benchmark": "large_catalog",
+        "seed": DEFAULT_SEED,  # NOTE(review): the module constant, not the seed the run used
+        "k": TOP_K,
+        **result.__dict__,  # every dataclass field, latency included
+    }
+
+
+def render_scorecard(result: LargeCatalogResult) -> str:
+    """Render the deterministic, latency-free scorecard markdown (ends with a newline)."""
+    breaches = _threshold_breaches(result)  # empty list when every guard passes
+    status = "✅ within thresholds" if not breaches else "⚠️ " + "; ".join(breaches)
+    return "\n".join(
+        [
+            "# contextweaver — Large-Catalog Routing Scorecard",
+            "",
+            "> Auto-generated by `make benchmark-large-catalog`. Do not edit by hand.",
+            "> Source: `benchmarks/large_catalog.py` (issue #369). Offline, deterministic.",
+            "",
+            f"- Catalog size: `{result.catalog_size}` tools across "
+            f"`{result.namespaces}` namespaces",
+            f"- Near-duplicate distractor tools: `{result.distractor_tools}`",
+            f"- Destructive (side-effecting) tools: `{result.destructive_tools}`",
+            f"- Gold queries: `{result.queries}`",
+            "- Token estimator: `CharDivFourEstimator` (no model dependency)",
+            "",
+            "## Routing accuracy",
+            "",
+            "| recall@1 | recall@3 | recall@5 | MRR |",
+            "|---:|---:|---:|---:|",
+            f"| {result.recall_at_1:.4f} | {result.recall_at_3:.4f} "
+            f"| {result.recall_at_5:.4f} | {result.mrr:.4f} |",
+            "",
+            "## Prompt-token reduction (ChoiceCards vs naive all-tools prompt)",
+            "",
+            "| naive tokens | mean card tokens | reduction |",
+            "|---:|---:|---:|",
+            f"| {result.mean_naive_tokens} | {result.mean_card_tokens} "
+            f"| {result.token_reduction_pct:.2f}% |",
+            "",
+            "## Destructive-tool filtering",
+            "",
+            f"- Destructive tools reaching the shortlist when denied: "
+            f"`{result.destructive_in_shortlist_denied}` (expected `0`).",
+            "",
+            "## Thresholds",
+            "",
+            f"- recall@5 floor: `{RECALL_AT_5_FLOOR:.2f}` · "
+            f"token-reduction floor: `{TOKEN_REDUCTION_FLOOR_PCT:.0f}%`",
+            f"- Status: {status}",
+            "",
+            "Latency is hardware-dependent and intentionally excluded from this",
+            "committed scorecard; see `benchmarks/results/large_catalog.json` for the",
+            "p50/p99 measured on the producing host.",
+            "",  # trailing empty entry makes the joined text end with "\n"
+        ]
+    )
+
+
+def _threshold_breaches(result: LargeCatalogResult) -> list[str]:
+    """Return one message per breached regression guard; empty when all pass."""
+    recall, saved = result.recall_at_5, result.token_reduction_pct
+    leaked = result.destructive_in_shortlist_denied
+    breaches: list[str] = []
+    if recall < RECALL_AT_5_FLOOR:
+        breaches.append(f"recall@5 {recall:.4f} < {RECALL_AT_5_FLOOR:.2f}")
+    if saved < TOKEN_REDUCTION_FLOOR_PCT:
+        breaches.append(f"token reduction {saved:.2f}% < {TOKEN_REDUCTION_FLOOR_PCT:.0f}%")
+    # Any leaked tool is a breach: the deny check has no tolerance band.
+    if leaked:
+        breaches.append(f"{leaked} denied destructive tool(s) leaked")
+    return breaches
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: run the benchmark, then write or ``--check`` its artifacts."""
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0] if __doc__ else None)
+    parser.add_argument("--size", type=int, default=DEFAULT_CATALOG_SIZE)
+    parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
+    parser.add_argument("--check", action="store_true", help="Exit non-zero on scorecard drift.")
+    parser.add_argument("--strict", action="store_true", help="Exit non-zero if below thresholds.")
+    args = parser.parse_args(argv)
+
+    result = run_benchmark(args.size, args.seed)
+    scorecard = render_scorecard(result)
+
+    if args.check:
+        # A missing scorecard reads as "" so it is reported as drift rather than raising.
+        current = (
+            DEFAULT_SCORECARD.read_text(encoding="utf-8") if DEFAULT_SCORECARD.exists() else ""
+        )
+        drift = "large-catalog scorecard drift — run `make benchmark-large-catalog` and commit."
+        if current != scorecard:
+            print(drift, file=sys.stderr)
+            return 1
+        print("large-catalog scorecard: up to date")
+        return 0
+
+    # to_json() always stamps DEFAULT_SEED; record the seed this run actually used.
+    payload = {**to_json(result), "seed": args.seed}
+    DEFAULT_JSON.parent.mkdir(parents=True, exist_ok=True)
+    DEFAULT_JSON.write_text(
+        json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8", newline="\n"
+    )
+    DEFAULT_SCORECARD.write_text(scorecard, encoding="utf-8", newline="\n")
+    print(f"Wrote {DEFAULT_SCORECARD} and {DEFAULT_JSON}")
+    print(
+        f"recall@5={result.recall_at_5:.4f} reduction={result.token_reduction_pct:.2f}% "
+        f"p99={result.latency_ms_p99:.3f}ms"
+    )
+
+    breaches = _threshold_breaches(result)
+    if breaches:
+        print("WARNING: " + "; ".join(breaches), file=sys.stderr)
+    return 1 if breaches and args.strict else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/large_catalog_scorecard.md b/benchmarks/large_catalog_scorecard.md
new file mode 100644
index 00000000..e10d249f
--- /dev/null
+++ b/benchmarks/large_catalog_scorecard.md
@@ -0,0 +1,35 @@
+# contextweaver — Large-Catalog Routing Scorecard
+
+> Auto-generated by `make benchmark-large-catalog`. Do not edit by hand.
+> Source: `benchmarks/large_catalog.py` (issue #369). Offline, deterministic.
+
+- Catalog size: `320` tools across `8` namespaces
+- Near-duplicate distractor tools: `238`
+- Destructive (side-effecting) tools: `31`
+- Gold queries: `82`
+- Token estimator: `CharDivFourEstimator` (no model dependency)
+
+## Routing accuracy
+
+| recall@1 | recall@3 | recall@5 | MRR |
+|---:|---:|---:|---:|
+| 0.7073 | 0.7073 | 0.7073 | 0.7073 |
+
+## Prompt-token reduction (ChoiceCards vs naive all-tools prompt)
+
+| naive tokens | mean card tokens | reduction |
+|---:|---:|---:|
+| 4368 | 113 | 97.41% |
+
+## Destructive-tool filtering
+
+- Destructive tools reaching the shortlist when denied: `0` (expected `0`).
+
+## Thresholds
+
+- recall@5 floor: `0.65` · token-reduction floor: `80%`
+- Status: ✅ within thresholds
+
+Latency is hardware-dependent and intentionally excluded from this
+committed scorecard; see `benchmarks/results/large_catalog.json` for the
+p50/p99 measured on the producing host.
diff --git a/benchmarks/results/history/0.16.0.json b/benchmarks/results/history/0.16.0.json
new file mode 100644
index 00000000..865a0beb
--- /dev/null
+++ b/benchmarks/results/history/0.16.0.json
@@ -0,0 +1,24 @@
+{
+  "metrics": {
+    "mean_token_reduction_pct": 64.31,
+    "routing_mrr": {
+      "1000": 0.1456,
+      "50": 0.4978,
+      "83": 0.3242
+    },
+    "routing_precision_at_k": {
+      "1000": 0.031,
+      "50": 0.1191,
+      "83": 0.08
+    },
+    "routing_recall_at_k": {
+      "1000": 0.1475,
+      "50": 0.5649,
+      "83": 0.3825
+    },
+    "total_dedup_removed": 4,
+    "total_items_dropped": 7
+  },
+  "release": "0.16.0",
+  "schema_version": 1
+}
diff --git a/benchmarks/scenario_routing.md b/benchmarks/scenario_routing.md
new file mode 100644
index 00000000..e2e85681
--- /dev/null
+++ b/benchmarks/scenario_routing.md
@@ -0,0 +1,26 @@
+# contextweaver — Scenario Routing Benchmark
+
+> Auto-generated by `make benchmark-scenario`. Do not edit by hand.
+> Source: `benchmarks/scenario_routing.py` (issue #418). Offline, deterministic.
+
+Naive all-tools prompting vs bounded `ChoiceCard` routing across tool-heavy
+scenarios. Token counts use `CharDivFourEstimator` (no model dependency).
+
+- Scenarios: `6` · correct tool in top-5: `4/6` · mean token reduction: `96.54%`
+
+| scenario | catalog | correct@top-k | rank | cards | naive tokens | card tokens | reduction |
+|---|---:|:---:|---:|---:|---:|---:|---:|
+| draft_email | 300 | ✅ | 1 | 5 | 4083 | 111 | 97.28% |
+| find_contact | 300 | ✅ | 1 | 5 | 4083 | 111 | 97.28% |
+| find_unpaid_invoices | 200 | ❌ | — | 5 | 2618 | 110 | 95.80% |
+| refund_a_payment | 200 | ❌ | — | 5 | 2618 | 108 | 95.87% |
+| revenue_report | 300 | ✅ | 1 | 5 | 4083 | 117 | 97.13% |
+| send_slack_update | 200 | ✅ | 1 | 5 | 2618 | 108 | 95.87% |
+
+Reading the table:
+
+- `correct@top-k` is whether the expected tool survived into the bounded
+  shortlist — the property naive prompting trivially satisfies (every tool
+  is present) but at the token cost in the `naive tokens` column.
+- `reduction` is how much smaller the ChoiceCard prompt is than listing
+  every tool's name + description — the headline routing benefit at scale.
diff --git a/benchmarks/scenario_routing.py b/benchmarks/scenario_routing.py
new file mode 100644
index 00000000..a8c63c62
--- /dev/null
+++ b/benchmarks/scenario_routing.py
@@ -0,0 +1,200 @@
+"""Scenario benchmark: naive all-tools prompt vs bounded ChoiceCard routing (#418).
+
+A scenario-style benchmark that makes contextweaver's routing value concrete:
+for each tool-heavy task it contrasts the two prompt-construction strategies a
+tool-using agent can pick from —
+
+1. **naive** — expose *every* tool's name + description to the model;
+2. **contextweaver** — route the query and expose only the bounded ``ChoiceCard``
+   shortlist.
+
+For each scenario it reports whether the expected tool stays reachable
+(correct-in-top-k + its rank), how many cards are shown, and the prompt-token
+cost of each strategy. Deterministic and offline: catalogs are seeded and token
+counts use ``CharDivFourEstimator``, so the report is environment-independent.
+
+It does not depend on LangWatch (the inspiration) or any hosted workspace, and
+reuses only the installed package, mirroring ``benchmarks/smoke_eval.py``.
+
+Usage::
+
+    python benchmarks/scenario_routing.py            # write the markdown report
+    python benchmarks/scenario_routing.py --check     # exit non-zero on drift
+
+Exit codes: 0 on success; 1 on report drift (``--check``).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(_ROOT / "src"))
+
+from contextweaver.protocols import CharDivFourEstimator  # noqa: E402
+from contextweaver.routing.cards import make_choice_cards, render_cards_text  # noqa: E402
+from contextweaver.routing.catalog import (  # noqa: E402
+    generate_sample_catalog,
+    load_catalog_dicts,
+)
+from contextweaver.routing.router import Router  # noqa: E402
+from contextweaver.routing.tree import TreeBuilder  # noqa: E402
+from contextweaver.types import SelectableItem  # noqa: E402
+
+# Default scenario dataset and report location, both resolved from the repo root.
+DEFAULT_DATASET = _ROOT / "benchmarks" / "scenarios" / "routing_choicecard.json"
+DEFAULT_OUTPUT = _ROOT / "benchmarks" / "scenario_routing.md"
+# Default seed for the generated sample catalog.
+SEED = 42
+# Passed to ``Router`` as ``top_k`` and quoted in the report's "top-N" summary line.
+TOP_K = 5
+# Passed to ``Router`` as ``beam_width``.
+BEAM_WIDTH = 3
+
+# Shared estimator behind every token count produced by this module.
+_EST = CharDivFourEstimator()
+
+
+def _count(text: str) -> int:
+    """Estimate the token count of *text* with the module's shared estimator."""
+    return _EST.estimate(text)
+
+
+def _make_catalog(n: int, seed: int = SEED) -> list[SelectableItem]:
+    """Deterministic catalog of *n* tools, extending the 83-item pool with variants."""
+    base = load_catalog_dicts(generate_sample_catalog(n=83, seed=seed))
+    items: list[SelectableItem] = list(base)
+    version = 2
+    while len(items) < n:
+        for orig in list(base):
+            items.append(
+                SelectableItem(
+                    f"{orig.id}.v{version}",
+                    orig.kind,
+                    f"{orig.name}_v{version}",
+                    f"{orig.description} (variant {version})",
+                    tags=orig.tags,
+                    namespace=orig.namespace,
+                )
+            )
+            if len(items) >= n:
+                break
+        version += 1
+    return sorted(items, key=lambda i: i.id)[:n]
+
+
+@dataclass
+class ScenarioRow:
+    """One scenario's naive-vs-ChoiceCard comparison, rendered as one report table row."""
+
+    # Scenario name taken from the dataset entry.
+    name: str
+    # Number of tools in the catalog that was routed over.
+    catalog_size: int
+    # True when an expected tool id appears among the routed candidates.
+    correct_in_top_k: bool
+    correct_rank: int  # 1-based; 0 = not in shortlist
+    # Number of ChoiceCards built from the routed candidates.
+    cards_shown: int
+    # Estimated tokens for one "name: description" line per catalog tool.
+    naive_tokens: int
+    # Estimated tokens for the rendered ChoiceCard text.
+    card_tokens: int
+    # Percentage by which card_tokens undercuts naive_tokens, rounded to 2 decimals.
+    token_reduction_pct: float
+
+
+def run_scenario(scenario: dict[str, object]) -> ScenarioRow:
+    """Route one scenario and return its comparison row."""
+    size = int(scenario["catalog_size"])  # type: ignore[arg-type]
+    query = str(scenario["query"])
+    expected = set(scenario.get("expected", []))  # type: ignore[arg-type]
+    items = _make_catalog(size)
+    router = Router(TreeBuilder().build(items), items=items, top_k=TOP_K, beam_width=BEAM_WIDTH)
+    result = router.route(query)
+
+    candidate_ids = list(result.candidate_ids)
+    rank = next((i + 1 for i, cid in enumerate(candidate_ids) if cid in expected), 0)
+    cards = make_choice_cards(result.candidate_items)
+    naive_tokens = _count("\n".join(f"{it.name}: {it.description}" for it in items))
+    card_tokens = _count(render_cards_text(cards))
+    reduction = round((1 - card_tokens / naive_tokens) * 100.0, 2) if naive_tokens else 0.0
+    return ScenarioRow(
+        name=str(scenario["name"]),
+        catalog_size=len(items),
+        correct_in_top_k=rank > 0,
+        correct_rank=rank,
+        cards_shown=len(cards),
+        naive_tokens=naive_tokens,
+        card_tokens=card_tokens,
+        token_reduction_pct=reduction,
+    )
+
+
+def run_all(dataset_path: Path = DEFAULT_DATASET) -> list[ScenarioRow]:
+    """Run every scenario in *dataset_path*, ordered by scenario name."""
+    scenarios = json.loads(dataset_path.read_text(encoding="utf-8"))
+    rows = [run_scenario(s) for s in scenarios]
+    return sorted(rows, key=lambda r: r.name)
+
+
+def render_report(rows: list[ScenarioRow]) -> str:
+    """Render the deterministic scenario comparison report."""
+    hits = sum(1 for r in rows if r.correct_in_top_k)
+    mean_reduction = round(sum(r.token_reduction_pct for r in rows) / len(rows), 2) if rows else 0.0
+    lines = [
+        "# contextweaver — Scenario Routing Benchmark",
+        "",
+        "> Auto-generated by `make benchmark-scenario`. Do not edit by hand.",
+        "> Source: `benchmarks/scenario_routing.py` (issue #418). Offline, deterministic.",
+        "",
+        "Naive all-tools prompting vs bounded `ChoiceCard` routing across tool-heavy",
+        "scenarios. Token counts use `CharDivFourEstimator` (no model dependency).",
+        "",
+        f"- Scenarios: `{len(rows)}` · correct tool in top-{TOP_K}: "
+        f"`{hits}/{len(rows)}` · mean token reduction: `{mean_reduction:.2f}%`",
+        "",
+        "| scenario | catalog | correct@top-k | rank | cards | naive tokens "
+        "| card tokens | reduction |",
+        "|---|---:|:---:|---:|---:|---:|---:|---:|",
+    ]
+    for r in rows:
+        correct = "✅" if r.correct_in_top_k else "❌"
+        rank = str(r.correct_rank) if r.correct_rank else "—"
+        lines.append(
+            f"| {r.name} | {r.catalog_size} | {correct} | {rank} | {r.cards_shown} "
+            f"| {r.naive_tokens} | {r.card_tokens} | {r.token_reduction_pct:.2f}% |"
+        )
+    lines.extend(
+        [
+            "",
+            "Reading the table:",
+            "",
+            "- `correct@top-k` is whether the expected tool survived into the bounded",
+            "  shortlist — the property naive prompting trivially satisfies (every tool",
+            "  is present) but at the token cost in the `naive tokens` column.",
+            "- `reduction` is how much smaller the ChoiceCard prompt is than listing",
+            "  every tool's name + description — the headline routing benefit at scale.",
+            "",
+        ]
+    )
+    return "\n".join(lines)
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0] if __doc__ else None)
+    parser.add_argument("--dataset", default=str(DEFAULT_DATASET))
+    parser.add_argument("--output", default=str(DEFAULT_OUTPUT))
+    parser.add_argument("--check", action="store_true", help="Exit non-zero on report drift.")
+    args = parser.parse_args(argv)
+
+    report = render_report(run_all(Path(args.dataset)))
+    output = Path(args.output)
+    if args.check:
+        current = output.read_text(encoding="utf-8") if output.exists() else ""
+        if current != report:
+            print(
+                "scenario report drift — run `make benchmark-scenario` and commit.",
+                file=sys.stderr,
+            )
+            return 1
+        print("scenario report: up to date")
+        return 0
+    output.write_text(report, encoding="utf-8", newline="\n")
+    print(f"Wrote {output}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/scenarios/routing_choicecard.json b/benchmarks/scenarios/routing_choicecard.json
new file mode 100644
index 00000000..58b1ad3c
--- /dev/null
+++ b/benchmarks/scenarios/routing_choicecard.json
@@ -0,0 +1,38 @@
+[
+  {
+    "name": "find_unpaid_invoices",
+    "query": "find unpaid invoices and draft a reminder",
+    "expected": ["billing.invoices.search"],
+    "catalog_size": 200
+  },
+  {
+    "name": "refund_a_payment",
+    "query": "refund a completed payment to the customer",
+    "expected": ["billing.payments.refund"],
+    "catalog_size": 200
+  },
+  {
+    "name": "send_slack_update",
+    "query": "post a status update to a slack channel",
+    "expected": ["comms.slack.post"],
+    "catalog_size": 200
+  },
+  {
+    "name": "find_contact",
+    "query": "find a contact by name or email address",
+    "expected": ["crm.contacts.find"],
+    "catalog_size": 300
+  },
+  {
+    "name": "revenue_report",
+    "query": "generate a revenue summary report",
+    "expected": ["billing.reports.revenue"],
+    "catalog_size": 300
+  },
+  {
+    "name": "draft_email",
+    "query": "create an email draft to a customer",
+    "expected": ["comms.email.draft"],
+    "catalog_size": 300
+  }
+]
diff --git a/benchmarks/trend.md b/benchmarks/trend.md
new file mode 100644
index 00000000..5557f6de
--- /dev/null
+++ b/benchmarks/trend.md
@@ -0,0 +1,46 @@
+# contextweaver — Benchmark Trend
+
+> Auto-generated by `make trend`. Do not edit by hand.
+> Source: `benchmarks/results/history/*.json` (one snapshot per release).
+
+Release-over-release view of the deterministic benchmark metrics. Latency
+is excluded — it is environment-dependent and not comparable across release
+machines. This page is visibility only; PR-time regression gating lives in
+`benchmarks/gating.yaml` + `scripts/benchmark_gate.py` (#491).
+
+Releases recorded: 1 (`0.16.0` … `0.16.0`).
+
+## Routing recall@k by catalog size
+
+| release | size=50 | size=83 | size=1000 |
+|---|---:|---:|---:|
+| `0.16.0` | 0.5649 | 0.3825 | 0.1475 |
+
+## Routing MRR by catalog size
+
+| release | size=50 | size=83 | size=1000 |
+|---|---:|---:|---:|
+| `0.16.0` | 0.4978 | 0.3242 | 0.1456 |
+
+## Routing precision@k by catalog size
+
+| release | size=50 | size=83 | size=1000 |
+|---|---:|---:|---:|
+| `0.16.0` | 0.1191 | 0.0800 | 0.0310 |
+
+## Context pipeline quality
+
+| release | mean token reduction | items dropped | dedup removed |
+|---|---:|---:|---:|
+| `0.16.0` | 64.31% | 7 | 4 |
+
+---
+
+## Capturing a release snapshot
+
+```bash
+make benchmark   # refresh benchmarks/results/latest.json
+python scripts/render_trend.py --snapshot <version> \
+    --from benchmarks/results/latest.json
+make trend       # re-render benchmarks/trend.md
+```
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 83d8655f..5de7a573 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -24,6 +24,24 @@ git diff --quiet benchmarks/scorecard.md   # passes on clean re-run with same se
 The check `git diff --quiet benchmarks/scorecard.md` is the determinism gate:
 identical inputs must produce byte-identical scorecard output.
 
+## Scaling and trend
+
+Beyond the headline scorecard, three companion benchmarks and a per-release
+trend page track behaviour at scale and over time:
+
+| What | Command | Output |
+|---|---|---|
+| Latency to 10k tools + cache speedup | `make benchmark-routing-scale` | [`routing-scale profile`](benchmarks/routing-scale.md) |
+| Recall + token reduction at 300+ tools | `make benchmark-large-catalog` | `benchmarks/large_catalog_scorecard.md` |
+| Naive all-tools vs ChoiceCard routing | `make benchmark-scenario` | `benchmarks/scenario_routing.md` |
+| Release-over-release trend | `make trend` | [`benchmarks/trend.md`](https://github.com/dgenio/contextweaver/blob/main/benchmarks/trend.md) |
+
+PR-time regressions are caught by the gating check (`benchmarks/gating.yaml` +
+`scripts/benchmark_gate.py`): a PR that drops a gated quality metric beyond its
+tolerance band fails CI unless the `benchmark-accepted` label is applied with a
+rationale. See the [scaling matrix](benchmarks/scaling-matrix.md) page for the
+full methodology.
+
 ## What is measured
 
 **Routing.** Precision\@k, recall\@k, MRR, and p50/p95/p99 latency at catalog
diff --git a/docs/benchmarks/scaling-matrix.md b/docs/benchmarks/scaling-matrix.md
new file mode 100644
index 00000000..c619cf16
--- /dev/null
+++ b/docs/benchmarks/scaling-matrix.md
@@ -0,0 +1,65 @@
+# Scaling benchmark matrix
+
+How contextweaver's routing behaves as the tool catalog grows — the
+methodology, the reproducible commands, and how to read the numbers
+(issue #687). This page ties together three deterministic, offline benchmarks
+that each measure a different slice of "does this still work at scale?"
+
+| Benchmark | Question it answers | Command | Output |
+|---|---|---|---|
+| Routing-scale profile | How does build/route **latency** scale to 10k tools, and how much does the persistent cache save? | `make benchmark-routing-scale` | [`routing-scale.md`](routing-scale.md) · `benchmarks/results/routing_scale.json` |
+| Large-catalog quality | At 300+ tools across many namespaces, does routing keep the right tool **reachable** while collapsing the prompt? | `make benchmark-large-catalog` | `benchmarks/large_catalog_scorecard.md` · `benchmarks/results/large_catalog.json` |
+| Per-backend matrix | How do `tfidf` / `bm25` / embedding backends compare across catalog sizes? | `make benchmark-matrix` | `benchmarks/scorecard.md` (matrix section) |
+
+## Methodology
+
+- **Deterministic and offline.** Catalogs are generated from a seeded pool
+  (`generate_sample_catalog`) and extended with near-duplicate variants for
+  larger sizes. No network and no model calls; token counts use the
+  `CharDivFourEstimator` so accuracy and token figures are
+  environment-independent.
+- **Latency is host-dependent.** Treat latency columns as *ordering*, not
+  absolutes — the relative cost between catalog sizes is portable, the
+  absolute millisecond count is not. Quality metrics (recall@k, MRR, token
+  reduction) are environment-independent and should be byte-identical on a
+  clean re-run.
+- **Scale points.** The routing-scale profile sweeps `100 → 1000 → 5000 →
+  10000` tools. The large-catalog quality benchmark runs at 320 tools across
+  8 namespaces with ~240 near-duplicate distractor tools and ~30 destructive
+  (side-effecting) tools.
+
+## Reproducing the full matrix
+
+```bash
+make benchmark-routing-scale   # latency + cache speedup up to 10k tools
+make benchmark-large-catalog   # recall/MRR + token reduction at 300+ tools
+make benchmark-matrix          # per-backend × per-size accuracy matrix
+```
+
+Each command writes a committed Markdown scorecard plus a machine-readable
+JSON artifact under `benchmarks/results/`.
+
+## Interpreting the results
+
+- **Cold start dominates at scale.** In the routing-scale profile, graph
+  construction (`TreeBuilder.build`) grows super-linearly and dominates cold
+  start. Deployments that recreate a router per request over the same catalog
+  should persist the graph and fitted index (`save_graph`/`load_graph` +
+  `RoutingIndexCache`); the `cold speedup` column quantifies the win.
+- **Recall degrades predictably with catalog size.** As distractors multiply,
+  near-duplicate tools compete with the true match. The large-catalog
+  scorecard reports recall@1/3/5 against this pressure; a drop below the
+  scorecard's threshold floor is flagged as a regression.
+- **Token reduction is the headline benefit.** Bounded `ChoiceCard`s shrink
+  the routing prompt by ~95–97% versus listing every tool's name + description
+  (the naive baseline these benchmarks measure; full JSON schemas would make
+  the gap larger still) — and the gap widens as the catalog grows, which is
+  exactly when naive all-tools prompting becomes untenable.
+
+## Trend over releases
+
+Per-release snapshots of the deterministic metrics are captured under
+`benchmarks/results/history/` and rendered to
+[`benchmarks/trend.md`](https://github.com/dgenio/contextweaver/blob/main/benchmarks/trend.md)
+(`make trend`), so scaling regressions that creep in across releases stay
+visible.
diff --git a/mkdocs.yml b/mkdocs.yml
index 48a76faf..b038198f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -115,7 +115,10 @@ nav:
       - Persistent & Remote Stores: persistent_stores.md
       - Puppetmaster: integration_puppetmaster.md
   - Architecture: architecture.md
-  - Benchmarks: benchmarks.md
+  - Benchmarks:
+      - Overview: benchmarks.md
+      - Scaling matrix: benchmarks/scaling-matrix.md
+      - Routing-scale profile: benchmarks/routing-scale.md
   - Adopter Benchmark Report: benchmark_report.md
   - Token Calibration: token_calibration.md
   - Context Rot Demo: context_rot.md
diff --git a/scripts/benchmark_gate.py b/scripts/benchmark_gate.py
new file mode 100644
index 00000000..f9d5c5bb
--- /dev/null
+++ b/scripts/benchmark_gate.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""Gate benchmark quality regressions with tolerance bands (issue #491).
+
+Companion to ``scripts/benchmark_delta.py`` (the informational sticky PR
+comment). Where the delta script *describes* head-vs-base movement, this
+script *enforces* it: a PR that regresses a gated quality metric beyond its
+band — recall@k, MRR, precision@k, token-savings — exits non-zero so CI can
+block the merge. Latency cells are never gated (runner variance).
+
+The gate compares a head ``latest.json`` against the committed base
+``latest.json`` cell-by-cell, keyed by identity (catalog size, backend×size,
+or scenario), so a regression in any one cell is surfaced with its location.
+
+The script is **stdlib-only on the hot path**; the YAML config is parsed with a
+lazy ``yaml`` import (a core dependency) only when ``--gating-config`` is read,
+so the import is paid in CI after ``pip install -e .`` runs.
+
+Usage::
+
+    python scripts/benchmark_gate.py --base base.json --head head.json
+    python scripts/benchmark_gate.py --base base.json --head head.json \\
+        --gating-config benchmarks/gating.yaml
+    python scripts/benchmark_gate.py --base base.json --head head.json --override
+
+Exit codes: ``0`` when every gated cell is within band (or ``--override`` is
+set), ``1`` when any gated cell regresses beyond its band, ``2`` on bad input.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections.abc import Sequence
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Repository root: this script lives in ``scripts/``, one level below it.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Default for ``--gating-config``.
+DEFAULT_GATING_CONFIG = REPO_ROOT / "benchmarks" / "gating.yaml"
+
+# Fraction metrics live on a 0..1 scale, so their drop is multiplied by 100 to
+# express it in percentage points (pp) before it is compared with the band.
+# Percent metrics are already 0..100 and their drop is compared directly.
+# Anything not listed here is informational.
+_FRACTION_METRICS = ("recall_at_k", "mrr", "precision_at_k")
+_PERCENT_METRICS = ("token_savings_pct",)
+
+# Mirrors benchmarks/gating.yaml so the gate has a safe default when no config
+# file is present (e.g. a partial checkout). Kept in sync with that file.
+# Values are the maximum tolerated regression per metric, in percentage points.
+DEFAULT_BANDS: dict[str, float] = {
+    "recall_at_k": 1.0,
+    "mrr": 1.0,
+    "precision_at_k": 1.0,
+    "token_savings_pct": 2.0,
+}
+
+
+@dataclass(frozen=True)
+class GateViolation:
+    """One gated cell that regressed beyond its tolerance band."""
+
+    metric: str
+    cell: str
+    base: float
+    head: float
+    regression_pp: float
+    band_pp: float
+
+    def describe(self) -> str:
+        """Return a single-line, deterministic human-readable summary."""
+        return (
+            f"{self.metric} [{self.cell}]: {self.base:.4f} -> {self.head:.4f} "
+            f"(-{self.regression_pp:.2f}pp, band {self.band_pp:.2f}pp)"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class GatingConfig:
+    """Resolved gate configuration: per-metric bands plus the override label."""
+
+    # Metric name -> maximum tolerated regression in percentage points; a metric
+    # with no entry here is not gated.
+    bands: dict[str, float]
+    # Label name quoted in the gate report as the way to accept a regression.
+    override_label: str = "benchmark-accepted"
+
+
+def load_gating_config(path: Path | None) -> GatingConfig:
+    """Load bands from *path*; fall back to :data:`DEFAULT_BANDS` when absent.
+
+    The defaults are a safety net for a *missing* or unparseable config (e.g. a
+    partial checkout), not a floor. When *path* exists, parses to a mapping, and
+    carries a ``quality`` block, that block is authoritative: only metrics whose
+    band is a non-negative number are gated (a ``0`` band means "no regression
+    tolerated"), and a config that sets every metric to ``gating: false`` (so no
+    band resolves) deliberately gates nothing rather than silently reverting to
+    the defaults. A present config that omits ``quality`` entirely is treated as
+    incomplete and keeps the defaults.
+    """
+    if path is None or not path.exists():
+        return GatingConfig(bands=dict(DEFAULT_BANDS))
+    import yaml  # lazy: keeps the import off the no-config path
+
+    raw = yaml.safe_load(path.read_text(encoding="utf-8"))
+    if not isinstance(raw, dict):
+        return GatingConfig(bands=dict(DEFAULT_BANDS))
+    override = str(raw.get("override_label", "benchmark-accepted"))
+    if "quality" not in raw:
+        return GatingConfig(bands=dict(DEFAULT_BANDS), override_label=override)
+    bands: dict[str, float] = {}
+    for metric, spec in (raw.get("quality") or {}).items():
+        if not isinstance(spec, dict):
+            continue
+        band = spec.get("max_regression_pp")
+        if isinstance(band, (int, float)) and band >= 0:
+            bands[str(metric)] = float(band)
+    return GatingConfig(bands=bands, override_label=override)
+
+
+# ---------------------------------------------------------------------------
+# Cell extraction — each gated cell is (metric, cell-label, value)
+# ---------------------------------------------------------------------------
+
+
+def _routing_cells(payload: dict[str, Any]) -> dict[tuple[str, str], float]:
+    """Gated metrics from the single-backend ``routing`` summary rows."""
+    cells: dict[tuple[str, str], float] = {}
+    for row in payload.get("routing", []):
+        if not isinstance(row, dict):
+            continue
+        size = int(row.get("catalog_size", 0))
+        for metric in _FRACTION_METRICS:
+            if metric in row:
+                cells[(metric, f"routing/size={size}")] = float(row[metric])
+    return cells
+
+
+def _matrix_cells(payload: dict[str, Any]) -> dict[tuple[str, str], float]:
+    """Gated metrics from ``routing_matrix`` cells; skip non-``ok`` cells."""
+    cells: dict[tuple[str, str], float] = {}
+    for row in payload.get("routing_matrix", []):
+        if not isinstance(row, dict) or str(row.get("status", "ok")) != "ok":
+            continue
+        backend = str(row.get("backend", ""))
+        size = int(row.get("catalog_size", 0))
+        for metric in _FRACTION_METRICS:
+            if metric in row:
+                cells[(metric, f"matrix/{backend}@{size}")] = float(row[metric])
+    return cells
+
+
+def _token_savings_cells(payload: dict[str, Any]) -> dict[tuple[str, str], float]:
+    """Token-savings cells from each context row's ``naive_delta`` block."""
+    cells: dict[tuple[str, str], float] = {}
+    for row in payload.get("context", []):
+        if not isinstance(row, dict):
+            continue
+        nd = row.get("naive_delta")
+        if isinstance(nd, dict) and "pct_reduction" in nd:
+            scenario = str(row.get("scenario", ""))
+            cells[("token_savings_pct", f"context/{scenario}")] = float(nd["pct_reduction"])
+    return cells
+
+
+def _all_cells(payload: dict[str, Any]) -> dict[tuple[str, str], float]:
+    cells: dict[tuple[str, str], float] = {}
+    cells.update(_routing_cells(payload))
+    cells.update(_matrix_cells(payload))
+    cells.update(_token_savings_cells(payload))
+    return cells
+
+
+def _regression_pp(metric: str, base: float, head: float) -> float:
+    """Return the regression in percentage points (positive = got worse)."""
+    drop = base - head
+    if metric in _PERCENT_METRICS:
+        return drop  # already on a 0..100 scale
+    return drop * 100.0  # fraction (0..1) -> pp
+
+
+# ---------------------------------------------------------------------------
+# Gate
+# ---------------------------------------------------------------------------
+
+
+def evaluate_gate(
+    base: dict[str, Any], head: dict[str, Any], config: GatingConfig
+) -> list[GateViolation]:
+    """Return every gated cell in *head* that regressed beyond its band vs *base*.
+
+    Cells absent from *base* (new cells) cannot regress and are skipped. Cells
+    present in *base* but absent from *head* are also skipped — a removed cell is
+    a structural change, not a quality regression, and is surfaced by the delta
+    comment, not the gate. Results are sorted for deterministic output.
+    """
+    base_cells = _all_cells(base)
+    head_cells = _all_cells(head)
+    violations: list[GateViolation] = []
+    for key, head_value in head_cells.items():
+        metric, cell = key
+        band = config.bands.get(metric)
+        if band is None or key not in base_cells:
+            continue
+        base_value = base_cells[key]
+        regression = _regression_pp(metric, base_value, head_value)
+        if regression > band:
+            violations.append(
+                GateViolation(
+                    metric=metric,
+                    cell=cell,
+                    base=base_value,
+                    head=head_value,
+                    regression_pp=regression,
+                    band_pp=band,
+                )
+            )
+    return sorted(violations, key=lambda v: (v.metric, v.cell))
+
+
+def render_report(
+    violations: list[GateViolation],
+    *,
+    overridden: bool,
+    override_label: str = "benchmark-accepted",
+) -> str:
+    """Render a deterministic plain-text gate report.
+
+    *override_label* names the configured downgrade label (from the gating
+    config) so the report points at the exact label CI checks for.
+    """
+    if not violations:
+        return "benchmark gate: PASS — all gated quality metrics within band."
+    lines = [f"benchmark gate: {len(violations)} metric(s) regressed beyond band:"]
+    lines.extend(f"  - {v.describe()}" for v in violations)
+    lines.append("")
+    if overridden:
+        lines.append(
+            f"Override label '{override_label}' present — downgrading failure to a warning."
+        )
+    else:
+        lines.append(
+            f"Fix the regression, or apply the '{override_label}' label with a rationale in "
+            "the PR description to accept it."
+        )
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=__doc__.splitlines()[0] if __doc__ else None,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument("--base", required=True, help="Baseline (committed) latest.json")
+    parser.add_argument("--head", required=True, help="PR head latest.json")
+    parser.add_argument(
+        "--gating-config",
+        default=str(DEFAULT_GATING_CONFIG),
+        help="Path to gating.yaml (bands + override label)",
+    )
+    parser.add_argument(
+        "--override",
+        action="store_true",
+        help="Downgrade any gate failure to a warning (exit 0).",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    args = _parse_args(argv)
+    try:
+        base = json.loads(Path(args.base).read_text(encoding="utf-8"))
+        head = json.loads(Path(args.head).read_text(encoding="utf-8"))
+    except FileNotFoundError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+    except json.JSONDecodeError as exc:
+        print(f"error: malformed JSON: {exc}", file=sys.stderr)
+        return 2
+
+    config = load_gating_config(Path(args.gating_config) if args.gating_config else None)
+    violations = evaluate_gate(base, head, config)
+    print(render_report(violations, overridden=args.override, override_label=config.override_label))
+    if violations and not args.override:
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/render_trend.py b/scripts/render_trend.py
new file mode 100644
index 00000000..0b7b8509
--- /dev/null
+++ b/scripts/render_trend.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+"""Render the release-over-release benchmark trend page (issue #554).
+
+The deterministic benchmark answers "what are the numbers *now*?"
+(``benchmarks/results/latest.json`` + ``scorecard.md``). A regression that
+creeps in over several releases is invisible in any single snapshot. This
+script keeps a small, deterministic-only metric snapshot per release under
+``benchmarks/results/history/<version>.json`` and renders the longitudinal
+view to ``benchmarks/trend.md`` so quality trajectories stay visible.
+
+Latency is deliberately excluded from snapshots — it is environment-dependent
+and not comparable across release machines. This page is *visibility*, not a
+gate; PR-time gating with tolerance bands is owned by ``benchmark_gate.py`` (#491).
+
+The script is stdlib-only (no contextweaver import) so it can run before the
+package is installed, matching ``scripts/render_scorecard.py``.
+
+Usage::
+
+    # Capture a release snapshot from the current latest.json:
+    python scripts/render_trend.py --snapshot 0.16.0 --from benchmarks/results/latest.json
+    python scripts/render_trend.py            # render benchmarks/trend.md
+    python scripts/render_trend.py --check     # exit non-zero on drift
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any
+
+from _golden import check_text_artifacts, write_text_artifacts
+
+REPO_ROOT = Path(__file__).resolve().parent.parent  # two levels above this script file
+DEFAULT_HISTORY_DIR = REPO_ROOT / "benchmarks" / "results" / "history"  # --history-dir default
+DEFAULT_OUTPUT = REPO_ROOT / "benchmarks" / "trend.md"  # --output default
+DEFAULT_LATEST = REPO_ROOT / "benchmarks" / "results" / "latest.json"  # --from default
+
+SNAPSHOT_SCHEMA_VERSION = 1  # written into every snapshot as ``schema_version``
+
+
+# ---------------------------------------------------------------------------
+# Snapshot extraction (latest.json -> deterministic metric subset)
+# ---------------------------------------------------------------------------
+
+
+def extract_snapshot(release: str, latest: dict[str, Any]) -> dict[str, Any]:
+    """Reduce a ``latest.json`` payload to the per-release trend snapshot.
+
+    Only the deterministic quality metrics are copied; latency fields are never
+    read. Non-dict rows are skipped and missing fields count as zero. Routing
+    metrics are keyed by catalog size (as a string). The result embeds
+    ``schema_version`` so later metric changes do not orphan old entries.
+    """
+    # Routing values are rounded to 4 decimals, one bucket per metric.
+    per_size: dict[str, dict[str, float]] = {"recall_at_k": {}, "mrr": {}, "precision_at_k": {}}
+    for entry in (r for r in latest.get("routing", []) if isinstance(r, dict)):
+        size_key = str(int(entry.get("catalog_size", 0)))
+        for field, bucket in per_size.items():
+            bucket[size_key] = round(float(entry.get(field, 0.0)), 4)
+
+    # Context rows: running totals plus the per-scenario reductions to average (2 decimals).
+    pct_reductions: list[float] = []
+    items_dropped = dedup_removed = 0
+    for entry in (r for r in latest.get("context", []) if isinstance(r, dict)):
+        items_dropped += int(entry.get("items_dropped", 0))
+        dedup_removed += int(entry.get("dedup_removed", 0))
+        delta = entry.get("naive_delta")
+        if isinstance(delta, dict) and "pct_reduction" in delta:
+            pct_reductions.append(float(delta["pct_reduction"]))
+    if pct_reductions:
+        mean_reduction = round(sum(pct_reductions) / len(pct_reductions), 2)
+    else:
+        mean_reduction = 0.0
+
+    metrics: dict[str, Any] = {
+        "routing_recall_at_k": per_size["recall_at_k"],
+        "routing_mrr": per_size["mrr"],
+        "routing_precision_at_k": per_size["precision_at_k"],
+        "mean_token_reduction_pct": mean_reduction,
+        "total_items_dropped": items_dropped,
+        "total_dedup_removed": dedup_removed,
+    }
+    return {
+        "schema_version": SNAPSHOT_SCHEMA_VERSION,
+        "release": release,
+        "metrics": metrics,
+    }
+
+
+def write_snapshot(snapshot: dict[str, Any], history_dir: Path) -> Path:
+    """Write *snapshot* byte-stably (sorted keys, LF) to ``<history_dir>/<release>.json``."""
+    history_dir.mkdir(parents=True, exist_ok=True)
+    target = history_dir / f"{snapshot['release']}.json"
+    payload = json.dumps(snapshot, indent=2, sort_keys=True)
+    target.write_text(payload + "\n", encoding="utf-8", newline="\n")
+    return target
+
+
+# ---------------------------------------------------------------------------
+# History loading + ordering
+# ---------------------------------------------------------------------------
+
+
+def _version_key(release: str) -> tuple[Any, ...]:
+    """Sort key for dotted versions: numeric chunks order by value and before text chunks."""
+    # ``isdecimal`` (not ``isdigit``): digit-like characters such as "²" pass ``isdigit`` but
+    # make ``int`` raise ValueError; they are now ordered as text instead of crashing the sort.
+    chunks = release.split(".")
+    return tuple((0, int(c)) if c.isdecimal() else (1, c) for c in chunks)
+
+
+def load_history(history_dir: Path) -> list[dict[str, Any]]:
+    """Return all ``*.json`` snapshots under *history_dir*, oldest release first."""
+    if not history_dir.exists():
+        return []
+    files = sorted(history_dir.glob("*.json"))  # path order keeps equal-key releases stable
+    loaded = [json.loads(f.read_text(encoding="utf-8")) for f in files]
+    loaded.sort(key=lambda snap: _version_key(str(snap.get("release", ""))))
+    return loaded
+
+
+# ---------------------------------------------------------------------------
+# Rendering
+# ---------------------------------------------------------------------------
+
+
+def _per_size_table(snapshots: list[dict[str, Any]], metric: str) -> str:
+    """Markdown table of *metric*: one row per release, one column per catalog size."""
+    series = [(s.get("release", "?"), s.get("metrics", {}).get(metric, {})) for s in snapshots]
+    cols = [str(n) for n in sorted({int(k) for _, values in series for k in values})]
+    if not cols:
+        return "_No data._"
+    rows = ["| release | " + " | ".join(f"size={c}" for c in cols) + " |"]
+    rows.append("|---|" + "---:|" * len(cols))
+    for release, values in series:
+        cells = [f"{float(values[c]):.4f}" if c in values else "—" for c in cols]
+        rows.append(f"| `{release}` | " + " | ".join(cells) + " |")
+    return "\n".join(rows)
+
+
+def _context_table(snapshots: list[dict[str, Any]]) -> str:
+    """Markdown table of the aggregate context-pipeline metrics, one row per release."""
+    rows = ["| release | mean token reduction | items dropped | dedup removed |"]
+    rows.append("|---|---:|---:|---:|")
+    for snap in snapshots:
+        metrics = snap.get("metrics", {})
+        reduction = float(metrics.get("mean_token_reduction_pct", 0.0))
+        dropped = int(metrics.get("total_items_dropped", 0))
+        dedup = int(metrics.get("total_dedup_removed", 0))
+        rows.append(f"| `{snap.get('release', '?')}` | {reduction:.2f}% | {dropped} | {dedup} |")
+    return "\n".join(rows)
+
+
+def render(snapshots: list[dict[str, Any]]) -> str:
+    """Build the complete trend-page markdown for *snapshots*.
+
+    Rows follow the order of *snapshots*; its first and last entries are named
+    in the summary line. An empty list yields a placeholder page instead.
+    """
+    preamble = [
+        "# contextweaver — Benchmark Trend",
+        "",
+        "> Auto-generated by `make trend`. Do not edit by hand.",
+        "> Source: `benchmarks/results/history/*.json` (one snapshot per release).",
+        "",
+        "Release-over-release view of the deterministic benchmark metrics. Latency",
+        "is excluded — it is environment-dependent and not comparable across release",
+        "machines. This page is visibility only; PR-time regression gating lives in",
+        "`benchmarks/gating.yaml` + `scripts/benchmark_gate.py` (#491).",
+        "",
+    ]
+    if not snapshots:
+        # Nothing recorded yet: preamble plus a hint on how to capture the first snapshot.
+        capture_hint = (
+            "`python scripts/render_trend.py --snapshot <version> "
+            "--from benchmarks/results/latest.json`."
+        )
+        placeholder = ["_No release snapshots recorded yet. Capture one with_", capture_hint, ""]
+        return "\n".join(preamble + placeholder)
+
+    first, last = snapshots[0].get("release", "?"), snapshots[-1].get("release", "?")
+    sections = [
+        f"Releases recorded: {len(snapshots)} (`{first}` … `{last}`).",
+        "",
+        "## Routing recall@k by catalog size",
+        "",
+        _per_size_table(snapshots, "routing_recall_at_k"),
+        "",
+        "## Routing MRR by catalog size",
+        "",
+        _per_size_table(snapshots, "routing_mrr"),
+        "",
+        "## Routing precision@k by catalog size",
+        "",
+        _per_size_table(snapshots, "routing_precision_at_k"),
+        "",
+        "## Context pipeline quality",
+        "",
+        _context_table(snapshots),
+        "",
+        "---",
+        "",
+        "## Capturing a release snapshot",
+        "",
+        "```bash",
+        "make benchmark   # refresh benchmarks/results/latest.json",
+        "python scripts/render_trend.py --snapshot <version> \\",
+        "    --from benchmarks/results/latest.json",
+        "make trend       # re-render benchmarks/trend.md",
+        "```",
+        "",
+    ]
+    return "\n".join(preamble + sections)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    """Parse the trend renderer's command-line flags from *argv*."""
+    summary = __doc__.splitlines()[0] if __doc__ else None
+    cli = argparse.ArgumentParser(
+        description=summary, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    cli.add_argument("--history-dir", default=str(DEFAULT_HISTORY_DIR))
+    cli.add_argument("--output", default=str(DEFAULT_OUTPUT))
+    # ``--snapshot`` switches to capture mode; ``--from`` names the latest.json it reads.
+    cli.add_argument("--snapshot", help="Capture a release snapshot under this version, then exit")
+    cli.add_argument("--from", dest="from_path", default=str(DEFAULT_LATEST))
+    cli.add_argument(
+        "--check",
+        action="store_true",
+        help="Do not write; exit non-zero if benchmarks/trend.md would change.",
+    )
+    return cli.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Capture a snapshot (``--snapshot``) or render/check the trend page; return exit code."""
+    args = _parse_args(argv)
+    history_dir = Path(args.history_dir)
+
+    if args.snapshot:
+        latest_path = Path(args.from_path)
+        if not latest_path.exists():
+            print(f"error: {latest_path} not found — run `make benchmark` first.", file=sys.stderr)
+            return 1
+        latest = json.loads(latest_path.read_text(encoding="utf-8"))
+        print(f"Wrote {write_snapshot(extract_snapshot(args.snapshot, latest), history_dir)}")
+        return 0
+
+    artifacts = {Path(args.output): render(load_history(history_dir))}
+    if not args.check:
+        write_text_artifacts(artifacts)
+        print(f"Wrote {args.output}")
+        return 0
+    return check_text_artifacts(artifacts, label="trend", regen="make trend")
+
+
+if __name__ == "__main__":  # run as a script: exit with main()'s return code
+    sys.exit(main())
diff --git a/tests/test_benchmark_gate.py b/tests/test_benchmark_gate.py
new file mode 100644
index 00000000..5286d76e
--- /dev/null
+++ b/tests/test_benchmark_gate.py
@@ -0,0 +1,158 @@
+"""Tests for scripts/benchmark_gate.py — the quality-regression gate (#491).
+
+Covers the gate contract: a quality metric regressing beyond its band fails;
+within-band movement and improvements pass; latency never gates; new/removed
+cells do not fire; the override downgrades a failure to a warning.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+from benchmark_gate import (  # noqa: E402
+    GatingConfig,
+    evaluate_gate,
+    load_gating_config,
+    main,
+)
+
+_CONFIG = GatingConfig(  # tolerance bands shared by the tests below, in percentage points
+    bands={
+        "recall_at_k": 1.0,
+        "mrr": 1.0,
+        "precision_at_k": 1.0,
+        "token_savings_pct": 2.0,
+    }
+)
+
+
+def _routing(recall: float, mrr: float = 0.3, size: int = 100) -> dict[str, object]:
+    """Build a minimal benchmark payload holding one ``routing`` row for catalog *size*."""
+    # Precision and p99 latency are fixed values; callers vary only recall, MRR
+    # and the catalog size.
+    row: dict[str, object] = {
+        "catalog_size": size,
+        "recall_at_k": recall,
+        "mrr": mrr,
+        "precision_at_k": 0.08,
+        "latency_ms_p99": 1.0,
+    }
+    return {"routing": [row]}
+
+
+def test_recall_regression_beyond_band_fails() -> None:
+    """A 2.0pp recall drop exceeds the 1.0pp band and yields exactly one violation."""
+    # 0.5000 -> 0.4800 is -2.0pp against a 1.0pp band.
+    found = evaluate_gate(_routing(0.5000), _routing(0.4800), _CONFIG)
+    assert len(found) == 1
+    violation = found[0]
+    assert violation.metric == "recall_at_k"
+    assert violation.cell == "routing/size=100"
+    assert round(violation.regression_pp, 2) == 2.0
+
+
+def test_recall_within_band_passes() -> None:
+    """A 0.5pp recall drop stays inside the 1.0pp band, so nothing is reported."""
+    baseline, candidate = _routing(0.5000), _routing(0.4950)
+    assert evaluate_gate(baseline, candidate, _CONFIG) == []
+
+
+def test_improvement_passes() -> None:
+    """A recall increase (0.50 -> 0.60) is never a violation."""
+    baseline, improved = _routing(0.5000), _routing(0.6000)
+    assert evaluate_gate(baseline, improved, _CONFIG) == []
+
+
+def test_latency_never_gates() -> None:
+    """Latency is not a gated metric: a 1000x p99 increase yields no violation."""
+    rows = [{"routing": [{"catalog_size": 100, "latency_ms_p99": p99}]} for p99 in (1.0, 1000.0)]
+    assert evaluate_gate(rows[0], rows[1], _CONFIG) == []
+
+
+def test_new_cell_does_not_fire() -> None:
+    """A cell present only in head has no baseline to regress from, so it never fires."""
+    unchanged = {"catalog_size": 100, "recall_at_k": 0.50, "mrr": 0.3, "precision_at_k": 0.08}
+    # The size=500 cell is new (absent in base); its low scores must not be read
+    # as a regression.
+    added = {"catalog_size": 500, "recall_at_k": 0.01, "mrr": 0.01, "precision_at_k": 0.0}
+
+    base = _routing(0.50, size=100)
+    head = {"routing": [unchanged, added]}
+    assert evaluate_gate(base, head, _CONFIG) == []
+
+
+def test_skipped_matrix_cell_ignored() -> None:
+    """A ``routing_matrix`` cell that is skipped in both base and head yields no violation."""
+    # Both sides carry the same cell: a "skipped" status and no metric fields.
+    skipped_cell = {
+        "backend": "fuzzy",
+        "catalog_size": 100,
+        "status": "skipped: missing rapidfuzz",
+    }
+    base = {"routing_matrix": [dict(skipped_cell)]}
+    head = {"routing_matrix": [dict(skipped_cell)]}
+
+    assert evaluate_gate(base, head, _CONFIG) == []
+
+
+def test_token_savings_band_is_percent_points() -> None:
+    """``pct_reduction`` moves in points: -1.5 is inside the 2.0 band, -3.0 is outside."""
+    deltas = ({"pct_reduction": pct} for pct in (60.0, 58.5, 57.0))  # base, -1.5pp, -3.0pp
+    base, within, beyond = ({"context": [{"scenario": "s", "naive_delta": d}]} for d in deltas)
+    assert evaluate_gate(base, within, _CONFIG) == []
+    assert len(evaluate_gate(base, beyond, _CONFIG)) == 1
+
+
+def test_committed_gating_config_loads() -> None:
+    """The committed ``benchmarks/gating.yaml`` parses to the expected bands and label."""
+    loaded = load_gating_config(Path(__file__).parent.parent / "benchmarks" / "gating.yaml")
+    assert (loaded.bands["recall_at_k"], loaded.bands["token_savings_pct"]) == (1.0, 2.0)
+    assert loaded.override_label == "benchmark-accepted"
+
+
+def test_config_disabling_all_gates_gates_nothing(tmp_path: Path) -> None:
+    """A config that disables every listed metric gates nothing (no fallback to defaults)."""
+    yaml_text = (
+        "quality:\n"
+        "  recall_at_k: { gating: false }\n"
+        "  mrr: { gating: false }\n"
+        "override_label: custom-accept\n"
+    )
+    config_file = tmp_path / "gating.yaml"
+    config_file.write_text(yaml_text, encoding="utf-8")
+    loaded = load_gating_config(config_file)
+
+    assert loaded.bands == {}
+    assert loaded.override_label == "custom-accept"
+    # With no gated metrics, even a 40pp recall drop cannot produce a violation.
+    assert evaluate_gate(_routing(0.50), _routing(0.10), loaded) == []
+
+
+def test_config_without_quality_block_keeps_defaults(tmp_path: Path) -> None:
+    """A config file with no ``quality`` block falls back to the default bands and label."""
+    config_file = tmp_path / "gating.yaml"
+    config_file.write_text("latency:\n  gating: false\n", encoding="utf-8")
+    loaded = load_gating_config(config_file)
+    assert loaded.bands["recall_at_k"] == 1.0
+    assert loaded.override_label == "benchmark-accepted"
+
+
+def test_cli_exit_codes(tmp_path: Path) -> None:
+    """``main`` exits 1 on a regression, 0 when overridden, and 0 for a clean head."""
+    base_file, head_file = tmp_path / "base.json", tmp_path / "head.json"
+    base_file.write_text(json.dumps(_routing(0.50)), encoding="utf-8")
+    head_file.write_text(json.dumps(_routing(0.45)), encoding="utf-8")  # -5pp vs base
+    gating = Path(__file__).parent.parent / "benchmarks" / "gating.yaml"
+    argv = ["--base", str(base_file), "--head", str(head_file), "--gating-config", str(gating)]
+
+    assert main(argv) == 1
+    # Same regression, but the override flag downgrades it to a warning.
+    assert main([*argv, "--override"]) == 0
+
+    # Rewrite head to equal base: nothing regressed, so the gate passes.
+    head_file.write_text(json.dumps(_routing(0.50)), encoding="utf-8")
+    assert main(argv) == 0
diff --git a/tests/test_large_catalog_benchmark.py b/tests/test_large_catalog_benchmark.py
new file mode 100644
index 00000000..3f226f5f
--- /dev/null
+++ b/tests/test_large_catalog_benchmark.py
@@ -0,0 +1,63 @@
+"""Tests for benchmarks/large_catalog.py — the 300+ tool benchmark (#369).
+
+CI-safe: runs a reduced catalog so the suite stays fast, and asserts the
+structural contract (size, namespaces, distractors, deny filtering) plus
+accuracy/token determinism.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
+
+from large_catalog import (  # noqa: E402
+    build_large_catalog,
+    render_scorecard,
+    run_benchmark,
+)
+
+
+def test_catalog_has_requested_size_and_many_namespaces() -> None:
+    """A 320-item catalog has 320 entries, >= 5 namespaces and at least one ``.v`` id."""
+    catalog = build_large_catalog(n=320, seed=42)
+    assert len(catalog) == 320
+    # Acceptance criterion: tools are spread over at least five non-empty namespaces.
+    assert len({tool.namespace for tool in catalog if tool.namespace}) >= 5
+    assert any(".v" in tool.id for tool in catalog)  # distractor variants carry ".v" ids
+
+
+def test_results_are_deterministic() -> None:
+    """Two runs with the same size and seed report identical accuracy and token metrics."""
+    first = run_benchmark(n=160, seed=42)
+    second = run_benchmark(n=160, seed=42)
+    # Compare field by field so a failure names the metric that drifted.
+    for field in ("recall_at_5", "mrr", "mean_card_tokens", "token_reduction_pct"):
+        assert getattr(first, field) == getattr(second, field), field
+
+
+def test_choicecards_collapse_the_prompt() -> None:
+    """Cards use fewer tokens than the naive prompt, and the reduction exceeds 50%."""
+    run = run_benchmark(n=160, seed=42)
+    assert run.mean_card_tokens < run.mean_naive_tokens and run.token_reduction_pct > 50.0
+
+
+def test_denied_destructive_tools_never_reach_shortlist() -> None:
+    """``destructive_tools`` is positive while ``destructive_in_shortlist_denied`` is zero."""
+    run = run_benchmark(n=160, seed=42)
+    assert run.destructive_tools > 0 and run.destructive_in_shortlist_denied == 0
+
+
+def test_scorecard_render_is_deterministic() -> None:
+    first, second = (render_scorecard(r) for r in [run_benchmark(n=160, seed=42)] * 2)
+    assert first == second  # the same result object rendered twice gives identical text
+
+
+def test_committed_scorecard_matches_full_run() -> None:
+    """A fresh default-size run must render exactly the committed scorecard file."""
+    scorecard_path = Path(__file__).parent.parent / "benchmarks" / "large_catalog_scorecard.md"
+    # defaults: 320 tools, seed 42
+    fresh = render_scorecard(run_benchmark())
+    expected = scorecard_path.read_text(encoding="utf-8")
+    assert fresh == expected
diff --git a/tests/test_render_trend.py b/tests/test_render_trend.py
new file mode 100644
index 00000000..1d05129a
--- /dev/null
+++ b/tests/test_render_trend.py
@@ -0,0 +1,104 @@
+"""Tests for scripts/render_trend.py — the benchmark trend page (#554).
+
+Covers snapshot extraction (latency excluded), deterministic rendering, the
+``--check`` drift gate, and version ordering across releases.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+from render_trend import (  # noqa: E402
+    extract_snapshot,
+    load_history,
+    render,
+    write_snapshot,
+)
+
+_LATEST = {  # minimal latest.json stand-in: two routing rows and two context scenarios
+    "routing": [
+        {
+            "catalog_size": 50,
+            "recall_at_k": 0.5649,
+            "mrr": 0.4978,
+            "precision_at_k": 0.1191,
+            "latency_ms_p99": 0.759,
+        },
+        {
+            "catalog_size": 1000,
+            "recall_at_k": 0.1475,
+            "mrr": 0.1456,
+            "precision_at_k": 0.031,
+            "latency_ms_p99": 41.7,
+        },
+    ],
+    "context": [
+        {
+            "scenario": "a",
+            "items_dropped": 7,
+            "dedup_removed": 4,
+            "naive_delta": {"pct_reduction": 60.0},
+        },
+        {
+            "scenario": "b",
+            "items_dropped": 0,
+            "dedup_removed": 0,
+            "naive_delta": {"pct_reduction": 80.0},
+        },
+    ],
+}
+
+
+def test_snapshot_excludes_latency_and_averages_reduction() -> None:
+    """The snapshot carries version info and aggregated metrics but no latency field."""
+    snapshot = extract_snapshot("1.0.0", _LATEST)
+    assert (snapshot["schema_version"], snapshot["release"]) == (1, "1.0.0")
+
+    captured = snapshot["metrics"]
+    assert "latency" not in json.dumps(captured)  # latency must never leak into the snapshot
+    assert captured["routing_recall_at_k"]["50"] == 0.5649
+    # Context totals: mean(60, 80) reduction, 7 + 0 items dropped, 4 + 0 dedup removed.
+    assert captured["mean_token_reduction_pct"] == 70.0
+    assert (captured["total_items_dropped"], captured["total_dedup_removed"]) == (7, 4)
+
+
+def test_snapshot_roundtrip_is_byte_stable(tmp_path: Path) -> None:
+    """Extracting and writing the same release twice leaves the file content unchanged."""
+    written = write_snapshot(extract_snapshot("1.0.0", _LATEST), tmp_path)
+    before = written.read_text(encoding="utf-8")
+    # The second pass overwrites the same ``1.0.0.json``; sorted keys keep it identical.
+    write_snapshot(extract_snapshot("1.0.0", _LATEST), tmp_path)
+    assert written.read_text(encoding="utf-8") == before
+
+
+def test_render_is_deterministic(tmp_path: Path) -> None:
+    """Rendering the same history twice gives identical text that names the release."""
+    write_snapshot(extract_snapshot("0.16.0", _LATEST), tmp_path)
+    pages = [render(history) for history in [load_history(tmp_path)] * 2]
+    assert pages[0] == pages[1] and "0.16.0" in pages[0]
+
+
+def test_releases_ordered_oldest_first(tmp_path: Path) -> None:
+    """History is ordered by numeric version, not by file name or write order."""
+    for version in ("0.16.0", "0.9.0", "0.10.0"):
+        write_snapshot(extract_snapshot(version, _LATEST), tmp_path)
+    ordered = [snap["release"] for snap in load_history(tmp_path)]
+    # Lexicographic order would put 0.10.0 first; numeric order puts 0.9.0 first.
+    assert ordered == ["0.9.0", "0.10.0", "0.16.0"]
+
+
+def test_empty_history_renders_placeholder() -> None:
+    """With no snapshots, the page carries the 'no snapshots recorded' placeholder."""
+    assert "No release snapshots recorded yet" in render([])
+
+
+def test_committed_trend_is_in_sync() -> None:
+    """Rendering the committed history must reproduce ``benchmarks/trend.md`` exactly."""
+    benchmarks_dir = Path(__file__).parent.parent / "benchmarks"
+    fresh = render(load_history(benchmarks_dir / "results" / "history"))
+    # Any difference means the committed page is stale relative to the history files.
+    assert fresh == (benchmarks_dir / "trend.md").read_text(encoding="utf-8")
diff --git a/tests/test_scenario_routing.py b/tests/test_scenario_routing.py
new file mode 100644
index 00000000..f9bff2c1
--- /dev/null
+++ b/tests/test_scenario_routing.py
@@ -0,0 +1,59 @@
+"""Tests for benchmarks/scenario_routing.py — naive vs ChoiceCard (#418)."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
+
+from scenario_routing import (  # noqa: E402
+    DEFAULT_DATASET,
+    render_report,
+    run_all,
+    run_scenario,
+)
+
+
+def test_scenario_rows_are_deterministic() -> None:
+    """Two ``run_all`` passes produce rows with identical attribute dictionaries."""
+    first, second = ([row.__dict__ for row in run_all()] for _ in range(2))
+    assert first == second
+
+
+def test_choicecards_reduce_tokens_vs_naive() -> None:
+    """Every scenario uses fewer card tokens than naive tokens and shows at most 5 cards."""
+    for result in run_all():
+        assert result.card_tokens < result.naive_tokens and result.token_reduction_pct > 0.0
+        assert result.cards_shown <= 5  # bounded by TOP_K
+
+
+def test_rank_consistent_with_top_k_flag() -> None:
+    """``correct_rank`` is positive exactly when ``correct_in_top_k`` is set."""
+    flags = [((result.correct_rank > 0), result.correct_in_top_k) for result in run_all()]
+    assert all(ranked == in_top_k for ranked, in_top_k in flags)
+
+
+def test_dataset_covers_multiple_namespaces() -> None:
+    """The default dataset yields at least four rows, one or more with a top-k hit."""
+    results = run_all()
+    assert len(results) >= 4
+    assert any(result.correct_in_top_k for result in results)  # expected tool stays reachable
+
+
+def test_render_is_deterministic_and_matches_commit() -> None:
+    """The report renders identically twice and equals the committed markdown file."""
+    results = run_all()
+    report_path = Path(__file__).parent.parent / "benchmarks" / "scenario_routing.md"
+    first, second = render_report(results), render_report(results)
+    assert first == second
+    assert first == report_path.read_text(encoding="utf-8")
+
+
+def test_single_scenario_shape() -> None:
+    """``run_scenario`` echoes the scenario's name and catalog size onto its result row."""
+    import json
+
+    spec = json.loads(DEFAULT_DATASET.read_text(encoding="utf-8"))[0]  # first dataset entry
+    result = run_scenario(spec)
+    assert (result.name, result.catalog_size) == (spec["name"], spec["catalog_size"])