dgenio · dgenio · Jun 24, 2026 · Jun 24, 2026 · Jun 25, 2026
diff --git a/.github/workflows/benchmark-scale.yml b/.github/workflows/benchmark-scale.yml
@@ -0,0 +1,55 @@
+name: Routing-scale smoke benchmark
+
+# Non-gating, scheduled routing-scale benchmark for drift detection (issue #688,
+# child of #444). Runs the deterministic routing-scale profiler on a fixed seed
+# and stores its JSON + report as a per-run trend artifact, so scaling regressions
+# are visible over time without blocking any PR. This never gates: PR-time quality
+# gating lives in the main CI job (benchmarks/gating.yaml + benchmark_gate.py, #491).
+
+on:
+  schedule:
+    # Tuesday 06:30 UTC — off-hours, staggered from the weekly scorecard job.
+    - cron: "30 6 * * 2"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  routing-scale:
+    name: Routing-scale profile (non-gating)
+    runs-on: ubuntu-latest
+    # Explicit cap: without it a hung profiler would occupy a runner for the
+    # GitHub-hosted default of 360 minutes. Raise it if the sizes below grow.
+    timeout-minutes: 60
+    # The profiler is informational; a failure must never page anyone or block work.
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Today
+        id: date
+        # UTC date (YYYY-MM-DD) exposed as a step output for the artifact name below.
+        run: echo "today=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+
+      - name: Install dependencies
+        run: pip install -e ".[dev]"
+
+      - name: Run routing-scale profile
+        # Reduced sizes keep the scheduled run well under timeout-minutes above;
+        # the local `make benchmark-routing-scale` default sweeps up to 10k.
+        run: python benchmarks/routing_scale.py --sizes 100,1000,5000
+
+      - name: Upload trend artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: routing-scale-${{ steps.date.outputs.today }}
+          path: |
+            benchmarks/results/routing_scale.json
+            docs/benchmarks/routing-scale.md
+          retention-days: 90
+          # A run that produced no report files is a failure, not an empty artifact.
+          if-no-files-found: error
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -277,6 +277,49 @@ jobs:
           body-path: benchmarks/results/delta.md
           edit-mode: replace
 
+  benchmark-gate:
+    # Quality-regression gate (issue #491). Where benchmark-comment *describes*
+    # head-vs-base movement, this job *enforces* it: a PR that regresses a gated
+    # quality metric (recall@k / MRR / precision@k / token-savings) beyond its
+    # band in benchmarks/gating.yaml fails CI. Latency is never gated. The
+    # deterministic quality metrics are environment-independent, so head equals
+    # the committed base unless a code change moved them.
+    name: Benchmark quality gate
+    needs: test
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    if: ${{ github.event_name == 'pull_request' }}
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install dependencies
+        run: pip install -e ".[dev]"
+      - name: Generate head + base snapshots
+        # base = the committed baseline; head = this PR's numbers (matrix run so
+        # the per-backend cells are populated alongside the routing summary).
+        run: |
+          cp benchmarks/results/latest.json benchmarks/results/base.json
+          python benchmarks/benchmark.py --matrix --output benchmarks/results/head.json
+      - name: Gate on quality regressions
+        # The `benchmark-accepted` label downgrades a failure to a warning for
+        # intentional trade-offs; the rationale must be in the PR description.
+        # NOTE: the label string mirrors `override_label` in benchmarks/gating.yaml;
+        # a `${{ }}` expression cannot read that file, so keep the two in sync by
+        # hand. The result reaches the script via env, never spliced into the shell.
+        env:
+          OVERRIDE_ACCEPTED: ${{ contains(github.event.pull_request.labels.*.name, 'benchmark-accepted') }}
+        run: |
+          gate_args=(--base benchmarks/results/base.json --head benchmarks/results/head.json)
+          gate_args+=(--gating-config benchmarks/gating.yaml)
+          if [ "$OVERRIDE_ACCEPTED" = "true" ]; then
+            gate_args+=(--override)
+          fi
+          python scripts/benchmark_gate.py "${gate_args[@]}"
+
   docs-build:
     # Gate the docs build on PRs (issue #474). docs.yml only builds+deploys on
     # push to main, so a malformed docstring or broken nav could land on main

diff --git a/AGENTS.md b/AGENTS.md
@@ -218,6 +218,10 @@ make docs     # mkdocs build --clean (docs site)
 make docs-serve  # mkdocs serve (live preview)
 make benchmark        # run benchmark harness (non-gating; writes benchmarks/results/latest.json)
 make benchmark-matrix # benchmark + per-backend × per-size matrix (#208) and per-namespace breakdown (#209)
+make benchmark-large-catalog  # 300+ tool routing benchmark + scorecard (#369); benchmark-large-catalog-check gates drift
+make benchmark-scenario       # naive all-tools vs ChoiceCard routing report (#418); benchmark-scenario-check gates drift
+make trend            # render benchmarks/trend.md from per-release history snapshots (#554)
+make trend-check      # verify benchmarks/trend.md is up to date (exits non-zero on drift)
 make gateway-scorecard-check  # verify gateway scorecard matches its committed JSON (gating CI; #391)
 make record-demos-check       # verify committed demo casts match current output (gating CI; #390)
 make smoke-eval       # non-gating CI smoke-evaluation over fixed fixtures (#331/#392); deterministic, credential-free

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,44 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- **Benchmark-suite maturation: scaling, scenarios, CI gating, and trend
+  (#369, #418, #491, #554, #687, #688).** A coordinated pass on the benchmark
+  subsystem, all deterministic and offline:
+  - **Large-catalog benchmark (#369).** `make benchmark-large-catalog`
+    (`benchmarks/large_catalog.py`) routes over 300+ tools across 8 namespaces
+    with near-duplicate distractor variants and destructive (side-effecting)
+    tools, reporting recall@1/3/5, MRR, ChoiceCard-vs-naive prompt-token
+    reduction, and allow/deny filtering of destructive tools. Writes a
+    committed scorecard (`benchmarks/large_catalog_scorecard.md`, latency
+    excluded for determinism) plus `benchmarks/results/large_catalog.json`;
+    `--check` gates scorecard drift and `--strict` gates regression-guard
+    thresholds.
+  - **Scenario benchmark (#418).** `make benchmark-scenario`
+    (`benchmarks/scenario_routing.py`) contrasts naive all-tools prompting
+    against bounded `ChoiceCard` routing across tool-heavy scenarios
+    (`benchmarks/scenarios/routing_choicecard.json`), reporting
+    correct-in-top-k, rank, cards shown, and token reduction to a committed
+    report (`benchmarks/scenario_routing.md`).
+  - **Quality-regression gate (#491).** `scripts/benchmark_gate.py` +
+    `benchmarks/gating.yaml` turn the informational benchmark delta into a
+    gating CI check: a PR that regresses recall@k / MRR / precision@k /
+    token-savings beyond its tolerance band fails the new `benchmark-gate` CI
+    job. Latency is never gated; the `benchmark-accepted` PR label downgrades a
+    failure to a warning for intentional trade-offs.
+  - **Release trend (#554).** `scripts/render_trend.py` captures a
+    deterministic, latency-free metric snapshot per release under
+    `benchmarks/results/history/<version>.json` and renders the
+    release-over-release view to `benchmarks/trend.md` (`make trend` /
+    `make trend-check`).
+  - **Scaling matrix docs (#687).** `docs/benchmarks/scaling-matrix.md`
+    documents the 10k-tool scaling methodology, reproducible commands, and
+    result interpretation, tying together the routing-scale, large-catalog,
+    and per-backend matrix benchmarks.
+  - **Scheduled routing-scale smoke (#688).** A non-gating
+    `.github/workflows/benchmark-scale.yml` runs the routing-scale profiler on
+    a weekly schedule and uploads its JSON + report as a per-run trend
+    artifact.
+
 - **Multi-client MCP config-pack generator (#659).**
   Added `contextweaver mcp generate-configs` to render client recipe files
   (`copilot_mcp.json`, `cursor_mcp.json`, `claude_desktop_config.json`,

diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: fmt lint type test example demo ci ci-full floor-deps tool-smoke docs docs-serve benchmark benchmark-matrix benchmark-routing-scale benchmark-gateway benchmark-primitives sidecar-smoke token-calibration smoke-eval e2e-quality scorecard scorecard-check sweep-scoring architectures llms llms-check weaver-conformance schemas schemas-check context-rot context-rot-check readme-version-check security-policy-check drift drift-check api api-check module-size-check module-size-update doc-snippets-check
+.PHONY: fmt lint type test example demo ci ci-full floor-deps tool-smoke docs docs-serve benchmark benchmark-matrix benchmark-routing-scale benchmark-gateway benchmark-primitives benchmark-large-catalog benchmark-large-catalog-check benchmark-scenario benchmark-scenario-check trend trend-check sidecar-smoke token-calibration smoke-eval e2e-quality scorecard scorecard-check sweep-scoring architectures llms llms-check weaver-conformance schemas schemas-check context-rot context-rot-check readme-version-check security-policy-check drift drift-check api api-check module-size-check module-size-update doc-snippets-check
 
 # Interpreter and pip front-end (issue #712). Default to `python3`, which is what
 # many modern environments ship (some have no bare `python` on PATH at all).
@@ -84,6 +84,34 @@ benchmark-matrix:
 benchmark-routing-scale:
 	$(PYTHON) benchmarks/routing_scale.py
 
+# Large-catalog routing benchmark (issue #369; non-gating). 300+ tools across 8
+# namespaces with near-duplicate distractors and destructive tools; writes
+# benchmarks/large_catalog_scorecard.md + benchmarks/results/large_catalog.json.
+# The `-check` variant passes --check to verify the committed scorecard is in sync (deterministic accuracy).
+benchmark-large-catalog:
+	$(PYTHON) benchmarks/large_catalog.py
+
+benchmark-large-catalog-check:
+	$(PYTHON) benchmarks/large_catalog.py --check
+
+# Scenario benchmark (issue #418; non-gating): naive all-tools prompt vs bounded
+# ChoiceCard routing. Writes benchmarks/scenario_routing.md; the `-check` variant passes --check to gate drift.
+benchmark-scenario:
+	$(PYTHON) benchmarks/scenario_routing.py
+
+benchmark-scenario-check:
+	$(PYTHON) benchmarks/scenario_routing.py --check
+
+# Release-over-release benchmark trend (issue #554). `trend` re-renders
+# benchmarks/trend.md from benchmarks/results/history/*.json; `trend-check` runs
+# the same script with --check to gate drift. Capture a release snapshot with:
+#   python scripts/render_trend.py --snapshot <version> --from benchmarks/results/latest.json
+trend:
+	$(PYTHON) scripts/render_trend.py
+
+trend-check:
+	$(PYTHON) scripts/render_trend.py --check
+
 benchmark-gateway:
 	$(PYTHON) benchmarks/gateway_benchmark.py
 

diff --git a/benchmarks/gating.yaml b/benchmarks/gating.yaml
@@ -0,0 +1,30 @@
+# Benchmark quality-regression gate configuration (issue #491).
+#
+# Promotes the informational benchmark-delta PR comment (scripts/benchmark_delta.py)
+# to a gating CI check: CI fails when a PR moves a *quality* metric below the
+# committed benchmarks/results/latest.json baseline by more than its band.
+# Latency stays informational — runner variance makes it unreliable as a gate.
+#
+# Band semantics (see scripts/benchmark_gate.py):
+#   - fraction metrics (recall_at_k / mrr / precision_at_k, range 0..1) regress
+#     when  head < base - max_regression_pp / 100.
+#   - percent metrics (token_savings_pct, already 0..100) regress when
+#     head < base - max_regression_pp.
+#
+# The ~0.5pp noise floor at 200 gold queries (.github/prompts/add-eval.prompt.md)
+# informs the 1.0pp quality band; token savings get a looser 2.0pp band.
+quality:
+  recall_at_k:
+    max_regression_pp: 1.0
+  mrr:
+    max_regression_pp: 1.0
+  precision_at_k:
+    max_regression_pp: 1.0
+  token_savings_pct:
+    max_regression_pp: 2.0
+latency:
+  gating: false
+# A maintainer-applied PR label that downgrades a gate failure to a warning for
+# intentional trade-offs (the rationale belongs in the PR description). The same
+# string is a literal in the benchmark-gate job of ci.yml — rename both together.
+override_label: benchmark-accepted