diff --git a/.gitignore b/.gitignore
index df81afc..f3a73f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,6 +59,7 @@ logs/
 *.tmp
 *.bak
 *.orig
+redteam/
 
 # uv
 .python-version
diff --git a/liveweb_arena/core/validators/base.py b/liveweb_arena/core/validators/base.py
index 19e138b..ae07be8 100644
--- a/liveweb_arena/core/validators/base.py
+++ b/liveweb_arena/core/validators/base.py
@@ -375,6 +375,15 @@ async def fetch_cache_api_data(cls) -> Optional[Dict[str, Any]]:
         """
         return None
 
+    def get_probe_urls(self, validation_info: Dict[str, Any]) -> List[str]:
+        """
+        Optional red-team probe URL declaration.
+
+        Redteam tooling can call this to get the exact URLs required to populate
+        collected API data for GT checks. Default returns an empty list.
+        """
+        return []
+
     # === Step-wise Reward Interface ===
     # Templates can override these methods to provide reward-relevant information.
 
diff --git a/liveweb_arena/redteam/__init__.py b/liveweb_arena/redteam/__init__.py
new file mode 100644
index 0000000..a004762
--- /dev/null
+++ b/liveweb_arena/redteam/__init__.py
@@ -0,0 +1,7 @@
+"""Red team utilities for template quality checks."""
+
+from __future__ import annotations
+
+__all__ = ["__version__"]
+
+__version__ = "0.1.0"
diff --git a/liveweb_arena/redteam/__main__.py b/liveweb_arena/redteam/__main__.py
new file mode 100644
index 0000000..4edc2f7
--- /dev/null
+++ b/liveweb_arena/redteam/__main__.py
@@ -0,0 +1,382 @@
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+from liveweb_arena.core.task_manager import TaskManager
+from liveweb_arena.plugins import get_all_plugins
+from liveweb_arena.core.validators.base import get_registered_templates
+from liveweb_arena.redteam.metrics import compare_repeated_runs, compute_template_metrics
+from liveweb_arena.redteam.probe import ProbeResult, probe_task_ground_truth
+from liveweb_arena.redteam.report import build_report, write_report_files
+
+
+def _parse_templates(template_strs: Optional[List[str]]) -> Optional[List[Tuple[str, str, Optional[int]]]]:
+    """
+    Parse "plugin/template[/variant]" specs into (plugin, template, variant) tuples.
+
+    Returns None when no specs are given. Raises ValueError for a spec that does
+    not have two or three "/"-separated parts or whose variant is not an integer.
+    """
+    if not template_strs:
+        return None
+    result: List[Tuple[str, str, Optional[int]]] = []
+    for t in template_strs:
+        parts = t.split("/")
+        if len(parts) == 2:
+            result.append((parts[0], parts[1], None))
+        elif len(parts) == 3:
+            result.append((parts[0], parts[1], int(parts[2])))
+        else:
+            raise ValueError(f"Invalid template format: {t}. Use plugin/template[/variant]")
+    return result
+
+
+def _parse_seeds(s: str) -> List[int]:
+    """
+    Parse seeds:
+    - "0:10" -> 0..9
+    - "0:10:2" -> 0,2,4,6,8
+    - "1,5,9" -> explicit list
+
+    Raises ValueError for an empty or malformed schedule, and for a schedule
+    that yields no seeds (e.g. "10:0"): probing zero seeds would produce an
+    empty report that trivially passes every CI threshold.
+    """
+    s = (s or "").strip()
+    if not s:
+        raise ValueError("Empty seeds")
+    if "," in s:
+        seeds = [int(x.strip()) for x in s.split(",") if x.strip()]
+    elif ":" in s:
+        parts = [p.strip() for p in s.split(":")]
+        if len(parts) == 2:
+            start, end = int(parts[0]), int(parts[1])
+            step = 1
+        elif len(parts) == 3:
+            start, end, step = int(parts[0]), int(parts[1]), int(parts[2])
+        else:
+            raise ValueError("Invalid seed range format")
+        if step <= 0:
+            raise ValueError("Seed step must be > 0")
+        seeds = list(range(start, end, step))
+    else:
+        seeds = [int(s)]
+    if not seeds:
+        raise ValueError(f"Seed schedule yields no seeds: {s}")
+    return seeds
+
+
+def _resolve_all_templates(
+    *,
+    plugins_filter: Optional[List[str]] = None,
+    include_variants: bool = False,
+) -> List[Tuple[str, str, Optional[int]]]:
+    """
+    Return (plugin, template_name, variant) for all registered templates that
+    have an identifiable cache source/plugin.
+
+    Templates whose get_cache_source() raises or returns a falsy value are
+    skipped. The variant is always None; include_variants is accepted but not
+    used yet.
+    """
+    resolved: List[Tuple[str, str, Optional[int]]] = []
+    registry = get_registered_templates()
+    for registered_name, cls in registry.items():
+        try:
+            plugin = cls.get_cache_source()
+        except Exception:
+            plugin = None
+        if not plugin:
+            continue
+        if plugins_filter and plugin not in plugins_filter:
+            continue
+        # Use the registry name for template lookup (BasePlugin.generate_task supports it).
+        resolved.append((plugin, registered_name, None))
+    # stable order for CI
+    resolved.sort(key=lambda t: (t[0], t[1], t[2] if t[2] is not None else -1))
+    return resolved
+
+
+async def _run_probe_once(
+    *,
+    templates: List[Tuple[str, str, Optional[int]]],
+    seeds: List[int],
+) -> List[ProbeResult]:
+    """
+    Probe every (seed, template) pair once and return the results in
+    seed-major, template-minor order.
+    """
+    task_manager = TaskManager(get_all_plugins())
+    all_results: List[ProbeResult] = []
+
+    for seed in seeds:
+        # For deterministic coverage: 1 subtask per template spec.
+        for plugin, template_name, variant in templates:
+            task = await task_manager.generate_composite_task(
+                seed=seed,
+                num_subtasks=1,
+                templates=[(plugin, template_name, variant)],
+            )
+            st = task.subtasks[0]
+            if st.question is None:
+                raise RuntimeError("Subtask missing GeneratedQuestion")
+            q = st.question
+            results = await probe_task_ground_truth(
+                task_manager=task_manager,
+                subtasks=[st],
+                questions=[q],
+                plugin_names=[plugin],
+                seed=seed,
+                variant=variant,
+            )
+            all_results.extend(results)
+
+    return all_results
+
+
+async def main() -> int:
+    """
+    CLI entry point: resolve templates, probe them, write the report files.
+
+    Returns 0 on success and 2 when --fail-on-violation is set and at least
+    one threshold is violated.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "LiveWeb Arena - Template Red Team Dashboard "
+            "(supplementary API probe, not a replacement for CLAUDE.md red team review or eval.py)."
+        )
+    )
+    parser.add_argument(
+        "--templates",
+        type=str,
+        nargs="+",
+        required=False,
+        help='Templates to probe: "plugin/template[/variant]"',
+    )
+    parser.add_argument(
+        "--all-templates",
+        action="store_true",
+        help="Probe all registered templates with known plugin source.",
+    )
+    parser.add_argument(
+        "--plugins",
+        type=str,
+        nargs="+",
+        default=None,
+        help='Optional filter for --all-templates (e.g., "coingecko stooq")',
+    )
+    parser.add_argument(
+        "--list-templates",
+        action="store_true",
+        help="List resolved templates (no probing) and exit.",
+    )
+    parser.add_argument(
+        "--seeds",
+        type=str,
+        default="0:25",
+        help='Seed schedule: "0:25", "0:100:2", or "1,2,3"',
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=1,
+        help="Number of probe repeats (>=1). If >= 2, stability is reported for the first two repeats.",
+    )
+    parser.add_argument(
+        "--repeat-delay-s",
+        type=float,
+        default=0.0,
+        help="Delay between repeats (seconds).",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Directory to write report.{json,md} (default: ./redteam/YYYYMMDD_HHMMSS/, UTC)",
+    )
+    parser.add_argument(
+        "--fail-on-violation",
+        action="store_true",
+        help="Exit non-zero if any threshold is violated (CI mode).",
+    )
+    parser.add_argument(
+        "--min-gt-success",
+        type=float,
+        default=None,
+        help="Minimum GT success rate (0..1).",
+    )
+    parser.add_argument(
+        "--max-collapse",
+        type=float,
+        default=None,
+        help="Maximum allowed collapse rate (0..1).",
+    )
+    parser.add_argument(
+        "--max-baseline",
+        type=float,
+        default=None,
+        help="Maximum allowed baseline guess rate (0..1).",
+    )
+    parser.add_argument(
+        "--min-stability",
+        type=float,
+        default=None,
+        help="Minimum allowed stability rate (0..1). Requires --repeat >= 2.",
+    )
+
+    args = parser.parse_args()
+
+    # Ensure plugins/templates are loaded before resolving templates.
+    get_all_plugins()
+
+    if args.all_templates:
+        templates = _resolve_all_templates(plugins_filter=args.plugins)
+        if not templates:
+            raise ValueError("No templates resolved via --all-templates")
+    else:
+        templates = _parse_templates(args.templates)
+        if not templates:
+            raise ValueError("No templates provided (use --templates or --all-templates)")
+
+    if args.list_templates:
+        # Print in a stable order for scripting/CI.
+        for plugin, template_name, variant in templates:
+            if variant is None:
+                print(f"{plugin}/{template_name}")
+            else:
+                print(f"{plugin}/{template_name}/{variant}")
+        return 0
+
+    seeds = _parse_seeds(args.seeds)
+    if args.repeat < 1:
+        raise ValueError("--repeat must be >= 1")
+
+    repeats: List[List[ProbeResult]] = []
+    for i in range(args.repeat):
+        runs = await _run_probe_once(templates=templates, seeds=seeds)
+        repeats.append(runs)
+        if i + 1 < args.repeat and args.repeat_delay_s > 0:
+            await asyncio.sleep(args.repeat_delay_s)
+
+    samples = repeats[0]
+
+    # Group by template for metrics.
+    by_key: Dict[str, List[ProbeResult]] = {}
+    for r in samples:
+        key = f"{r.plugin_name}/{r.template_name}"
+        by_key.setdefault(key, []).append(r)
+
+    template_metrics = [compute_template_metrics(v) for v in by_key.values()]
+
+    stability = None
+    if len(repeats) >= 2:
+        # Compare first two repeats, same ordering by construction.
+        stability = compare_repeated_runs(repeats[0], repeats[1])
+
+    violations: List[Dict[str, Any]] = []
+
+    def _add_violation(*, scope: str, template: str, metric: str, actual: Any, limit: Any):
+        violations.append(
+            {
+                "scope": scope,
+                "template": template,
+                "metric": metric,
+                "actual": actual,
+                "limit": limit,
+            }
+        )
+
+    # Per-template thresholds.
+    for tm in template_metrics:
+        name = f"{tm.plugin_name}/{tm.template_name}"
+        if args.min_gt_success is not None and tm.gt_success_rate < args.min_gt_success:
+            _add_violation(
+                scope="template",
+                template=name,
+                metric="gt_success_rate",
+                actual=tm.gt_success_rate,
+                limit=args.min_gt_success,
+            )
+        if args.max_collapse is not None and tm.collapse_rate > args.max_collapse:
+            _add_violation(
+                scope="template",
+                template=name,
+                metric="collapse_rate",
+                actual=tm.collapse_rate,
+                limit=args.max_collapse,
+            )
+        if args.max_baseline is not None and tm.baseline_guess_rate > args.max_baseline:
+            _add_violation(
+                scope="template",
+                template=name,
+                metric="baseline_guess_rate",
+                actual=tm.baseline_guess_rate,
+                limit=args.max_baseline,
+            )
+
+    # Global stability threshold (repeat probe).
+    if args.min_stability is not None:
+        if args.repeat < 2:
+            _add_violation(
+                scope="global",
+                template="",
+                metric="stability_rate",
+                actual=None,
+                limit=args.min_stability,
+            )
+        else:
+            sr = (stability or {}).get("stability_rate")
+            if sr is None or sr < args.min_stability:
+                _add_violation(
+                    scope="global",
+                    template="",
+                    metric="stability_rate",
+                    actual=sr,
+                    limit=args.min_stability,
+                )
+
+    # Output directory
+    if args.output_dir:
+        out_dir = Path(args.output_dir)
+    else:
+        import datetime as _dt
+
+        ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d_%H%M%S")
+        out_dir = Path.cwd() / "redteam" / ts
+
+    report = build_report(
+        templates=template_metrics,
+        samples=samples,
+        stability=stability,
+        violations=violations,
+        args={
+            "templates": args.templates,
+            "all_templates": args.all_templates,
+            "plugins": args.plugins,
+            "seeds": args.seeds,
+            "repeat": args.repeat,
+            "repeat_delay_s": args.repeat_delay_s,
+            "fail_on_violation": args.fail_on_violation,
+            "min_gt_success": args.min_gt_success,
+            "max_collapse": args.max_collapse,
+            "max_baseline": args.max_baseline,
+            "min_stability": args.min_stability,
+        },
+    )
+    paths = write_report_files(out_dir, report)
+
+    print(paths["md"])
+    print(paths["json"])
+    if args.fail_on_violation and violations:
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(asyncio.run(main()))
diff --git a/liveweb_arena/redteam/metrics.py b/liveweb_arena/redteam/metrics.py
new file mode 100644
index 0000000..259daa6
--- /dev/null
+++ b/liveweb_arena/redteam/metrics.py
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+import math
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
+from liveweb_arena.redteam.probe import ProbeResult, validation_info_signature
+
+
+def _normalize_text(s: str) -> str:
+    """Collapse whitespace runs, trim, and casefold so comparisons ignore formatting."""
+    return re.sub(r"\s+", " ", (s or "").strip()).casefold()
+
+
+def _looks_boolean(v: str) -> bool:
+    """Return True when the normalized value is one of yes/no/true/false."""
+    t = _normalize_text(v)
+    return t in {"yes", "no", "true", "false"}
+
+
+def _extract_quoted_options(question_text: str) -> Optional[List[str]]:
+    """Return the first two double-quoted phrases, or None if there are fewer."""
+    # Common pattern in OL comparisons: "A" or "B"
+    opts = re.findall(r"\"([^\"]{1,200})\"", question_text or "")
+    opts = [o.strip() for o in opts if o.strip()]
+    if len(opts) >= 2:
+        return opts[:2]
+    return None
+
+
+def estimate_random_baseline(sample: Sequence[ProbeResult]) -> float:
+    """
+    Estimate random baseline guessability for a template's answer format.
+
+    Heuristic (documented in report):
+    - If binary/boolean: 0.5
+    - If question appears to be choosing between 2 quoted options: 0.5
+    - Else: 1 / (# unique GT values in sample) with floor to avoid division by zero
+
+    Returns 1.0 for an empty sample or when no sample has a usable GT value.
+    """
+    if not sample:
+        return 1.0
+
+    # Boolean / 2-option detection
+    for r in sample:
+        if r.gt_ok and r.gt_value is not None and _looks_boolean(r.gt_value):
+            return 0.5
+        opts = _extract_quoted_options(r.question_text)
+        if opts is not None and " or " in (r.question_text or "").lower():
+            return 0.5
+
+    gts = {_normalize_text(r.gt_value or "") for r in sample if r.gt_ok and r.gt_value is not None}
+    if not gts:
+        return 1.0
+    return 1.0 / max(1, len(gts))
+
+
+@dataclass(frozen=True)
+class TemplateMetrics:
+    """Aggregated probe metrics for one plugin/template pair."""
+
+    template_name: str
+    plugin_name: str
+    samples: int
+    gt_success_rate: float
+    unique_questions: int
+    unique_gt_values: int
+    collapse_rate: float
+    baseline_guess_rate: float
+
+
+def compute_template_metrics(sample: Sequence[ProbeResult]) -> TemplateMetrics:
+    """
+    Aggregate probe results of a single template into TemplateMetrics.
+
+    The template and plugin names are taken from the first sample, so callers
+    must pass results that all belong to the same template. Raises ValueError
+    for an empty sample.
+    """
+    if not sample:
+        raise ValueError("No samples")
+
+    template_name = sample[0].template_name
+    plugin_name = sample[0].plugin_name
+
+    samples_n = len(sample)
+    ok = [r for r in sample if r.gt_ok and r.gt_value is not None]
+    gt_success_rate = len(ok) / samples_n
+
+    uniq_q = len({_normalize_text(r.question_text) for r in sample})
+    uniq_gt = len({_normalize_text(r.gt_value or "") for r in ok})
+
+    # Cross-parameter collapse: distinct validation_info signatures mapping to distinct GT values.
+    # Only the first GT seen for a signature is kept.
+    sig_to_gt: Dict[str, str] = {}
+    for r in ok:
+        sig = validation_info_signature(r.validation_info)
+        sig_to_gt.setdefault(sig, _normalize_text(r.gt_value or ""))
+
+    distinct_param_sets = len(sig_to_gt)
+    distinct_outputs = len(set(sig_to_gt.values()))
+    if distinct_param_sets <= 1:
+        collapse_rate = 0.0
+    else:
+        # collapse = 1 - (distinct outputs / distinct param sets)
+        collapse_rate = 1.0 - (distinct_outputs / distinct_param_sets)
+
+    baseline = estimate_random_baseline(sample)
+
+    return TemplateMetrics(
+        template_name=template_name,
+        plugin_name=plugin_name,
+        samples=samples_n,
+        gt_success_rate=gt_success_rate,
+        unique_questions=uniq_q,
+        unique_gt_values=uniq_gt,
+        collapse_rate=collapse_rate,
+        baseline_guess_rate=baseline,
+    )
+
+
+def compare_repeated_runs(
+    run_a: Sequence[ProbeResult],
+    run_b: Sequence[ProbeResult],
+) -> Dict[str, Any]:
+    """
+    Compare GT stability across two probes for the same (seed, variant) schedule.
+
+    Results are paired by position. Pairs where either side has no GT are not
+    comparable; stability_rate is None when no pair is comparable. At most 50
+    changed pairs are returned. Raises ValueError when run sizes differ.
+    """
+    if len(run_a) != len(run_b):
+        raise ValueError("Run sizes differ; cannot compare")
+
+    total = len(run_a)
+    comparable = 0
+    same = 0
+    changes: List[Dict[str, Any]] = []
+
+    for a, b in zip(run_a, run_b):
+        if not (a.gt_ok and b.gt_ok and a.gt_value is not None and b.gt_value is not None):
+            continue
+        comparable += 1
+        if _normalize_text(a.gt_value) == _normalize_text(b.gt_value):
+            same += 1
+        else:
+            changes.append(
+                {
+                    "template_name": a.template_name,
+                    "plugin_name": a.plugin_name,
+                    "seed": a.seed,
+                    "variant": a.variant,
+                    "question": a.question_text,
+                    "gt_a": a.gt_value,
+                    "gt_b": b.gt_value,
+                }
+            )
+
+    return {
+        "total_pairs": total,
+        "comparable_pairs": comparable,
+        "stable_pairs": same,
+        "stability_rate": (same / comparable) if comparable else None,
+        "changes": changes[:50],  # cap
+    }
diff --git a/liveweb_arena/redteam/probe.py b/liveweb_arena/redteam/probe.py
new file mode 100644
index 0000000..9eb64b1
--- /dev/null
+++ b/liveweb_arena/redteam/probe.py
@@ -0,0 +1,198 @@
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+from urllib.parse import quote_plus
+
+from liveweb_arena.core.gt_collector import GTCollector, set_current_gt_collector
+from liveweb_arena.core.task_manager import TaskManager
+from liveweb_arena.core.validators.base import GeneratedQuestion, get_template
+from liveweb_arena.plugins.base import SubTask
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class ProbeResult:
+    """Outcome of probing ground truth for one generated question."""
+
+    template_name: str
+    plugin_name: str
+    seed: int
+    variant: Optional[int]
+    question_text: str
+    validation_info: Dict[str, Any]
+    probe_urls: List[str]
+    gt_ok: bool
+    gt_value: Optional[str]
+    gt_error: Optional[str]
+
+
+def _safe_json_dumps(x: Any) -> str:
+    """Serialize to sorted-key JSON, stringifying values JSON cannot encode."""
+    try:
+        return json.dumps(x, ensure_ascii=False, sort_keys=True, default=str)
+    except Exception:
+        return json.dumps(str(x), ensure_ascii=False)
+
+
+def _infer_probe_urls(plugin_name: str, question: GeneratedQuestion) -> List[str]:
+    """
+    Produce a minimal set of URLs likely sufficient for GT.
+
+    This is intentionally conservative and template-agnostic.
+    It first uses template-declared probe URLs (if provided), then falls back
+    to compatibility heuristics based on common validation_info fields.
+    """
+    urls: List[str] = []
+    if question.start_url:
+        urls.append(question.start_url)
+
+    vi = question.validation_info or {}
+
+    template_cls = get_template(question.template_name)
+    if template_cls is not None:
+        try:
+            template = template_cls()
+            declared_urls = template.get_probe_urls(vi)
+            if declared_urls:
+                urls.extend(declared_urls)
+        except Exception:
+            # Keep probe resilient; fallback heuristics still apply.
+            logger.debug("get_probe_urls failed for %s", question.template_name, exc_info=True)
+
+    # Open Library templates often require collecting works for both queries.
+    if plugin_name == "openlibrary":
+        a = vi.get("book_a_query")
+        b = vi.get("book_b_query")
+        subject = vi.get("subject")
+        if isinstance(a, str) and a.strip():
+            urls.append(f"https://openlibrary.org/search?q={quote_plus(a)}")
+        if isinstance(b, str) and b.strip():
+            urls.append(f"https://openlibrary.org/search?q={quote_plus(b)}")
+        if isinstance(subject, str) and subject.strip():
+            urls.append(f"https://openlibrary.org/subjects/{quote_plus(subject)}")
+
+    # ArXiv listing tasks commonly depend on category.
+    if plugin_name == "arxiv":
+        category = vi.get("category")
+        if isinstance(category, str) and category.strip():
+            urls.append(f"https://arxiv.org/list/{category}/recent")
+
+    # Hacker News tasks generally depend on homepage.
+    if plugin_name == "hackernews":
+        if not any(isinstance(u, str) and "news.ycombinator.com" in u for u in urls):
+            urls.append("https://news.ycombinator.com/")
+
+    # Stooq tasks generally depend on quote pages.
+    if plugin_name == "stooq":
+        # Probe every symbol present rather than only the first one: tasks that
+        # carry symbol_a/symbol_b presumably need both quote pages for GT.
+        for key in ("symbol", "symbol_a", "symbol_b"):
+            symbol = vi.get(key)
+            if isinstance(symbol, str) and symbol.strip():
+                urls.append(f"https://stooq.com/q/?s={quote_plus(symbol)}")
+
+    # De-duplicate while preserving order.
+    seen = set()
+    out = []
+    for u in urls:
+        if not isinstance(u, str) or not u.strip():
+            continue
+        if u in seen:
+            continue
+        seen.add(u)
+        out.append(u)
+    return out
+
+
+async def probe_task_ground_truth(
+    *,
+    task_manager: TaskManager,
+    subtasks: Sequence[SubTask],
+    questions: Sequence[GeneratedQuestion],
+    plugin_names: Sequence[str],
+    seed: int,
+    variant: Optional[int],
+) -> List[ProbeResult]:
+    """
+    Probe GT for subtasks without running the LLM agent/browser.
+
+    Strategy:
+    - Create a GTCollector for the subtasks
+    - For each subtask, call plugin.fetch_api_data() for probe URLs
+    - Feed that api_data into GTCollector.on_page_visit()
+    - Call GTCollector.fetch_remaining_api_gt() for template.get_ground_truth()
+
+    NOTE: This is a quick API-level supplement. It does NOT replace full
+    eval.py runs or the mandatory CLAUDE.md red team review process.
+
+    Raises ValueError when subtasks, questions and plugin_names differ in
+    length, and RuntimeError when a plugin returns empty data for a probe URL.
+    """
+    if not (len(subtasks) == len(questions) == len(plugin_names)):
+        raise ValueError("subtasks, questions and plugin_names must have the same length")
+
+    gt_collector = GTCollector(subtasks=list(subtasks), task_manager=task_manager)
+    set_current_gt_collector(gt_collector)
+    try:
+        # Infer probe URLs once per subtask; they are used for fetching and reporting.
+        probe_urls_per_task = [_infer_probe_urls(p, q) for p, q in zip(plugin_names, questions)]
+
+        # Collect page-bound api_data by calling plugins directly (API semantic probe).
+        for plugin_name, probe_urls in zip(plugin_names, probe_urls_per_task):
+            plugin = task_manager.get_plugin(plugin_name)
+            for url in probe_urls:
+                # Some navigation pages intentionally don't have api_data; respect plugin.
+                if not plugin.needs_api_data(url):
+                    continue
+                api_data = await plugin.fetch_api_data(url)
+                if not api_data:
+                    raise RuntimeError(
+                        f"Probe API returned empty data for {plugin_name} url={url}"
+                    )
+                # content isn't used for GT in this flow; keep empty.
+                await gt_collector.on_page_visit(url, content="", api_data=api_data)
+
+        await gt_collector.fetch_remaining_api_gt()
+
+        results: List[ProbeResult] = []
+        for st, q, plugin_name, probe_urls in zip(
+            subtasks, questions, plugin_names, probe_urls_per_task
+        ):
+            gt_value = gt_collector.get_gt_for_subtask(st)
+            gt_ok = gt_value is not None
+            results.append(
+                ProbeResult(
+                    template_name=q.template_name,
+                    plugin_name=plugin_name,
+                    seed=seed,
+                    variant=variant,
+                    question_text=q.question_text,
+                    validation_info=dict(q.validation_info or {}),
+                    probe_urls=probe_urls,
+                    gt_ok=gt_ok,
+                    gt_value=str(gt_value) if gt_ok else None,
+                    gt_error=None if gt_ok else gt_collector.get_failure_reason(st),
+                )
+            )
+        return results
+    finally:
+        set_current_gt_collector(None)
+        gt_collector.cleanup()
+
+
+def validation_info_signature(validation_info: Dict[str, Any]) -> str:
+    """
+    Canonical signature for collapse detection.
+
+    Drops the template_name/_template_name keys, then serializes with sorted keys.
+    """
+    if not isinstance(validation_info, dict):
+        return _safe_json_dumps(validation_info)
+    vi = dict(validation_info)
+    for k in ("template_name", "_template_name"):
+        vi.pop(k, None)
+    return _safe_json_dumps(vi)
diff --git a/liveweb_arena/redteam/report.py b/liveweb_arena/redteam/report.py
new file mode 100644
index 0000000..e7fba24
--- /dev/null
+++ b/liveweb_arena/redteam/report.py
@@ -0,0 +1,180 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence
+
+from liveweb_arena.redteam.metrics import TemplateMetrics
+from liveweb_arena.redteam.probe import ProbeResult
+
+
+def build_report(
+    *,
+    templates: Sequence[TemplateMetrics],
+    samples: Sequence[ProbeResult],
+    stability: Optional[Dict[str, Any]],
+    violations: Optional[List[Dict[str, Any]]] = None,
+    args: Dict[str, Any],
+) -> Dict[str, Any]:
+    """
+    Assemble the JSON-ready report dict.
+
+    Only failed samples are included, capped at 100 entries.
+    """
+    return {
+        "meta": {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "tool": "liveweb_arena.redteam",
+            "args": args,
+        },
+        "templates": [asdict(t) for t in templates],
+        "stability": stability,
+        "violations": violations or [],
+        "failures": [
+            {
+                "plugin_name": r.plugin_name,
+                "template_name": r.template_name,
+                "seed": r.seed,
+                "variant": r.variant,
+                "question": r.question_text,
+                "validation_info": r.validation_info,
+                "probe_urls": r.probe_urls,
+                "error": r.gt_error,
+            }
+            for r in samples
+            if not r.gt_ok
+        ][:100],
+    }
+
+
+def _pct(value: Optional[float]) -> str:
+    """Format a 0..1 rate as a percentage; a missing (None) rate counts as 0."""
+    return f"{(value or 0.0) * 100:.1f}%"
+
+
+def render_markdown(report: Dict[str, Any]) -> str:
+    """Render the report dict produced by build_report as a Markdown document."""
+    tmpl = report.get("templates", [])
+    tmpl_sorted = sorted(
+        tmpl,
+        key=lambda t: (
+            -(t.get("collapse_rate") or 0.0),
+            (t.get("gt_success_rate") or 0.0),
+            t.get("template_name") or "",
+        ),
+    )
+
+    lines: List[str] = []
+    lines.append("## Template Red Team Report")
+    lines.append("")
+    lines.append(f"- **Generated**: {report['meta']['generated_at']}")
+    lines.append("- **Mode**: api-probe (plugin.fetch_api_data → GTCollector)")
+    lines.append("")
+    lines.append("## Summary")
+    lines.append("")
+    lines.append("| Template | Samples | GT ok | Unique GT | Collapse | Baseline |")
+    lines.append("|---|---:|---:|---:|---:|---:|")
+    for t in tmpl_sorted:
+        name = f"{t.get('plugin_name')}/{t.get('template_name')}"
+        lines.append(
+            "| "
+            + " | ".join(
+                [
+                    name,
+                    str(t.get("samples", 0)),
+                    _pct(t.get("gt_success_rate")),
+                    str(t.get("unique_gt_values", 0)),
+                    _pct(t.get("collapse_rate")),
+                    _pct(t.get("baseline_guess_rate")),
+                ]
+            )
+            + " |"
+        )
+
+    violations = report.get("violations", []) or []
+    if violations:
+        lines.append("")
+        lines.append("## Threshold violations")
+        lines.append("")
+        for v in violations[:50]:
+            scope = v.get("scope", "global")
+            name = v.get("template", "")
+            metric = v.get("metric", "")
+            actual = v.get("actual")
+            limit = v.get("limit")
+            lines.append(f"- **{scope}** {name} `{metric}` actual={actual} limit={limit}")
+
+    stability = report.get("stability")
+    if stability and stability.get("stability_rate") is not None:
+        lines.append("")
+        lines.append("## Stability (repeat probe)")
+        lines.append("")
+        lines.append(
+            f"- **Comparable pairs**: {stability.get('comparable_pairs')}/{stability.get('total_pairs')}"
+        )
+        lines.append(
+            f"- **Stable GT**: {stability.get('stable_pairs')} "
+            f"(stability={(stability.get('stability_rate') * 100):.1f}%)"
+        )
+        if stability.get("changes"):
+            lines.append("")
+            lines.append("### Examples of changed GT (capped)")
+            for ch in stability["changes"][:10]:
+                lines.append(f"- **{ch['plugin_name']}/{ch['template_name']}** seed={ch['seed']} variant={ch['variant']}")
+                lines.append(f"  - Q: {ch['question']}")
+                lines.append(f"  - GT-A: {ch['gt_a']}")
+                lines.append(f"  - GT-B: {ch['gt_b']}")
+
+    failures = report.get("failures", [])
+    if failures:
+        lines.append("")
+        lines.append("## GT probe failures (capped)")
+        for f in failures[:20]:
+            lines.append(f"- **{f['plugin_name']}/{f['template_name']}** seed={f['seed']} variant={f['variant']}")
+            lines.append(f"  - Q: {f['question']}")
+            lines.append(f"  - Error: {f['error']}")
+
+    lines.append("")
+    lines.append("## Notes")
+    lines.append("")
+    lines.append(
+        "- **Scope**: this dashboard is a quick automated supplement; it does NOT replace CLAUDE.md mandatory red team review or eval.py template testing."
+    )
+    lines.append(
+        "- **Collapse**: computed as \\(1 - \\frac{\\#distinct\\_GT}{\\#distinct\\_validation\\_signatures}\\) over successful samples."
+    )
+    lines.append(
+        "- **Baseline**: heuristic estimate (binary/2-option → 50%, else \\(1/\\#unique\\_GT\\))."
+    )
+    lines.append(
+        "- **GT ok** depends on whether the probe URLs were sufficient to populate collected API data for the template."
+    )
+    lines.append("")
+    return "\n".join(lines)
+
+
+def write_report_files(output_dir: Path, report: Dict[str, Any]) -> Dict[str, str]:
+    """
+    Write report.json and report.md into output_dir and return their paths.
+
+    Each file is written to a temporary sibling first and then renamed into
+    place. Values JSON cannot encode (validation_info is template-defined) are
+    stringified rather than aborting the report.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    json_path = output_dir / "report.json"
+    md_path = output_dir / "report.md"
+
+    tmp_json = json_path.with_suffix(".json.tmp")
+    tmp_md = md_path.with_suffix(".md.tmp")
+
+    payload = json.dumps(report, indent=2, ensure_ascii=False, default=str)
+    tmp_json.write_text(payload + "\n", encoding="utf-8")
+    tmp_md.write_text(render_markdown(report) + "\n", encoding="utf-8")
+
+    tmp_json.replace(json_path)
+    tmp_md.replace(md_path)
+
+    return {"json": str(json_path), "md": str(md_path)}