diff --git a/.gitignore b/.gitignore
index df81afc..f3a73f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,6 +59,7 @@ logs/
 *.tmp
 *.bak
 *.orig
+redteam/
 
 # uv
 .python-version
diff --git a/liveweb_arena/core/validators/base.py b/liveweb_arena/core/validators/base.py
index 19e138b..ae07be8 100644
--- a/liveweb_arena/core/validators/base.py
+++ b/liveweb_arena/core/validators/base.py
@@ -375,6 +375,15 @@ async def fetch_cache_api_data(cls) -> Optional[Dict[str, Any]]:
         """
         return None
 
+    def get_probe_urls(self, validation_info: Dict[str, Any]) -> List[str]:
+        """
+        Optional red-team probe URL declaration (override hook for templates).
+
+        Redteam tooling can call this to get the exact URLs required to populate
+        collected API data for GT checks. The default ignores validation_info: [].
+        """
+        return []
+
     # === Step-wise Reward Interface ===
     # Templates can override these methods to provide reward-relevant information.
 
diff --git a/liveweb_arena/redteam/__init__.py b/liveweb_arena/redteam/__init__.py
new file mode 100644
index 0000000..a004762
--- /dev/null
+++ b/liveweb_arena/redteam/__init__.py
@@ -0,0 +1,8 @@
+"""Red team utilities for template quality checks."""
+
+from __future__ import annotations
+
+__all__ = ["__version__"]
+
+__version__ = "0.1.0"
+
diff --git a/liveweb_arena/redteam/__main__.py b/liveweb_arena/redteam/__main__.py
new file mode 100644
index 0000000..4edc2f7
--- /dev/null
+++ b/liveweb_arena/redteam/__main__.py
@@ -0,0 +1,353 @@
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+from liveweb_arena.core.task_manager import TaskManager
+from liveweb_arena.plugins import get_all_plugins
+from liveweb_arena.core.validators.base import get_registered_templates
+from liveweb_arena.redteam.metrics import compare_repeated_runs, compute_template_metrics
+from liveweb_arena.redteam.probe import ProbeResult, probe_task_ground_truth
+from liveweb_arena.redteam.report import build_report, write_report_files
+
+
+def _parse_templates(template_strs: Optional[List[str]]) -> Optional[List[Tuple[str, str, Optional[int]]]]:
+    """
+    Parse "plugin/template[/variant]" specs; None if empty, ValueError if malformed.
+    """
+    if not template_strs:
+        return None
+    parsed: List[Tuple[str, str, Optional[int]]] = []
+    for spec in template_strs:
+        pieces = spec.split("/")
+        if len(pieces) not in (2, 3):
+            raise ValueError(f"Invalid template format: {spec}. Use plugin/template[/variant]")
+        parsed.append((pieces[0], pieces[1], int(pieces[2]) if len(pieces) == 3 else None))
+    return parsed
+
+
+def _parse_seeds(s: str) -> List[int]:
+    """Parse a seed schedule into a non-empty list of ints.
+
+    Accepts "0:10" (0..9), "0:10:2" (0,2,4,6,8) or "1,5,9" (explicit list).
+    Raises ValueError for malformed input or a schedule that yields no seeds.
+    """
+    s = (s or "").strip()
+    if not s:
+        raise ValueError("Empty seeds")
+    if "," in s:
+        seeds = [int(x.strip()) for x in s.split(",") if x.strip()]
+    elif ":" in s:
+        parts = [int(p.strip()) for p in s.split(":")]
+        if len(parts) not in (2, 3):
+            raise ValueError("Invalid seed range format")
+        start, end = parts[0], parts[1]
+        step = parts[2] if len(parts) == 3 else 1
+        if step <= 0:
+            raise ValueError("Seed step must be > 0")
+        seeds = list(range(start, end, step))
+    else:
+        seeds = [int(s)]
+    if not seeds:  # e.g. "," or "5:5" would otherwise yield an empty report
+        raise ValueError(f"Seed schedule {s!r} produces no seeds")
+    return seeds
+
+def _resolve_all_templates(
+    *,
+    plugins_filter: Optional[List[str]] = None,
+    include_variants: bool = False,
+) -> List[Tuple[str, str, Optional[int]]]:
+    """
+    Return sorted (plugin, registered_name, None) for registered templates with a cache source,
+    limited to `plugins_filter` when given. `include_variants` is unused: variant is always None.
+    """
+    resolved: List[Tuple[str, str, Optional[int]]] = []
+    registry = get_registered_templates()
+    for registered_name, cls in registry.items():
+        try:
+            plugin = cls.get_cache_source()
+        except Exception:
+            plugin = None
+        if not plugin:
+            continue
+        if plugins_filter and plugin not in plugins_filter:
+            continue
+        # Use the registry name for template lookup (BasePlugin.generate_task supports it).
+        resolved.append((plugin, registered_name, None))
+    # Sort by (plugin, name, variant) so the output order is stable for CI.
+    resolved.sort(key=lambda t: (t[0], t[1], t[2] if t[2] is not None else -1))
+    return resolved
+
+
+async def _run_probe_once(
+    *,
+    templates: List[Tuple[str, str, Optional[int]]],
+    seeds: List[int],
+) -> List[ProbeResult]:
+    task_manager = TaskManager(get_all_plugins())
+    all_results: List[ProbeResult] = []
+
+    for seed in seeds:
+        # For deterministic coverage: 1 subtask per template spec.
+        for plugin, template_name, variant in templates:
+            task = await task_manager.generate_composite_task(
+                seed=seed,
+                num_subtasks=1,
+                templates=[(plugin, template_name, variant)],
+            )
+            st = task.subtasks[0]
+            if st.question is None:
+                raise RuntimeError("Subtask missing GeneratedQuestion")
+            q = st.question
+            results = await probe_task_ground_truth(
+                task_manager=task_manager,
+                subtasks=[st],
+                questions=[q],
+                plugin_names=[plugin],
+                seed=seed,
+                variant=variant,
+            )
+            all_results.extend(results)
+
+    return all_results
+
+
+async def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=(
+            "LiveWeb Arena - Template Red Team Dashboard "
+            "(supplementary API probe, not a replacement for CLAUDE.md red team review or eval.py)."
+        )
+    )
+    parser.add_argument(
+        "--templates",
+        type=str,
+        nargs="+",
+        required=False,
+        help='Templates to probe: "plugin/template[/variant]"',
+    )
+    parser.add_argument(
+        "--all-templates",
+        action="store_true",
+        help="Probe all registered templates with known plugin source.",
+    )
+    parser.add_argument(
+        "--plugins",
+        type=str,
+        nargs="+",
+        default=None,
+        help='Optional filter for --all-templates (e.g., "coingecko stooq")',
+    )
+    parser.add_argument(
+        "--list-templates",
+        action="store_true",
+        help="List resolved templates (no probing) and exit.",
+    )
+    parser.add_argument(
+        "--seeds",
+        type=str,
+        default="0:25",
+        help='Seed schedule: "0:25", "0:100:2", or "1,2,3"',
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=1,
+        help="Number of probe repeats (>=1). If 2, stability is reported.",
+    )
+    parser.add_argument(
+        "--repeat-delay-s",
+        type=float,
+        default=0.0,
+        help="Delay between repeats (seconds).",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Directory to write report.{json,md} (default: ./redteam/<timestamp>/)",
+    )
+    parser.add_argument(
+        "--fail-on-violation",
+        action="store_true",
+        help="Exit non-zero if any threshold is violated (CI mode).",
+    )
+    parser.add_argument(
+        "--min-gt-success",
+        type=float,
+        default=None,
+        help="Minimum GT success rate (0..1).",
+    )
+    parser.add_argument(
+        "--max-collapse",
+        type=float,
+        default=None,
+        help="Maximum allowed collapse rate (0..1).",
+    )
+    parser.add_argument(
+        "--max-baseline",
+        type=float,
+        default=None,
+        help="Maximum allowed baseline guess rate (0..1).",
+    )
+    parser.add_argument(
+        "--min-stability",
+        type=float,
+        default=None,
+        help="Minimum allowed stability rate (0..1). Requires --repeat >= 2.",
+    )
+
+    args = parser.parse_args()
+
+    # Ensure plugins/templates are loaded before resolving templates.
+    get_all_plugins()
+
+    if args.all_templates:
+        templates = _resolve_all_templates(plugins_filter=args.plugins)
+        if not templates:
+            raise ValueError("No templates resolved via --all-templates")
+    else:
+        templates = _parse_templates(args.templates)
+        if not templates:
+            raise ValueError("No templates provided (use --templates or --all-templates)")
+
+    if args.list_templates:
+        # Print in a stable order for scripting/CI.
+        for plugin, template_name, variant in templates:
+            if variant is None:
+                print(f"{plugin}/{template_name}")
+            else:
+                print(f"{plugin}/{template_name}/{variant}")
+        return 0
+
+    seeds = _parse_seeds(args.seeds)
+    if args.repeat < 1:
+        raise ValueError("--repeat must be >= 1")
+
+    repeats: List[List[ProbeResult]] = []
+    for i in range(args.repeat):
+        runs = await _run_probe_once(templates=templates, seeds=seeds)
+        repeats.append(runs)
+        if i + 1 < args.repeat and args.repeat_delay_s > 0:
+            await asyncio.sleep(args.repeat_delay_s)
+
+    samples = repeats[0]
+
+    # Group by template for metrics.
+    by_key: Dict[str, List[ProbeResult]] = {}
+    for r in samples:
+        key = f"{r.plugin_name}/{r.template_name}"
+        by_key.setdefault(key, []).append(r)
+
+    template_metrics = [compute_template_metrics(v) for v in by_key.values()]
+
+    stability = None
+    if len(repeats) >= 2:
+        # Compare first two repeats, same ordering by construction.
+        stability = compare_repeated_runs(repeats[0], repeats[1])
+
+    violations: List[Dict[str, Any]] = []
+    def _add_violation(*, scope: str, template: str, metric: str, actual: Any, limit: Any):
+        violations.append(
+            {
+                "scope": scope,
+                "template": template,
+                "metric": metric,
+                "actual": actual,
+                "limit": limit,
+            }
+        )
+
+    # Per-template thresholds.
+    for tm in template_metrics:
+        name = f"{tm.plugin_name}/{tm.template_name}"
+        if args.min_gt_success is not None and tm.gt_success_rate < args.min_gt_success:
+            _add_violation(
+                scope="template",
+                template=name,
+                metric="gt_success_rate",
+                actual=tm.gt_success_rate,
+                limit=args.min_gt_success,
+            )
+        if args.max_collapse is not None and tm.collapse_rate > args.max_collapse:
+            _add_violation(
+                scope="template",
+                template=name,
+                metric="collapse_rate",
+                actual=tm.collapse_rate,
+                limit=args.max_collapse,
+            )
+        if args.max_baseline is not None and tm.baseline_guess_rate > args.max_baseline:
+            _add_violation(
+                scope="template",
+                template=name,
+                metric="baseline_guess_rate",
+                actual=tm.baseline_guess_rate,
+                limit=args.max_baseline,
+            )
+
+    # Global stability threshold (repeat probe).
+    if args.min_stability is not None:
+        if args.repeat < 2:
+            _add_violation(
+                scope="global",
+                template="",
+                metric="stability_rate",
+                actual=None,
+                limit=args.min_stability,
+            )
+        else:
+            sr = (stability or {}).get("stability_rate")
+            if sr is None or sr < args.min_stability:
+                _add_violation(
+                    scope="global",
+                    template="",
+                    metric="stability_rate",
+                    actual=sr,
+                    limit=args.min_stability,
+                )
+
+    # Output directory
+    if args.output_dir:
+        out_dir = Path(args.output_dir)
+    else:
+        import datetime as _dt
+
+        ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d_%H%M%S")
+        out_dir = Path.cwd() / "redteam" / ts
+
+    report = build_report(
+        templates=template_metrics,
+        samples=samples,
+        stability=stability,
+        violations=violations,
+        args={
+            "templates": args.templates,
+            "all_templates": args.all_templates,
+            "plugins": args.plugins,
+            "seeds": args.seeds,
+            "repeat": args.repeat,
+            "repeat_delay_s": args.repeat_delay_s,
+            "fail_on_violation": args.fail_on_violation,
+            "min_gt_success": args.min_gt_success,
+            "max_collapse": args.max_collapse,
+            "max_baseline": args.max_baseline,
+            "min_stability": args.min_stability,
+        },
+    )
+    paths = write_report_files(out_dir, report)
+
+    print(paths["md"])
+    print(paths["json"])
+    if args.fail_on_violation and violations:
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(asyncio.run(main()))
+
diff --git a/liveweb_arena/redteam/metrics.py b/liveweb_arena/redteam/metrics.py
new file mode 100644
index 0000000..259daa6
--- /dev/null
+++ b/liveweb_arena/redteam/metrics.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+import math
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
+from liveweb_arena.redteam.probe import ProbeResult, validation_info_signature
+
+
+def _normalize_text(s: str) -> str:
+    return re.sub(r"\s+", " ", (s or "").strip()).casefold()
+
+
+def _looks_boolean(v: str) -> bool:
+    """Return True when *v*, after _normalize_text, is one of yes/no/true/false."""
+    return _normalize_text(v) in {"yes", "no", "true", "false"}
+
+
+def _extract_quoted_options(question_text: str) -> Optional[List[str]]:
+    """Return the first two non-blank double-quoted phrases, or None if fewer."""
+    found = re.findall(r"\"([^\"]{1,200})\"", question_text or "")
+    cleaned = [stripped for stripped in map(str.strip, found) if stripped]
+    if len(cleaned) < 2:
+        return None
+    return cleaned[:2]
+
+
+def estimate_random_baseline(sample: Sequence[ProbeResult]) -> float:
+    """
+    Estimate random baseline guessability for a template's answer format.
+
+    Heuristic (documented in report):
+    - If binary/boolean: 0.5
+    - If question appears to be choosing between 2 quoted options: 0.5
+    - Else: 1 / (# unique GT values in sample) with floor to avoid division by zero
+    """
+    if not sample:
+        return 1.0
+
+    # Boolean / 2-option detection
+    for r in sample:
+        if r.gt_ok and r.gt_value is not None and _looks_boolean(r.gt_value):
+            return 0.5
+        opts = _extract_quoted_options(r.question_text)
+        if opts is not None and " or " in (r.question_text or "").lower():
+            return 0.5
+
+    gts = sorted({_normalize_text(r.gt_value or "") for r in sample if r.gt_ok and r.gt_value is not None})
+    if not gts:
+        return 1.0
+    return 1.0 / max(1, len(gts))
+
+
+@dataclass(frozen=True)
+class TemplateMetrics:
+    template_name: str
+    plugin_name: str
+    samples: int
+    gt_success_rate: float
+    unique_questions: int
+    unique_gt_values: int
+    collapse_rate: float
+    baseline_guess_rate: float
+
+
+def compute_template_metrics(sample: Sequence[ProbeResult]) -> TemplateMetrics:
+    if not sample:
+        raise ValueError("No samples")
+
+    template_name = sample[0].template_name
+    plugin_name = sample[0].plugin_name
+
+    samples_n = len(sample)
+    ok = [r for r in sample if r.gt_ok and r.gt_value is not None]
+    gt_success_rate = len(ok) / samples_n if samples_n else 0.0
+
+    uniq_q = len({_normalize_text(r.question_text) for r in sample})
+    uniq_gt = len({_normalize_text(r.gt_value or "") for r in ok})
+
+    # Cross-parameter collapse: distinct validation_info signatures mapping to distinct GT values.
+    sig_to_gt: Dict[str, str] = {}
+    sigs = set()
+    for r in ok:
+        sig = validation_info_signature(r.validation_info)
+        sigs.add(sig)
+        sig_to_gt.setdefault(sig, _normalize_text(r.gt_value or ""))
+
+    distinct_param_sets = len(sigs)
+    distinct_outputs = len(set(sig_to_gt.values()))
+    if distinct_param_sets <= 1:
+        collapse_rate = 0.0
+    else:
+        # collapse = 1 - (distinct outputs / distinct param sets)
+        collapse_rate = 1.0 - (distinct_outputs / distinct_param_sets)
+
+    baseline = estimate_random_baseline(sample)
+
+    return TemplateMetrics(
+        template_name=template_name,
+        plugin_name=plugin_name,
+        samples=samples_n,
+        gt_success_rate=gt_success_rate,
+        unique_questions=uniq_q,
+        unique_gt_values=uniq_gt,
+        collapse_rate=collapse_rate,
+        baseline_guess_rate=baseline,
+    )
+
+
+def compare_repeated_runs(
+    run_a: Sequence[ProbeResult],
+    run_b: Sequence[ProbeResult],
+) -> Dict[str, Any]:
+    """
+    Compare GT stability across two probes for the same (seed, variant) schedule.
+    """
+    if len(run_a) != len(run_b):
+        raise ValueError("Run sizes differ; cannot compare")
+
+    total = len(run_a)
+    comparable = 0
+    same = 0
+    changes: List[Dict[str, Any]] = []
+
+    for a, b in zip(run_a, run_b):
+        if not (a.gt_ok and b.gt_ok and a.gt_value is not None and b.gt_value is not None):
+            continue
+        comparable += 1
+        if _normalize_text(a.gt_value) == _normalize_text(b.gt_value):
+            same += 1
+        else:
+            changes.append(
+                {
+                    "template_name": a.template_name,
+                    "plugin_name": a.plugin_name,
+                    "seed": a.seed,
+                    "variant": a.variant,
+                    "question": a.question_text,
+                    "gt_a": a.gt_value,
+                    "gt_b": b.gt_value,
+                }
+            )
+
+    return {
+        "total_pairs": total,
+        "comparable_pairs": comparable,
+        "stable_pairs": same,
+        "stability_rate": (same / comparable) if comparable else None,
+        "changes": changes[:50],  # cap
+    }
+
diff --git a/liveweb_arena/redteam/probe.py b/liveweb_arena/redteam/probe.py
new file mode 100644
index 0000000..9eb64b1
--- /dev/null
+++ b/liveweb_arena/redteam/probe.py
@@ -0,0 +1,197 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+from urllib.parse import quote_plus
+
+from liveweb_arena.core.gt_collector import GTCollector, set_current_gt_collector
+from liveweb_arena.core.task_manager import TaskManager
+from liveweb_arena.core.validators.base import GeneratedQuestion, get_template
+from liveweb_arena.plugins.base import SubTask
+
+
+@dataclass(frozen=True)
+class ProbeResult:
+    template_name: str
+    plugin_name: str
+    seed: int
+    variant: Optional[int]
+    question_text: str
+    validation_info: Dict[str, Any]
+    probe_urls: List[str]
+    gt_ok: bool
+    gt_value: Optional[str]
+    gt_error: Optional[str]
+
+
+def _safe_json_dumps(x: Any) -> str:
+    try:
+        return json.dumps(x, ensure_ascii=False, sort_keys=True, default=str)
+    except Exception:
+        return json.dumps(str(x), ensure_ascii=False)
+
+
+def _infer_probe_urls(plugin_name: str, question: GeneratedQuestion) -> List[str]:
+    """
+    Produce a minimal set of URLs likely sufficient for GT.
+
+    Sources, in order: the question's start URL, template-declared probe URLs
+    (if the template provides any), then per-plugin heuristics based on common
+    validation_info fields. Blank/non-string entries and duplicates are dropped.
+    """
+    urls: List[str] = []
+    if question.start_url:
+        urls.append(question.start_url)
+
+    template_cls = get_template(question.template_name)
+    if template_cls is not None:
+        try:
+            template = template_cls()
+            declared_urls = template.get_probe_urls(question.validation_info or {})
+            if declared_urls:
+                urls.extend(declared_urls)
+        except Exception:
+            # Keep probe resilient; fallback heuristics still apply.
+            pass
+
+    vi = question.validation_info or {}
+
+    # Open Library templates often require collecting works for both queries.
+    if plugin_name == "openlibrary":
+        a = vi.get("book_a_query")
+        b = vi.get("book_b_query")
+        subject = vi.get("subject")
+        if isinstance(a, str) and a.strip():
+            urls.append(f"https://openlibrary.org/search?q={quote_plus(a)}")
+        if isinstance(b, str) and b.strip():
+            urls.append(f"https://openlibrary.org/search?q={quote_plus(b)}")
+        if isinstance(subject, str) and subject.strip():
+            urls.append(f"https://openlibrary.org/subjects/{quote_plus(subject)}")
+
+    # ArXiv listing tasks commonly depend on category.
+    if plugin_name == "arxiv":
+        category = vi.get("category")
+        if isinstance(category, str) and category.strip():
+            urls.append(f"https://arxiv.org/list/{category}/recent")
+
+    # Hacker News tasks generally depend on homepage (declared URLs may hold non-strings).
+    if plugin_name == "hackernews":
+        if not any(isinstance(u, str) and "news.ycombinator.com" in u for u in urls):
+            urls.append("https://news.ycombinator.com/")
+
+    # Stooq tasks generally depend on quote pages; comparisons need every symbol probed.
+    if plugin_name == "stooq":
+        for symbol in (vi.get(k) for k in ("symbol", "symbol_a", "symbol_b")):
+            if isinstance(symbol, str) and symbol.strip():
+                urls.append(f"https://stooq.com/q/?s={quote_plus(symbol)}")
+
+    # De-duplicate while preserving order.
+    seen = set()
+    out = []
+    for u in urls:
+        if not isinstance(u, str) or not u.strip():
+            continue
+        if u in seen:
+            continue
+        seen.add(u)
+        out.append(u)
+    return out
+
+
+async def probe_task_ground_truth(
+    *,
+    task_manager: TaskManager,
+    subtasks: Sequence[SubTask],
+    questions: Sequence[GeneratedQuestion],
+    plugin_names: Sequence[str],
+    seed: int,
+    variant: Optional[int],
+) -> List[ProbeResult]:
+    """
+    Probe GT for subtasks without running the LLM agent/browser.
+
+    Strategy:
+    - Create a GTCollector for the subtasks
+    - For each subtask, call plugin.fetch_api_data() for probe URLs
+    - Feed that api_data into GTCollector.on_page_visit()
+    - Call GTCollector.fetch_remaining_api_gt() for template.get_ground_truth()
+
+    `subtasks`, `questions` and `plugin_names` are parallel sequences; `seed`
+    and `variant` are only recorded on the results. Returns one ProbeResult
+    per subtask: gt_ok=True with the stringified GT, or gt_ok=False with the
+    collector's failure reason when no GT was produced. Raises RuntimeError
+    if a probed URL that needs API data yields an empty response.
+
+    NOTE: This is a quick API-level supplement. It does NOT replace full
+    eval.py runs or the mandatory CLAUDE.md red team review process.
+    """
+    gt_collector = GTCollector(subtasks=list(subtasks), task_manager=task_manager)
+    set_current_gt_collector(gt_collector)
+    try:
+        # Infer the probe URLs once per subtask so the URLs reported in each
+        # ProbeResult are exactly the ones that were probed.
+        probe_urls_per_subtask: List[List[str]] = []
+
+        # Collect page-bound api_data by calling plugins directly (API semantic probe).
+        for _st, q, plugin_name in zip(subtasks, questions, plugin_names):
+            plugin = task_manager.get_plugin(plugin_name)
+            probe_urls = _infer_probe_urls(plugin_name, q)
+            probe_urls_per_subtask.append(probe_urls)
+            for url in probe_urls:
+                # Some navigation pages intentionally don't have api_data; respect plugin.
+                if not plugin.needs_api_data(url):
+                    continue
+                api_data = await plugin.fetch_api_data(url)
+                if not api_data:
+                    raise RuntimeError(
+                        f"Probe API returned empty data for {plugin_name} url={url}"
+                    )
+                # content isn't used for GT in this flow; keep empty.
+                await gt_collector.on_page_visit(url, content="", api_data=api_data)
+
+        await gt_collector.fetch_remaining_api_gt()
+
+        results: List[ProbeResult] = []
+        # One result per subtask, reusing the URL list inferred above.
+        for st, q, plugin_name, probe_urls in zip(
+            subtasks, questions, plugin_names, probe_urls_per_subtask
+        ):
+            # A None GT means the collector could not compute ground truth.
+            gt_value = gt_collector.get_gt_for_subtask(st)
+            gt_ok = gt_value is not None
+            results.append(
+                ProbeResult(
+                    template_name=q.template_name,
+                    plugin_name=plugin_name,
+                    seed=seed,
+                    variant=variant,
+                    question_text=q.question_text,
+                    validation_info=dict(q.validation_info or {}),
+                    # Copy so results never share a mutable list.
+                    probe_urls=list(probe_urls),
+                    gt_ok=gt_ok,
+                    gt_value=str(gt_value) if gt_ok else None,
+                    # Only ask the collector for a reason when GT is missing.
+                    gt_error=None if gt_ok else gt_collector.get_failure_reason(st),
+                )
+            )
+        return results
+    finally:
+        set_current_gt_collector(None)
+        gt_collector.cleanup()
+
+
+def validation_info_signature(validation_info: Dict[str, Any]) -> str:
+    """
+    Serialize validation_info into a key-sorted JSON string used to tell
+    parameter sets apart; the template-name keys are left out of it.
+    """
+    ignored = ("template_name", "_template_name")
+    payload: Any = validation_info
+    if isinstance(validation_info, dict):
+        payload = {
+            key: value for key, value in validation_info.items() if key not in ignored
+        }
+    return _safe_json_dumps(payload)
+
diff --git a/liveweb_arena/redteam/report.py b/liveweb_arena/redteam/report.py
new file mode 100644
index 0000000..e7fba24
--- /dev/null
+++ b/liveweb_arena/redteam/report.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence
+
+from liveweb_arena.redteam.metrics import TemplateMetrics
+from liveweb_arena.redteam.probe import ProbeResult
+
+
+def build_report(
+    *,
+    templates: Sequence[TemplateMetrics],
+    samples: Sequence[ProbeResult],
+    stability: Optional[Dict[str, Any]],
+    violations: Optional[List[Dict[str, Any]]] = None,
+    args: Dict[str, Any],
+) -> Dict[str, Any]:
+    """
+    Assemble the report dict: run metadata (UTC timestamp, tool name, args),
+    per-template metrics as plain dicts, the stability summary, violations
+    ([] when None), and up to 100 failed probe samples (gt_ok is False).
+    """
+    return {
+        "meta": {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "tool": "liveweb_arena.redteam",
+            "args": args,
+        },
+        "templates": [asdict(t) for t in templates],
+        "stability": stability,
+        "violations": violations or [],
+        "failures": [
+            {
+                "plugin_name": r.plugin_name,
+                "template_name": r.template_name,
+                "seed": r.seed,
+                "variant": r.variant,
+                "question": r.question_text,
+                "validation_info": r.validation_info,
+                "probe_urls": r.probe_urls,
+                "error": r.gt_error,
+            }
+            for r in samples
+            if not r.gt_ok
+        ][:100],
+    }
+
+
+def render_markdown(report: Dict[str, Any]) -> str:
+    tmpl = report.get("templates", [])
+    tmpl_sorted = sorted(
+        tmpl,
+        key=lambda t: (
+            -(t.get("collapse_rate") or 0.0),
+            (t.get("gt_success_rate") or 0.0),
+            t.get("template_name") or "",
+        ),
+    )
+
+    lines: List[str] = []
+    lines.append("## Template Red Team Report")
+    lines.append("")
+    lines.append(f"- **Generated**: {report['meta']['generated_at']}")
+    lines.append(f"- **Mode**: api-probe (plugin.fetch_api_data → GTCollector)")
+    lines.append("")
+    lines.append("## Summary")
+    lines.append("")
+    lines.append("| Template | Samples | GT ok | Unique GT | Collapse | Baseline |")
+    lines.append("|---|---:|---:|---:|---:|---:|")
+    for t in tmpl_sorted:
+        name = f"{t.get('plugin_name')}/{t.get('template_name')}"
+        lines.append(
+            "| "
+            + " | ".join(
+                [
+                    name,
+                    str(t.get("samples", 0)),
+                    f"{(t.get('gt_success_rate', 0.0) * 100):.1f}%",
+                    str(t.get("unique_gt_values", 0)),
+                    f"{(t.get('collapse_rate', 0.0) * 100):.1f}%",
+                    f"{(t.get('baseline_guess_rate', 0.0) * 100):.1f}%",
+                ]
+            )
+            + " |"
+        )
+
+    violations = report.get("violations", []) or []
+    if violations:
+        lines.append("")
+        lines.append("## Threshold violations")
+        lines.append("")
+        for v in violations[:50]:
+            scope = v.get("scope", "global")
+            name = v.get("template", "")
+            metric = v.get("metric", "")
+            actual = v.get("actual")
+            limit = v.get("limit")
+            lines.append(f"- **{scope}** {name} `{metric}` actual={actual} limit={limit}")
+
+    stability = report.get("stability")
+    if stability and stability.get("stability_rate") is not None:
+        lines.append("")
+        lines.append("## Stability (repeat probe)")
+        lines.append("")
+        lines.append(
+            f"- **Comparable pairs**: {stability.get('comparable_pairs')}/{stability.get('total_pairs')}"
+        )
+        lines.append(
+            f"- **Stable GT**: {stability.get('stable_pairs')} "
+            f"(stability={(stability.get('stability_rate') * 100):.1f}%)"
+        )
+        if stability.get("changes"):
+            lines.append("")
+            lines.append("### Examples of changed GT (capped)")
+            for ch in stability["changes"][:10]:
+                lines.append(f"- **{ch['plugin_name']}/{ch['template_name']}** seed={ch['seed']} variant={ch['variant']}")
+                lines.append(f"  - Q: {ch['question']}")
+                lines.append(f"  - GT-A: {ch['gt_a']}")
+                lines.append(f"  - GT-B: {ch['gt_b']}")
+
+    failures = report.get("failures", [])
+    if failures:
+        lines.append("")
+        lines.append("## GT probe failures (capped)")
+        for f in failures[:20]:
+            lines.append(f"- **{f['plugin_name']}/{f['template_name']}** seed={f['seed']} variant={f['variant']}")
+            lines.append(f"  - Q: {f['question']}")
+            lines.append(f"  - Error: {f['error']}")
+
+    lines.append("")
+    lines.append("## Notes")
+    lines.append("")
+    lines.append(
+        "- **Scope**: this dashboard is a quick automated supplement; it does NOT replace CLAUDE.md mandatory red team review or eval.py template testing."
+    )
+    lines.append(
+        "- **Collapse**: computed as \\(1 - \\frac{\\#distinct\\_GT}{\\#distinct\\_validation\\_signatures}\\) over successful samples."
+    )
+    lines.append(
+        "- **Baseline**: heuristic estimate (binary/2-option → 50%, else \\(1/\\#unique\\_GT\\))."
+    )
+    lines.append(
+        "- **GT ok** depends on whether the probe URLs were sufficient to populate collected API data for the template."
+    )
+    lines.append("")
+    return "\n".join(lines)
+
+
+def write_report_files(output_dir: Path, report: Dict[str, Any]) -> Dict[str, str]:
+    """Write report.json and report.md into output_dir (temp file, then rename); return paths."""
+    output_dir.mkdir(parents=True, exist_ok=True)
+    json_path = output_dir / "report.json"
+    md_path = output_dir / "report.md"
+    tmp_json = json_path.with_suffix(".json.tmp")
+    tmp_md = md_path.with_suffix(".md.tmp")
+
+    # default=str keeps non-JSON-native values (e.g. in validation_info) from aborting the write.
+    json_text = json.dumps(report, indent=2, ensure_ascii=False, default=str)
+    tmp_json.write_text(json_text + "\n", encoding="utf-8")
+    tmp_md.write_text(render_markdown(report) + "\n", encoding="utf-8")
+    tmp_json.replace(json_path)
+    tmp_md.replace(md_path)
+    return {"json": str(json_path), "md": str(md_path)}
+