diff --git a/autoany-core/src/comparative_evaluator.rs b/autoany-core/src/comparative_evaluator.rs
new file mode 100644
index 0000000..338ecde
--- /dev/null
+++ b/autoany-core/src/comparative_evaluator.rs
@@ -0,0 +1,43 @@
+//! Comparative evaluation for subjective domains.
+//!
+//! Unlike [`Evaluator`](crate::evaluator::Evaluator), which scores a single
+//! artifact in isolation, `ComparativeEvaluator` compares two artifacts to
+//! determine which is better. This is the core abstraction that enables
+//! autoreason-style adversarial debate evaluation.
+
+use crate::error::Result;
+use crate::types::ComparisonOutcome;
+
+/// Compares two artifacts to determine which is better.
+///
+/// Used when no objective scalar metric exists. The comparison IS the
+/// evaluation — there is no separate scoring step.
+///
+/// Each call to `compare` should be deterministic in procedure (same debate
+/// config, same rubric) even though the LLM outputs may vary. `compare` takes
+/// `&self`: the evaluator is meant to stay immutable — EGRI Law 3 still holds.
+///
+/// # Context Isolation
+///
+/// Implementations must ensure that each evaluation phase (critic, reviser,
+/// synthesizer, judge) uses a fresh LLM context with no shared conversation
+/// history. This is what eliminates sycophancy and anchoring biases.
+pub trait ComparativeEvaluator {
+    /// The artifact type to compare.
+    type Artifact;
+
+    /// Compare the incumbent and candidate artifacts, returning which is better.
+    ///
+    /// - `task`: the original task description (provides context for judges)
+    /// - `incumbent`: the current best artifact
+    /// - `candidate`: the proposed replacement
+    ///
+    /// Returns a [`ComparisonOutcome`]: winner, judge-agreement confidence, and `round` record.
+    /// NOTE(review): `Winner` has no candidate variant — confirm how a candidate win is encoded.
+    fn compare(
+        &self,
+        task: &str,
+        incumbent: &Self::Artifact,
+        candidate: &Self::Artifact,
+    ) -> Result<ComparisonOutcome>;
+}
diff --git a/autoany-core/src/lib.rs b/autoany-core/src/lib.rs
index ef92fd2..2d0a4e8 100644
--- a/autoany-core/src/lib.rs
+++ b/autoany-core/src/lib.rs
@@ -15,6 +15,7 @@
 //! ```
 
 pub mod budget;
+pub mod comparative_evaluator;
 pub mod constraint;
 pub mod dead_ends;
 pub mod error;
@@ -22,6 +23,7 @@ pub mod evaluator;
 pub mod executor;
 pub mod inheritance;
 pub mod ledger;
+pub mod llm_backend;
 pub mod loop_engine;
 pub mod promotion;
 pub mod proposer;
diff --git a/autoany-core/src/llm_backend.rs b/autoany-core/src/llm_backend.rs
new file mode 100644
index 0000000..b8e9380
--- /dev/null
+++ b/autoany-core/src/llm_backend.rs
@@ -0,0 +1,39 @@
+//! Stateless LLM call abstraction for autoreason debate.
+//!
+//! Each call to [`LlmBackend::generate`] is independent — no conversation
+//! history is carried between calls. This enforces the context isolation
+//! that makes autoreason's adversarial debate effective against sycophancy.
+
+use crate::error::Result;
+
+/// Abstraction over LLM API calls.
+///
+/// Each call is stateless — no conversation history — which enforces the
+/// context isolation autoreason relies on: every debate phase (critic, reviser,
+/// synthesizer, judge) gets a fresh context. Backends must be `Send + Sync`.
+///
+/// # Implementation Notes
+///
+/// Implementors should:
+/// - Never maintain conversation state between calls
+/// - Support model selection for judge diversity (override `generate_with_model`)
+/// - Handle rate limiting and retries internally
+/// - Return errors for API failures (do not silently fall back)
+pub trait LlmBackend: Send + Sync {
+    /// Generate a completion from a system prompt and user message.
+    ///
+    /// Each call is independent — no shared conversation state.
+    fn generate(&self, system: &str, user: &str) -> Result<String>;
+
+    /// Generate with a specific model (for judge diversity).
+    ///
+    /// When `model_diversity` is enabled in [`DebateConfig`](crate::types::DebateConfig),
+    /// different judges can use different models to reduce correlated biases.
+    ///
+    /// The default implementation ignores `model` and delegates to
+    /// [`generate`](Self::generate); multi-model backends must override it.
+    fn generate_with_model(&self, model: &str, system: &str, user: &str) -> Result<String> {
+        let _ = model; // deliberately unused by the default implementation
+        self.generate(system, user)
+    }
+}
diff --git a/autoany-core/src/selector.rs b/autoany-core/src/selector.rs
index 2342fe5..cfeeac3 100644
--- a/autoany-core/src/selector.rs
+++ b/autoany-core/src/selector.rs
@@ -125,6 +125,15 @@ impl Selector for DefaultSelector {
                     new_state_id: None,
                 })
             }
+            PromotionPolicy::Comparative => {
+                // Comparative evaluation is handled by DebateLoop, not DefaultSelector.
+                // If this is reached, the problem is misconfigured.
+                Ok(Decision {
+                    action: Action::Escalated,
+                    reason: "comparative policy requires DebateLoop, not EgriLoop".into(),
+                    new_state_id: None,
+                })
+            }
         }
     }
 }
diff --git a/autoany-core/src/spec.rs b/autoany-core/src/spec.rs
index c817f6b..1176be0 100644
--- a/autoany-core/src/spec.rs
+++ b/autoany-core/src/spec.rs
@@ -1,6 +1,6 @@
 use serde::{Deserialize, Serialize};
 
-use crate::types::{AutonomyMode, Direction};
+use crate::types::{AutonomyMode, DebateConfig, Direction};
 
 /// Problem specification — the compiled EGRI instance definition.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -16,6 +16,12 @@ pub struct ProblemSpec {
     pub autonomy: Autonomy,
     #[serde(default)]
     pub search: Option<Search>,
+    /// Debate configuration (required when `promotion.policy == "comparative"`).
+    #[serde(default)]
+    pub debate: Option<DebateSpec>,
+    /// Task description providing context for debate evaluation.
+    #[serde(default)]
+    pub task: Option<TaskSpec>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -94,6 +100,9 @@ pub enum PromotionPolicy {
     Pareto,
     Threshold,
     HumanGate,
+    /// Autoreason debate evaluation — uses adversarial multi-agent debate
+    /// with blind judge panel instead of scalar comparison.
+    Comparative,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -114,3 +123,24 @@ pub struct Search {
 fn default_proposer() -> String {
     "llm".to_string()
 }
+
+/// Debate configuration for comparative evaluation.
+///
+/// Required when `promotion.policy` is `comparative`. Wraps [`DebateConfig`];
+/// no spec-level fields are added yet, so it carries exactly the same keys.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DebateSpec {
+    /// Core debate protocol configuration, flattened into this level when (de)serialized.
+    #[serde(flatten)]
+    pub config: DebateConfig,
+}
+
+/// Task description providing context for debate evaluation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TaskSpec {
+    /// The original task brief — what the artifact is supposed to accomplish (no default).
+    pub description: String,
+    /// Evaluation criteria for judges; supplements the rubric. Empty when omitted.
+    #[serde(default)]
+    pub criteria: Vec<String>,
+}
diff --git a/autoany-core/src/types.rs b/autoany-core/src/types.rs
index 558521f..41254cf 100644
--- a/autoany-core/src/types.rs
+++ b/autoany-core/src/types.rs
@@ -172,6 +172,162 @@ pub enum AutonomyMode {
     Portfolio,
 }
 
+// ---------------------------------------------------------------------------
+// Autoreason / debate types
+// ---------------------------------------------------------------------------
+
+/// Configuration for the autoreason debate protocol; every key may be omitted when deserializing.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DebateConfig {
+    /// Number of judges in the panel (should be odd for clean majorities). Defaults to 3.
+    #[serde(default = "default_judge_count")]
+    pub judge_count: u32,
+    /// Consecutive incumbent wins required to declare convergence. Defaults to 3.
+    #[serde(default = "default_convergence_threshold")]
+    pub convergence_threshold: u32,
+    /// Whether to randomize version labels per judge (strongly recommended). Defaults to `true`.
+    #[serde(default = "default_true")]
+    pub label_randomization: bool,
+    /// Whether to use different LLM models/providers for different judges. Defaults to `false`.
+    #[serde(default)]
+    pub model_diversity: bool,
+    /// Optional rubric for critique and judging (domain-specific); not serialized when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub rubric: Option<String>,
+    /// Maximum tokens per phase (controls cost); not serialized when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_tokens_per_phase: Option<u32>,
+}
+
+fn default_judge_count() -> u32 {
+    3 // serde fallback for `DebateConfig::judge_count`
+}
+
+fn default_convergence_threshold() -> u32 {
+    3 // serde fallback for `DebateConfig::convergence_threshold`
+}
+
+fn default_true() -> bool {
+    true // serde fallback for `DebateConfig::label_randomization`
+}
+
+impl Default for DebateConfig {
+    fn default() -> Self {
+        Self {
+            judge_count: default_judge_count(),
+            convergence_threshold: default_convergence_threshold(),
+            label_randomization: default_true(), // same helper serde uses for a missing key
+            model_diversity: false,
+            rubric: None,
+            max_tokens_per_phase: None,
+        }
+    }
+}
+
+/// Which version won a debate round (serialized in `snake_case`, e.g. `"incumbent"`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Winner {
+    /// The incumbent (Version A) — no improvement found.
+    Incumbent,
+    /// The revision (Version B) — critique-driven improvement.
+    Revision,
+    /// The synthesis (Version AB) — combined strengths of A and B.
+    Synthesis,
+}
+
+impl std::fmt::Display for Winner {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Self::Incumbent => "incumbent", // lowercase, like the serde `snake_case` names
+            Self::Revision => "revision",
+            Self::Synthesis => "synthesis",
+        })
+    }
+}
+
+/// Severity of a critique issue, declared from most to least severe (no `Ord` is derived).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum IssueSeverity {
+    /// Undermines the core argument.
+    Critical,
+    /// Significant weakness.
+    Major,
+    /// Improvement opportunity.
+    Minor,
+}
+
+/// A single issue identified during critique.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CritiqueIssue {
+    /// Free-form category label: "logical_gap", "unsupported_claim", "missing_perspective", etc.
+    pub category: String,
+    /// How severe the issue is.
+    pub severity: IssueSeverity,
+    /// Description of the issue.
+    pub description: String,
+    /// Where in the artifact the issue occurs; not serialized when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub location: Option<String>,
+}
+
+/// Result of the critique phase: the structured issues plus the critic's raw text.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CritiqueResult {
+    /// Structured list of identified issues (may be empty).
+    pub issues: Vec<CritiqueIssue>,
+    /// Raw critique text (for passing to reviser).
+    pub raw_text: String,
+}
+
+/// A single judge's vote.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct JudgeVote {
+    /// Judge identifier (for audit trail).
+    pub judge_id: String,
+    /// Label-to-version mapping this judge saw, keyed by label such as "Alpha" (for audit trail).
+    pub label_map: std::collections::HashMap<String, Winner>,
+    /// Ranked preference, already mapped back to versions: first element is the judge's top pick.
+    pub ranking: Vec<Winner>,
+    /// Free-text justification.
+    pub justification: String,
+}
+
+/// Result of one complete autoreason round: the three versions, the votes and the decision.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DebateRound {
+    /// Which round this is (1-indexed).
+    pub round_number: u32,
+    /// The incumbent content entering this round.
+    pub incumbent_content: String,
+    /// Critique produced by the adversarial critic.
+    pub critique: CritiqueResult,
+    /// Revised version addressing the critique.
+    pub revision_content: String,
+    /// Synthesis of incumbent and revision.
+    pub synthesis_content: String,
+    /// Individual judge votes.
+    pub votes: Vec<JudgeVote>,
+    /// Which version won this round.
+    pub winner: Winner,
+    /// Judge agreement ratio: 1.0 = unanimous, 0.33 = split.
+    pub confidence: f64,
+    /// The actual content of the winning version (presumably a copy of one `*_content` field).
+    pub winning_content: String,
+}
+
+/// Outcome of a comparative evaluation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ComparisonOutcome {
+    /// Which version won. NOTE(review): also stored as `round.winner` — keep the two in sync.
+    pub winner: Winner,
+    /// Judge agreement ratio: 1.0 = unanimous, 0.33 = split. Also stored as `round.confidence`.
+    pub confidence: f64,
+    /// Full debate record (for ledger).
+    pub round: DebateRound,
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -232,4 +388,146 @@ mod tests {
         let back: Action = serde_json::from_str(&json).unwrap();
         assert_eq!(back, Action::Promoted);
     }
+
+    // --- Autoreason / debate type tests ---
+
+    #[test]
+    fn debate_config_defaults() {
+        let cfg = DebateConfig::default(); // NOTE(review): serde `default = ...` path not covered
+        assert_eq!(cfg.judge_count, 3);
+        assert_eq!(cfg.convergence_threshold, 3);
+        assert!(cfg.label_randomization);
+        assert!(!cfg.model_diversity);
+        assert!(cfg.rubric.is_none());
+        assert!(cfg.max_tokens_per_phase.is_none());
+    }
+
+    #[test]
+    fn debate_config_serde_roundtrip() {
+        let cfg = DebateConfig { // every field differs from its default, so fallbacks cannot mask a loss
+            judge_count: 5,
+            convergence_threshold: 4,
+            label_randomization: false,
+            model_diversity: true,
+            rubric: Some("Evaluate clarity and coherence.".into()),
+            max_tokens_per_phase: Some(2000),
+        };
+        let json = serde_json::to_string(&cfg).unwrap();
+        let back: DebateConfig = serde_json::from_str(&json).unwrap();
+        assert_eq!(back.judge_count, 5);
+        assert_eq!(back.convergence_threshold, 4);
+        assert!(!back.label_randomization);
+        assert!(back.model_diversity);
+        assert_eq!(
+            back.rubric.as_deref(),
+            Some("Evaluate clarity and coherence.")
+        );
+        assert_eq!(back.max_tokens_per_phase, Some(2000));
+    }
+
+    #[test]
+    fn winner_serde_roundtrip() {
+        for winner in [Winner::Incumbent, Winner::Revision, Winner::Synthesis] {
+            let json = serde_json::to_string(&winner).unwrap();
+            let back: Winner = serde_json::from_str(&json).unwrap();
+            assert_eq!(back, winner); // round trip only; the wire spelling itself is not asserted
+        }
+    }
+
+    #[test]
+    fn winner_display() {
+        assert_eq!(Winner::Incumbent.to_string(), "incumbent"); // `to_string` goes through `Display`
+        assert_eq!(Winner::Revision.to_string(), "revision");
+        assert_eq!(Winner::Synthesis.to_string(), "synthesis");
+    }
+
+    #[test]
+    fn issue_severity_serde_roundtrip() {
+        for sev in [
+            IssueSeverity::Critical,
+            IssueSeverity::Major,
+            IssueSeverity::Minor,
+        ] {
+            let json = serde_json::to_string(&sev).unwrap();
+            let back: IssueSeverity = serde_json::from_str(&json).unwrap();
+            assert_eq!(back, sev); // all three variants must survive the round trip
+        }
+    }
+
+    #[test]
+    fn critique_result_serde_roundtrip() {
+        let critique = CritiqueResult {
+            issues: vec![
+                CritiqueIssue {
+                    category: "logical_gap".into(),
+                    severity: IssueSeverity::Critical,
+                    description: "Missing causal link between X and Y.".into(),
+                    location: Some("paragraph 3".into()),
+                },
+                CritiqueIssue {
+                    category: "unsupported_claim".into(),
+                    severity: IssueSeverity::Minor,
+                    description: "No citation for the 30% figure.".into(),
+                    location: None, // left out of the JSON by `skip_serializing_if`
+                },
+            ],
+            raw_text: "The argument has two main issues...".into(),
+        };
+        let json = serde_json::to_string(&critique).unwrap();
+        let back: CritiqueResult = serde_json::from_str(&json).unwrap();
+        assert_eq!(back.issues.len(), 2);
+        assert_eq!(back.issues[0].category, "logical_gap");
+        assert_eq!(back.issues[0].severity, IssueSeverity::Critical);
+        assert_eq!(back.issues[1].location, None); // the missing key must come back as `None`
+        assert_eq!(back.raw_text, critique.raw_text);
+    }
+
+    #[test]
+    fn judge_vote_serde_roundtrip() {
+        let mut label_map = std::collections::HashMap::new(); // label shown to the judge -> version
+        label_map.insert("Alpha".into(), Winner::Incumbent);
+        label_map.insert("Beta".into(), Winner::Revision);
+        label_map.insert("Gamma".into(), Winner::Synthesis);
+
+        let vote = JudgeVote {
+            judge_id: "judge-0".into(),
+            label_map,
+            ranking: vec![Winner::Revision, Winner::Synthesis, Winner::Incumbent], // best first
+            justification: "Beta is the strongest because...".into(),
+        };
+        let json = serde_json::to_string(&vote).unwrap();
+        let back: JudgeVote = serde_json::from_str(&json).unwrap();
+        assert_eq!(back.judge_id, "judge-0");
+        assert_eq!(back.ranking[0], Winner::Revision);
+        assert_eq!(back.label_map.len(), 3);
+    }
+
+    #[test]
+    fn comparison_outcome_serde_roundtrip() {
+        let round = DebateRound {
+            round_number: 1,
+            incumbent_content: "Version A text.".into(),
+            critique: CritiqueResult {
+                issues: vec![],
+                raw_text: "No significant issues.".into(),
+            },
+            revision_content: "Version B text.".into(),
+            synthesis_content: "Version AB text.".into(),
+            votes: vec![],
+            winner: Winner::Incumbent,
+            confidence: 1.0,
+            winning_content: "Version A text.".into(), // incumbent won, so this repeats its content
+        };
+        let outcome = ComparisonOutcome {
+            winner: Winner::Incumbent,
+            confidence: 1.0,
+            round,
+        };
+        let json = serde_json::to_string(&outcome).unwrap();
+        let back: ComparisonOutcome = serde_json::from_str(&json).unwrap();
+        assert_eq!(back.winner, Winner::Incumbent);
+        assert!((back.confidence - 1.0).abs() < f64::EPSILON); // tolerance compare, not `==`, for f64
+        assert_eq!(back.round.round_number, 1);
+        assert_eq!(back.round.winning_content, "Version A text.");
+    }
 }
diff --git a/autoany/references/AUTOREASON.md b/autoany/references/AUTOREASON.md
new file mode 100644
index 0000000..cca653e
--- /dev/null
+++ b/autoany/references/AUTOREASON.md
@@ -0,0 +1,1173 @@
+# Autoreason: Adversarial Debate Evaluation for Subjective Domains
+
+## Motivation
+
+EGRI's formal model `Π = (X, M, H, E, J, C, B, P, L)` assumes that **J (the evaluator)
+produces a reliable score**. For objective domains — `val_bpb`, test pass rates, latency
+benchmarks — this holds trivially. For subjective domains — writing quality, argument
+strength, design coherence, strategic clarity — no scalar metric exists.
+
+EGRI currently handles this by escalating to `HumanGate`, which breaks the autonomy loop.
+The system degenerates to "propose and wait for human approval," losing the tight
+propose-evaluate-promote cycle that makes EGRI powerful.
+
+### The Sycophancy Problem
+
+Naive LLM-as-judge evaluation fails systematically:
+
+| Prompt framing | Bias | Effect on loop |
+|----------------|------|----------------|
+| "Improve this" | Sycophantic — always finds something to change | Infinite non-converging edits |
+| "Find flaws in this" | Hypercritical — always finds problems | Never promotes anything |
+| "Merge the best of both" | Compromising — averages rather than selects | Bland regression to mean |
+| "Which is better, A or B?" | Positional — prefers the first/last shown | Score oscillates with presentation order |
+| "Rate this 1-10" | Anchoring — scores cluster around 7 | No discrimination between candidates |
+
+These biases mean the loop's output is shaped more by **how you prompt** than by **what's
+actually better**. The evaluator becomes a mirror of the framing, not a judge of quality.
+
+### The Solution: Constructed Fitness via Adversarial Debate
+
+Autoreason constructs a synthetic evaluator for subjective domains using the same principle
+that makes science work for questions where mathematics cannot provide proofs: **independent
+blind peer review**.
+
+Every role is executed by a fresh, isolated LLM agent with no shared context. No agent sees
+its own prior output. Labels are randomized. The evaluator is the emergent consensus of
+adversarial debate, not a single prompted judgment.
+
+---
+
+## The Protocol
+
+### One Autoreason Round
+
+```
+                    ┌─────────────┐
+                    │ Original    │
+                    │ Task Brief  │
+                    └──────┬──────┘
+                           │
+              ┌────────────┴────────────┐
+              │                         │
+              ▼                         ▼
+     ┌────────────────┐       ┌────────────────┐
+     │   Version A    │       │   Version A    │
+     │  (incumbent)   │       │  (incumbent)   │
+     └───────┬────────┘       └───────┬────────┘
+             │                        │
+             ▼                        │
+     ┌────────────────┐               │
+     │ Phase 1: ATTACK│               │
+     │ (fresh agent)  │               │
+     │                │               │
+     │ Sees: A only   │               │
+     │ Produces:      │               │
+     │   critique     │               │
+     └───────┬────────┘               │
+             │                        │
+             ▼                        │
+     ┌────────────────┐               │
+     │ Phase 2: REVISE│               │
+     │ (fresh agent)  │               │
+     │                │               │
+     │ Sees: task +   │               │
+     │   A + critique │               │
+     │ Produces:      │               │
+     │   Version B    │               │
+     └───────┬────────┘               │
+             │                        │
+             ├────────────────────────┤
+             │                        │
+             ▼                        ▼
+     ┌──────────────────────────────────┐
+     │ Phase 3: SYNTHESIZE              │
+     │ (fresh agent)                    │
+     │                                  │
+     │ Sees: "Version 1" + "Version 2"  │
+     │   (A and B in randomized order)  │
+     │ Produces: Version AB             │
+     └───────────────┬──────────────────┘
+                     │
+                     ▼
+     ┌──────────────────────────────────┐
+     │ Phase 4: JUDGE                   │
+     │ (N fresh agents, independent)    │
+     │                                  │
+     │ Each judge sees: three versions  │
+     │   with randomized labels         │
+     │   ("Alpha", "Beta", "Gamma")     │
+     │ Each produces: ranked preference │
+     │   + justification                │
+     └───────────────┬──────────────────┘
+                     │
+                     ▼
+     ┌──────────────────────────────────┐
+     │ Phase 5: DECIDE                  │
+     │ (deterministic, no LLM)          │
+     │                                  │
+     │ Majority vote → winner           │
+     │ Agreement ratio → confidence     │
+     │ Winner becomes new incumbent     │
+     │   (or incumbent stays if it won) │
+     └──────────────────────────────────┘
+```
+
+### Phase Details
+
+#### Phase 1: Attack (Adversarial Critic)
+
+**Context isolation**: Fresh agent. Sees ONLY the current artifact.
+
+**Input**:
+- The artifact (Version A)
+- Domain-specific critique rubric (optional)
+
+**System prompt** (paraphrased):
+> You are a rigorous critic. Your job is to find genuine weaknesses in this work:
+> logical gaps, unsupported claims, missing perspectives, structural problems,
+> unclear reasoning. Be specific and substantive. Do not nitpick style.
+
+**Output**: Structured critique with categorized issues.
+
+**Why fresh context matters**: If the critic had seen the drafting process, it would
+anchor to the author's reasoning and be less likely to identify genuine blind spots.
+Freshness ensures independence.
+
+#### Phase 2: Revise (Adversarial Author)
+
+**Context isolation**: Fresh agent. Does NOT see the critic's reasoning process.
+
+**Input**:
+- Original task description
+- Version A
+- The critique from Phase 1
+
+**System prompt** (paraphrased):
+> You see an artifact and substantive critique of it. Address the valid criticisms
+> to produce an improved version. You may reject critiques that are wrong, but you
+> must address each one explicitly. Do not water down the original — make it stronger.
+
+**Output**: Version B — a revision that addresses the critique.
+
+**Why this is better than "improve this"**: The reviser works from specific,
+adversarially-generated feedback rather than searching for generic improvements.
+The critique provides signal; the reviser provides craft.
+
+#### Phase 3: Synthesize (Blind Merger)
+
+**Context isolation**: Fresh agent. No knowledge of who wrote what, or the critique.
+
+**Input**:
+- Two versions labeled neutrally: "Version 1" and "Version 2"
+- Labels are randomized (A might be "Version 1" or "Version 2")
+- Original task description
+
+**System prompt** (paraphrased):
+> You see two versions of the same work. Combine the strongest elements of both
+> into a unified version. You are not obligated to include material from both —
+> if one version is clearly stronger in a section, use that.
+
+**Output**: Version AB — a synthesis.
+
+**Why randomized labels**: Eliminates "Version 1 must be the original" anchoring.
+The synthesizer evaluates content, not position.
+
+#### Phase 4: Judge (Blind Panel)
+
+**Context isolation**: N fresh agents, each independent. No shared deliberation.
+
+**Input per judge**:
+- Three versions with randomized labels (e.g., "Alpha", "Beta", "Gamma")
+- Label assignment is different per judge to eliminate correlated positional bias
+- Original task description
+- Evaluation criteria (optional rubric)
+
+**System prompt** (paraphrased):
+> Rank these three versions from strongest to weakest on [criteria].
+> For each, explain what makes it stronger or weaker than the others.
+> Your ranking must be definitive — no ties.
+
+**Output per judge**: Ranked preference (1st, 2nd, 3rd) + justification.
+
+**Panel composition options**:
+- Same model, different label orderings (minimum viable)
+- Different models from different providers (reduces correlated blind spots)
+- Different rubric emphasis per judge (diversity of evaluation criteria)
+
+#### Phase 5: Decide (Deterministic)
+
+No LLM involved. Pure counting.
+
+**Aggregation**: Borda count or simple plurality on first-place votes.
+
+**Output**:
+- `winner`: which version (A, B, or AB) won
+- `confidence`: judge agreement ratio (e.g., 3/3 = 1.0, 2/3 = 0.67)
+- `transcript`: full record of all phases
+
+**Convergence signal**: If `winner == A` (the incumbent), this round produced
+no improvement. The `StagnationDetector` increments its counter. If the incumbent
+wins N consecutive rounds, the loop has converged — further debate is unlikely
+to improve the artifact.
+
+---
+
+## Mapping onto EGRI Architecture
+
+### The Key Insight
+
+Autoreason is **not** a replacement for EGRI. It is a **specific evaluator
+construction** that enables EGRI to operate in subjective domains. Everything
+else — budget control, promotion, rollback, ledger, safety laws — remains
+unchanged.
+
+### Architectural Options
+
+Three ways to integrate autoreason into autoany-core, ordered by invasiveness:
+
+#### Option A: Adapter Pattern (Recommended)
+
+Implement `ComparativeEvaluator` as a new trait. Build a `DebateLoop` variant
+that uses it instead of `Evaluator + Selector`.
+
+```
+autoany-core/src/
+  ├── evaluator.rs              # existing, unchanged
+  ├── comparative_evaluator.rs  # NEW — ComparativeEvaluator trait
+  ├── debate.rs                 # NEW — autoreason protocol
+  ├── debate_loop.rs            # NEW — loop variant using ComparativeEvaluator
+  ├── loop_engine.rs            # existing, unchanged
+  └── ...
+```
+
+**Pros**: Zero changes to existing code. Clean separation. Can test debate independently.
+**Cons**: Some loop logic duplicated between `EgriLoop` and `DebateLoop`.
+
+#### Option B: Evaluator Wrapper
+
+Make `AutoreasonEvaluator` implement the existing `Evaluator` trait by storing
+the incumbent internally and converting comparison results to scalar scores.
+
+**Pros**: No new loop variant needed. Works with existing `EgriLoop`.
+**Cons**: Evaluator becomes stateful (fragile on rollback). Violates the spirit
+of evaluator immutability. The incumbent must be kept in sync with
+`PromotionController` state — a synchronization bug waiting to happen.
+
+#### Option C: Generic Loop
+
+Make `EgriLoop` generic over evaluation mode (absolute vs comparative).
+
+**Pros**: Single loop implementation. Maximum code reuse.
+**Cons**: Complex generic bounds. Every existing user of `EgriLoop` sees more
+type parameters. Optimization for code sharing at the cost of readability.
+
+**Recommendation: Option A.** The duplication is small (the loop is ~80 lines of
+orchestration) and the conceptual clarity is worth it. `EgriLoop` remains the
+workhorse for objective evaluation. `DebateLoop` handles subjective evaluation.
+Both share `BudgetController`, `PromotionController`, `Ledger`, and all
+meta-optimization modules.
+
+### Dependency Chain
+
+```
+Layer 0: Types (no dependencies)
+    types.rs additions:
+      DebateConfig, DebateRole, CritiqueResult,
+      DebateRound, JudgeVote, ComparisonOutcome,
+      DebateTranscript, Winner
+
+Layer 1: Traits (depends on Layer 0)
+    comparative_evaluator.rs:
+      trait ComparativeEvaluator {
+          type Artifact;
+          fn compare(&self, task: &str, incumbent: &Self::Artifact, candidate: &Self::Artifact)
+              -> Result<ComparisonOutcome>;
+      }
+
+    llm_backend.rs:
+      trait LlmBackend {
+          fn generate(&self, system: &str, user: &str) -> Result<String>;
+      }
+      (abstraction over LLM API — sync trait, async via executor)
+
+Layer 2: Protocol (depends on Layers 0 + 1)
+    debate.rs:
+      fn attack(backend: &dyn LlmBackend, artifact: &str, rubric: Option<&str>)
+          -> Result<CritiqueResult>;
+      fn revise(backend: &dyn LlmBackend, task: &str, artifact: &str, critique: &CritiqueResult)
+          -> Result<String>;
+      fn synthesize(backend: &dyn LlmBackend, task: &str, a: &str, b: &str)
+          -> Result<String>;
+      fn judge(backend: &dyn LlmBackend, task: &str, versions: &[&str], config: &DebateConfig)
+          -> Result<Vec<JudgeVote>>;
+      fn decide(votes: &[JudgeVote], version_map: &HashMap<Label, Winner>)
+          -> ComparisonOutcome;
+      fn autoreason_round(backend: &dyn LlmBackend, task: &str, incumbent: &str, config: &DebateConfig)
+          -> Result<DebateRound>;
+
+Layer 3: Evaluator (depends on Layer 2)
+    autoreason_evaluator.rs:
+      struct AutoreasonEvaluator<B: LlmBackend> {
+          backend: B,
+          config: DebateConfig,
+      }
+      impl<B: LlmBackend> ComparativeEvaluator for AutoreasonEvaluator<B> { ... }
+
+Layer 4: Loop (depends on Layers 1 + 3 + existing EGRI primitives)
+    debate_loop.rs:
+      struct DebateLoop<A, P, X, CE> {
+          proposer: P,
+          executor: X,
+          evaluator: CE,          // ComparativeEvaluator, not Evaluator
+          budget: BudgetController,
+          promotion: PromotionController<A>,
+          ledger: Ledger,
+          task: String,           // original task description
+          convergence: ConvergenceDetector,
+      }
+
+Layer 5: Integration (depends on all above)
+    spec.rs: PromotionPolicy::Comparative, DebateSpec in ProblemSpec
+    strategy.rs: debate_distill() for Level 1 meta-loop
+    Domain mappings: writing, argument, design, strategy
+```
+
+### Shared vs New Components
+
+| Component | Status | Notes |
+|-----------|--------|-------|
+| `BudgetController` | **Shared** | One budget unit per debate round; each round costs 5+ LLM calls |
+| `PromotionController` | **Shared** | Unchanged. Stores winning artifact after each round |
+| `Ledger` | **Shared** | Debate transcripts stored in `evaluator_metadata` |
+| `DeadEndTracker` | **Shared** | Tracks critique patterns that lead nowhere |
+| `StagnationDetector` | **Reinterpreted** | Stagnation = convergence. Incumbent wins N times = done |
+| `InheritedKnowledge` | **Extended** | Carries forward: which critique styles were productive |
+| `strategy::distill()` | **Extended** | New: `debate_distill()` ranks critique effectiveness |
+| `Evaluator` trait | **Unchanged** | Still used for objective domains |
+| `Selector` trait | **Unchanged** | Not used by `DebateLoop` (debate is both eval + selection) |
+| `ComparativeEvaluator` trait | **New** | Compares two artifacts |
+| `DebateLoop` | **New** | Loop variant for comparative evaluation |
+| `debate.rs` | **New** | Autoreason protocol implementation |
+| `LlmBackend` trait | **New** | Abstraction over LLM API calls |
+
+---
+
+## Type Definitions
+
+### Core Types
+
+```rust
+/// Tunable parameters of the autoreason debate protocol.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DebateConfig {
+    /// Size of the judge panel; an odd count gives clean majorities.
+    pub judge_count: u32,
+    /// How many incumbent wins in a row are needed before convergence is declared.
+    pub convergence_threshold: u32,
+    /// Shuffle the version labels shown to each judge (strongly recommended).
+    pub label_randomization: bool,
+    /// Route different judges to different LLM providers.
+    pub model_diversity: bool,
+    /// Domain-specific rubric for critique and judging, if any.
+    pub rubric: Option<String>,
+    /// Per-phase token ceiling (controls cost), if any.
+    pub max_tokens_per_phase: Option<u32>,
+}
+
+/// Which version won a debate round.
+///
+/// Derives `Hash` alongside `Eq` so the enum can be used as a `HashMap` key,
+/// e.g. for a per-version win-rate table.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum Winner {
+    /// The incumbent (Version A) — no improvement found.
+    Incumbent,
+    /// The revision (Version B) — critique-driven improvement.
+    Revision,
+    /// The synthesis (Version AB) — combined strengths.
+    Synthesis,
+}
+
+/// The vote cast by one judge on the panel.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct JudgeVote {
+    /// Identifies the judge in the audit trail.
+    pub judge_id: String,
+    /// The label-to-version mapping shown to this judge, kept for the audit trail.
+    pub label_map: HashMap<String, Winner>,
+    /// Versions in order of preference: [1st, 2nd, 3rd].
+    pub ranking: Vec<Winner>,
+    /// The judge's free-text reasoning.
+    pub justification: String,
+}
+
+/// Output of the critique phase.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CritiqueResult {
+    /// The issues the critic identified, in structured form.
+    pub issues: Vec<CritiqueIssue>,
+    /// The critique as raw text, which is what gets handed to the reviser.
+    pub raw_text: String,
+}
+
+/// One issue identified during the critique phase.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CritiqueIssue {
+    /// Kind of issue: "logical_gap", "unsupported_claim", "missing_perspective", etc.
+    pub category: String,
+    /// How serious the issue is.
+    pub severity: IssueSeverity,
+    /// What the issue is.
+    pub description: String,
+    /// Where in the artifact the issue occurs, if known.
+    pub location: Option<String>,
+}
+
+/// Severity level attached to a [`CritiqueIssue`].
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub enum IssueSeverity {
+    /// Undermines the core argument.
+    Critical,
+    /// Significant weakness.
+    Major,
+    /// Improvement opportunity.
+    Minor,
+}
+
+/// Full record of a single autoreason round.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DebateRound {
+    /// Index of this round.
+    pub round_number: u32,
+    /// Text of the incumbent (Version A).
+    pub incumbent_content: String,
+    /// Critique produced in this round.
+    pub critique: CritiqueResult,
+    /// Text of the revision (Version B).
+    pub revision_content: String,
+    /// Text of the synthesis (Version AB).
+    pub synthesis_content: String,
+    /// Every judge's vote.
+    pub votes: Vec<JudgeVote>,
+    /// The version that won the round.
+    pub winner: Winner,
+    /// Confidence recorded for the round's result.
+    pub confidence: f64,
+    /// Text of whichever version won.
+    pub winning_content: String,
+}
+
+/// What a comparative evaluation returns.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ComparisonOutcome {
+    /// The winning version.
+    pub winner: Winner,
+    /// Ratio of judge agreement: 1.0 = unanimous, 0.33 = split.
+    pub confidence: f64,
+    /// The complete debate record, kept for the ledger.
+    pub round: DebateRound,
+}
+```
+
+### Trait Definitions
+
+```rust
+/// Decides which of two artifacts is better.
+///
+/// Intended for domains without an objective scalar metric: the comparison
+/// itself is the evaluation, with no separate scoring step.
+pub trait ComparativeEvaluator {
+    /// The artifact type being compared.
+    type Artifact;
+
+    /// Judge `candidate` against `incumbent` and report which is better.
+    ///
+    /// `task` is the original task description, giving judges context.
+    fn compare(
+        &self,
+        task: &str,
+        incumbent: &Self::Artifact,
+        candidate: &Self::Artifact,
+    ) -> Result<ComparisonOutcome>;
+}
+
+/// Thin abstraction over LLM API calls.
+///
+/// Calls carry no conversation history; that statelessness is what enforces
+/// the context isolation autoreason relies on.
+pub trait LlmBackend: Send + Sync {
+    /// Produce a completion for the given system prompt and user message.
+    /// No conversation state is shared between calls.
+    fn generate(&self, system: &str, user: &str) -> Result<String>;
+
+    /// Like [`generate`](Self::generate), but requests a specific model
+    /// (used for judge diversity).
+    ///
+    /// The provided implementation discards `model` and delegates to
+    /// `generate`, i.e. it falls back to the default model.
+    fn generate_with_model(
+        &self,
+        model: &str,
+        system: &str,
+        user: &str,
+    ) -> Result<String> {
+        // Model selection is unsupported by default: use the primary model.
+        let _ = model;
+        self.generate(system, user)
+    }
+}
+```
+
+---
+
+## The DebateLoop
+
+### Structure
+
+```rust
+/// Loop variant that drives artifacts through comparative (debate) evaluation.
+pub struct DebateLoop<A, P, X, CE>
+where
+    // Debate operates on text, so the artifact must expose itself as `&str`.
+    A: Clone + AsRef<str>,
+    P: Proposer<Artifact = A>,
+    X: Executor<Artifact = A>,
+    CE: ComparativeEvaluator<Artifact = A>,
+{
+    /// Proposes a mutation and candidate from the incumbent and the ledger.
+    proposer: P,
+    /// Executes each candidate before it is compared.
+    executor: X,
+    /// Comparative evaluator that runs the debate.
+    evaluator: CE,
+    /// Checked at the top of every round; consumed once per round.
+    budget: BudgetController,
+    /// Holds the current incumbent and applies promotion decisions.
+    promotion: PromotionController<A>,
+    /// Receives one trial record per round.
+    ledger: Ledger,
+    /// Original task description passed to the evaluator.
+    task: String,
+    /// Consecutive incumbent wins at which the loop stops.
+    convergence_threshold: u32,
+    /// Current streak of incumbent wins; reset when a challenger wins.
+    consecutive_incumbent_wins: u32,
+}
+```
+
+### Loop Execution
+
+```rust
+impl<A, P, X, CE> DebateLoop<A, P, X, CE>
+where
+    // The struct's own bounds must be repeated on the impl. `From<String>` is
+    // added so a debate-generated winner (plain text) can become an artifact.
+    A: Clone + AsRef<str> + From<String>,
+    P: Proposer<Artifact = A>,
+    X: Executor<Artifact = A>,
+    CE: ComparativeEvaluator<Artifact = A>,
+{
+    /// Runs debate rounds until the budget is exhausted or the incumbent has
+    /// won `convergence_threshold` consecutive rounds, then returns the summary.
+    ///
+    /// # Errors
+    ///
+    /// Returns `EgriError::NoBaseline` when the promotion controller holds no
+    /// incumbent, and propagates any error from the proposer, executor,
+    /// evaluator, transcript serialization, or ledger append.
+    pub fn run(&mut self) -> Result<LoopSummary> {
+        self.budget.start();
+
+        loop {
+            // 1. Budget check
+            if let Err(EgriError::BudgetExhausted(msg)) = self.budget.check() {
+                info!(reason = %msg, "budget exhausted");
+                break;
+            }
+
+            // 2. Get incumbent from promotion controller
+            let incumbent = self.promotion.current()
+                .ok_or(EgriError::NoBaseline)?.clone();
+
+            let parent_state = self.promotion.current_state_id()
+                .cloned().unwrap_or_else(StateId::baseline);
+
+            // 3. Propose mutation (optional — can also use debate-generated revisions)
+            let (mutation, candidate) = self.proposer.propose(&incumbent, &self.ledger)?;
+
+            // 4. Execute candidate (render/validate — may be identity for text)
+            let exec_result = self.executor.execute(&candidate)?;
+
+            // 5. Comparative evaluation via debate
+            let comparison = self.evaluator.compare(
+                &self.task,
+                &incumbent,
+                &candidate,
+            )?;
+
+            // 6. Convert to EGRI decision
+            let (action, winning_artifact) = match comparison.winner {
+                Winner::Incumbent => {
+                    self.consecutive_incumbent_wins += 1;
+                    (Action::Discarded, None)
+                }
+                Winner::Revision | Winner::Synthesis => {
+                    self.consecutive_incumbent_wins = 0;
+                    // The winning content may differ from the candidate
+                    // (synthesis or revision produced during debate), so the
+                    // artifact is rebuilt from the round's winning text. The
+                    // text is cloned because the round is serialized below.
+                    let winner_artifact = A::from(comparison.round.winning_content.clone());
+                    (Action::Promoted, Some(winner_artifact))
+                }
+            };
+
+            // 7. Build decision record
+            // Computed before `action` is moved into the record.
+            let new_state_id = (action == Action::Promoted).then(StateId::new);
+            let decision = Decision {
+                action,
+                reason: format!(
+                    "debate: {:?} won (confidence: {:.2}, round: {})",
+                    comparison.winner, comparison.confidence, comparison.round.round_number
+                ),
+                new_state_id,
+            };
+
+            // 8. Apply promotion: pass the debate winner if there is one,
+            // otherwise the (discarded) candidate.
+            self.promotion
+                .apply_decision(&decision, winning_artifact.unwrap_or(candidate));
+
+            // 9. Record to ledger
+            let record = TrialRecord {
+                trial_id: TrialId::new(self.budget.used()),
+                timestamp: Utc::now(),
+                parent_state,
+                mutation,
+                execution: Some(exec_result),
+                outcome: Outcome {
+                    score: Score::Scalar(comparison.confidence),
+                    constraints_passed: true,
+                    constraint_violations: vec![],
+                    evaluator_metadata: Some(serde_json::to_value(&comparison.round)?),
+                },
+                decision,
+                strategy_notes: None,
+            };
+            self.ledger.append(record)?;
+
+            self.budget.consume();
+
+            // 10. Convergence check
+            if self.consecutive_incumbent_wins >= self.convergence_threshold {
+                info!(
+                    rounds = self.consecutive_incumbent_wins,
+                    "converged — incumbent won {} consecutive rounds",
+                    self.consecutive_incumbent_wins
+                );
+                break;
+            }
+        }
+
+        Ok(self.summary())
+    }
+}
+```
+
+### Key Differences from EgriLoop
+
+| Aspect | EgriLoop | DebateLoop |
+|--------|----------|------------|
+| Evaluation | Score one candidate in isolation | Compare candidate vs incumbent |
+| Selection | Separate `Selector` trait | Built into comparative evaluation |
+| Termination | Budget exhaustion or escalation | Budget OR convergence (incumbent wins N times) |
+| Score meaning | Objective metric (lower/higher = better) | Confidence (judge agreement on winner) |
+| Promoted artifact | Always the proposer's candidate | May be a debate-generated revision or synthesis |
+| Cost per trial | 1 executor + 1 evaluator call | 1 executor + 5+ LLM calls |
+
+### Important: The Debate Can Produce New Artifacts
+
+In standard EGRI, the evaluator only scores — it never modifies the artifact.
+In autoreason, the debate protocol generates two new versions (B and AB) during
+evaluation. If one of those wins, the promoted artifact is something the original
+Proposer never generated.
+
+This is a feature, not a bug. The debate protocol is simultaneously:
+- An **evaluator** (judges quality)
+- A **refiner** (produces improved versions)
+
+But it means `DebateLoop` must handle the case where the winning artifact differs
+from the proposer's candidate. The `PromotionController` receives the actual winner,
+not necessarily the original candidate.
+
+**Law 3 compliance**: The debate protocol (evaluator) is still immutable — the same
+debate config produces the same procedure every time. What changes is the content
+of the artifact, which is not part of the evaluator. The evaluator's structure
+(critic → reviser → synthesizer → judge panel) is fixed.
+
+---
+
+## Convergence Theory
+
+### When Does Autoreason Converge?
+
+The loop converges when the judge panel consistently prefers the incumbent over
+debate-generated alternatives. This means:
+
+1. The critic cannot find substantive weaknesses
+2. The reviser cannot produce meaningful improvements from the critique
+3. The synthesizer cannot improve on the incumbent by merging with the revision
+4. The judges agree the incumbent is the strongest version
+
+This is analogous to `val_bpb` plateauing in autoresearch — further mutations
+don't improve the metric. The difference is that "improvement" is measured by
+consensus rather than a scalar comparison.
+
+### Convergence vs Oscillation
+
+**Risk**: On polarizing topics, the loop might oscillate:
+- Round 1: Version B wins (takes position X)
+- Round 2: Critique attacks X, Version B' wins (takes position Y)
+- Round 3: Critique attacks Y, Version B'' wins (takes position X again)
+
+**Mitigations**:
+1. **Synthesis phase**: Version AB should capture both perspectives, breaking the cycle
+2. **Judge panel diversity**: Multiple judges are less likely to flip-flop than one
+3. **Convergence threshold > 1**: Requiring 3 consecutive incumbent wins filters out noise
+4. **Ledger analysis**: If the same critique pattern appears repeatedly, `DeadEndTracker`
+   flags it. Strategy distillation can detect oscillation patterns.
+
+### Convergence Speed
+
+Empirically (from the multi-agent debate literature):
+
+- **Well-defined quality** (clear writing, sound arguments): 3-5 rounds
+- **Ambiguous quality** (creative work, taste-dependent): 5-10 rounds
+- **Polarizing topics** (strong opposing views): may not converge cleanly;
+  detect via oscillation and escalate to `HumanGate`
+
+Budget planning: assume 5-7 rounds average, 5+ LLM calls per round = 25-35 calls
+per artifact optimization. At ~$0.01-0.05 per call, that's $0.25-1.75 per artifact.
+
+---
+
+## Integration with Existing Meta-Optimization
+
+### Stagnation as Convergence
+
+The existing `StagnationDetector` counts consecutive non-improvements and triggers
+escalation at a threshold. In debate context, "non-improvement" means the incumbent
+won. This is exactly convergence detection:
+
+```rust
+// The existing detector is reused with zero changes:
+match stagnation_detector.check(&ledger) {
+    StagnationStatus::Stagnated(n) => {
+        info!("converged after {n} consecutive incumbent wins");
+        break;
+    }
+    StagnationStatus::Warning(n) => info!("approaching convergence ({n} incumbent wins)"),
+    StagnationStatus::Ok => continue,
+}
+```
+
+The only change is **interpretation**: in objective loops, stagnation means "we're
+stuck, escalate." In debate loops, stagnation means "we've converged, we're done."
+
+### Strategy Distillation for Debate
+
+The existing `strategy::distill()` ranks mutation operators by success rate. For
+debate loops, we need an additional `debate_distill()` that analyzes:
+
+1. **Critique effectiveness**: Which critique categories (logical gaps, missing
+   perspectives, unsupported claims) most often led to successful revisions?
+2. **Revision patterns**: When the revision won, what structural changes did it make?
+3. **Synthesis value**: How often does the synthesis win vs the revision? If synthesis
+   rarely wins, consider dropping Phase 3 to save budget.
+4. **Judge agreement patterns**: Do judges diverge on specific quality dimensions?
+   This reveals where the rubric needs refinement.
+
+```rust
+/// Findings of debate strategy distillation.
+pub struct DebateStrategyReport {
+    /// Critique categories paired with their revision success rate, ranked.
+    pub effective_critiques: Vec<(String, f64)>,
+    /// Win rate of each phase's output.
+    pub phase_win_rates: HashMap<Winner, f64>,
+    /// Judge agreement averaged over rounds.
+    pub mean_confidence: f64,
+    /// Round numbers in which judges diverged significantly.
+    pub low_agreement_rounds: Vec<u32>,
+    /// Recommendation flag: skip synthesis because its win rate is below threshold.
+    pub skip_synthesis: bool,
+}
+```
+
+### Dead-End Tracking for Debate
+
+The existing `DeadEndTracker` tracks mutation signatures that fail repeatedly.
+For debate, it tracks **critique patterns** that lead nowhere:
+
+- Signature: `critique_category:round_context`
+- Example: `"missing_perspective:after_synthesis"` — if the critic keeps finding
+  "missing perspectives" after synthesis rounds but it never leads to a winning
+  revision, that critique direction is exhausted
+
+### Inherited Knowledge for Debate
+
+Cross-run learning (`InheritedKnowledge`) extends naturally:
+
+- Carry forward which critique styles were productive
+- Carry forward the final converged artifact as the new baseline
+- Carry forward judge agreement patterns (which dimensions are hard to evaluate)
+
+---
+
+## Hybrid Evaluation: Objective + Subjective
+
+Many real problems have BOTH objective and subjective components:
+
+| Domain | Objective component | Subjective component |
+|--------|--------------------|--------------------|
+| Blog post | Word count, formatting, link validity | Argument quality, clarity, engagement |
+| API design | Type safety, endpoint coverage | Ergonomics, naming, conceptual coherence |
+| UI/UX | Accessibility score, load time | Visual appeal, flow intuitiveness |
+| Prompt engineering | Format compliance, token count | Response quality, helpfulness |
+
+For these, use a **HybridEvaluator** that:
+
+1. Checks hard constraints objectively (existing `Evaluator` + constraints)
+2. Runs autoreason debate for the subjective component
+3. Combines both into a final `Outcome`
+
+```rust
+/// Pairs an objective evaluator with a comparative (debate) evaluator.
+pub struct HybridEvaluator<OE, CE>
+where
+    OE: Evaluator,
+    CE: ComparativeEvaluator,
+{
+    /// Checks constraints and computes objective metrics.
+    objective: OE,
+    /// Runs the debate for quality assessment.
+    subjective: CE,
+}
+```
+
+The objective evaluator acts as a **gate**: if hard constraints fail, the candidate
+is discarded without running the expensive debate. Only candidates that pass
+objective checks proceed to subjective evaluation.
+
+This saves significant budget — if 30% of candidates fail constraints, you save
+30% of debate costs.
+
+---
+
+## Problem Spec Extensions
+
+### New Fields in `problem-spec.yaml`
+
+```yaml
+# Existing fields unchanged...
+
+# NEW: Debate configuration (required when promotion.policy == "comparative")
+debate:
+  judge_count: 3
+  convergence_threshold: 3
+  label_randomization: true
+  model_diversity: false
+  rubric: |
+    Evaluate on: logical coherence, evidence quality, clarity of expression,
+    completeness of argument, acknowledgment of counterarguments.
+  max_tokens_per_phase: 2000
+
+# NEW: Task description (required for debate — provides the original brief)
+task:
+  description: |
+    Write a compelling analysis of X that addresses Y and considers Z.
+  criteria:
+    - logical_coherence
+    - evidence_quality
+    - clarity
+    - completeness
+
+# Updated: New promotion policy option
+promotion:
+  policy: comparative   # NEW — uses autoreason debate instead of scalar comparison
+```
+
+### New PromotionPolicy Variant
+
+```rust
+/// Promotion policy, selected by `promotion.policy` in the problem spec.
+pub enum PromotionPolicy {
+    KeepIfImproves,
+    Pareto,
+    Threshold,
+    HumanGate,
+    /// NEW: autoreason debate evaluation instead of scalar comparison.
+    Comparative,
+}
+```
+
+---
+
+## New Domain Mappings
+
+### Writing / Argumentation
+
+| Component | Concrete form |
+|-----------|--------------|
+| Mutable artifact | Text content (essay, blog post, report, argument) |
+| Immutable harness | Original task brief, evaluation rubric, debate config |
+| Evaluator | Autoreason debate protocol (critic → reviser → synthesizer → judge panel) |
+| Constraints | Word count, format requirements, factual accuracy checks (objective) |
+| Budget | N debate rounds, token budget, cost cap |
+| Promotion | Comparative — debate winner replaces incumbent |
+| Ledger | JSONL with full debate transcripts per round |
+| Execution | Identity (text is the artifact — no "running" needed) |
+
+### Strategy / Decision Analysis
+
+| Component | Concrete form |
+|-----------|--------------|
+| Mutable artifact | Strategic analysis, decision recommendation, risk assessment |
+| Immutable harness | Problem context, stakeholder constraints, data sources |
+| Evaluator | Autoreason with domain-specific rubric (feasibility, risk coverage, coherence) |
+| Constraints | Must address all stakeholder concerns, must cite sources |
+| Budget | N debate rounds |
+| Promotion | Comparative |
+| Ledger | JSONL with debate transcripts + decision lineage |
+| Execution | Optional validation step (check cited sources exist, numbers add up) |
+
+### Design / UX (Hybrid)
+
+| Component | Concrete form |
+|-----------|--------------|
+| Mutable artifact | Component specs, layout descriptions, interaction flows |
+| Immutable harness | Design system constraints, accessibility requirements |
+| Evaluator | HybridEvaluator: objective (a11y score, performance) + subjective (debate on aesthetics, usability) |
+| Constraints | WCAG compliance, performance budget, design system consistency |
+| Budget | N trials for objective + M debate rounds for subjective |
+| Promotion | Comparative (after objective gate) |
+| Ledger | JSONL with both objective scores and debate transcripts |
+| Execution | Screenshot rendering, a11y audit, Lighthouse |
+
+### Prompt Engineering (Hybrid)
+
+| Component | Concrete form |
+|-----------|--------------|
+| Mutable artifact | System prompt, few-shot examples, output format |
+| Immutable harness | Eval dataset, judge rubric, golden answers |
+| Evaluator | HybridEvaluator: objective (format compliance, token cost) + subjective (debate on response quality) |
+| Constraints | Token budget per call, format compliance |
+| Budget | N trials with eval set + M debate rounds on borderline cases |
+| Promotion | Comparative (after objective gate) |
+| Ledger | JSONL with both objective scores and debate transcripts |
+| Execution | API calls to target model |
+
+---
+
+## Safety Analysis
+
+### EGRI Laws Under Autoreason
+
+| Law | Status | Analysis |
+|-----|--------|----------|
+| **Law 1: Evaluator Supremacy** | **Strengthened** | N independent judges > 1 prompted judge. Adversarial structure resists gaming. |
+| **Law 2: Mutation-Evaluation Proportionality** | **Upheld** | Debate evaluator is expensive but thorough. Cost scales with mutation complexity. |
+| **Law 3: Immutability of the Evaluator** | **Upheld with nuance** | The debate protocol (config, rubric, judge count) is immutable. The debate *generates* artifacts, but the evaluation *procedure* doesn't change. |
+| **Law 4: Budget Closure** | **Upheld** | BudgetController counts debate rounds. Each round = 1 budget unit. |
+| **Law 5: Rollback Guarantee** | **Upheld** | PromotionController stores the winning artifact. Rollback is always available. |
+
+### New Failure Modes
+
+| Failure | Symptom | Remedy |
+|---------|---------|--------|
+| Judge sycophancy to verbose version | Longer version always wins | Add explicit length-normalization instruction to judge prompt |
+| Critique exhaustion | Critic repeats same issues | DeadEndTracker flags repeated critique categories |
+| Synthesis blandness | Synthesis always loses (too safe) | Track synthesis win rate; if < 10%, add "prefer boldness" to synthesis prompt |
+| Oscillation | A→B→A→B cycle, never converges | Detect via ledger pattern analysis; escalate to HumanGate |
+| Budget explosion | 10+ rounds without convergence | BudgetController enforces hard cap; escalate at warning threshold |
+| Correlated judge bias | All judges from same model agree on wrong answer | Enable `model_diversity: true` — use different providers |
+| Rubric ambiguity | Judges disagree on criteria interpretation | Low confidence signals rubric needs refinement; escalate |
+
+### When NOT to Use Autoreason
+
+- **Objective metrics exist**: Use scalar evaluator. Faster, cheaper, more reliable.
+- **Trivial improvements**: If the task is "fix the typo," debate is overkill.
+- **Time-critical loops**: Each debate round takes 10-30 seconds (5+ LLM calls).
+  For real-time optimization, use scalar evaluation.
+- **Highly deterministic domains**: Compiler optimization, numerical methods —
+  the benchmark suite IS the evaluator.
+
+Autoreason is for the space where:
+1. Quality matters but can't be reduced to a number
+2. The cost of getting it wrong justifies the cost of debate (5-35 LLM calls)
+3. Human evaluation is too slow for the iteration frequency you need
+
+---
+
+## Cost Model
+
+### Per-Round Cost
+
+| Phase | LLM calls | Typical tokens (in + out) |
+|-------|-----------|--------------------------|
+| Attack | 1 | ~2000 in, ~1000 out |
+| Revise | 1 | ~4000 in, ~2000 out |
+| Synthesize | 1 | ~5000 in, ~2000 out |
+| Judge (x3) | 3 | ~4000 in, ~500 out each |
+| **Total** | **6** | **~23K in, ~6.5K out** |
+
+At Claude Sonnet rates (~$3/M input, ~$15/M output):
+- Per round: ~$0.07 input + ~$0.10 output = **~$0.17/round**
+- 5-round convergence: **~$0.85 per artifact optimization**
+- 10-round convergence: **~$1.70 per artifact optimization**
+
+At GPT-4.1 rates (~$2/M input, ~$8/M output):
+- Per round: ~$0.05 input + ~$0.05 output = **~$0.10/round**
+- 5-round convergence: **~$0.50 per artifact optimization**
+
+### Budget Planning
+
+```yaml
+budget:
+  max_trials: 10       # Maximum debate rounds
+  cost_budget: 2.00    # Hard cost cap in USD
+  token_budget: 300000 # Total token cap across all phases
+```
+
+### Optimization: Skip Synthesis When It Rarely Wins
+
+If strategy distillation shows synthesis win rate < 10% after the first 3 rounds,
+drop Phase 3 for remaining rounds. Saves ~25% of per-round cost.
+
+```rust
+// A version that never won may have no entry in the map. Indexing with `[]`
+// would panic on the missing key, so an absent entry counts as a 0% win rate.
+let synthesis_win_rate = debate_strategy
+    .phase_win_rates
+    .get(&Winner::Synthesis)
+    .copied()
+    .unwrap_or(0.0);
+if synthesis_win_rate < 0.10 && round_number > 3 {
+    // Skip synthesis, judge only A vs B
+}
+```
+
+---
+
+## Implementation Sequence
+
+### Phase 1: Types and Traits (no runtime dependencies)
+
+```
+Add to autoany-core/src/:
+  types.rs   → DebateConfig, Winner, JudgeVote, CritiqueResult,
+               DebateRound, ComparisonOutcome (+ serde derives)
+  comparative_evaluator.rs → ComparativeEvaluator trait
+  llm_backend.rs → LlmBackend trait
+
+Update lib.rs → pub mod comparative_evaluator, llm_backend
+
+Depends on: nothing new. Pure type definitions.
+Tests: unit tests for serde roundtrips on new types.
+```
+
+### Phase 2: Protocol Functions (depends on Phase 1)
+
+```
+Add autoany-core/src/debate.rs:
+  attack(), revise(), synthesize(), judge(), decide()
+  autoreason_round() — orchestrates one full round
+
+Depends on: LlmBackend trait, new types.
+Tests: mock LlmBackend that returns canned responses.
+  - Test label randomization produces different orderings.
+  - Test decide() correctly aggregates votes.
+  - Test full round with mock backend.
+```
+
+### Phase 3: Evaluator Implementation (depends on Phase 2)
+
+```
+Add autoany-core/src/autoreason_evaluator.rs:
+  AutoreasonEvaluator<B: LlmBackend> implements ComparativeEvaluator
+
+Depends on: debate.rs, ComparativeEvaluator trait.
+Tests: mock backend → verify compare() runs full protocol.
+```
+
+### Phase 4: Loop Variant (depends on Phase 3 + existing modules)
+
+```
+Add autoany-core/src/debate_loop.rs:
+  DebateLoop — uses ComparativeEvaluator, BudgetController,
+  PromotionController, Ledger.
+
+Depends on: everything above + existing EGRI modules.
+Tests: full integration test with mock backend:
+  - Convergence detection (mock judges always pick incumbent after N rounds).
+  - Promotion of revision/synthesis winners.
+  - Budget exhaustion halts loop.
+  - Ledger records full debate transcripts.
+```
+
+### Phase 5: Spec and Strategy Extensions (depends on Phase 4)
+
+```
+Update autoany-core/src/spec.rs:
+  PromotionPolicy::Comparative
+  DebateSpec in ProblemSpec
+
+Add debate strategy distillation to strategy.rs:
+  debate_distill() → DebateStrategyReport
+
+Update inheritance.rs:
+  Carry forward debate strategy reports.
+
+Update DOMAIN-MAPPINGS.md:
+  Add Writing, Strategy, Design, Prompt Engineering domains.
+```
+
+### Phase 6: Adapter Crates (depends on Phase 4)
+
+```
+Update autoany-aios/:
+  Route debate LLM calls through Arcan agent sessions.
+
+Update autoany-lago/:
+  Persist debate transcripts as EventKind::Custom("egri.debate.*").
+  Enable querying: "show critique patterns across runs."
+```
+
+### Phase 7: Example Instance (depends on all above)
+
+```
+Add examples/blog-post-optimizer/:
+  problem-spec.yaml — blog post optimization with debate evaluation
+  artifacts/post.md — initial blog post draft
+  eval/ — debate config + rubric
+  harness/ — format validation (objective) + debate (subjective)
+```
+
+---
+
+## Open Questions
+
+### 1. Should the Debate Generate New Artifacts?
+
+The current protocol generates Version B (revision) and Version AB (synthesis)
+during evaluation. If one of these wins, the promoted artifact is something the
+Proposer never generated. This is powerful but means the evaluator is also a
+refiner — blurring the EGRI role boundaries.
+
+**Alternative**: Have the debate only compare Proposer-generated candidates.
+The Proposer generates N candidates, the debate ranks them, the best is promoted.
+Cleaner role separation but loses the adversarial revision mechanism.
+
+**Recommendation**: Keep artifact generation in the debate. The value of
+autoreason comes precisely from the adversarial revision cycle. Document the
+role-blurring explicitly and ensure rollback works for debate-generated artifacts.
+
+### 2. Judge Model Diversity
+
+Should all judges use the same model, or different models?
+
+- **Same model, different label orderings**: Cheapest. Tests positional robustness.
+- **Different models, same provider**: Moderate cost. Tests model-specific biases.
+- **Different providers**: Most expensive. Maximally decorrelated judgments.
+
+**Recommendation**: Default to same model with label randomization. Add
+`model_diversity: true` as an option for high-stakes domains. The
+`LlmBackend::generate_with_model()` method supports this.
+
+### 3. How to Handle Judge Disagreement
+
+When judges split (e.g., 2:1 or worse), what does it mean?
+
+- **Low confidence on an easy round**: Possible rubric ambiguity. Log for
+  strategy distillation.
+- **Low confidence throughout**: The quality dimension may be genuinely
+  subjective. Consider escalating to `HumanGate` for calibration.
+- **Consistent 2:1 splits on specific criteria**: One judge may have a
+  systematically different interpretation. Strategy distillation can detect this.
+
+### 4. Feedback to the Proposer
+
+Should the Proposer see debate transcripts from previous rounds?
+
+- **Yes**: The Proposer can learn what the judges value and propose targeted
+  mutations. Risk: the Proposer "overfits" to judge preferences.
+- **No**: The Proposer generates independently. Slower convergence but more
+  diverse exploration.
+- **Partial**: The Proposer sees the winning critique categories but not the
+  full transcript. Balanced signal without overfitting.
+
+**Recommendation**: Partial. Pass `DebateStrategyReport` to the Proposer's
+ledger context, not raw transcripts.
+
+---
+
+## References
+
+### Origin
+
+- [shl0ms (@SHL0MS)](https://x.com/SHL0MS/status/2037939506733523025) — autoreason concept
+- [Andrej Karpathy (@karpathy)](https://x.com/karpathy/status/2037921699824607591) — LLM sycophancy observation
+- [autoresearch (GitHub)](https://github.com/karpathy/autoresearch) — objective EGRI loop that autoreason extends
+
+### Multi-Agent Debate Literature
+
+- [Peacemaker or Troublemaker: Sycophancy in Multi-Agent Debate](https://arxiv.org/abs/2509.23055) — mixing cooperative/adversarial personas halves sycophancy rates
+- [D3: Debate, Deliberate, Decide](https://arxiv.org/abs/2410.04663) — structured debate with budgeted stopping and convergence checks
+- [PROClaim: Courtroom-Style Multi-Agent Debate](https://arxiv.org/abs/2603.28488) — specialized roles + evidence protocols, +10pp over standard debate
+- [Talk Isn't Always Cheap: Failure Modes in Multi-Agent Debate](https://arxiv.org/abs/2509.05396) — debate can degrade performance; sycophancy drives premature convergence
+- [Multi-Agent Debate with Adaptive Stability Detection](https://arxiv.org/abs/2510.12697) — automatic convergence detection in debate
+- [Debate-Reflection Cycles in Multi-Agent Systems](https://emergentmind.com/topics/debate-reflection-cycles) — DTE+RCR halves sycophancy rates
+
+### Self-Improvement Literature
+
+- [Self-Refine: Iterative Refinement with Self-Feedback](https://arxiv.org/abs/2303.17651) — same-model feedback loop (autoreason improves on this via role separation)
+- [The Karpathy Loop (Fortune)](https://fortune.com/2026/03/17/andrej-karpathy-loop-autonomous-ai-agents-future/) — 700 experiments, 2 days