broomva · broomva · Apr 1, 2026 · Apr 1, 2026
diff --git a/autoany-core/src/comparative_evaluator.rs b/autoany-core/src/comparative_evaluator.rs
@@ -0,0 +1,43 @@
+//! Comparative evaluation for subjective domains.
+//!
+//! Unlike [`Evaluator`](crate::evaluator::Evaluator) which scores a single
+//! artifact in isolation, `ComparativeEvaluator` compares two artifacts to
+//! determine which is better. This is the core abstraction that enables
+//! autoreason-style adversarial debate evaluation.
+
+use crate::error::Result;
+use crate::types::ComparisonOutcome;
+
+/// Compares two artifacts to determine which is better.
+///
+/// Used when no objective scalar metric exists. The comparison IS the
+/// evaluation — there is no separate scoring step.
+///
+/// Each call to `compare` should follow the same procedure (same debate
+/// config, same rubric) even though the LLM outputs may vary. `compare` takes
+/// `&self`, so the evaluator is not mutated — EGRI Law 3 still holds.
+///
+/// # Context Isolation
+///
+/// Implementations must ensure that each evaluation phase (critic, reviser,
+/// synthesizer, judge) uses a fresh LLM context with no shared conversation
+/// history. This is what eliminates sycophancy and anchoring biases.
+pub trait ComparativeEvaluator {
+    /// The artifact type to compare.
+    type Artifact;
+
+    /// Compare the incumbent and candidate artifacts, returning which is better.
+    ///
+    /// - `task`: the original task description (provides context for judges)
+    /// - `incumbent`: the current best artifact
+    /// - `candidate`: the proposed replacement
+    ///
+    /// On success the [`ComparisonOutcome`] carries the winner, confidence (judge
+    /// agreement ratio), and the debate transcript for the ledger; failures are `Err`.
+    fn compare(
+        &self,
+        task: &str,
+        incumbent: &Self::Artifact,
+        candidate: &Self::Artifact,
+    ) -> Result<ComparisonOutcome>;
+}
diff --git a/autoany-core/src/lib.rs b/autoany-core/src/lib.rs
@@ -15,13 +15,15 @@
 //! ```
 
 pub mod budget;
+pub mod comparative_evaluator;
 pub mod constraint;
 pub mod dead_ends;
 pub mod error;
 pub mod evaluator;
 pub mod executor;
 pub mod inheritance;
 pub mod ledger;
+pub mod llm_backend;
 pub mod loop_engine;
 pub mod promotion;
 pub mod proposer;

diff --git a/autoany-core/src/llm_backend.rs b/autoany-core/src/llm_backend.rs
@@ -0,0 +1,39 @@
+//! Stateless LLM call abstraction for autoreason debate.
+//!
+//! Each call to [`LlmBackend::generate`] is independent — no conversation
+//! history is carried between calls. This enforces the context isolation
+//! that makes autoreason's adversarial debate effective against sycophancy.
+
+use crate::error::Result;
+
+/// Abstraction over LLM API calls.
+///
+/// Every invocation starts from a blank slate: no conversation history is
+/// shared between calls, which is the context isolation autoreason relies on.
+/// Each debate phase (critic, reviser, synthesizer, judge) gets a fresh context.
+///
+/// # Implementation Notes
+///
+/// Implementors should:
+/// - Never maintain conversation state between calls
+/// - Support configurable model selection for judge diversity
+/// - Handle rate limiting and retries internally
+/// - Return errors for API failures (do not silently fall back)
+pub trait LlmBackend: Send + Sync {
+    /// Produce a completion for the given system prompt and user message.
+    ///
+    /// Calls are independent of one another — no shared conversation state.
+    fn generate(&self, system: &str, user: &str) -> Result<String>;
+
+    /// Produce a completion using a specific model (for judge diversity).
+    ///
+    /// When `model_diversity` is enabled in [`DebateConfig`](crate::types::DebateConfig),
+    /// different judges can use different models to reduce correlated biases.
+    ///
+    /// The provided default discards `model` and forwards to
+    /// [`generate`](Self::generate); override it to honor the requested model.
+    fn generate_with_model(&self, model: &str, system: &str, user: &str) -> Result<String> {
+        let _unused_model = model;
+        Self::generate(self, system, user)
+    }
+}
diff --git a/autoany-core/src/selector.rs b/autoany-core/src/selector.rs
@@ -125,6 +125,15 @@ impl Selector for DefaultSelector {
                     new_state_id: None,
                 })
             }
+            PromotionPolicy::Comparative => {
+                // Comparative evaluation is handled by DebateLoop, not DefaultSelector.
+                // If this is reached, the problem is misconfigured.
+                Ok(Decision {
+                    action: Action::Escalated,
+                    reason: "comparative policy requires DebateLoop, not EgriLoop".into(),
+                    new_state_id: None,
+                })
+            }
         }
     }
 }

diff --git a/autoany-core/src/spec.rs b/autoany-core/src/spec.rs
@@ -1,6 +1,6 @@
 use serde::{Deserialize, Serialize};
 
-use crate::types::{AutonomyMode, Direction};
+use crate::types::{AutonomyMode, DebateConfig, Direction};
 
 /// Problem specification — the compiled EGRI instance definition.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -16,6 +16,12 @@ pub struct ProblemSpec {
     pub autonomy: Autonomy,
     #[serde(default)]
     pub search: Option<Search>,
+    /// Debate configuration: required when `promotion.policy` is `comparative`, else optional.
+    #[serde(default)]
+    pub debate: Option<DebateSpec>,
+    /// Task description providing context for debate evaluation.
+    #[serde(default)]
+    pub task: Option<TaskSpec>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -94,6 +100,9 @@ pub enum PromotionPolicy {
     Pareto,
     Threshold,
     HumanGate,
+    /// Autoreason debate evaluation — uses adversarial multi-agent debate
+    /// with blind judge panel instead of scalar comparison.
+    Comparative,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -114,3 +123,24 @@ pub struct Search {
 fn default_proposer() -> String {
     "llm".to_string()
 }
+
+/// Debate configuration for comparative evaluation.
+///
+/// Required when `promotion.policy` is `comparative`. Currently a thin wrapper
+/// around [`DebateConfig`]; no other problem-spec-level fields are defined here.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DebateSpec {
+    /// Core debate protocol configuration, (de)serialized inline at this level.
+    #[serde(flatten)]
+    pub config: DebateConfig,
+}
+
+/// Task description providing context for debate evaluation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TaskSpec {
+    /// The original task brief — what the artifact is supposed to accomplish (no default).
+    pub description: String,
+    /// Evaluation criteria for judges (supplements rubric); empty when omitted from the spec.
+    #[serde(default)]
+    pub criteria: Vec<String>,
+}