diff --git a/Cargo.toml b/Cargo.toml index f71801118..aca372a14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,6 +86,9 @@ lazy_static = "1.4" # Actor framework for ReAct agents ractor = "0" +# TOML deserialization (also used transitively by config) +toml = "0.8" + # Configuration file support (multi-format) config = { version = "0.14", features = [ "toml", diff --git a/crates/mofa-cli/Cargo.toml b/crates/mofa-cli/Cargo.toml index 875ae62a6..fa3120073 100644 --- a/crates/mofa-cli/Cargo.toml +++ b/crates/mofa-cli/Cargo.toml @@ -25,6 +25,7 @@ mofa-kernel = { path = "../mofa-kernel", version = "0.1", features = [ ] } mofa-runtime = { path = "../mofa-runtime", version = "0.1" } mofa-foundation = { path = "../mofa-foundation", version = "0.1" } +mofa-testing = { path = "../../tests", version = "0.1" } config.workspace = true tokio = { workspace = true } thiserror = { workspace = true } @@ -90,6 +91,7 @@ tokio-stream = "0.1" assert_cmd = "2" predicates = "3" tempfile = "3" +axum = { workspace = true } [features] default = [] diff --git a/crates/mofa-cli/src/cli.rs b/crates/mofa-cli/src/cli.rs index 7e10ad479..d55a67ec9 100644 --- a/crates/mofa-cli/src/cli.rs +++ b/crates/mofa-cli/src/cli.rs @@ -81,6 +81,40 @@ pub enum Commands { dora: bool, }, + /// Run a testing DSL case file + TestDsl { + /// TOML DSL file to execute + file: PathBuf, + + /// Optional canonical artifact file path + #[arg(long)] + artifact_out: Option, + + /// Optional report file path + #[arg(long)] + report_out: Option, + + /// Compare the current artifact against a saved baseline artifact + #[arg(long)] + baseline_in: Option, + + /// Write the current artifact to a baseline file + #[arg(long)] + baseline_out: Option, + + /// Write machine-readable comparison output (requires --baseline-in) + #[arg(long)] + comparison_out: Option, + + /// Exit non-zero when baseline comparison mismatches + #[arg(long)] + fail_on_diff: bool, + + /// Report file format + #[arg(long, value_enum, default_value_t = 
TestDslReportFormat::Json)] + report_format: TestDslReportFormat, + }, + /// Run a dora dataflow #[cfg(feature = "dora")] Dataflow { @@ -219,6 +253,12 @@ pub enum DatabaseType { Sqlite, } +#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq)] +pub enum TestDslReportFormat { + Json, + Text, +} + impl std::fmt::Display for DatabaseType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -725,6 +765,77 @@ mod tests { assert!(parsed.is_ok(), "doctor ci strict json should parse"); } + #[test] + fn test_test_dsl_parses() { + let parsed = Cli::try_parse_from(["mofa", "test-dsl", "tests/examples/simple_agent.toml"]); + assert!(parsed.is_ok(), "test-dsl command should parse"); + } + + #[test] + fn test_test_dsl_report_flags_parse() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--report-out", + "/tmp/report.json", + "--report-format", + "json", + ]); + assert!(parsed.is_ok(), "test-dsl report flags should parse"); + } + + #[test] + fn test_test_dsl_artifact_flag_parses() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--artifact-out", + "/tmp/artifact.json", + ]); + assert!(parsed.is_ok(), "test-dsl artifact flag should parse"); + } + + #[test] + fn test_test_dsl_baseline_flags_parse() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--baseline-in", + "/tmp/baseline.json", + "--baseline-out", + "/tmp/new-baseline.json", + ]); + assert!(parsed.is_ok(), "test-dsl baseline flags should parse"); + } + + #[test] + fn test_test_dsl_comparison_flag_parse() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--baseline-in", + "/tmp/baseline.json", + "--comparison-out", + "/tmp/comparison.json", + ]); + assert!(parsed.is_ok(), "test-dsl comparison flag should parse"); + } + + #[test] + fn test_test_dsl_fail_on_diff_flag_parse() { 
+ let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--fail-on-diff", + ]); + assert!(parsed.is_ok(), "test-dsl fail-on-diff flag should parse"); + } + #[test] fn test_rag_index_parses() { let parsed = Cli::try_parse_from([ diff --git a/crates/mofa-cli/src/commands/mod.rs b/crates/mofa-cli/src/commands/mod.rs index 0fb02a9d3..1b798f63f 100644 --- a/crates/mofa-cli/src/commands/mod.rs +++ b/crates/mofa-cli/src/commands/mod.rs @@ -11,5 +11,6 @@ pub mod new; pub mod plugin; pub mod rag; pub mod run; +pub mod test_dsl; pub mod session; pub mod tool; diff --git a/crates/mofa-cli/src/commands/test_dsl.rs b/crates/mofa-cli/src/commands/test_dsl.rs new file mode 100644 index 000000000..a3222595f --- /dev/null +++ b/crates/mofa-cli/src/commands/test_dsl.rs @@ -0,0 +1,213 @@ +//! `mofa test-dsl` command implementation + +use crate::CliError; +use crate::cli::TestDslReportFormat; +use crate::output::OutputFormat; +use mofa_testing::{ + AgentRunArtifact, AgentRunArtifactComparison, DslError, JsonFormatter, + ReportFormatter, + TestCaseResult, TestReport, TestStatus, TextFormatter, TestCaseDsl, + assertion_error_from_outcomes, collect_assertion_outcomes, execute_test_case, +}; +use serde::Serialize; +use serde_json::json; +use std::path::Path; + +#[derive(Debug, Serialize)] +struct TestDslSummary { + name: String, + success: bool, + output_text: Option, + duration_ms: u128, + tool_calls: Vec, + workspace_root: String, + baseline_matches: Option, +} + +/// Execute one TOML DSL test case through the testing runner. 
+pub async fn run( + path: &Path, + format: OutputFormat, + artifact_out: Option<&Path>, + report_out: Option<&Path>, + baseline_in: Option<&Path>, + baseline_out: Option<&Path>, + comparison_out: Option<&Path>, + fail_on_diff: bool, + report_format: TestDslReportFormat, +) -> Result<(), CliError> { + let case = TestCaseDsl::from_toml_file(path).map_err(map_dsl_error)?; + let result = execute_test_case(&case).await.map_err(map_dsl_error)?; + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions.clone()); + let report = build_report(&artifact); + let baseline = if let Some(baseline_in) = baseline_in { + Some(read_artifact(baseline_in)?) + } else { + None + }; + let baseline_diff = baseline.as_ref().map(|baseline| artifact.compare_to(baseline)); + + if let Some(artifact_out) = artifact_out { + write_artifact(artifact_out, &artifact)?; + } + + if let Some(baseline_out) = baseline_out { + write_artifact(baseline_out, &artifact)?; + } + + if let Some(comparison_out) = comparison_out { + let baseline = baseline.as_ref().ok_or_else(|| { + CliError::Other("comparison output requires --baseline-in".to_string()) + })?; + let diff = baseline_diff.as_ref().ok_or_else(|| { + CliError::Other("comparison output requires --baseline-in".to_string()) + })?; + let comparison = AgentRunArtifactComparison::from_artifacts( + &artifact, + baseline, + diff.clone(), + ); + write_comparison(comparison_out, &comparison)?; + } + + if let Some(report_out) = report_out { + write_report(report_out, report_format, &report)?; + } + + let summary = TestDslSummary { + name: case.name, + success: result.is_success(), + output_text: result.output_text(), + duration_ms: result.duration.as_millis(), + tool_calls: result + .metadata + .tool_calls + .iter() + .map(|record| record.tool_name.clone()) + .collect(), + workspace_root: result.metadata.workspace_root.display().to_string(), + baseline_matches: 
baseline_diff.as_ref().map(|diff| diff.matches), + }; + + match format { + OutputFormat::Json => { + let output = json!({ + "success": true, + "case": summary, + "baseline": baseline_diff, + }); + println!("{}", serde_json::to_string_pretty(&output)?); + } + _ => { + println!("case: {}", summary.name); + println!("status: {}", if summary.success { "passed" } else { "failed" }); + if let Some(output_text) = &summary.output_text { + println!("output: {}", output_text); + } + if !summary.tool_calls.is_empty() { + println!("tool_calls: {}", summary.tool_calls.join(", ")); + } + println!("duration_ms: {}", summary.duration_ms); + if let Some(diff) = &baseline_diff { + println!("baseline: {}", if diff.matches { "matched" } else { "mismatch" }); + for difference in &diff.differences { + println!("difference: {}", difference.field); + } + } + } + } + + if fail_on_diff { + if let Some(diff) = &baseline_diff { + if !diff.matches { + return Err(CliError::Other("baseline comparison mismatch".to_string())); + } + } + } + + if let Some(error) = assertion_error_from_outcomes(&assertions) { + return Err(map_dsl_error(error)); + } + + Ok(()) +} + +fn build_report(artifact: &AgentRunArtifact) -> TestReport { + let status = if artifact.status == "passed" { + TestStatus::Passed + } else { + TestStatus::Failed + }; + let error = artifact + .runner_error + .clone() + .or_else(|| { + artifact + .assertions + .iter() + .find(|item| !item.passed) + .map(|item| format!("assertion failed: {}", item.kind)) + }); + let metadata = vec![ + ( + "execution_id".to_string(), + artifact.execution_id.clone(), + ), + ( + "workspace_root".to_string(), + artifact.workspace_root.clone(), + ), + ( + "tool_calls".to_string(), + artifact.tool_calls.len().to_string(), + ), + ]; + + TestReport { + suite_name: "dsl".to_string(), + results: vec![TestCaseResult { + name: artifact.case_name.clone(), + status, + duration: std::time::Duration::from_millis(artifact.duration_ms), + error, + metadata, + }], + 
total_duration: std::time::Duration::from_millis(artifact.duration_ms), + timestamp: artifact.started_at_ms, + } +} + +fn write_artifact(path: &Path, artifact: &AgentRunArtifact) -> Result<(), CliError> { + let body = serde_json::to_string_pretty(artifact)?; + std::fs::write(path, body)?; + Ok(()) +} + +fn write_comparison( + path: &Path, + comparison: &AgentRunArtifactComparison, +) -> Result<(), CliError> { + // Emit machine readable baseline comparison output. + let body = serde_json::to_string_pretty(comparison)?; + std::fs::write(path, body)?; + Ok(()) +} + +fn read_artifact(path: &Path) -> Result { + let body = std::fs::read_to_string(path)?; + Ok(serde_json::from_str(&body)?) +} + +fn write_report(path: &Path, format: TestDslReportFormat, report: &TestReport) -> Result<(), CliError> { + let body = match format { + TestDslReportFormat::Json => JsonFormatter.format(report), + TestDslReportFormat::Text => TextFormatter.format(report), + }; + std::fs::write(path, body)?; + Ok(()) +} + +fn map_dsl_error(error: DslError) -> CliError { + CliError::Other(format!("DSL test failed: {error}")) +} diff --git a/crates/mofa-cli/src/main.rs b/crates/mofa-cli/src/main.rs index 749fe56f7..1ef811cb2 100644 --- a/crates/mofa-cli/src/main.rs +++ b/crates/mofa-cli/src/main.rs @@ -75,6 +75,7 @@ fn main() { async fn run_command(cli: Cli) -> CliResult<()> { use cli::Commands; + let output_format = cli.output_format.unwrap_or_default(); // Initialize context for commands that need backend services let needs_context = matches!( @@ -121,6 +122,32 @@ async fn run_command(cli: Cli) -> CliResult<()> { commands::run::run(&config, dora)?; } + Some(Commands::TestDsl { + file, + artifact_out, + report_out, + baseline_in, + baseline_out, + comparison_out, + fail_on_diff, + report_format, + }) => { + commands::test_dsl::run( + &file, + output_format, + artifact_out.as_deref(), + report_out.as_deref(), + baseline_in.as_deref(), + baseline_out.as_deref(), + comparison_out.as_deref(), + 
fail_on_diff, + report_format, + ) + .await + .into_report() + .attach_with(|| format!("running DSL test case from {}", file.display()))?; + } + #[cfg(feature = "dora")] Some(Commands::Dataflow { file, uv }) => { commands::run::run_dataflow(&file, uv)?; diff --git a/crates/mofa-cli/tests/test_dsl_integration_tests.rs b/crates/mofa-cli/tests/test_dsl_integration_tests.rs new file mode 100644 index 000000000..7a7f02483 --- /dev/null +++ b/crates/mofa-cli/tests/test_dsl_integration_tests.rs @@ -0,0 +1,357 @@ +//! Integration tests for `mofa test-dsl`. + +use axum::{Json, Router, routing::post}; +use assert_cmd::Command; +use predicates::prelude::*; +use serde_json::json; +use tempfile::tempdir; + +#[test] +fn test_dsl_command_runs_example_case() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["test-dsl", case_path]) + .assert() + .success() + .stdout(predicate::str::contains("status: passed")) + .stdout(predicate::str::contains("output: hello from DSL")); +} + +#[test] +fn test_dsl_command_emits_json() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["--output-format", "json", "test-dsl", case_path]) + .assert() + .success() + .stdout(predicate::str::contains("\"success\": true")) + .stdout(predicate::str::contains("\"tool_calls\"")) + .stdout(predicate::str::contains("\"echo_tool\"")); +} + +#[test] +fn test_dsl_command_runs_tape_backed_case() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent_tape.toml" + ); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["test-dsl", case_path]) + .assert() + .success() + .stdout(predicate::str::contains("status: passed")) + .stdout(predicate::str::contains("output: hello from tape")); +} + +#[test] +fn 
test_dsl_command_writes_json_report_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let report_path = temp.path().join("dsl-report.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--report-out", + report_path.to_str().expect("utf8 report path"), + "--report-format", + "json", + ]) + .assert() + .success(); + + let report = std::fs::read_to_string(&report_path).expect("report file exists"); + assert!(report.contains("\"suite\": \"dsl\"")); + assert!(report.contains("\"name\": \"simple_agent_run\"")); + assert!(report.contains("\"status\": \"passed\"")); +} + +#[test] +fn test_dsl_command_writes_text_report_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let report_path = temp.path().join("dsl-report.txt"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--report-out", + report_path.to_str().expect("utf8 report path"), + "--report-format", + "text", + ]) + .assert() + .success(); + + let report = std::fs::read_to_string(&report_path).expect("report file exists"); + assert!(report.contains("=== dsl ===")); + assert!(report.contains("tool_agent_run")); + assert!(report.contains("[+]")); +} + +#[test] +fn test_dsl_command_writes_canonical_artifact_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let artifact_path = temp.path().join("dsl-artifact.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--artifact-out", + artifact_path.to_str().expect("utf8 artifact path"), + ]) + .assert() + .success(); + + let artifact = std::fs::read_to_string(&artifact_path).expect("artifact file exists"); + 
assert!(artifact.contains("\"case_name\": \"tool_agent_run\"")); + assert!(artifact.contains("\"status\": \"passed\"")); + assert!(artifact.contains("\"assertions\"")); + assert!(artifact.contains("\"tool_calls\"")); +} + +#[test] +fn test_dsl_command_writes_baseline_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-out", + baseline_path.to_str().expect("utf8 baseline path"), + ]) + .assert() + .success(); + + let baseline = std::fs::read_to_string(&baseline_path).expect("baseline file exists"); + assert!(baseline.contains("\"case_name\": \"simple_agent_run\"")); + assert!(baseline.contains("\"status\": \"passed\"")); +} + +#[test] +fn test_dsl_command_reports_baseline_mismatch() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + + std::fs::write( + &baseline_path, + r#"{ + "case_name": "tool_agent_run", + "status": "passed", + "output_text": "Baseline output", + "runner_error": null, + "duration_ms": 0, + "started_at_ms": 0, + "execution_id": "baseline-exec", + "session_id": "baseline-session", + "workspace_root": "/tmp/baseline", + "agent": { "id": "baseline-agent", "name": "baseline" }, + "assertions": [{ "kind": "contains", "expected": "Baseline output", "actual": "Baseline output", "passed": true }], + "tool_calls": [], + "llm_request": null, + "llm_response": null, + "session_snapshot": null, + "workspace_before": { "files": [] }, + "workspace_after": { "files": [] } +}"#, + ) + .expect("baseline fixture written"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-in", + 
baseline_path.to_str().expect("utf8 baseline path"), + ]) + .assert() + .success() + .stdout(predicate::str::contains("baseline: mismatch")) + .stdout(predicate::str::contains("difference: output_text")); +} + +#[test] +fn test_dsl_command_writes_comparison_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + let comparison_path = temp.path().join("dsl-comparison.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-out", + baseline_path.to_str().expect("utf8 baseline path"), + ]) + .assert() + .success(); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-in", + baseline_path.to_str().expect("utf8 baseline path"), + "--comparison-out", + comparison_path.to_str().expect("utf8 comparison path"), + ]) + .assert() + .success(); + + let comparison = std::fs::read_to_string(&comparison_path).expect("comparison file exists"); + assert!(comparison.contains("\"case_name\": \"simple_agent_run\"")); + assert!(comparison.contains("\"matches\": true")); +} + +#[test] +fn test_dsl_command_fails_on_baseline_mismatch_when_flag_set() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + + std::fs::write( + &baseline_path, + r#"{ + "case_name": "tool_agent_run", + "status": "passed", + "output_text": "Baseline output", + "runner_error": null, + "duration_ms": 0, + "started_at_ms": 0, + "execution_id": "baseline-exec", + "session_id": "baseline-session", + "workspace_root": "/tmp/baseline", + "agent": { "id": "baseline-agent", "name": "baseline" }, + "assertions": [{ "kind": "contains", "expected": "Baseline output", "actual": "Baseline output", 
"passed": true }], + "tool_calls": [], + "llm_request": null, + "llm_response": null, + "session_snapshot": null, + "workspace_before": { "files": [] }, + "workspace_after": { "files": [] } +}"#, + ) + .expect("baseline fixture written"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-in", + baseline_path.to_str().expect("utf8 baseline path"), + "--fail-on-diff", + ]) + .assert() + .failure(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_dsl_command_records_tape_from_live_provider() { + fn toml_string(path: &std::path::Path) -> String { + path.display().to_string().replace('\\', "\\\\") + } + + async fn completions(Json(_request): Json) -> Json { + Json(json!({ + "choices": [ + { + "message": { + "role": "assistant", + "content": "hello from live provider" + } + } + ], + "usage": { + "prompt_tokens": 3, + "completion_tokens": 4, + "total_tokens": 7 + } + })) + } + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0") + .await + .expect("bind mock server"); + let address = listener.local_addr().expect("local addr"); + let server = tokio::spawn(async move { + axum::serve(listener, Router::new().route("/v1/chat/completions", post(completions))) + .await + .expect("mock server should run"); + }); + + let temp = tempdir().expect("temp dir"); + let tape_path = temp.path().join("recorded.tape.json"); + let case_path = temp.path().join("record_case.toml"); + std::fs::write( + &case_path, + format!( + "name = \"record_case\"\nprompt = \"hello\"\n\n[llm]\nrecord_tape = \"{}\"\n\n[llm.provider]\nkind = \"open_ai_compatible\"\nbase_url = \"http://{}/v1\"\nmodel = \"mock-model\"\n", + toml_string(&tape_path), + address + ), + ) + .expect("record case written"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["test-dsl", case_path.to_str().expect("utf8 case path")]) + .assert() + .success() + .stdout(predicate::str::contains("output: hello from live provider")); + + let tape = 
std::fs::read_to_string(&tape_path).expect("tape file exists"); + assert!(tape.contains("\"case_name\": \"record_case\"")); + assert!(tape.contains("\"response\": \"hello from live provider\"")); + + server.abort(); +} diff --git a/crates/mofa-foundation/src/agent/context/prompt.rs b/crates/mofa-foundation/src/agent/context/prompt.rs index 04ec30bd6..87cc8a127 100644 --- a/crates/mofa-foundation/src/agent/context/prompt.rs +++ b/crates/mofa-foundation/src/agent/context/prompt.rs @@ -142,6 +142,17 @@ impl PromptContext { self } + /// Replace the agent identity. + pub fn set_identity(&mut self, identity: AgentIdentity) { + self.agent_name = identity.name.clone(); + self.identity = identity; + } + + /// Replace the bootstrap file list. + pub fn set_bootstrap_files(&mut self, files: Vec) { + self.bootstrap_files = files; + } + /// Set skills that should always be loaded pub fn with_always_load(mut self, skills: Vec) -> Self { self.always_load = skills; diff --git a/crates/mofa-foundation/src/agent/executor.rs b/crates/mofa-foundation/src/agent/executor.rs index 605ab8fe0..15dca5065 100644 --- a/crates/mofa-foundation/src/agent/executor.rs +++ b/crates/mofa-foundation/src/agent/executor.rs @@ -548,6 +548,15 @@ impl AgentExecutor { &self.config } + /// Update the prompt context (system prompt builder). 
+ pub async fn update_prompt_context(&self, updater: F) + where + F: FnOnce(&mut PromptContext), + { + let mut ctx = self.context.write().await; + updater(&mut ctx); + } + /// Get mutable reference to base agent pub fn base_mut(&mut self) -> &mut BaseAgent { &mut self.base @@ -586,7 +595,9 @@ impl MoFAAgent for AgentExecutor { self.base.initialize(ctx).await?; // Additional executor-specific initialization - self.base.transition_to(AgentState::Ready)?; + if self.base.state() != AgentState::Ready { + self.base.transition_to(AgentState::Ready)?; + } Ok(()) } @@ -643,7 +654,10 @@ mod tests { "mock" } - async fn chat(&self, _request: ChatCompletionRequest) -> AgentResult { + async fn chat( + &self, + _request: ChatCompletionRequest, + ) -> AgentResult { Ok(ChatCompletionResponse { content: Some("ok".to_string()), tool_calls: Some(Vec::::new()), diff --git a/crates/mofa-runtime/src/runner.rs b/crates/mofa-runtime/src/runner.rs index 91ca5aaff..09a856636 100644 --- a/crates/mofa-runtime/src/runner.rs +++ b/crates/mofa-runtime/src/runner.rs @@ -349,6 +349,11 @@ impl AgentRunner { &self.context } + /// Update session ID in the execution context. 
+ pub fn set_session_id(&mut self, session_id: Option) { + self.context.session_id = session_id; + } + /// 获取运行器状态 /// Get runner state pub async fn state(&self) -> RunnerState { diff --git a/examples/Cargo.toml b/examples/Cargo.toml index f952ce045..3da476607 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -1,6 +1,9 @@ [workspace] resolver = "3" members = [ + "agent_runner_basic", + "agent_runner_custom_session", + "agent_runner_tools", "cli_production_smoke", "cli_agent_logs_demo", "cli_plugin_lifecycle", diff --git a/examples/agent_runner_basic/Cargo.toml b/examples/agent_runner_basic/Cargo.toml new file mode 100644 index 000000000..e49ef8dea --- /dev/null +++ b/examples/agent_runner_basic/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "agent_runner_basic" +version.workspace = true +edition.workspace = true + +[dependencies] +anyhow.workspace = true +mofa-testing = { path = "../../tests" } +tokio.workspace = true diff --git a/examples/agent_runner_basic/src/main.rs b/examples/agent_runner_basic/src/main.rs new file mode 100644 index 000000000..f49a93ee7 --- /dev/null +++ b/examples/agent_runner_basic/src/main.rs @@ -0,0 +1,29 @@ +use anyhow::Result; +use mofa_testing::AgentTestRunner; + +#[tokio::main] +async fn main() -> Result<()> { + let mut runner = AgentTestRunner::new().await?; + runner.mock_llm().add_response("Hello from the runner").await; + + let result = runner.run_text("hi").await?; + println!("Output: {}", result.output_text().unwrap_or_default()); + println!( + "Session: {}", + result + .metadata + .session_id + .as_deref() + .unwrap_or("") + ); + println!("Workspace: {}", result.metadata.workspace_root.display()); + println!( + "Runner stats: total={} success={} failed={}", + result.metadata.runner_stats_after.total_executions, + result.metadata.runner_stats_after.successful_executions, + result.metadata.runner_stats_after.failed_executions + ); + + runner.shutdown().await?; + Ok(()) +} diff --git 
a/examples/agent_runner_custom_session/Cargo.toml b/examples/agent_runner_custom_session/Cargo.toml new file mode 100644 index 000000000..52620ec07 --- /dev/null +++ b/examples/agent_runner_custom_session/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "agent_runner_custom_session" +version.workspace = true +edition.workspace = true + +[dependencies] +anyhow.workspace = true +mofa-testing = { path = "../../tests" } +mofa-foundation = { path = "../../crates/mofa-foundation" } +tokio.workspace = true diff --git a/examples/agent_runner_custom_session/src/main.rs b/examples/agent_runner_custom_session/src/main.rs new file mode 100644 index 000000000..fc03881b8 --- /dev/null +++ b/examples/agent_runner_custom_session/src/main.rs @@ -0,0 +1,42 @@ +use anyhow::Result; +use mofa_foundation::agent::context::prompt::AgentIdentity; +use mofa_testing::AgentTestRunner; + +#[tokio::main] +async fn main() -> Result<()> { + let mut runner = AgentTestRunner::new().await?; + + runner.write_bootstrap_file("CUSTOM.md", "Custom bootstrap content.")?; + runner + .configure_prompt( + Some(AgentIdentity { + name: "RunnerDemo".to_string(), + description: "Custom identity for example runs".to_string(), + icon: None, + }), + Some(vec!["CUSTOM.md".to_string()]), + ) + .await; + + runner + .mock_llm() + .add_response("Custom session response") + .await; + + let result = runner + .run_text_with_session("demo-session", "hello session") + .await?; + + println!( + "Session id: {}", + result + .metadata + .session_id + .as_deref() + .unwrap_or("") + ); + println!("Output: {}", result.output_text().unwrap_or_default()); + + runner.shutdown().await?; + Ok(()) +} diff --git a/examples/agent_runner_tools/Cargo.toml b/examples/agent_runner_tools/Cargo.toml new file mode 100644 index 000000000..511578074 --- /dev/null +++ b/examples/agent_runner_tools/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "agent_runner_tools" +version.workspace = true +edition.workspace = true + +[dependencies] +anyhow.workspace 
= true +mofa-testing = { path = "../../tests" } +serde_json.workspace = true +tokio.workspace = true diff --git a/examples/agent_runner_tools/src/main.rs b/examples/agent_runner_tools/src/main.rs new file mode 100644 index 000000000..7399abf8b --- /dev/null +++ b/examples/agent_runner_tools/src/main.rs @@ -0,0 +1,47 @@ +use anyhow::Result; +use mofa_testing::{AgentTestRunner, MockTool}; +use serde_json::json; + +#[tokio::main] +async fn main() -> Result<()> { + let mut runner = AgentTestRunner::new().await?; + + let tool = MockTool::new( + "echo_tool", + "Echo the provided input", + json!({ + "type": "object", + "properties": { + "input": { "type": "string" } + }, + "required": ["input"] + }), + ); + + runner.register_mock_tool(tool).await?; + + runner + .mock_llm() + .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None) + .await; + runner + .mock_llm() + .add_response("Tool response completed") + .await; + + let result = runner.run_text("use the tool").await?; + println!("Output: {}", result.output_text().unwrap_or_default()); + + for record in &result.metadata.tool_calls { + println!( + "Tool call: name={} input={} output={} duration_ms={:?}", + record.tool_name, + record.input, + record.output.as_ref().unwrap_or(&serde_json::Value::Null), + record.duration_ms + ); + } + + runner.shutdown().await?; + Ok(()) +} diff --git a/tests/Cargo.toml b/tests/Cargo.toml index a0597654f..72f369eb3 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -9,10 +9,19 @@ description = "Testing utilities for the MoFA agent framework" [dependencies] mofa-kernel = { path = "../crates/mofa-kernel" } mofa-foundation = { path = "../crates/mofa-foundation" } +mofa-runtime = { path = "../crates/mofa-runtime" } tokio = { workspace = true } async-trait = { workspace = true } anyhow = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } chrono = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } regex = { 
workspace = true } +toml = { workspace = true } +tracing = { workspace = true } +reqwest = { workspace = true, features = ["json", "rustls-tls"] } + +[dev-dependencies] +axum = { workspace = true } diff --git a/tests/examples/bootstrap_agent.toml b/tests/examples/bootstrap_agent.toml new file mode 100644 index 000000000..9dd75291c --- /dev/null +++ b/tests/examples/bootstrap_agent.toml @@ -0,0 +1,13 @@ +name = "bootstrap_agent_run" +prompt = "What file was loaded?" +expected_text = "Bootstrapped" + +[[bootstrap_files]] +path = "AGENTS.md" +content = "Bootstrapped instructions for the DSL test." + +[llm] +responses = ["Bootstrapped response"] + +[assert] +contains = "Bootstrapped" \ No newline at end of file diff --git a/tests/examples/record_case_gpt_oss_120b.toml b/tests/examples/record_case_gpt_oss_120b.toml new file mode 100644 index 000000000..cb313bc97 --- /dev/null +++ b/tests/examples/record_case_gpt_oss_120b.toml @@ -0,0 +1,11 @@ +name = "simple_agent_record" +prompt = "hello" + +[llm] +record_tape = "../fixtures/simple_agent_recorded.tape.json" + +[llm.provider] +kind = "open_ai_compatible" +base_url = "https://api.groq.com/openai/v1" +model = "openai/gpt-oss-120b" +api_key_env = "GROQ_API_KEY" diff --git a/tests/examples/simple_agent.toml b/tests/examples/simple_agent.toml new file mode 100644 index 000000000..dc673d88a --- /dev/null +++ b/tests/examples/simple_agent.toml @@ -0,0 +1,9 @@ +name = "simple_agent_run" +prompt = "Say hello" +expected_text = "hello" + +[llm] +responses = ["hello from DSL"] + +[assert] +contains = "hello" diff --git a/tests/examples/simple_agent_llama_record.toml b/tests/examples/simple_agent_llama_record.toml new file mode 100644 index 000000000..3fedfbe8b --- /dev/null +++ b/tests/examples/simple_agent_llama_record.toml @@ -0,0 +1,11 @@ +name = "simple_agent_llama_record" +prompt = "hello" + +[llm] +record_tape = "../fixtures/simple_agent_recorded.tape.json" + +[llm.provider] +kind = "open_ai_compatible" +base_url = 
"https://api.groq.com/openai/v1" +model = "llama-3.3-70b-versatile" +api_key_env = "GROQ_API_KEY" diff --git a/tests/examples/simple_agent_tape.toml b/tests/examples/simple_agent_tape.toml new file mode 100644 index 000000000..02e65a793 --- /dev/null +++ b/tests/examples/simple_agent_tape.toml @@ -0,0 +1,8 @@ +name = "simple_agent_run" +prompt = "hello" + +[llm] +tape = "../fixtures/simple_agent.tape.json" + +[assert] +contains = "hello from tape" diff --git a/tests/examples/tool_agent.toml b/tests/examples/tool_agent.toml new file mode 100644 index 000000000..81be62550 --- /dev/null +++ b/tests/examples/tool_agent.toml @@ -0,0 +1,27 @@ +name = "tool_agent_run" +input = "Use the echo tool and summarize the result." + +[agent] +name = "ToolAgent" +description = "Agent used to validate tool-aware DSL execution." + +[[tools]] +name = "echo_tool" +description = "Echo the provided input." +schema = { type = "object", properties = { input = { type = "string" } }, required = ["input"] } +result = "echoed from tool" + +[assert] +contains = "Tool execution complete" +tool_called = "echo_tool" + +[llm] + +[[llm.steps]] +type = "tool_call" +tool = "echo_tool" +arguments = { input = "ping" } + +[[llm.steps]] +type = "text" +content = "Tool execution complete" diff --git a/tests/fixtures/simple_agent.tape.json b/tests/fixtures/simple_agent.tape.json new file mode 100644 index 000000000..a6a0e204a --- /dev/null +++ b/tests/fixtures/simple_agent.tape.json @@ -0,0 +1,7 @@ +{ + "version": 1, + "case_name": "simple_agent_run", + "interactions": [ + { "response": "hello from tape" } + ] +} diff --git a/tests/fixtures/simple_agent_recorded.tape.json b/tests/fixtures/simple_agent_recorded.tape.json new file mode 100644 index 000000000..cd203ee4e --- /dev/null +++ b/tests/fixtures/simple_agent_recorded.tape.json @@ -0,0 +1,35 @@ +{ + "version": 1, + "case_name": "simple_agent_record", + "interactions": [ + { + "request": { + "messages": [ + { + "role": "system", + "content": "# Agent A 
helpful AI assistant\n\nYou are Agent, A helpful AI assistant.\n\n## Current Time\n2026-04-04 14:22 (Saturday)\n\n## Workspace\nYour workspace is at: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb\n- Memory files: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/memory/MEMORY.md\n- Daily notes: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/memory/YYYY-MM-DD.md\n- Custom skills: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/skills/{{skill-name}}/SKILL.md\n\nAlways be helpful, accurate, and concise. When using tools, explain what you're doing.\nWhen remembering something, write to /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/memory/MEMORY.md", + "tool_call_id": null, + "tool_calls": [] + }, + { + "role": "user", + "content": "hello", + "tool_call_id": null, + "tool_calls": [] + } + ], + "model": null, + "temperature": null, + "max_tokens": null, + "tool_names": [] + }, + "response": "Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?", + "tool_calls": [], + "usage": { + "prompt_tokens": 284, + "completion_tokens": 25, + "total_tokens": 309 + } + } + ] +} \ No newline at end of file diff --git a/tests/src/agent_runner.rs b/tests/src/agent_runner.rs new file mode 100644 index 000000000..a59975264 --- /dev/null +++ b/tests/src/agent_runner.rs @@ -0,0 +1,685 @@ +//! Real agent runner harness for integration-style tests. +//! +//! Provides a lightweight wrapper around the MoFA runtime `AgentRunner` +//! with an isolated workspace and deterministic mock LLM. 
+ +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use mofa_foundation::agent::context::prompt::AgentIdentity; +use mofa_foundation::agent::executor::{AgentExecutor, AgentExecutorConfig}; +use mofa_kernel::agent::context::AgentContext; +use mofa_kernel::agent::core::MoFAAgent; +use mofa_kernel::agent::error::{AgentError, AgentResult}; +use mofa_foundation::agent::components::tool::as_tool; +use mofa_foundation::agent::components::tool::SimpleTool; +use mofa_foundation::agent::session::{JsonlSessionStorage, Session, SessionStorage}; +use crate::tools::MockTool; +use mofa_kernel::agent::types::{AgentInput, AgentOutput, ChatCompletionRequest}; +use mofa_kernel::agent::types::{ChatCompletionResponse, ToolCall}; +use mofa_kernel::agent::AgentCapabilities; +use mofa_kernel::agent::AgentState; +use mofa_runtime::runner::{AgentRunner, RunnerState, RunnerStats}; +use crate::replay::{RecordingLLMProvider, ReplayLLMProvider, Tape}; +use std::collections::VecDeque; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use thiserror::Error; +use tokio::sync::RwLock; +use uuid::Uuid; + +/// Errors returned by the agent runner harness itself. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum AgentRunnerError { + #[error("failed to create test workspace: {0}")] + WorkspaceIo(#[from] std::io::Error), + + #[error("agent runner failure: {0}")] + Agent(#[from] AgentError), +} + +/// Metadata captured for each run. 
+#[derive(Debug, Clone)] +#[non_exhaustive] +pub struct AgentRunMetadata { + pub agent_id: String, + pub agent_name: String, + pub execution_id: String, + pub session_id: Option, + pub workspace_root: PathBuf, + pub runner_state_before: RunnerState, + pub runner_state_after: RunnerState, + pub runner_stats_before: RunnerStats, + pub runner_stats_after: RunnerStats, + pub agent_state_before: AgentState, + pub agent_state_after: AgentState, + pub started_at: DateTime, + pub session_snapshot: Option, + pub tool_calls: Vec, + pub llm_last_request: Option, + pub llm_last_response: Option, + pub workspace_snapshot_before: WorkspaceSnapshot, + pub workspace_snapshot_after: WorkspaceSnapshot, +} + +/// Result of a single agent run. +#[derive(Debug)] +#[non_exhaustive] +pub struct AgentRunResult { + pub output: Option, + pub error: Option, + pub duration: Duration, + pub metadata: AgentRunMetadata, +} + +/// Captures a tool call with its input and output. +#[derive(Debug, Clone)] +pub struct ToolCallRecord { + pub tool_name: String, + pub input: serde_json::Value, + pub output: Option, + pub success: bool, + pub duration_ms: Option, + pub timed_out: bool, +} + +/// Snapshot of files in the test workspace. +#[derive(Debug, Clone)] +pub struct WorkspaceSnapshot { + pub files: Vec, +} + +#[derive(Debug, Clone)] +pub struct WorkspaceFileSnapshot { + pub relative_path: String, + pub size_bytes: u64, + pub modified_ms: Option, + pub checksum: u64, +} + +impl AgentRunResult { + pub fn is_success(&self) -> bool { + self.error.is_none() + } + + pub fn output_text(&self) -> Option { + self.output.as_ref().map(AgentOutput::to_text) + } +} + +/// Simple deterministic LLM provider for tests. 
+#[derive(Debug)] +pub struct MockAgentLLMProvider { + name: String, + responses: RwLock>, + default_response: RwLock, + last_request: RwLock>, + last_response: RwLock>, +} + +#[derive(Debug, Clone)] +enum MockLlmResponse { + Text(String), + ToolCall { + content: Option, + tool_calls: Vec, + }, + Error(String), +} + +impl MockAgentLLMProvider { + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + responses: RwLock::new(VecDeque::new()), + default_response: RwLock::new("This is a mock response.".to_string()), + last_request: RwLock::new(None), + last_response: RwLock::new(None), + } + } + + pub async fn add_response(&self, response: impl Into) { + self.responses + .write() + .await + .push_back(MockLlmResponse::Text(response.into())); + } + + pub async fn add_tool_call_response( + &self, + tool_name: &str, + arguments: serde_json::Value, + content: Option, + ) { + let tool_call = ToolCall { + id: Uuid::now_v7().to_string(), + name: tool_name.to_string(), + arguments, + }; + self.responses.write().await.push_back(MockLlmResponse::ToolCall { + content, + tool_calls: vec![tool_call], + }); + } + + pub async fn add_error_response(&self, message: impl Into) { + self.responses + .write() + .await + .push_back(MockLlmResponse::Error(message.into())); + } + + pub async fn set_default_response(&self, response: impl Into) { + *self.default_response.write().await = response.into(); + } + + pub async fn pending_responses(&self) -> usize { + self.responses.read().await.len() + } + + pub async fn last_request(&self) -> Option { + self.last_request.read().await.clone() + } + + pub async fn last_response(&self) -> Option { + self.last_response.read().await.clone() + } +} + +#[async_trait] +impl mofa_kernel::agent::types::LLMProvider for MockAgentLLMProvider { + fn name(&self) -> &str { + &self.name + } + + async fn chat( + &self, + request: ChatCompletionRequest, + ) -> AgentResult { + *self.last_request.write().await = Some(request); + let response = { + let mut 
responses = self.responses.write().await; + if let Some(next) = responses.pop_front() { + next + } else { + MockLlmResponse::Text(self.default_response.read().await.clone()) + } + }; + + let response = match response { + MockLlmResponse::Text(content) => Ok(ChatCompletionResponse { + content: Some(content), + tool_calls: Some(Vec::::new()), + usage: None, + }), + MockLlmResponse::ToolCall { content, tool_calls } => Ok(ChatCompletionResponse { + content, + tool_calls: Some(tool_calls), + usage: None, + }), + MockLlmResponse::Error(message) => Err(AgentError::ExecutionFailed(message)), + }?; + + *self.last_response.write().await = Some(response.clone()); + Ok(response) + } +} + +struct SessionAwareExecutor { + executor: AgentExecutor, +} + +impl SessionAwareExecutor { + fn new(executor: AgentExecutor) -> Self { + Self { executor } + } + + async fn register_tool( + &self, + tool: Arc, + ) -> AgentResult<()> { + self.executor.register_tool(tool).await + } + + async fn update_prompt_context(&self, updater: F) + where + F: FnOnce(&mut mofa_foundation::agent::context::prompt::PromptContext), + { + self.executor.update_prompt_context(updater).await; + } +} + +#[async_trait] +impl MoFAAgent for SessionAwareExecutor { + fn id(&self) -> &str { + self.executor.id() + } + + fn name(&self) -> &str { + self.executor.name() + } + + fn capabilities(&self) -> &AgentCapabilities { + self.executor.capabilities() + } + + fn state(&self) -> mofa_kernel::agent::AgentState { + self.executor.state() + } + + async fn initialize(&mut self, ctx: &AgentContext) -> AgentResult<()> { + self.executor.initialize(ctx).await + } + + async fn execute( + &mut self, + input: AgentInput, + ctx: &AgentContext, + ) -> AgentResult { + let message = input.as_text().unwrap_or(""); + let session_key = ctx.session_id.as_deref().unwrap_or("default"); + let response = self.executor.process_message(session_key, message).await?; + Ok(AgentOutput::text(response)) + } + + async fn shutdown(&mut self) -> 
AgentResult<()> { + self.executor.shutdown().await + } +} + +struct TempWorkspace { + root: PathBuf, +} + +impl TempWorkspace { + fn new(prefix: &str) -> Result { + let root = std::env::temp_dir().join(format!("{}-{}", prefix, Uuid::now_v7())); + std::fs::create_dir_all(&root)?; + Ok(Self { root }) + } + + fn path(&self) -> &Path { + &self.root + } + + fn write_file(&self, relative_path: &Path, content: &str) -> Result { + let path = self.root.join(relative_path); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } + std::fs::write(&path, content)?; + Ok(path) + } + + fn snapshot(&self) -> WorkspaceSnapshot { + let mut files = Vec::new(); + collect_workspace_files(&self.root, &self.root, &mut files); + files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path)); + WorkspaceSnapshot { files } + } +} + +impl Drop for TempWorkspace { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.root); + } +} + +fn collect_workspace_files(root: &Path, current: &Path, files: &mut Vec) { + let entries = match std::fs::read_dir(current) { + Ok(entries) => entries, + Err(_) => return, + }; + + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + collect_workspace_files(root, &path, files); + continue; + } + + let metadata = match entry.metadata() { + Ok(metadata) => metadata, + Err(_) => continue, + }; + + let size_bytes = metadata.len(); + let modified_ms = metadata + .modified() + .ok() + .and_then(|time| time.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|duration| duration.as_millis() as u64); + + let bytes = match std::fs::read(&path) { + Ok(bytes) => bytes, + Err(_) => Vec::new(), + }; + let checksum = hash_bytes(&bytes); + let relative_path = path + .strip_prefix(root) + .unwrap_or(&path) + .to_string_lossy() + .replace('\\', "/"); + + files.push(WorkspaceFileSnapshot { + relative_path, + size_bytes, + modified_ms, + checksum, + }); + } +} + +fn hash_bytes(bytes: &[u8]) -> u64 { + let mut hasher = 
DefaultHasher::new(); + bytes.hash(&mut hasher); + hasher.finish() +} + +/// Test harness for running real agent execution paths. +pub struct AgentTestRunner { + workspace: TempWorkspace, + session_id: String, + execution_id: String, + llm: RunnerLlmHandle, + runner: AgentRunner, + mock_tools: Vec, +} + +enum RunnerLlmHandle { + Mock(Arc), + Recording(Arc), + Replay(Arc), +} + +impl AgentTestRunner { + pub async fn new() -> Result { + Self::with_config(AgentExecutorConfig::default()).await + } + + pub async fn with_config(config: AgentExecutorConfig) -> Result { + let workspace = TempWorkspace::new("mofa-agent-test")?; + let llm = Arc::new(MockAgentLLMProvider::new("mock-llm")); + let executor = AgentExecutor::with_config(llm.clone(), workspace.path(), config).await?; + Self::with_workspace_and_handle(workspace, RunnerLlmHandle::Mock(llm), executor).await + } + + // Build a runner backed by a live provider that records every interaction. + pub async fn with_recording_provider( + inner: Arc, + case_name: &str, + tape_path: impl Into, + config: AgentExecutorConfig, + ) -> Result { + let workspace = TempWorkspace::new("mofa-agent-test")?; + let llm = Arc::new(RecordingLLMProvider::new( + "recording-llm", + inner, + case_name, + tape_path, + )); + let executor = AgentExecutor::with_config(llm.clone(), workspace.path(), config).await?; + Self::with_workspace_and_handle(workspace, RunnerLlmHandle::Recording(llm), executor).await + } + + // Build a runner that serves responses from a pre-recorded tape. 
+ pub async fn with_replay_tape( + tape: Tape, + config: AgentExecutorConfig, + ) -> Result { + let workspace = TempWorkspace::new("mofa-agent-test")?; + let llm = Arc::new(ReplayLLMProvider::new("replay-llm", tape)); + let executor = AgentExecutor::with_config(llm.clone(), workspace.path(), config).await?; + Self::with_workspace_and_handle(workspace, RunnerLlmHandle::Replay(llm), executor).await + } + + // Final assembly step that wires up the workspace, LLM handle, and executor. + async fn with_workspace_and_handle( + workspace: TempWorkspace, + llm: RunnerLlmHandle, + executor: AgentExecutor, + ) -> Result { + let agent = SessionAwareExecutor::new(executor); + + let execution_id = Uuid::now_v7().to_string(); + let session_id = Uuid::now_v7().to_string(); + let context = AgentContext::with_session(&execution_id, &session_id); + + let runner = AgentRunner::with_context(agent, context).await?; + + Ok(Self { + workspace, + session_id, + execution_id, + llm, + runner, + mock_tools: Vec::new(), + }) + } + + pub fn workspace(&self) -> &Path { + self.workspace.path() + } + + pub fn session_id(&self) -> &str { + &self.session_id + } + + pub fn execution_id(&self) -> &str { + &self.execution_id + } + + pub fn mock_llm(&self) -> Arc { + match &self.llm { + RunnerLlmHandle::Mock(provider) => Arc::clone(provider), + RunnerLlmHandle::Recording(_) | RunnerLlmHandle::Replay(_) => { + panic!("mock_llm is only available for mock-backed runners") + } + } + } + + pub fn write_bootstrap_file( + &self, + filename: &str, + content: &str, + ) -> Result { + self.workspace.write_file(Path::new(filename), content) + } + + pub fn write_workspace_file( + &self, + relative_path: impl AsRef, + content: &str, + ) -> Result { + self.workspace.write_file(relative_path.as_ref(), content) + } + + pub async fn register_simple_tool(&self, tool: T) -> Result<(), AgentRunnerError> + where + T: mofa_foundation::agent::components::tool::SimpleTool + Send + Sync + 'static, + { + let tool_ref = 
as_tool(tool); + self.runner + .agent() + .register_tool(tool_ref) + .await + .map_err(AgentRunnerError::from) + } + + pub async fn register_mock_tool(&mut self, tool: MockTool) -> Result<(), AgentRunnerError> { + self.register_simple_tool(tool.clone()).await?; + self.mock_tools.push(tool); + Ok(()) + } + + pub async fn configure_prompt( + &self, + identity: Option, + bootstrap_files: Option>, + ) { + self.runner + .agent() + .update_prompt_context(|ctx| { + if let Some(identity) = identity { + ctx.set_identity(identity); + } + if let Some(files) = bootstrap_files { + ctx.set_bootstrap_files(files); + } + }) + .await; + } + + pub async fn run_text(&mut self, input: &str) -> Result { + self.run_input(AgentInput::text(input)).await + } + + pub async fn run_text_with_session( + &mut self, + session_id: &str, + input: &str, + ) -> Result { + let original_session = self.runner.context().session_id.clone(); + self.runner + .set_session_id(Some(session_id.to_string())); + let result = self.run_text(input).await; + self.runner.set_session_id(original_session); + result + } + + pub async fn run_texts( + &mut self, + inputs: &[&str], + ) -> Result, AgentRunnerError> { + let mut results = Vec::with_capacity(inputs.len()); + for input in inputs { + results.push(self.run_text(input).await?); + } + Ok(results) + } + + pub async fn run_input( + &mut self, + input: AgentInput, + ) -> Result { + let started_at = Utc::now(); + let runner_state_before = self.runner.state().await; + let runner_stats_before = self.runner.stats().await; + let agent_state_before = self.runner.agent_state(); + let workspace_snapshot_before = self.workspace.snapshot(); + let timer = Instant::now(); + let result = self.runner.execute(input).await; + let duration = timer.elapsed(); + let runner_state_after = self.runner.state().await; + let runner_stats_after = self.runner.stats().await; + let agent_state_after = self.runner.agent_state(); + let session_snapshot = self.load_session_snapshot().await; + let 
workspace_snapshot_after = self.workspace.snapshot(); + let tool_calls = self.collect_tool_calls().await; + let llm_last_request = self.last_request().await; + let llm_last_response = self.last_response().await; + + let (output, error) = match result { + Ok(output) => (Some(output), None), + Err(err) => (None, Some(err)), + }; + + let metadata = AgentRunMetadata { + agent_id: self.runner.agent().id().to_string(), + agent_name: self.runner.agent().name().to_string(), + execution_id: self.runner.context().execution_id.clone(), + session_id: self.runner.context().session_id.clone(), + workspace_root: self.workspace.path().to_path_buf(), + runner_state_before, + runner_state_after, + runner_stats_before, + runner_stats_after, + agent_state_before, + agent_state_after, + started_at, + session_snapshot, + tool_calls, + llm_last_request, + llm_last_response, + workspace_snapshot_before, + workspace_snapshot_after, + }; + + Ok(AgentRunResult { + output, + error, + duration, + metadata, + }) + } + + pub async fn shutdown(self) -> Result<(), AgentRunnerError> { + self.runner.shutdown().await?; + Ok(()) + } + + async fn load_session_snapshot(&self) -> Option { + let session_id = self.runner.context().session_id.as_deref()?; + let storage = JsonlSessionStorage::new(self.workspace.path()).await.ok()?; + storage.load(session_id).await.ok()? 
+ } + + async fn collect_tool_calls(&self) -> Vec { + let mut records = Vec::new(); + for tool in &self.mock_tools { + let calls = tool.history().await; + let results = tool.results().await; + for (idx, call) in calls.into_iter().enumerate() { + let result = results.get(idx).cloned(); + let (output, success, duration_ms, timed_out) = match result { + Some(result) => { + let duration_ms = result + .metadata + .get("duration_ms") + .and_then(|value| value.parse::().ok()); + let timed_out = result + .error + .as_ref() + .map(|err| err.contains("timed out")) + .unwrap_or(false); + ( + Some(result.output.clone()), + result.success, + duration_ms, + timed_out, + ) + } + None => (None, false, None, false), + }; + records.push(ToolCallRecord { + tool_name: tool.name().to_string(), + input: call.arguments, + output, + success, + duration_ms, + timed_out, + }); + } + } + records + } + + async fn last_request(&self) -> Option { + match &self.llm { + RunnerLlmHandle::Mock(provider) => provider.last_request().await, + RunnerLlmHandle::Recording(provider) => provider.last_request().await, + RunnerLlmHandle::Replay(provider) => provider.last_request().await, + } + } + + async fn last_response(&self) -> Option { + match &self.llm { + RunnerLlmHandle::Mock(provider) => provider.last_response().await, + RunnerLlmHandle::Recording(provider) => provider.last_response().await, + RunnerLlmHandle::Replay(provider) => provider.last_response().await, + } + } +} diff --git a/tests/src/artifact.rs b/tests/src/artifact.rs new file mode 100644 index 000000000..54416259c --- /dev/null +++ b/tests/src/artifact.rs @@ -0,0 +1,364 @@ +//! Canonical run artifacts for DSL-backed agent test execution. +//! +//! These types provide the stable, serializable output model for DSL runs, +//! built from the existing runner result. 
+ +use crate::agent_runner::{AgentRunResult, ToolCallRecord, WorkspaceFileSnapshot, WorkspaceSnapshot}; +use crate::dsl::{AssertionOutcome, TestCaseDsl}; +use mofa_foundation::agent::session::Session; +use serde::{Deserialize, Serialize}; +use serde_json::json; + +// Top-level artifact emitted for a single DSL-backed case execution. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentRunArtifact { + pub case_name: String, + pub status: String, + pub output_text: Option, + pub runner_error: Option, + pub duration_ms: u64, + pub started_at_ms: u64, + pub execution_id: String, + pub session_id: Option, + pub workspace_root: String, + pub agent: AgentArtifact, + pub assertions: Vec, + pub tool_calls: Vec, + pub llm_request: Option, + pub llm_response: Option, + pub session_snapshot: Option, + pub workspace_before: WorkspaceSnapshotArtifact, + pub workspace_after: WorkspaceSnapshotArtifact, +} + +// Compact identity data for the agent used by the run. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentArtifact { + pub id: String, + pub name: String, +} + +// Tool execution records are flattened into the artifact for downstream checks. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ToolCallArtifact { + pub tool_name: String, + pub input: serde_json::Value, + pub output: Option, + pub success: bool, + pub duration_ms: Option, + pub timed_out: bool, +} + +// LLM request/response types keep only the fields needed for stable inspection. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmRequestArtifact { + pub model: Option, + pub temperature: Option, + pub max_tokens: Option, + pub messages: Vec, + pub tool_names: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmResponseArtifact { + pub content: Option, + pub tool_calls: Vec, + pub usage: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmMessageArtifact { + pub role: String, + pub content: Option, + pub tool_call_id: Option, + pub tool_calls: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmToolCallArtifact { + pub id: String, + pub name: String, + pub arguments: serde_json::Value, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TokenUsageArtifact { + pub prompt_tokens: u32, + pub completion_tokens: u32, + pub total_tokens: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SessionArtifact { + pub messages: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SessionMessageArtifact { + pub role: String, + pub content: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WorkspaceSnapshotArtifact { + pub files: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WorkspaceFileArtifact { + pub relative_path: String, + pub size_bytes: u64, + pub modified_ms: Option, + pub checksum: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentRunArtifactDiff { + pub matches: bool, + pub differences: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentRunArtifactComparison { + // Stable comparison payload for CI and tooling. 
+ pub case_name: String, + pub matches: bool, + pub differences: Vec, + pub baseline_execution_id: String, + pub candidate_execution_id: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ArtifactDifference { + pub field: String, + pub expected: serde_json::Value, + pub actual: serde_json::Value, +} + +impl AgentRunArtifact { + // Build the canonical artifact from the current runner result plus DSL assertion outcomes. + pub fn from_run_result( + case: &TestCaseDsl, + result: &AgentRunResult, + assertions: Vec, + ) -> Self { + Self { + case_name: case.name.clone(), + status: if result.is_success() && assertions.iter().all(|item| item.passed) { + "passed".to_string() + } else { + "failed".to_string() + }, + output_text: result.output_text(), + runner_error: result.error.as_ref().map(ToString::to_string), + duration_ms: result.duration.as_millis() as u64, + started_at_ms: result.metadata.started_at.timestamp_millis() as u64, + execution_id: result.metadata.execution_id.clone(), + session_id: result.metadata.session_id.clone(), + workspace_root: result.metadata.workspace_root.display().to_string(), + agent: AgentArtifact { + id: result.metadata.agent_id.clone(), + name: result.metadata.agent_name.clone(), + }, + assertions, + tool_calls: result + .metadata + .tool_calls + .iter() + .map(tool_call_artifact) + .collect(), + llm_request: result + .metadata + .llm_last_request + .as_ref() + .map(|request| LlmRequestArtifact { + model: request.model.clone(), + temperature: request.temperature, + max_tokens: request.max_tokens, + messages: request + .messages + .iter() + .map(|message| LlmMessageArtifact { + role: message.role.clone(), + content: message.content.clone(), + tool_call_id: message.tool_call_id.clone(), + tool_calls: message + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(llm_tool_call_artifact) + .collect(), + }) + .collect(), + tool_names: request + .tools + .clone() + .unwrap_or_default() + .into_iter() + 
.map(|tool| tool.name) + .collect(), + }), + llm_response: result + .metadata + .llm_last_response + .as_ref() + .map(|response| LlmResponseArtifact { + content: response.content.clone(), + tool_calls: response + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(llm_tool_call_artifact) + .collect(), + usage: response.usage.as_ref().map(|usage| TokenUsageArtifact { + prompt_tokens: usage.prompt_tokens, + completion_tokens: usage.completion_tokens, + total_tokens: usage.total_tokens, + }), + }), + session_snapshot: result + .metadata + .session_snapshot + .as_ref() + .map(session_artifact), + workspace_before: workspace_snapshot_artifact(&result.metadata.workspace_snapshot_before), + workspace_after: workspace_snapshot_artifact(&result.metadata.workspace_snapshot_after), + } + } + + // Compare the MVP baseline fields while keeping deeper metadata out of scope for now. + pub fn compare_to(&self, baseline: &Self) -> AgentRunArtifactDiff { + let mut differences = Vec::new(); + + if self.status != baseline.status { + differences.push(ArtifactDifference { + field: "status".to_string(), + expected: json!(baseline.status), + actual: json!(self.status), + }); + } + + if self.output_text != baseline.output_text { + differences.push(ArtifactDifference { + field: "output_text".to_string(), + expected: json!(baseline.output_text), + actual: json!(self.output_text), + }); + } + + let baseline_assertions = baseline + .assertions + .iter() + .map(assertion_signature) + .collect::>(); + let actual_assertions = self + .assertions + .iter() + .map(assertion_signature) + .collect::>(); + if actual_assertions != baseline_assertions { + differences.push(ArtifactDifference { + field: "assertions".to_string(), + expected: json!(baseline_assertions), + actual: json!(actual_assertions), + }); + } + + let baseline_tools = baseline + .tool_calls + .iter() + .map(|call| call.tool_name.clone()) + .collect::>(); + let actual_tools = self + .tool_calls + .iter() + .map(|call| 
call.tool_name.clone()) + .collect::>(); + if actual_tools != baseline_tools { + differences.push(ArtifactDifference { + field: "tool_calls".to_string(), + expected: json!(baseline_tools), + actual: json!(actual_tools), + }); + } + + AgentRunArtifactDiff { + matches: differences.is_empty(), + differences, + } + } +} + +impl AgentRunArtifactComparison { + pub fn from_artifacts( + candidate: &AgentRunArtifact, + baseline: &AgentRunArtifact, + diff: AgentRunArtifactDiff, + ) -> Self { + Self { + case_name: candidate.case_name.clone(), + matches: diff.matches, + differences: diff.differences, + baseline_execution_id: baseline.execution_id.clone(), + candidate_execution_id: candidate.execution_id.clone(), + } + } +} + +fn assertion_signature(outcome: &AssertionOutcome) -> serde_json::Value { + json!({ + "kind": outcome.kind, + "passed": outcome.passed, + }) +} + +fn tool_call_artifact(record: &ToolCallRecord) -> ToolCallArtifact { + ToolCallArtifact { + tool_name: record.tool_name.clone(), + input: record.input.clone(), + output: record.output.clone(), + success: record.success, + duration_ms: record.duration_ms, + timed_out: record.timed_out, + } +} + +fn llm_tool_call_artifact(tool_call: mofa_kernel::agent::types::ToolCall) -> LlmToolCallArtifact { + LlmToolCallArtifact { + id: tool_call.id, + name: tool_call.name, + arguments: tool_call.arguments, + } +} + +// Session snapshots are reduced to ordered role/content pairs for stable comparisons. +fn session_artifact(session: &Session) -> SessionArtifact { + SessionArtifact { + messages: session + .messages + .iter() + .map(|message| SessionMessageArtifact { + role: message.role.clone(), + content: message.content.clone(), + }) + .collect(), + } +} + +// Workspace snapshots preserve a compact file-level view before and after execution. 
+fn workspace_snapshot_artifact(snapshot: &WorkspaceSnapshot) -> WorkspaceSnapshotArtifact { + WorkspaceSnapshotArtifact { + files: snapshot.files.iter().map(workspace_file_artifact).collect(), + } +} + +fn workspace_file_artifact(file: &WorkspaceFileSnapshot) -> WorkspaceFileArtifact { + WorkspaceFileArtifact { + relative_path: file.relative_path.clone(), + size_bytes: file.size_bytes, + modified_ms: file.modified_ms, + checksum: file.checksum, + } +} diff --git a/tests/src/assertions.rs b/tests/src/assertions.rs index 1a0b58594..946efe134 100644 --- a/tests/src/assertions.rs +++ b/tests/src/assertions.rs @@ -84,3 +84,173 @@ macro_rules! assert_bus_message_sent { ); }}; } + +/// Assert a session's messages match the expected (role, content) pairs. +pub fn assert_session_messages( + session: &mofa_foundation::agent::session::Session, + expected: &[(&str, &str)], +) { + assert_eq!( + session.messages.len(), + expected.len(), + "Expected {} session messages, got {}", + expected.len(), + session.messages.len() + ); + + for (idx, (role, content)) in expected.iter().enumerate() { + let msg = &session.messages[idx]; + assert_eq!( + msg.role, *role, + "Expected role '{}' at index {}, got '{}'", + role, idx, msg.role + ); + assert_eq!( + msg.content, *content, + "Expected content '{}' at index {}, got '{}'", + content, idx, msg.content + ); + } +} + +/// Assert the most recent tool result matches the expected JSON output. +/// +/// # Example +/// ```ignore +/// assert_tool_last_result!(tool, json!("done")); +/// ``` +#[macro_export] +macro_rules! assert_tool_last_result { + ($tool:expr, $expected:expr) => {{ + let result = $tool + .last_result() + .await + .expect("Expected tool to have a result, but it was never executed"); + let expected = $expected; + assert_eq!( + result.output, expected, + "Expected latest tool result {:?}, got {:?}", + expected, result.output + ); + }}; +} + +/// Assert the agent run produced the expected output text. 
+/// +/// # Example +/// ```ignore +/// assert_agent_output_text!(result, "hello"); +/// ``` +#[macro_export] +macro_rules! assert_agent_output_text { + ($result:expr, $expected:expr) => {{ + let expected = $expected; + let actual = $result.output_text(); + assert_eq!( + actual.as_deref(), + Some(expected), + "Expected agent output {:?}, got {:?}", + expected, + actual + ); + }}; +} + +/// Assert the agent run failed with an error containing the given substring. +/// +/// # Example +/// ```ignore +/// assert_run_failed_with!(result, "timeout"); +/// ``` +#[macro_export] +macro_rules! assert_run_failed_with { + ($result:expr, $pattern:expr) => {{ + let pattern = $pattern; + let error = $result + .error + .as_ref() + .expect("Expected run to fail, but it succeeded"); + let message = error.to_string(); + assert!( + message.contains(pattern), + "Expected error containing {:?}, got {:?}", + pattern, + message + ); + }}; +} + +/// Assert the workspace snapshot contains a file with the given relative path. +/// +/// # Example +/// ```ignore +/// assert_workspace_contains_file!(snapshot, "sessions/demo.jsonl"); +/// ``` +#[macro_export] +macro_rules! assert_workspace_contains_file { + ($snapshot:expr, $relative_path:expr) => {{ + let relative_path = $relative_path; + let found = $snapshot + .files + .iter() + .any(|file| file.relative_path == relative_path); + assert!( + found, + "Expected workspace snapshot to contain {:?}, found paths: {:?}", + relative_path, + $snapshot + .files + .iter() + .map(|file| file.relative_path.as_str()) + .collect::>() + ); + }}; +} + +/// Assert the run metadata captured a tool call with the given tool name. +/// +/// # Example +/// ```ignore +/// assert_run_recorded_tool_call!(result, "echo_tool"); +/// ``` +#[macro_export] +macro_rules! 
assert_run_recorded_tool_call {
+    ($result:expr, $tool_name:expr) => {{
+        let tool_name = $tool_name;
+        let found = $result
+            .metadata
+            .tool_calls
+            .iter()
+            .any(|record| record.tool_name == tool_name);
+        assert!(
+            found,
+            "Expected run metadata to contain tool call {:?}, found tool calls: {:?}",
+            tool_name,
+            $result
+                .metadata
+                .tool_calls
+                .iter()
+                .map(|record| record.tool_name.as_str())
+                .collect::<Vec<_>>()
+        );
+    }};
+}
+
+/// Assert the runner total execution count after a run matches the expected value.
+///
+/// # Example
+/// ```ignore
+/// assert_runner_total_executions!(result, 1);
+/// ```
+#[macro_export]
+macro_rules! assert_runner_total_executions {
+    ($result:expr, $expected:expr) => {{
+        let expected = $expected;
+        let actual = $result.metadata.runner_stats_after.total_executions;
+        assert_eq!(
+            actual, expected,
+            "Expected runner total executions {}, got {}",
+            expected, actual
+        );
+    }};
+}
diff --git a/tests/src/dsl.rs b/tests/src/dsl.rs
new file mode 100644
index 000000000..9ef27e830
--- /dev/null
+++ b/tests/src/dsl.rs
@@ -0,0 +1,468 @@
+//! Minimal TOML DSL support for the testing MVP.
+//!
+//! This module keeps the schema intentionally small so contributors can define
+//! simple agent tests without introducing a full DSL framework yet.
+
+use crate::agent_runner::{AgentRunResult, AgentRunnerError, AgentTestRunner};
+use crate::live_llm::{OpenAiCompatProvider, OpenAiCompatProviderConfig};
+use crate::replay::{ReplayError, Tape};
+use crate::tools::MockTool;
+use mofa_foundation::agent::context::prompt::AgentIdentity;
+use mofa_foundation::agent::executor::AgentExecutorConfig;
+use mofa_kernel::agent::components::tool::ToolResult;
+use mofa_kernel::agent::types::LLMProvider;
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum DslError {
+    #[error("failed to read DSL file: {0}")]
+    Io(#[from] std::io::Error),
+
+    #[error("failed to parse TOML DSL: {0}")]
+    Toml(#[from] toml::de::Error),
+
+    #[error("runner error: {0}")]
+    Runner(#[from] AgentRunnerError),
+
+    #[error("replay error: {0}")]
+    Replay(#[from] ReplayError),
+
+    #[error("test case must define either `prompt` or `input`")]
+    MissingPrompt,
+
+    #[error("expected output to contain `{expected}`, got `{actual}`")]
+    ExpectedContains { expected: String, actual: String },
+
+    #[error("expected tool `{tool}` to be called, found tool calls: {actual:?}")]
+    ExpectedToolCall { tool: String, actual: Vec<String> },
+
+    #[error("run produced no text output")]
+    MissingOutput,
+
+    #[error("provider-backed llm cannot be combined with inline responses, steps, or replay tape")]
+    ConflictingLlmConfig,
+
+    #[error("record_tape requires a provider-backed llm")]
+    MissingRecordProvider,
+
+    #[error("provider-backed llm requires record_tape")]
+    MissingRecordTape,
+
+    #[error("unsupported llm provider kind: {0}")]
+    UnsupportedProvider(String),
+
+    #[error("missing environment variable `{0}` for llm provider")]
+    MissingProviderEnv(String),
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct TestCaseDsl {
+    pub name: String,
+    pub prompt: Option<String>,
+    pub input: Option<String>,
+    pub expected_text: Option<String>,
+    #[serde(default)]
+    pub bootstrap_files: Vec<BootstrapFileDsl>,
+    pub agent: Option<AgentDsl>,
+    #[serde(default)]
+    pub tools: Vec<ToolDsl>,
+    pub llm: Option<LlmDsl>,
+    #[serde(rename = "assert")]
+    pub assertions: Option<AssertDsl>,
+    #[serde(skip)]
+    pub source_path: Option<PathBuf>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct BootstrapFileDsl {
+    pub path: String,
+    pub content: String,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct AgentDsl {
+    pub name: Option<String>,
+    pub description: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct ToolDsl {
+    pub name: String,
+    pub description: String,
+    pub schema: Value,
+    pub result: Option<Value>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmDsl {
+    #[serde(default)]
+    pub responses: Vec<String>,
+    #[serde(default)]
+    pub steps: Vec<LlmStepDsl>,
+    pub tape: Option<String>,
+    pub record_tape: Option<String>,
+    pub provider: Option<LlmProviderDsl>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmProviderDsl {
+    pub kind: LlmProviderKind,
+    pub base_url: Option<String>,
+    pub model: Option<String>,
+    pub api_key_env: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum LlmProviderKind {
+    OpenAiCompatible,
+    Ollama,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmStepDsl {
+    #[serde(rename = "type")]
+    pub kind: LlmStepKind,
+    pub content: Option<String>,
+    pub tool: Option<String>,
+    pub arguments: Option<Value>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum LlmStepKind {
+    Text,
+    ToolCall,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct AssertDsl {
+    pub contains: Option<String>,
+    pub tool_called: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AssertionOutcome {
+    pub kind: String,
+    pub expected: Value,
+    pub actual: Value,
+    pub passed: bool,
+}
+
+impl TestCaseDsl {
+    pub fn from_toml_str(input: &str) -> Result<Self, DslError> {
+        let mut case: Self = toml::from_str(input)?;
+        case.source_path = None;
+        Ok(case)
+    }
+
+    pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self, DslError> {
+        let path = path.as_ref();
+        let input = std::fs::read_to_string(path)?;
+        let mut case: 
Self = toml::from_str(&input)?; + case.source_path = Some(path.to_path_buf()); + Ok(case) + } + + fn execution_input(&self) -> Result<&str, DslError> { + self.prompt + .as_deref() + .or(self.input.as_deref()) + .ok_or(DslError::MissingPrompt) + } +} + +pub async fn run_test_case(case: &TestCaseDsl) -> Result { + let result = execute_test_case(case).await?; + let assertions = collect_assertion_outcomes(case, &result); + if let Some(error) = assertion_error_from_outcomes(&assertions) { + return Err(error); + } + Ok(result) +} + +pub async fn execute_test_case(case: &TestCaseDsl) -> Result { + let mut runner = build_runner(case).await?; + configure_runner_from_test_case(case, &mut runner).await?; + let result = runner.run_text(case.execution_input()?).await?; + runner.shutdown().await?; + Ok(result) +} + +pub async fn configure_runner_from_test_case( + case: &TestCaseDsl, + runner: &mut AgentTestRunner, +) -> Result<(), DslError> { + if !case.bootstrap_files.is_empty() { + let mut bootstrap_paths = Vec::with_capacity(case.bootstrap_files.len()); + for file in &case.bootstrap_files { + runner.write_bootstrap_file(&file.path, &file.content)?; + bootstrap_paths.push(file.path.clone()); + } + runner + .configure_prompt(agent_identity(case.agent.as_ref()), Some(bootstrap_paths)) + .await; + } else if case.agent.is_some() { + runner + .configure_prompt(agent_identity(case.agent.as_ref()), None) + .await; + } + + for tool in &case.tools { + let mock_tool = MockTool::new(&tool.name, &tool.description, tool.schema.clone()); + if let Some(result) = &tool.result { + mock_tool + .set_result(ToolResult::success(result.clone())) + .await; + } + runner.register_mock_tool(mock_tool).await?; + } + + // Queue deterministic LLM responses before execution so the DSL stays a thin + // adapter over the existing runner harness. 
+ if let Some(llm) = &case.llm { + if llm.provider.is_some() { + if !llm.steps.is_empty() || !llm.responses.is_empty() || llm.tape.is_some() { + return Err(DslError::ConflictingLlmConfig); + } + } else if llm.record_tape.is_some() { + return Err(DslError::MissingRecordProvider); + } else if !llm.steps.is_empty() { + for step in &llm.steps { + match step.kind { + LlmStepKind::Text => { + runner + .mock_llm() + .add_response(step.content.clone().unwrap_or_default()) + .await; + } + LlmStepKind::ToolCall => { + runner + .mock_llm() + .add_tool_call_response( + step.tool.as_deref().unwrap_or_default(), + step.arguments.clone().unwrap_or(Value::Null), + step.content.clone(), + ) + .await; + } + } + } + } else if let Some(tape_path) = &llm.tape { + let resolved = resolve_case_path(case, tape_path); + let tape = Tape::from_file(resolved)?; + let _responses = tape.responses()?; + } else { + for response in &llm.responses { + runner.mock_llm().add_response(response).await; + } + } + } + Ok(()) +} + +// Build the appropriate runner (recording, replay, or mock) for this case. 
+async fn build_runner(case: &TestCaseDsl) -> Result<AgentTestRunner, DslError> {
+    let config = AgentExecutorConfig::default();
+    let llm = case.llm.as_ref();
+    match llm.and_then(|item| item.provider.as_ref()) {
+        Some(provider) => {
+            let provider = build_live_provider(provider)?;
+            if let Some(record_tape) = llm.and_then(|item| item.record_tape.as_deref()) {
+                let tape_path = resolve_case_path(case, record_tape);
+                AgentTestRunner::with_recording_provider(provider, &case.name, tape_path, config)
+                    .await
+                    .map_err(DslError::from)
+            } else {
+                Err(DslError::MissingRecordTape)
+            }
+        }
+        None => {
+            if let Some(llm) = llm {
+                if let Some(tape_path) = &llm.tape {
+                    let tape = Tape::from_file(resolve_case_path(case, tape_path))?;
+                    return AgentTestRunner::with_replay_tape(tape, config)
+                        .await
+                        .map_err(DslError::from);
+                }
+            }
+            AgentTestRunner::with_config(config)
+                .await
+                .map_err(DslError::from)
+        }
+    }
+}
+
+// Instantiate an LLM provider implementation from the DSL settings.
+fn build_live_provider(provider: &LlmProviderDsl) -> Result<Arc<dyn LLMProvider>, DslError> {
+    match provider.kind {
+        LlmProviderKind::OpenAiCompatible => {
+            let api_key = read_provider_api_key(provider)?;
+            Ok(Arc::new(OpenAiCompatProvider::new(OpenAiCompatProviderConfig {
+                base_url: provider
+                    .base_url
+                    .clone()
+                    .ok_or_else(|| DslError::UnsupportedProvider("open_ai_compatible missing base_url".to_string()))?,
+                model: provider.model.clone().unwrap_or_else(|| "gpt-4o-mini".to_string()),
+                api_key,
+            })))
+        }
+        LlmProviderKind::Ollama => Ok(Arc::new(OpenAiCompatProvider::new(OpenAiCompatProviderConfig {
+            base_url: provider
+                .base_url
+                .clone()
+                .or_else(|| std::env::var("OLLAMA_BASE_URL").ok())
+                .unwrap_or_else(|| "http://127.0.0.1:11434/v1".to_string()),
+            model: provider
+                .model
+                .clone()
+                .or_else(|| std::env::var("OLLAMA_MODEL").ok())
+                .unwrap_or_else(|| "llama3".to_string()),
+            api_key: None,
+        }))),
+    }
+}
+
+// Read or validate the configured API key name for a provider.
+fn read_provider_api_key(provider: &LlmProviderDsl) -> Result<Option<String>, DslError> {
+    let env_name = provider
+        .api_key_env
+        .clone()
+        .unwrap_or_else(|| "OPENAI_API_KEY".to_string());
+    if env_name.is_empty() {
+        return Ok(None);
+    }
+
+    match std::env::var(&env_name) {
+        Ok(value) if value.is_empty() => Ok(None),
+        Ok(value) => Ok(Some(value)),
+        Err(std::env::VarError::NotPresent) => {
+            if provider.base_url.as_deref().map(|url| url.contains("127.0.0.1") || url.contains("localhost")).unwrap_or(false) {
+                Ok(None)
+            } else {
+                Err(DslError::MissingProviderEnv(env_name))
+            }
+        }
+        Err(err) => Err(DslError::MissingProviderEnv(format!("{env_name}: {err}"))),
+    }
+}
+
+fn resolve_case_path(case: &TestCaseDsl, tape_path: &str) -> PathBuf {
+    let path = Path::new(tape_path);
+    if path.is_absolute() {
+        return path.to_path_buf();
+    }
+
+    if let Some(source_path) = &case.source_path {
+        if let Some(parent) = source_path.parent() {
+            return parent.join(path);
+        }
+    }
+
+    path.to_path_buf()
+}
+
+fn agent_identity(agent: Option<&AgentDsl>) -> Option<AgentIdentity> {
+    let agent = agent?;
+    let name = agent.name.clone()?;
+    Some(AgentIdentity {
+        name,
+        description: agent.description.clone().unwrap_or_default(),
+        icon: None,
+    })
+}
+
+fn expected_contains(case: &TestCaseDsl) -> Option<&str> {
+    // Prefer the explicit assertion block when present, while keeping
+    // `expected_text` as a lightweight shorthand for the MVP schema.
+    case.assertions
+        .as_ref()
+        .and_then(|assertions| assertions.contains.as_deref())
+        .or(case.expected_text.as_deref())
+}
+
+fn expected_tool_call(case: &TestCaseDsl) -> Option<&str> {
+    case.assertions
+        .as_ref()
+        .and_then(|assertions| assertions.tool_called.as_deref())
+}
+
+pub fn collect_assertion_outcomes(case: &TestCaseDsl, result: &AgentRunResult) -> Vec<AssertionOutcome> {
+    let mut outcomes = Vec::new();
+
+    if let Some(expected) = expected_contains(case) {
+        let actual = result.output_text();
+        outcomes.push(AssertionOutcome {
+            kind: "contains".to_string(),
+            expected: Value::String(expected.to_string()),
+            actual: actual
+                .clone()
+                .map(Value::String)
+                .unwrap_or(Value::Null),
+            passed: actual
+                .as_ref()
+                .map(|value| value.contains(expected))
+                .unwrap_or(false),
+        });
+    }
+
+    if let Some(expected_tool) = expected_tool_call(case) {
+        let actual = result
+            .metadata
+            .tool_calls
+            .iter()
+            .map(|record| Value::String(record.tool_name.clone()))
+            .collect::<Vec<_>>();
+        outcomes.push(AssertionOutcome {
+            kind: "tool_called".to_string(),
+            expected: Value::String(expected_tool.to_string()),
+            actual: Value::Array(actual.clone()),
+            passed: actual
+                .iter()
+                .any(|tool| tool.as_str() == Some(expected_tool)),
+        });
+    }
+
+    outcomes
+}
+
+pub fn assertion_error_from_outcomes(outcomes: &[AssertionOutcome]) -> Option<DslError> {
+    for outcome in outcomes {
+        if outcome.passed {
+            continue;
+        }
+
+        match outcome.kind.as_str() {
+            "contains" => {
+                return if outcome.actual.is_null() {
+                    Some(DslError::MissingOutput)
+                } else {
+                    Some(DslError::ExpectedContains {
+                        expected: outcome.expected.as_str().unwrap_or_default().to_string(),
+                        actual: outcome.actual.as_str().unwrap_or_default().to_string(),
+                    })
+                };
+            }
+            "tool_called" => {
+                let actual = outcome
+                    .actual
+                    .as_array()
+                    .into_iter()
+                    .flatten()
+                    .filter_map(|value| value.as_str().map(ToString::to_string))
+                    .collect::<Vec<_>>();
+                return Some(DslError::ExpectedToolCall {
+                    tool: 
outcome.expected.as_str().unwrap_or_default().to_string(), + actual, + }); + } + _ => continue, + } + } + + None +} diff --git a/tests/src/lib.rs b/tests/src/lib.rs index 3368dc49b..eba2196b8 100644 --- a/tests/src/lib.rs +++ b/tests/src/lib.rs @@ -4,18 +4,42 @@ //! control for testing MoFA agents. pub mod adversarial; +pub mod agent_runner; +pub mod artifact; pub mod assertions; pub mod backend; pub mod bus; pub mod clock; +pub mod dsl; +pub mod live_llm; +pub mod replay; pub mod report; pub mod tools; pub use backend::MockLLMBackend; pub use bus::MockAgentBus; pub use clock::{Clock, MockClock, SystemClock}; +pub use dsl::{ + assertion_error_from_outcomes, collect_assertion_outcomes, configure_runner_from_test_case, + execute_test_case, run_test_case, AgentDsl, AssertDsl, AssertionOutcome, BootstrapFileDsl, + DslError, LlmDsl, LlmProviderDsl, LlmProviderKind, LlmStepDsl, LlmStepKind, TestCaseDsl, + ToolDsl, +}; +pub use live_llm::{OpenAiCompatProvider, OpenAiCompatProviderConfig}; +pub use agent_runner::{ + AgentRunMetadata, AgentRunResult, AgentRunnerError, AgentTestRunner, MockAgentLLMProvider, + ToolCallRecord, WorkspaceFileSnapshot, WorkspaceSnapshot, +}; +pub use artifact::{ + AgentArtifact, AgentRunArtifact, AgentRunArtifactComparison, AgentRunArtifactDiff, + ArtifactDifference, + LlmMessageArtifact, LlmRequestArtifact, LlmResponseArtifact, LlmToolCallArtifact, + SessionArtifact, SessionMessageArtifact, TokenUsageArtifact, ToolCallArtifact, + WorkspaceFileArtifact, WorkspaceSnapshotArtifact, +}; pub use report::{ JsonFormatter, ReportFormatter, TestCaseResult, TestReport, TestReportBuilder, TestStatus, TextFormatter, }; +pub use replay::{ReplayError, Tape, TapeInteraction}; pub use tools::MockTool; diff --git a/tests/src/live_llm.rs b/tests/src/live_llm.rs new file mode 100644 index 000000000..d8791600d --- /dev/null +++ b/tests/src/live_llm.rs @@ -0,0 +1,268 @@ +//! Minimal OpenAI-compatible provider for record-mode DSL runs. 
+ +use async_trait::async_trait; +use mofa_kernel::agent::error::{AgentError, AgentResult}; +use mofa_kernel::agent::types::{ + ChatCompletionRequest, ChatCompletionResponse, LLMProvider, ToolCall, ToolDefinition, + TokenUsage, +}; +use serde::{Deserialize, Serialize}; +use std::error::Error as StdError; + +#[derive(Debug, Clone)] +pub struct OpenAiCompatProviderConfig { + pub base_url: String, + pub model: String, + pub api_key: Option, +} + +#[derive(Debug)] +pub struct OpenAiCompatProvider { + client: reqwest::Client, + config: OpenAiCompatProviderConfig, +} + +impl OpenAiCompatProvider { + // Create a new provider backed by a fresh HTTP client. + pub fn new(config: OpenAiCompatProviderConfig) -> Self { + Self { + client: reqwest::Client::new(), + config, + } + } + + // Normalize the configured base URL into a completions endpoint. + fn completions_url(&self) -> String { + let trimmed = self.config.base_url.trim_end_matches('/'); + if trimmed.ends_with("/chat/completions") || trimmed.ends_with("/completions") { + trimmed.to_string() + } else { + format!("{trimmed}/chat/completions") + } + } +} + +#[async_trait] +impl LLMProvider for OpenAiCompatProvider { + // Provider identifier exposed to the kernel. + fn name(&self) -> &str { + "openai-compatible" + } + + async fn chat(&self, request: ChatCompletionRequest) -> AgentResult { + // Translate the kernel chat request, send, and convert the response. 
+ let request_body = OpenAiCompatRequest { + model: request + .model + .clone() + .unwrap_or_else(|| self.config.model.clone()), + messages: request + .messages + .iter() + .map(OpenAiCompatMessage::from_kernel) + .collect(), + tools: request.tools.as_ref().map(|tools| { + tools + .iter() + .map(OpenAiCompatToolDefinition::from_kernel) + .collect() + }), + temperature: request.temperature, + max_tokens: request.max_tokens, + }; + + eprintln!("OpenAI-compatible request = {:?}", request_body); + let mut builder = self.client.post(self.completions_url()).json(&request_body); + if let Some(api_key) = &self.config.api_key { + builder = builder.bearer_auth(api_key); + } + + let response = match builder.send().await { + Ok(response) => response, + Err(err) => { + eprintln!("Provider send error: {}", err); + let mut current: Option<&dyn StdError> = err.source(); + while let Some(source) = current { + eprintln!("Provider send error source: {}", source); + current = source.source(); + } + return Err(AgentError::ExecutionFailed(err.to_string())); + } + }; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + eprintln!("Provider error status={} body={}", status, body); + return Err(AgentError::ExecutionFailed(format!( + "provider returned {}: {}", + status, body + ))); + } + + let body: OpenAiCompatResponse = response + .json() + .await + .map_err(|err| AgentError::SerializationError(err.to_string()))?; + eprintln!("OpenAI-compatible response = {:?}", body); + let message = body + .choices + .into_iter() + .next() + .ok_or_else(|| AgentError::ExecutionFailed("provider returned no choices".to_string()))? 
+ .message; + + Ok(ChatCompletionResponse { + content: message.content, + tool_calls: Some( + message + .tool_calls + .unwrap_or_default() + .into_iter() + .map(OpenAiCompatToolCall::into_kernel) + .collect(), + ), + usage: body.usage.map(OpenAiCompatUsage::into_kernel), + }) + } +} + +#[derive(Debug, Serialize)] +struct OpenAiCompatRequest { + model: String, + messages: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + tools: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + temperature: Option, + #[serde(skip_serializing_if = "Option::is_none")] + max_tokens: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatMessage { + role: String, + #[serde(skip_serializing_if = "Option::is_none")] + content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tool_call_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tool_calls: Option>, +} + +impl OpenAiCompatMessage { + // Translate a kernel chat message into the OpenAI-compatible wire format. + fn from_kernel(message: &mofa_kernel::agent::types::ChatMessage) -> Self { + Self { + role: message.role.clone(), + content: message.content.clone(), + tool_call_id: message.tool_call_id.clone(), + tool_calls: message.tool_calls.as_ref().map(|tool_calls| { + tool_calls + .iter() + .cloned() + .map(OpenAiCompatToolCall::from_kernel) + .collect() + }), + } + } +} + +#[derive(Debug, Serialize)] +struct OpenAiCompatToolDefinition { + #[serde(rename = "type")] + kind: &'static str, + function: OpenAiCompatFunctionDefinition, +} + +impl OpenAiCompatToolDefinition { + // Map kernel tool metadata into the OpenAI-compatible representation. 
+ fn from_kernel(tool: &ToolDefinition) -> Self { + Self { + kind: "function", + function: OpenAiCompatFunctionDefinition { + name: tool.name.clone(), + description: tool.description.clone(), + parameters: tool.parameters.clone(), + }, + } + } +} + +#[derive(Debug, Serialize)] +struct OpenAiCompatFunctionDefinition { + name: String, + description: String, + parameters: serde_json::Value, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatResponse { + choices: Vec, + #[serde(default)] + usage: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatChoice { + message: OpenAiCompatMessage, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct OpenAiCompatToolCall { + id: String, + #[serde(rename = "type", default)] + kind: Option, + function: OpenAiCompatFunctionCall, +} + +impl OpenAiCompatToolCall { + // Serialize a kernel tool call into the cached tape format. + fn from_kernel(tool_call: ToolCall) -> Self { + Self { + id: tool_call.id, + kind: Some("function".to_string()), + function: OpenAiCompatFunctionCall { + name: tool_call.name, + arguments: tool_call.arguments.to_string(), + }, + } + } + + // Restore the cached tool call to the kernel model. + fn into_kernel(self) -> ToolCall { + let arguments = + serde_json::from_str(&self.function.arguments).unwrap_or(serde_json::Value::String( + self.function.arguments, + )); + ToolCall { + id: self.id, + name: self.function.name, + arguments, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct OpenAiCompatFunctionCall { + name: String, + arguments: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatUsage { + prompt_tokens: u32, + completion_tokens: u32, + total_tokens: u32, +} + +impl OpenAiCompatUsage { + // Convert recorded usage into the kernel token usage struct. 
+ fn into_kernel(self) -> TokenUsage { + TokenUsage { + prompt_tokens: self.prompt_tokens, + completion_tokens: self.completion_tokens, + total_tokens: self.total_tokens, + } + } +} diff --git a/tests/src/replay.rs b/tests/src/replay.rs new file mode 100644 index 000000000..ae2589f11 --- /dev/null +++ b/tests/src/replay.rs @@ -0,0 +1,337 @@ +//! Replay tape support for deterministic DSL-backed runs. + +use async_trait::async_trait; +use mofa_kernel::agent::error::{AgentError, AgentResult}; +use mofa_kernel::agent::types::{ + ChatCompletionRequest, ChatCompletionResponse, LLMProvider, ToolCall, TokenUsage, +}; +use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use thiserror::Error; +use tokio::sync::RwLock; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeInteraction { + #[serde(default)] + pub request: Option, + pub response: String, + #[serde(default)] + pub tool_calls: Vec, + #[serde(default)] + pub usage: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Tape { + pub version: u32, + pub case_name: String, + pub interactions: Vec, +} + +#[derive(Debug, Error)] +pub enum ReplayError { + #[error("failed to read tape file: {0}")] + Io(#[from] std::io::Error), + + #[error("failed to parse tape JSON: {0}")] + Json(#[from] serde_json::Error), + + #[error("tape has no interactions")] + EmptyTape, + + #[error("replay exhausted after {0} interactions")] + ReplayExhausted(usize), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeRequest { + pub messages: Vec, + pub model: Option, + pub temperature: Option, + pub max_tokens: Option, + #[serde(default)] + pub tool_names: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeMessage { + pub role: String, + pub content: Option, + pub tool_call_id: Option, + #[serde(default)] + pub tool_calls: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeToolCall { + pub id: String, + pub 
name: String, + pub arguments: serde_json::Value, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeTokenUsage { + pub prompt_tokens: u32, + pub completion_tokens: u32, + pub total_tokens: u32, +} + +pub struct RecordingLLMProvider { + name: String, + inner: Arc, + case_name: String, + output_path: PathBuf, + tape: RwLock, + last_request: RwLock>, + last_response: RwLock>, +} + +pub struct ReplayLLMProvider { + name: String, + tape: Tape, + cursor: RwLock, + last_request: RwLock>, + last_response: RwLock>, +} + +impl Tape { + // Start a new tape for the given case. + pub fn new(case_name: impl Into) -> Self { + Self { + version: 1, + case_name: case_name.into(), + interactions: Vec::new(), + } + } + + // Load a tape from disk. + pub fn from_file(path: impl AsRef) -> Result { + let body = std::fs::read_to_string(path)?; + Ok(serde_json::from_str(&body)?) + } + + // Persist the tape back to disk. + pub fn to_file(&self, path: impl AsRef) -> Result<(), ReplayError> { + let body = serde_json::to_string_pretty(self)?; + std::fs::write(path, body)?; + Ok(()) + } + + // Extract the recorded responses for quick replay validation. + pub fn responses(&self) -> Result, ReplayError> { + if self.interactions.is_empty() { + return Err(ReplayError::EmptyTape); + } + Ok(self + .interactions + .iter() + .map(|interaction| interaction.response.clone()) + .collect()) + } + + // Append a new interaction to the tape. + pub fn push_interaction(&mut self, request: ChatCompletionRequest, response: &ChatCompletionResponse) { + self.interactions.push(TapeInteraction { + request: Some(TapeRequest::from_request(&request)), + response: response.content.clone().unwrap_or_default(), + tool_calls: response + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(TapeToolCall::from_tool_call) + .collect(), + usage: response.usage.clone().map(TapeTokenUsage::from_usage), + }); + } +} + +impl TapeRequest { + // Snapshot a chat request into the tape schema. 
+ fn from_request(request: &ChatCompletionRequest) -> Self { + Self { + messages: request + .messages + .iter() + .map(|message| TapeMessage { + role: message.role.clone(), + content: message.content.clone(), + tool_call_id: message.tool_call_id.clone(), + tool_calls: message + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(TapeToolCall::from_tool_call) + .collect(), + }) + .collect(), + model: request.model.clone(), + temperature: request.temperature, + max_tokens: request.max_tokens, + tool_names: request + .tools + .clone() + .unwrap_or_default() + .into_iter() + .map(|tool| tool.name) + .collect(), + } + } +} + +impl TapeToolCall { + // Encode a real tool call for replay. + fn from_tool_call(tool_call: ToolCall) -> Self { + Self { + id: tool_call.id, + name: tool_call.name, + arguments: tool_call.arguments, + } + } + + // Decode the tape entry back into a kernel tool call. + fn into_tool_call(self) -> ToolCall { + ToolCall { + id: self.id, + name: self.name, + arguments: self.arguments, + } + } +} + +impl TapeTokenUsage { + // Capture token usage for later inspection. + fn from_usage(usage: TokenUsage) -> Self { + Self { + prompt_tokens: usage.prompt_tokens, + completion_tokens: usage.completion_tokens, + total_tokens: usage.total_tokens, + } + } + + // Rehydrate the token usage for replay responses. + fn into_usage(self) -> TokenUsage { + TokenUsage { + prompt_tokens: self.prompt_tokens, + completion_tokens: self.completion_tokens, + total_tokens: self.total_tokens, + } + } +} + +impl RecordingLLMProvider { + // Wrap a live provider so every response is recorded to tape. 
+ pub fn new( + name: impl Into, + inner: Arc, + case_name: impl Into, + output_path: impl Into, + ) -> Self { + let case_name = case_name.into(); + Self { + name: name.into(), + inner, + tape: RwLock::new(Tape::new(case_name.clone())), + case_name, + output_path: output_path.into(), + last_request: RwLock::new(None), + last_response: RwLock::new(None), + } + } + + // Return the most recent request issued through this recording provider. + pub async fn last_request(&self) -> Option { + self.last_request.read().await.clone() + } + + // Return the latest response captured by the recording provider. + pub async fn last_response(&self) -> Option { + self.last_response.read().await.clone() + } +} + +#[async_trait] +impl LLMProvider for RecordingLLMProvider { + fn name(&self) -> &str { + &self.name + } + + async fn chat(&self, request: ChatCompletionRequest) -> AgentResult { + // Forward the call, cache the interaction, and persist the tape. + *self.last_request.write().await = Some(request.clone()); + let response = self.inner.chat(request.clone()).await?; + *self.last_response.write().await = Some(response.clone()); + + let mut tape = self.tape.write().await; + if tape.case_name.is_empty() { + tape.case_name = self.case_name.clone(); + } + tape.push_interaction(request, &response); + tape.to_file(&self.output_path) + .map_err(|err| AgentError::ExecutionFailed(err.to_string()))?; + + Ok(response) + } +} + +impl ReplayLLMProvider { + // Create a replay provider that steps through a pre-recorded tape. + pub fn new(name: impl Into, tape: Tape) -> Self { + Self { + name: name.into(), + tape, + cursor: RwLock::new(0), + last_request: RwLock::new(None), + last_response: RwLock::new(None), + } + } + + // Return the last request seen by the replay provider. + pub async fn last_request(&self) -> Option { + self.last_request.read().await.clone() + } + + // Return the last response handed out during replay. 
+ pub async fn last_response(&self) -> Option { + self.last_response.read().await.clone() + } +} + +#[async_trait] +impl LLMProvider for ReplayLLMProvider { + fn name(&self) -> &str { + &self.name + } + + async fn chat(&self, request: ChatCompletionRequest) -> AgentResult { + // Serve the next recorded interaction instead of calling a live LLM. + *self.last_request.write().await = Some(request); + let mut cursor = self.cursor.write().await; + let interaction = self + .tape + .interactions + .get(*cursor) + .cloned() + .or_else(|| self.tape.interactions.last().cloned()) + .ok_or_else(|| AgentError::ExecutionFailed(ReplayError::EmptyTape.to_string()))?; + if *cursor < self.tape.interactions.len() { + *cursor += 1; + } + + let response = ChatCompletionResponse { + content: Some(interaction.response), + tool_calls: Some( + interaction + .tool_calls + .into_iter() + .map(TapeToolCall::into_tool_call) + .collect(), + ), + usage: interaction.usage.map(TapeTokenUsage::into_usage), + }; + *self.last_response.write().await = Some(response.clone()); + Ok(response) + } +} diff --git a/tests/src/tools.rs b/tests/src/tools.rs index 48356b551..fbc6bb5b6 100644 --- a/tests/src/tools.rs +++ b/tests/src/tools.rs @@ -20,6 +20,7 @@ pub struct MockTool { category: ToolCategory, pub stubbed_result: Arc>, pub call_history: Arc>>, + pub result_history: Arc>>, failure_queue: Arc>>, failure_patterns: Arc>>, result_sequence: Arc>>, @@ -37,6 +38,7 @@ impl MockTool { "Mock execution default", ))), call_history: Arc::new(RwLock::new(Vec::new())), + result_history: Arc::new(RwLock::new(Vec::new())), failure_queue: Arc::new(RwLock::new(VecDeque::new())), failure_patterns: Arc::new(RwLock::new(Vec::new())), result_sequence: Arc::new(RwLock::new(VecDeque::new())), @@ -58,6 +60,16 @@ impl MockTool { self.call_history.read().await.len() } + /// Retrieve a clone of the full result history. 
+ pub async fn results(&self) -> Vec { + self.result_history.read().await.clone() + } + + /// Returns the most recent result, or `None` if never executed. + pub async fn last_result(&self) -> Option { + self.result_history.read().await.last().cloned() + } + /// Queue failures for the next N calls. pub async fn fail_next(&self, count: usize, error_msg: &str) { let mut queue = self.failure_queue.write().await; @@ -110,12 +122,18 @@ impl SimpleTool for MockTool { async fn execute(&self, input: ToolInput) -> ToolResult { self.call_history.write().await.push(input.clone()); + let start = std::time::Instant::now(); // 1. Drain failure queue { let mut queue = self.failure_queue.write().await; if let Some(err) = queue.pop_front() { - return ToolResult::failure(err); + let mut result = ToolResult::failure(err); + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + self.result_history.write().await.push(result.clone()); + return result; } } @@ -124,7 +142,12 @@ impl SimpleTool for MockTool { let patterns = self.failure_patterns.read().await; for (pattern, err) in patterns.iter() { if input.arguments == *pattern { - return ToolResult::failure(err); + let mut result = ToolResult::failure(err); + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + self.result_history.write().await.push(result.clone()); + return result; } } } @@ -133,11 +156,21 @@ impl SimpleTool for MockTool { { let mut seq = self.result_sequence.write().await; if let Some(result) = seq.pop_front() { + let mut result = result; + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + self.result_history.write().await.push(result.clone()); return result; } } - self.stubbed_result.read().await.clone() + let mut result = self.stubbed_result.read().await.clone(); + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + 
self.result_history.write().await.push(result.clone()); + result } fn category(&self) -> ToolCategory { diff --git a/tests/tests/agent_runner_tests.rs b/tests/tests/agent_runner_tests.rs new file mode 100644 index 000000000..250b9e62d --- /dev/null +++ b/tests/tests/agent_runner_tests.rs @@ -0,0 +1,311 @@ +use mofa_testing::agent_runner::AgentTestRunner; +use mofa_testing::assertions::assert_session_messages; +use mofa_testing::tools::MockTool; +use serde_json::json; + +#[tokio::test] +async fn agent_runner_executes_and_captures_output() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .mock_llm() + .add_response("Mocked response") + .await; + + let result = runner + .run_text("hello") + .await + .expect("run should succeed"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("Mocked response")); + assert_eq!( + result.metadata.session_id.as_deref(), + Some(runner.session_id()) + ); + assert_eq!(result.metadata.execution_id, runner.execution_id()); + assert_eq!(result.metadata.runner_stats_before.total_executions, 0); + assert_eq!(result.metadata.runner_stats_after.total_executions, 1); + assert!(result.metadata.session_snapshot.is_some()); + let snapshot = result.metadata.session_snapshot.as_ref().unwrap(); + assert_eq!(snapshot.len(), 2); + assert_session_messages(snapshot, &[("user", "hello"), ("assistant", "Mocked response")]); + + let expected_session_path = format!("sessions/{}.jsonl", runner.session_id()); + assert!( + !result + .metadata + .workspace_snapshot_before + .files + .iter() + .any(|file| file.relative_path == expected_session_path) + ); + assert!( + result + .metadata + .workspace_snapshot_after + .files + .iter() + .any(|file| file.relative_path == expected_session_path) + ); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_creates_isolated_workspaces() { + let mut runner_a = AgentTestRunner::new().await.expect("runner 
A initializes"); + let mut runner_b = AgentTestRunner::new().await.expect("runner B initializes"); + + assert_ne!(runner_a.workspace(), runner_b.workspace()); + + runner_a + .mock_llm() + .add_response("Response A") + .await; + runner_b + .mock_llm() + .add_response("Response B") + .await; + + let _ = runner_a + .run_text("hi") + .await + .expect("runner A executes"); + let _ = runner_b + .run_text("hi") + .await + .expect("runner B executes"); + + // Session files should exist in each separate workspace. + let session_a = runner_a + .workspace() + .join("sessions") + .join(format!("{}.jsonl", runner_a.session_id())); + let session_b = runner_b + .workspace() + .join("sessions") + .join(format!("{}.jsonl", runner_b.session_id())); + + assert!(session_a.exists()); + assert!(session_b.exists()); + + runner_a.shutdown().await.expect("shutdown A"); + runner_b.shutdown().await.expect("shutdown B"); +} + +#[tokio::test] +async fn agent_runner_executes_tool_calls() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + + let tool = MockTool::new( + "echo_tool", + "Echo the provided input", + json!({ + "type": "object", + "properties": { + "input": { "type": "string" } + }, + "required": ["input"] + }), + ); + + runner + .register_mock_tool(tool.clone()) + .await + .expect("tool registered"); + + // First response triggers a tool call; second response is the final answer. + runner + .mock_llm() + .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None) + .await; + runner + .mock_llm() + .add_response("Final response") + .await; + + let result = runner + .run_text("use tool") + .await + .expect("run should succeed"); + + // Tool call should be captured in both tool history and run metadata. 
+ assert_eq!(result.output_text().as_deref(), Some("Final response")); + assert_eq!(tool.call_count().await, 1); + let last_call = tool.last_call().await.expect("tool call captured"); + assert_eq!(last_call.arguments, json!({ "input": "ping" })); + assert_eq!(result.metadata.tool_calls.len(), 1); + let record = &result.metadata.tool_calls[0]; + assert_eq!(record.tool_name, "echo_tool"); + assert_eq!(record.input, json!({ "input": "ping" })); + assert!(record.success); + assert_eq!( + record.output, + Some(json!("Mock execution default")) + ); + assert!(record.duration_ms.is_some()); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_loads_bootstrap_files() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .write_bootstrap_file("AGENTS.md", "Bootstrap content for agent test.") + .expect("bootstrap file written"); + + runner + .mock_llm() + .add_response("Bootstrapped response") + .await; + + let _ = runner + .run_text("check prompt") + .await + .expect("run should succeed"); + + let request = runner + .mock_llm() + .last_request() + .await + .expect("request captured"); + // Validate the system message includes the bootstrap content. + let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + + assert!(system_message.contains("AGENTS.md")); + assert!(system_message.contains("Bootstrap content for agent test.")); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_supports_multi_turn_runs() { + // Multi-turn helper should keep the same session and extend history. 
+ let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("First reply").await; + runner.mock_llm().add_response("Second reply").await; + + let results = runner + .run_texts(&["turn one", "turn two"]) + .await + .expect("multi-turn run succeeds"); + + assert_eq!(results.len(), 2); + assert_eq!(results[0].output_text().as_deref(), Some("First reply")); + assert_eq!(results[1].output_text().as_deref(), Some("Second reply")); + + // Session snapshot should contain two user/assistant pairs. + let snapshot = results + .last() + .and_then(|result| result.metadata.session_snapshot.as_ref()) + .expect("session snapshot captured"); + assert_eq!(snapshot.len(), 4); + assert_session_messages( + snapshot, + &[ + ("user", "turn one"), + ("assistant", "First reply"), + ("user", "turn two"), + ("assistant", "Second reply"), + ], + ); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_customizes_prompt_identity_and_bootstraps() { + // Custom identity + bootstrap list should appear in the system prompt. + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .write_bootstrap_file("CUSTOM.md", "Custom bootstrap content.") + .expect("bootstrap file written"); + runner + .configure_prompt( + Some(mofa_foundation::agent::context::prompt::AgentIdentity { + name: "TestAgent".to_string(), + description: "Custom identity".to_string(), + icon: None, + }), + Some(vec!["CUSTOM.md".to_string()]), + ) + .await; + + runner.mock_llm().add_response("Custom response").await; + let _ = runner + .run_text("custom prompt") + .await + .expect("run should succeed"); + + let request = runner + .mock_llm() + .last_request() + .await + .expect("request captured"); + // Validate custom identity and bootstrap content. 
+ let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + + assert!(system_message.contains("TestAgent")); + assert!(system_message.contains("Custom identity")); + assert!(system_message.contains("CUSTOM.md")); + assert!(system_message.contains("Custom bootstrap content.")); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_captures_llm_failure() { + // LLM failures should surface in AgentRunResult with failed stats. + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .mock_llm() + .add_error_response("mock failure") + .await; + + let result = runner + .run_text("trigger failure") + .await + .expect("run should return result"); + + assert!(!result.is_success()); + let error = result.error.expect("error captured"); + assert!(error.to_string().contains("mock failure")); + assert_eq!(result.metadata.runner_stats_after.failed_executions, 1); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_allows_custom_session_keys() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .mock_llm() + .add_response("Custom session response") + .await; + + let result = runner + .run_text_with_session("custom-session", "hello session") + .await + .expect("run should succeed"); + + assert_eq!( + result.metadata.session_id.as_deref(), + Some("custom-session") + ); + let session_path = runner + .workspace() + .join("sessions") + .join("custom-session.jsonl"); + assert!(session_path.exists()); + + runner.shutdown().await.expect("shutdown succeeds"); +} diff --git a/tests/tests/artifact_tests.rs b/tests/tests/artifact_tests.rs new file mode 100644 index 000000000..1981f122e --- /dev/null +++ b/tests/tests/artifact_tests.rs @@ -0,0 +1,136 @@ +//! Tests for canonical DSL run artifact generation. 
+ +use mofa_testing::{ + AgentRunArtifact, TestCaseDsl, assertion_error_from_outcomes, collect_assertion_outcomes, + execute_test_case, +}; + +#[tokio::test] +async fn artifact_contains_core_run_data() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/tool_agent.toml" + )) + .expect("tool DSL example should parse"); + + let result = execute_test_case(&case) + .await + .expect("DSL case should execute"); + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions); + + assert_eq!(artifact.case_name, "tool_agent_run"); + assert_eq!(artifact.status, "passed"); + assert_eq!(artifact.output_text.as_deref(), Some("Tool execution complete")); + assert_eq!(artifact.tool_calls.len(), 1); + assert_eq!(artifact.tool_calls[0].tool_name, "echo_tool"); + assert!(!artifact.agent.name.is_empty()); + assert!(artifact.llm_request.is_some()); + assert!(artifact.workspace_after.files.iter().any(|file| { + file.relative_path.ends_with(".jsonl") + })); +} + +#[tokio::test] +async fn artifact_captures_failed_assertions() { + let case = TestCaseDsl::from_toml_str( + r#" +name = "failing_case" +prompt = "Say hello" + +[llm] +responses = ["wrong output"] + +[assert] +contains = "expected text" +"#, + ) + .expect("inline DSL should parse"); + + let result = execute_test_case(&case) + .await + .expect("DSL case should execute"); + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions.clone()); + + assert_eq!(artifact.status, "failed"); + assert_eq!(artifact.assertions.len(), 1); + assert!(!artifact.assertions[0].passed); + assert!(assertion_error_from_outcomes(&assertions).is_some()); +} + +#[tokio::test] +async fn artifact_compare_to_returns_match_for_identical_runs() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/simple_agent.toml" + )) 
+ .expect("DSL example should parse"); + + let result = execute_test_case(&case) + .await + .expect("DSL case should execute"); + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions); + + let diff = artifact.compare_to(&artifact); + + assert!(diff.matches); + assert!(diff.differences.is_empty()); +} + +#[tokio::test] +async fn artifact_compare_to_detects_output_and_tool_changes() { + let baseline = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/tool_agent.toml" + )) + .expect("tool DSL should parse"); + let baseline_result = execute_test_case(&baseline) + .await + .expect("baseline DSL should execute"); + let baseline_assertions = collect_assertion_outcomes(&baseline, &baseline_result); + let baseline_artifact = + AgentRunArtifact::from_run_result(&baseline, &baseline_result, baseline_assertions); + + let candidate = TestCaseDsl::from_toml_str( + r#" +name = "tool_agent_run" +input = "Use the echo tool and summarize the result." + +[[tools]] +name = "lookup_tool" +description = "Lookup the provided input." 
+schema = { type = "object", properties = { input = { type = "string" } }, required = ["input"] } +result = "lookup result" + +[assert] +contains = "Different output" +tool_called = "lookup_tool" + +[llm] + +[[llm.steps]] +type = "tool_call" +tool = "lookup_tool" +arguments = { input = "ping" } + +[[llm.steps]] +type = "text" +content = "Different output" +"#, + ) + .expect("candidate DSL should parse"); + let candidate_result = execute_test_case(&candidate) + .await + .expect("candidate DSL should execute"); + let candidate_assertions = collect_assertion_outcomes(&candidate, &candidate_result); + let candidate_artifact = + AgentRunArtifact::from_run_result(&candidate, &candidate_result, candidate_assertions); + + let diff = candidate_artifact.compare_to(&baseline_artifact); + + assert!(!diff.matches); + assert!(diff.differences.iter().any(|item| item.field == "output_text")); + assert!(diff.differences.iter().any(|item| item.field == "tool_calls")); +} diff --git a/tests/tests/assertion_macro_tests.rs b/tests/tests/assertion_macro_tests.rs index 55c2af2f4..c27aee010 100644 --- a/tests/tests/assertion_macro_tests.rs +++ b/tests/tests/assertion_macro_tests.rs @@ -1,11 +1,11 @@ -//! Tests for assertion macros: assert_tool_called!, assert_tool_called_with!, -//! assert_infer_called!, assert_bus_message_sent!. +//! Tests for assertion macros and assertion helpers used by the testing crate. 
use mofa_foundation::agent::components::tool::SimpleTool; use mofa_foundation::orchestrator::{ModelOrchestrator, ModelProviderConfig, ModelType}; use mofa_kernel::agent::components::tool::ToolInput; use mofa_kernel::bus::CommunicationMode; use mofa_kernel::message::AgentMessage; +use mofa_testing::agent_runner::AgentTestRunner; use mofa_testing::backend::MockLLMBackend; use mofa_testing::bus::MockAgentBus; use mofa_testing::tools::MockTool; @@ -121,3 +121,89 @@ async fn assert_bus_message_sent_panics_when_no_message_from_sender() { mofa_testing::assert_bus_message_sent!(bus, "agent-2"); } + +// =================================================================== +// New assertion helpers +// =================================================================== + +#[tokio::test] +async fn assert_tool_last_result_passes_on_matching_output() { + let tool = MockTool::new("search", "Search tool", json!({"type": "object"})); + tool.execute(ToolInput::from_json(json!({"query": "rust"}))) + .await; + + mofa_testing::assert_tool_last_result!(tool, json!("Mock execution default")); +} + +#[tokio::test] +async fn assert_agent_output_text_passes_on_matching_output() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("hello from runner").await; + + let result = runner.run_text("hello").await.expect("run succeeds"); + + mofa_testing::assert_agent_output_text!(result, "hello from runner"); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn assert_run_failed_with_passes_on_matching_error() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_error_response("mock failure").await; + + let result = runner.run_text("hello").await.expect("run completes"); + + mofa_testing::assert_run_failed_with!(result, "mock failure"); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn 
assert_workspace_contains_file_passes_when_snapshot_has_file() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("workspace ready").await; + + let result = runner.run_text("hello").await.expect("run succeeds"); + let expected = format!("sessions/{}.jsonl", runner.session_id()); + + mofa_testing::assert_workspace_contains_file!(result.metadata.workspace_snapshot_after, expected); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn assert_run_recorded_tool_call_passes_when_tool_metadata_exists() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + let tool = MockTool::new( + "echo_tool", + "Echo tool", + json!({ + "type": "object", + "properties": { "input": { "type": "string" } }, + "required": ["input"] + }), + ); + runner + .register_mock_tool(tool) + .await + .expect("tool registered"); + runner + .mock_llm() + .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None) + .await; + runner.mock_llm().add_response("done").await; + + let result = runner.run_text("use tool").await.expect("run succeeds"); + + mofa_testing::assert_run_recorded_tool_call!(result, "echo_tool"); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn assert_runner_total_executions_passes_on_first_run() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("counted").await; + + let result = runner.run_text("hello").await.expect("run succeeds"); + + mofa_testing::assert_runner_total_executions!(result, 1); + runner.shutdown().await.expect("shutdown succeeds"); +} diff --git a/tests/tests/dsl_tests.rs b/tests/tests/dsl_tests.rs new file mode 100644 index 000000000..273366211 --- /dev/null +++ b/tests/tests/dsl_tests.rs @@ -0,0 +1,108 @@ +//! Integration tests for the minimal TOML DSL adapter. 
+ +use mofa_testing::{configure_runner_from_test_case, run_test_case, AgentTestRunner, TestCaseDsl}; + +#[tokio::test] +async fn toml_dsl_runs_through_agent_runner() { + // Load the example DSL from the crate so the test exercises parsing and + // adapter execution together. + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/simple_agent.toml" + )) + .expect("DSL example should parse"); + + assert_eq!(case.name, "simple_agent_run"); + + let result = run_test_case(&case) + .await + .expect("DSL case should run successfully"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("hello from DSL")); +} + +#[tokio::test] +async fn toml_dsl_supports_bootstrap_files() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/bootstrap_agent.toml" + )) + .expect("bootstrap DSL example should parse"); + + let mut runner = AgentTestRunner::new() + .await + .expect("runner should initialize"); + + configure_runner_from_test_case(&case, &mut runner) + .await + .expect("DSL bootstrap config should apply"); + + let _ = runner + .run_text(case.prompt.as_deref().expect("prompt should be present")) + .await + .expect("bootstrap run should succeed"); + + let request = runner + .mock_llm() + .last_request() + .await + .expect("request should be captured"); + let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + + assert!(system_message.contains("AGENTS.md")); + assert!(system_message.contains("Bootstrapped instructions for the DSL test.")); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn toml_dsl_supports_tool_backed_runs() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/tool_agent.toml" + )) + .expect("tool DSL example should parse"); + + let result = run_test_case(&case) + .await + .expect("tool-backed DSL 
case should run successfully"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("Tool execution complete")); + assert_eq!(result.metadata.tool_calls.len(), 1); + assert_eq!(result.metadata.tool_calls[0].tool_name, "echo_tool"); + + let request = result + .metadata + .llm_last_request + .as_ref() + .expect("request should be captured"); + let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + assert!(system_message.contains("ToolAgent")); + assert!(system_message.contains("Agent used to validate tool-aware DSL execution.")); +} + +#[tokio::test] +async fn toml_dsl_supports_tape_backed_runs() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/simple_agent_tape.toml" + )) + .expect("tape-backed DSL example should parse"); + + let result = run_test_case(&case) + .await + .expect("tape-backed DSL case should run successfully"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("hello from tape")); +}