diff --git a/Cargo.toml b/Cargo.toml
index f71801118..aca372a14 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -86,6 +86,9 @@ lazy_static = "1.4"
 # Actor framework for ReAct agents
 ractor = "0"
 
+# TOML deserialization (also used transitively by config)
+toml = "0.8"
+
 # Configuration file support (multi-format)
 config = { version = "0.14", features = [
     "toml",
diff --git a/crates/mofa-cli/Cargo.toml b/crates/mofa-cli/Cargo.toml
index 875ae62a6..5a9d593d6 100644
--- a/crates/mofa-cli/Cargo.toml
+++ b/crates/mofa-cli/Cargo.toml
@@ -25,6 +25,7 @@ mofa-kernel = { path = "../mofa-kernel", version = "0.1", features = [
 ] }
 mofa-runtime = { path = "../mofa-runtime", version = "0.1" }
 mofa-foundation = { path = "../mofa-foundation", version = "0.1" }
+mofa-testing = { path = "../../tests", version = "0.1" }
 config.workspace = true
 tokio = { workspace = true }
 thiserror = { workspace = true }
diff --git a/crates/mofa-cli/src/cli.rs b/crates/mofa-cli/src/cli.rs
index 7e10ad479..866ef80e9 100644
--- a/crates/mofa-cli/src/cli.rs
+++ b/crates/mofa-cli/src/cli.rs
@@ -81,6 +81,24 @@ pub enum Commands {
         dora: bool,
     },
 
+    /// Run a testing DSL case file
+    TestDsl {
+        /// TOML DSL file to execute
+        file: PathBuf,
+
+        /// Optional canonical artifact file path
+        #[arg(long)]
+        artifact_out: Option<PathBuf>,
+
+        /// Optional report file path
+        #[arg(long)]
+        report_out: Option<PathBuf>,
+
+        /// Report file format
+        #[arg(long, value_enum, default_value_t = TestDslReportFormat::Json)]
+        report_format: TestDslReportFormat,
+    },
+
     /// Run a dora dataflow
     #[cfg(feature = "dora")]
     Dataflow {
@@ -219,6 +237,12 @@ pub enum DatabaseType {
     Sqlite,
 }
 
+#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq)]
+pub enum TestDslReportFormat {
+    Json,
+    Text,
+}
+
 impl std::fmt::Display for DatabaseType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
@@ -725,6 +749,38 @@ mod tests {
         assert!(parsed.is_ok(), "doctor ci strict json should parse");
     }
 
+    #[test]
+    fn test_test_dsl_parses() {
+        let parsed = Cli::try_parse_from(["mofa", "test-dsl", "tests/examples/simple_agent.toml"]);
+        assert!(parsed.is_ok(), "test-dsl command should parse");
+    }
+
+    #[test]
+    fn test_test_dsl_report_flags_parse() {
+        let parsed = Cli::try_parse_from([
+            "mofa",
+            "test-dsl",
+            "tests/examples/simple_agent.toml",
+            "--report-out",
+            "/tmp/report.json",
+            "--report-format",
+            "json",
+        ]);
+        assert!(parsed.is_ok(), "test-dsl report flags should parse");
+    }
+
+    #[test]
+    fn test_test_dsl_artifact_flag_parses() {
+        let parsed = Cli::try_parse_from([
+            "mofa",
+            "test-dsl",
+            "tests/examples/simple_agent.toml",
+            "--artifact-out",
+            "/tmp/artifact.json",
+        ]);
+        assert!(parsed.is_ok(), "test-dsl artifact flag should parse");
+    }
+
     #[test]
     fn test_rag_index_parses() {
         let parsed = Cli::try_parse_from([
diff --git a/crates/mofa-cli/src/commands/mod.rs b/crates/mofa-cli/src/commands/mod.rs
index 0fb02a9d3..1b798f63f 100644
--- a/crates/mofa-cli/src/commands/mod.rs
+++ b/crates/mofa-cli/src/commands/mod.rs
@@ -11,5 +11,6 @@ pub mod new;
 pub mod plugin;
 pub mod rag;
 pub mod run;
+pub mod test_dsl;
 pub mod session;
 pub mod tool;
diff --git a/crates/mofa-cli/src/commands/test_dsl.rs b/crates/mofa-cli/src/commands/test_dsl.rs
new file mode 100644
index 000000000..397e55a39
--- /dev/null
+++ b/crates/mofa-cli/src/commands/test_dsl.rs
@@ -0,0 +1,151 @@
+//! `mofa test-dsl` command implementation
+
+use crate::CliError;
+use crate::cli::TestDslReportFormat;
+use crate::output::OutputFormat;
+use mofa_testing::{
+    AgentRunArtifact, DslError, JsonFormatter, ReportFormatter, TestCaseResult, TestReport,
+    TestStatus, TextFormatter, TestCaseDsl, assertion_error_from_outcomes,
+    collect_assertion_outcomes, execute_test_case,
+};
+use serde::Serialize;
+use serde_json::json;
+use std::path::Path;
+
+#[derive(Debug, Serialize)]
+struct TestDslSummary {
+    name: String,
+    success: bool,
+    output_text: Option<String>,
+    duration_ms: u128,
+    tool_calls: Vec<String>,
+    workspace_root: String,
+}
+
+/// Execute one TOML DSL test case through the testing runner.
+///
+/// Writes the optional artifact/report files, prints a summary in `format`, and
+/// fails if the case cannot be loaded or run, a write fails, or an assertion fails.
+pub async fn run(
+    path: &Path,
+    format: OutputFormat,
+    artifact_out: Option<&Path>,
+    report_out: Option<&Path>,
+    report_format: TestDslReportFormat,
+) -> Result<(), CliError> {
+    let case = TestCaseDsl::from_toml_file(path).map_err(map_dsl_error)?;
+    let result = execute_test_case(&case).await.map_err(map_dsl_error)?;
+    let assertions = collect_assertion_outcomes(&case, &result);
+    let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions.clone());
+    let report = build_report(&artifact);
+    let assertion_error = assertion_error_from_outcomes(&assertions);
+
+    if let Some(artifact_out) = artifact_out {
+        write_artifact(artifact_out, &artifact)?;
+    }
+    if let Some(report_out) = report_out {
+        write_report(report_out, report_format, &report)?;
+    }
+
+    let summary = TestDslSummary {
+        name: case.name,
+        // A case only passes when the run succeeded and every assertion held.
+        success: result.is_success() && assertion_error.is_none(),
+        output_text: result.output_text(),
+        duration_ms: result.duration.as_millis(),
+        tool_calls: result
+            .metadata
+            .tool_calls
+            .iter()
+            .map(|record| record.tool_name.clone())
+            .collect(),
+        workspace_root: result.metadata.workspace_root.display().to_string(),
+    };
+
+    match format {
+        OutputFormat::Json => {
+            let output = json!({
+                "success": summary.success,
+                "case": summary,
+            });
+            println!("{}", serde_json::to_string_pretty(&output)?);
+        }
+        _ => {
+            println!("case: {}", summary.name);
+            println!("status: {}", if summary.success { "passed" } else { "failed" });
+            if let Some(output_text) = &summary.output_text {
+                println!("output: {}", output_text);
+            }
+            if !summary.tool_calls.is_empty() {
+                println!("tool_calls: {}", summary.tool_calls.join(", "));
+            }
+            println!("duration_ms: {}", summary.duration_ms);
+        }
+    }
+
+    assertion_error.map_or(Ok(()), |error| Err(map_dsl_error(error)))
+}
+
+/// Convert a run artifact into a one-case [`TestReport`] for the `dsl` suite.
+///
+/// The case passes only when `artifact.status` equals `"passed"`. Its error text
+/// is the runner error if there is one, otherwise a message naming the kind of
+/// the first failed assertion. The execution id, workspace root, and number of
+/// tool calls are attached as metadata; case and total duration are identical.
+fn build_report(artifact: &AgentRunArtifact) -> TestReport {
+    // One case per artifact, so the case duration is also the suite total.
+    let elapsed = std::time::Duration::from_millis(artifact.duration_ms);
+
+    let status = if artifact.status == "passed" {
+        TestStatus::Passed
+    } else {
+        TestStatus::Failed
+    };
+
+    // Evaluated lazily: only consulted when there is no runner error.
+    let first_failed_assertion = || {
+        artifact
+            .assertions
+            .iter()
+            .find(|item| !item.passed)
+            .map(|item| format!("assertion failed: {}", item.kind))
+    };
+
+    let case_result = TestCaseResult {
+        name: artifact.case_name.clone(),
+        status,
+        duration: elapsed,
+        error: artifact.runner_error.clone().or_else(first_failed_assertion),
+        metadata: vec![
+            ("execution_id".to_string(), artifact.execution_id.clone()),
+            ("workspace_root".to_string(), artifact.workspace_root.clone()),
+            ("tool_calls".to_string(), artifact.tool_calls.len().to_string()),
+        ],
+    };
+
+    TestReport {
+        suite_name: "dsl".to_string(),
+        results: vec![case_result],
+        total_duration: elapsed,
+        timestamp: artifact.started_at_ms,
+    }
+}
+
+/// Serialize `artifact` as pretty-printed JSON and write it to `path`.
+fn write_artifact(path: &Path, artifact: &AgentRunArtifact) -> Result<(), CliError> {
+    let json_text = serde_json::to_string_pretty(artifact)?;
+    Ok(std::fs::write(path, json_text)?)
+}
+
+/// Render `report` with the formatter chosen by `format` and write it to `path`.
+fn write_report(path: &Path, format: TestDslReportFormat, report: &TestReport) -> Result<(), CliError> {
+    let rendered = match format {
+        TestDslReportFormat::Text => TextFormatter.format(report),
+        TestDslReportFormat::Json => JsonFormatter.format(report),
+    };
+    Ok(std::fs::write(path, rendered)?)
+}
+
+fn map_dsl_error(error: DslError) -> CliError {
+    CliError::Other(format!("DSL test failed: {error}"))
+}
diff --git a/crates/mofa-cli/src/main.rs b/crates/mofa-cli/src/main.rs
index 749fe56f7..089bcc6ac 100644
--- a/crates/mofa-cli/src/main.rs
+++ b/crates/mofa-cli/src/main.rs
@@ -75,6 +75,7 @@ fn main() {
 
 async fn run_command(cli: Cli) -> CliResult<()> {
     use cli::Commands;
+    let output_format = cli.output_format.unwrap_or_default();
 
     // Initialize context for commands that need backend services
     let needs_context = matches!(
@@ -121,6 +122,24 @@ async fn run_command(cli: Cli) -> CliResult<()> {
             commands::run::run(&config, dora)?;
         }
 
+        Some(Commands::TestDsl {
+            file,
+            artifact_out,
+            report_out,
+            report_format,
+        }) => {
+            commands::test_dsl::run(
+                &file,
+                output_format,
+                artifact_out.as_deref(),
+                report_out.as_deref(),
+                report_format,
+            )
+            .await
+            .into_report()
+            .attach_with(|| format!("running DSL test case from {}", file.display()))?;
+        }
+
         #[cfg(feature = "dora")]
         Some(Commands::Dataflow { file, uv }) => {
             commands::run::run_dataflow(&file, uv)?;
diff --git a/crates/mofa-cli/tests/test_dsl_integration_tests.rs b/crates/mofa-cli/tests/test_dsl_integration_tests.rs
new file mode 100644
index 000000000..1da5a294e
--- /dev/null
+++ b/crates/mofa-cli/tests/test_dsl_integration_tests.rs
@@ -0,0 +1,121 @@
+//! Integration tests for `mofa test-dsl`.
+
+use assert_cmd::Command;
+use predicates::prelude::*;
+use tempfile::tempdir;
+
+#[test]
+fn test_dsl_command_runs_example_case() {
+    let case_path = concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../../tests/examples/simple_agent.toml"
+    );
+
+    Command::cargo_bin("mofa")
+        .expect("mofa bin")
+        .args(["test-dsl", case_path])
+        .assert()
+        .success()
+        .stdout(predicate::str::contains("status: passed"))
+        .stdout(predicate::str::contains("output: hello from DSL"));
+}
+
+#[test]
+fn test_dsl_command_emits_json() {
+    let case_path = concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../../tests/examples/tool_agent.toml"
+    );
+
+    Command::cargo_bin("mofa")
+        .expect("mofa bin")
+        .args(["--output-format", "json", "test-dsl", case_path])
+        .assert()
+        .success()
+        .stdout(predicate::str::contains("\"success\": true"))
+        .stdout(predicate::str::contains("\"tool_calls\""))
+        .stdout(predicate::str::contains("\"echo_tool\""));
+}
+
+#[test]
+fn test_dsl_command_writes_json_report_file() {
+    let case_path = concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../../tests/examples/simple_agent.toml"
+    );
+    let temp = tempdir().expect("temp dir");
+    let report_path = temp.path().join("dsl-report.json");
+
+    Command::cargo_bin("mofa")
+        .expect("mofa bin")
+        .args([
+            "test-dsl",
+            case_path,
+            "--report-out",
+            report_path.to_str().expect("utf8 report path"),
+            "--report-format",
+            "json",
+        ])
+        .assert()
+        .success();
+
+    let report = std::fs::read_to_string(&report_path).expect("report file exists");
+    assert!(report.contains("\"suite\": \"dsl\""));
+    assert!(report.contains("\"name\": \"simple_agent_run\""));
+    assert!(report.contains("\"status\": \"passed\""));
+}
+
+#[test]
+fn test_dsl_command_writes_text_report_file() {
+    let case_path = concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../../tests/examples/tool_agent.toml"
+    );
+    let temp = tempdir().expect("temp dir");
+    let report_path = temp.path().join("dsl-report.txt");
+
+    Command::cargo_bin("mofa")
+        .expect("mofa bin")
+        .args([
+            "test-dsl",
+            case_path,
+            "--report-out",
+            report_path.to_str().expect("utf8 report path"),
+            "--report-format",
+            "text",
+        ])
+        .assert()
+        .success();
+
+    let report = std::fs::read_to_string(&report_path).expect("report file exists");
+    assert!(report.contains("=== dsl ==="));
+    assert!(report.contains("tool_agent_run"));
+    assert!(report.contains("[+]"));
+}
+
+#[test]
+fn test_dsl_command_writes_canonical_artifact_file() {
+    let case_path = concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../../tests/examples/tool_agent.toml"
+    );
+    let temp = tempdir().expect("temp dir");
+    let artifact_path = temp.path().join("dsl-artifact.json");
+
+    Command::cargo_bin("mofa")
+        .expect("mofa bin")
+        .args([
+            "test-dsl",
+            case_path,
+            "--artifact-out",
+            artifact_path.to_str().expect("utf8 artifact path"),
+        ])
+        .assert()
+        .success();
+
+    let artifact = std::fs::read_to_string(&artifact_path).expect("artifact file exists");
+    assert!(artifact.contains("\"case_name\": \"tool_agent_run\""));
+    assert!(artifact.contains("\"status\": \"passed\""));
+    assert!(artifact.contains("\"assertions\""));
+    assert!(artifact.contains("\"tool_calls\""));
+}
diff --git a/crates/mofa-foundation/src/agent/context/prompt.rs b/crates/mofa-foundation/src/agent/context/prompt.rs
index 04ec30bd6..87cc8a127 100644
--- a/crates/mofa-foundation/src/agent/context/prompt.rs
+++ b/crates/mofa-foundation/src/agent/context/prompt.rs
@@ -142,6 +142,17 @@ impl PromptContext {
         self
     }
 
+    /// Replace the agent identity and set `agent_name` to the new identity's name.
+    pub fn set_identity(&mut self, identity: AgentIdentity) {
+        self.agent_name = identity.name.clone();
+        self.identity = identity;
+    }
+
+    /// Replace the bootstrap file list.
+    pub fn set_bootstrap_files(&mut self, files: Vec<String>) {
+        self.bootstrap_files = files;
+    }
+
     /// Set skills that should always be loaded
     pub fn with_always_load(mut self, skills: Vec<String>) -> Self {
         self.always_load = skills;
diff --git a/crates/mofa-foundation/src/agent/executor.rs b/crates/mofa-foundation/src/agent/executor.rs
index 605ab8fe0..15dca5065 100644
--- a/crates/mofa-foundation/src/agent/executor.rs
+++ b/crates/mofa-foundation/src/agent/executor.rs
@@ -548,6 +548,15 @@ impl AgentExecutor {
         &self.config
     }
 
+    /// Run `updater` on the prompt context while holding its write lock.
+    pub async fn update_prompt_context<F>(&self, updater: F)
+    where
+        F: FnOnce(&mut PromptContext),
+    {
+        let mut ctx = self.context.write().await;
+        updater(&mut ctx);
+    }
+
     /// Get mutable reference to base agent
     pub fn base_mut(&mut self) -> &mut BaseAgent {
         &mut self.base
@@ -586,7 +595,9 @@ impl MoFAAgent for AgentExecutor {
         self.base.initialize(ctx).await?;
 
         // Additional executor-specific initialization
-        self.base.transition_to(AgentState::Ready)?;
+        if self.base.state() != AgentState::Ready {
+            self.base.transition_to(AgentState::Ready)?;
+        }
 
         Ok(())
     }
@@ -643,7 +654,10 @@ mod tests {
             "mock"
         }
 
-        async fn chat(&self, _request: ChatCompletionRequest) -> AgentResult<ChatCompletionResponse> {
+        async fn chat(
+            &self,
+            _request: ChatCompletionRequest,
+        ) -> AgentResult<ChatCompletionResponse> {
             Ok(ChatCompletionResponse {
                 content: Some("ok".to_string()),
                 tool_calls: Some(Vec::<ToolCall>::new()),
diff --git a/crates/mofa-runtime/src/runner.rs b/crates/mofa-runtime/src/runner.rs
index 91ca5aaff..09a856636 100644
--- a/crates/mofa-runtime/src/runner.rs
+++ b/crates/mofa-runtime/src/runner.rs
@@ -349,6 +349,11 @@ impl<T: MoFAAgent> AgentRunner<T> {
         &self.context
     }
 
+    /// Set or clear (`None`) the session ID stored in the execution context.
+    pub fn set_session_id(&mut self, session_id: Option<String>) {
+        self.context.session_id = session_id;
+    }
+
     /// 获取运行器状态
     /// Get runner state
     pub async fn state(&self) -> RunnerState {
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index f952ce045..3da476607 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,9 @@
 [workspace]
 resolver = "3"
 members = [
+    "agent_runner_basic",
+    "agent_runner_custom_session",
+    "agent_runner_tools",
     "cli_production_smoke",
     "cli_agent_logs_demo",
     "cli_plugin_lifecycle",
diff --git a/examples/agent_runner_basic/Cargo.toml b/examples/agent_runner_basic/Cargo.toml
new file mode 100644
index 000000000..e49ef8dea
--- /dev/null
+++ b/examples/agent_runner_basic/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "agent_runner_basic"
+version.workspace = true
+edition.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+mofa-testing = { path = "../../tests" }
+tokio.workspace = true
diff --git a/examples/agent_runner_basic/src/main.rs b/examples/agent_runner_basic/src/main.rs
new file mode 100644
index 000000000..f49a93ee7
--- /dev/null
+++ b/examples/agent_runner_basic/src/main.rs
@@ -0,0 +1,29 @@
+use anyhow::Result;
+use mofa_testing::AgentTestRunner;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let mut runner = AgentTestRunner::new().await?;
+    runner.mock_llm().add_response("Hello from the runner").await;
+
+    let result = runner.run_text("hi").await?;
+    println!("Output: {}", result.output_text().unwrap_or_default());
+    println!(
+        "Session: {}",
+        result
+            .metadata
+            .session_id
+            .as_deref()
+            .unwrap_or("<none>")
+    );
+    println!("Workspace: {}", result.metadata.workspace_root.display());
+    println!(
+        "Runner stats: total={} success={} failed={}",
+        result.metadata.runner_stats_after.total_executions,
+        result.metadata.runner_stats_after.successful_executions,
+        result.metadata.runner_stats_after.failed_executions
+    );
+
+    runner.shutdown().await?;
+    Ok(())
+}
diff --git a/examples/agent_runner_custom_session/Cargo.toml b/examples/agent_runner_custom_session/Cargo.toml
new file mode 100644
index 000000000..52620ec07
--- /dev/null
+++ b/examples/agent_runner_custom_session/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "agent_runner_custom_session"
+version.workspace = true
+edition.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+mofa-testing = { path = "../../tests" }
+mofa-foundation = { path = "../../crates/mofa-foundation" }
+tokio.workspace = true
diff --git a/examples/agent_runner_custom_session/src/main.rs b/examples/agent_runner_custom_session/src/main.rs
new file mode 100644
index 000000000..fc03881b8
--- /dev/null
+++ b/examples/agent_runner_custom_session/src/main.rs
@@ -0,0 +1,42 @@
+use anyhow::Result;
+use mofa_foundation::agent::context::prompt::AgentIdentity;
+use mofa_testing::AgentTestRunner;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let mut runner = AgentTestRunner::new().await?;
+
+    runner.write_bootstrap_file("CUSTOM.md", "Custom bootstrap content.")?;
+    runner
+        .configure_prompt(
+            Some(AgentIdentity {
+                name: "RunnerDemo".to_string(),
+                description: "Custom identity for example runs".to_string(),
+                icon: None,
+            }),
+            Some(vec!["CUSTOM.md".to_string()]),
+        )
+        .await;
+
+    runner
+        .mock_llm()
+        .add_response("Custom session response")
+        .await;
+
+    let result = runner
+        .run_text_with_session("demo-session", "hello session")
+        .await?;
+
+    println!(
+        "Session id: {}",
+        result
+            .metadata
+            .session_id
+            .as_deref()
+            .unwrap_or("<none>")
+    );
+    println!("Output: {}", result.output_text().unwrap_or_default());
+
+    runner.shutdown().await?;
+    Ok(())
+}
diff --git a/examples/agent_runner_tools/Cargo.toml b/examples/agent_runner_tools/Cargo.toml
new file mode 100644
index 000000000..511578074
--- /dev/null
+++ b/examples/agent_runner_tools/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "agent_runner_tools"
+version.workspace = true
+edition.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+mofa-testing = { path = "../../tests" }
+serde_json.workspace = true
+tokio.workspace = true
diff --git a/examples/agent_runner_tools/src/main.rs b/examples/agent_runner_tools/src/main.rs
new file mode 100644
index 000000000..7399abf8b
--- /dev/null
+++ b/examples/agent_runner_tools/src/main.rs
@@ -0,0 +1,47 @@
+use anyhow::Result;
+use mofa_testing::{AgentTestRunner, MockTool};
+use serde_json::json;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let mut runner = AgentTestRunner::new().await?;
+
+    let tool = MockTool::new(
+        "echo_tool",
+        "Echo the provided input",
+        json!({
+            "type": "object",
+            "properties": {
+                "input": { "type": "string" }
+            },
+            "required": ["input"]
+        }),
+    );
+
+    runner.register_mock_tool(tool).await?;
+
+    runner
+        .mock_llm()
+        .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None)
+        .await;
+    runner
+        .mock_llm()
+        .add_response("Tool response completed")
+        .await;
+
+    let result = runner.run_text("use the tool").await?;
+    println!("Output: {}", result.output_text().unwrap_or_default());
+
+    for record in &result.metadata.tool_calls {
+        println!(
+            "Tool call: name={} input={} output={} duration_ms={:?}",
+            record.tool_name,
+            record.input,
+            record.output.as_ref().unwrap_or(&serde_json::Value::Null),
+            record.duration_ms
+        );
+    }
+
+    runner.shutdown().await?;
+    Ok(())
+}
diff --git a/tests/Cargo.toml b/tests/Cargo.toml
index a0597654f..7eec6a9f7 100644
--- a/tests/Cargo.toml
+++ b/tests/Cargo.toml
@@ -9,10 +9,14 @@ description = "Testing utilities for the MoFA agent framework"
 [dependencies]
 mofa-kernel = { path = "../crates/mofa-kernel" }
 mofa-foundation = { path = "../crates/mofa-foundation" }
+mofa-runtime = { path = "../crates/mofa-runtime" }
 tokio = { workspace = true }
 async-trait = { workspace = true }
 anyhow = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 chrono = { workspace = true }
+thiserror = { workspace = true }
+uuid = { workspace = true }
 regex = { workspace = true }
+toml = { workspace = true }
diff --git a/tests/examples/bootstrap_agent.toml b/tests/examples/bootstrap_agent.toml
new file mode 100644
index 000000000..9dd75291c
--- /dev/null
+++ b/tests/examples/bootstrap_agent.toml
@@ -0,0 +1,13 @@
+name = "bootstrap_agent_run"
+prompt = "What file was loaded?"
+expected_text = "Bootstrapped"
+
+[[bootstrap_files]]
+path = "AGENTS.md"
+content = "Bootstrapped instructions for the DSL test."
+
+[llm]
+responses = ["Bootstrapped response"]
+
+[assert]
+contains = "Bootstrapped"
diff --git a/tests/examples/simple_agent.toml b/tests/examples/simple_agent.toml
new file mode 100644
index 000000000..dc673d88a
--- /dev/null
+++ b/tests/examples/simple_agent.toml
@@ -0,0 +1,9 @@
+name = "simple_agent_run"
+prompt = "Say hello"
+expected_text = "hello"
+
+[llm]
+responses = ["hello from DSL"]
+
+[assert]
+contains = "hello"
diff --git a/tests/examples/tool_agent.toml b/tests/examples/tool_agent.toml
new file mode 100644
index 000000000..81be62550
--- /dev/null
+++ b/tests/examples/tool_agent.toml
@@ -0,0 +1,27 @@
+name = "tool_agent_run"
+input = "Use the echo tool and summarize the result."
+
+[agent]
+name = "ToolAgent"
+description = "Agent used to validate tool-aware DSL execution."
+
+[[tools]]
+name = "echo_tool"
+description = "Echo the provided input."
+schema = { type = "object", properties = { input = { type = "string" } }, required = ["input"] }
+result = "echoed from tool"
+
+[assert]
+contains = "Tool execution complete"
+tool_called = "echo_tool"
+
+[llm]
+
+[[llm.steps]]
+type = "tool_call"
+tool = "echo_tool"
+arguments = { input = "ping" }
+
+[[llm.steps]]
+type = "text"
+content = "Tool execution complete"
diff --git a/tests/src/agent_runner.rs b/tests/src/agent_runner.rs
new file mode 100644
index 000000000..ca22c1c1b
--- /dev/null
+++ b/tests/src/agent_runner.rs
@@ -0,0 +1,619 @@
+//! Real agent runner harness for integration-style tests.
+//!
+//! Provides a lightweight wrapper around the MoFA runtime `AgentRunner`
+//! with an isolated workspace and deterministic mock LLM.
+
+use async_trait::async_trait;
+use chrono::{DateTime, Utc};
+use mofa_foundation::agent::context::prompt::AgentIdentity;
+use mofa_foundation::agent::executor::{AgentExecutor, AgentExecutorConfig};
+use mofa_kernel::agent::context::AgentContext;
+use mofa_kernel::agent::core::MoFAAgent;
+use mofa_kernel::agent::error::{AgentError, AgentResult};
+use mofa_foundation::agent::components::tool::as_tool;
+use mofa_foundation::agent::components::tool::SimpleTool;
+use mofa_foundation::agent::session::{JsonlSessionStorage, Session, SessionStorage};
+use crate::tools::MockTool;
+use mofa_kernel::agent::types::{AgentInput, AgentOutput, ChatCompletionRequest};
+use mofa_kernel::agent::types::{ChatCompletionResponse, ToolCall};
+use mofa_kernel::agent::AgentCapabilities;
+use mofa_kernel::agent::AgentState;
+use mofa_runtime::runner::{AgentRunner, RunnerState, RunnerStats};
+use std::collections::VecDeque;
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+use thiserror::Error;
+use tokio::sync::RwLock;
+use uuid::Uuid;
+
+/// Errors returned by the agent runner harness itself.
+#[derive(Debug, Error)]
+#[non_exhaustive]
+pub enum AgentRunnerError {
+    #[error("failed to create test workspace: {0}")]
+    WorkspaceIo(#[from] std::io::Error),
+
+    #[error("agent runner failure: {0}")]
+    Agent(#[from] AgentError),
+}
+
+/// Metadata captured for each run.
+#[derive(Debug, Clone)]
+#[non_exhaustive]
+pub struct AgentRunMetadata {
+    pub agent_id: String,
+    pub agent_name: String,
+    pub execution_id: String,
+    pub session_id: Option<String>,
+    pub workspace_root: PathBuf,
+    pub runner_state_before: RunnerState,
+    pub runner_state_after: RunnerState,
+    pub runner_stats_before: RunnerStats,
+    pub runner_stats_after: RunnerStats,
+    pub agent_state_before: AgentState,
+    pub agent_state_after: AgentState,
+    pub started_at: DateTime<Utc>,
+    pub session_snapshot: Option<Session>,
+    pub tool_calls: Vec<ToolCallRecord>,
+    pub llm_last_request: Option<ChatCompletionRequest>,
+    pub llm_last_response: Option<ChatCompletionResponse>,
+    pub workspace_snapshot_before: WorkspaceSnapshot,
+    pub workspace_snapshot_after: WorkspaceSnapshot,
+}
+
+/// Result of a single agent run.
+#[derive(Debug)]
+#[non_exhaustive]
+pub struct AgentRunResult {
+    pub output: Option<AgentOutput>,
+    pub error: Option<AgentError>,
+    pub duration: Duration,
+    pub metadata: AgentRunMetadata,
+}
+
+/// Captures a tool call with its input and output.
+#[derive(Debug, Clone)]
+pub struct ToolCallRecord {
+    pub tool_name: String,
+    pub input: serde_json::Value,
+    pub output: Option<serde_json::Value>,
+    pub success: bool,
+    pub duration_ms: Option<u64>,
+    pub timed_out: bool,
+}
+
+/// Snapshot of files in the test workspace.
+#[derive(Debug, Clone)]
+pub struct WorkspaceSnapshot {
+    pub files: Vec<WorkspaceFileSnapshot>,
+}
+
+#[derive(Debug, Clone)]
+pub struct WorkspaceFileSnapshot {
+    pub relative_path: String,
+    pub size_bytes: u64,
+    pub modified_ms: Option<u64>,
+    pub checksum: u64,
+}
+
+impl AgentRunResult {
+    pub fn is_success(&self) -> bool {
+        self.error.is_none()
+    }
+
+    pub fn output_text(&self) -> Option<String> {
+        self.output.as_ref().map(AgentOutput::to_text)
+    }
+}
+
+/// Simple deterministic LLM provider for tests.
+#[derive(Debug)]
+pub struct MockAgentLLMProvider {
+    name: String,
+    responses: RwLock<VecDeque<MockLlmResponse>>,
+    default_response: RwLock<String>,
+    last_request: RwLock<Option<ChatCompletionRequest>>,
+    last_response: RwLock<Option<ChatCompletionResponse>>,
+}
+
+#[derive(Debug, Clone)]
+enum MockLlmResponse {
+    Text(String),
+    ToolCall {
+        content: Option<String>,
+        tool_calls: Vec<ToolCall>,
+    },
+    Error(String),
+}
+
+impl MockAgentLLMProvider {
+    pub fn new(name: impl Into<String>) -> Self {
+        Self {
+            name: name.into(),
+            responses: RwLock::new(VecDeque::new()),
+            default_response: RwLock::new("This is a mock response.".to_string()),
+            last_request: RwLock::new(None),
+            last_response: RwLock::new(None),
+        }
+    }
+
+    pub async fn add_response(&self, response: impl Into<String>) {
+        self.responses
+            .write()
+            .await
+            .push_back(MockLlmResponse::Text(response.into()));
+    }
+
+    pub async fn add_tool_call_response(
+        &self,
+        tool_name: &str,
+        arguments: serde_json::Value,
+        content: Option<String>,
+    ) {
+        let tool_call = ToolCall {
+            id: Uuid::now_v7().to_string(),
+            name: tool_name.to_string(),
+            arguments,
+        };
+        self.responses.write().await.push_back(MockLlmResponse::ToolCall {
+            content,
+            tool_calls: vec![tool_call],
+        });
+    }
+
+    pub async fn add_error_response(&self, message: impl Into<String>) {
+        self.responses
+            .write()
+            .await
+            .push_back(MockLlmResponse::Error(message.into()));
+    }
+
+    pub async fn set_default_response(&self, response: impl Into<String>) {
+        *self.default_response.write().await = response.into();
+    }
+
+    pub async fn pending_responses(&self) -> usize {
+        self.responses.read().await.len()
+    }
+
+    pub async fn last_request(&self) -> Option<ChatCompletionRequest> {
+        self.last_request.read().await.clone()
+    }
+
+    pub async fn last_response(&self) -> Option<ChatCompletionResponse> {
+        self.last_response.read().await.clone()
+    }
+}
+
+#[async_trait]
+impl mofa_kernel::agent::types::LLMProvider for MockAgentLLMProvider {
+    /// Provider name supplied at construction.
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Serve the next queued response, falling back to the default text.
+    ///
+    /// The request is always recorded. `last_response` is set on success and
+    /// cleared on a scripted error so it never pairs with a newer request.
+    async fn chat(
+        &self,
+        request: ChatCompletionRequest,
+    ) -> AgentResult<ChatCompletionResponse> {
+        *self.last_request.write().await = Some(request);
+
+        // Release the queue lock before touching any other lock.
+        let queued = self.responses.write().await.pop_front();
+        let (content, tool_calls) = match queued {
+            Some(MockLlmResponse::Text(text)) => (Some(text), Vec::new()),
+            Some(MockLlmResponse::ToolCall { content, tool_calls }) => (content, tool_calls),
+            Some(MockLlmResponse::Error(message)) => {
+                *self.last_response.write().await = None;
+                return Err(AgentError::ExecutionFailed(message));
+            }
+            None => (Some(self.default_response.read().await.clone()), Vec::new()),
+        };
+
+        let response = ChatCompletionResponse {
+            content,
+            tool_calls: Some(tool_calls),
+            usage: None,
+        };
+        *self.last_response.write().await = Some(response.clone());
+        Ok(response)
+    }
+}
+
+/// Thin wrapper around an `AgentExecutor`; every method here forwards to it.
+struct SessionAwareExecutor {
+    executor: AgentExecutor,
+}
+
+impl SessionAwareExecutor {
+    /// Wraps `executor`.
+    fn new(executor: AgentExecutor) -> Self {
+        SessionAwareExecutor { executor }
+    }
+
+    /// Forwards tool registration to the wrapped executor.
+    async fn register_tool(
+        &self,
+        tool: Arc<dyn mofa_kernel::agent::components::tool::DynTool>,
+    ) -> AgentResult<()> {
+        let inner = &self.executor;
+        inner.register_tool(tool).await
+    }
+
+    /// Forwards `updater` to the wrapped executor's prompt-context update.
+    async fn update_prompt_context<F>(&self, updater: F)
+    where
+        F: FnOnce(&mut mofa_foundation::agent::context::prompt::PromptContext),
+    {
+        let inner = &self.executor;
+        inner.update_prompt_context(updater).await;
+    }
+}
+
+// `MoFAAgent` implementation that delegates to the wrapped executor. Only
+// `execute` adds behavior: it routes the input text through
+// `process_message` under the context's session id.
+#[async_trait]
+impl MoFAAgent for SessionAwareExecutor {
+    fn id(&self) -> &str {
+        let inner = &self.executor;
+        inner.id()
+    }
+
+    fn name(&self) -> &str {
+        let inner = &self.executor;
+        inner.name()
+    }
+
+    fn capabilities(&self) -> &AgentCapabilities {
+        let inner = &self.executor;
+        inner.capabilities()
+    }
+
+    fn state(&self) -> mofa_kernel::agent::AgentState {
+        let inner = &self.executor;
+        inner.state()
+    }
+
+    async fn initialize(&mut self, ctx: &AgentContext) -> AgentResult<()> {
+        let inner = &mut self.executor;
+        inner.initialize(ctx).await
+    }
+
+    /// Sends the input text to the executor and wraps the reply as text output.
+    ///
+    /// Input without a text form is sent as an empty message, and a context
+    /// without a session id uses the session key `"default"`.
+    async fn execute(
+        &mut self,
+        input: AgentInput,
+        ctx: &AgentContext,
+    ) -> AgentResult<AgentOutput> {
+        let text = match input.as_text() {
+            Some(text) => text,
+            None => "",
+        };
+        let session_key = match ctx.session_id.as_deref() {
+            Some(id) => id,
+            None => "default",
+        };
+        let reply = self.executor.process_message(session_key, text).await?;
+        Ok(AgentOutput::text(reply))
+    }
+
+    async fn shutdown(&mut self) -> AgentResult<()> {
+        let inner = &mut self.executor;
+        inner.shutdown().await
+    }
+}
+
+/// Uniquely named scratch directory below the system temp directory.
+///
+/// The directory is removed on a best-effort basis when the value is dropped.
+struct TempWorkspace {
+    root: PathBuf,
+}
+
+impl TempWorkspace {
+    /// Creates the directory `<temp_dir>/<prefix>-<uuid v7>`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the directory cannot be created.
+    fn new(prefix: &str) -> Result<Self, AgentRunnerError> {
+        let root = std::env::temp_dir().join(format!("{}-{}", prefix, Uuid::now_v7()));
+        std::fs::create_dir_all(&root)?;
+        Ok(Self { root })
+    }
+
+    /// Returns the workspace root directory.
+    fn path(&self) -> &Path {
+        &self.root
+    }
+
+    /// Writes `content` to `relative_path` below the workspace root, creating
+    /// missing parent directories, and returns the full path that was written.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `InvalidInput` I/O error when `relative_path` is absolute or
+    /// contains a `..` component, because such a path would land outside the
+    /// workspace and would not be removed with it. Other I/O failures from
+    /// creating directories or writing the file are propagated.
+    fn write_file(&self, relative_path: &Path, content: &str) -> Result<PathBuf, AgentRunnerError> {
+        // Only plain names and `.` may appear; a root, drive prefix or `..`
+        // would let `join` escape (or replace) the workspace root.
+        let escapes_root = relative_path.components().any(|component| {
+            !matches!(
+                component,
+                std::path::Component::Normal(_) | std::path::Component::CurDir
+            )
+        });
+        if escapes_root {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidInput,
+                format!(
+                    "workspace path must be relative and must not contain `..`: {}",
+                    relative_path.display()
+                ),
+            )
+            .into());
+        }
+
+        let path = self.root.join(relative_path);
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent)?;
+        }
+        std::fs::write(&path, content)?;
+        Ok(path)
+    }
+
+    /// Captures the files currently below the root, sorted by relative path.
+    fn snapshot(&self) -> WorkspaceSnapshot {
+        let mut files = Vec::new();
+        collect_workspace_files(&self.root, &self.root, &mut files);
+        files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
+        WorkspaceSnapshot { files }
+    }
+}
+
+impl Drop for TempWorkspace {
+    /// Removes the workspace directory tree; a failure to do so is ignored.
+    fn drop(&mut self) {
+        // Cleanup is best-effort: there is nothing useful to do with an error
+        // while dropping, so the result is discarded.
+        let removal = std::fs::remove_dir_all(&self.root);
+        drop(removal);
+    }
+}
+
+/// Recursively appends a snapshot of every file below `current` to `files`.
+///
+/// Paths are stored relative to `root` with `/` separators. The walk is
+/// best-effort: a directory that cannot be listed, or an entry whose type or
+/// metadata cannot be read, is skipped, and a file whose content cannot be
+/// read is hashed as if it were empty.
+///
+/// Symlinked directories are not followed, so a link cycle or a link that
+/// points outside the workspace cannot make the walk run away.
+fn collect_workspace_files(root: &Path, current: &Path, files: &mut Vec<WorkspaceFileSnapshot>) {
+    let entries = match std::fs::read_dir(current) {
+        Ok(entries) => entries,
+        Err(_) => return,
+    };
+
+    for entry in entries.flatten() {
+        let path = entry.path();
+
+        // `DirEntry::file_type` does not traverse symlinks, unlike `Path::is_dir`.
+        let file_type = match entry.file_type() {
+            Ok(file_type) => file_type,
+            Err(_) => continue,
+        };
+        if file_type.is_dir() {
+            collect_workspace_files(root, &path, files);
+            continue;
+        }
+        if file_type.is_symlink() && path.is_dir() {
+            // A link to a directory: neither descend into it nor record it as a file.
+            continue;
+        }
+
+        let metadata = match entry.metadata() {
+            Ok(metadata) => metadata,
+            Err(_) => continue,
+        };
+
+        let size_bytes = metadata.len();
+        // Checked conversion: a value that does not fit in `u64` becomes `None`
+        // instead of being silently truncated.
+        let modified_ms = metadata
+            .modified()
+            .ok()
+            .and_then(|time| time.duration_since(std::time::UNIX_EPOCH).ok())
+            .and_then(|duration| u64::try_from(duration.as_millis()).ok());
+
+        // Unreadable files are hashed as empty content rather than dropped.
+        let bytes = std::fs::read(&path).unwrap_or_default();
+        let checksum = hash_bytes(&bytes);
+        let relative_path = path
+            .strip_prefix(root)
+            .unwrap_or(&path)
+            .to_string_lossy()
+            .replace('\\', "/");
+
+        files.push(WorkspaceFileSnapshot {
+            relative_path,
+            size_bytes,
+            modified_ms,
+            checksum,
+        });
+    }
+}
+
+/// Computes the 64-bit FNV-1a hash of `bytes`.
+///
+/// The value is written into run artifacts, so it has to be reproducible
+/// across toolchains. `std`'s `DefaultHasher` does not promise that: its
+/// algorithm is unspecified and may change between Rust releases. FNV-1a is a
+/// fixed, published algorithm; it is not collision-resistant against
+/// deliberately crafted input.
+fn hash_bytes(bytes: &[u8]) -> u64 {
+    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
+    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
+
+    bytes.iter().fold(FNV_OFFSET_BASIS, |hash, &byte| {
+        (hash ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
+    })
+}
+
+/// Test harness for running real agent execution paths.
+///
+/// Bundles a temporary workspace, a scripted mock LLM and an `AgentRunner`
+/// so a test can drive an agent and inspect what happened afterwards.
+pub struct AgentTestRunner {
+    /// Temporary workspace directory handed to the executor.
+    workspace: TempWorkspace,
+    /// Session id generated when the runner was constructed.
+    session_id: String,
+    /// Execution id generated when the runner was constructed.
+    execution_id: String,
+    /// Mock LLM; the same `Arc` is given to the executor at construction.
+    llm: Arc<MockAgentLLMProvider>,
+    /// Runner that owns the wrapped agent and its context.
+    runner: AgentRunner<SessionAwareExecutor>,
+    /// Mock tools registered through `register_mock_tool`, kept so their
+    /// call history can be collected after a run.
+    mock_tools: Vec<MockTool>,
+}
+
+impl AgentTestRunner {
+    /// Creates a test runner using `AgentExecutorConfig::default()`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any failure from [`Self::with_config`].
+    pub async fn new() -> Result<Self, AgentRunnerError> {
+        let config = AgentExecutorConfig::default();
+        Self::with_config(config).await
+    }
+
+    /// Creates a test runner whose executor uses `config`.
+    ///
+    /// A fresh temporary workspace, a mock LLM named `"mock-llm"` and new
+    /// execution and session ids are created for the runner.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the workspace, the executor or the runner cannot be created.
+    pub async fn with_config(config: AgentExecutorConfig) -> Result<Self, AgentRunnerError> {
+        let workspace = TempWorkspace::new("mofa-agent-test")?;
+        let llm = Arc::new(MockAgentLLMProvider::new("mock-llm"));
+        let agent = SessionAwareExecutor::new(
+            AgentExecutor::with_config(llm.clone(), workspace.path(), config).await?,
+        );
+
+        let execution_id = Uuid::now_v7().to_string();
+        let session_id = Uuid::now_v7().to_string();
+        let runner = AgentRunner::with_context(
+            agent,
+            AgentContext::with_session(&execution_id, &session_id),
+        )
+        .await?;
+
+        let mock_tools = Vec::new();
+        Ok(Self {
+            workspace,
+            session_id,
+            execution_id,
+            llm,
+            runner,
+            mock_tools,
+        })
+    }
+
+    /// Returns the root directory of the temporary workspace.
+    pub fn workspace(&self) -> &Path {
+        let workspace = &self.workspace;
+        workspace.path()
+    }
+
+    /// Returns the session id generated when the runner was constructed.
+    pub fn session_id(&self) -> &str {
+        self.session_id.as_str()
+    }
+
+    /// Returns the execution id generated when the runner was constructed.
+    pub fn execution_id(&self) -> &str {
+        self.execution_id.as_str()
+    }
+
+    /// Returns another handle to the mock LLM held by this runner.
+    pub fn mock_llm(&self) -> Arc<MockAgentLLMProvider> {
+        self.llm.clone()
+    }
+
+    /// Writes `content` to `filename` inside the workspace and returns the
+    /// path of the written file.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the failure when the file cannot be written.
+    pub fn write_bootstrap_file(
+        &self,
+        filename: &str,
+        content: &str,
+    ) -> Result<PathBuf, AgentRunnerError> {
+        let relative = Path::new(filename);
+        self.workspace.write_file(relative, content)
+    }
+
+    /// Writes `content` to `relative_path` inside the workspace and returns
+    /// the path of the written file.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the failure when the file cannot be written.
+    pub fn write_workspace_file(
+        &self,
+        relative_path: impl AsRef<Path>,
+        content: &str,
+    ) -> Result<PathBuf, AgentRunnerError> {
+        let relative: &Path = relative_path.as_ref();
+        self.workspace.write_file(relative, content)
+    }
+
+    /// Registers `tool` with the agent under test.
+    ///
+    /// # Errors
+    ///
+    /// Returns the registration failure converted into `AgentRunnerError`.
+    pub async fn register_simple_tool<T>(&self, tool: T) -> Result<(), AgentRunnerError>
+    where
+        T: mofa_foundation::agent::components::tool::SimpleTool + Send + Sync + 'static,
+    {
+        let registration = self.runner.agent().register_tool(as_tool(tool)).await;
+        match registration {
+            Ok(()) => Ok(()),
+            Err(err) => Err(AgentRunnerError::from(err)),
+        }
+    }
+
+    /// Registers a clone of `tool` with the agent and keeps `tool` so its
+    /// recorded calls can be collected after a run.
+    ///
+    /// # Errors
+    ///
+    /// Returns the registration failure; in that case the tool is not kept.
+    pub async fn register_mock_tool(&mut self, tool: MockTool) -> Result<(), AgentRunnerError> {
+        let registered = tool.clone();
+        self.register_simple_tool(registered).await?;
+        self.mock_tools.push(tool);
+        Ok(())
+    }
+
+    /// Updates the agent's prompt context.
+    ///
+    /// Each argument is applied only when it is `Some`; a `None` leaves the
+    /// corresponding setting untouched.
+    pub async fn configure_prompt(
+        &self,
+        identity: Option<AgentIdentity>,
+        bootstrap_files: Option<Vec<String>>,
+    ) {
+        let apply = move |ctx: &mut mofa_foundation::agent::context::prompt::PromptContext| {
+            if let Some(identity) = identity {
+                ctx.set_identity(identity);
+            }
+            if let Some(files) = bootstrap_files {
+                ctx.set_bootstrap_files(files);
+            }
+        };
+        self.runner.agent().update_prompt_context(apply).await;
+    }
+
+    /// Runs the agent on a text input; see [`Self::run_input`].
+    pub async fn run_text(&mut self, input: &str) -> Result<AgentRunResult, AgentRunnerError> {
+        let payload = AgentInput::text(input);
+        self.run_input(payload).await
+    }
+
+    /// Runs the agent on `input` with the context temporarily switched to
+    /// `session_id`.
+    ///
+    /// The previous session id is put back after the run, whether it
+    /// succeeded or not.
+    pub async fn run_text_with_session(
+        &mut self,
+        session_id: &str,
+        input: &str,
+    ) -> Result<AgentRunResult, AgentRunnerError> {
+        let previous = self.runner.context().session_id.clone();
+        self.runner.set_session_id(Some(session_id.to_string()));
+
+        let outcome = self.run_text(input).await;
+
+        // Restore before returning so an error does not leak the override.
+        self.runner.set_session_id(previous);
+        outcome
+    }
+
+    /// Runs the agent on each input in order and returns the results in the
+    /// same order.
+    ///
+    /// # Errors
+    ///
+    /// Stops at the first run that returns `Err` and returns that error.
+    pub async fn run_texts(
+        &mut self,
+        inputs: &[&str],
+    ) -> Result<Vec<AgentRunResult>, AgentRunnerError> {
+        let mut collected = Vec::with_capacity(inputs.len());
+        for &text in inputs.iter() {
+            let outcome = self.run_text(text).await?;
+            collected.push(outcome);
+        }
+        Ok(collected)
+    }
+
+    /// Executes `input` through the runner and captures diagnostics from
+    /// before and after the execution.
+    ///
+    /// This function itself always returns `Ok`: a failed execution is
+    /// reported through `AgentRunResult::error`, with `output` set to `None`.
+    pub async fn run_input(
+        &mut self,
+        input: AgentInput,
+    ) -> Result<AgentRunResult, AgentRunnerError> {
+        // "Before" observations; these must be taken prior to `execute`.
+        let started_at = Utc::now();
+        let runner_state_before = self.runner.state().await;
+        let runner_stats_before = self.runner.stats().await;
+        let agent_state_before = self.runner.agent_state();
+        let workspace_snapshot_before = self.workspace.snapshot();
+        // The timer brackets only the execution, not the snapshotting around it.
+        let timer = Instant::now();
+        let result = self.runner.execute(input).await;
+        let duration = timer.elapsed();
+        // "After" observations, taken whether or not the execution succeeded.
+        let runner_state_after = self.runner.state().await;
+        let runner_stats_after = self.runner.stats().await;
+        let agent_state_after = self.runner.agent_state();
+        let session_snapshot = self.load_session_snapshot().await;
+        let workspace_snapshot_after = self.workspace.snapshot();
+        // NOTE(review): built from each mock tool's full history, so this
+        // presumably includes calls from earlier runs too; confirm against
+        // `MockTool`.
+        let tool_calls = self.collect_tool_calls().await;
+        let llm_last_request = self.llm.last_request().await;
+        let llm_last_response = self.llm.last_response().await;
+
+        // Split the execution result so both halves can be stored side by side.
+        let (output, error) = match result {
+            Ok(output) => (Some(output), None),
+            Err(err) => (None, Some(err)),
+        };
+
+        let metadata = AgentRunMetadata {
+            agent_id: self.runner.agent().id().to_string(),
+            agent_name: self.runner.agent().name().to_string(),
+            execution_id: self.runner.context().execution_id.clone(),
+            session_id: self.runner.context().session_id.clone(),
+            workspace_root: self.workspace.path().to_path_buf(),
+            runner_state_before,
+            runner_state_after,
+            runner_stats_before,
+            runner_stats_after,
+            agent_state_before,
+            agent_state_after,
+            started_at,
+            session_snapshot,
+            tool_calls,
+            llm_last_request,
+            llm_last_response,
+            workspace_snapshot_before,
+            workspace_snapshot_after,
+        };
+
+        Ok(AgentRunResult {
+            output,
+            error,
+            duration,
+            metadata,
+        })
+    }
+
+    /// Shuts the runner down, consuming the test harness.
+    ///
+    /// # Errors
+    ///
+    /// Returns the runner's shutdown failure converted into `AgentRunnerError`.
+    pub async fn shutdown(self) -> Result<(), AgentRunnerError> {
+        match self.runner.shutdown().await {
+            Ok(_) => Ok(()),
+            Err(err) => Err(err.into()),
+        }
+    }
+
+    /// Loads the current session from JSONL storage in the workspace.
+    ///
+    /// Returns `None` when the context has no session id, when the storage
+    /// cannot be opened, or when loading fails or finds no session.
+    async fn load_session_snapshot(&self) -> Option<Session> {
+        let session_id = self.runner.context().session_id.as_deref()?;
+        let storage = match JsonlSessionStorage::new(self.workspace.path()).await {
+            Ok(storage) => storage,
+            Err(_) => return None,
+        };
+        storage.load(session_id).await.ok().flatten()
+    }
+
+    /// Builds one `ToolCallRecord` per recorded call of every registered mock tool.
+    ///
+    /// Calls and results are paired by position. A call with no result at the
+    /// same index is recorded as unsuccessful, with no output or duration.
+    async fn collect_tool_calls(&self) -> Vec<ToolCallRecord> {
+        let mut records = Vec::new();
+        for tool in &self.mock_tools {
+            let calls = tool.history().await;
+            let results = tool.results().await;
+            for (idx, call) in calls.into_iter().enumerate() {
+                let (output, success, duration_ms, timed_out) = match results.get(idx) {
+                    None => (None, false, None, false),
+                    Some(result) => (
+                        Some(result.output.clone()),
+                        result.success,
+                        // The duration arrives as a string in the result metadata.
+                        result
+                            .metadata
+                            .get("duration_ms")
+                            .and_then(|raw| raw.parse::<u64>().ok()),
+                        // A timeout is recognised by its error text.
+                        result
+                            .error
+                            .as_ref()
+                            .map_or(false, |err| err.contains("timed out")),
+                    ),
+                };
+                records.push(ToolCallRecord {
+                    tool_name: tool.name().to_string(),
+                    input: call.arguments,
+                    output,
+                    success,
+                    duration_ms,
+                    timed_out,
+                });
+            }
+        }
+        records
+    }
+}
diff --git a/tests/src/artifact.rs b/tests/src/artifact.rs
new file mode 100644
index 000000000..b121cd515
--- /dev/null
+++ b/tests/src/artifact.rs
@@ -0,0 +1,255 @@
+//! Canonical run artifacts for DSL-backed agent test execution.
+//!
+//! These types provide the stable, serializable output model for DSL runs,
+//! built from the existing runner result.
+
+use crate::agent_runner::{AgentRunResult, ToolCallRecord, WorkspaceFileSnapshot, WorkspaceSnapshot};
+use crate::dsl::{AssertionOutcome, TestCaseDsl};
+use mofa_foundation::agent::session::Session;
+use serde::{Deserialize, Serialize};
+
+/// Top-level artifact emitted for a single DSL-backed case execution.
+///
+/// Built by `AgentRunArtifact::from_run_result`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AgentRunArtifact {
+    /// Name of the DSL case that was run.
+    pub case_name: String,
+    /// `"passed"` when the run succeeded and every assertion passed,
+    /// otherwise `"failed"`.
+    pub status: String,
+    /// Text output of the run, if it produced any.
+    pub output_text: Option<String>,
+    /// Display form of the runner error, if the run failed.
+    pub runner_error: Option<String>,
+    /// Duration of the run in milliseconds.
+    pub duration_ms: u64,
+    /// Start time of the run, taken from `started_at.timestamp_millis()`.
+    pub started_at_ms: u64,
+    /// Execution id from the run metadata.
+    pub execution_id: String,
+    /// Session id from the run metadata, if one was set.
+    pub session_id: Option<String>,
+    /// Workspace root directory rendered as a string.
+    pub workspace_root: String,
+    /// Identity of the agent that executed the case.
+    pub agent: AgentArtifact,
+    /// Outcomes of the DSL assertions evaluated for this run.
+    pub assertions: Vec<AssertionOutcome>,
+    /// Tool calls recorded in the run metadata.
+    pub tool_calls: Vec<ToolCallArtifact>,
+    /// Last request seen by the LLM, if any.
+    pub llm_request: Option<LlmRequestArtifact>,
+    /// Last response produced by the LLM, if any.
+    pub llm_response: Option<LlmResponseArtifact>,
+    /// Session contents captured after the run, if available.
+    pub session_snapshot: Option<SessionArtifact>,
+    /// Workspace files captured before the run.
+    pub workspace_before: WorkspaceSnapshotArtifact,
+    /// Workspace files captured after the run.
+    pub workspace_after: WorkspaceSnapshotArtifact,
+}
+
+/// Compact identity data for the agent used by the run.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AgentArtifact {
+    /// Agent id from the run metadata.
+    pub id: String,
+    /// Agent name from the run metadata.
+    pub name: String,
+}
+
+/// Tool execution records are flattened into the artifact for downstream checks.
+///
+/// Every field is copied from the matching `ToolCallRecord` field.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ToolCallArtifact {
+    /// Name of the tool that was called.
+    pub tool_name: String,
+    /// Input of the call as JSON.
+    pub input: serde_json::Value,
+    /// Output of the call as JSON, if one was recorded.
+    pub output: Option<serde_json::Value>,
+    /// Success flag of the recorded call.
+    pub success: bool,
+    /// Duration of the call in milliseconds, if recorded.
+    pub duration_ms: Option<u64>,
+    /// Timeout flag of the recorded call.
+    pub timed_out: bool,
+}
+
+/// LLM request/response types keep only the fields needed for stable inspection.
+///
+/// This one mirrors the last request sent to the LLM.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmRequestArtifact {
+    /// Model field of the request, if set.
+    pub model: Option<String>,
+    /// Temperature field of the request, if set.
+    pub temperature: Option<f32>,
+    /// `max_tokens` field of the request, if set.
+    pub max_tokens: Option<u32>,
+    /// Messages of the request, in request order.
+    pub messages: Vec<LlmMessageArtifact>,
+    /// Names of the tools attached to the request; empty when it had none.
+    pub tool_names: Vec<String>,
+}
+
+/// Reduced view of the last response produced by the LLM.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmResponseArtifact {
+    /// Text content of the response, if any.
+    pub content: Option<String>,
+    /// Tool calls carried by the response; empty when it had none.
+    pub tool_calls: Vec<LlmToolCallArtifact>,
+    /// Token usage, if the response carried it.
+    pub usage: Option<TokenUsageArtifact>,
+}
+
+/// One message of an LLM request.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmMessageArtifact {
+    /// Role of the message.
+    pub role: String,
+    /// Text content of the message, if any.
+    pub content: Option<String>,
+    /// `tool_call_id` of the message, if set.
+    pub tool_call_id: Option<String>,
+    /// Tool calls attached to the message; empty when it had none.
+    pub tool_calls: Vec<LlmToolCallArtifact>,
+}
+
+/// One tool call carried by an LLM message or response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmToolCallArtifact {
+    /// Id of the tool call.
+    pub id: String,
+    /// Name of the tool being called.
+    pub name: String,
+    /// Arguments of the call as JSON.
+    pub arguments: serde_json::Value,
+}
+
+/// Token counts copied from an LLM response's usage data.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TokenUsageArtifact {
+    /// `prompt_tokens` value of the usage data.
+    pub prompt_tokens: u32,
+    /// `completion_tokens` value of the usage data.
+    pub completion_tokens: u32,
+    /// `total_tokens` value of the usage data.
+    pub total_tokens: u32,
+}
+
+/// Session contents reduced to its messages.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SessionArtifact {
+    /// Messages in the order they appear in the session.
+    pub messages: Vec<SessionMessageArtifact>,
+}
+
+/// One session message as a role/content pair.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SessionMessageArtifact {
+    /// Role of the message.
+    pub role: String,
+    /// Content of the message.
+    pub content: String,
+}
+
+/// File-level view of the workspace at one point in time.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WorkspaceSnapshotArtifact {
+    /// Files in the snapshot, in the snapshot's own order.
+    pub files: Vec<WorkspaceFileArtifact>,
+}
+
+/// One file of a workspace snapshot; fields are copied from
+/// `WorkspaceFileSnapshot`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WorkspaceFileArtifact {
+    /// Path of the file relative to the workspace root.
+    pub relative_path: String,
+    /// Size of the file in bytes.
+    pub size_bytes: u64,
+    /// Modification time in milliseconds, if it was available.
+    pub modified_ms: Option<u64>,
+    /// Checksum value recorded by the snapshot.
+    pub checksum: u64,
+}
+
+impl AgentRunArtifact {
+    /// Builds the canonical artifact from a runner result plus the DSL
+    /// assertion outcomes evaluated for it.
+    ///
+    /// `status` is `"passed"` only when the run succeeded and every assertion
+    /// passed. Missing tool lists in LLM messages, requests and responses are
+    /// rendered as empty lists.
+    pub fn from_run_result(
+        case: &TestCaseDsl,
+        result: &AgentRunResult,
+        assertions: Vec<AssertionOutcome>,
+    ) -> Self {
+        let metadata = &result.metadata;
+
+        let passed = result.is_success() && assertions.iter().all(|item| item.passed);
+        let status = String::from(if passed { "passed" } else { "failed" });
+
+        let llm_request = metadata.llm_last_request.as_ref().map(|request| {
+            let messages: Vec<LlmMessageArtifact> = request
+                .messages
+                .iter()
+                .map(|message| LlmMessageArtifact {
+                    role: message.role.clone(),
+                    content: message.content.clone(),
+                    tool_call_id: message.tool_call_id.clone(),
+                    tool_calls: message
+                        .tool_calls
+                        .iter()
+                        .flatten()
+                        .cloned()
+                        .map(llm_tool_call_artifact)
+                        .collect(),
+                })
+                .collect();
+            let tool_names: Vec<String> = request
+                .tools
+                .iter()
+                .flatten()
+                .map(|tool| tool.name.clone())
+                .collect();
+
+            LlmRequestArtifact {
+                model: request.model.clone(),
+                temperature: request.temperature,
+                max_tokens: request.max_tokens,
+                messages,
+                tool_names,
+            }
+        });
+
+        let llm_response = metadata.llm_last_response.as_ref().map(|response| {
+            let tool_calls: Vec<LlmToolCallArtifact> = response
+                .tool_calls
+                .iter()
+                .flatten()
+                .cloned()
+                .map(llm_tool_call_artifact)
+                .collect();
+            let usage = response.usage.as_ref().map(|usage| TokenUsageArtifact {
+                prompt_tokens: usage.prompt_tokens,
+                completion_tokens: usage.completion_tokens,
+                total_tokens: usage.total_tokens,
+            });
+
+            LlmResponseArtifact {
+                content: response.content.clone(),
+                tool_calls,
+                usage,
+            }
+        });
+
+        let tool_calls: Vec<ToolCallArtifact> =
+            metadata.tool_calls.iter().map(tool_call_artifact).collect();
+
+        Self {
+            case_name: case.name.clone(),
+            status,
+            output_text: result.output_text(),
+            runner_error: result.error.as_ref().map(ToString::to_string),
+            duration_ms: result.duration.as_millis() as u64,
+            started_at_ms: metadata.started_at.timestamp_millis() as u64,
+            execution_id: metadata.execution_id.clone(),
+            session_id: metadata.session_id.clone(),
+            workspace_root: metadata.workspace_root.display().to_string(),
+            agent: AgentArtifact {
+                id: metadata.agent_id.clone(),
+                name: metadata.agent_name.clone(),
+            },
+            assertions,
+            tool_calls,
+            llm_request,
+            llm_response,
+            session_snapshot: metadata.session_snapshot.as_ref().map(session_artifact),
+            workspace_before: workspace_snapshot_artifact(&metadata.workspace_snapshot_before),
+            workspace_after: workspace_snapshot_artifact(&metadata.workspace_snapshot_after),
+        }
+    }
+}
+
+/// Copies a runner `ToolCallRecord` into its serializable artifact form.
+fn tool_call_artifact(record: &ToolCallRecord) -> ToolCallArtifact {
+    let ToolCallRecord {
+        tool_name,
+        input,
+        output,
+        success,
+        duration_ms,
+        timed_out,
+    } = record;
+
+    ToolCallArtifact {
+        tool_name: tool_name.clone(),
+        input: input.clone(),
+        output: output.clone(),
+        success: *success,
+        duration_ms: *duration_ms,
+        timed_out: *timed_out,
+    }
+}
+
+/// Converts a kernel `ToolCall` into its artifact form, moving the fields over.
+fn llm_tool_call_artifact(tool_call: mofa_kernel::agent::types::ToolCall) -> LlmToolCallArtifact {
+    let mofa_kernel::agent::types::ToolCall {
+        id,
+        name,
+        arguments,
+        ..
+    } = tool_call;
+
+    LlmToolCallArtifact { id, name, arguments }
+}
+
+// Reduces a session to its ordered role/content pairs so that comparisons
+// between runs stay stable.
+fn session_artifact(session: &Session) -> SessionArtifact {
+    let mut messages = Vec::with_capacity(session.messages.len());
+    for message in &session.messages {
+        messages.push(SessionMessageArtifact {
+            role: message.role.clone(),
+            content: message.content.clone(),
+        });
+    }
+    SessionArtifact { messages }
+}
+
+// Converts a workspace snapshot into its artifact form, one entry per file,
+// keeping the snapshot's file order.
+fn workspace_snapshot_artifact(snapshot: &WorkspaceSnapshot) -> WorkspaceSnapshotArtifact {
+    let mut files = Vec::with_capacity(snapshot.files.len());
+    for file in &snapshot.files {
+        files.push(workspace_file_artifact(file));
+    }
+    WorkspaceSnapshotArtifact { files }
+}
+
+/// Copies one `WorkspaceFileSnapshot` into its serializable artifact form.
+fn workspace_file_artifact(file: &WorkspaceFileSnapshot) -> WorkspaceFileArtifact {
+    let WorkspaceFileSnapshot {
+        relative_path,
+        size_bytes,
+        modified_ms,
+        checksum,
+    } = file;
+
+    WorkspaceFileArtifact {
+        relative_path: relative_path.clone(),
+        size_bytes: *size_bytes,
+        modified_ms: *modified_ms,
+        checksum: *checksum,
+    }
+}
diff --git a/tests/src/assertions.rs b/tests/src/assertions.rs
index 1a0b58594..946efe134 100644
--- a/tests/src/assertions.rs
+++ b/tests/src/assertions.rs
@@ -84,3 +84,173 @@ macro_rules! assert_bus_message_sent {
         );
     }};
 }
+
+/// Assert a session's messages match the expected (role, content) pairs.
+///
+/// # Panics
+///
+/// Panics when the number of messages differs from `expected.len()`, or when
+/// any message's role or content differs from the pair at the same index.
+pub fn assert_session_messages(
+    session: &mofa_foundation::agent::session::Session,
+    expected: &[(&str, &str)],
+) {
+    let actual_len = session.messages.len();
+    let expected_len = expected.len();
+    assert_eq!(
+        actual_len, expected_len,
+        "Expected {} session messages, got {}",
+        expected_len, actual_len
+    );
+
+    // The lengths are equal at this point, so zipping visits every pair.
+    for (idx, (msg, (role, content))) in session.messages.iter().zip(expected).enumerate() {
+        assert_eq!(
+            msg.role, *role,
+            "Expected role '{}' at index {}, got '{}'",
+            role, idx, msg.role
+        );
+        assert_eq!(
+            msg.content, *content,
+            "Expected content '{}' at index {}, got '{}'",
+            content, idx, msg.content
+        );
+    }
+}
+
+/// Assert the most recent tool result matches the expected JSON output.
+///
+/// Awaits `last_result()` on the tool, so it must be used in an async context.
+/// Panics when the tool has no result or its output differs from the
+/// expected value.
+///
+/// # Example
+/// ```ignore
+/// assert_tool_last_result!(tool, json!("done"));
+/// ```
+#[macro_export]
+macro_rules! assert_tool_last_result {
+    ($tool:expr, $expected:expr) => {{
+        let latest = $tool
+            .last_result()
+            .await
+            .expect("Expected tool to have a result, but it was never executed");
+        let wanted = $expected;
+        assert_eq!(
+            latest.output, wanted,
+            "Expected latest tool result {:?}, got {:?}",
+            wanted, latest.output
+        );
+    }};
+}
+
+/// Assert the agent run produced the expected output text.
+///
+/// Compares `output_text()` of the result with the expected string and
+/// panics when they differ or when there is no output text.
+///
+/// # Example
+/// ```ignore
+/// assert_agent_output_text!(result, "hello");
+/// ```
+#[macro_export]
+macro_rules! assert_agent_output_text {
+    ($result:expr, $expected:expr) => {{
+        let wanted = $expected;
+        let produced = $result.output_text();
+        assert_eq!(
+            produced.as_deref(),
+            Some(wanted),
+            "Expected agent output {:?}, got {:?}",
+            wanted,
+            produced
+        );
+    }};
+}
+
+/// Assert the agent run failed with an error containing the given substring.
+///
+/// Panics when the result has no error, or when the error's display text
+/// does not contain the pattern.
+///
+/// # Example
+/// ```ignore
+/// assert_run_failed_with!(result, "timeout");
+/// ```
+#[macro_export]
+macro_rules! assert_run_failed_with {
+    ($result:expr, $pattern:expr) => {{
+        let needle = $pattern;
+        let failure = $result
+            .error
+            .as_ref()
+            .expect("Expected run to fail, but it succeeded");
+        let rendered = failure.to_string();
+        assert!(
+            rendered.contains(needle),
+            "Expected error containing {:?}, got {:?}",
+            needle,
+            rendered
+        );
+    }};
+}
+
+/// Assert the workspace snapshot contains a file with the given relative path.
+///
+/// The snapshot expression is evaluated exactly once and only borrowed, so an
+/// expression with side effects (or an expensive one) is not run a second
+/// time when the assertion fails and the found paths are listed.
+///
+/// # Example
+/// ```ignore
+/// assert_workspace_contains_file!(snapshot, "sessions/demo.jsonl");
+/// ```
+#[macro_export]
+macro_rules! assert_workspace_contains_file {
+    ($snapshot:expr, $relative_path:expr) => {{
+        let relative_path = $relative_path;
+        // Bind once; the failure message below reuses this borrow.
+        let snapshot = &$snapshot;
+        let found = snapshot
+            .files
+            .iter()
+            .any(|file| file.relative_path == relative_path);
+        assert!(
+            found,
+            "Expected workspace snapshot to contain {:?}, found paths: {:?}",
+            relative_path,
+            snapshot
+                .files
+                .iter()
+                .map(|file| file.relative_path.as_str())
+                .collect::<Vec<_>>()
+        );
+    }};
+}
+
+/// Assert the run metadata captured a tool call with the given tool name.
+///
+/// The result expression is evaluated exactly once and only borrowed, so it
+/// is not run a second time when the assertion fails and the recorded tool
+/// names are listed.
+///
+/// # Example
+/// ```ignore
+/// assert_run_recorded_tool_call!(result, "echo_tool");
+/// ```
+#[macro_export]
+macro_rules! assert_run_recorded_tool_call {
+    ($result:expr, $tool_name:expr) => {{
+        let tool_name = $tool_name;
+        // Bind once; the failure message below reuses this borrow.
+        let result = &$result;
+        let found = result
+            .metadata
+            .tool_calls
+            .iter()
+            .any(|record| record.tool_name == tool_name);
+        assert!(
+            found,
+            "Expected run metadata to contain tool call {:?}, found tool calls: {:?}",
+            tool_name,
+            result
+                .metadata
+                .tool_calls
+                .iter()
+                .map(|record| record.tool_name.as_str())
+                .collect::<Vec<_>>()
+        );
+    }};
+}
+
+/// Assert the runner total execution count after a run matches the expected value.
+///
+/// Reads `metadata.runner_stats_after.total_executions` from the result.
+///
+/// # Example
+/// ```ignore
+/// assert_runner_total_executions!(result, 1);
+/// ```
+#[macro_export]
+macro_rules! assert_runner_total_executions {
+    ($result:expr, $expected:expr) => {{
+        let wanted = $expected;
+        let counted = $result.metadata.runner_stats_after.total_executions;
+        assert_eq!(
+            counted, wanted,
+            "Expected runner total executions {}, got {}",
+            wanted, counted
+        );
+    }};
+}
diff --git a/tests/src/dsl.rs b/tests/src/dsl.rs
new file mode 100644
index 000000000..deb730f57
--- /dev/null
+++ b/tests/src/dsl.rs
@@ -0,0 +1,309 @@
+//! Minimal TOML DSL support for the testing MVP.
+//!
+//! This module keeps the schema intentionally small so contributors can define
+//! simple agent tests without introducing a full DSL framework yet.
+
+use crate::agent_runner::{AgentRunResult, AgentRunnerError, AgentTestRunner};
+use crate::tools::MockTool;
+use mofa_foundation::agent::context::prompt::AgentIdentity;
+use mofa_kernel::agent::components::tool::ToolResult;
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use std::path::Path;
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum DslError {
+    #[error("failed to read DSL file: {0}")]
+    Io(#[from] std::io::Error),
+
+    #[error("failed to parse TOML DSL: {0}")]
+    Toml(#[from] toml::de::Error),
+
+    #[error("runner error: {0}")]
+    Runner(#[from] AgentRunnerError),
+
+    #[error("test case must define either `prompt` or `input`")]
+    MissingPrompt,
+
+    #[error("expected output to contain `{expected}`, got `{actual}`")]
+    ExpectedContains { expected: String, actual: String },
+
+    #[error("expected tool `{tool}` to be called, found tool calls: {actual:?}")]
+    ExpectedToolCall { tool: String, actual: Vec<String> },
+
+    #[error("run produced no text output")]
+    MissingOutput,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct TestCaseDsl {
+    pub name: String,
+    pub prompt: Option<String>,
+    pub input: Option<String>,
+    pub expected_text: Option<String>,
+    #[serde(default)]
+    pub bootstrap_files: Vec<BootstrapFileDsl>,
+    pub agent: Option<AgentDsl>,
+    #[serde(default)]
+    pub tools: Vec<ToolDsl>,
+    pub llm: Option<LlmDsl>,
+    #[serde(rename = "assert")]
+    pub assertions: Option<AssertDsl>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct BootstrapFileDsl {
+    pub path: String,
+    pub content: String,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct AgentDsl {
+    pub name: Option<String>,
+    pub description: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct ToolDsl {
+    pub name: String,
+    pub description: String,
+    pub schema: Value,
+    pub result: Option<Value>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmDsl {
+    #[serde(default)]
+    pub responses: Vec<String>,
+    #[serde(default)]
+    pub steps: Vec<LlmStepDsl>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmStepDsl {
+    #[serde(rename = "type")]
+    pub kind: LlmStepKind,
+    pub content: Option<String>,
+    pub tool: Option<String>,
+    pub arguments: Option<Value>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum LlmStepKind {
+    Text,
+    ToolCall,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct AssertDsl {
+    pub contains: Option<String>,
+    pub tool_called: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AssertionOutcome {
+    pub kind: String,
+    pub expected: Value,
+    pub actual: Value,
+    pub passed: bool,
+}
+
+impl TestCaseDsl {
+    /// Parse a test case from TOML text.
+    pub fn from_toml_str(input: &str) -> Result<Self, DslError> {
+        toml::from_str(input).map_err(DslError::Toml)
+    }
+
+    /// Read the file at `path` and parse its contents as a TOML test case.
+    pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self, DslError> {
+        Self::from_toml_str(&std::fs::read_to_string(path)?)
+    }
+
+    /// Text to run the agent with: `prompt` when set, otherwise `input`.
+    fn execution_input(&self) -> Result<&str, DslError> {
+        let text = self.prompt.as_deref().or(self.input.as_deref());
+        text.ok_or(DslError::MissingPrompt)
+    }
+}
+
+/// Execute `case` and enforce its assertions, returning the first failure as an error.
+pub async fn run_test_case(case: &TestCaseDsl) -> Result<AgentRunResult, DslError> {
+    let result = execute_test_case(case).await?;
+    let outcomes = collect_assertion_outcomes(case, &result);
+    assertion_error_from_outcomes(&outcomes).map_or(Ok(result), Err)
+}
+
+/// Run `case` on a fresh runner and shut the runner down; assertions are not evaluated.
+pub async fn execute_test_case(case: &TestCaseDsl) -> Result<AgentRunResult, DslError> {
+    let input = case.execution_input()?; // validated before any runner exists
+    let mut runner = AgentTestRunner::new().await?;
+    configure_runner_from_test_case(case, &mut runner).await?;
+    let result = runner.run_text(input).await?;
+    runner.shutdown().await?;
+    Ok(result)
+}
+
+/// Apply the setup declared in `case` (bootstrap files, prompt, mock tools,
+/// scripted LLM replies) to an existing `runner` without running it.
+pub async fn configure_runner_from_test_case(
+    case: &TestCaseDsl,
+    runner: &mut AgentTestRunner,
+) -> Result<(), DslError> {
+    // Write every declared bootstrap file, remembering its path for the prompt.
+    let mut bootstrap_paths = Vec::with_capacity(case.bootstrap_files.len());
+    for BootstrapFileDsl { path, content } in &case.bootstrap_files {
+        runner.write_bootstrap_file(path, content)?;
+        bootstrap_paths.push(path.clone());
+    }
+
+    // Reconfigure the prompt only when the case declares bootstrap files or an
+    // `[agent]` table; the bootstrap list is passed only when it is non-empty.
+    let bootstrap_paths = (!bootstrap_paths.is_empty()).then_some(bootstrap_paths);
+    if bootstrap_paths.is_some() || case.agent.is_some() {
+        let identity = agent_identity(case.agent.as_ref());
+        runner.configure_prompt(identity, bootstrap_paths).await;
+    }
+
+    // Register each mock tool, stubbing its result first when one is declared.
+    for ToolDsl { name, description, schema, result } in &case.tools {
+        let mock_tool = MockTool::new(name, description, schema.clone());
+        if let Some(stubbed) = result {
+            let stubbed = ToolResult::success(stubbed.clone());
+            mock_tool.set_result(stubbed).await;
+        }
+        runner.register_mock_tool(mock_tool).await?;
+    }
+
+    // Queue deterministic LLM responses before execution so the DSL stays a thin
+    // adapter over the existing runner harness.
+    if let Some(llm) = &case.llm {
+        // Plain `responses` are a fallback, queued only when no `steps` exist.
+        if llm.steps.is_empty() {
+            for response in &llm.responses {
+                runner.mock_llm().add_response(response).await;
+            }
+        }
+        for step in &llm.steps {
+            match step.kind {
+                LlmStepKind::Text => {
+                    // A text step without `content` queues an empty reply.
+                    let content = step.content.clone().unwrap_or_default();
+                    runner.mock_llm().add_response(content).await;
+                }
+                LlmStepKind::ToolCall => {
+                    // Missing `tool` / `arguments` default to "" / JSON null.
+                    let tool = step.tool.as_deref().unwrap_or_default();
+                    let arguments = step.arguments.clone().unwrap_or(Value::Null);
+                    runner
+                        .mock_llm()
+                        .add_tool_call_response(tool, arguments, step.content.clone())
+                        .await;
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Prompt identity from the `[agent]` table; `None` if it is absent or has no `name`.
+fn agent_identity(agent: Option<&AgentDsl>) -> Option<AgentIdentity> {
+    let AgentDsl { name, description } = agent?;
+    Some(AgentIdentity {
+        name: name.clone()?,
+        description: description.clone().unwrap_or_default(),
+        icon: None,
+    })
+}
+
+/// Substring the run output must contain, if the case asserts one.
+///
+/// `[assert].contains` wins when present; `expected_text` is the MVP shorthand fallback.
+fn expected_contains(case: &TestCaseDsl) -> Option<&str> {
+    let assertions = case.assertions.as_ref();
+    let explicit = assertions.and_then(|block| block.contains.as_deref());
+    explicit.or(case.expected_text.as_deref())
+}
+
+/// Tool name that `[assert].tool_called` requires to appear in the run, if any.
+fn expected_tool_call(case: &TestCaseDsl) -> Option<&str> {
+    let assertions = case.assertions.as_ref()?;
+    assertions.tool_called.as_deref()
+}
+
+/// Evaluate every assertion declared by `case` against `result`.
+///
+/// One [`AssertionOutcome`] is recorded per declared check, whether it passed
+/// or failed, so callers can report all of them. When both are declared, the
+/// `contains` outcome precedes the `tool_called` outcome.
+pub fn collect_assertion_outcomes(case: &TestCaseDsl, result: &AgentRunResult) -> Vec<AssertionOutcome> {
+    // `contains`: the text output must include the expected substring. A run
+    // without text output records `actual` as JSON null and fails the check.
+    let contains = expected_contains(case).map(|expected| {
+        let output = result.output_text();
+        let passed = matches!(&output, Some(text) if text.contains(expected));
+        AssertionOutcome {
+            kind: "contains".to_string(),
+            expected: Value::String(expected.to_string()),
+            actual: output.map_or(Value::Null, Value::String),
+            passed,
+        }
+    });
+
+    // `tool_called`: at least one recorded tool call must carry the expected
+    // name. `actual` lists every recorded tool name, in recorded order.
+    let tool_called = expected_tool_call(case).map(|expected_tool| {
+        let records = &result.metadata.tool_calls;
+        let passed = records.iter().any(|record| record.tool_name == expected_tool);
+        let names = records.iter().map(|record| Value::String(record.tool_name.clone()));
+        AssertionOutcome {
+            kind: "tool_called".to_string(),
+            expected: Value::String(expected_tool.to_string()),
+            actual: Value::Array(names.collect()),
+            passed,
+        }
+    });
+
+    let mut outcomes = Vec::with_capacity(2);
+    outcomes.extend(contains);
+    outcomes.extend(tool_called);
+    outcomes
+}
+
+/// Convert the first failed outcome into the matching [`DslError`].
+///
+/// Returns `None` when every outcome passed. Outcomes are scanned in order:
+/// - a failed `contains` maps to [`DslError::MissingOutput`] when `actual` is
+///   JSON null, otherwise to [`DslError::ExpectedContains`];
+/// - a failed `tool_called` maps to [`DslError::ExpectedToolCall`].
+///
+/// NOTE(review): a failed outcome whose `kind` is neither of the above is
+/// skipped, so it cannot produce an error here — confirm this is intended.
+pub fn assertion_error_from_outcomes(outcomes: &[AssertionOutcome]) -> Option<DslError> {
+    // JSON values that are not strings are rendered as an empty string.
+    let text = |value: &Value| value.as_str().unwrap_or_default().to_string();
+
+    outcomes
+        .iter()
+        .filter(|outcome| !outcome.passed)
+        .find_map(|outcome| match outcome.kind.as_str() {
+            "contains" if outcome.actual.is_null() => Some(DslError::MissingOutput),
+            "contains" => Some(DslError::ExpectedContains {
+                expected: text(&outcome.expected),
+                actual: text(&outcome.actual),
+            }),
+            "tool_called" => Some(DslError::ExpectedToolCall {
+                tool: text(&outcome.expected),
+                // A non-array `actual` yields no names; non-string items are skipped.
+                actual: outcome
+                    .actual
+                    .as_array()
+                    .into_iter()
+                    .flatten()
+                    .filter_map(|value| value.as_str().map(ToString::to_string))
+                    .collect(),
+            }),
+            _ => None,
+        })
+}
diff --git a/tests/src/lib.rs b/tests/src/lib.rs
index 3368dc49b..2490ae668 100644
--- a/tests/src/lib.rs
+++ b/tests/src/lib.rs
@@ -4,16 +4,33 @@
 //! control for testing MoFA agents.
 
 pub mod adversarial;
+pub mod agent_runner;
+pub mod artifact;
 pub mod assertions;
 pub mod backend;
 pub mod bus;
 pub mod clock;
+pub mod dsl;
 pub mod report;
 pub mod tools;
 
 pub use backend::MockLLMBackend;
 pub use bus::MockAgentBus;
 pub use clock::{Clock, MockClock, SystemClock};
+pub use dsl::{
+    assertion_error_from_outcomes, collect_assertion_outcomes, configure_runner_from_test_case,
+    execute_test_case, run_test_case, AgentDsl, AssertDsl, AssertionOutcome, BootstrapFileDsl,
+    DslError, LlmDsl, LlmStepDsl, LlmStepKind, TestCaseDsl, ToolDsl,
+};
+pub use agent_runner::{
+    AgentRunMetadata, AgentRunResult, AgentRunnerError, AgentTestRunner, MockAgentLLMProvider,
+    ToolCallRecord, WorkspaceFileSnapshot, WorkspaceSnapshot,
+};
+pub use artifact::{
+    AgentArtifact, AgentRunArtifact, LlmMessageArtifact, LlmRequestArtifact, LlmResponseArtifact,
+    LlmToolCallArtifact, SessionArtifact, SessionMessageArtifact, TokenUsageArtifact,
+    ToolCallArtifact, WorkspaceFileArtifact, WorkspaceSnapshotArtifact,
+};
 pub use report::{
     JsonFormatter, ReportFormatter, TestCaseResult, TestReport, TestReportBuilder, TestStatus,
     TextFormatter,
diff --git a/tests/src/tools.rs b/tests/src/tools.rs
index 48356b551..fbc6bb5b6 100644
--- a/tests/src/tools.rs
+++ b/tests/src/tools.rs
@@ -20,6 +20,7 @@ pub struct MockTool {
     category: ToolCategory,
     pub stubbed_result: Arc<RwLock<ToolResult>>,
     pub call_history: Arc<RwLock<Vec<ToolInput>>>,
+    pub result_history: Arc<RwLock<Vec<ToolResult>>>,
     failure_queue: Arc<RwLock<VecDeque<String>>>,
     failure_patterns: Arc<RwLock<Vec<(Value, String)>>>,
     result_sequence: Arc<RwLock<VecDeque<ToolResult>>>,
@@ -37,6 +38,7 @@ impl MockTool {
                 "Mock execution default",
             ))),
             call_history: Arc::new(RwLock::new(Vec::new())),
+            result_history: Arc::new(RwLock::new(Vec::new())),
             failure_queue: Arc::new(RwLock::new(VecDeque::new())),
             failure_patterns: Arc::new(RwLock::new(Vec::new())),
             result_sequence: Arc::new(RwLock::new(VecDeque::new())),
@@ -58,6 +60,16 @@ impl MockTool {
         self.call_history.read().await.len()
     }
 
+    /// Retrieve a clone of the full result history.
+    pub async fn results(&self) -> Vec<ToolResult> {
+        self.result_history.read().await.clone()
+    }
+
+    /// Returns the most recent result, or `None` if never executed.
+    pub async fn last_result(&self) -> Option<ToolResult> {
+        self.result_history.read().await.last().cloned()
+    }
+
     /// Queue failures for the next N calls.
     pub async fn fail_next(&self, count: usize, error_msg: &str) {
         let mut queue = self.failure_queue.write().await;
@@ -110,12 +122,18 @@ impl SimpleTool for MockTool {
 
     async fn execute(&self, input: ToolInput) -> ToolResult {
         self.call_history.write().await.push(input.clone());
+        let start = std::time::Instant::now();
 
         // 1. Drain failure queue
         {
             let mut queue = self.failure_queue.write().await;
             if let Some(err) = queue.pop_front() {
-                return ToolResult::failure(err);
+                let mut result = ToolResult::failure(err);
+                result
+                    .metadata
+                    .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string());
+                self.result_history.write().await.push(result.clone());
+                return result;
             }
         }
 
@@ -124,7 +142,12 @@ impl SimpleTool for MockTool {
             let patterns = self.failure_patterns.read().await;
             for (pattern, err) in patterns.iter() {
                 if input.arguments == *pattern {
-                    return ToolResult::failure(err);
+                    let mut result = ToolResult::failure(err);
+                    result
+                        .metadata
+                        .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string());
+                    self.result_history.write().await.push(result.clone());
+                    return result;
                 }
             }
         }
@@ -133,11 +156,21 @@ impl SimpleTool for MockTool {
         {
             let mut seq = self.result_sequence.write().await;
             if let Some(result) = seq.pop_front() {
+                let mut result = result;
+                result
+                    .metadata
+                    .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string());
+                self.result_history.write().await.push(result.clone());
                 return result;
             }
         }
 
-        self.stubbed_result.read().await.clone()
+        let mut result = self.stubbed_result.read().await.clone();
+        result
+            .metadata
+            .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string());
+        self.result_history.write().await.push(result.clone());
+        result
     }
 
     fn category(&self) -> ToolCategory {
diff --git a/tests/tests/agent_runner_tests.rs b/tests/tests/agent_runner_tests.rs
new file mode 100644
index 000000000..250b9e62d
--- /dev/null
+++ b/tests/tests/agent_runner_tests.rs
@@ -0,0 +1,311 @@
+use mofa_testing::agent_runner::AgentTestRunner;
+use mofa_testing::assertions::assert_session_messages;
+use mofa_testing::tools::MockTool;
+use serde_json::json;
+
+#[tokio::test]
+async fn agent_runner_executes_and_captures_output() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner
+        .mock_llm()
+        .add_response("Mocked response")
+        .await;
+
+    let result = runner
+        .run_text("hello")
+        .await
+        .expect("run should succeed");
+
+    assert!(result.is_success());
+    assert_eq!(result.output_text().as_deref(), Some("Mocked response"));
+    assert_eq!(
+        result.metadata.session_id.as_deref(),
+        Some(runner.session_id())
+    );
+    assert_eq!(result.metadata.execution_id, runner.execution_id());
+    assert_eq!(result.metadata.runner_stats_before.total_executions, 0);
+    assert_eq!(result.metadata.runner_stats_after.total_executions, 1);
+    assert!(result.metadata.session_snapshot.is_some());
+    let snapshot = result.metadata.session_snapshot.as_ref().unwrap();
+    assert_eq!(snapshot.len(), 2);
+    assert_session_messages(snapshot, &[("user", "hello"), ("assistant", "Mocked response")]);
+
+    let expected_session_path = format!("sessions/{}.jsonl", runner.session_id());
+    assert!(
+        !result
+            .metadata
+            .workspace_snapshot_before
+            .files
+            .iter()
+            .any(|file| file.relative_path == expected_session_path)
+    );
+    assert!(
+        result
+            .metadata
+            .workspace_snapshot_after
+            .files
+            .iter()
+            .any(|file| file.relative_path == expected_session_path)
+    );
+
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn agent_runner_creates_isolated_workspaces() {
+    let mut runner_a = AgentTestRunner::new().await.expect("runner A initializes");
+    let mut runner_b = AgentTestRunner::new().await.expect("runner B initializes");
+
+    assert_ne!(runner_a.workspace(), runner_b.workspace());
+
+    runner_a
+        .mock_llm()
+        .add_response("Response A")
+        .await;
+    runner_b
+        .mock_llm()
+        .add_response("Response B")
+        .await;
+
+    let _ = runner_a
+        .run_text("hi")
+        .await
+        .expect("runner A executes");
+    let _ = runner_b
+        .run_text("hi")
+        .await
+        .expect("runner B executes");
+
+    // Session files should exist in each separate workspace.
+    let session_a = runner_a
+        .workspace()
+        .join("sessions")
+        .join(format!("{}.jsonl", runner_a.session_id()));
+    let session_b = runner_b
+        .workspace()
+        .join("sessions")
+        .join(format!("{}.jsonl", runner_b.session_id()));
+
+    assert!(session_a.exists());
+    assert!(session_b.exists());
+
+    runner_a.shutdown().await.expect("shutdown A");
+    runner_b.shutdown().await.expect("shutdown B");
+}
+
+#[tokio::test]
+async fn agent_runner_executes_tool_calls() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+
+    let tool = MockTool::new(
+        "echo_tool",
+        "Echo the provided input",
+        json!({
+            "type": "object",
+            "properties": {
+                "input": { "type": "string" }
+            },
+            "required": ["input"]
+        }),
+    );
+
+    runner
+        .register_mock_tool(tool.clone())
+        .await
+        .expect("tool registered");
+
+    // First response triggers a tool call; second response is the final answer.
+    runner
+        .mock_llm()
+        .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None)
+        .await;
+    runner
+        .mock_llm()
+        .add_response("Final response")
+        .await;
+
+    let result = runner
+        .run_text("use tool")
+        .await
+        .expect("run should succeed");
+
+    // Tool call should be captured in both tool history and run metadata.
+    assert_eq!(result.output_text().as_deref(), Some("Final response"));
+    assert_eq!(tool.call_count().await, 1);
+    let last_call = tool.last_call().await.expect("tool call captured");
+    assert_eq!(last_call.arguments, json!({ "input": "ping" }));
+    assert_eq!(result.metadata.tool_calls.len(), 1);
+    let record = &result.metadata.tool_calls[0];
+    assert_eq!(record.tool_name, "echo_tool");
+    assert_eq!(record.input, json!({ "input": "ping" }));
+    assert!(record.success);
+    assert_eq!(
+        record.output,
+        Some(json!("Mock execution default"))
+    );
+    assert!(record.duration_ms.is_some());
+
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn agent_runner_loads_bootstrap_files() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner
+        .write_bootstrap_file("AGENTS.md", "Bootstrap content for agent test.")
+        .expect("bootstrap file written");
+
+    runner
+        .mock_llm()
+        .add_response("Bootstrapped response")
+        .await;
+
+    let _ = runner
+        .run_text("check prompt")
+        .await
+        .expect("run should succeed");
+
+    let request = runner
+        .mock_llm()
+        .last_request()
+        .await
+        .expect("request captured");
+    // Validate the system message includes the bootstrap content.
+    let system_message = request
+        .messages
+        .first()
+        .and_then(|msg| msg.content.as_deref())
+        .expect("system message content");
+
+    assert!(system_message.contains("AGENTS.md"));
+    assert!(system_message.contains("Bootstrap content for agent test."));
+
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn agent_runner_supports_multi_turn_runs() {
+    // Multi-turn helper should keep the same session and extend history.
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner.mock_llm().add_response("First reply").await;
+    runner.mock_llm().add_response("Second reply").await;
+
+    let results = runner
+        .run_texts(&["turn one", "turn two"])
+        .await
+        .expect("multi-turn run succeeds");
+
+    assert_eq!(results.len(), 2);
+    assert_eq!(results[0].output_text().as_deref(), Some("First reply"));
+    assert_eq!(results[1].output_text().as_deref(), Some("Second reply"));
+
+    // Session snapshot should contain two user/assistant pairs.
+    let snapshot = results
+        .last()
+        .and_then(|result| result.metadata.session_snapshot.as_ref())
+        .expect("session snapshot captured");
+    assert_eq!(snapshot.len(), 4);
+    assert_session_messages(
+        snapshot,
+        &[
+            ("user", "turn one"),
+            ("assistant", "First reply"),
+            ("user", "turn two"),
+            ("assistant", "Second reply"),
+        ],
+    );
+
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn agent_runner_customizes_prompt_identity_and_bootstraps() {
+    // Custom identity + bootstrap list should appear in the system prompt.
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner
+        .write_bootstrap_file("CUSTOM.md", "Custom bootstrap content.")
+        .expect("bootstrap file written");
+    runner
+        .configure_prompt(
+            Some(mofa_foundation::agent::context::prompt::AgentIdentity {
+                name: "TestAgent".to_string(),
+                description: "Custom identity".to_string(),
+                icon: None,
+            }),
+            Some(vec!["CUSTOM.md".to_string()]),
+        )
+        .await;
+
+    runner.mock_llm().add_response("Custom response").await;
+    let _ = runner
+        .run_text("custom prompt")
+        .await
+        .expect("run should succeed");
+
+    let request = runner
+        .mock_llm()
+        .last_request()
+        .await
+        .expect("request captured");
+    // Validate custom identity and bootstrap content.
+    let system_message = request
+        .messages
+        .first()
+        .and_then(|msg| msg.content.as_deref())
+        .expect("system message content");
+
+    assert!(system_message.contains("TestAgent"));
+    assert!(system_message.contains("Custom identity"));
+    assert!(system_message.contains("CUSTOM.md"));
+    assert!(system_message.contains("Custom bootstrap content."));
+
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn agent_runner_captures_llm_failure() {
+    // LLM failures should surface in AgentRunResult with failed stats.
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner
+        .mock_llm()
+        .add_error_response("mock failure")
+        .await;
+
+    let result = runner
+        .run_text("trigger failure")
+        .await
+        .expect("run should return result");
+
+    assert!(!result.is_success());
+    let error = result.error.expect("error captured");
+    assert!(error.to_string().contains("mock failure"));
+    assert_eq!(result.metadata.runner_stats_after.failed_executions, 1);
+
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn agent_runner_allows_custom_session_keys() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner
+        .mock_llm()
+        .add_response("Custom session response")
+        .await;
+
+    let result = runner
+        .run_text_with_session("custom-session", "hello session")
+        .await
+        .expect("run should succeed");
+
+    assert_eq!(
+        result.metadata.session_id.as_deref(),
+        Some("custom-session")
+    );
+    let session_path = runner
+        .workspace()
+        .join("sessions")
+        .join("custom-session.jsonl");
+    assert!(session_path.exists());
+
+    runner.shutdown().await.expect("shutdown succeeds");
+}
diff --git a/tests/tests/artifact_tests.rs b/tests/tests/artifact_tests.rs
new file mode 100644
index 000000000..6876ef661
--- /dev/null
+++ b/tests/tests/artifact_tests.rs
@@ -0,0 +1,60 @@
+//! Tests for canonical DSL run artifact generation.
+
+use mofa_testing::{
+    AgentRunArtifact, TestCaseDsl, assertion_error_from_outcomes, collect_assertion_outcomes,
+    execute_test_case,
+};
+
+#[tokio::test]
+async fn artifact_contains_core_run_data() {
+    let case = TestCaseDsl::from_toml_file(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/examples/tool_agent.toml"
+    ))
+    .expect("tool DSL example should parse");
+
+    let result = execute_test_case(&case)
+        .await
+        .expect("DSL case should execute");
+    let assertions = collect_assertion_outcomes(&case, &result);
+    let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions);
+
+    assert_eq!(artifact.case_name, "tool_agent_run");
+    assert_eq!(artifact.status, "passed");
+    assert_eq!(artifact.output_text.as_deref(), Some("Tool execution complete"));
+    assert_eq!(artifact.tool_calls.len(), 1);
+    assert_eq!(artifact.tool_calls[0].tool_name, "echo_tool");
+    assert!(!artifact.agent.name.is_empty());
+    assert!(artifact.llm_request.is_some());
+    assert!(artifact.workspace_after.files.iter().any(|file| {
+        file.relative_path.ends_with(".jsonl")
+    }));
+}
+
+#[tokio::test]
+async fn artifact_captures_failed_assertions() {
+    let case = TestCaseDsl::from_toml_str(
+        r#"
+name = "failing_case"
+prompt = "Say hello"
+
+[llm]
+responses = ["wrong output"]
+
+[assert]
+contains = "expected text"
+"#,
+    )
+    .expect("inline DSL should parse");
+
+    let result = execute_test_case(&case)
+        .await
+        .expect("DSL case should execute");
+    let assertions = collect_assertion_outcomes(&case, &result);
+    let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions.clone());
+
+    assert_eq!(artifact.status, "failed");
+    assert_eq!(artifact.assertions.len(), 1);
+    assert!(!artifact.assertions[0].passed);
+    assert!(assertion_error_from_outcomes(&assertions).is_some());
+}
diff --git a/tests/tests/assertion_macro_tests.rs b/tests/tests/assertion_macro_tests.rs
index 55c2af2f4..c27aee010 100644
--- a/tests/tests/assertion_macro_tests.rs
+++ b/tests/tests/assertion_macro_tests.rs
@@ -1,11 +1,11 @@
-//! Tests for assertion macros: assert_tool_called!, assert_tool_called_with!,
-//! assert_infer_called!, assert_bus_message_sent!.
+//! Tests for assertion macros and assertion helpers used by the testing crate.
 
 use mofa_foundation::agent::components::tool::SimpleTool;
 use mofa_foundation::orchestrator::{ModelOrchestrator, ModelProviderConfig, ModelType};
 use mofa_kernel::agent::components::tool::ToolInput;
 use mofa_kernel::bus::CommunicationMode;
 use mofa_kernel::message::AgentMessage;
+use mofa_testing::agent_runner::AgentTestRunner;
 use mofa_testing::backend::MockLLMBackend;
 use mofa_testing::bus::MockAgentBus;
 use mofa_testing::tools::MockTool;
@@ -121,3 +121,89 @@ async fn assert_bus_message_sent_panics_when_no_message_from_sender() {
 
     mofa_testing::assert_bus_message_sent!(bus, "agent-2");
 }
+
+// ===================================================================
+// New assertion helpers
+// ===================================================================
+
+#[tokio::test]
+async fn assert_tool_last_result_passes_on_matching_output() {
+    let tool = MockTool::new("search", "Search tool", json!({"type": "object"}));
+    tool.execute(ToolInput::from_json(json!({"query": "rust"})))
+        .await;
+
+    mofa_testing::assert_tool_last_result!(tool, json!("Mock execution default"));
+}
+
+#[tokio::test]
+async fn assert_agent_output_text_passes_on_matching_output() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner.mock_llm().add_response("hello from runner").await;
+
+    let result = runner.run_text("hello").await.expect("run succeeds");
+
+    mofa_testing::assert_agent_output_text!(result, "hello from runner");
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn assert_run_failed_with_passes_on_matching_error() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner.mock_llm().add_error_response("mock failure").await;
+
+    let result = runner.run_text("hello").await.expect("run completes");
+
+    mofa_testing::assert_run_failed_with!(result, "mock failure");
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn assert_workspace_contains_file_passes_when_snapshot_has_file() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner.mock_llm().add_response("workspace ready").await;
+
+    let result = runner.run_text("hello").await.expect("run succeeds");
+    let expected = format!("sessions/{}.jsonl", runner.session_id());
+
+    mofa_testing::assert_workspace_contains_file!(result.metadata.workspace_snapshot_after, expected);
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn assert_run_recorded_tool_call_passes_when_tool_metadata_exists() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    let tool = MockTool::new(
+        "echo_tool",
+        "Echo tool",
+        json!({
+            "type": "object",
+            "properties": { "input": { "type": "string" } },
+            "required": ["input"]
+        }),
+    );
+    runner
+        .register_mock_tool(tool)
+        .await
+        .expect("tool registered");
+    runner
+        .mock_llm()
+        .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None)
+        .await;
+    runner.mock_llm().add_response("done").await;
+
+    let result = runner.run_text("use tool").await.expect("run succeeds");
+
+    mofa_testing::assert_run_recorded_tool_call!(result, "echo_tool");
+    runner.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn assert_runner_total_executions_passes_on_first_run() {
+    let mut runner = AgentTestRunner::new().await.expect("runner initializes");
+    runner.mock_llm().add_response("counted").await;
+
+    let result = runner.run_text("hello").await.expect("run succeeds");
+
+    mofa_testing::assert_runner_total_executions!(result, 1);
+    runner.shutdown().await.expect("shutdown succeeds");
+}
diff --git a/tests/tests/dsl_tests.rs b/tests/tests/dsl_tests.rs
new file mode 100644
index 000000000..00fdb1a5f
--- /dev/null
+++ b/tests/tests/dsl_tests.rs
@@ -0,0 +1,92 @@
+//! Integration tests for the minimal TOML DSL adapter.
+
+use mofa_testing::{configure_runner_from_test_case, run_test_case, AgentTestRunner, TestCaseDsl};
+
+#[tokio::test]
+async fn toml_dsl_runs_through_agent_runner() {
+    // Load the example DSL from the crate so the test exercises parsing and
+    // adapter execution together.
+    let case = TestCaseDsl::from_toml_file(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/examples/simple_agent.toml"
+    ))
+    .expect("DSL example should parse");
+    assert_eq!(case.name, "simple_agent_run");
+
+    // Execute the parsed case through `run_test_case` and check its outcome.
+    let result = run_test_case(&case)
+        .await
+        .expect("DSL case should run successfully");
+
+    assert!(result.is_success());
+    assert_eq!(result.output_text().as_deref(), Some("hello from DSL"));
+}
+
+#[tokio::test]
+async fn toml_dsl_supports_bootstrap_files() {
+    let case = TestCaseDsl::from_toml_file(concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/examples/bootstrap_agent.toml"
+    ))
+    .expect("bootstrap DSL example should parse");
+
+    // Configure a locally owned runner so its mock LLM can be inspected afterwards.
+    let mut harness = AgentTestRunner::new()
+        .await
+        .expect("runner should initialize");
+    configure_runner_from_test_case(&case, &mut harness)
+        .await
+        .expect("DSL bootstrap config should apply");
+
+    let prompt = case.prompt.as_deref().expect("prompt should be present");
+    let _ = harness
+        .run_text(prompt)
+        .await
+        .expect("bootstrap run should succeed");
+
+    // The first message of the last captured request is read as the system message.
+    let captured = harness
+        .mock_llm()
+        .last_request()
+        .await
+        .expect("request should be captured");
+    let first_message = captured.messages.first();
+    let system_message = first_message
+        .and_then(|message| message.content.as_deref())
+        .expect("system message content");
+    assert!(system_message.contains("AGENTS.md"));
+    assert!(system_message.contains("Bootstrapped instructions for the DSL test."));
+
+    harness.shutdown().await.expect("shutdown succeeds");
+}
+
+#[tokio::test]
+async fn toml_dsl_supports_tool_backed_runs() {
+    let dsl_path = concat!(env!("CARGO_MANIFEST_DIR"), "/examples/tool_agent.toml");
+    let case = TestCaseDsl::from_toml_file(dsl_path)
+        .expect("tool DSL example should parse");
+
+    let result = run_test_case(&case)
+        .await
+        .expect("tool-backed DSL case should run successfully");
+
+    // The run must succeed with the expected text and exactly one `echo_tool` call.
+    assert!(result.is_success());
+    assert_eq!(result.output_text().as_deref(), Some("Tool execution complete"));
+    let tool_calls = &result.metadata.tool_calls;
+    assert_eq!(tool_calls.len(), 1);
+    assert_eq!(tool_calls[0].tool_name, "echo_tool");
+
+    // The first message of the last LLM request is read as the system message.
+    let captured = result
+        .metadata
+        .llm_last_request
+        .as_ref()
+        .expect("request should be captured");
+    let first_message = captured.messages.first();
+    let system_message = first_message
+        .and_then(|message| message.content.as_deref())
+        .expect("system message content");
+    assert!(system_message.contains("ToolAgent"));
+    assert!(system_message.contains("Agent used to validate tool-aware DSL execution."));
+}