diff --git a/Cargo.toml b/Cargo.toml index f71801118..aca372a14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,6 +86,9 @@ lazy_static = "1.4" # Actor framework for ReAct agents ractor = "0" +# TOML deserialization (also used transitively by config) +toml = "0.8" + # Configuration file support (multi-format) config = { version = "0.14", features = [ "toml", diff --git a/crates/mofa-cli/Cargo.toml b/crates/mofa-cli/Cargo.toml index 875ae62a6..fa3120073 100644 --- a/crates/mofa-cli/Cargo.toml +++ b/crates/mofa-cli/Cargo.toml @@ -25,6 +25,7 @@ mofa-kernel = { path = "../mofa-kernel", version = "0.1", features = [ ] } mofa-runtime = { path = "../mofa-runtime", version = "0.1" } mofa-foundation = { path = "../mofa-foundation", version = "0.1" } +mofa-testing = { path = "../../tests", version = "0.1" } config.workspace = true tokio = { workspace = true } thiserror = { workspace = true } @@ -90,6 +91,7 @@ tokio-stream = "0.1" assert_cmd = "2" predicates = "3" tempfile = "3" +axum = { workspace = true } [features] default = [] diff --git a/crates/mofa-cli/src/cli.rs b/crates/mofa-cli/src/cli.rs index 7e10ad479..d55a67ec9 100644 --- a/crates/mofa-cli/src/cli.rs +++ b/crates/mofa-cli/src/cli.rs @@ -81,6 +81,40 @@ pub enum Commands { dora: bool, }, + /// Run a testing DSL case file + TestDsl { + /// TOML DSL file to execute + file: PathBuf, + + /// Optional canonical artifact file path + #[arg(long)] + artifact_out: Option, + + /// Optional report file path + #[arg(long)] + report_out: Option, + + /// Compare the current artifact against a saved baseline artifact + #[arg(long)] + baseline_in: Option, + + /// Write the current artifact to a baseline file + #[arg(long)] + baseline_out: Option, + + /// Write machine-readable comparison output (requires --baseline-in) + #[arg(long)] + comparison_out: Option, + + /// Exit non-zero when baseline comparison mismatches + #[arg(long)] + fail_on_diff: bool, + + /// Report file format + #[arg(long, value_enum, default_value_t = 
TestDslReportFormat::Json)] + report_format: TestDslReportFormat, + }, + /// Run a dora dataflow #[cfg(feature = "dora")] Dataflow { @@ -219,6 +253,12 @@ pub enum DatabaseType { Sqlite, } +#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq)] +pub enum TestDslReportFormat { + Json, + Text, +} + impl std::fmt::Display for DatabaseType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -725,6 +765,77 @@ mod tests { assert!(parsed.is_ok(), "doctor ci strict json should parse"); } + #[test] + fn test_test_dsl_parses() { + let parsed = Cli::try_parse_from(["mofa", "test-dsl", "tests/examples/simple_agent.toml"]); + assert!(parsed.is_ok(), "test-dsl command should parse"); + } + + #[test] + fn test_test_dsl_report_flags_parse() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--report-out", + "/tmp/report.json", + "--report-format", + "json", + ]); + assert!(parsed.is_ok(), "test-dsl report flags should parse"); + } + + #[test] + fn test_test_dsl_artifact_flag_parses() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--artifact-out", + "/tmp/artifact.json", + ]); + assert!(parsed.is_ok(), "test-dsl artifact flag should parse"); + } + + #[test] + fn test_test_dsl_baseline_flags_parse() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--baseline-in", + "/tmp/baseline.json", + "--baseline-out", + "/tmp/new-baseline.json", + ]); + assert!(parsed.is_ok(), "test-dsl baseline flags should parse"); + } + + #[test] + fn test_test_dsl_comparison_flag_parse() { + let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--baseline-in", + "/tmp/baseline.json", + "--comparison-out", + "/tmp/comparison.json", + ]); + assert!(parsed.is_ok(), "test-dsl comparison flag should parse"); + } + + #[test] + fn test_test_dsl_fail_on_diff_flag_parse() { 
+ let parsed = Cli::try_parse_from([ + "mofa", + "test-dsl", + "tests/examples/simple_agent.toml", + "--fail-on-diff", + ]); + assert!(parsed.is_ok(), "test-dsl fail-on-diff flag should parse"); + } + #[test] fn test_rag_index_parses() { let parsed = Cli::try_parse_from([ diff --git a/crates/mofa-cli/src/commands/mod.rs b/crates/mofa-cli/src/commands/mod.rs index 0fb02a9d3..1b798f63f 100644 --- a/crates/mofa-cli/src/commands/mod.rs +++ b/crates/mofa-cli/src/commands/mod.rs @@ -11,5 +11,6 @@ pub mod new; pub mod plugin; pub mod rag; pub mod run; +pub mod test_dsl; pub mod session; pub mod tool; diff --git a/crates/mofa-cli/src/commands/test_dsl.rs b/crates/mofa-cli/src/commands/test_dsl.rs new file mode 100644 index 000000000..a3222595f --- /dev/null +++ b/crates/mofa-cli/src/commands/test_dsl.rs @@ -0,0 +1,213 @@ +//! `mofa test-dsl` command implementation + +use crate::CliError; +use crate::cli::TestDslReportFormat; +use crate::output::OutputFormat; +use mofa_testing::{ + AgentRunArtifact, AgentRunArtifactComparison, DslError, JsonFormatter, + ReportFormatter, + TestCaseResult, TestReport, TestStatus, TextFormatter, TestCaseDsl, + assertion_error_from_outcomes, collect_assertion_outcomes, execute_test_case, +}; +use serde::Serialize; +use serde_json::json; +use std::path::Path; + +#[derive(Debug, Serialize)] +struct TestDslSummary { + name: String, + success: bool, + output_text: Option, + duration_ms: u128, + tool_calls: Vec, + workspace_root: String, + baseline_matches: Option, +} + +/// Execute one TOML DSL test case through the testing runner. 
+pub async fn run( + path: &Path, + format: OutputFormat, + artifact_out: Option<&Path>, + report_out: Option<&Path>, + baseline_in: Option<&Path>, + baseline_out: Option<&Path>, + comparison_out: Option<&Path>, + fail_on_diff: bool, + report_format: TestDslReportFormat, +) -> Result<(), CliError> { + let case = TestCaseDsl::from_toml_file(path).map_err(map_dsl_error)?; + let result = execute_test_case(&case).await.map_err(map_dsl_error)?; + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions.clone()); + let report = build_report(&artifact); + let baseline = if let Some(baseline_in) = baseline_in { + Some(read_artifact(baseline_in)?) + } else { + None + }; + let baseline_diff = baseline.as_ref().map(|baseline| artifact.compare_to(baseline)); + + if let Some(artifact_out) = artifact_out { + write_artifact(artifact_out, &artifact)?; + } + + if let Some(baseline_out) = baseline_out { + write_artifact(baseline_out, &artifact)?; + } + + if let Some(comparison_out) = comparison_out { + let baseline = baseline.as_ref().ok_or_else(|| { + CliError::Other("comparison output requires --baseline-in".to_string()) + })?; + let diff = baseline_diff.as_ref().ok_or_else(|| { + CliError::Other("comparison output requires --baseline-in".to_string()) + })?; + let comparison = AgentRunArtifactComparison::from_artifacts( + &artifact, + baseline, + diff.clone(), + ); + write_comparison(comparison_out, &comparison)?; + } + + if let Some(report_out) = report_out { + write_report(report_out, report_format, &report)?; + } + + let summary = TestDslSummary { + name: case.name, + success: result.is_success(), + output_text: result.output_text(), + duration_ms: result.duration.as_millis(), + tool_calls: result + .metadata + .tool_calls + .iter() + .map(|record| record.tool_name.clone()) + .collect(), + workspace_root: result.metadata.workspace_root.display().to_string(), + baseline_matches: 
baseline_diff.as_ref().map(|diff| diff.matches), + }; + + match format { + OutputFormat::Json => { + let output = json!({ + "success": true, + "case": summary, + "baseline": baseline_diff, + }); + println!("{}", serde_json::to_string_pretty(&output)?); + } + _ => { + println!("case: {}", summary.name); + println!("status: {}", if summary.success { "passed" } else { "failed" }); + if let Some(output_text) = &summary.output_text { + println!("output: {}", output_text); + } + if !summary.tool_calls.is_empty() { + println!("tool_calls: {}", summary.tool_calls.join(", ")); + } + println!("duration_ms: {}", summary.duration_ms); + if let Some(diff) = &baseline_diff { + println!("baseline: {}", if diff.matches { "matched" } else { "mismatch" }); + for difference in &diff.differences { + println!("difference: {}", difference.field); + } + } + } + } + + if fail_on_diff { + if let Some(diff) = &baseline_diff { + if !diff.matches { + return Err(CliError::Other("baseline comparison mismatch".to_string())); + } + } + } + + if let Some(error) = assertion_error_from_outcomes(&assertions) { + return Err(map_dsl_error(error)); + } + + Ok(()) +} + +fn build_report(artifact: &AgentRunArtifact) -> TestReport { + let status = if artifact.status == "passed" { + TestStatus::Passed + } else { + TestStatus::Failed + }; + let error = artifact + .runner_error + .clone() + .or_else(|| { + artifact + .assertions + .iter() + .find(|item| !item.passed) + .map(|item| format!("assertion failed: {}", item.kind)) + }); + let metadata = vec![ + ( + "execution_id".to_string(), + artifact.execution_id.clone(), + ), + ( + "workspace_root".to_string(), + artifact.workspace_root.clone(), + ), + ( + "tool_calls".to_string(), + artifact.tool_calls.len().to_string(), + ), + ]; + + TestReport { + suite_name: "dsl".to_string(), + results: vec![TestCaseResult { + name: artifact.case_name.clone(), + status, + duration: std::time::Duration::from_millis(artifact.duration_ms), + error, + metadata, + }], + 
total_duration: std::time::Duration::from_millis(artifact.duration_ms), + timestamp: artifact.started_at_ms, + } +} + +fn write_artifact(path: &Path, artifact: &AgentRunArtifact) -> Result<(), CliError> { + let body = serde_json::to_string_pretty(artifact)?; + std::fs::write(path, body)?; + Ok(()) +} + +fn write_comparison( + path: &Path, + comparison: &AgentRunArtifactComparison, +) -> Result<(), CliError> { + // Emit machine readable baseline comparison output. + let body = serde_json::to_string_pretty(comparison)?; + std::fs::write(path, body)?; + Ok(()) +} + +fn read_artifact(path: &Path) -> Result { + let body = std::fs::read_to_string(path)?; + Ok(serde_json::from_str(&body)?) +} + +fn write_report(path: &Path, format: TestDslReportFormat, report: &TestReport) -> Result<(), CliError> { + let body = match format { + TestDslReportFormat::Json => JsonFormatter.format(report), + TestDslReportFormat::Text => TextFormatter.format(report), + }; + std::fs::write(path, body)?; + Ok(()) +} + +fn map_dsl_error(error: DslError) -> CliError { + CliError::Other(format!("DSL test failed: {error}")) +} diff --git a/crates/mofa-cli/src/main.rs b/crates/mofa-cli/src/main.rs index 749fe56f7..1ef811cb2 100644 --- a/crates/mofa-cli/src/main.rs +++ b/crates/mofa-cli/src/main.rs @@ -75,6 +75,7 @@ fn main() { async fn run_command(cli: Cli) -> CliResult<()> { use cli::Commands; + let output_format = cli.output_format.unwrap_or_default(); // Initialize context for commands that need backend services let needs_context = matches!( @@ -121,6 +122,32 @@ async fn run_command(cli: Cli) -> CliResult<()> { commands::run::run(&config, dora)?; } + Some(Commands::TestDsl { + file, + artifact_out, + report_out, + baseline_in, + baseline_out, + comparison_out, + fail_on_diff, + report_format, + }) => { + commands::test_dsl::run( + &file, + output_format, + artifact_out.as_deref(), + report_out.as_deref(), + baseline_in.as_deref(), + baseline_out.as_deref(), + comparison_out.as_deref(), + 
fail_on_diff, + report_format, + ) + .await + .into_report() + .attach_with(|| format!("running DSL test case from {}", file.display()))?; + } + #[cfg(feature = "dora")] Some(Commands::Dataflow { file, uv }) => { commands::run::run_dataflow(&file, uv)?; diff --git a/crates/mofa-cli/tests/test_dsl_integration_tests.rs b/crates/mofa-cli/tests/test_dsl_integration_tests.rs new file mode 100644 index 000000000..7a7f02483 --- /dev/null +++ b/crates/mofa-cli/tests/test_dsl_integration_tests.rs @@ -0,0 +1,357 @@ +//! Integration tests for `mofa test-dsl`. + +use axum::{Json, Router, routing::post}; +use assert_cmd::Command; +use predicates::prelude::*; +use serde_json::json; +use tempfile::tempdir; + +#[test] +fn test_dsl_command_runs_example_case() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["test-dsl", case_path]) + .assert() + .success() + .stdout(predicate::str::contains("status: passed")) + .stdout(predicate::str::contains("output: hello from DSL")); +} + +#[test] +fn test_dsl_command_emits_json() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["--output-format", "json", "test-dsl", case_path]) + .assert() + .success() + .stdout(predicate::str::contains("\"success\": true")) + .stdout(predicate::str::contains("\"tool_calls\"")) + .stdout(predicate::str::contains("\"echo_tool\"")); +} + +#[test] +fn test_dsl_command_runs_tape_backed_case() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent_tape.toml" + ); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["test-dsl", case_path]) + .assert() + .success() + .stdout(predicate::str::contains("status: passed")) + .stdout(predicate::str::contains("output: hello from tape")); +} + +#[test] +fn 
test_dsl_command_writes_json_report_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let report_path = temp.path().join("dsl-report.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--report-out", + report_path.to_str().expect("utf8 report path"), + "--report-format", + "json", + ]) + .assert() + .success(); + + let report = std::fs::read_to_string(&report_path).expect("report file exists"); + assert!(report.contains("\"suite\": \"dsl\"")); + assert!(report.contains("\"name\": \"simple_agent_run\"")); + assert!(report.contains("\"status\": \"passed\"")); +} + +#[test] +fn test_dsl_command_writes_text_report_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let report_path = temp.path().join("dsl-report.txt"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--report-out", + report_path.to_str().expect("utf8 report path"), + "--report-format", + "text", + ]) + .assert() + .success(); + + let report = std::fs::read_to_string(&report_path).expect("report file exists"); + assert!(report.contains("=== dsl ===")); + assert!(report.contains("tool_agent_run")); + assert!(report.contains("[+]")); +} + +#[test] +fn test_dsl_command_writes_canonical_artifact_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let artifact_path = temp.path().join("dsl-artifact.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--artifact-out", + artifact_path.to_str().expect("utf8 artifact path"), + ]) + .assert() + .success(); + + let artifact = std::fs::read_to_string(&artifact_path).expect("artifact file exists"); + 
assert!(artifact.contains("\"case_name\": \"tool_agent_run\"")); + assert!(artifact.contains("\"status\": \"passed\"")); + assert!(artifact.contains("\"assertions\"")); + assert!(artifact.contains("\"tool_calls\"")); +} + +#[test] +fn test_dsl_command_writes_baseline_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-out", + baseline_path.to_str().expect("utf8 baseline path"), + ]) + .assert() + .success(); + + let baseline = std::fs::read_to_string(&baseline_path).expect("baseline file exists"); + assert!(baseline.contains("\"case_name\": \"simple_agent_run\"")); + assert!(baseline.contains("\"status\": \"passed\"")); +} + +#[test] +fn test_dsl_command_reports_baseline_mismatch() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + + std::fs::write( + &baseline_path, + r#"{ + "case_name": "tool_agent_run", + "status": "passed", + "output_text": "Baseline output", + "runner_error": null, + "duration_ms": 0, + "started_at_ms": 0, + "execution_id": "baseline-exec", + "session_id": "baseline-session", + "workspace_root": "/tmp/baseline", + "agent": { "id": "baseline-agent", "name": "baseline" }, + "assertions": [{ "kind": "contains", "expected": "Baseline output", "actual": "Baseline output", "passed": true }], + "tool_calls": [], + "llm_request": null, + "llm_response": null, + "session_snapshot": null, + "workspace_before": { "files": [] }, + "workspace_after": { "files": [] } +}"#, + ) + .expect("baseline fixture written"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-in", + 
baseline_path.to_str().expect("utf8 baseline path"), + ]) + .assert() + .success() + .stdout(predicate::str::contains("baseline: mismatch")) + .stdout(predicate::str::contains("difference: output_text")); +} + +#[test] +fn test_dsl_command_writes_comparison_file() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/simple_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + let comparison_path = temp.path().join("dsl-comparison.json"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-out", + baseline_path.to_str().expect("utf8 baseline path"), + ]) + .assert() + .success(); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-in", + baseline_path.to_str().expect("utf8 baseline path"), + "--comparison-out", + comparison_path.to_str().expect("utf8 comparison path"), + ]) + .assert() + .success(); + + let comparison = std::fs::read_to_string(&comparison_path).expect("comparison file exists"); + assert!(comparison.contains("\"case_name\": \"simple_agent_run\"")); + assert!(comparison.contains("\"matches\": true")); +} + +#[test] +fn test_dsl_command_fails_on_baseline_mismatch_when_flag_set() { + let case_path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../tests/examples/tool_agent.toml" + ); + let temp = tempdir().expect("temp dir"); + let baseline_path = temp.path().join("dsl-baseline.json"); + + std::fs::write( + &baseline_path, + r#"{ + "case_name": "tool_agent_run", + "status": "passed", + "output_text": "Baseline output", + "runner_error": null, + "duration_ms": 0, + "started_at_ms": 0, + "execution_id": "baseline-exec", + "session_id": "baseline-session", + "workspace_root": "/tmp/baseline", + "agent": { "id": "baseline-agent", "name": "baseline" }, + "assertions": [{ "kind": "contains", "expected": "Baseline output", "actual": "Baseline output", 
"passed": true }], + "tool_calls": [], + "llm_request": null, + "llm_response": null, + "session_snapshot": null, + "workspace_before": { "files": [] }, + "workspace_after": { "files": [] } +}"#, + ) + .expect("baseline fixture written"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args([ + "test-dsl", + case_path, + "--baseline-in", + baseline_path.to_str().expect("utf8 baseline path"), + "--fail-on-diff", + ]) + .assert() + .failure(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_dsl_command_records_tape_from_live_provider() { + fn toml_string(path: &std::path::Path) -> String { + path.display().to_string().replace('\\', "\\\\") + } + + async fn completions(Json(_request): Json) -> Json { + Json(json!({ + "choices": [ + { + "message": { + "role": "assistant", + "content": "hello from live provider" + } + } + ], + "usage": { + "prompt_tokens": 3, + "completion_tokens": 4, + "total_tokens": 7 + } + })) + } + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0") + .await + .expect("bind mock server"); + let address = listener.local_addr().expect("local addr"); + let server = tokio::spawn(async move { + axum::serve(listener, Router::new().route("/v1/chat/completions", post(completions))) + .await + .expect("mock server should run"); + }); + + let temp = tempdir().expect("temp dir"); + let tape_path = temp.path().join("recorded.tape.json"); + let case_path = temp.path().join("record_case.toml"); + std::fs::write( + &case_path, + format!( + "name = \"record_case\"\nprompt = \"hello\"\n\n[llm]\nrecord_tape = \"{}\"\n\n[llm.provider]\nkind = \"open_ai_compatible\"\nbase_url = \"http://{}/v1\"\nmodel = \"mock-model\"\n", + toml_string(&tape_path), + address + ), + ) + .expect("record case written"); + + Command::cargo_bin("mofa") + .expect("mofa bin") + .args(["test-dsl", case_path.to_str().expect("utf8 case path")]) + .assert() + .success() + .stdout(predicate::str::contains("output: hello from live provider")); + + let tape = 
std::fs::read_to_string(&tape_path).expect("tape file exists"); + assert!(tape.contains("\"case_name\": \"record_case\"")); + assert!(tape.contains("\"response\": \"hello from live provider\"")); + + server.abort(); +} diff --git a/crates/mofa-foundation/src/agent/context/prompt.rs b/crates/mofa-foundation/src/agent/context/prompt.rs index 04ec30bd6..87cc8a127 100644 --- a/crates/mofa-foundation/src/agent/context/prompt.rs +++ b/crates/mofa-foundation/src/agent/context/prompt.rs @@ -142,6 +142,17 @@ impl PromptContext { self } + /// Replace the agent identity. + pub fn set_identity(&mut self, identity: AgentIdentity) { + self.agent_name = identity.name.clone(); + self.identity = identity; + } + + /// Replace the bootstrap file list. + pub fn set_bootstrap_files(&mut self, files: Vec) { + self.bootstrap_files = files; + } + /// Set skills that should always be loaded pub fn with_always_load(mut self, skills: Vec) -> Self { self.always_load = skills; diff --git a/crates/mofa-foundation/src/agent/executor.rs b/crates/mofa-foundation/src/agent/executor.rs index 605ab8fe0..15dca5065 100644 --- a/crates/mofa-foundation/src/agent/executor.rs +++ b/crates/mofa-foundation/src/agent/executor.rs @@ -548,6 +548,15 @@ impl AgentExecutor { &self.config } + /// Update the prompt context (system prompt builder). 
+ pub async fn update_prompt_context(&self, updater: F) + where + F: FnOnce(&mut PromptContext), + { + let mut ctx = self.context.write().await; + updater(&mut ctx); + } + /// Get mutable reference to base agent pub fn base_mut(&mut self) -> &mut BaseAgent { &mut self.base @@ -586,7 +595,9 @@ impl MoFAAgent for AgentExecutor { self.base.initialize(ctx).await?; // Additional executor-specific initialization - self.base.transition_to(AgentState::Ready)?; + if self.base.state() != AgentState::Ready { + self.base.transition_to(AgentState::Ready)?; + } Ok(()) } @@ -643,7 +654,10 @@ mod tests { "mock" } - async fn chat(&self, _request: ChatCompletionRequest) -> AgentResult { + async fn chat( + &self, + _request: ChatCompletionRequest, + ) -> AgentResult { Ok(ChatCompletionResponse { content: Some("ok".to_string()), tool_calls: Some(Vec::::new()), diff --git a/crates/mofa-runtime/src/runner.rs b/crates/mofa-runtime/src/runner.rs index 91ca5aaff..09a856636 100644 --- a/crates/mofa-runtime/src/runner.rs +++ b/crates/mofa-runtime/src/runner.rs @@ -349,6 +349,11 @@ impl AgentRunner { &self.context } + /// Update session ID in the execution context. 
+ pub fn set_session_id(&mut self, session_id: Option) { + self.context.session_id = session_id; + } + /// 获取运行器状态 /// Get runner state pub async fn state(&self) -> RunnerState { diff --git a/examples/Cargo.toml b/examples/Cargo.toml index f952ce045..3da476607 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -1,6 +1,9 @@ [workspace] resolver = "3" members = [ + "agent_runner_basic", + "agent_runner_custom_session", + "agent_runner_tools", "cli_production_smoke", "cli_agent_logs_demo", "cli_plugin_lifecycle", diff --git a/examples/agent_runner_basic/Cargo.toml b/examples/agent_runner_basic/Cargo.toml new file mode 100644 index 000000000..e49ef8dea --- /dev/null +++ b/examples/agent_runner_basic/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "agent_runner_basic" +version.workspace = true +edition.workspace = true + +[dependencies] +anyhow.workspace = true +mofa-testing = { path = "../../tests" } +tokio.workspace = true diff --git a/examples/agent_runner_basic/src/main.rs b/examples/agent_runner_basic/src/main.rs new file mode 100644 index 000000000..f49a93ee7 --- /dev/null +++ b/examples/agent_runner_basic/src/main.rs @@ -0,0 +1,29 @@ +use anyhow::Result; +use mofa_testing::AgentTestRunner; + +#[tokio::main] +async fn main() -> Result<()> { + let mut runner = AgentTestRunner::new().await?; + runner.mock_llm().add_response("Hello from the runner").await; + + let result = runner.run_text("hi").await?; + println!("Output: {}", result.output_text().unwrap_or_default()); + println!( + "Session: {}", + result + .metadata + .session_id + .as_deref() + .unwrap_or("") + ); + println!("Workspace: {}", result.metadata.workspace_root.display()); + println!( + "Runner stats: total={} success={} failed={}", + result.metadata.runner_stats_after.total_executions, + result.metadata.runner_stats_after.successful_executions, + result.metadata.runner_stats_after.failed_executions + ); + + runner.shutdown().await?; + Ok(()) +} diff --git 
a/examples/agent_runner_custom_session/Cargo.toml b/examples/agent_runner_custom_session/Cargo.toml new file mode 100644 index 000000000..52620ec07 --- /dev/null +++ b/examples/agent_runner_custom_session/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "agent_runner_custom_session" +version.workspace = true +edition.workspace = true + +[dependencies] +anyhow.workspace = true +mofa-testing = { path = "../../tests" } +mofa-foundation = { path = "../../crates/mofa-foundation" } +tokio.workspace = true diff --git a/examples/agent_runner_custom_session/src/main.rs b/examples/agent_runner_custom_session/src/main.rs new file mode 100644 index 000000000..fc03881b8 --- /dev/null +++ b/examples/agent_runner_custom_session/src/main.rs @@ -0,0 +1,42 @@ +use anyhow::Result; +use mofa_foundation::agent::context::prompt::AgentIdentity; +use mofa_testing::AgentTestRunner; + +#[tokio::main] +async fn main() -> Result<()> { + let mut runner = AgentTestRunner::new().await?; + + runner.write_bootstrap_file("CUSTOM.md", "Custom bootstrap content.")?; + runner + .configure_prompt( + Some(AgentIdentity { + name: "RunnerDemo".to_string(), + description: "Custom identity for example runs".to_string(), + icon: None, + }), + Some(vec!["CUSTOM.md".to_string()]), + ) + .await; + + runner + .mock_llm() + .add_response("Custom session response") + .await; + + let result = runner + .run_text_with_session("demo-session", "hello session") + .await?; + + println!( + "Session id: {}", + result + .metadata + .session_id + .as_deref() + .unwrap_or("") + ); + println!("Output: {}", result.output_text().unwrap_or_default()); + + runner.shutdown().await?; + Ok(()) +} diff --git a/examples/agent_runner_tools/Cargo.toml b/examples/agent_runner_tools/Cargo.toml new file mode 100644 index 000000000..511578074 --- /dev/null +++ b/examples/agent_runner_tools/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "agent_runner_tools" +version.workspace = true +edition.workspace = true + +[dependencies] +anyhow.workspace 
= true +mofa-testing = { path = "../../tests" } +serde_json.workspace = true +tokio.workspace = true diff --git a/examples/agent_runner_tools/src/main.rs b/examples/agent_runner_tools/src/main.rs new file mode 100644 index 000000000..7399abf8b --- /dev/null +++ b/examples/agent_runner_tools/src/main.rs @@ -0,0 +1,47 @@ +use anyhow::Result; +use mofa_testing::{AgentTestRunner, MockTool}; +use serde_json::json; + +#[tokio::main] +async fn main() -> Result<()> { + let mut runner = AgentTestRunner::new().await?; + + let tool = MockTool::new( + "echo_tool", + "Echo the provided input", + json!({ + "type": "object", + "properties": { + "input": { "type": "string" } + }, + "required": ["input"] + }), + ); + + runner.register_mock_tool(tool).await?; + + runner + .mock_llm() + .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None) + .await; + runner + .mock_llm() + .add_response("Tool response completed") + .await; + + let result = runner.run_text("use the tool").await?; + println!("Output: {}", result.output_text().unwrap_or_default()); + + for record in &result.metadata.tool_calls { + println!( + "Tool call: name={} input={} output={} duration_ms={:?}", + record.tool_name, + record.input, + record.output.as_ref().unwrap_or(&serde_json::Value::Null), + record.duration_ms + ); + } + + runner.shutdown().await?; + Ok(()) +} diff --git a/tests/Cargo.toml b/tests/Cargo.toml index a0597654f..72f369eb3 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -9,10 +9,19 @@ description = "Testing utilities for the MoFA agent framework" [dependencies] mofa-kernel = { path = "../crates/mofa-kernel" } mofa-foundation = { path = "../crates/mofa-foundation" } +mofa-runtime = { path = "../crates/mofa-runtime" } tokio = { workspace = true } async-trait = { workspace = true } anyhow = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } chrono = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } regex = { 
workspace = true } +toml = { workspace = true } +tracing = { workspace = true } +reqwest = { workspace = true, features = ["json", "rustls-tls"] } + +[dev-dependencies] +axum = { workspace = true } diff --git a/tests/examples/bootstrap_agent.toml b/tests/examples/bootstrap_agent.toml new file mode 100644 index 000000000..9dd75291c --- /dev/null +++ b/tests/examples/bootstrap_agent.toml @@ -0,0 +1,13 @@ +name = "bootstrap_agent_run" +prompt = "What file was loaded?" +expected_text = "Bootstrapped" + +[[bootstrap_files]] +path = "AGENTS.md" +content = "Bootstrapped instructions for the DSL test." + +[llm] +responses = ["Bootstrapped response"] + +[assert] +contains = "Bootstrapped" \ No newline at end of file diff --git a/tests/examples/record_case_gpt_oss_120b.toml b/tests/examples/record_case_gpt_oss_120b.toml new file mode 100644 index 000000000..cb313bc97 --- /dev/null +++ b/tests/examples/record_case_gpt_oss_120b.toml @@ -0,0 +1,11 @@ +name = "simple_agent_record" +prompt = "hello" + +[llm] +record_tape = "../fixtures/simple_agent_recorded.tape.json" + +[llm.provider] +kind = "open_ai_compatible" +base_url = "https://api.groq.com/openai/v1" +model = "openai/gpt-oss-120b" +api_key_env = "GROQ_API_KEY" diff --git a/tests/examples/simple_agent.toml b/tests/examples/simple_agent.toml new file mode 100644 index 000000000..dc673d88a --- /dev/null +++ b/tests/examples/simple_agent.toml @@ -0,0 +1,9 @@ +name = "simple_agent_run" +prompt = "Say hello" +expected_text = "hello" + +[llm] +responses = ["hello from DSL"] + +[assert] +contains = "hello" diff --git a/tests/examples/simple_agent_llama_record.toml b/tests/examples/simple_agent_llama_record.toml new file mode 100644 index 000000000..3fedfbe8b --- /dev/null +++ b/tests/examples/simple_agent_llama_record.toml @@ -0,0 +1,11 @@ +name = "simple_agent_llama_record" +prompt = "hello" + +[llm] +record_tape = "../fixtures/simple_agent_recorded.tape.json" + +[llm.provider] +kind = "open_ai_compatible" +base_url = 
"https://api.groq.com/openai/v1" +model = "llama-3.3-70b-versatile" +api_key_env = "GROQ_API_KEY" diff --git a/tests/examples/simple_agent_tape.toml b/tests/examples/simple_agent_tape.toml new file mode 100644 index 000000000..02e65a793 --- /dev/null +++ b/tests/examples/simple_agent_tape.toml @@ -0,0 +1,8 @@ +name = "simple_agent_run" +prompt = "hello" + +[llm] +tape = "../fixtures/simple_agent.tape.json" + +[assert] +contains = "hello from tape" diff --git a/tests/examples/tool_agent.toml b/tests/examples/tool_agent.toml new file mode 100644 index 000000000..81be62550 --- /dev/null +++ b/tests/examples/tool_agent.toml @@ -0,0 +1,27 @@ +name = "tool_agent_run" +input = "Use the echo tool and summarize the result." + +[agent] +name = "ToolAgent" +description = "Agent used to validate tool-aware DSL execution." + +[[tools]] +name = "echo_tool" +description = "Echo the provided input." +schema = { type = "object", properties = { input = { type = "string" } }, required = ["input"] } +result = "echoed from tool" + +[assert] +contains = "Tool execution complete" +tool_called = "echo_tool" + +[llm] + +[[llm.steps]] +type = "tool_call" +tool = "echo_tool" +arguments = { input = "ping" } + +[[llm.steps]] +type = "text" +content = "Tool execution complete" diff --git a/tests/fixtures/simple_agent.tape.json b/tests/fixtures/simple_agent.tape.json new file mode 100644 index 000000000..a6a0e204a --- /dev/null +++ b/tests/fixtures/simple_agent.tape.json @@ -0,0 +1,7 @@ +{ + "version": 1, + "case_name": "simple_agent_run", + "interactions": [ + { "response": "hello from tape" } + ] +} diff --git a/tests/fixtures/simple_agent_recorded.tape.json b/tests/fixtures/simple_agent_recorded.tape.json new file mode 100644 index 000000000..cd203ee4e --- /dev/null +++ b/tests/fixtures/simple_agent_recorded.tape.json @@ -0,0 +1,35 @@ +{ + "version": 1, + "case_name": "simple_agent_record", + "interactions": [ + { + "request": { + "messages": [ + { + "role": "system", + "content": "# Agent A 
helpful AI assistant\n\nYou are Agent, A helpful AI assistant.\n\n## Current Time\n2026-04-04 14:22 (Saturday)\n\n## Workspace\nYour workspace is at: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb\n- Memory files: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/memory/MEMORY.md\n- Daily notes: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/memory/YYYY-MM-DD.md\n- Custom skills: /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/skills/{{skill-name}}/SKILL.md\n\nAlways be helpful, accurate, and concise. When using tools, explain what you're doing.\nWhen remembering something, write to /tmp/mofa-agent-test-019d58df-5e39-7920-a62f-c328e6970edb/memory/MEMORY.md", + "tool_call_id": null, + "tool_calls": [] + }, + { + "role": "user", + "content": "hello", + "tool_call_id": null, + "tool_calls": [] + } + ], + "model": null, + "temperature": null, + "max_tokens": null, + "tool_names": [] + }, + "response": "Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?", + "tool_calls": [], + "usage": { + "prompt_tokens": 284, + "completion_tokens": 25, + "total_tokens": 309 + } + } + ] +} \ No newline at end of file diff --git a/tests/src/agent_runner.rs b/tests/src/agent_runner.rs new file mode 100644 index 000000000..a59975264 --- /dev/null +++ b/tests/src/agent_runner.rs @@ -0,0 +1,685 @@ +//! Real agent runner harness for integration-style tests. +//! +//! Provides a lightweight wrapper around the MoFA runtime `AgentRunner` +//! with an isolated workspace and deterministic mock LLM. 
+ +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use mofa_foundation::agent::context::prompt::AgentIdentity; +use mofa_foundation::agent::executor::{AgentExecutor, AgentExecutorConfig}; +use mofa_kernel::agent::context::AgentContext; +use mofa_kernel::agent::core::MoFAAgent; +use mofa_kernel::agent::error::{AgentError, AgentResult}; +use mofa_foundation::agent::components::tool::as_tool; +use mofa_foundation::agent::components::tool::SimpleTool; +use mofa_foundation::agent::session::{JsonlSessionStorage, Session, SessionStorage}; +use crate::tools::MockTool; +use mofa_kernel::agent::types::{AgentInput, AgentOutput, ChatCompletionRequest}; +use mofa_kernel::agent::types::{ChatCompletionResponse, ToolCall}; +use mofa_kernel::agent::AgentCapabilities; +use mofa_kernel::agent::AgentState; +use mofa_runtime::runner::{AgentRunner, RunnerState, RunnerStats}; +use crate::replay::{RecordingLLMProvider, ReplayLLMProvider, Tape}; +use std::collections::VecDeque; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use thiserror::Error; +use tokio::sync::RwLock; +use uuid::Uuid; + +/// Errors returned by the agent runner harness itself. +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum AgentRunnerError { + #[error("failed to create test workspace: {0}")] + WorkspaceIo(#[from] std::io::Error), + + #[error("agent runner failure: {0}")] + Agent(#[from] AgentError), +} + +/// Metadata captured for each run. 
+#[derive(Debug, Clone)] +#[non_exhaustive] +pub struct AgentRunMetadata { + pub agent_id: String, + pub agent_name: String, + pub execution_id: String, + pub session_id: Option, + pub workspace_root: PathBuf, + pub runner_state_before: RunnerState, + pub runner_state_after: RunnerState, + pub runner_stats_before: RunnerStats, + pub runner_stats_after: RunnerStats, + pub agent_state_before: AgentState, + pub agent_state_after: AgentState, + pub started_at: DateTime, + pub session_snapshot: Option, + pub tool_calls: Vec, + pub llm_last_request: Option, + pub llm_last_response: Option, + pub workspace_snapshot_before: WorkspaceSnapshot, + pub workspace_snapshot_after: WorkspaceSnapshot, +} + +/// Result of a single agent run. +#[derive(Debug)] +#[non_exhaustive] +pub struct AgentRunResult { + pub output: Option, + pub error: Option, + pub duration: Duration, + pub metadata: AgentRunMetadata, +} + +/// Captures a tool call with its input and output. +#[derive(Debug, Clone)] +pub struct ToolCallRecord { + pub tool_name: String, + pub input: serde_json::Value, + pub output: Option, + pub success: bool, + pub duration_ms: Option, + pub timed_out: bool, +} + +/// Snapshot of files in the test workspace. +#[derive(Debug, Clone)] +pub struct WorkspaceSnapshot { + pub files: Vec, +} + +#[derive(Debug, Clone)] +pub struct WorkspaceFileSnapshot { + pub relative_path: String, + pub size_bytes: u64, + pub modified_ms: Option, + pub checksum: u64, +} + +impl AgentRunResult { + pub fn is_success(&self) -> bool { + self.error.is_none() + } + + pub fn output_text(&self) -> Option { + self.output.as_ref().map(AgentOutput::to_text) + } +} + +/// Simple deterministic LLM provider for tests. 
+#[derive(Debug)] +pub struct MockAgentLLMProvider { + name: String, + responses: RwLock>, + default_response: RwLock, + last_request: RwLock>, + last_response: RwLock>, +} + +#[derive(Debug, Clone)] +enum MockLlmResponse { + Text(String), + ToolCall { + content: Option, + tool_calls: Vec, + }, + Error(String), +} + +impl MockAgentLLMProvider { + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + responses: RwLock::new(VecDeque::new()), + default_response: RwLock::new("This is a mock response.".to_string()), + last_request: RwLock::new(None), + last_response: RwLock::new(None), + } + } + + pub async fn add_response(&self, response: impl Into) { + self.responses + .write() + .await + .push_back(MockLlmResponse::Text(response.into())); + } + + pub async fn add_tool_call_response( + &self, + tool_name: &str, + arguments: serde_json::Value, + content: Option, + ) { + let tool_call = ToolCall { + id: Uuid::now_v7().to_string(), + name: tool_name.to_string(), + arguments, + }; + self.responses.write().await.push_back(MockLlmResponse::ToolCall { + content, + tool_calls: vec![tool_call], + }); + } + + pub async fn add_error_response(&self, message: impl Into) { + self.responses + .write() + .await + .push_back(MockLlmResponse::Error(message.into())); + } + + pub async fn set_default_response(&self, response: impl Into) { + *self.default_response.write().await = response.into(); + } + + pub async fn pending_responses(&self) -> usize { + self.responses.read().await.len() + } + + pub async fn last_request(&self) -> Option { + self.last_request.read().await.clone() + } + + pub async fn last_response(&self) -> Option { + self.last_response.read().await.clone() + } +} + +#[async_trait] +impl mofa_kernel::agent::types::LLMProvider for MockAgentLLMProvider { + fn name(&self) -> &str { + &self.name + } + + async fn chat( + &self, + request: ChatCompletionRequest, + ) -> AgentResult { + *self.last_request.write().await = Some(request); + let response = { + let mut 
responses = self.responses.write().await; + if let Some(next) = responses.pop_front() { + next + } else { + MockLlmResponse::Text(self.default_response.read().await.clone()) + } + }; + + let response = match response { + MockLlmResponse::Text(content) => Ok(ChatCompletionResponse { + content: Some(content), + tool_calls: Some(Vec::::new()), + usage: None, + }), + MockLlmResponse::ToolCall { content, tool_calls } => Ok(ChatCompletionResponse { + content, + tool_calls: Some(tool_calls), + usage: None, + }), + MockLlmResponse::Error(message) => Err(AgentError::ExecutionFailed(message)), + }?; + + *self.last_response.write().await = Some(response.clone()); + Ok(response) + } +} + +struct SessionAwareExecutor { + executor: AgentExecutor, +} + +impl SessionAwareExecutor { + fn new(executor: AgentExecutor) -> Self { + Self { executor } + } + + async fn register_tool( + &self, + tool: Arc, + ) -> AgentResult<()> { + self.executor.register_tool(tool).await + } + + async fn update_prompt_context(&self, updater: F) + where + F: FnOnce(&mut mofa_foundation::agent::context::prompt::PromptContext), + { + self.executor.update_prompt_context(updater).await; + } +} + +#[async_trait] +impl MoFAAgent for SessionAwareExecutor { + fn id(&self) -> &str { + self.executor.id() + } + + fn name(&self) -> &str { + self.executor.name() + } + + fn capabilities(&self) -> &AgentCapabilities { + self.executor.capabilities() + } + + fn state(&self) -> mofa_kernel::agent::AgentState { + self.executor.state() + } + + async fn initialize(&mut self, ctx: &AgentContext) -> AgentResult<()> { + self.executor.initialize(ctx).await + } + + async fn execute( + &mut self, + input: AgentInput, + ctx: &AgentContext, + ) -> AgentResult { + let message = input.as_text().unwrap_or(""); + let session_key = ctx.session_id.as_deref().unwrap_or("default"); + let response = self.executor.process_message(session_key, message).await?; + Ok(AgentOutput::text(response)) + } + + async fn shutdown(&mut self) -> 
AgentResult<()> { + self.executor.shutdown().await + } +} + +struct TempWorkspace { + root: PathBuf, +} + +impl TempWorkspace { + fn new(prefix: &str) -> Result { + let root = std::env::temp_dir().join(format!("{}-{}", prefix, Uuid::now_v7())); + std::fs::create_dir_all(&root)?; + Ok(Self { root }) + } + + fn path(&self) -> &Path { + &self.root + } + + fn write_file(&self, relative_path: &Path, content: &str) -> Result { + let path = self.root.join(relative_path); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } + std::fs::write(&path, content)?; + Ok(path) + } + + fn snapshot(&self) -> WorkspaceSnapshot { + let mut files = Vec::new(); + collect_workspace_files(&self.root, &self.root, &mut files); + files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path)); + WorkspaceSnapshot { files } + } +} + +impl Drop for TempWorkspace { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.root); + } +} + +fn collect_workspace_files(root: &Path, current: &Path, files: &mut Vec) { + let entries = match std::fs::read_dir(current) { + Ok(entries) => entries, + Err(_) => return, + }; + + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + collect_workspace_files(root, &path, files); + continue; + } + + let metadata = match entry.metadata() { + Ok(metadata) => metadata, + Err(_) => continue, + }; + + let size_bytes = metadata.len(); + let modified_ms = metadata + .modified() + .ok() + .and_then(|time| time.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|duration| duration.as_millis() as u64); + + let bytes = match std::fs::read(&path) { + Ok(bytes) => bytes, + Err(_) => Vec::new(), + }; + let checksum = hash_bytes(&bytes); + let relative_path = path + .strip_prefix(root) + .unwrap_or(&path) + .to_string_lossy() + .replace('\\', "/"); + + files.push(WorkspaceFileSnapshot { + relative_path, + size_bytes, + modified_ms, + checksum, + }); + } +} + +fn hash_bytes(bytes: &[u8]) -> u64 { + let mut hasher = 
DefaultHasher::new(); + bytes.hash(&mut hasher); + hasher.finish() +} + +/// Test harness for running real agent execution paths. +pub struct AgentTestRunner { + workspace: TempWorkspace, + session_id: String, + execution_id: String, + llm: RunnerLlmHandle, + runner: AgentRunner, + mock_tools: Vec, +} + +enum RunnerLlmHandle { + Mock(Arc), + Recording(Arc), + Replay(Arc), +} + +impl AgentTestRunner { + pub async fn new() -> Result { + Self::with_config(AgentExecutorConfig::default()).await + } + + pub async fn with_config(config: AgentExecutorConfig) -> Result { + let workspace = TempWorkspace::new("mofa-agent-test")?; + let llm = Arc::new(MockAgentLLMProvider::new("mock-llm")); + let executor = AgentExecutor::with_config(llm.clone(), workspace.path(), config).await?; + Self::with_workspace_and_handle(workspace, RunnerLlmHandle::Mock(llm), executor).await + } + + // Build a runner backed by a live provider that records every interaction. + pub async fn with_recording_provider( + inner: Arc, + case_name: &str, + tape_path: impl Into, + config: AgentExecutorConfig, + ) -> Result { + let workspace = TempWorkspace::new("mofa-agent-test")?; + let llm = Arc::new(RecordingLLMProvider::new( + "recording-llm", + inner, + case_name, + tape_path, + )); + let executor = AgentExecutor::with_config(llm.clone(), workspace.path(), config).await?; + Self::with_workspace_and_handle(workspace, RunnerLlmHandle::Recording(llm), executor).await + } + + // Build a runner that serves responses from a pre-recorded tape. 
+ pub async fn with_replay_tape( + tape: Tape, + config: AgentExecutorConfig, + ) -> Result { + let workspace = TempWorkspace::new("mofa-agent-test")?; + let llm = Arc::new(ReplayLLMProvider::new("replay-llm", tape)); + let executor = AgentExecutor::with_config(llm.clone(), workspace.path(), config).await?; + Self::with_workspace_and_handle(workspace, RunnerLlmHandle::Replay(llm), executor).await + } + + // Final assembly step that wires up the workspace, LLM handle, and executor. + async fn with_workspace_and_handle( + workspace: TempWorkspace, + llm: RunnerLlmHandle, + executor: AgentExecutor, + ) -> Result { + let agent = SessionAwareExecutor::new(executor); + + let execution_id = Uuid::now_v7().to_string(); + let session_id = Uuid::now_v7().to_string(); + let context = AgentContext::with_session(&execution_id, &session_id); + + let runner = AgentRunner::with_context(agent, context).await?; + + Ok(Self { + workspace, + session_id, + execution_id, + llm, + runner, + mock_tools: Vec::new(), + }) + } + + pub fn workspace(&self) -> &Path { + self.workspace.path() + } + + pub fn session_id(&self) -> &str { + &self.session_id + } + + pub fn execution_id(&self) -> &str { + &self.execution_id + } + + pub fn mock_llm(&self) -> Arc { + match &self.llm { + RunnerLlmHandle::Mock(provider) => Arc::clone(provider), + RunnerLlmHandle::Recording(_) | RunnerLlmHandle::Replay(_) => { + panic!("mock_llm is only available for mock-backed runners") + } + } + } + + pub fn write_bootstrap_file( + &self, + filename: &str, + content: &str, + ) -> Result { + self.workspace.write_file(Path::new(filename), content) + } + + pub fn write_workspace_file( + &self, + relative_path: impl AsRef, + content: &str, + ) -> Result { + self.workspace.write_file(relative_path.as_ref(), content) + } + + pub async fn register_simple_tool(&self, tool: T) -> Result<(), AgentRunnerError> + where + T: mofa_foundation::agent::components::tool::SimpleTool + Send + Sync + 'static, + { + let tool_ref = 
as_tool(tool); + self.runner + .agent() + .register_tool(tool_ref) + .await + .map_err(AgentRunnerError::from) + } + + pub async fn register_mock_tool(&mut self, tool: MockTool) -> Result<(), AgentRunnerError> { + self.register_simple_tool(tool.clone()).await?; + self.mock_tools.push(tool); + Ok(()) + } + + pub async fn configure_prompt( + &self, + identity: Option, + bootstrap_files: Option>, + ) { + self.runner + .agent() + .update_prompt_context(|ctx| { + if let Some(identity) = identity { + ctx.set_identity(identity); + } + if let Some(files) = bootstrap_files { + ctx.set_bootstrap_files(files); + } + }) + .await; + } + + pub async fn run_text(&mut self, input: &str) -> Result { + self.run_input(AgentInput::text(input)).await + } + + pub async fn run_text_with_session( + &mut self, + session_id: &str, + input: &str, + ) -> Result { + let original_session = self.runner.context().session_id.clone(); + self.runner + .set_session_id(Some(session_id.to_string())); + let result = self.run_text(input).await; + self.runner.set_session_id(original_session); + result + } + + pub async fn run_texts( + &mut self, + inputs: &[&str], + ) -> Result, AgentRunnerError> { + let mut results = Vec::with_capacity(inputs.len()); + for input in inputs { + results.push(self.run_text(input).await?); + } + Ok(results) + } + + pub async fn run_input( + &mut self, + input: AgentInput, + ) -> Result { + let started_at = Utc::now(); + let runner_state_before = self.runner.state().await; + let runner_stats_before = self.runner.stats().await; + let agent_state_before = self.runner.agent_state(); + let workspace_snapshot_before = self.workspace.snapshot(); + let timer = Instant::now(); + let result = self.runner.execute(input).await; + let duration = timer.elapsed(); + let runner_state_after = self.runner.state().await; + let runner_stats_after = self.runner.stats().await; + let agent_state_after = self.runner.agent_state(); + let session_snapshot = self.load_session_snapshot().await; + let 
workspace_snapshot_after = self.workspace.snapshot(); + let tool_calls = self.collect_tool_calls().await; + let llm_last_request = self.last_request().await; + let llm_last_response = self.last_response().await; + + let (output, error) = match result { + Ok(output) => (Some(output), None), + Err(err) => (None, Some(err)), + }; + + let metadata = AgentRunMetadata { + agent_id: self.runner.agent().id().to_string(), + agent_name: self.runner.agent().name().to_string(), + execution_id: self.runner.context().execution_id.clone(), + session_id: self.runner.context().session_id.clone(), + workspace_root: self.workspace.path().to_path_buf(), + runner_state_before, + runner_state_after, + runner_stats_before, + runner_stats_after, + agent_state_before, + agent_state_after, + started_at, + session_snapshot, + tool_calls, + llm_last_request, + llm_last_response, + workspace_snapshot_before, + workspace_snapshot_after, + }; + + Ok(AgentRunResult { + output, + error, + duration, + metadata, + }) + } + + pub async fn shutdown(self) -> Result<(), AgentRunnerError> { + self.runner.shutdown().await?; + Ok(()) + } + + async fn load_session_snapshot(&self) -> Option { + let session_id = self.runner.context().session_id.as_deref()?; + let storage = JsonlSessionStorage::new(self.workspace.path()).await.ok()?; + storage.load(session_id).await.ok()? 
+ } + + async fn collect_tool_calls(&self) -> Vec { + let mut records = Vec::new(); + for tool in &self.mock_tools { + let calls = tool.history().await; + let results = tool.results().await; + for (idx, call) in calls.into_iter().enumerate() { + let result = results.get(idx).cloned(); + let (output, success, duration_ms, timed_out) = match result { + Some(result) => { + let duration_ms = result + .metadata + .get("duration_ms") + .and_then(|value| value.parse::().ok()); + let timed_out = result + .error + .as_ref() + .map(|err| err.contains("timed out")) + .unwrap_or(false); + ( + Some(result.output.clone()), + result.success, + duration_ms, + timed_out, + ) + } + None => (None, false, None, false), + }; + records.push(ToolCallRecord { + tool_name: tool.name().to_string(), + input: call.arguments, + output, + success, + duration_ms, + timed_out, + }); + } + } + records + } + + async fn last_request(&self) -> Option { + match &self.llm { + RunnerLlmHandle::Mock(provider) => provider.last_request().await, + RunnerLlmHandle::Recording(provider) => provider.last_request().await, + RunnerLlmHandle::Replay(provider) => provider.last_request().await, + } + } + + async fn last_response(&self) -> Option { + match &self.llm { + RunnerLlmHandle::Mock(provider) => provider.last_response().await, + RunnerLlmHandle::Recording(provider) => provider.last_response().await, + RunnerLlmHandle::Replay(provider) => provider.last_response().await, + } + } +} diff --git a/tests/src/artifact.rs b/tests/src/artifact.rs new file mode 100644 index 000000000..54416259c --- /dev/null +++ b/tests/src/artifact.rs @@ -0,0 +1,364 @@ +//! Canonical run artifacts for DSL-backed agent test execution. +//! +//! These types provide the stable, serializable output model for DSL runs, +//! built from the existing runner result. 
+ +use crate::agent_runner::{AgentRunResult, ToolCallRecord, WorkspaceFileSnapshot, WorkspaceSnapshot}; +use crate::dsl::{AssertionOutcome, TestCaseDsl}; +use mofa_foundation::agent::session::Session; +use serde::{Deserialize, Serialize}; +use serde_json::json; + +// Top-level artifact emitted for a single DSL-backed case execution. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentRunArtifact { + pub case_name: String, + pub status: String, + pub output_text: Option, + pub runner_error: Option, + pub duration_ms: u64, + pub started_at_ms: u64, + pub execution_id: String, + pub session_id: Option, + pub workspace_root: String, + pub agent: AgentArtifact, + pub assertions: Vec, + pub tool_calls: Vec, + pub llm_request: Option, + pub llm_response: Option, + pub session_snapshot: Option, + pub workspace_before: WorkspaceSnapshotArtifact, + pub workspace_after: WorkspaceSnapshotArtifact, +} + +// Compact identity data for the agent used by the run. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentArtifact { + pub id: String, + pub name: String, +} + +// Tool execution records are flattened into the artifact for downstream checks. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ToolCallArtifact { + pub tool_name: String, + pub input: serde_json::Value, + pub output: Option, + pub success: bool, + pub duration_ms: Option, + pub timed_out: bool, +} + +// LLM request/response types keep only the fields needed for stable inspection. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmRequestArtifact { + pub model: Option, + pub temperature: Option, + pub max_tokens: Option, + pub messages: Vec, + pub tool_names: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmResponseArtifact { + pub content: Option, + pub tool_calls: Vec, + pub usage: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmMessageArtifact { + pub role: String, + pub content: Option, + pub tool_call_id: Option, + pub tool_calls: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmToolCallArtifact { + pub id: String, + pub name: String, + pub arguments: serde_json::Value, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TokenUsageArtifact { + pub prompt_tokens: u32, + pub completion_tokens: u32, + pub total_tokens: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SessionArtifact { + pub messages: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SessionMessageArtifact { + pub role: String, + pub content: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WorkspaceSnapshotArtifact { + pub files: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WorkspaceFileArtifact { + pub relative_path: String, + pub size_bytes: u64, + pub modified_ms: Option, + pub checksum: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentRunArtifactDiff { + pub matches: bool, + pub differences: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentRunArtifactComparison { + // Stable comparison payload for CI and tooling. 
+ pub case_name: String, + pub matches: bool, + pub differences: Vec, + pub baseline_execution_id: String, + pub candidate_execution_id: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ArtifactDifference { + pub field: String, + pub expected: serde_json::Value, + pub actual: serde_json::Value, +} + +impl AgentRunArtifact { + // Build the canonical artifact from the current runner result plus DSL assertion outcomes. + pub fn from_run_result( + case: &TestCaseDsl, + result: &AgentRunResult, + assertions: Vec, + ) -> Self { + Self { + case_name: case.name.clone(), + status: if result.is_success() && assertions.iter().all(|item| item.passed) { + "passed".to_string() + } else { + "failed".to_string() + }, + output_text: result.output_text(), + runner_error: result.error.as_ref().map(ToString::to_string), + duration_ms: result.duration.as_millis() as u64, + started_at_ms: result.metadata.started_at.timestamp_millis() as u64, + execution_id: result.metadata.execution_id.clone(), + session_id: result.metadata.session_id.clone(), + workspace_root: result.metadata.workspace_root.display().to_string(), + agent: AgentArtifact { + id: result.metadata.agent_id.clone(), + name: result.metadata.agent_name.clone(), + }, + assertions, + tool_calls: result + .metadata + .tool_calls + .iter() + .map(tool_call_artifact) + .collect(), + llm_request: result + .metadata + .llm_last_request + .as_ref() + .map(|request| LlmRequestArtifact { + model: request.model.clone(), + temperature: request.temperature, + max_tokens: request.max_tokens, + messages: request + .messages + .iter() + .map(|message| LlmMessageArtifact { + role: message.role.clone(), + content: message.content.clone(), + tool_call_id: message.tool_call_id.clone(), + tool_calls: message + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(llm_tool_call_artifact) + .collect(), + }) + .collect(), + tool_names: request + .tools + .clone() + .unwrap_or_default() + .into_iter() + 
.map(|tool| tool.name) + .collect(), + }), + llm_response: result + .metadata + .llm_last_response + .as_ref() + .map(|response| LlmResponseArtifact { + content: response.content.clone(), + tool_calls: response + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(llm_tool_call_artifact) + .collect(), + usage: response.usage.as_ref().map(|usage| TokenUsageArtifact { + prompt_tokens: usage.prompt_tokens, + completion_tokens: usage.completion_tokens, + total_tokens: usage.total_tokens, + }), + }), + session_snapshot: result + .metadata + .session_snapshot + .as_ref() + .map(session_artifact), + workspace_before: workspace_snapshot_artifact(&result.metadata.workspace_snapshot_before), + workspace_after: workspace_snapshot_artifact(&result.metadata.workspace_snapshot_after), + } + } + + // Compare the MVP baseline fields while keeping deeper metadata out of scope for now. + pub fn compare_to(&self, baseline: &Self) -> AgentRunArtifactDiff { + let mut differences = Vec::new(); + + if self.status != baseline.status { + differences.push(ArtifactDifference { + field: "status".to_string(), + expected: json!(baseline.status), + actual: json!(self.status), + }); + } + + if self.output_text != baseline.output_text { + differences.push(ArtifactDifference { + field: "output_text".to_string(), + expected: json!(baseline.output_text), + actual: json!(self.output_text), + }); + } + + let baseline_assertions = baseline + .assertions + .iter() + .map(assertion_signature) + .collect::>(); + let actual_assertions = self + .assertions + .iter() + .map(assertion_signature) + .collect::>(); + if actual_assertions != baseline_assertions { + differences.push(ArtifactDifference { + field: "assertions".to_string(), + expected: json!(baseline_assertions), + actual: json!(actual_assertions), + }); + } + + let baseline_tools = baseline + .tool_calls + .iter() + .map(|call| call.tool_name.clone()) + .collect::>(); + let actual_tools = self + .tool_calls + .iter() + .map(|call| 
call.tool_name.clone()) + .collect::>(); + if actual_tools != baseline_tools { + differences.push(ArtifactDifference { + field: "tool_calls".to_string(), + expected: json!(baseline_tools), + actual: json!(actual_tools), + }); + } + + AgentRunArtifactDiff { + matches: differences.is_empty(), + differences, + } + } +} + +impl AgentRunArtifactComparison { + pub fn from_artifacts( + candidate: &AgentRunArtifact, + baseline: &AgentRunArtifact, + diff: AgentRunArtifactDiff, + ) -> Self { + Self { + case_name: candidate.case_name.clone(), + matches: diff.matches, + differences: diff.differences, + baseline_execution_id: baseline.execution_id.clone(), + candidate_execution_id: candidate.execution_id.clone(), + } + } +} + +fn assertion_signature(outcome: &AssertionOutcome) -> serde_json::Value { + json!({ + "kind": outcome.kind, + "passed": outcome.passed, + }) +} + +fn tool_call_artifact(record: &ToolCallRecord) -> ToolCallArtifact { + ToolCallArtifact { + tool_name: record.tool_name.clone(), + input: record.input.clone(), + output: record.output.clone(), + success: record.success, + duration_ms: record.duration_ms, + timed_out: record.timed_out, + } +} + +fn llm_tool_call_artifact(tool_call: mofa_kernel::agent::types::ToolCall) -> LlmToolCallArtifact { + LlmToolCallArtifact { + id: tool_call.id, + name: tool_call.name, + arguments: tool_call.arguments, + } +} + +// Session snapshots are reduced to ordered role/content pairs for stable comparisons. +fn session_artifact(session: &Session) -> SessionArtifact { + SessionArtifact { + messages: session + .messages + .iter() + .map(|message| SessionMessageArtifact { + role: message.role.clone(), + content: message.content.clone(), + }) + .collect(), + } +} + +// Workspace snapshots preserve a compact file-level view before and after execution. 
+fn workspace_snapshot_artifact(snapshot: &WorkspaceSnapshot) -> WorkspaceSnapshotArtifact { + WorkspaceSnapshotArtifact { + files: snapshot.files.iter().map(workspace_file_artifact).collect(), + } +} + +fn workspace_file_artifact(file: &WorkspaceFileSnapshot) -> WorkspaceFileArtifact { + WorkspaceFileArtifact { + relative_path: file.relative_path.clone(), + size_bytes: file.size_bytes, + modified_ms: file.modified_ms, + checksum: file.checksum, + } +} diff --git a/tests/src/assertions.rs b/tests/src/assertions.rs index 1a0b58594..946efe134 100644 --- a/tests/src/assertions.rs +++ b/tests/src/assertions.rs @@ -84,3 +84,173 @@ macro_rules! assert_bus_message_sent { ); }}; } + +/// Assert a session's messages match the expected (role, content) pairs. +pub fn assert_session_messages( + session: &mofa_foundation::agent::session::Session, + expected: &[(&str, &str)], +) { + assert_eq!( + session.messages.len(), + expected.len(), + "Expected {} session messages, got {}", + expected.len(), + session.messages.len() + ); + + for (idx, (role, content)) in expected.iter().enumerate() { + let msg = &session.messages[idx]; + assert_eq!( + msg.role, *role, + "Expected role '{}' at index {}, got '{}'", + role, idx, msg.role + ); + assert_eq!( + msg.content, *content, + "Expected content '{}' at index {}, got '{}'", + content, idx, msg.content + ); + } +} + +/// Assert the most recent tool result matches the expected JSON output. +/// +/// # Example +/// ```ignore +/// assert_tool_last_result!(tool, json!("done")); +/// ``` +#[macro_export] +macro_rules! assert_tool_last_result { + ($tool:expr, $expected:expr) => {{ + let result = $tool + .last_result() + .await + .expect("Expected tool to have a result, but it was never executed"); + let expected = $expected; + assert_eq!( + result.output, expected, + "Expected latest tool result {:?}, got {:?}", + expected, result.output + ); + }}; +} + +/// Assert the agent run produced the expected output text. 
+/// +/// # Example +/// ```ignore +/// assert_agent_output_text!(result, "hello"); +/// ``` +#[macro_export] +macro_rules! assert_agent_output_text { + ($result:expr, $expected:expr) => {{ + let expected = $expected; + let actual = $result.output_text(); + assert_eq!( + actual.as_deref(), + Some(expected), + "Expected agent output {:?}, got {:?}", + expected, + actual + ); + }}; +} + +/// Assert the agent run failed with an error containing the given substring. +/// +/// # Example +/// ```ignore +/// assert_run_failed_with!(result, "timeout"); +/// ``` +#[macro_export] +macro_rules! assert_run_failed_with { + ($result:expr, $pattern:expr) => {{ + let pattern = $pattern; + let error = $result + .error + .as_ref() + .expect("Expected run to fail, but it succeeded"); + let message = error.to_string(); + assert!( + message.contains(pattern), + "Expected error containing {:?}, got {:?}", + pattern, + message + ); + }}; +} + +/// Assert the workspace snapshot contains a file with the given relative path. +/// +/// # Example +/// ```ignore +/// assert_workspace_contains_file!(snapshot, "sessions/demo.jsonl"); +/// ``` +#[macro_export] +macro_rules! assert_workspace_contains_file { + ($snapshot:expr, $relative_path:expr) => {{ + let relative_path = $relative_path; + let found = $snapshot + .files + .iter() + .any(|file| file.relative_path == relative_path); + assert!( + found, + "Expected workspace snapshot to contain {:?}, found paths: {:?}", + relative_path, + $snapshot + .files + .iter() + .map(|file| file.relative_path.as_str()) + .collect::>() + ); + }}; +} + +/// Assert the run metadata captured a tool call with the given tool name. +/// +/// # Example +/// ```ignore +/// assert_run_recorded_tool_call!(result, "echo_tool"); +/// ``` +#[macro_export] +macro_rules! 
assert_run_recorded_tool_call {
+    ($result:expr, $tool_name:expr) => {{
+        let tool_name = $tool_name;
+        let found = $result
+            .metadata
+            .tool_calls
+            .iter()
+            .any(|record| record.tool_name == tool_name);
+        assert!(
+            found,
+            "Expected run metadata to contain tool call {:?}, found tool calls: {:?}",
+            tool_name,
+            $result
+                .metadata
+                .tool_calls
+                .iter()
+                .map(|record| record.tool_name.as_str())
+                .collect::<Vec<_>>()
+        );
+    }};
+}
+
+/// Assert the runner total execution count after a run matches the expected value.
+///
+/// # Example
+/// ```ignore
+/// assert_runner_total_executions!(result, 1);
+/// ```
+#[macro_export]
+macro_rules! assert_runner_total_executions {
+    ($result:expr, $expected:expr) => {{
+        let expected = $expected;
+        let actual = $result.metadata.runner_stats_after.total_executions;
+        assert_eq!(
+            actual, expected,
+            "Expected runner total executions {}, got {}",
+            expected, actual
+        );
+    }};
+}
diff --git a/tests/src/dsl.rs b/tests/src/dsl.rs
new file mode 100644
index 000000000..9ef27e830
--- /dev/null
+++ b/tests/src/dsl.rs
@@ -0,0 +1,468 @@
+//! Minimal TOML DSL support for the testing MVP.
+//!
+//! This module keeps the schema intentionally small so contributors can define
+//! simple agent tests without introducing a full DSL framework yet.
+
+use crate::agent_runner::{AgentRunResult, AgentRunnerError, AgentTestRunner};
+use crate::live_llm::{OpenAiCompatProvider, OpenAiCompatProviderConfig};
+use crate::replay::{ReplayError, Tape};
+use crate::tools::MockTool;
+use mofa_foundation::agent::context::prompt::AgentIdentity;
+use mofa_foundation::agent::executor::AgentExecutorConfig;
+use mofa_kernel::agent::components::tool::ToolResult;
+use mofa_kernel::agent::types::LLMProvider;
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum DslError {
+    #[error("failed to read DSL file: {0}")]
+    Io(#[from] std::io::Error),
+
+    #[error("failed to parse TOML DSL: {0}")]
+    Toml(#[from] toml::de::Error),
+
+    #[error("runner error: {0}")]
+    Runner(#[from] AgentRunnerError),
+
+    #[error("replay error: {0}")]
+    Replay(#[from] ReplayError),
+
+    #[error("test case must define either `prompt` or `input`")]
+    MissingPrompt,
+
+    #[error("expected output to contain `{expected}`, got `{actual}`")]
+    ExpectedContains { expected: String, actual: String },
+
+    #[error("expected tool `{tool}` to be called, found tool calls: {actual:?}")]
+    ExpectedToolCall { tool: String, actual: Vec<String> },
+
+    #[error("run produced no text output")]
+    MissingOutput,
+
+    #[error("provider-backed llm cannot be combined with inline responses, steps, or replay tape")]
+    ConflictingLlmConfig,
+
+    #[error("record_tape requires a provider-backed llm")]
+    MissingRecordProvider,
+
+    #[error("provider-backed llm requires record_tape")]
+    MissingRecordTape,
+
+    #[error("unsupported llm provider kind: {0}")]
+    UnsupportedProvider(String),
+
+    #[error("missing environment variable `{0}` for llm provider")]
+    MissingProviderEnv(String),
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct TestCaseDsl {
+    pub name: String,
+    pub prompt: Option<String>,
+    pub input: Option<String>,
+    pub expected_text: Option<String>,
+    #[serde(default)]
+    pub bootstrap_files: Vec<BootstrapFileDsl>,
+    pub agent: Option<AgentDsl>,
+    #[serde(default)]
+    pub tools: Vec<ToolDsl>,
+    pub llm: Option<LlmDsl>,
+    #[serde(rename = "assert")]
+    pub assertions: Option<AssertDsl>,
+    #[serde(skip)]
+    pub source_path: Option<PathBuf>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct BootstrapFileDsl {
+    pub path: String,
+    pub content: String,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct AgentDsl {
+    pub name: Option<String>,
+    pub description: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct ToolDsl {
+    pub name: String,
+    pub description: String,
+    pub schema: Value,
+    pub result: Option<Value>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmDsl {
+    #[serde(default)]
+    pub responses: Vec<String>,
+    #[serde(default)]
+    pub steps: Vec<LlmStepDsl>,
+    pub tape: Option<String>,
+    pub record_tape: Option<String>,
+    pub provider: Option<LlmProviderDsl>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmProviderDsl {
+    pub kind: LlmProviderKind,
+    pub base_url: Option<String>,
+    pub model: Option<String>,
+    pub api_key_env: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum LlmProviderKind {
+    OpenAiCompatible,
+    Ollama,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct LlmStepDsl {
+    #[serde(rename = "type")]
+    pub kind: LlmStepKind,
+    pub content: Option<String>,
+    pub tool: Option<String>,
+    pub arguments: Option<Value>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum LlmStepKind {
+    Text,
+    ToolCall,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct AssertDsl {
+    pub contains: Option<String>,
+    pub tool_called: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AssertionOutcome {
+    pub kind: String,
+    pub expected: Value,
+    pub actual: Value,
+    pub passed: bool,
+}
+
+impl TestCaseDsl {
+    pub fn from_toml_str(input: &str) -> Result<Self, DslError> {
+        let mut case: Self = toml::from_str(input)?;
+        case.source_path = None;
+        Ok(case)
+    }
+
+    pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self, DslError> {
+        let path = path.as_ref();
+        let input = std::fs::read_to_string(path)?;
+        let mut case: 
Self = toml::from_str(&input)?; + case.source_path = Some(path.to_path_buf()); + Ok(case) + } + + fn execution_input(&self) -> Result<&str, DslError> { + self.prompt + .as_deref() + .or(self.input.as_deref()) + .ok_or(DslError::MissingPrompt) + } +} + +pub async fn run_test_case(case: &TestCaseDsl) -> Result { + let result = execute_test_case(case).await?; + let assertions = collect_assertion_outcomes(case, &result); + if let Some(error) = assertion_error_from_outcomes(&assertions) { + return Err(error); + } + Ok(result) +} + +pub async fn execute_test_case(case: &TestCaseDsl) -> Result { + let mut runner = build_runner(case).await?; + configure_runner_from_test_case(case, &mut runner).await?; + let result = runner.run_text(case.execution_input()?).await?; + runner.shutdown().await?; + Ok(result) +} + +pub async fn configure_runner_from_test_case( + case: &TestCaseDsl, + runner: &mut AgentTestRunner, +) -> Result<(), DslError> { + if !case.bootstrap_files.is_empty() { + let mut bootstrap_paths = Vec::with_capacity(case.bootstrap_files.len()); + for file in &case.bootstrap_files { + runner.write_bootstrap_file(&file.path, &file.content)?; + bootstrap_paths.push(file.path.clone()); + } + runner + .configure_prompt(agent_identity(case.agent.as_ref()), Some(bootstrap_paths)) + .await; + } else if case.agent.is_some() { + runner + .configure_prompt(agent_identity(case.agent.as_ref()), None) + .await; + } + + for tool in &case.tools { + let mock_tool = MockTool::new(&tool.name, &tool.description, tool.schema.clone()); + if let Some(result) = &tool.result { + mock_tool + .set_result(ToolResult::success(result.clone())) + .await; + } + runner.register_mock_tool(mock_tool).await?; + } + + // Queue deterministic LLM responses before execution so the DSL stays a thin + // adapter over the existing runner harness. 
+ if let Some(llm) = &case.llm { + if llm.provider.is_some() { + if !llm.steps.is_empty() || !llm.responses.is_empty() || llm.tape.is_some() { + return Err(DslError::ConflictingLlmConfig); + } + } else if llm.record_tape.is_some() { + return Err(DslError::MissingRecordProvider); + } else if !llm.steps.is_empty() { + for step in &llm.steps { + match step.kind { + LlmStepKind::Text => { + runner + .mock_llm() + .add_response(step.content.clone().unwrap_or_default()) + .await; + } + LlmStepKind::ToolCall => { + runner + .mock_llm() + .add_tool_call_response( + step.tool.as_deref().unwrap_or_default(), + step.arguments.clone().unwrap_or(Value::Null), + step.content.clone(), + ) + .await; + } + } + } + } else if let Some(tape_path) = &llm.tape { + let resolved = resolve_case_path(case, tape_path); + let tape = Tape::from_file(resolved)?; + let _responses = tape.responses()?; + } else { + for response in &llm.responses { + runner.mock_llm().add_response(response).await; + } + } + } + Ok(()) +} + +// Build the appropriate runner (recording, replay, or mock) for this case. 
+async fn build_runner(case: &TestCaseDsl) -> Result<AgentTestRunner, DslError> {
+    let config = AgentExecutorConfig::default();
+    let llm = case.llm.as_ref();
+    match llm.and_then(|item| item.provider.as_ref()) {
+        Some(provider) => {
+            let provider = build_live_provider(provider)?;
+            if let Some(record_tape) = llm.and_then(|item| item.record_tape.as_deref()) {
+                let tape_path = resolve_case_path(case, record_tape);
+                AgentTestRunner::with_recording_provider(provider, &case.name, tape_path, config)
+                    .await
+                    .map_err(DslError::from)
+            } else {
+                Err(DslError::MissingRecordTape)
+            }
+        }
+        None => {
+            if let Some(llm) = llm {
+                if let Some(tape_path) = &llm.tape {
+                    let tape = Tape::from_file(resolve_case_path(case, tape_path))?;
+                    return AgentTestRunner::with_replay_tape(tape, config)
+                        .await
+                        .map_err(DslError::from);
+                }
+            }
+            AgentTestRunner::with_config(config)
+                .await
+                .map_err(DslError::from)
+        }
+    }
+}
+
+// Instantiate an LLM provider implementation from the DSL settings.
+fn build_live_provider(provider: &LlmProviderDsl) -> Result<Arc<dyn LLMProvider>, DslError> {
+    match provider.kind {
+        LlmProviderKind::OpenAiCompatible => {
+            let api_key = read_provider_api_key(provider)?;
+            Ok(Arc::new(OpenAiCompatProvider::new(OpenAiCompatProviderConfig {
+                base_url: provider
+                    .base_url
+                    .clone()
+                    .ok_or_else(|| DslError::UnsupportedProvider("open_ai_compatible missing base_url".to_string()))?,
+                model: provider.model.clone().unwrap_or_else(|| "gpt-4o-mini".to_string()),
+                api_key,
+            })))
+        }
+        LlmProviderKind::Ollama => Ok(Arc::new(OpenAiCompatProvider::new(OpenAiCompatProviderConfig {
+            base_url: provider
+                .base_url
+                .clone()
+                .or_else(|| std::env::var("OLLAMA_BASE_URL").ok())
+                .unwrap_or_else(|| "http://127.0.0.1:11434/v1".to_string()),
+            model: provider
+                .model
+                .clone()
+                .or_else(|| std::env::var("OLLAMA_MODEL").ok())
+                .unwrap_or_else(|| "llama3".to_string()),
+            api_key: None,
+        }))),
+    }
+}
+
+// Read or validate the configured API key name for a provider.
+fn read_provider_api_key(provider: &LlmProviderDsl) -> Result<Option<String>, DslError> {
+    let env_name = provider
+        .api_key_env
+        .clone()
+        .unwrap_or_else(|| "OPENAI_API_KEY".to_string());
+    if env_name.is_empty() {
+        return Ok(None);
+    }
+
+    match std::env::var(&env_name) {
+        Ok(value) if value.is_empty() => Ok(None),
+        Ok(value) => Ok(Some(value)),
+        Err(std::env::VarError::NotPresent) => {
+            if provider.base_url.as_deref().map(|url| url.contains("127.0.0.1") || url.contains("localhost")).unwrap_or(false) {
+                Ok(None)
+            } else {
+                Err(DslError::MissingProviderEnv(env_name))
+            }
+        }
+        Err(err) => Err(DslError::MissingProviderEnv(format!("{env_name}: {err}"))),
+    }
+}
+
+fn resolve_case_path(case: &TestCaseDsl, tape_path: &str) -> PathBuf {
+    let path = Path::new(tape_path);
+    if path.is_absolute() {
+        return path.to_path_buf();
+    }
+
+    if let Some(source_path) = &case.source_path {
+        if let Some(parent) = source_path.parent() {
+            return parent.join(path);
+        }
+    }
+
+    path.to_path_buf()
+}
+
+fn agent_identity(agent: Option<&AgentDsl>) -> Option<AgentIdentity> {
+    let agent = agent?;
+    let name = agent.name.clone()?;
+    Some(AgentIdentity {
+        name,
+        description: agent.description.clone().unwrap_or_default(),
+        icon: None,
+    })
+}
+
+fn expected_contains(case: &TestCaseDsl) -> Option<&str> {
+    // Prefer the explicit assertion block when present, while keeping
+    // `expected_text` as a lightweight shorthand for the MVP schema.
+    case.assertions
+        .as_ref()
+        .and_then(|assertions| assertions.contains.as_deref())
+        .or(case.expected_text.as_deref())
+}
+
+fn expected_tool_call(case: &TestCaseDsl) -> Option<&str> {
+    case.assertions
+        .as_ref()
+        .and_then(|assertions| assertions.tool_called.as_deref())
+}
+
+pub fn collect_assertion_outcomes(case: &TestCaseDsl, result: &AgentRunResult) -> Vec<AssertionOutcome> {
+    let mut outcomes = Vec::new();
+
+    if let Some(expected) = expected_contains(case) {
+        let actual = result.output_text();
+        outcomes.push(AssertionOutcome {
+            kind: "contains".to_string(),
+            expected: Value::String(expected.to_string()),
+            actual: actual
+                .clone()
+                .map(Value::String)
+                .unwrap_or(Value::Null),
+            passed: actual
+                .as_ref()
+                .map(|value| value.contains(expected))
+                .unwrap_or(false),
+        });
+    }
+
+    if let Some(expected_tool) = expected_tool_call(case) {
+        let actual = result
+            .metadata
+            .tool_calls
+            .iter()
+            .map(|record| Value::String(record.tool_name.clone()))
+            .collect::<Vec<_>>();
+        outcomes.push(AssertionOutcome {
+            kind: "tool_called".to_string(),
+            expected: Value::String(expected_tool.to_string()),
+            actual: Value::Array(actual.clone()),
+            passed: actual
+                .iter()
+                .any(|tool| tool.as_str() == Some(expected_tool)),
+        });
+    }
+
+    outcomes
+}
+
+pub fn assertion_error_from_outcomes(outcomes: &[AssertionOutcome]) -> Option<DslError> {
+    for outcome in outcomes {
+        if outcome.passed {
+            continue;
+        }
+
+        match outcome.kind.as_str() {
+            "contains" => {
+                return if outcome.actual.is_null() {
+                    Some(DslError::MissingOutput)
+                } else {
+                    Some(DslError::ExpectedContains {
+                        expected: outcome.expected.as_str().unwrap_or_default().to_string(),
+                        actual: outcome.actual.as_str().unwrap_or_default().to_string(),
+                    })
+                };
+            }
+            "tool_called" => {
+                let actual = outcome
+                    .actual
+                    .as_array()
+                    .into_iter()
+                    .flatten()
+                    .filter_map(|value| value.as_str().map(ToString::to_string))
+                    .collect::<Vec<_>>();
+                return Some(DslError::ExpectedToolCall {
+                    tool: 
outcome.expected.as_str().unwrap_or_default().to_string(), + actual, + }); + } + _ => continue, + } + } + + None +} diff --git a/tests/src/lib.rs b/tests/src/lib.rs index 3368dc49b..eba2196b8 100644 --- a/tests/src/lib.rs +++ b/tests/src/lib.rs @@ -4,18 +4,42 @@ //! control for testing MoFA agents. pub mod adversarial; +pub mod agent_runner; +pub mod artifact; pub mod assertions; pub mod backend; pub mod bus; pub mod clock; +pub mod dsl; +pub mod live_llm; +pub mod replay; pub mod report; pub mod tools; pub use backend::MockLLMBackend; pub use bus::MockAgentBus; pub use clock::{Clock, MockClock, SystemClock}; +pub use dsl::{ + assertion_error_from_outcomes, collect_assertion_outcomes, configure_runner_from_test_case, + execute_test_case, run_test_case, AgentDsl, AssertDsl, AssertionOutcome, BootstrapFileDsl, + DslError, LlmDsl, LlmProviderDsl, LlmProviderKind, LlmStepDsl, LlmStepKind, TestCaseDsl, + ToolDsl, +}; +pub use live_llm::{OpenAiCompatProvider, OpenAiCompatProviderConfig}; +pub use agent_runner::{ + AgentRunMetadata, AgentRunResult, AgentRunnerError, AgentTestRunner, MockAgentLLMProvider, + ToolCallRecord, WorkspaceFileSnapshot, WorkspaceSnapshot, +}; +pub use artifact::{ + AgentArtifact, AgentRunArtifact, AgentRunArtifactComparison, AgentRunArtifactDiff, + ArtifactDifference, + LlmMessageArtifact, LlmRequestArtifact, LlmResponseArtifact, LlmToolCallArtifact, + SessionArtifact, SessionMessageArtifact, TokenUsageArtifact, ToolCallArtifact, + WorkspaceFileArtifact, WorkspaceSnapshotArtifact, +}; pub use report::{ JsonFormatter, ReportFormatter, TestCaseResult, TestReport, TestReportBuilder, TestStatus, TextFormatter, }; +pub use replay::{ReplayError, Tape, TapeInteraction}; pub use tools::MockTool; diff --git a/tests/src/live_llm.rs b/tests/src/live_llm.rs new file mode 100644 index 000000000..d8791600d --- /dev/null +++ b/tests/src/live_llm.rs @@ -0,0 +1,268 @@ +//! Minimal OpenAI-compatible provider for record-mode DSL runs. 
+ +use async_trait::async_trait; +use mofa_kernel::agent::error::{AgentError, AgentResult}; +use mofa_kernel::agent::types::{ + ChatCompletionRequest, ChatCompletionResponse, LLMProvider, ToolCall, ToolDefinition, + TokenUsage, +}; +use serde::{Deserialize, Serialize}; +use std::error::Error as StdError; + +#[derive(Debug, Clone)] +pub struct OpenAiCompatProviderConfig { + pub base_url: String, + pub model: String, + pub api_key: Option, +} + +#[derive(Debug)] +pub struct OpenAiCompatProvider { + client: reqwest::Client, + config: OpenAiCompatProviderConfig, +} + +impl OpenAiCompatProvider { + // Create a new provider backed by a fresh HTTP client. + pub fn new(config: OpenAiCompatProviderConfig) -> Self { + Self { + client: reqwest::Client::new(), + config, + } + } + + // Normalize the configured base URL into a completions endpoint. + fn completions_url(&self) -> String { + let trimmed = self.config.base_url.trim_end_matches('/'); + if trimmed.ends_with("/chat/completions") || trimmed.ends_with("/completions") { + trimmed.to_string() + } else { + format!("{trimmed}/chat/completions") + } + } +} + +#[async_trait] +impl LLMProvider for OpenAiCompatProvider { + // Provider identifier exposed to the kernel. + fn name(&self) -> &str { + "openai-compatible" + } + + async fn chat(&self, request: ChatCompletionRequest) -> AgentResult { + // Translate the kernel chat request, send, and convert the response. 
+ let request_body = OpenAiCompatRequest { + model: request + .model + .clone() + .unwrap_or_else(|| self.config.model.clone()), + messages: request + .messages + .iter() + .map(OpenAiCompatMessage::from_kernel) + .collect(), + tools: request.tools.as_ref().map(|tools| { + tools + .iter() + .map(OpenAiCompatToolDefinition::from_kernel) + .collect() + }), + temperature: request.temperature, + max_tokens: request.max_tokens, + }; + + eprintln!("OpenAI-compatible request = {:?}", request_body); + let mut builder = self.client.post(self.completions_url()).json(&request_body); + if let Some(api_key) = &self.config.api_key { + builder = builder.bearer_auth(api_key); + } + + let response = match builder.send().await { + Ok(response) => response, + Err(err) => { + eprintln!("Provider send error: {}", err); + let mut current: Option<&dyn StdError> = err.source(); + while let Some(source) = current { + eprintln!("Provider send error source: {}", source); + current = source.source(); + } + return Err(AgentError::ExecutionFailed(err.to_string())); + } + }; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + eprintln!("Provider error status={} body={}", status, body); + return Err(AgentError::ExecutionFailed(format!( + "provider returned {}: {}", + status, body + ))); + } + + let body: OpenAiCompatResponse = response + .json() + .await + .map_err(|err| AgentError::SerializationError(err.to_string()))?; + eprintln!("OpenAI-compatible response = {:?}", body); + let message = body + .choices + .into_iter() + .next() + .ok_or_else(|| AgentError::ExecutionFailed("provider returned no choices".to_string()))? 
+ .message; + + Ok(ChatCompletionResponse { + content: message.content, + tool_calls: Some( + message + .tool_calls + .unwrap_or_default() + .into_iter() + .map(OpenAiCompatToolCall::into_kernel) + .collect(), + ), + usage: body.usage.map(OpenAiCompatUsage::into_kernel), + }) + } +} + +#[derive(Debug, Serialize)] +struct OpenAiCompatRequest { + model: String, + messages: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + tools: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + temperature: Option, + #[serde(skip_serializing_if = "Option::is_none")] + max_tokens: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatMessage { + role: String, + #[serde(skip_serializing_if = "Option::is_none")] + content: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tool_call_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tool_calls: Option>, +} + +impl OpenAiCompatMessage { + // Translate a kernel chat message into the OpenAI-compatible wire format. + fn from_kernel(message: &mofa_kernel::agent::types::ChatMessage) -> Self { + Self { + role: message.role.clone(), + content: message.content.clone(), + tool_call_id: message.tool_call_id.clone(), + tool_calls: message.tool_calls.as_ref().map(|tool_calls| { + tool_calls + .iter() + .cloned() + .map(OpenAiCompatToolCall::from_kernel) + .collect() + }), + } + } +} + +#[derive(Debug, Serialize)] +struct OpenAiCompatToolDefinition { + #[serde(rename = "type")] + kind: &'static str, + function: OpenAiCompatFunctionDefinition, +} + +impl OpenAiCompatToolDefinition { + // Map kernel tool metadata into the OpenAI-compatible representation. 
+ fn from_kernel(tool: &ToolDefinition) -> Self { + Self { + kind: "function", + function: OpenAiCompatFunctionDefinition { + name: tool.name.clone(), + description: tool.description.clone(), + parameters: tool.parameters.clone(), + }, + } + } +} + +#[derive(Debug, Serialize)] +struct OpenAiCompatFunctionDefinition { + name: String, + description: String, + parameters: serde_json::Value, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatResponse { + choices: Vec, + #[serde(default)] + usage: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatChoice { + message: OpenAiCompatMessage, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct OpenAiCompatToolCall { + id: String, + #[serde(rename = "type", default)] + kind: Option, + function: OpenAiCompatFunctionCall, +} + +impl OpenAiCompatToolCall { + // Serialize a kernel tool call into the cached tape format. + fn from_kernel(tool_call: ToolCall) -> Self { + Self { + id: tool_call.id, + kind: Some("function".to_string()), + function: OpenAiCompatFunctionCall { + name: tool_call.name, + arguments: tool_call.arguments.to_string(), + }, + } + } + + // Restore the cached tool call to the kernel model. + fn into_kernel(self) -> ToolCall { + let arguments = + serde_json::from_str(&self.function.arguments).unwrap_or(serde_json::Value::String( + self.function.arguments, + )); + ToolCall { + id: self.id, + name: self.function.name, + arguments, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct OpenAiCompatFunctionCall { + name: String, + arguments: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OpenAiCompatUsage { + prompt_tokens: u32, + completion_tokens: u32, + total_tokens: u32, +} + +impl OpenAiCompatUsage { + // Convert recorded usage into the kernel token usage struct. 
+ fn into_kernel(self) -> TokenUsage { + TokenUsage { + prompt_tokens: self.prompt_tokens, + completion_tokens: self.completion_tokens, + total_tokens: self.total_tokens, + } + } +} diff --git a/tests/src/replay.rs b/tests/src/replay.rs new file mode 100644 index 000000000..ae2589f11 --- /dev/null +++ b/tests/src/replay.rs @@ -0,0 +1,337 @@ +//! Replay tape support for deterministic DSL-backed runs. + +use async_trait::async_trait; +use mofa_kernel::agent::error::{AgentError, AgentResult}; +use mofa_kernel::agent::types::{ + ChatCompletionRequest, ChatCompletionResponse, LLMProvider, ToolCall, TokenUsage, +}; +use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use thiserror::Error; +use tokio::sync::RwLock; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeInteraction { + #[serde(default)] + pub request: Option, + pub response: String, + #[serde(default)] + pub tool_calls: Vec, + #[serde(default)] + pub usage: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Tape { + pub version: u32, + pub case_name: String, + pub interactions: Vec, +} + +#[derive(Debug, Error)] +pub enum ReplayError { + #[error("failed to read tape file: {0}")] + Io(#[from] std::io::Error), + + #[error("failed to parse tape JSON: {0}")] + Json(#[from] serde_json::Error), + + #[error("tape has no interactions")] + EmptyTape, + + #[error("replay exhausted after {0} interactions")] + ReplayExhausted(usize), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeRequest { + pub messages: Vec, + pub model: Option, + pub temperature: Option, + pub max_tokens: Option, + #[serde(default)] + pub tool_names: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeMessage { + pub role: String, + pub content: Option, + pub tool_call_id: Option, + #[serde(default)] + pub tool_calls: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeToolCall { + pub id: String, + pub 
name: String, + pub arguments: serde_json::Value, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TapeTokenUsage { + pub prompt_tokens: u32, + pub completion_tokens: u32, + pub total_tokens: u32, +} + +pub struct RecordingLLMProvider { + name: String, + inner: Arc, + case_name: String, + output_path: PathBuf, + tape: RwLock, + last_request: RwLock>, + last_response: RwLock>, +} + +pub struct ReplayLLMProvider { + name: String, + tape: Tape, + cursor: RwLock, + last_request: RwLock>, + last_response: RwLock>, +} + +impl Tape { + // Start a new tape for the given case. + pub fn new(case_name: impl Into) -> Self { + Self { + version: 1, + case_name: case_name.into(), + interactions: Vec::new(), + } + } + + // Load a tape from disk. + pub fn from_file(path: impl AsRef) -> Result { + let body = std::fs::read_to_string(path)?; + Ok(serde_json::from_str(&body)?) + } + + // Persist the tape back to disk. + pub fn to_file(&self, path: impl AsRef) -> Result<(), ReplayError> { + let body = serde_json::to_string_pretty(self)?; + std::fs::write(path, body)?; + Ok(()) + } + + // Extract the recorded responses for quick replay validation. + pub fn responses(&self) -> Result, ReplayError> { + if self.interactions.is_empty() { + return Err(ReplayError::EmptyTape); + } + Ok(self + .interactions + .iter() + .map(|interaction| interaction.response.clone()) + .collect()) + } + + // Append a new interaction to the tape. + pub fn push_interaction(&mut self, request: ChatCompletionRequest, response: &ChatCompletionResponse) { + self.interactions.push(TapeInteraction { + request: Some(TapeRequest::from_request(&request)), + response: response.content.clone().unwrap_or_default(), + tool_calls: response + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(TapeToolCall::from_tool_call) + .collect(), + usage: response.usage.clone().map(TapeTokenUsage::from_usage), + }); + } +} + +impl TapeRequest { + // Snapshot a chat request into the tape schema. 
+ fn from_request(request: &ChatCompletionRequest) -> Self { + Self { + messages: request + .messages + .iter() + .map(|message| TapeMessage { + role: message.role.clone(), + content: message.content.clone(), + tool_call_id: message.tool_call_id.clone(), + tool_calls: message + .tool_calls + .clone() + .unwrap_or_default() + .into_iter() + .map(TapeToolCall::from_tool_call) + .collect(), + }) + .collect(), + model: request.model.clone(), + temperature: request.temperature, + max_tokens: request.max_tokens, + tool_names: request + .tools + .clone() + .unwrap_or_default() + .into_iter() + .map(|tool| tool.name) + .collect(), + } + } +} + +impl TapeToolCall { + // Encode a real tool call for replay. + fn from_tool_call(tool_call: ToolCall) -> Self { + Self { + id: tool_call.id, + name: tool_call.name, + arguments: tool_call.arguments, + } + } + + // Decode the tape entry back into a kernel tool call. + fn into_tool_call(self) -> ToolCall { + ToolCall { + id: self.id, + name: self.name, + arguments: self.arguments, + } + } +} + +impl TapeTokenUsage { + // Capture token usage for later inspection. + fn from_usage(usage: TokenUsage) -> Self { + Self { + prompt_tokens: usage.prompt_tokens, + completion_tokens: usage.completion_tokens, + total_tokens: usage.total_tokens, + } + } + + // Rehydrate the token usage for replay responses. + fn into_usage(self) -> TokenUsage { + TokenUsage { + prompt_tokens: self.prompt_tokens, + completion_tokens: self.completion_tokens, + total_tokens: self.total_tokens, + } + } +} + +impl RecordingLLMProvider { + // Wrap a live provider so every response is recorded to tape. 
+ pub fn new( + name: impl Into, + inner: Arc, + case_name: impl Into, + output_path: impl Into, + ) -> Self { + let case_name = case_name.into(); + Self { + name: name.into(), + inner, + tape: RwLock::new(Tape::new(case_name.clone())), + case_name, + output_path: output_path.into(), + last_request: RwLock::new(None), + last_response: RwLock::new(None), + } + } + + // Return the most recent request issued through this recording provider. + pub async fn last_request(&self) -> Option { + self.last_request.read().await.clone() + } + + // Return the latest response captured by the recording provider. + pub async fn last_response(&self) -> Option { + self.last_response.read().await.clone() + } +} + +#[async_trait] +impl LLMProvider for RecordingLLMProvider { + fn name(&self) -> &str { + &self.name + } + + async fn chat(&self, request: ChatCompletionRequest) -> AgentResult { + // Forward the call, cache the interaction, and persist the tape. + *self.last_request.write().await = Some(request.clone()); + let response = self.inner.chat(request.clone()).await?; + *self.last_response.write().await = Some(response.clone()); + + let mut tape = self.tape.write().await; + if tape.case_name.is_empty() { + tape.case_name = self.case_name.clone(); + } + tape.push_interaction(request, &response); + tape.to_file(&self.output_path) + .map_err(|err| AgentError::ExecutionFailed(err.to_string()))?; + + Ok(response) + } +} + +impl ReplayLLMProvider { + // Create a replay provider that steps through a pre-recorded tape. + pub fn new(name: impl Into, tape: Tape) -> Self { + Self { + name: name.into(), + tape, + cursor: RwLock::new(0), + last_request: RwLock::new(None), + last_response: RwLock::new(None), + } + } + + // Return the last request seen by the replay provider. + pub async fn last_request(&self) -> Option { + self.last_request.read().await.clone() + } + + // Return the last response handed out during replay. 
+ pub async fn last_response(&self) -> Option { + self.last_response.read().await.clone() + } +} + +#[async_trait] +impl LLMProvider for ReplayLLMProvider { + fn name(&self) -> &str { + &self.name + } + + async fn chat(&self, request: ChatCompletionRequest) -> AgentResult { + // Serve the next recorded interaction instead of calling a live LLM. + *self.last_request.write().await = Some(request); + let mut cursor = self.cursor.write().await; + let interaction = self + .tape + .interactions + .get(*cursor) + .cloned() + .or_else(|| self.tape.interactions.last().cloned()) + .ok_or_else(|| AgentError::ExecutionFailed(ReplayError::EmptyTape.to_string()))?; + if *cursor < self.tape.interactions.len() { + *cursor += 1; + } + + let response = ChatCompletionResponse { + content: Some(interaction.response), + tool_calls: Some( + interaction + .tool_calls + .into_iter() + .map(TapeToolCall::into_tool_call) + .collect(), + ), + usage: interaction.usage.map(TapeTokenUsage::into_usage), + }; + *self.last_response.write().await = Some(response.clone()); + Ok(response) + } +} diff --git a/tests/src/tools.rs b/tests/src/tools.rs index 48356b551..fbc6bb5b6 100644 --- a/tests/src/tools.rs +++ b/tests/src/tools.rs @@ -20,6 +20,7 @@ pub struct MockTool { category: ToolCategory, pub stubbed_result: Arc>, pub call_history: Arc>>, + pub result_history: Arc>>, failure_queue: Arc>>, failure_patterns: Arc>>, result_sequence: Arc>>, @@ -37,6 +38,7 @@ impl MockTool { "Mock execution default", ))), call_history: Arc::new(RwLock::new(Vec::new())), + result_history: Arc::new(RwLock::new(Vec::new())), failure_queue: Arc::new(RwLock::new(VecDeque::new())), failure_patterns: Arc::new(RwLock::new(Vec::new())), result_sequence: Arc::new(RwLock::new(VecDeque::new())), @@ -58,6 +60,16 @@ impl MockTool { self.call_history.read().await.len() } + /// Retrieve a clone of the full result history. 
+ pub async fn results(&self) -> Vec { + self.result_history.read().await.clone() + } + + /// Returns the most recent result, or `None` if never executed. + pub async fn last_result(&self) -> Option { + self.result_history.read().await.last().cloned() + } + /// Queue failures for the next N calls. pub async fn fail_next(&self, count: usize, error_msg: &str) { let mut queue = self.failure_queue.write().await; @@ -110,12 +122,18 @@ impl SimpleTool for MockTool { async fn execute(&self, input: ToolInput) -> ToolResult { self.call_history.write().await.push(input.clone()); + let start = std::time::Instant::now(); // 1. Drain failure queue { let mut queue = self.failure_queue.write().await; if let Some(err) = queue.pop_front() { - return ToolResult::failure(err); + let mut result = ToolResult::failure(err); + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + self.result_history.write().await.push(result.clone()); + return result; } } @@ -124,7 +142,12 @@ impl SimpleTool for MockTool { let patterns = self.failure_patterns.read().await; for (pattern, err) in patterns.iter() { if input.arguments == *pattern { - return ToolResult::failure(err); + let mut result = ToolResult::failure(err); + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + self.result_history.write().await.push(result.clone()); + return result; } } } @@ -133,11 +156,21 @@ impl SimpleTool for MockTool { { let mut seq = self.result_sequence.write().await; if let Some(result) = seq.pop_front() { + let mut result = result; + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + self.result_history.write().await.push(result.clone()); return result; } } - self.stubbed_result.read().await.clone() + let mut result = self.stubbed_result.read().await.clone(); + result + .metadata + .insert("duration_ms".to_string(), start.elapsed().as_millis().to_string()); + 
self.result_history.write().await.push(result.clone()); + result } fn category(&self) -> ToolCategory { diff --git a/tests/tests/agent_runner_tests.rs b/tests/tests/agent_runner_tests.rs new file mode 100644 index 000000000..250b9e62d --- /dev/null +++ b/tests/tests/agent_runner_tests.rs @@ -0,0 +1,311 @@ +use mofa_testing::agent_runner::AgentTestRunner; +use mofa_testing::assertions::assert_session_messages; +use mofa_testing::tools::MockTool; +use serde_json::json; + +#[tokio::test] +async fn agent_runner_executes_and_captures_output() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .mock_llm() + .add_response("Mocked response") + .await; + + let result = runner + .run_text("hello") + .await + .expect("run should succeed"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("Mocked response")); + assert_eq!( + result.metadata.session_id.as_deref(), + Some(runner.session_id()) + ); + assert_eq!(result.metadata.execution_id, runner.execution_id()); + assert_eq!(result.metadata.runner_stats_before.total_executions, 0); + assert_eq!(result.metadata.runner_stats_after.total_executions, 1); + assert!(result.metadata.session_snapshot.is_some()); + let snapshot = result.metadata.session_snapshot.as_ref().unwrap(); + assert_eq!(snapshot.len(), 2); + assert_session_messages(snapshot, &[("user", "hello"), ("assistant", "Mocked response")]); + + let expected_session_path = format!("sessions/{}.jsonl", runner.session_id()); + assert!( + !result + .metadata + .workspace_snapshot_before + .files + .iter() + .any(|file| file.relative_path == expected_session_path) + ); + assert!( + result + .metadata + .workspace_snapshot_after + .files + .iter() + .any(|file| file.relative_path == expected_session_path) + ); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_creates_isolated_workspaces() { + let mut runner_a = AgentTestRunner::new().await.expect("runner 
A initializes"); + let mut runner_b = AgentTestRunner::new().await.expect("runner B initializes"); + + assert_ne!(runner_a.workspace(), runner_b.workspace()); + + runner_a + .mock_llm() + .add_response("Response A") + .await; + runner_b + .mock_llm() + .add_response("Response B") + .await; + + let _ = runner_a + .run_text("hi") + .await + .expect("runner A executes"); + let _ = runner_b + .run_text("hi") + .await + .expect("runner B executes"); + + // Session files should exist in each separate workspace. + let session_a = runner_a + .workspace() + .join("sessions") + .join(format!("{}.jsonl", runner_a.session_id())); + let session_b = runner_b + .workspace() + .join("sessions") + .join(format!("{}.jsonl", runner_b.session_id())); + + assert!(session_a.exists()); + assert!(session_b.exists()); + + runner_a.shutdown().await.expect("shutdown A"); + runner_b.shutdown().await.expect("shutdown B"); +} + +#[tokio::test] +async fn agent_runner_executes_tool_calls() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + + let tool = MockTool::new( + "echo_tool", + "Echo the provided input", + json!({ + "type": "object", + "properties": { + "input": { "type": "string" } + }, + "required": ["input"] + }), + ); + + runner + .register_mock_tool(tool.clone()) + .await + .expect("tool registered"); + + // First response triggers a tool call; second response is the final answer. + runner + .mock_llm() + .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None) + .await; + runner + .mock_llm() + .add_response("Final response") + .await; + + let result = runner + .run_text("use tool") + .await + .expect("run should succeed"); + + // Tool call should be captured in both tool history and run metadata. 
+ assert_eq!(result.output_text().as_deref(), Some("Final response")); + assert_eq!(tool.call_count().await, 1); + let last_call = tool.last_call().await.expect("tool call captured"); + assert_eq!(last_call.arguments, json!({ "input": "ping" })); + assert_eq!(result.metadata.tool_calls.len(), 1); + let record = &result.metadata.tool_calls[0]; + assert_eq!(record.tool_name, "echo_tool"); + assert_eq!(record.input, json!({ "input": "ping" })); + assert!(record.success); + assert_eq!( + record.output, + Some(json!("Mock execution default")) + ); + assert!(record.duration_ms.is_some()); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_loads_bootstrap_files() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .write_bootstrap_file("AGENTS.md", "Bootstrap content for agent test.") + .expect("bootstrap file written"); + + runner + .mock_llm() + .add_response("Bootstrapped response") + .await; + + let _ = runner + .run_text("check prompt") + .await + .expect("run should succeed"); + + let request = runner + .mock_llm() + .last_request() + .await + .expect("request captured"); + // Validate the system message includes the bootstrap content. + let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + + assert!(system_message.contains("AGENTS.md")); + assert!(system_message.contains("Bootstrap content for agent test.")); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_supports_multi_turn_runs() { + // Multi-turn helper should keep the same session and extend history. 
+ let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("First reply").await; + runner.mock_llm().add_response("Second reply").await; + + let results = runner + .run_texts(&["turn one", "turn two"]) + .await + .expect("multi-turn run succeeds"); + + assert_eq!(results.len(), 2); + assert_eq!(results[0].output_text().as_deref(), Some("First reply")); + assert_eq!(results[1].output_text().as_deref(), Some("Second reply")); + + // Session snapshot should contain two user/assistant pairs. + let snapshot = results + .last() + .and_then(|result| result.metadata.session_snapshot.as_ref()) + .expect("session snapshot captured"); + assert_eq!(snapshot.len(), 4); + assert_session_messages( + snapshot, + &[ + ("user", "turn one"), + ("assistant", "First reply"), + ("user", "turn two"), + ("assistant", "Second reply"), + ], + ); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_customizes_prompt_identity_and_bootstraps() { + // Custom identity + bootstrap list should appear in the system prompt. + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .write_bootstrap_file("CUSTOM.md", "Custom bootstrap content.") + .expect("bootstrap file written"); + runner + .configure_prompt( + Some(mofa_foundation::agent::context::prompt::AgentIdentity { + name: "TestAgent".to_string(), + description: "Custom identity".to_string(), + icon: None, + }), + Some(vec!["CUSTOM.md".to_string()]), + ) + .await; + + runner.mock_llm().add_response("Custom response").await; + let _ = runner + .run_text("custom prompt") + .await + .expect("run should succeed"); + + let request = runner + .mock_llm() + .last_request() + .await + .expect("request captured"); + // Validate custom identity and bootstrap content. 
+ let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + + assert!(system_message.contains("TestAgent")); + assert!(system_message.contains("Custom identity")); + assert!(system_message.contains("CUSTOM.md")); + assert!(system_message.contains("Custom bootstrap content.")); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_captures_llm_failure() { + // LLM failures should surface in AgentRunResult with failed stats. + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .mock_llm() + .add_error_response("mock failure") + .await; + + let result = runner + .run_text("trigger failure") + .await + .expect("run should return result"); + + assert!(!result.is_success()); + let error = result.error.expect("error captured"); + assert!(error.to_string().contains("mock failure")); + assert_eq!(result.metadata.runner_stats_after.failed_executions, 1); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn agent_runner_allows_custom_session_keys() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner + .mock_llm() + .add_response("Custom session response") + .await; + + let result = runner + .run_text_with_session("custom-session", "hello session") + .await + .expect("run should succeed"); + + assert_eq!( + result.metadata.session_id.as_deref(), + Some("custom-session") + ); + let session_path = runner + .workspace() + .join("sessions") + .join("custom-session.jsonl"); + assert!(session_path.exists()); + + runner.shutdown().await.expect("shutdown succeeds"); +} diff --git a/tests/tests/artifact_tests.rs b/tests/tests/artifact_tests.rs new file mode 100644 index 000000000..1981f122e --- /dev/null +++ b/tests/tests/artifact_tests.rs @@ -0,0 +1,136 @@ +//! Tests for canonical DSL run artifact generation. 
+ +use mofa_testing::{ + AgentRunArtifact, TestCaseDsl, assertion_error_from_outcomes, collect_assertion_outcomes, + execute_test_case, +}; + +#[tokio::test] +async fn artifact_contains_core_run_data() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/tool_agent.toml" + )) + .expect("tool DSL example should parse"); + + let result = execute_test_case(&case) + .await + .expect("DSL case should execute"); + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions); + + assert_eq!(artifact.case_name, "tool_agent_run"); + assert_eq!(artifact.status, "passed"); + assert_eq!(artifact.output_text.as_deref(), Some("Tool execution complete")); + assert_eq!(artifact.tool_calls.len(), 1); + assert_eq!(artifact.tool_calls[0].tool_name, "echo_tool"); + assert!(!artifact.agent.name.is_empty()); + assert!(artifact.llm_request.is_some()); + assert!(artifact.workspace_after.files.iter().any(|file| { + file.relative_path.ends_with(".jsonl") + })); +} + +#[tokio::test] +async fn artifact_captures_failed_assertions() { + let case = TestCaseDsl::from_toml_str( + r#" +name = "failing_case" +prompt = "Say hello" + +[llm] +responses = ["wrong output"] + +[assert] +contains = "expected text" +"#, + ) + .expect("inline DSL should parse"); + + let result = execute_test_case(&case) + .await + .expect("DSL case should execute"); + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions.clone()); + + assert_eq!(artifact.status, "failed"); + assert_eq!(artifact.assertions.len(), 1); + assert!(!artifact.assertions[0].passed); + assert!(assertion_error_from_outcomes(&assertions).is_some()); +} + +#[tokio::test] +async fn artifact_compare_to_returns_match_for_identical_runs() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/simple_agent.toml" + )) 
+ .expect("DSL example should parse"); + + let result = execute_test_case(&case) + .await + .expect("DSL case should execute"); + let assertions = collect_assertion_outcomes(&case, &result); + let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions); + + let diff = artifact.compare_to(&artifact); + + assert!(diff.matches); + assert!(diff.differences.is_empty()); +} + +#[tokio::test] +async fn artifact_compare_to_detects_output_and_tool_changes() { + let baseline = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/tool_agent.toml" + )) + .expect("tool DSL should parse"); + let baseline_result = execute_test_case(&baseline) + .await + .expect("baseline DSL should execute"); + let baseline_assertions = collect_assertion_outcomes(&baseline, &baseline_result); + let baseline_artifact = + AgentRunArtifact::from_run_result(&baseline, &baseline_result, baseline_assertions); + + let candidate = TestCaseDsl::from_toml_str( + r#" +name = "tool_agent_run" +input = "Use the echo tool and summarize the result." + +[[tools]] +name = "lookup_tool" +description = "Lookup the provided input." 
+schema = { type = "object", properties = { input = { type = "string" } }, required = ["input"] } +result = "lookup result" + +[assert] +contains = "Different output" +tool_called = "lookup_tool" + +[llm] + +[[llm.steps]] +type = "tool_call" +tool = "lookup_tool" +arguments = { input = "ping" } + +[[llm.steps]] +type = "text" +content = "Different output" +"#, + ) + .expect("candidate DSL should parse"); + let candidate_result = execute_test_case(&candidate) + .await + .expect("candidate DSL should execute"); + let candidate_assertions = collect_assertion_outcomes(&candidate, &candidate_result); + let candidate_artifact = + AgentRunArtifact::from_run_result(&candidate, &candidate_result, candidate_assertions); + + let diff = candidate_artifact.compare_to(&baseline_artifact); + + assert!(!diff.matches); + assert!(diff.differences.iter().any(|item| item.field == "output_text")); + assert!(diff.differences.iter().any(|item| item.field == "tool_calls")); +} diff --git a/tests/tests/assertion_macro_tests.rs b/tests/tests/assertion_macro_tests.rs index 55c2af2f4..c27aee010 100644 --- a/tests/tests/assertion_macro_tests.rs +++ b/tests/tests/assertion_macro_tests.rs @@ -1,11 +1,11 @@ -//! Tests for assertion macros: assert_tool_called!, assert_tool_called_with!, -//! assert_infer_called!, assert_bus_message_sent!. +//! Tests for assertion macros and assertion helpers used by the testing crate. 
use mofa_foundation::agent::components::tool::SimpleTool; use mofa_foundation::orchestrator::{ModelOrchestrator, ModelProviderConfig, ModelType}; use mofa_kernel::agent::components::tool::ToolInput; use mofa_kernel::bus::CommunicationMode; use mofa_kernel::message::AgentMessage; +use mofa_testing::agent_runner::AgentTestRunner; use mofa_testing::backend::MockLLMBackend; use mofa_testing::bus::MockAgentBus; use mofa_testing::tools::MockTool; @@ -121,3 +121,89 @@ async fn assert_bus_message_sent_panics_when_no_message_from_sender() { mofa_testing::assert_bus_message_sent!(bus, "agent-2"); } + +// =================================================================== +// New assertion helpers +// =================================================================== + +#[tokio::test] +async fn assert_tool_last_result_passes_on_matching_output() { + let tool = MockTool::new("search", "Search tool", json!({"type": "object"})); + tool.execute(ToolInput::from_json(json!({"query": "rust"}))) + .await; + + mofa_testing::assert_tool_last_result!(tool, json!("Mock execution default")); +} + +#[tokio::test] +async fn assert_agent_output_text_passes_on_matching_output() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("hello from runner").await; + + let result = runner.run_text("hello").await.expect("run succeeds"); + + mofa_testing::assert_agent_output_text!(result, "hello from runner"); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn assert_run_failed_with_passes_on_matching_error() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_error_response("mock failure").await; + + let result = runner.run_text("hello").await.expect("run completes"); + + mofa_testing::assert_run_failed_with!(result, "mock failure"); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn 
assert_workspace_contains_file_passes_when_snapshot_has_file() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("workspace ready").await; + + let result = runner.run_text("hello").await.expect("run succeeds"); + let expected = format!("sessions/{}.jsonl", runner.session_id()); + + mofa_testing::assert_workspace_contains_file!(result.metadata.workspace_snapshot_after, expected); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn assert_run_recorded_tool_call_passes_when_tool_metadata_exists() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + let tool = MockTool::new( + "echo_tool", + "Echo tool", + json!({ + "type": "object", + "properties": { "input": { "type": "string" } }, + "required": ["input"] + }), + ); + runner + .register_mock_tool(tool) + .await + .expect("tool registered"); + runner + .mock_llm() + .add_tool_call_response("echo_tool", json!({ "input": "ping" }), None) + .await; + runner.mock_llm().add_response("done").await; + + let result = runner.run_text("use tool").await.expect("run succeeds"); + + mofa_testing::assert_run_recorded_tool_call!(result, "echo_tool"); + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn assert_runner_total_executions_passes_on_first_run() { + let mut runner = AgentTestRunner::new().await.expect("runner initializes"); + runner.mock_llm().add_response("counted").await; + + let result = runner.run_text("hello").await.expect("run succeeds"); + + mofa_testing::assert_runner_total_executions!(result, 1); + runner.shutdown().await.expect("shutdown succeeds"); +} diff --git a/tests/tests/dsl_tests.rs b/tests/tests/dsl_tests.rs new file mode 100644 index 000000000..273366211 --- /dev/null +++ b/tests/tests/dsl_tests.rs @@ -0,0 +1,108 @@ +//! Integration tests for the minimal TOML DSL adapter. 
+ +use mofa_testing::{configure_runner_from_test_case, run_test_case, AgentTestRunner, TestCaseDsl}; + +#[tokio::test] +async fn toml_dsl_runs_through_agent_runner() { + // Load the example DSL from the crate so the test exercises parsing and + // adapter execution together. + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/simple_agent.toml" + )) + .expect("DSL example should parse"); + + assert_eq!(case.name, "simple_agent_run"); + + let result = run_test_case(&case) + .await + .expect("DSL case should run successfully"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("hello from DSL")); +} + +#[tokio::test] +async fn toml_dsl_supports_bootstrap_files() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/bootstrap_agent.toml" + )) + .expect("bootstrap DSL example should parse"); + + let mut runner = AgentTestRunner::new() + .await + .expect("runner should initialize"); + + configure_runner_from_test_case(&case, &mut runner) + .await + .expect("DSL bootstrap config should apply"); + + let _ = runner + .run_text(case.prompt.as_deref().expect("prompt should be present")) + .await + .expect("bootstrap run should succeed"); + + let request = runner + .mock_llm() + .last_request() + .await + .expect("request should be captured"); + let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + + assert!(system_message.contains("AGENTS.md")); + assert!(system_message.contains("Bootstrapped instructions for the DSL test.")); + + runner.shutdown().await.expect("shutdown succeeds"); +} + +#[tokio::test] +async fn toml_dsl_supports_tool_backed_runs() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/tool_agent.toml" + )) + .expect("tool DSL example should parse"); + + let result = run_test_case(&case) + .await + .expect("tool-backed DSL 
case should run successfully"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("Tool execution complete")); + assert_eq!(result.metadata.tool_calls.len(), 1); + assert_eq!(result.metadata.tool_calls[0].tool_name, "echo_tool"); + + let request = result + .metadata + .llm_last_request + .as_ref() + .expect("request should be captured"); + let system_message = request + .messages + .first() + .and_then(|msg| msg.content.as_deref()) + .expect("system message content"); + assert!(system_message.contains("ToolAgent")); + assert!(system_message.contains("Agent used to validate tool-aware DSL execution.")); +} + +#[tokio::test] +async fn toml_dsl_supports_tape_backed_runs() { + let case = TestCaseDsl::from_toml_file(concat!( + env!("CARGO_MANIFEST_DIR"), + "/examples/simple_agent_tape.toml" + )) + .expect("tape-backed DSL example should parse"); + + let result = run_test_case(&case) + .await + .expect("tape-backed DSL case should run successfully"); + + assert!(result.is_success()); + assert_eq!(result.output_text().as_deref(), Some("hello from tape")); +}