Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
00d3d55
tests: add real agent runner harness
AdityaShome Mar 23, 2026
aed5b97
Add agent runner test harness with workspace isolation and tool/boots…
AdityaShome Mar 23, 2026
4543c4d
Add agent runner metadata, tool capture, and prompt customization
AdityaShome Mar 23, 2026
dcdb83e
Add workspace snapshots, session assertions, and tool timing to agent…
AdityaShome Mar 23, 2026
4bf72fe
Add agent runner examples and session aware runner updates
AdityaShome Mar 23, 2026
5560ff7
Normalize workspace snapshot paths for cross platform agent runner t…
AdityaShome Mar 23, 2026
709f06e
Merge branch 'main' into agent-runner-testing
AdityaShome Mar 30, 2026
9efd792
feat(testing): add agent runner assertions
AdityaShome Mar 30, 2026
f3768d3
feat(testing): add minimal TOML DSL adapter
AdityaShome Mar 30, 2026
b4d3d08
feat(cli): add test dsl command for TOML agent test cases
AdityaShome Mar 31, 2026
3dcceb5
feat(cli): add report output for test dsl command
AdityaShome Mar 31, 2026
c7099b9
feat(testing): add canonical run artifact for test dsl execution
AdityaShome Mar 31, 2026
1e3b47c
feat(testing): add baseline comparison for DSL run artifacts
AdityaShome Apr 1, 2026
23cc2e9
test(testing): add missing bootstrap DSL fixture
AdityaShome Apr 1, 2026
cd23885
feat(testing): add machine readable comparison output
AdityaShome Apr 2, 2026
9e25b9c
test(cli): cover comparison out in test dsl
AdityaShome Apr 2, 2026
aa1d186
feat(testing): add fail on diff CI gate for test dsl
AdityaShome Apr 2, 2026
86ad7b1
feat(testing): add OpenAI compatible recording + replay path
AdityaShome Apr 4, 2026
934d6a4
feat(testing): add tape fixture for replay example
AdityaShome Apr 5, 2026
c788369
test(cli): escape Windows tape paths in DSL integration test
AdityaShome Apr 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ lazy_static = "1.4"
# Actor framework for ReAct agents
ractor = "0"

# TOML deserialization (also used transitively by config)
toml = "0.8"

# Configuration file support (multi-format)
config = { version = "0.14", features = [
"toml",
Expand Down
2 changes: 2 additions & 0 deletions crates/mofa-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ mofa-kernel = { path = "../mofa-kernel", version = "0.1", features = [
] }
mofa-runtime = { path = "../mofa-runtime", version = "0.1" }
mofa-foundation = { path = "../mofa-foundation", version = "0.1" }
mofa-testing = { path = "../../tests", version = "0.1" }
config.workspace = true
tokio = { workspace = true }
thiserror = { workspace = true }
Expand Down Expand Up @@ -90,6 +91,7 @@ tokio-stream = "0.1"
assert_cmd = "2"
predicates = "3"
tempfile = "3"
axum = { workspace = true }

[features]
default = []
Expand Down
111 changes: 111 additions & 0 deletions crates/mofa-cli/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,40 @@ pub enum Commands {
dora: bool,
},

/// Run a testing DSL case file
TestDsl {
/// TOML DSL file to execute
file: PathBuf,

/// Optional canonical artifact file path
#[arg(long)]
artifact_out: Option<PathBuf>,

/// Optional report file path
#[arg(long)]
report_out: Option<PathBuf>,

/// Compare the current artifact against a saved baseline artifact
#[arg(long)]
baseline_in: Option<PathBuf>,

/// Write the current artifact to a baseline file
#[arg(long)]
baseline_out: Option<PathBuf>,

/// Write machine-readable comparison output (requires --baseline-in)
#[arg(long)]
comparison_out: Option<PathBuf>,

/// Exit non-zero when baseline comparison mismatches
#[arg(long)]
fail_on_diff: bool,

/// Report file format
#[arg(long, value_enum, default_value_t = TestDslReportFormat::Json)]
report_format: TestDslReportFormat,
},

/// Run a dora dataflow
#[cfg(feature = "dora")]
Dataflow {
Expand Down Expand Up @@ -219,6 +253,12 @@ pub enum DatabaseType {
Sqlite,
}

/// Report file format selectable through `--report-format` on the `test-dsl` command.
#[derive(Clone, Copy, Debug, ValueEnum, PartialEq, Eq)]
pub enum TestDslReportFormat {
// JSON report; this is the `default_value_t` of `--report-format`.
Json,
// Text report.
Text,
}

impl std::fmt::Display for DatabaseType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Expand Down Expand Up @@ -725,6 +765,77 @@ mod tests {
assert!(parsed.is_ok(), "doctor ci strict json should parse");
}

#[test]
fn test_test_dsl_parses() {
    // The positional DSL file path alone must be a complete invocation.
    let argv = ["mofa", "test-dsl", "tests/examples/simple_agent.toml"];
    assert!(
        Cli::try_parse_from(argv).is_ok(),
        "test-dsl command should parse"
    );
}

#[test]
fn test_test_dsl_report_flags_parse() {
    // `--report-out` together with an explicit `--report-format` value.
    let argv = [
        "mofa",
        "test-dsl",
        "tests/examples/simple_agent.toml",
        "--report-out",
        "/tmp/report.json",
        "--report-format",
        "json",
    ];
    assert!(
        Cli::try_parse_from(argv).is_ok(),
        "test-dsl report flags should parse"
    );
}

#[test]
fn test_test_dsl_artifact_flag_parses() {
    // `--artifact-out` takes a single path argument.
    let argv = [
        "mofa",
        "test-dsl",
        "tests/examples/simple_agent.toml",
        "--artifact-out",
        "/tmp/artifact.json",
    ];
    assert!(
        Cli::try_parse_from(argv).is_ok(),
        "test-dsl artifact flag should parse"
    );
}

#[test]
fn test_test_dsl_baseline_flags_parse() {
    // Reading one baseline and writing another in the same invocation is allowed.
    let argv = [
        "mofa",
        "test-dsl",
        "tests/examples/simple_agent.toml",
        "--baseline-in",
        "/tmp/baseline.json",
        "--baseline-out",
        "/tmp/new-baseline.json",
    ];
    assert!(
        Cli::try_parse_from(argv).is_ok(),
        "test-dsl baseline flags should parse"
    );
}

#[test]
fn test_test_dsl_comparison_flag_parse() {
    // `--comparison-out` is exercised alongside the `--baseline-in` it depends on.
    let argv = [
        "mofa",
        "test-dsl",
        "tests/examples/simple_agent.toml",
        "--baseline-in",
        "/tmp/baseline.json",
        "--comparison-out",
        "/tmp/comparison.json",
    ];
    assert!(
        Cli::try_parse_from(argv).is_ok(),
        "test-dsl comparison flag should parse"
    );
}

#[test]
fn test_test_dsl_fail_on_diff_flag_parse() {
    // `--fail-on-diff` is a bare boolean switch with no value.
    let argv = [
        "mofa",
        "test-dsl",
        "tests/examples/simple_agent.toml",
        "--fail-on-diff",
    ];
    assert!(
        Cli::try_parse_from(argv).is_ok(),
        "test-dsl fail-on-diff flag should parse"
    );
}

#[test]
fn test_rag_index_parses() {
let parsed = Cli::try_parse_from([
Expand Down
1 change: 1 addition & 0 deletions crates/mofa-cli/src/commands/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ pub mod new;
pub mod plugin;
pub mod rag;
pub mod run;
pub mod test_dsl;
pub mod session;
pub mod tool;
213 changes: 213 additions & 0 deletions crates/mofa-cli/src/commands/test_dsl.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
//! `mofa test-dsl` command implementation

use crate::CliError;
use crate::cli::TestDslReportFormat;
use crate::output::OutputFormat;
use mofa_testing::{
AgentRunArtifact, AgentRunArtifactComparison, DslError, JsonFormatter,
ReportFormatter,
TestCaseResult, TestReport, TestStatus, TextFormatter, TestCaseDsl,
assertion_error_from_outcomes, collect_assertion_outcomes, execute_test_case,
};
use serde::Serialize;
use serde_json::json;
use std::path::Path;

/// Serializable summary of one executed DSL test case, printed by `run`.
#[derive(Debug, Serialize)]
struct TestDslSummary {
/// Name of the test case, taken from the parsed DSL case.
name: String,
/// Value of `is_success()` on the run result.
success: bool,
/// Value of `output_text()` on the run result, if any.
output_text: Option<String>,
/// Run duration in milliseconds.
duration_ms: u128,
/// Tool names of the recorded tool calls, in recorded order.
tool_calls: Vec<String>,
/// Display form of the workspace root path from the run metadata.
workspace_root: String,
/// `Some(matches)` when a baseline was compared; `None` when no baseline was supplied.
baseline_matches: Option<bool>,
}

/// Execute one TOML DSL test case through the testing runner.
pub async fn run(
path: &Path,
format: OutputFormat,
artifact_out: Option<&Path>,
report_out: Option<&Path>,
baseline_in: Option<&Path>,
baseline_out: Option<&Path>,
comparison_out: Option<&Path>,
fail_on_diff: bool,
report_format: TestDslReportFormat,
) -> Result<(), CliError> {
let case = TestCaseDsl::from_toml_file(path).map_err(map_dsl_error)?;
let result = execute_test_case(&case).await.map_err(map_dsl_error)?;
let assertions = collect_assertion_outcomes(&case, &result);
let artifact = AgentRunArtifact::from_run_result(&case, &result, assertions.clone());
let report = build_report(&artifact);
let baseline = if let Some(baseline_in) = baseline_in {
Some(read_artifact(baseline_in)?)
} else {
None
};
let baseline_diff = baseline.as_ref().map(|baseline| artifact.compare_to(baseline));

if let Some(artifact_out) = artifact_out {
write_artifact(artifact_out, &artifact)?;
}

if let Some(baseline_out) = baseline_out {
write_artifact(baseline_out, &artifact)?;
}

if let Some(comparison_out) = comparison_out {
let baseline = baseline.as_ref().ok_or_else(|| {
CliError::Other("comparison output requires --baseline-in".to_string())
})?;
let diff = baseline_diff.as_ref().ok_or_else(|| {
CliError::Other("comparison output requires --baseline-in".to_string())
})?;
let comparison = AgentRunArtifactComparison::from_artifacts(
&artifact,
baseline,
diff.clone(),
);
write_comparison(comparison_out, &comparison)?;
}

if let Some(report_out) = report_out {
write_report(report_out, report_format, &report)?;
}

let summary = TestDslSummary {
name: case.name,
success: result.is_success(),
output_text: result.output_text(),
duration_ms: result.duration.as_millis(),
tool_calls: result
.metadata
.tool_calls
.iter()
.map(|record| record.tool_name.clone())
.collect(),
workspace_root: result.metadata.workspace_root.display().to_string(),
baseline_matches: baseline_diff.as_ref().map(|diff| diff.matches),
};

match format {
OutputFormat::Json => {
let output = json!({
"success": true,
"case": summary,
"baseline": baseline_diff,
});
println!("{}", serde_json::to_string_pretty(&output)?);
}
_ => {
println!("case: {}", summary.name);
println!("status: {}", if summary.success { "passed" } else { "failed" });
if let Some(output_text) = &summary.output_text {
println!("output: {}", output_text);
}
if !summary.tool_calls.is_empty() {
println!("tool_calls: {}", summary.tool_calls.join(", "));
}
println!("duration_ms: {}", summary.duration_ms);
if let Some(diff) = &baseline_diff {
println!("baseline: {}", if diff.matches { "matched" } else { "mismatch" });
for difference in &diff.differences {
println!("difference: {}", difference.field);
}
}
}
}

if fail_on_diff {
if let Some(diff) = &baseline_diff {
if !diff.matches {
return Err(CliError::Other("baseline comparison mismatch".to_string()));
}
}
}

if let Some(error) = assertion_error_from_outcomes(&assertions) {
return Err(map_dsl_error(error));
}

Ok(())
}

/// Convert a run artifact into a single-case [`TestReport`] for the `"dsl"` suite.
///
/// The case is marked `Passed` only when the artifact status equals
/// `"passed"`; any other status is reported as `Failed`. The error text is the
/// runner error when present, otherwise the kind of the first failed assertion.
fn build_report(artifact: &AgentRunArtifact) -> TestReport {
    // The report holds exactly one case, so case and suite share one duration.
    let elapsed = std::time::Duration::from_millis(artifact.duration_ms);

    let status = if artifact.status != "passed" {
        TestStatus::Failed
    } else {
        TestStatus::Passed
    };

    // A runner error takes precedence over assertion failures.
    let error = match &artifact.runner_error {
        Some(message) => Some(message.clone()),
        None => artifact
            .assertions
            .iter()
            .find(|outcome| !outcome.passed)
            .map(|outcome| format!("assertion failed: {}", outcome.kind)),
    };

    let case_result = TestCaseResult {
        name: artifact.case_name.clone(),
        status,
        duration: elapsed,
        error,
        metadata: vec![
            (String::from("execution_id"), artifact.execution_id.clone()),
            (String::from("workspace_root"), artifact.workspace_root.clone()),
            (String::from("tool_calls"), artifact.tool_calls.len().to_string()),
        ],
    };

    TestReport {
        suite_name: String::from("dsl"),
        results: vec![case_result],
        total_duration: elapsed,
        timestamp: artifact.started_at_ms,
    }
}

/// Serialize `artifact` as pretty-printed JSON and write it to `path`.
///
/// Serialization and I/O failures are converted into `CliError` via `?`.
fn write_artifact(path: &Path, artifact: &AgentRunArtifact) -> Result<(), CliError> {
    let pretty_json = serde_json::to_string_pretty(artifact)?;
    Ok(std::fs::write(path, pretty_json)?)
}

/// Write the machine-readable baseline comparison to `path` as pretty-printed JSON.
///
/// Serialization and I/O failures are converted into `CliError` via `?`.
fn write_comparison(
    path: &Path,
    comparison: &AgentRunArtifactComparison,
) -> Result<(), CliError> {
    let pretty_json = serde_json::to_string_pretty(comparison)?;
    Ok(std::fs::write(path, pretty_json)?)
}

/// Read the file at `path` and deserialize it from JSON into an [`AgentRunArtifact`].
///
/// I/O and deserialization failures are converted into `CliError` via `?`.
fn read_artifact(path: &Path) -> Result<AgentRunArtifact, CliError> {
    let raw = std::fs::read_to_string(path)?;
    let artifact = serde_json::from_str::<AgentRunArtifact>(&raw)?;
    Ok(artifact)
}

/// Render `report` with the formatter selected by `format` and write it to `path`.
///
/// I/O failures are converted into `CliError` via `?`.
fn write_report(
    path: &Path,
    format: TestDslReportFormat,
    report: &TestReport,
) -> Result<(), CliError> {
    let rendered = match format {
        TestDslReportFormat::Text => TextFormatter.format(report),
        TestDslReportFormat::Json => JsonFormatter.format(report),
    };
    Ok(std::fs::write(path, rendered)?)
}

/// Wrap a [`DslError`] in `CliError::Other`, prefixing its display text with
/// `"DSL test failed: "`.
fn map_dsl_error(error: DslError) -> CliError {
    let message = format!("DSL test failed: {}", error);
    CliError::Other(message)
}
Loading
Loading