diff --git a/libs/arcade-cli/arcade_cli/display.py b/libs/arcade-cli/arcade_cli/display.py index fae74b5e2..a091ce9ce 100644 --- a/libs/arcade-cli/arcade_cli/display.py +++ b/libs/arcade-cli/arcade_cli/display.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Any +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional from arcade_core.schema import ToolDefinition from rich.console import Console @@ -323,13 +324,23 @@ def display_tool_messages(tool_messages: list[dict]) -> None: ) -def display_eval_results(results: list[list[dict[str, Any]]], show_details: bool = False) -> None: +def _display_results_to_console( + output_console: Console, + results: list[list[dict[str, Any]]], + show_details: bool = False, + failed_only: bool = False, + original_counts: Optional[tuple[int, int, int, int]] = None, +) -> None: """ - Display evaluation results in a format inspired by pytest's output. + Display evaluation results to a specific console instance. Args: + output_console: The Console instance to write output to. results: List of dictionaries containing evaluation results for each model. show_details: Whether to show detailed results for each case. + failed_only: Whether only failed cases are being displayed. + original_counts: Optional tuple of (total_cases, total_passed, total_failed, total_warned) + from before filtering. Used when failed_only is True. """ total_passed = 0 total_failed = 0 @@ -343,9 +354,9 @@ def display_eval_results(results: list[list[dict[str, Any]]], show_details: bool cases = model_results.get("cases", []) total_cases += len(cases) - console.print(f"[bold]Model:[/bold] [bold magenta]{model}[/bold magenta]") + output_console.print(f"[bold]Model:[/bold] [bold magenta]{model}[/bold magenta]") if show_details: - console.print(f"[bold magenta]{rubric}[/bold magenta]") + output_console.print(f"[bold magenta]{rubric}[/bold magenta]") for case in cases: evaluation = case["evaluation"] @@ -365,24 +376,76 @@ def display_eval_results(results: list[list[dict[str, Any]]], show_details: bool # Display one-line summary for each case with score as a percentage score_percentage = evaluation.score * 100 - console.print(f"{status} {case['name']} -- Score: {score_percentage:.2f}%") + output_console.print(f"{status} {case['name']} -- Score: {score_percentage:.2f}%") if show_details: # Show detailed information for each case - console.print(f"[bold]User Input:[/bold] {case['input']}\n") - console.print("[bold]Details:[/bold]") - console.print(_format_evaluation(evaluation)) - console.print("-" * 80) - - # Summary - summary = ( - f"[bold]Summary -- [/bold]Total: {total_cases} -- [green]Passed: {total_passed}[/green]" - ) - if total_warned > 0: - summary += f" -- [yellow]Warnings: {total_warned}[/yellow]" - if total_failed > 0: - summary += f" -- [red]Failed: {total_failed}[/red]" - console.print(summary + "\n") + output_console.print(f"[bold]User Input:[/bold] {case['input']}\n") + output_console.print("[bold]Details:[/bold]") + output_console.print(_format_evaluation(evaluation)) + output_console.print("-" * 80) + + # Summary - use original counts if filtering, otherwise use current counts + if failed_only and original_counts: + # Unpack original counts + orig_total, orig_passed, orig_failed, orig_warned = original_counts + + # Show disclaimer before summary + output_console.print( + f"[bold yellow]Note: Showing only {total_cases} failed evaluation(s) (--failed-only)[/bold yellow]" + ) + + # Build summary with original counts + summary = ( + f"[bold]Summary -- [/bold]Total: 
{orig_total} -- [green]Passed: {orig_passed}[/green]" + ) + if orig_warned > 0: + summary += f" -- [yellow]Warnings: {orig_warned}[/yellow]" + if orig_failed > 0: + summary += f" -- [red]Failed: {orig_failed}[/red]" + else: + # Normal summary with current counts + summary = ( + f"[bold]Summary -- [/bold]Total: {total_cases} -- [green]Passed: {total_passed}[/green]" + ) + if total_warned > 0: + summary += f" -- [yellow]Warnings: {total_warned}[/yellow]" + if total_failed > 0: + summary += f" -- [red]Failed: {total_failed}[/red]" + + output_console.print(summary + "\n") + + +def display_eval_results( + results: list[list[dict[str, Any]]], + show_details: bool = False, + output_file: Optional[str] = None, + failed_only: bool = False, + original_counts: Optional[tuple[int, int, int, int]] = None, +) -> None: + """ + Display evaluation results in a format inspired by pytest's output. + + Args: + results: List of dictionaries containing evaluation results for each model. + show_details: Whether to show detailed results for each case. + output_file: Optional file path to write results to (plain text format). + failed_only: Whether only failed cases are being displayed (adds disclaimer). + original_counts: Optional tuple of (total_cases, total_passed, total_failed, total_warned) + from before filtering. Used when failed_only is True. + """ + # Always display to terminal + _display_results_to_console(console, results, show_details, failed_only, original_counts) + + # Also write to file if requested + if output_file: + output_path = Path(output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + file_console = Console(file=f, width=120, legacy_windows=False) + _display_results_to_console( + file_console, results, show_details, failed_only, original_counts + ) def _format_evaluation(evaluation: "EvaluationResult") -> str: diff --git a/libs/arcade-cli/arcade_cli/main.py b/libs/arcade-cli/arcade_cli/main.py index 3a1e773db..4f22fc86f 100644 --- a/libs/arcade-cli/arcade_cli/main.py +++ b/libs/arcade-cli/arcade_cli/main.py @@ -32,6 +32,7 @@ Provider, compute_base_url, compute_login_url, + filter_failed_evaluations, get_eval_files, handle_cli_error, load_eval_suites, @@ -373,6 +374,18 @@ def evals( "-k", help="The model provider API key. 
If not provided, will look for the appropriate environment variable based on the provider (e.g., OPENAI_API_KEY for openai provider), first in the current environment, then in the current working directory's .env file.", ), + failed_only: bool = typer.Option( + False, + "--failed-only", + "-f", + help="Show only failed evaluations", + ), + output: Optional[str] = typer.Option( + None, + "--output", + "-o", + help="Write results to a file (plain text format)", + ), debug: bool = typer.Option(False, "--debug", help="Show debug information"), ) -> None: """ @@ -458,7 +471,19 @@ async def run_evaluations() -> None: # TODO error handling on each eval all_evaluations.extend(results) - display_eval_results(all_evaluations, show_details=show_details) + + # Filter to show only failed evaluations if requested + original_counts = None + if failed_only: + all_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + display_eval_results( + all_evaluations, + show_details=show_details, + output_file=output, + failed_only=failed_only, + original_counts=original_counts, + ) try: asyncio.run(run_evaluations()) diff --git a/libs/arcade-cli/arcade_cli/utils.py b/libs/arcade-cli/arcade_cli/utils.py index ed0b7d0aa..d5c4f7677 100644 --- a/libs/arcade-cli/arcade_cli/utils.py +++ b/libs/arcade-cli/arcade_cli/utils.py @@ -958,6 +958,65 @@ def resolve_provider_api_key(provider: Provider, provider_api_key: str | None = return None +def filter_failed_evaluations( + all_evaluations: list[list[dict[str, Any]]], +) -> tuple[list[list[dict[str, Any]]], tuple[int, int, int, int]]: + """ + Filter evaluation results to show only failed cases and calculate original counts. + + Args: + all_evaluations: List of evaluation results with structure: + [[{model: str, rubric: str, cases: [{name, input, evaluation}]}]] + + Returns: + Tuple of (filtered_evaluations, original_counts) where original_counts is + (total_cases, total_passed, total_failed, total_warned) + """ + original_total_cases = 0 + original_total_passed = 0 + original_total_failed = 0 + original_total_warned = 0 + + # Calculate original counts before filtering + for eval_suite in all_evaluations: + for model_results in eval_suite: + for case in model_results.get("cases", []): + evaluation = case["evaluation"] + original_total_cases += 1 + if evaluation.passed: + original_total_passed += 1 + elif evaluation.warning: + original_total_warned += 1 + else: + original_total_failed += 1 + + # Filter to show only failed evaluations + filtered_evaluations = [] + for eval_suite in all_evaluations: + filtered_suite = [] + for model_results in eval_suite: + filtered_cases = [ + case + for case in model_results.get("cases", []) + if not case["evaluation"].passed and not case["evaluation"].warning + ] + if filtered_cases: # Only include model results with failed cases + filtered_model_results = model_results.copy() + filtered_model_results["cases"] = filtered_cases + filtered_suite.append(filtered_model_results) + if filtered_suite: + filtered_evaluations.append(filtered_suite) + + original_counts = ( + original_total_cases, + original_total_passed, + original_total_failed, + original_total_warned, + ) + + return filtered_evaluations, original_counts + + def require_dependency( package_name: str, command_name: str, diff --git a/libs/tests/cli/test_display.py b/libs/tests/cli/test_display.py new file mode 100644 index 000000000..6eac646c9 --- /dev/null +++ b/libs/tests/cli/test_display.py @@ -0,0 +1,558 @@ +import tempfile +from pathlib import Path +from 
unittest.mock import Mock + +from arcade_cli.display import display_eval_results +from arcade_evals.eval import EvaluationResult + + +def create_mock_evaluation_result(passed: bool, warning: bool, score: float) -> Mock: + """Create a mock EvaluationResult with the specified properties.""" + evaluation = Mock(spec=EvaluationResult) + evaluation.passed = passed + evaluation.warning = warning + evaluation.score = score + evaluation.failure_reason = None + evaluation.results = [] + return evaluation + + +def test_display_eval_results_normal() -> None: + """Test normal display without filtering.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Test Case 2", + "input": "Test input 2", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.5 + ), + }, + ], + } + ] + ] + + # Should not raise any exceptions + display_eval_results(results, show_details=False) + + +def test_display_eval_results_with_failed_only() -> None: + """Test display with failed_only flag and original counts.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 3 total, 1 passed, 1 failed, 1 warned + original_counts = (3, 1, 1, 1) + + # Should not raise any exceptions + display_eval_results( + results, + show_details=False, + failed_only=True, + original_counts=original_counts, + ) + + +def test_display_eval_results_with_output_file() -> None: + """Test display with output file.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.9 + ), + }, + ], + } + ] + ] + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + ) + + # Verify file was created + assert output_file.exists() + + # Verify file contains some expected content + content = output_file.read_text() + assert "Model:" in content or "gpt-4o" in content + + +def test_display_eval_results_with_output_file_and_failed_only() -> None: + """Test display with both output file and failed_only flag.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + } + ] + ] + + original_counts = (5, 3, 1, 1) + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + failed_only=True, + original_counts=original_counts, + ) + + # Verify file was created + assert output_file.exists() + + # Verify file contains disclaimer and summary + content = output_file.read_text() + assert "failed-only" in content.lower() or "failed evaluation" in content.lower() + assert "Total: 5" in content # Should show original total + + +def test_display_eval_results_creates_parent_directories() -> None: + """Test that output file creates parent 
directories if they don't exist.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.9 + ), + }, + ], + } + ] + ] + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "nested" / "path" / "test_output.txt" + + # Parent directories don't exist yet + assert not output_file.parent.exists() + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + ) + + # Parent directories should be created + assert output_file.parent.exists() + assert output_file.exists() + + +def test_display_eval_results_with_warnings() -> None: + """Test display with cases that have warnings.""" + results: list = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Warning Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=True, score=0.85 + ), + }, + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Should not raise any exceptions + display_eval_results(results, show_details=False) + + +def test_display_eval_results_empty_results() -> None: + """Test display with empty results.""" + results: list = [] + + # Should not raise any exceptions + display_eval_results(results, show_details=False) + + +def test_display_eval_results_with_details() -> None: + """Test display with show_details=True.""" + evaluation = create_mock_evaluation_result(passed=True, warning=False, score=0.95) + evaluation.results = [ + { + "field": "test_field", + "match": True, + "score": 1.0, + "weight": 1.0, + "expected": "expected_value", + "actual": "actual_value", + "is_criticized": True, + } + ] + + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": evaluation, + }, + ], + } + ] + ] + + # Should not raise any exceptions + display_eval_results(results, show_details=True) + + +def test_display_eval_results_with_failed_only_no_warnings() -> None: + """Test display with failed_only but original counts have no warnings.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 10 total, 8 passed, 2 failed, 0 warned + original_counts = (10, 8, 2, 0) + + display_eval_results( + results, + show_details=False, + failed_only=True, + original_counts=original_counts, + ) + + +def test_display_eval_results_with_failed_only_no_failed() -> None: + """Test display with failed_only but original counts have no failed.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 5 total, 5 passed, 0 failed, 0 warned (edge case) + original_counts = (5, 5, 0, 0) + + display_eval_results( + results, + show_details=False, + failed_only=True, + original_counts=original_counts, + ) + + +def test_display_eval_results_multiple_suites() -> None: + """Test display with multiple eval suites.""" + results = [ + [ + { + 
"model": "gpt-4o", + "rubric": "Test Rubric 1", + "cases": [ + { + "name": "Test Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + ], + } + ], + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric 2", + "cases": [ + { + "name": "Test Case 2", + "input": "Test input 2", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.5 + ), + }, + ], + } + ], + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_multiple_models() -> None: + """Test display with multiple models in same suite.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + ], + }, + { + "model": "gpt-3.5-turbo", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case 2", + "input": "Test input 2", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.5 + ), + }, + ], + }, + ] + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_summary_with_warnings() -> None: + """Test summary display when warnings are present.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Warning Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=True, score=0.85 + ), + }, + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_summary_only_passed() -> None: + """Test summary when all cases passed.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Passed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.98 + ), + }, + ], + } + ] + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_failed_only_with_warnings_in_summary() -> None: + """Test failed_only display when original counts include warnings.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 10 total, 7 passed, 2 failed, 1 warned + original_counts = (10, 7, 2, 1) + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + failed_only=True, + original_counts=original_counts, + ) + + content = output_file.read_text() + # Should show warnings in summary + assert "Warnings: 1" in content or "Warnings" in content + + +def test_display_eval_results_with_details_and_output() -> None: + """Test display with details and output file.""" + evaluation = 
create_mock_evaluation_result(passed=True, warning=False, score=0.95) + evaluation.results = [ + { + "field": "test_field", + "match": True, + "score": 1.0, + "weight": 1.0, + "expected": "expected_value", + "actual": "actual_value", + "is_criticized": True, + } + ] + + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": evaluation, + }, + ], + } + ] + ] + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=True, + output_file=str(output_file), + ) + + assert output_file.exists() + content = output_file.read_text() + assert "User Input:" in content + assert "Details:" in content diff --git a/libs/tests/cli/test_main_evals.py b/libs/tests/cli/test_main_evals.py new file mode 100644 index 000000000..51d45cfce --- /dev/null +++ b/libs/tests/cli/test_main_evals.py @@ -0,0 +1,255 @@ +from unittest.mock import Mock + +from arcade_cli.utils import filter_failed_evaluations +from arcade_evals.eval import EvaluationResult + + +def create_mock_evaluation_result(passed: bool, warning: bool, score: float) -> Mock: + """Create a mock EvaluationResult with the specified properties.""" + evaluation = Mock(spec=EvaluationResult) + evaluation.passed = passed + evaluation.warning = warning + evaluation.score = score + evaluation.failure_reason = None + evaluation.results = [] + return evaluation + + +def test_filter_failed_evaluations_mixed_results() -> None: + """Test filtering logic with mixed passed, failed, and warned cases.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Warning Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=True, score=0.85 + ), + }, + { + "name": "Failed Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + { + "name": "Failed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + } + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (4, 1, 2, 1) + + # Verify filtered results only contain failed cases + assert len(filtered_evaluations) == 1 + assert len(filtered_evaluations[0]) == 1 + assert len(filtered_evaluations[0][0]["cases"]) == 2 + assert filtered_evaluations[0][0]["cases"][0]["name"] == "Failed Case 1" + assert filtered_evaluations[0][0]["cases"][1]["name"] == "Failed Case 2" + + +def test_filter_failed_evaluations_all_passed() -> None: + """Test filtering when all cases passed (should return empty).""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Passed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.98 + ), + }, + ], + } + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (2, 2, 0, 
0) + + # Verify filtered results are empty (no failed cases) + assert len(filtered_evaluations) == 0 + + +def test_filter_failed_evaluations_multiple_suites() -> None: + """Test filtering with multiple eval suites.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric 1", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ], + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric 2", + "cases": [ + { + "name": "Failed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + } + ], + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (3, 1, 2, 0) + + # Verify filtered results + assert len(filtered_evaluations) == 2 + assert len(filtered_evaluations[0][0]["cases"]) == 1 + assert len(filtered_evaluations[1][0]["cases"]) == 1 + + +def test_filter_failed_evaluations_multiple_models() -> None: + """Test filtering with multiple models in same suite.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + }, + { + "model": "gpt-3.5-turbo", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Failed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + }, + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (3, 1, 2, 0) + + # Verify filtered results - should have both models with failed cases + assert len(filtered_evaluations) == 1 + assert len(filtered_evaluations[0]) == 2 # Both models have failed cases + assert len(filtered_evaluations[0][0]["cases"]) == 1 # First model has 1 failed + assert len(filtered_evaluations[0][1]["cases"]) == 1 # Second model has 1 failed + + +def test_filter_failed_evaluations_model_with_no_failed() -> None: + """Test filtering when one model has no failed cases.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + ], + }, + { + "model": "gpt-3.5-turbo", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + }, + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (2, 1, 1, 0) + + # Verify filtered results - only second model should be included + assert len(filtered_evaluations) == 1 + assert len(filtered_evaluations[0]) == 1 # Only one model with failed cases + assert filtered_evaluations[0][0]["model"] == "gpt-3.5-turbo" + assert 
len(filtered_evaluations[0][0]["cases"]) == 1 diff --git a/pyproject.toml b/pyproject.toml index cfa1c4abd..2eea8410d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,10 @@ [project] name = "arcade-mcp" -version = "1.5.8" +version = "1.6.0" description = "Arcade.dev - Tool Calling platform for Agents" readme = "README.md" -license = {file = "LICENSE"} -authors = [ - {name = "Arcade", email = "dev@arcade.dev"}, -] +license = { file = "LICENSE" } +authors = [{ name = "Arcade", email = "dev@arcade.dev" }] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", @@ -87,10 +85,7 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = [ - "libs/arcade-cli/arcade_cli", - "libs/arcade-evals/arcade_evals", -] +packages = ["libs/arcade-cli/arcade_cli", "libs/arcade-evals/arcade_evals"] [tool.uv.workspace] members = [ @@ -111,7 +106,7 @@ warn_unused_ignores = true show_error_codes = true ignore_missing_imports = true exclude = [ - '.*{{.*}}.*' # Ignore files that have names that use Jinja template syntax + '.*{{.*}}.*', # Ignore files that have names that use Jinja template syntax ] [tool.pytest.ini_options] @@ -131,11 +126,7 @@ addopts = [ [tool.coverage.run] source = ["libs"] -omit = [ - "*/tests/*", - "*/test_*", - "*/__pycache__/*", -] +omit = ["*/tests/*", "*/test_*", "*/__pycache__/*"] parallel = true patch = ["subprocess"]
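
Taken together, the two new `evals` options compose exactly as `run_evaluations` does above: filter first, then hand the pre-filter totals to the display layer so the summary still reflects the full run. A minimal sketch of that flow, assuming `all_evaluations` has already been collected by the eval suites and that `failed_only`, `show_details`, and `output` are bound as in the `evals` command (the surrounding variable names are taken from this diff; nothing else is):

    from arcade_cli.display import display_eval_results
    from arcade_cli.utils import filter_failed_evaluations

    original_counts = None
    if failed_only:
        # Drop every case that passed or only warned; keep the original
        # (total, passed, failed, warned) counts for the summary line.
        all_evaluations, original_counts = filter_failed_evaluations(all_evaluations)

    display_eval_results(
        all_evaluations,
        show_details=show_details,
        output_file=output,        # when set, the report is also mirrored to a plain-text file
        failed_only=failed_only,   # prints the "--failed-only" disclaimer before the summary
        original_counts=original_counts,
    )

From the command line this corresponds to something like `arcade evals ./evals --failed-only --output results.txt`; the `arcade` entry-point name and the evals directory argument are assumed for illustration and are not shown in this diff.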