diff --git a/libs/arcade-cli/arcade_cli/display.py b/libs/arcade-cli/arcade_cli/display.py index fae74b5e2..a091ce9ce 100644 --- a/libs/arcade-cli/arcade_cli/display.py +++ b/libs/arcade-cli/arcade_cli/display.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Any +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional from arcade_core.schema import ToolDefinition from rich.console import Console @@ -323,13 +324,23 @@ def display_tool_messages(tool_messages: list[dict]) -> None: ) -def display_eval_results(results: list[list[dict[str, Any]]], show_details: bool = False) -> None: +def _display_results_to_console( + output_console: Console, + results: list[list[dict[str, Any]]], + show_details: bool = False, + failed_only: bool = False, + original_counts: Optional[tuple[int, int, int, int]] = None, +) -> None: """ - Display evaluation results in a format inspired by pytest's output. + Display evaluation results to a specific console instance. Args: + output_console: The Console instance to write output to. results: List of dictionaries containing evaluation results for each model. show_details: Whether to show detailed results for each case. + failed_only: Whether only failed cases are being displayed. + original_counts: Optional tuple of (total_cases, total_passed, total_failed, total_warned) + from before filtering. Used when failed_only is True. """ total_passed = 0 total_failed = 0 @@ -343,9 +354,9 @@ def display_eval_results(results: list[list[dict[str, Any]]], show_details: bool cases = model_results.get("cases", []) total_cases += len(cases) - console.print(f"[bold]Model:[/bold] [bold magenta]{model}[/bold magenta]") + output_console.print(f"[bold]Model:[/bold] [bold magenta]{model}[/bold magenta]") if show_details: - console.print(f"[bold magenta]{rubric}[/bold magenta]") + output_console.print(f"[bold magenta]{rubric}[/bold magenta]") for case in cases: evaluation = case["evaluation"] @@ -365,24 +376,76 @@ def display_eval_results(results: list[list[dict[str, Any]]], show_details: bool # Display one-line summary for each case with score as a percentage score_percentage = evaluation.score * 100 - console.print(f"{status} {case['name']} -- Score: {score_percentage:.2f}%") + output_console.print(f"{status} {case['name']} -- Score: {score_percentage:.2f}%") if show_details: # Show detailed information for each case - console.print(f"[bold]User Input:[/bold] {case['input']}\n") - console.print("[bold]Details:[/bold]") - console.print(_format_evaluation(evaluation)) - console.print("-" * 80) - - # Summary - summary = ( - f"[bold]Summary -- [/bold]Total: {total_cases} -- [green]Passed: {total_passed}[/green]" - ) - if total_warned > 0: - summary += f" -- [yellow]Warnings: {total_warned}[/yellow]" - if total_failed > 0: - summary += f" -- [red]Failed: {total_failed}[/red]" - console.print(summary + "\n") + output_console.print(f"[bold]User Input:[/bold] {case['input']}\n") + output_console.print("[bold]Details:[/bold]") + output_console.print(_format_evaluation(evaluation)) + output_console.print("-" * 80) + + # Summary - use original counts if filtering, otherwise use current counts + if failed_only and original_counts: + # Unpack original counts + orig_total, orig_passed, orig_failed, orig_warned = original_counts + + # Show disclaimer before summary + output_console.print( + f"[bold yellow]Note: Showing only {total_cases} failed evaluation(s) (--failed-only)[/bold yellow]" + ) + + # Build summary with original counts + summary = ( + f"[bold]Summary -- [/bold]Total: 
{orig_total} -- [green]Passed: {orig_passed}[/green]" + ) + if orig_warned > 0: + summary += f" -- [yellow]Warnings: {orig_warned}[/yellow]" + if orig_failed > 0: + summary += f" -- [red]Failed: {orig_failed}[/red]" + else: + # Normal summary with current counts + summary = ( + f"[bold]Summary -- [/bold]Total: {total_cases} -- [green]Passed: {total_passed}[/green]" + ) + if total_warned > 0: + summary += f" -- [yellow]Warnings: {total_warned}[/yellow]" + if total_failed > 0: + summary += f" -- [red]Failed: {total_failed}[/red]" + + output_console.print(summary + "\n") + + +def display_eval_results( + results: list[list[dict[str, Any]]], + show_details: bool = False, + output_file: Optional[str] = None, + failed_only: bool = False, + original_counts: Optional[tuple[int, int, int, int]] = None, +) -> None: + """ + Display evaluation results in a format inspired by pytest's output. + + Args: + results: List of dictionaries containing evaluation results for each model. + show_details: Whether to show detailed results for each case. + output_file: Optional file path to write results to (plain text format). + failed_only: Whether only failed cases are being displayed (adds disclaimer). + original_counts: Optional tuple of (total_cases, total_passed, total_failed, total_warned) + from before filtering. Used when failed_only is True. + """ + # Always display to terminal + _display_results_to_console(console, results, show_details, failed_only, original_counts) + + # Also write to file if requested + if output_file: + output_path = Path(output_file) + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w") as f: + file_console = Console(file=f, width=120, legacy_windows=False) + _display_results_to_console( + file_console, results, show_details, failed_only, original_counts + ) def _format_evaluation(evaluation: "EvaluationResult") -> str: diff --git a/libs/arcade-cli/arcade_cli/main.py b/libs/arcade-cli/arcade_cli/main.py index 3a1e773db..4f22fc86f 100644 --- a/libs/arcade-cli/arcade_cli/main.py +++ b/libs/arcade-cli/arcade_cli/main.py @@ -32,6 +32,7 @@ Provider, compute_base_url, compute_login_url, + filter_failed_evaluations, get_eval_files, handle_cli_error, load_eval_suites, @@ -373,6 +374,18 @@ def evals( "-k", help="The model provider API key. 
If not provided, will look for the appropriate environment variable based on the provider (e.g., OPENAI_API_KEY for openai provider), first in the current environment, then in the current working directory's .env file.", ), + failed_only: bool = typer.Option( + False, + "--failed-only", + "-f", + help="Show only failed evaluations", + ), + output: Optional[str] = typer.Option( + None, + "--output", + "-o", + help="Write results to a file (plain text format)", + ), debug: bool = typer.Option(False, "--debug", help="Show debug information"), ) -> None: """ @@ -458,7 +471,19 @@ async def run_evaluations() -> None: # TODO error handling on each eval all_evaluations.extend(results) - display_eval_results(all_evaluations, show_details=show_details) + + # Filter to show only failed evaluations if requested + original_counts = None + if failed_only: + all_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + display_eval_results( + all_evaluations, + show_details=show_details, + output_file=output, + failed_only=failed_only, + original_counts=original_counts, + ) try: asyncio.run(run_evaluations()) diff --git a/libs/arcade-cli/arcade_cli/utils.py b/libs/arcade-cli/arcade_cli/utils.py index ed0b7d0aa..d5c4f7677 100644 --- a/libs/arcade-cli/arcade_cli/utils.py +++ b/libs/arcade-cli/arcade_cli/utils.py @@ -958,6 +958,65 @@ def resolve_provider_api_key(provider: Provider, provider_api_key: str | None = return None +def filter_failed_evaluations( + all_evaluations: list[list[dict[str, Any]]], +) -> tuple[list[list[dict[str, Any]]], tuple[int, int, int, int]]: + """ + Filter evaluation results to show only failed cases and calculate original counts. + + Args: + all_evaluations: List of evaluation results with structure: + [[{model: str, rubric: str, cases: [{name, input, evaluation}]}]] + + Returns: + Tuple of (filtered_evaluations, original_counts) where original_counts is + (total_cases, total_passed, total_failed, total_warned) + """ + original_total_cases = 0 + original_total_passed = 0 + original_total_failed = 0 + original_total_warned = 0 + + # Calculate original counts before filtering + for eval_suite in all_evaluations: + for model_results in eval_suite: + for case in model_results.get("cases", []): + evaluation = case["evaluation"] + original_total_cases += 1 + if evaluation.passed: + original_total_passed += 1 + elif evaluation.warning: + original_total_warned += 1 + else: + original_total_failed += 1 + + # Filter to show only failed evaluations + filtered_evaluations = [] + for eval_suite in all_evaluations: + filtered_suite = [] + for model_results in eval_suite: + filtered_cases = [ + case + for case in model_results.get("cases", []) + if not case["evaluation"].passed and not case["evaluation"].warning + ] + if filtered_cases: # Only include model results with failed cases + filtered_model_results = model_results.copy() + filtered_model_results["cases"] = filtered_cases + filtered_suite.append(filtered_model_results) + if filtered_suite: + filtered_evaluations.append(filtered_suite) + + original_counts = ( + original_total_cases, + original_total_passed, + original_total_failed, + original_total_warned, + ) + + return filtered_evaluations, original_counts + + def require_dependency( package_name: str, command_name: str, diff --git a/libs/tests/cli/test_display.py b/libs/tests/cli/test_display.py new file mode 100644 index 000000000..6eac646c9 --- /dev/null +++ b/libs/tests/cli/test_display.py @@ -0,0 +1,558 @@ +import tempfile +from pathlib import Path +from 
unittest.mock import Mock + +from arcade_cli.display import display_eval_results +from arcade_evals.eval import EvaluationResult + + +def create_mock_evaluation_result(passed: bool, warning: bool, score: float) -> Mock: + """Create a mock EvaluationResult with the specified properties.""" + evaluation = Mock(spec=EvaluationResult) + evaluation.passed = passed + evaluation.warning = warning + evaluation.score = score + evaluation.failure_reason = None + evaluation.results = [] + return evaluation + + +def test_display_eval_results_normal() -> None: + """Test normal display without filtering.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Test Case 2", + "input": "Test input 2", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.5 + ), + }, + ], + } + ] + ] + + # Should not raise any exceptions + display_eval_results(results, show_details=False) + + +def test_display_eval_results_with_failed_only() -> None: + """Test display with failed_only flag and original counts.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 3 total, 1 passed, 1 failed, 1 warned + original_counts = (3, 1, 1, 1) + + # Should not raise any exceptions + display_eval_results( + results, + show_details=False, + failed_only=True, + original_counts=original_counts, + ) + + +def test_display_eval_results_with_output_file() -> None: + """Test display with output file.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.9 + ), + }, + ], + } + ] + ] + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + ) + + # Verify file was created + assert output_file.exists() + + # Verify file contains some expected content + content = output_file.read_text() + assert "Model:" in content or "gpt-4o" in content + + +def test_display_eval_results_with_output_file_and_failed_only() -> None: + """Test display with both output file and failed_only flag.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + } + ] + ] + + original_counts = (5, 3, 1, 1) + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + failed_only=True, + original_counts=original_counts, + ) + + # Verify file was created + assert output_file.exists() + + # Verify file contains disclaimer and summary + content = output_file.read_text() + assert "failed-only" in content.lower() or "failed evaluation" in content.lower() + assert "Total: 5" in content # Should show original total + + +def test_display_eval_results_creates_parent_directories() -> None: + """Test that output file creates parent 
directories if they don't exist.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.9 + ), + }, + ], + } + ] + ] + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "nested" / "path" / "test_output.txt" + + # Parent directories don't exist yet + assert not output_file.parent.exists() + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + ) + + # Parent directories should be created + assert output_file.parent.exists() + assert output_file.exists() + + +def test_display_eval_results_with_warnings() -> None: + """Test display with cases that have warnings.""" + results: list = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Warning Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=True, score=0.85 + ), + }, + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Should not raise any exceptions + display_eval_results(results, show_details=False) + + +def test_display_eval_results_empty_results() -> None: + """Test display with empty results.""" + results: list = [] + + # Should not raise any exceptions + display_eval_results(results, show_details=False) + + +def test_display_eval_results_with_details() -> None: + """Test display with show_details=True.""" + evaluation = create_mock_evaluation_result(passed=True, warning=False, score=0.95) + evaluation.results = [ + { + "field": "test_field", + "match": True, + "score": 1.0, + "weight": 1.0, + "expected": "expected_value", + "actual": "actual_value", + "is_criticized": True, + } + ] + + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": evaluation, + }, + ], + } + ] + ] + + # Should not raise any exceptions + display_eval_results(results, show_details=True) + + +def test_display_eval_results_with_failed_only_no_warnings() -> None: + """Test display with failed_only but original counts have no warnings.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 10 total, 8 passed, 2 failed, 0 warned + original_counts = (10, 8, 2, 0) + + display_eval_results( + results, + show_details=False, + failed_only=True, + original_counts=original_counts, + ) + + +def test_display_eval_results_with_failed_only_no_failed() -> None: + """Test display with failed_only but original counts have no failed.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 5 total, 5 passed, 0 failed, 0 warned (edge case) + original_counts = (5, 5, 0, 0) + + display_eval_results( + results, + show_details=False, + failed_only=True, + original_counts=original_counts, + ) + + +def test_display_eval_results_multiple_suites() -> None: + """Test display with multiple eval suites.""" + results = [ + [ + { + 
"model": "gpt-4o", + "rubric": "Test Rubric 1", + "cases": [ + { + "name": "Test Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + ], + } + ], + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric 2", + "cases": [ + { + "name": "Test Case 2", + "input": "Test input 2", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.5 + ), + }, + ], + } + ], + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_multiple_models() -> None: + """Test display with multiple models in same suite.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + ], + }, + { + "model": "gpt-3.5-turbo", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case 2", + "input": "Test input 2", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.5 + ), + }, + ], + }, + ] + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_summary_with_warnings() -> None: + """Test summary display when warnings are present.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Warning Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=True, score=0.85 + ), + }, + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_summary_only_passed() -> None: + """Test summary when all cases passed.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Passed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.98 + ), + }, + ], + } + ] + ] + + display_eval_results(results, show_details=False) + + +def test_display_eval_results_failed_only_with_warnings_in_summary() -> None: + """Test failed_only display when original counts include warnings.""" + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ] + ] + + # Original counts: 10 total, 7 passed, 2 failed, 1 warned + original_counts = (10, 7, 2, 1) + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=False, + output_file=str(output_file), + failed_only=True, + original_counts=original_counts, + ) + + content = output_file.read_text() + # Should show warnings in summary + assert "Warnings: 1" in content or "Warnings" in content + + +def test_display_eval_results_with_details_and_output() -> None: + """Test display with details and output file.""" + evaluation = 
create_mock_evaluation_result(passed=True, warning=False, score=0.95) + evaluation.results = [ + { + "field": "test_field", + "match": True, + "score": 1.0, + "weight": 1.0, + "expected": "expected_value", + "actual": "actual_value", + "is_criticized": True, + } + ] + + results = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Test Case", + "input": "Test input", + "evaluation": evaluation, + }, + ], + } + ] + ] + + with tempfile.TemporaryDirectory() as tmpdir: + output_file = Path(tmpdir) / "test_output.txt" + + display_eval_results( + results, + show_details=True, + output_file=str(output_file), + ) + + assert output_file.exists() + content = output_file.read_text() + assert "User Input:" in content + assert "Details:" in content diff --git a/libs/tests/cli/test_main_evals.py b/libs/tests/cli/test_main_evals.py new file mode 100644 index 000000000..51d45cfce --- /dev/null +++ b/libs/tests/cli/test_main_evals.py @@ -0,0 +1,255 @@ +from unittest.mock import Mock + +from arcade_cli.utils import filter_failed_evaluations +from arcade_evals.eval import EvaluationResult + + +def create_mock_evaluation_result(passed: bool, warning: bool, score: float) -> Mock: + """Create a mock EvaluationResult with the specified properties.""" + evaluation = Mock(spec=EvaluationResult) + evaluation.passed = passed + evaluation.warning = warning + evaluation.score = score + evaluation.failure_reason = None + evaluation.results = [] + return evaluation + + +def test_filter_failed_evaluations_mixed_results() -> None: + """Test filtering logic with mixed passed, failed, and warned cases.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Warning Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=True, score=0.85 + ), + }, + { + "name": "Failed Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + { + "name": "Failed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + } + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (4, 1, 2, 1) + + # Verify filtered results only contain failed cases + assert len(filtered_evaluations) == 1 + assert len(filtered_evaluations[0]) == 1 + assert len(filtered_evaluations[0][0]["cases"]) == 2 + assert filtered_evaluations[0][0]["cases"][0]["name"] == "Failed Case 1" + assert filtered_evaluations[0][0]["cases"][1]["name"] == "Failed Case 2" + + +def test_filter_failed_evaluations_all_passed() -> None: + """Test filtering when all cases passed (should return empty).""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case 1", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Passed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.98 + ), + }, + ], + } + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (2, 2, 0, 
0) + + # Verify filtered results are empty (no failed cases) + assert len(filtered_evaluations) == 0 + + +def test_filter_failed_evaluations_multiple_suites() -> None: + """Test filtering with multiple eval suites.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric 1", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + } + ], + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric 2", + "cases": [ + { + "name": "Failed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + } + ], + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (3, 1, 2, 0) + + # Verify filtered results + assert len(filtered_evaluations) == 2 + assert len(filtered_evaluations[0][0]["cases"]) == 1 + assert len(filtered_evaluations[1][0]["cases"]) == 1 + + +def test_filter_failed_evaluations_multiple_models() -> None: + """Test filtering with multiple models in same suite.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + }, + { + "model": "gpt-3.5-turbo", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + { + "name": "Failed Case 2", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.2 + ), + }, + ], + }, + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (3, 1, 2, 0) + + # Verify filtered results - should have both models with failed cases + assert len(filtered_evaluations) == 1 + assert len(filtered_evaluations[0]) == 2 # Both models have failed cases + assert len(filtered_evaluations[0][0]["cases"]) == 1 # First model has 1 failed + assert len(filtered_evaluations[0][1]["cases"]) == 1 # Second model has 1 failed + + +def test_filter_failed_evaluations_model_with_no_failed() -> None: + """Test filtering when one model has no failed cases.""" + all_evaluations = [ + [ + { + "model": "gpt-4o", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Passed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=True, warning=False, score=0.95 + ), + }, + ], + }, + { + "model": "gpt-3.5-turbo", + "rubric": "Test Rubric", + "cases": [ + { + "name": "Failed Case", + "input": "Test input", + "evaluation": create_mock_evaluation_result( + passed=False, warning=False, score=0.3 + ), + }, + ], + }, + ] + ] + + filtered_evaluations, original_counts = filter_failed_evaluations(all_evaluations) + + # Verify original counts + assert original_counts == (2, 1, 1, 0) + + # Verify filtered results - only second model should be included + assert len(filtered_evaluations) == 1 + assert len(filtered_evaluations[0]) == 1 # Only one model with failed cases + assert filtered_evaluations[0][0]["model"] == "gpt-3.5-turbo" + assert 
len(filtered_evaluations[0][0]["cases"]) == 1 diff --git a/pyproject.toml b/pyproject.toml index cfa1c4abd..2eea8410d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,10 @@ [project] name = "arcade-mcp" -version = "1.5.8" +version = "1.6.0" description = "Arcade.dev - Tool Calling platform for Agents" readme = "README.md" -license = {file = "LICENSE"} -authors = [ - {name = "Arcade", email = "dev@arcade.dev"}, -] +license = { file = "LICENSE" } +authors = [{ name = "Arcade", email = "dev@arcade.dev" }] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", @@ -87,10 +85,7 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = [ - "libs/arcade-cli/arcade_cli", - "libs/arcade-evals/arcade_evals", -] +packages = ["libs/arcade-cli/arcade_cli", "libs/arcade-evals/arcade_evals"] [tool.uv.workspace] members = [ @@ -111,7 +106,7 @@ warn_unused_ignores = true show_error_codes = true ignore_missing_imports = true exclude = [ - '.*{{.*}}.*' # Ignore files that have names that use Jinja template syntax + '.*{{.*}}.*', # Ignore files that have names that use Jinja template syntax ] [tool.pytest.ini_options] @@ -131,11 +126,7 @@ addopts = [ [tool.coverage.run] source = ["libs"] -omit = [ - "*/tests/*", - "*/test_*", - "*/__pycache__/*", -] +omit = ["*/tests/*", "*/test_*", "*/__pycache__/*"] parallel = true patch = ["subprocess"]
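
Taken together, the two new `evals` options compose exactly as `run_evaluations` does above: filter first, then hand the pre-filter totals to the display layer so the summary still reflects the full run. A minimal sketch of that flow, assuming `all_evaluations` has already been collected by the eval suites and that `failed_only`, `show_details`, and `output` are bound as in the `evals` command (the surrounding variable names are taken from this diff; nothing else is):

    from arcade_cli.display import display_eval_results
    from arcade_cli.utils import filter_failed_evaluations

    original_counts = None
    if failed_only:
        # Drop every case that passed or only warned; keep the original
        # (total, passed, failed, warned) counts for the summary line.
        all_evaluations, original_counts = filter_failed_evaluations(all_evaluations)

    display_eval_results(
        all_evaluations,
        show_details=show_details,
        output_file=output,        # when set, the report is also mirrored to a plain-text file
        failed_only=failed_only,   # prints the "--failed-only" disclaimer before the summary
        original_counts=original_counts,
    )

From the command line this corresponds to something like `arcade evals ./evals --failed-only --output results.txt`; the `arcade` entry-point name and the evals directory argument are assumed for illustration and are not shown in this diff.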