Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
fb35d7b
Add --failed-only and --output flags to evals command
Nov 5, 2025
c11a908
Add tests for display_eval_results with --failed-only and --output flags
Nov 5, 2025
19199e7
Add additional test cases for better coverage of display_eval_results
Nov 5, 2025
22d0ecb
Extract filtering logic to testable function and add tests
Nov 5, 2025
9bbb691
Merge branch 'main' into francisco/arcade-cli/updating-evals-to-show-…
jottakka Nov 14, 2025
c293f18
Merge branch 'main' into francisco/arcade-cli/updating-evals-to-show-…
Nov 17, 2025
d593a4a
Adding MCP Servers supports to Arcade Evals
Nov 18, 2025
eda0260
Updating loading from mcp server
Nov 20, 2025
aa1fff9
Merge branch 'main' into francisco/updating-arcade-evails
Nov 26, 2025
2d889f6
Updating to accept headers
Nov 26, 2025
de2df04
fixing linting
Nov 26, 2025
8c5a096
Merge branch 'main' into francisco/arcade-cli/updating-evals-to-show-…
Nov 28, 2025
049f965
added session support for the http loader
torresmateo Nov 28, 2025
3998b39
removed debug print
torresmateo Nov 28, 2025
56399e2
handled unsupported protocol for http tool loader
torresmateo Nov 28, 2025
477ccbe
Open API issue
Nov 28, 2025
5f28a55
Updating strict mode
Nov 28, 2025
bf7678e
Updating strict mode
Nov 28, 2025
baad441
Merge branch 'main' into francisco/arcade-cli/updating-evals-to-show-…
Dec 9, 2025
d28b572
Merge branch 'main' into francisco/updating-arcade-evails
Dec 9, 2025
627bcee
Merge branch 'francisco/arcade-cli/updating-evals-to-show-only-failed…
Dec 9, 2025
4a8c6a5
Merge branch 'main' into francisco/updating-arcade-evails
Dec 13, 2025
dd25223
Merge branch 'main' into francisco/updating-arcade-evails
Dec 15, 2025
823e39d
Merge branch 'main' into francisco/updating-arcade-evails
Dec 16, 2025
6b0a725
updating eval suite to contain all tool sources
Dec 16, 2025
ccfae39
Adding anthropic support
Dec 16, 2025
6318e4a
Adding fuzzy weights
Dec 17, 2025
0ccf790
fix cursor reported bug
Dec 17, 2025
73dc94c
Delete libs/arcade-evals/arcade_evals/_experimental/__init__.py
jottakka Dec 17, 2025
39b7f91
Adding capture mode and smashing some bugs after reviews
jottakka Dec 17, 2025
8b6e17d
fixing output formatting in capture mode
jottakka Dec 17, 2025
bbbdbf8
added options to export result to md, txt and html
jottakka Dec 18, 2025
702e2eb
fixing bugs
jottakka Dec 18, 2025
dd1e335
fixes after cursor bot review
jottakka Dec 18, 2025
e4beb77
some updates
jottakka Dec 18, 2025
81006fc
Adding compare mode
jottakka Dec 19, 2025
de0f8e6
Updating evals for multiple models/providers/tracks
jottakka Dec 23, 2025
af916e9
removing self implemented loader and adding flag to override arcade url
jottakka Dec 24, 2025
f38ebbb
Add locks for loading tools from mcp servers only once and avoid conc…
jottakka Dec 25, 2025
aaca430
Add locks for loading tools from mcp servers only once and avoid conc…
jottakka Dec 25, 2025
27ae785
Add locks for loading tools from mcp servers only once and avoid conc…
jottakka Dec 25, 2025
1f3cb55
Fixing html template
jottakka Dec 25, 2025
b5e04aa
Add locks for loading tools from mcp servers only once and avoid conc…
jottakka Dec 25, 2025
ead5b13
Add locks for loading tools from mcp servers only once and avoid conc…
jottakka Dec 25, 2025
a234574
Add locks for loading tools from mcp servers only once and avoid conc…
jottakka Dec 25, 2025
22d9943
Add locks for loading tools from mcp servers only once and avoid conc…
jottakka Dec 25, 2025
b26135c
Fix CLI help tests: strip ANSI codes before assertions
jottakka Dec 25, 2025
b460ac2
addressing some changes after code review
jottakka Dec 29, 2025
e0acb78
updating after erics review
jottakka Jan 4, 2026
524c77d
adding examples
jottakka Jan 5, 2026
5c074d3
fixing ci failing
jottakka Jan 5, 2026
716787d
minor changes
jottakka Jan 5, 2026
435e191
minor fix
jottakka Jan 5, 2026
8c0d677
updates after erics review
jottakka Jan 7, 2026
b9847ad
Merge branch 'main' into francisco/updating-arcade-evails
jottakka Jan 7, 2026
ff8acf9
fixing some linting
jottakka Jan 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 236 additions & 0 deletions examples/composite_mcp_evals_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
"""
Example: Evaluating Tools from Multiple MCP Servers

This example demonstrates how to use CompositeMCPRegistry to evaluate tools
from multiple MCP servers in a single evaluation suite.
"""

from arcade_evals import (
BinaryCritic,
CompositeMCPRegistry,
EvalSuite,
ExpectedToolCall,
)

# Step 1: Define tool descriptors from multiple MCP servers
# In practice, these would come from different MCP server tools/list responses

calculator_tools = [
{
"name": "add",
"description": "Add two numbers together",
"inputSchema": {
"type": "object",
"properties": {
"a": {"type": "number", "description": "First number"},
"b": {"type": "number", "description": "Second number", "default": 0},
},
"required": ["a"],
},
},
{
"name": "multiply",
"description": "Multiply two numbers together",
"inputSchema": {
"type": "object",
"properties": {
"a": {"type": "number"},
"b": {"type": "number"},
},
"required": ["a", "b"],
},
},
]

string_tools = [
{
"name": "uppercase",
"description": "Convert string to uppercase",
"inputSchema": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to convert"},
},
"required": ["text"],
},
},
{
"name": "reverse",
"description": "Reverse a string",
"inputSchema": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to reverse"},
},
"required": ["text"],
},
},
]

datetime_tools = [
{
"name": "format_date",
"description": "Format a date string",
"inputSchema": {
"type": "object",
"properties": {
"date": {"type": "string"},
"format": {"type": "string", "default": "%Y-%m-%d"},
},
"required": ["date"],
},
},
]

# Step 2: Create a composite registry with tools from multiple servers
# Method 1: Pass tool lists directly
composite = CompositeMCPRegistry(
tool_lists={
"calculator": calculator_tools,
"strings": string_tools,
"datetime": datetime_tools,
}
)

print("🎯 Composite MCP Registry Created!")
print(f"Servers: {', '.join(composite.get_server_names())}")
print()

# Step 3: Show how tools are namespaced
print("📋 All Tools (with namespacing):")
tools = composite.list_tools_for_model(tool_format="openai")
for tool in tools:
name = tool["function"]["name"]
desc = tool["function"]["description"]
print(f" - {name}: {desc}")
print()

# Step 4: Create an evaluation suite using the composite registry
suite = EvalSuite(
name="Multi-Server Evaluation Suite",
system_message="You are a helpful assistant with access to calculator, string, and datetime tools.",
catalog=composite,
)

# Step 5: Add test cases using tools from different servers

# Test 1: Calculator server - using fully namespaced name
suite.add_case(
name="Addition with namespace",
user_message="What is 15 plus 7?",
expected_tool_calls=[
ExpectedToolCall(
tool_name="calculator.add", # Fully namespaced
args={"a": 15, "b": 7},
)
],
critics=[
BinaryCritic(critic_field="a", weight=0.5),
BinaryCritic(critic_field="b", weight=0.5),
],
)

# Test 2: String server - using short unique name
suite.add_case(
name="String uppercase",
user_message="Convert 'hello world' to uppercase",
expected_tool_calls=[
ExpectedToolCall(
tool_name="uppercase", # Short name (unique across all servers)
args={"text": "hello world"},
)
],
critics=[
BinaryCritic(critic_field="text", weight=1.0),
],
)

# Test 3: Multiple tool calls from different servers
suite.add_case(
name="Mixed server operations",
user_message="Calculate 10 times 5, then reverse the result",
expected_tool_calls=[
ExpectedToolCall(
tool_name="calculator.multiply",
args={"a": 10, "b": 5},
),
ExpectedToolCall(
tool_name="strings.reverse",
args={"text": "50"},
),
],
critics=[
BinaryCritic(critic_field="a", weight=0.25),
BinaryCritic(critic_field="b", weight=0.25),
BinaryCritic(critic_field="text", weight=0.5),
],
)

# Test 4: Using defaults from schema
suite.add_case(
name="Date formatting with default",
user_message="Format the date 2025-11-18",
expected_tool_calls=[
ExpectedToolCall(
tool_name="datetime.format_date",
args={"date": "2025-11-18"}, # 'format' will use default
)
],
critics=[
BinaryCritic(critic_field="date", weight=1.0),
],
)

# Step 6: Display configured cases
print("✅ Evaluation Suite Configured!")
print(f"Suite: {suite.name}")
print(f"Total cases: {len(suite.cases)}\n")

print("Configured test cases:")
for i, case in enumerate(suite.cases, 1):
print(f"\n{i}. {case.name}")
print(f" Expected {len(case.expected_tool_calls)} tool call(s):")
for tc in case.expected_tool_calls:
print(f" - {tc.name}({tc.args})")

# Step 7: Demonstrate name collision handling
print("\n\n🔍 Name Collision Example:")
print("=" * 60)

# Create two servers with the same tool name
tools_a = [
{
"name": "process",
"description": "Process A",
"inputSchema": {"type": "object", "properties": {}},
}
]
tools_b = [
{
"name": "process",
"description": "Process B",
"inputSchema": {"type": "object", "properties": {}},
}
]

collision_composite = CompositeMCPRegistry(tool_lists={"server_a": tools_a, "server_b": tools_b})

# Short name is ambiguous
try:
collision_composite.resolve_tool_name("process")
except ValueError as e:
print(f"❌ Short name fails: {e}")

# But namespaced names work fine
print(f"✅ Namespaced works: {collision_composite.resolve_tool_name('server_a.process')}")
print(f"✅ Namespaced works: {collision_composite.resolve_tool_name('server_b.process')}")

print("\n\n💡 Key Features:")
print(" • Combine tools from multiple MCP servers")
print(" • Automatic namespacing prevents collisions (server.tool)")
print(" • Short names work when unique across all servers")
print(" • Each server's tools maintain their own schemas and defaults")
print(" • All existing Python tool evaluations still work unchanged")

print("\n💡 To run actual evaluations, use:")
print(" results = suite.run(provider_api_key='your-api-key', model='gpt-4')")
129 changes: 129 additions & 0 deletions examples/mcp_evals_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
Example: Evaluating MCP Server Tools with arcade-evals

This example demonstrates how to evaluate tools from an MCP server
without requiring Python callables.
"""

from arcade_evals import BinaryCritic, EvalSuite, ExpectedToolCall, MCPToolRegistry

# Step 1: Define MCP tool descriptors
# These would typically come from an MCP server's tools/list response.
# Each descriptor is a plain dict: "name", "description", and an
# "inputSchema" (JSON Schema object) — no Python callables involved.
mcp_tools = [
    {
        "name": "calculator_add",
        "description": "Add two numbers together",
        "inputSchema": {
            "type": "object",
            "properties": {
                "a": {
                    "type": "number",
                    "description": "First number",
                },
                "b": {
                    "type": "number",
                    "description": "Second number",
                    "default": 0,  # Optional: MCP tools can specify defaults
                },
            },
            "required": ["a"],
        },
    },
    {
        "name": "calculator_multiply",
        "description": "Multiply two numbers together",
        "inputSchema": {
            "type": "object",
            "properties": {
                "a": {"type": "number", "description": "First number"},
                "b": {"type": "number", "description": "Second number"},
            },
            "required": ["a", "b"],
        },
    },
]

# Step 2: Create an MCP tool registry
registry = MCPToolRegistry(mcp_tools)

# Step 3: Create an evaluation suite using the MCP registry
suite = EvalSuite(
    name="Calculator MCP Evaluation",
    system_message="You are a helpful calculator assistant. Use the available tools to perform calculations.",
    catalog=registry,  # Use MCP registry instead of ToolCatalog
)

# Step 4: Add test cases using tool names (not Python functions!)
suite.add_case(
    name="Simple addition",
    user_message="What is 5 plus 3?",
    expected_tool_calls=[
        ExpectedToolCall(
            tool_name="calculator_add",  # String name, not a callable
            args={"a": 5, "b": 3},
        )
    ],
    critics=[
        # Weights sum to 1.0 across the critics of a case.
        BinaryCritic(critic_field="a", weight=0.5),
        BinaryCritic(critic_field="b", weight=0.5),
    ],
)

suite.add_case(
    name="Addition with implicit default",
    user_message="Add 10",
    expected_tool_calls=[
        ExpectedToolCall(
            tool_name="calculator_add",
            args={"a": 10},  # 'b' will use default value of 0
        )
    ],
    critics=[
        BinaryCritic(critic_field="a", weight=1.0),
    ],
)

suite.add_case(
    name="Multiplication",
    user_message="What is 7 times 6?",
    expected_tool_calls=[
        ExpectedToolCall(
            tool_name="calculator_multiply",
            args={"a": 7, "b": 6},
        )
    ],
    critics=[
        BinaryCritic(critic_field="a", weight=0.5),
        BinaryCritic(critic_field="b", weight=0.5),
    ],
)

# Step 5: Demo the configuration
# The demo only prints the configured suite; it does not call any model.
if __name__ == "__main__":
    print("Running MCP tool evaluations...")
    print(f"Suite: {suite.name}")
    print(f"Cases: {len(suite.cases)}")
    print()

    print("✅ MCP evaluation suite configured successfully!")
    print("\nConfigured cases:")
    # NOTE(review): cases are built with tool_name= but read back via tc.name —
    # presumably ExpectedToolCall exposes the value as .name; confirm in arcade_evals.
    for i, case in enumerate(suite.cases, 1):
        print(f"{i}. {case.name}")
        print(f" Expected: {len(case.expected_tool_calls)} tool call(s)")
        for tc in case.expected_tool_calls:
            print(f" - {tc.name}({tc.args})")

    print("\n💡 To run actual evaluations, use:")
    print(" results = suite.run(provider_api_key='your-api-key', model='gpt-4')")

    # Demo: Show how MCP tools are converted to OpenAI format
    # In the "openai" format each tool is wrapped as {"function": {...}}.
    print("\n📋 MCP tools converted to OpenAI format:")
    tools = registry.list_tools_for_model(tool_format="openai")
    for tool in tools:
        print(f"\n- {tool['function']['name']}")
        print(f" Description: {tool['function']['description']}")
        # "parameters" may be absent or non-dict; guard before reading properties.
        function_params = tool["function"].get("parameters")
        if function_params and isinstance(function_params, dict):
            params = function_params.get("properties", {})
            if params:
                print(f" Parameters: {', '.join(params.keys())}")
Loading