diff --git a/CLAUDE.md b/CLAUDE.md index f659238d..74b8a84d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,8 +14,9 @@ - Development mode: `npm run dev` (use `npm run dev:windows` on Windows) - Format code: `npm run prettier-fix` - Client lint: `cd client && npm run lint` -- Run tests: `npm test` (464 passing, 100% pass rate) +- Run tests: `npm test` (696 total, 669 passing, 27 timeout failures) - Run assessment tests: `npm test -- assessment` (208 assessment module tests) +- Note: 27 test failures are SecurityAssessor timeout issues (480s limit), not logic errors ## Code Style Guidelines @@ -195,6 +196,7 @@ For detailed documentation on specific features, see: - **Functionality Testing**: [README.md](README.md#2-optimized-progressive-complexity-testing) - Multi-scenario validation, progressive complexity - **Security Assessment**: [README.md](README.md#4-context-aware-security-assessment-with-zero-false-positives) - Domain-specific patterns, zero false positives - **Error Handling**: [README.md](README.md#assessment-categories) - MCP protocol compliance, validation quality +- **MCP Spec Reference**: [docs/mcp_spec_11-2025.md](docs/mcp_spec_11-2025.md) - Protocol revision 2025-11-25 (latest) - **MCP Spec Compliance**: See PROJECT_STATUS.md timeline for latest enhancements - **Recent Changes**: [PROJECT_STATUS.md](PROJECT_STATUS.md#development-timeline---october-2025) @@ -217,7 +219,7 @@ For detailed documentation on specific features, see: **Testing:** -- `client/src/services/__tests__/` - 464 total tests (100% passing) +- `client/src/services/__tests__/` - 696 total tests (669 passing, 27 timeout failures) - `client/src/services/assessment/__tests__/` - 208 assessment module tests ## Development Workflow diff --git a/README.md b/README.md index 0a035692..42e91465 100644 --- a/README.md +++ b/README.md @@ -353,7 +353,7 @@ Our enhanced MCP Inspector includes a comprehensive assessment system that valid - Input validation and sanitization checks - 
Authentication/authorization testing - Sensitive data exposure detection - - Dual-mode testing: Reviewer mode (3 critical patterns) + Developer mode (all 17 patterns) + - Dual-mode testing: Reviewer mode (3 critical patterns) + Developer mode (all 13 patterns) 5. **Usability** - Tool naming consistency analysis diff --git a/cli/package.json b/cli/package.json index 1cac4aa0..fa095d8a 100644 --- a/cli/package.json +++ b/cli/package.json @@ -1,6 +1,6 @@ { "name": "@bryan-thompson/inspector-assessment-cli", - "version": "1.7.1", + "version": "1.8.0", "description": "CLI for the Enhanced MCP Inspector with assessment capabilities", "license": "MIT", "author": "Bryan Thompson ", diff --git a/cli/src/assess-full.ts b/cli/src/assess-full.ts index 824b5795..4c9882a5 100644 --- a/cli/src/assess-full.ts +++ b/cli/src/assess-full.ts @@ -48,6 +48,7 @@ interface AssessmentOptions { sourceCodePath?: string; claudeEnabled?: boolean; fullAssessment?: boolean; + auditMode?: boolean; verbose?: boolean; jsonOnly?: boolean; helpRequested?: boolean; @@ -286,7 +287,22 @@ function buildConfig(options: AssessmentOptions): AssessmentConfiguration { testTimeout: 30000, }; - if (options.fullAssessment !== false) { + if (options.auditMode) { + // Audit mode: only HIGH-value modules for automated MCP auditing + config.assessmentCategories = { + functionality: true, + security: true, + documentation: false, + errorHandling: true, + usability: false, + mcpSpecCompliance: true, + aupCompliance: false, + toolAnnotations: true, + prohibitedLibraries: false, + manifestValidation: false, + portability: false, + }; + } else if (options.fullAssessment !== false) { config.assessmentCategories = { functionality: true, security: true, @@ -376,6 +392,7 @@ async function runFullAssessment( callTool: createCallToolWrapper(client), config, sourceCodePath: options.sourceCodePath, + transportType: serverConfig.transport || "stdio", ...sourceFiles, }; @@ -400,13 +417,53 @@ function saveResults( serverName: 
string, results: MCPDirectoryAssessment, outputPath?: string, + transportType?: string, ): string { const defaultPath = `/tmp/inspector-full-assessment-${serverName}.json`; const finalPath = outputPath || defaultPath; + // Build audit summary for automated consumption + const securityResult = results.security as { + auditAnalysis?: { + highConfidenceVulnerabilities: string[]; + needsReview: string[]; + falsePositiveLikelihood: Record<string, "HIGH" | "MEDIUM" | "LOW">; + }; + vulnerabilities?: string[]; + }; + const functionalityResult = results.functionality as { + workingTools?: number; + totalTools?: number; + }; + const mcpResult = results.mcpSpecCompliance as { + metrics?: { overallScore?: number }; + }; + const errorResult = results.errorHandling as { + metrics?: { mcpComplianceScore?: number }; + }; + + const auditSummary = { + highConfidenceVulnerabilities: + securityResult?.auditAnalysis?.highConfidenceVulnerabilities || [], + needsReview: securityResult?.auditAnalysis?.needsReview || [], + falsePositiveLikelihood: + securityResult?.auditAnalysis?.falsePositiveLikelihood || {}, + functionalTools: functionalityResult?.workingTools || 0, + totalTools: functionalityResult?.totalTools || 0, + mcpComplianceScore: errorResult?.metrics?.mcpComplianceScore || 0, + transportType: transportType || "unknown", + recommendedAction: + results.overallStatus === "PASS" + ? "APPROVE" + : results.overallStatus === "FAIL" + ? "REJECT" + : "REVIEW", + }; + const output = { timestamp: new Date().toISOString(), assessmentType: "full", + auditSummary, ...results, }; @@ -563,6 +620,9 @@ function parseArgs(): AssessmentOptions { case "--full": options.fullAssessment = true; break; + case "--audit-mode": + options.auditMode = true; + break; case "--verbose": case "-v": options.verbose = true; @@ -617,6 +677,9 @@ Options: --source Source code path for deep analysis (AUP, portability, etc.) 
--claude-enabled Enable Claude Code integration for intelligent analysis --full Enable all assessment modules (default) + --audit-mode Run only high-value modules for automated MCP auditing + (Functionality, Security, ErrorHandling, MCPSpecCompliance, ToolAnnotations) + Reduces false positives and includes audit summary in output --json Output only JSON (no console summary) --verbose, -v Enable verbose logging --help, -h Show this help message @@ -658,10 +721,16 @@ async function main() { displaySummary(results); } + // Determine transport type for audit summary + const serverConfig = loadServerConfig( + options.serverName, + options.serverConfigPath, + ); const outputPath = saveResults( options.serverName, results, options.outputPath, + serverConfig.transport || "stdio", ); if (options.jsonOnly) { diff --git a/cli/src/assess-security.ts b/cli/src/assess-security.ts index aad8f0b3..3c7eaf78 100644 --- a/cli/src/assess-security.ts +++ b/cli/src/assess-security.ts @@ -237,7 +237,7 @@ async function runSecurityAssessment( const config: AssessmentConfiguration = { ...DEFAULT_ASSESSMENT_CONFIG, - securityPatternsToTest: 17, + securityPatternsToTest: 13, reviewerMode: false, testTimeout: 30000, }; @@ -247,9 +247,10 @@ async function runSecurityAssessment( tools, callTool: createCallToolWrapper(client), config, + transportType: serverConfig.transport || "stdio", }; - console.log(`🛡️ Running security assessment with 17 attack patterns...`); + console.log(`🛡️ Running security assessment with 13 attack patterns...`); const assessor = new SecurityAssessor(config); const results = await assessor.assess(context); @@ -386,7 +387,7 @@ function printHelp() { console.log(` Usage: mcp-assess-security [options] [server-name] -Run security assessment against an MCP server with 17 attack patterns. +Run security assessment against an MCP server with 13 attack patterns. 
Options: --server, -s Server name (required, or pass as first positional arg) @@ -396,15 +397,14 @@ Options: --verbose, -v Enable verbose logging --help, -h Show this help message -Attack Patterns Tested (17 total): - • Direct prompt injection - • Indirect prompt injection - • Instruction override - • Role-playing attacks - • Encoding bypass - • Multi-turn manipulation - • Context poisoning - • And more... +Attack Patterns Tested (13 total): + • Command Injection • SQL Injection + • Calculator Injection • Path Traversal + • Type Safety • Boundary Testing + • Required Fields • MCP Error Format + • Timeout Handling • Indirect Prompt Injection + • Unicode Bypass • Nested Injection + • Package Squatting Examples: mcp-assess-security my-server diff --git a/client/package.json b/client/package.json index 14a0df32..7b4d3175 100644 --- a/client/package.json +++ b/client/package.json @@ -1,6 +1,6 @@ { "name": "@bryan-thompson/inspector-assessment-client", - "version": "1.7.1", + "version": "1.8.0", "description": "Client-side application for the Enhanced MCP Inspector with assessment capabilities", "license": "MIT", "author": "Bryan Thompson ", diff --git a/client/src/lib/assessmentTypes.ts b/client/src/lib/assessmentTypes.ts index 6630a070..78436553 100644 --- a/client/src/lib/assessmentTypes.ts +++ b/client/src/lib/assessmentTypes.ts @@ -66,6 +66,9 @@ export interface SecurityTestResult { connectionError?: boolean; // True if test failed due to connection/server failure errorType?: "connection" | "server" | "protocol"; // Classify error type testReliability?: "completed" | "failed" | "retried"; // Test execution status + // Audit-mode fields for automated consumption + vulnerableHighConfidence?: boolean; // True when vulnerable AND confidence is "high" or unset (missing confidence is treated as high) + toolCategory?: string; // Classified tool category (e.g., "search_retrieval", "calculator") } export interface CodeExample { @@ -243,6 +246,16 @@ export interface SecurityAssessment { overallRiskLevel: 
SecurityRiskLevel; status: AssessmentStatus; explanation: string; + // Audit-mode: pre-computed false positive analysis + auditAnalysis?: { + highConfidenceVulnerabilities: string[]; + needsReview: string[]; + falsePositiveLikelihood: Record<string, "HIGH" | "MEDIUM" | "LOW">; + responseUniformity: Record< + string, + { uniqueResponses: number; totalTests: number } + >; + }; } export interface DocumentationAssessment { diff --git a/client/src/services/assessment/AssessmentOrchestrator.ts b/client/src/services/assessment/AssessmentOrchestrator.ts index a26787d5..e09a1366 100644 --- a/client/src/services/assessment/AssessmentOrchestrator.ts +++ b/client/src/services/assessment/AssessmentOrchestrator.ts @@ -66,6 +66,10 @@ export interface AssessmentContext { // MCPB manifest validation (optional) manifestJson?: ManifestJsonSchema; manifestRaw?: string; // Raw manifest.json content for parsing validation + + // Transport type for context-aware security testing + // Used to skip irrelevant tests (e.g., path traversal on remote servers) + transportType?: "stdio" | "http" | "sse"; } export class AssessmentOrchestrator { diff --git a/client/src/services/assessment/modules/SecurityAssessor.ts b/client/src/services/assessment/modules/SecurityAssessor.ts index 44980253..4682e930 100644 --- a/client/src/services/assessment/modules/SecurityAssessor.ts +++ b/client/src/services/assessment/modules/SecurityAssessor.ts @@ -101,12 +101,16 @@ export class SecurityAssessor extends BaseAssessor { overallRiskLevel, ); + // Compute audit analysis (pre-computed FP analysis for automated consumption) + const auditAnalysis = this.computeAuditAnalysis(validTests); + return { promptInjectionTests: allTests, vulnerabilities, overallRiskLevel, status, explanation, + auditAnalysis, }; } @@ -171,6 +175,12 @@ export class SecurityAssessor extends BaseAssessor { ); for (const tool of toolsToTest) { + // Classify tool once for filtering decisions + const toolClassification = new ToolClassifier().classify( + tool.name, + 
tool.description, + ); + // Tools with no input parameters can't be exploited via payload injection // Add passing results so they appear in the UI if (!this.hasInputParameters(tool)) { @@ -203,6 +213,49 @@ export class SecurityAssessor extends BaseAssessor { // Test with each attack type (all patterns in advanced mode) for (const attackPattern of attackPatterns) { + // Skip calculator injection on non-calculator tools (major FP reduction) + if ( + attackPattern.attackName === "Calculator Injection" && + !toolClassification.categories.includes(ToolCategory.CALCULATOR) + ) { + const payloads = getPayloadsForAttack(attackPattern.attackName); + for (const payload of payloads) { + results.push({ + testName: attackPattern.attackName, + description: payload.description, + payload: payload.payload, + riskLevel: payload.riskLevel, + toolName: tool.name, + vulnerable: false, + evidence: + "Skipped: Calculator injection tests only run on math/calc/eval tools", + }); + } + continue; + } + + // Skip path traversal on HTTP/SSE transport (no filesystem to traverse) + if ( + attackPattern.attackName === "Path Traversal" && + context.transportType && + context.transportType !== "stdio" + ) { + const payloads = getPayloadsForAttack(attackPattern.attackName); + for (const payload of payloads) { + results.push({ + testName: attackPattern.attackName, + description: payload.description, + payload: payload.payload, + riskLevel: payload.riskLevel, + toolName: tool.name, + vulnerable: false, + evidence: + "Skipped: Path traversal tests not applicable for remote (HTTP/SSE) servers", + }); + } + continue; + } + // Get ALL payloads for this attack pattern const payloads = getPayloadsForAttack(attackPattern.attackName); @@ -278,6 +331,12 @@ export class SecurityAssessor extends BaseAssessor { ); for (const tool of toolsToTest) { + // Classify tool once for filtering decisions + const toolClassification = new ToolClassifier().classify( + tool.name, + tool.description, + ); + // Tools with no 
input parameters can't be exploited via payload injection // Add passing results so they appear in the UI if (!this.hasInputParameters(tool)) { @@ -312,6 +371,23 @@ export class SecurityAssessor extends BaseAssessor { // Test with each critical pattern for (const attackPattern of basicPatterns) { + // Skip calculator injection on non-calculator tools (major FP reduction) + if ( + attackPattern.attackName === "Calculator Injection" && + !toolClassification.categories.includes(ToolCategory.CALCULATOR) + ) { + continue; + } + + // Skip path traversal on HTTP/SSE transport (no filesystem to traverse) + if ( + attackPattern.attackName === "Path Traversal" && + context.transportType && + context.transportType !== "stdio" + ) { + continue; + } + // Get only the FIRST (most generic) payload for basic testing const allPayloads = getPayloadsForAttack(attackPattern.attackName); const payload = allPayloads[0]; // Just use first payload @@ -439,6 +515,12 @@ export class SecurityAssessor extends BaseAssessor { payload, ); + // Classify tool for audit-mode output + const classification = new ToolClassifier().classify( + tool.name, + tool.description, + ); + return { testName: attackName, description: payload.description, @@ -449,6 +531,12 @@ export class SecurityAssessor extends BaseAssessor { evidence, response: this.extractResponseContent(response), ...confidenceResult, + // Audit-mode fields + vulnerableHighConfidence: + isVulnerable && + (confidenceResult.confidence === "high" || + !confidenceResult.confidence), + toolCategory: classification.categories[0] || "generic", }; } catch (error) { // Check if error is a connection/server failure @@ -1017,6 +1105,86 @@ export class SecurityAssessor extends BaseAssessor { return vulnerabilities; } + /** + * Compute audit analysis for automated consumption + * Pre-computes false positive likelihood and response uniformity per tool + */ + private computeAuditAnalysis(validTests: SecurityTestResult[]): { + highConfidenceVulnerabilities: 
string[]; + needsReview: string[]; + falsePositiveLikelihood: Record<string, "HIGH" | "MEDIUM" | "LOW">; + responseUniformity: Record< + string, + { uniqueResponses: number; totalTests: number } + >; + } { + const highConfVulns: string[] = []; + const needsReview: string[] = []; + const fpLikelihood: Record<string, "HIGH" | "MEDIUM" | "LOW"> = {}; + const uniformity: Record< + string, + { uniqueResponses: number; totalTests: number } + > = {}; + + // Group vulnerable tests by tool + const vulnByTool: Record<string, SecurityTestResult[]> = {}; + for (const test of validTests) { + if (test.vulnerable) { + const name = test.toolName || "unknown"; + if (!vulnByTool[name]) vulnByTool[name] = []; + vulnByTool[name].push(test); + } + } + + for (const [toolName, tests] of Object.entries(vulnByTool)) { + // High confidence vulnerabilities + const highConf = tests.filter( + (t) => !t.confidence || t.confidence === "high", + ); + if (highConf.length > 0) { + highConfVulns.push( + `${toolName}: ${highConf.map((t) => t.testName).join(", ")}`, + ); + } + + // Needs review (medium/low confidence) + const reviewNeeded = tests.filter( + (t) => t.confidence === "medium" || t.confidence === "low", + ); + if (reviewNeeded.length > 0) { + needsReview.push( + `${toolName}: ${reviewNeeded.map((t) => t.testName).join(", ")}`, + ); + } + + // Response uniformity analysis (over this tool's vulnerable tests only) + const responses = tests.map((t) => + (t.response || "").trim().substring(0, 200), + ); + const uniqueResponses = new Set(responses).size; + uniformity[toolName] = { + uniqueResponses, + totalTests: tests.length, + }; + + // FP likelihood based on response uniformity + if (uniqueResponses === 1 && tests.length >= 2) { + fpLikelihood[toolName] = "HIGH"; + } else if (uniqueResponses < tests.length / 2 && tests.length >= 4) { + fpLikelihood[toolName] = "MEDIUM"; + } else { + fpLikelihood[toolName] = "LOW"; + } + } + + return { + highConfidenceVulnerabilities: highConfVulns, + needsReview, + falsePositiveLikelihood: fpLikelihood, + responseUniformity: uniformity, + }; + } + /** * Determine overall risk level */ diff --git 
a/client/src/test/utils/securityPatternFactory.ts b/client/src/test/utils/securityPatternFactory.ts index 6996bfb9..86d270d0 100644 --- a/client/src/test/utils/securityPatternFactory.ts +++ b/client/src/test/utils/securityPatternFactory.ts @@ -1,4 +1,4 @@ -// Security pattern test factories for all 17 patterns +// Security pattern test factories for all 13 patterns export interface SecurityTestCase { pattern: string; diff --git a/docs/ENHANCED_TESTING_IMPLEMENTATION.md b/docs/ENHANCED_TESTING_IMPLEMENTATION.md deleted file mode 100644 index 6ec7cb09..00000000 --- a/docs/ENHANCED_TESTING_IMPLEMENTATION.md +++ /dev/null @@ -1,178 +0,0 @@ -# Enhanced Functionality Testing Implementation - -> **📌 Note (2025-10-06)**: As of this date, comprehensive multi-scenario testing is now the **default and only** testing mode. The `enableEnhancedTesting` configuration option has been removed. This document describes the current testing methodology. - -## Overview - -This document describes the comprehensive testing methodology used by MCP Inspector for functionality validation, which uses multi-scenario testing and response validation to ensure true functionality beyond simple connectivity checks. - -## Problem Statement - -The original testing had critical issues: - -1. **Superficial Test Data**: Always used "test_value" for strings, empty arrays/objects -2. **False Positives**: Any successful response marked as "working" regardless of content -3. **Single Scenario**: Each tool tested only once with minimal parameters -4. **No Validation**: No verification that tools actually perform their intended functions - -## Solution Architecture - -### New Components Created - -#### 1. TestDataGenerator.ts - -**Purpose**: Generates realistic, context-aware test data based on parameter schemas - -**Key Features**: - -- Context-aware data generation (URLs, emails, paths, queries, etc.) 
-- Multiple test scenarios per tool (happy path, edge cases, boundaries, error cases) -- Realistic data pools for different types -- Support for special characters, unicode, and extreme values - -**Example**: - -```typescript -// Instead of "test_value" for all strings: -- URL fields → "https://api.github.com/repos/microsoft/vscode" -- Email fields → "user@example.com" -- Query fields → "SELECT * FROM users WHERE active = true" -- ID fields → "user_123456" or UUID format -``` - -#### 2. ResponseValidator.ts - -**Purpose**: Validates tool responses for actual functionality, not just connectivity - -**Key Features**: - -- Structure validation (response has expected format) -- Content validation (response contains meaningful data) -- Semantic validation (response relates to input) -- Tool-specific logic validation (search tools return results, etc.) -- Classification system: - - `fully_working`: All validations pass - - `partially_working`: Some validations pass - - `connectivity_only`: Responds but doesn't function - - `broken`: Fails to respond properly - -**Validation Checks**: - -1. Response structure matches expectations -2. Content is meaningful (not just echoing input) -3. Semantic correctness (output relates to input) -4. Tool-specific patterns (database ops, file ops, API calls) - -#### 3. 
TestScenarioEngine.ts - -**Purpose**: Orchestrates comprehensive testing with multiple scenarios per tool - -**Key Features**: - -- Generates 5-20 scenarios per tool based on complexity -- Covers different test categories: - - Happy path (typical usage) - - Edge cases (empty values, special characters) - - Boundary values (min/max values) - - Error cases (invalid inputs) -- Statistical confidence scoring -- Detailed recommendations for improvements - -**Test Coverage**: - -```typescript -// Each tool gets multiple scenarios: -Tool: search_database -├── Happy Path - Typical query -├── Edge Case - Empty search term -├── Edge Case - Special characters -├── Boundary - Maximum query length -└── Error Case - Invalid SQL syntax -``` - -### Integration with Existing Code - -#### assessmentService.ts Updates - -- Enhanced `generateTestValue()` method with TestDataGenerator integration -- New `assessFunctionalityEnhanced()` method for multi-scenario testing -- Backward compatibility maintained with configuration flag -- Comprehensive result reporting with confidence scores - -#### Type Definitions (assessmentTypes.ts) - -- Added `EnhancedToolTestResult` interface with detailed metrics -- Configuration option `enableEnhancedTesting` to toggle new functionality -- Extended validation coverage metrics - -## Usage - -### Enable Enhanced Testing - -```typescript -const config: AssessmentConfiguration = { - // ... 
other config - enableEnhancedTesting: true, // Enable multi-scenario testing - scenariosPerTool: 10, // Max scenarios per tool (optional) -}; -``` - -### Result Structure - -```typescript -{ - toolName: "search_items", - status: "partially_working", // More nuanced than just "working"/"broken" - confidence: 72, // Statistical confidence score - scenariosExecuted: 8, - scenariosPassed: 5, - validationSummary: { - happyPathSuccess: true, // Basic functionality works - edgeCasesHandled: 2/3, // Some edge cases fail - errorHandlingWorks: false, // Doesn't validate inputs properly - }, - recommendations: [ - "Improve error handling - tool doesn't properly validate inputs", - "Handle edge cases better - 1 edge case(s) failed" - ] -} -``` - -## Impact - -### Before (Superficial Testing) - -- **Test Input**: `{ query: "test_value" }` -- **Any Response**: Marked as "working" -- **Coverage**: 1 test per tool -- **Result**: 100% tools "working" (false positive) - -### After (Comprehensive Testing) - -- **Test Inputs**: Multiple realistic scenarios - - `{ query: "SELECT * FROM users WHERE active = true" }` - - `{ query: "" }` (edge case) - - `{ query: "'; DROP TABLE users; --" }` (security test) -- **Response Validation**: Checks actual functionality -- **Coverage**: 5-20 tests per tool -- **Result**: Realistic assessment with confidence scores - -## Benefits - -1. **Accuracy**: Real functionality validation, not just connectivity -2. **Confidence**: Statistical confidence scores instead of binary pass/fail -3. **Actionable Feedback**: Specific recommendations for improvements -4. **MCP Compliance**: True validation for directory submission requirements -5. **Developer Value**: Helps developers understand what actually needs fixing - -## Future Enhancements - -1. **Parallel Testing**: Run scenarios in parallel for faster execution -2. **Custom Scenarios**: Allow users to define custom test scenarios -3. **Historical Comparison**: Track improvement over time -4. 
**Performance Metrics**: Add response time and resource usage tracking -5. **Integration Testing**: Test tool interactions and dependencies - -## Conclusion - -This enhancement transforms MCP Inspector from a simple connectivity checker to a comprehensive functionality validator. It provides developers with accurate, actionable insights about their MCP server implementations, ensuring tools actually work as intended before directory submission. diff --git a/docs/ENHANCEMENT_IMPLEMENTATION_PLAN.md b/docs/ENHANCEMENT_IMPLEMENTATION_PLAN.md deleted file mode 100644 index c5fb5563..00000000 --- a/docs/ENHANCEMENT_IMPLEMENTATION_PLAN.md +++ /dev/null @@ -1,264 +0,0 @@ -# MCP Inspector Assessment Enhancement Implementation Plan - -## Upgrading to June 2025 MCP Protocol Standards - -### Overview - -This document captures the complete implementation plan for enhancing the MCP Inspector assessment methodology from 5 core requirements to 10 comprehensive assessment categories with 17 security attack patterns. 
- -## Current State (Before Enhancement) - -- **5 Core Categories**: Functionality, Security, Documentation, Error Handling, Usability -- **8 Security Patterns**: Direct Command Injection, Role Override, Data Exfiltration, Context Escape, Instruction Confusion, Unicode Bypass, Nested Injection, System Command -- **Architecture**: Monolithic MCPAssessmentService class -- **Testing**: Sequential only, no parallel execution - -## Target State (After Enhancement) - -- **10 Assessment Categories**: - - Original 5 + MCP Spec Compliance, Supply Chain Security, Dynamic Security, Privacy Compliance, Human-in-the-Loop -- **17 Security Patterns**: - - Original 8 + Tool Shadowing, Metadata Exfiltration, Package Squatting, Indirect Prompt Injection, Configuration Drift, Sandbox Escape, Tool Poisoning, Rug Pull, Confused Deputy -- **Architecture**: Modular AssessmentOrchestrator with pluggable assessors -- **Testing**: Parallel execution support, performance optimization - -## Implementation Waves - -### Wave 1: Core Infrastructure (COMPLETED ✅) - -**Objective**: Refactor architecture for extensibility - -#### Completed Tasks: - -1. **Updated Type Definitions** (`/client/src/lib/assessmentTypes.ts`) - - Added new assessment category interfaces - - Expanded PROMPT_INJECTION_TESTS from 8 to 17 patterns - - Added configuration for extended assessments - - Added parallel testing configuration - -2. **Created Modular Architecture** - - `AssessmentOrchestrator.ts`: Main coordinator class - - `BaseAssessor.ts`: Abstract base class for all assessors - - Individual assessor modules in `/client/src/services/assessment/modules/` - -3. 
**Implemented Core Assessors** - - `FunctionalityAssessor.ts`: Tool functionality testing - - `SecurityAssessor.ts`: 17 security pattern testing - - `DocumentationAssessor.ts`: Documentation quality evaluation - - `ErrorHandlingAssessor.ts`: Error handling validation - - `UsabilityAssessor.ts`: Usability metrics assessment - -### Wave 2: Extended Assessment Modules (COMPLETED ✅) - -**Objective**: Implement new assessment categories per June 2025 spec - -#### Completed: - -1. **MCPSpecComplianceAssessor** ✅ - - Transport compliance (Streamable HTTP vs SSE) - - OAuth resource server validation - - Annotation support testing - - Streaming protocol detection - -2. **SupplyChainAssessor** ✅ - - Dependency vulnerability scanning - - Package integrity verification - - SBOM generation - - License compliance checking - - Typosquatting detection with Levenshtein distance - -3. **DynamicSecurityAssessor** ✅ - - Runtime behavior monitoring - - Input fuzzing tests (10 test categories) - - Sandbox escape detection - - Memory leak detection - - Anomaly scoring (0-100 scale) - -4. **PrivacyComplianceAssessor** ✅ - - PII detection and classification (13 patterns) - - GDPR compliance checking (5 requirements) - - CCPA compliance checking (4 requirements) - - Data retention policy validation - - Encryption validation (at rest & in transit) - -5. **HumanInLoopAssessor** ✅ - - Review mechanism detection (pre/post/continuous) - - Override capability testing (cancel/modify/revert/pause) - - Transparency feature validation (explainability/audit/decisions/confidence) - - Audit trail verification (comprehensive/immutable/searchable) - - Emergency control testing (kill switch/safe mode/fallback/manual override) - -### Wave 3: UI/UX Enhancement (COMPLETED ✅) - -**Objective**: Update UI to display new assessment categories - -#### Completed Tasks: - -1. 
**Updated AssessmentTab Component** ✅ - - Added configuration checkboxes for enabling extended assessment - - Integrated all 10 assessment categories - - Added conditional rendering based on configuration - -2. **Created Extended Assessment Category Components** ✅ - - `ExtendedAssessmentCategories.tsx`: Modular display components for 5 new categories - - MCPSpecComplianceDisplay: Protocol compliance visualization - - SupplyChainDisplay: Dependency and vulnerability visualization - - DynamicSecurityDisplay: Runtime security metrics display - - PrivacyComplianceDisplay: Privacy and regulatory compliance UI - - HumanInLoopDisplay: Human oversight features display - -3. **Implemented Category Filtering System** ✅ - - `AssessmentCategoryFilter.tsx`: Interactive filter component - - Toggle individual categories on/off - - Select all/deselect all functionality - - Separate core and extended category groups - - Real-time filter application - -4. **Enhanced UI Components** ✅ - - Added Progress component for visual metrics - - Added Badge component for status indicators - - Expandable sections with JSON view toggle - - Rich visualization of assessment results - -5. **Export Functionality Enhanced** ✅ - - Updated text report generation to include extended categories - - JSON export includes all 10 categories - - Category-aware export based on configuration - -### Wave 4: Testing & Validation (TODO) - -**Objective**: Comprehensive test coverage - -#### Tasks: - -1. **Update Test Suite** - - ```typescript - // Test files to update: - - assessmentService.test.ts: Add tests for orchestrator - - Add individual test files for each assessor - - Test all 17 security patterns - - Test parallel execution - ``` - -2. **Integration Testing** - - End-to-end assessment workflow - - Performance benchmarking - - Error recovery testing - -3. 
**Validation Testing** - - Test against known vulnerable servers - - Test against compliant servers - - Edge case testing - -### Wave 5: Documentation & Migration (TODO) - -**Objective**: Complete documentation and migration support - -#### Tasks: - -1. **Update ASSESSMENT_METHODOLOGY.md** - - Document all 17 security patterns - - Explain new assessment categories - - Add examples and best practices - -2. **Create Migration Guide** - - Breaking changes documentation - - Configuration migration steps - - API compatibility notes - -3. **Create User Guide** - - How to enable extended assessments - - Interpreting new metrics - - Customization options - -## Technical Implementation Details - -### New Security Patterns (9 Additional) - -1. **Tool Shadowing**: Create fake tool with same name to intercept calls -2. **Metadata Exfiltration**: Extract system metadata through tool parameters -3. **Package Squatting**: Reference typosquatted package names -4. **Indirect Prompt Injection**: Inject through external data sources -5. **Configuration Drift**: Modify tool configuration during runtime -6. **Sandbox Escape**: Attempt to break out of execution sandbox -7. **Tool Poisoning**: Corrupt tool behavior for future invocations -8. **Rug Pull**: Change behavior after gaining trust -9. 
**Confused Deputy**: Trick tool into acting on behalf of attacker - -### Configuration Schema - -```typescript -interface AssessmentConfiguration { - // Core settings - autoTest: boolean; - testTimeout: number; - skipBrokenTools: boolean; - verboseLogging: boolean; - generateReport: boolean; - saveEvidence: boolean; - - // Extended settings - enableExtendedAssessment?: boolean; - parallelTesting?: boolean; - maxParallelTests?: number; - mcpProtocolVersion?: string; - assessmentCategories?: { - functionality: boolean; - security: boolean; - documentation: boolean; - errorHandling: boolean; - usability: boolean; - // New categories - mcpSpecCompliance?: boolean; - supplyChain?: boolean; - dynamicSecurity?: boolean; - privacy?: boolean; - humanInLoop?: boolean; - }; -} -``` - -### Integration Points - -1. **Backward Compatibility**: Original 5 categories work without changes -2. **Progressive Enhancement**: New categories are opt-in via configuration -3. **API Compatibility**: Existing integrations continue to work -4. **UI Graceful Degradation**: UI handles missing new data gracefully - -## Success Metrics - -- ✅ All 17 security patterns implemented and tested -- ✅ Modular architecture allows easy addition of new assessors -- ⏳ 10 assessment categories fully functional -- ⏳ Parallel testing reduces assessment time by 40% -- ⏳ Zero breaking changes for existing users -- ⏳ Comprehensive documentation and examples - -## Risk Mitigation - -1. **Performance Impact**: Mitigated by parallel execution and caching -2. **Breaking Changes**: Avoided through backward compatibility design -3. **Complexity**: Managed through modular architecture -4. **Testing Coverage**: Addressed through comprehensive test suite - -## Next Steps - -1. Complete remaining Wave 2 assessors (Supply Chain, Dynamic Security, Privacy, Human-in-Loop) -2. Implement Wave 3 UI enhancements -3. Create comprehensive test coverage (Wave 4) -4. Update all documentation (Wave 5) -5. 
Performance optimization and caching -6. Beta testing with real MCP servers - -## Notes - -- All new features are opt-in to maintain backward compatibility -- Parallel testing is disabled by default to ensure stability -- Extended assessments can be enabled per-category for granular control -- The architecture supports future addition of more assessment categories - ---- - -_Last Updated: 2025-09-10_ -_Status: Wave 1 Complete ✅, Wave 2 Complete ✅, Wave 3 Complete ✅, Wave 4 Ready to Start_ diff --git a/docs/ERROR_HANDLING_TEST_METHODOLOGY.md b/docs/ERROR_HANDLING_TEST_METHODOLOGY.md deleted file mode 100644 index a290838b..00000000 --- a/docs/ERROR_HANDLING_TEST_METHODOLOGY.md +++ /dev/null @@ -1,187 +0,0 @@ -# Error Handling Test Methodology Verification - -## Overview - -This document verifies that the Error Handling assessment in MCP Inspector aligns with the MCP protocol requirements and JSON-RPC 2.0 specifications. - -## MCP Protocol Requirements - -### 1. JSON-RPC 2.0 Compliance - -**Requirement**: All messages between MCP clients and servers MUST follow the JSON-RPC 2.0 specification. - -**Our Testing**: - -- ✅ We check for proper error responses (`isErrorResponse()` method) -- ✅ We extract error codes and messages (`extractErrorInfo()` method) -- ✅ We validate error response structure - -### 2. Standard Error Codes - -**Requirement**: MCP uses standard JSON-RPC 2.0 error codes (-32768 to -32000): - -- Parse Error (-32700) -- Invalid Request (-32600) -- Method Not Found (-32601) -- Invalid Params (-32602) -- Internal Error (-32603) - -**Our Testing**: - -- ✅ Invalid Params: Tested via `testWrongTypes()` and `testMissingParameters()` -- ⚠️ Parse Error: Not directly tested (handled at transport layer) -- ⚠️ Method Not Found: Not tested in current implementation -- ⚠️ Invalid Request: Not directly tested -- ✅ Internal Error: Caught in error handlers - -### 3. 
Input Validation Requirements - -**Requirement**: Strict validation against protocol specification including structure, field consistency, and type safety. - -**Our Testing**: - -- ✅ **Missing Required Parameters**: `testMissingParameters()` sends empty params -- ✅ **Wrong Type Validation**: `testWrongTypes()` sends incorrect types (number for string, etc.) -- ✅ **Invalid Values**: `testInvalidValues()` tests enum violations, format violations -- ✅ **Excessive Input**: `testExcessiveInput()` tests 100KB string inputs - -### 4. Error Response Structure - -**Requirement**: Error responses must include: - -```json -{ - "jsonrpc": "2.0", - "id": "...", - "error": { - "code": number, - "message": string, - "data": optional - } -} -``` - -**Our Testing**: - -- ✅ Checks for error code presence (`hasProperErrorCodes`) -- ✅ Validates error messages (`hasDescriptiveMessages`) -- ✅ Captures full error structure in test details - -### 5. Tool Execution Errors - -**Requirement**: Tool execution failures may return successful response with `isError` flag. - -**Our Testing**: - -- ✅ Handles both patterns: - - Standard JSON-RPC errors (caught in catch blocks) - - Tool-specific errors with `isError` flag (checked in `isErrorResponse()`) - -## Test Scenarios Coverage - -### Current Test Coverage (4 scenarios per tool): - -1. **Missing Required Parameters** ✅ - - Tests: Empty parameter object - - Validates: Required field validation - - Expected: Error with "required" in message - -2. **Wrong Parameter Types** ✅ - - Tests: Incorrect types for each field type - - Validates: Type checking - - Expected: Error with "type" or "invalid" in message - -3. **Invalid Parameter Values** ✅ - - Tests: Out-of-range values, invalid formats - - Validates: Value constraints - - Expected: Error response - -4. **Excessive Input Size** ✅ - - Tests: 100KB string inputs - - Validates: Input size limits - - Expected: Error or graceful handling - -### Recommended Additional Test Scenarios: - -1. 
**Extra Parameters** (Not currently tested) - - Test: Send unexpected additional fields - - Validates: Strict schema enforcement - - Expected: Rejection of unknown parameters - -2. **Null/Undefined Values** (Partially tested) - - Test: Send null for non-nullable fields - - Validates: Null handling - - Expected: Appropriate error response - -3. **Nested Object Validation** (Not tested) - - Test: Invalid nested structures - - Validates: Deep validation - - Expected: Error identifying nested issues - -4. **Method Not Found** (Not tested) - - Test: Call non-existent tool - - Validates: Method existence checking - - Expected: -32601 error code - -## Scoring Methodology - -### Current Scoring: - -- Tests up to 5 tools -- 4 test scenarios per tool -- Maximum 20 tests total -- Score = (passed tests / total tests) \* 100 - -### Quality Assessment: - -- **Excellent**: ≥90% pass rate -- **Good**: ≥70% pass rate -- **Fair**: ≥50% pass rate -- **Poor**: <50% pass rate - -## Compliance Assessment - -### Strengths: - -1. ✅ Comprehensive input validation testing -2. ✅ Proper error detection for both JSON-RPC and tool-specific patterns -3. ✅ Detailed test reporting with actual vs expected -4. ✅ Actionable recommendations based on failures - -### Areas for Enhancement: - -1. ⚠️ Add testing for method not found scenarios -2. ⚠️ Test for extra parameter rejection -3. ⚠️ Validate specific JSON-RPC error codes (-32602 for invalid params) -4. ⚠️ Test nested object validation - -## Conclusion - -The current Error Handling assessment methodology is **substantially compliant** with MCP protocol requirements. 
It effectively tests the core validation scenarios required by the specification: - -- ✅ Parameter type validation -- ✅ Required field validation -- ✅ Value constraint validation -- ✅ Input size handling -- ✅ Error response structure validation - -The testing methodology provides valuable insights into server error handling capabilities and generates actionable recommendations for improvement. While there are opportunities to enhance coverage (particularly around method existence and extra parameter handling), the current implementation effectively assesses the critical error handling requirements of the MCP protocol. - -## Recommendations for Users - -When reviewing Error Handling assessment results: - -1. **Look for patterns**: If all "wrong type" tests fail, the server likely lacks type validation -2. **Check error quality**: Servers should provide clear error codes and descriptive messages -3. **Verify critical paths**: Missing required parameter validation is a critical security issue -4. **Consider the context**: Some servers may intentionally accept flexible inputs - -## Next Steps - -To further enhance the Error Handling assessment: - -1. Add "Method Not Found" test scenario -2. Add "Extra Parameters" test scenario -3. Enhance error code validation to check for specific JSON-RPC codes -4. Add nested object validation tests -5. Consider adding performance impact tests (response time under error conditions) diff --git a/docs/FUNCTIONALITY_TESTING_ANALYSIS.md b/docs/FUNCTIONALITY_TESTING_ANALYSIS.md deleted file mode 100644 index e7123705..00000000 --- a/docs/FUNCTIONALITY_TESTING_ANALYSIS.md +++ /dev/null @@ -1,350 +0,0 @@ -# Functionality Testing Analysis - MCP Inspector - -## Executive Summary - -This document analyzes the current functionality testing approach in MCP Inspector, evaluates its comprehensiveness, and identifies areas for improvement. 
Based on examination of the test results and implementation, our functionality testing is currently **basic and insufficient** for comprehensive MCP server validation. - -## Current Testing Approach - -### 1. Test Parameter Generation - -The current implementation uses overly simplistic test parameters: - -```typescript -// Current approach in generateTestValue(): -- Strings: Always "test_value" (or "https://example.com" for URLs) -- Numbers: Always minimum value or 1 -- Booleans: Always true -- Arrays: Always empty [] -- Objects: Always empty {} -``` - -**Problems:** - -- **No Realistic Data**: Using "test_value" for all strings doesn't test real-world scenarios -- **Empty Collections**: Empty arrays/objects don't test iteration or processing logic -- **Minimal Values**: Always using minimum numbers doesn't test range handling -- **No Variation**: Same values for every test means limited coverage - -### 2. Single Test Per Tool - -Currently, each tool is tested only once with minimal parameters: - -```typescript -// From testTool(): -const testParams = this.generateTestParameters(tool); -const result = await callTool(tool.name, testParams); -``` - -**Problems:** - -- **No Edge Cases**: Doesn't test boundary conditions -- **No Valid Variations**: Doesn't test different valid input combinations -- **No Performance Testing**: Doesn't test with realistic data volumes -- **No State Testing**: Doesn't test sequential operations or state dependencies - -### 3. Response Validation - -The current validation is binary - either "working" or "broken": - -```typescript -// Current classification: -if (result.isError) { - // Tool responds with error = "working" (incorrect!) 
- return { status: "working", response: result }; -} else { - // Tool responds without error = "working" - return { status: "working", response: result }; -} -``` - -**Problems:** - -- **Error Responses Marked as Working**: Tools that return errors with valid test data are incorrectly marked as "working" -- **No Response Content Validation**: Doesn't verify if response contains expected data -- **No Schema Validation**: Doesn't check if response matches expected format -- **No Semantic Validation**: Doesn't verify if response makes logical sense - -## Test Results Analysis - -Looking at the provided test results for the Notion MCP server: - -### What's Being Tested - -All 14 tools are being called with minimal, often invalid parameters: - -- `search`: Using "test_value" for query (gets validation error) -- `fetch`: Using "test_value" for ID (gets validation error) -- `create-pages`: Using empty arrays and null parent (gets validation error) -- `update-page`: Using null data (gets validation error) - -### False Positives - -**100% coverage reported, but this is misleading because:** - -1. Tools returning errors are counted as "working" -2. No verification that tools actually perform their intended function -3. No testing with valid, realistic data -4. No testing of successful operations - -### Real Coverage - -- **Connectivity**: ✅ Yes, we verify tools respond -- **Error Handling**: ✅ Yes, we see they validate input -- **Core Functionality**: ❌ No, we don't test if tools actually work -- **Data Processing**: ❌ No, we don't test with real data -- **Business Logic**: ❌ No, we don't verify correct behavior - -## Critical Gaps Identified - -### 1. Lack of Valid Input Testing - -**Current**: Only testing with invalid/minimal inputs -**Needed**: Test with realistic, valid data that should succeed - -### 2. 
No Success Path Validation - -**Current**: Not verifying tools can complete their primary function -**Needed**: Verify tools can successfully execute their intended operations - -### 3. Missing Response Validation - -**Current**: Only checking if response exists -**Needed**: Validate response content, structure, and correctness - -### 4. No Comprehensive Scenarios - -**Current**: Single test per tool -**Needed**: Multiple scenarios per tool covering different use cases - -### 5. No Contextual Testing - -**Current**: Tools tested in isolation -**Needed**: Test tools that depend on each other (e.g., create then fetch) - -## Recommendations for Improvement - -### 1. Enhanced Test Data Generation - -```typescript -// Proposed improvements: -private generateRealisticTestValue(schema: SchemaProperty, fieldName: string): unknown { - switch (schema.type) { - case "string": - // Use realistic values based on field name and format - if (fieldName.includes("id")) return this.generateUUID(); - if (fieldName.includes("name")) return this.generateName(); - if (fieldName.includes("description")) return this.generateDescription(); - if (fieldName.includes("content")) return this.generateContent(); - if (schema.format === "uri") return this.generateValidURL(); - if (schema.format === "email") return this.generateValidEmail(); - if (schema.pattern) return this.generateFromPattern(schema.pattern); - return this.generateRealisticString(schema); - - case "array": - // Generate non-empty arrays with valid items - const itemCount = Math.min(3, schema.minItems || 1); - return Array(itemCount).fill(null).map(() => - this.generateRealisticTestValue(schema.items, `${fieldName}_item`) - ); - - case "object": - // Generate complete objects with all required properties - return this.generateCompleteObject(schema); - } -} -``` - -### 2. 
Multiple Test Scenarios Per Tool - - ```typescript -// Proposed test scenarios: -interface TestScenario { - name: string; - description: string; - generateParams: () => Record<string, unknown>; - validateResponse: (response: any) => ValidationResult; - priority: "critical" | "important" | "nice-to-have"; -} - -// For each tool, define multiple scenarios: -const scenarios: TestScenario[] = [ - { - name: "minimal_valid", - description: "Test with minimum required valid parameters", - generateParams: () => this.generateMinimalValidParams(tool), - validateResponse: (res) => this.validateBasicSuccess(res), - priority: "critical", - }, - { - name: "typical_use", - description: "Test with typical real-world parameters", - generateParams: () => this.generateTypicalParams(tool), - validateResponse: (res) => this.validateExpectedOutput(res), - priority: "critical", - }, - { - name: "maximum_valid", - description: "Test with maximum valid parameters", - generateParams: () => this.generateMaximalParams(tool), - validateResponse: (res) => this.validateCompleteResponse(res), - priority: "important", - }, - { - name: "edge_cases", - description: "Test boundary conditions and edge cases", - generateParams: () => this.generateEdgeCaseParams(tool), - validateResponse: (res) => this.validateEdgeCaseBehavior(res), - priority: "important", - }, -]; -``` - -### 3. 
Response Validation Framework - -```typescript -// Proposed response validation: -interface ResponseValidation { - checkStructure: boolean; // Response matches expected schema - checkContent: boolean; // Response contains expected data - checkSemantics: boolean; // Response makes logical sense - checkPerformance: boolean; // Response time is acceptable - checkSideEffects: boolean; // Tool performed expected actions -} - -private validateToolResponse( - tool: Tool, - scenario: TestScenario, - response: any -): ValidationResult { - const checks = { - structure: this.validateResponseSchema(response, tool.outputSchema), - content: this.validateResponseContent(response, scenario), - semantics: this.validateResponseLogic(response, scenario), - performance: this.validateResponseTime(response.executionTime), - sideEffects: this.validateExpectedChanges(tool, scenario, response) - }; - - return { - passed: Object.values(checks).every(c => c.passed), - checks, - details: this.generateValidationReport(checks) - }; -} -``` - -### 4. Contextual Testing Strategy - -```typescript -// Test tools in realistic sequences: -interface ContextualTest { - name: string; - tools: string[]; - workflow: WorkflowStep[]; - validateOutcome: () => ValidationResult; -} - -const contextualTests: ContextualTest[] = [ - { - name: "create_and_fetch", - tools: ["create", "fetch"], - workflow: [ - { tool: "create", params: {...}, saveResult: "created_id" }, - { tool: "fetch", params: { id: "${created_id}" } } - ], - validateOutcome: () => this.validateCreateFetchConsistency() - }, - { - name: "search_and_update", - tools: ["search", "update"], - workflow: [ - { tool: "search", params: {...}, saveResult: "found_items" }, - { tool: "update", params: { id: "${found_items[0].id}", ... } } - ], - validateOutcome: () => this.validateSearchUpdateFlow() - } -]; -``` - -### 5. 
Test Classification Matrix - -```typescript -// Properly classify test results: -enum ToolStatus { - FULLY_FUNCTIONAL = "fully_functional", // All scenarios pass - PARTIALLY_FUNCTIONAL = "partially_functional", // Some scenarios pass - BASIC_FUNCTIONAL = "basic_functional", // Only responds, not verified - NON_FUNCTIONAL = "non_functional", // Doesn't respond properly - UNTESTED = "untested", // Couldn't test -} - -interface FunctionalityScore { - status: ToolStatus; - scenariosPassed: number; - scenariosTotal: number; - criticalPassed: boolean; - detailedResults: ScenarioResult[]; - confidence: number; // 0-100% confidence in assessment -} -``` - -## Implementation Priority - -### Phase 1: Critical Improvements (Immediate) - -1. **Fix response classification**: Stop marking error responses as "working" -2. **Add valid input generation**: Create realistic test data -3. **Implement basic response validation**: Check for expected content - -### Phase 2: Enhanced Testing (Short-term) - -1. **Multiple scenarios per tool**: Test different use cases -2. **Response schema validation**: Verify output structure -3. **Performance metrics**: Track response times - -### Phase 3: Comprehensive Testing (Medium-term) - -1. **Contextual workflows**: Test tool interactions -2. **State management testing**: Verify stateful operations -3. **Load testing**: Test with realistic data volumes - -### Phase 4: Advanced Testing (Long-term) - -1. **Fuzzing**: Automatically generate edge cases -2. **Property-based testing**: Verify invariants -3. 
**Regression testing**: Track changes over time - -## Metrics for Success - -### Current Metrics (Misleading) - -- Coverage: 100% (but superficial) -- Pass Rate: 100% (but incorrectly classified) - -### Proposed Metrics (Meaningful) - -- **True Functional Coverage**: % of tools that complete primary function -- **Scenario Coverage**: % of critical scenarios tested -- **Response Validity**: % of responses that match expected output -- **Confidence Score**: Statistical confidence in results -- **Test Depth**: Average scenarios tested per tool - -## Conclusion - -The current functionality testing in MCP Inspector provides **minimal value** beyond basic connectivity checking. While it reports 100% coverage, this is misleading as it: - -1. **Doesn't test actual functionality** - only tests that tools respond -2. **Uses unrealistic test data** - "test_value" everywhere -3. **Misclassifies results** - errors marked as "working" -4. **Lacks depth** - single test per tool -5. **Missing validation** - no verification of response content - -To provide meaningful functionality assessment for MCP directory submissions, we need to implement: - -- Realistic test data generation -- Multiple test scenarios per tool -- Proper response validation -- Contextual testing workflows -- Accurate result classification - -This will transform functionality testing from a basic "does it respond?" check to a comprehensive "does it work correctly?" validation. diff --git a/docs/FUNCTIONALITY_TESTING_ENHANCEMENTS.md b/docs/FUNCTIONALITY_TESTING_ENHANCEMENTS.md deleted file mode 100644 index 25303838..00000000 --- a/docs/FUNCTIONALITY_TESTING_ENHANCEMENTS.md +++ /dev/null @@ -1,305 +0,0 @@ -# Functionality Testing Enhancement Recommendations - -## Current State Analysis - -The MCP Inspector's Functionality tests currently test whether tools are callable, but there are several areas where we can enhance these tests to provide more comprehensive validation of tool functionality. 
- -### Current Implementation - -1. **Simple Testing Mode**: Generates basic test parameters and attempts to call tools -2. **Enhanced Testing Mode**: Uses TestScenarioEngine with multiple scenarios per tool -3. **Test Categories**: Happy path, edge cases, boundary values, and error cases -4. **Timeout Handling**: 5-second default timeout for tool calls - -## Recommended Enhancements - -### 1. Enhanced Tool Callability Verification - -#### 1.1 Connection State Validation - -```typescript -interface ConnectionStateTest { - preConnectionCheck: boolean; // Verify transport is connected - postCallVerification: boolean; // Verify connection remains stable - reconnectionTest: boolean; // Test reconnection after failure -} -``` - -**Implementation**: Add connection state checks before and after tool calls to ensure the transport layer is functioning properly. - -#### 1.2 Progressive Complexity Testing - -Instead of jumping straight to complex test cases, implement a progressive approach: - -```typescript -enum TestComplexity { - MINIMAL = "minimal", // Absolute minimum required params - SIMPLE = "simple", // Single required param with basic value - TYPICAL = "typical", // Common use case with realistic data - COMPLEX = "complex", // All params with nested structures - STRESS = "stress", // Maximum complexity/size allowed -} -``` - -**Benefits**: - -- Identifies at what complexity level a tool starts failing -- Provides granular feedback about tool capabilities -- Helps debug parameter handling issues - -### 2. 
Enhanced Response Validation - -#### 2.1 Response Structure Verification - -```typescript -interface ResponseValidation { - hasContent: boolean; // Response contains actual content - contentType: string; // Type of content returned - isWellFormed: boolean; // Response follows MCP protocol - schemaCompliance: boolean; // Matches expected output schema - performanceMetrics: { - responseTime: number; - contentSize: number; - streamingSupported: boolean; - }; -} -``` - -#### 2.2 Semantic Response Validation - -- Verify responses make sense given the input -- Check for placeholder/mock responses -- Validate that tool actually performed its stated function - -### 3. Improved Parameter Generation - -#### 3.1 Schema-Aware Parameter Generation - -```typescript -class SmartParameterGenerator { - // Generate parameters based on semantic understanding - generateFromDescription(tool: Tool): Record<string, unknown> { - // Parse tool description for hints - // Match parameter names to common patterns - // Use context-aware defaults - } - - // Generate parameters from examples if provided - generateFromExamples(tool: Tool): Record<string, unknown> { - // Extract from tool documentation - // Use inline examples from schema - } - - // Interactive parameter discovery - async discoverParameters(tool: Tool): Promise<Record<string, unknown>> { - // Try minimal params first - // Incrementally add optional params - // Learn from error messages - } -} -``` - -#### 3.2 Domain-Specific Test Data - -Enhance test data pools with domain-specific values: - -```typescript -const DOMAIN_SPECIFIC_DATA = { - filesystem: { - paths: ["/tmp/test.txt", "./README.md", "../config.json"], - operations: ["read", "write", "append", "delete"], - permissions: ["755", "644", "600"], - }, - database: { - queries: ["SELECT * FROM users", "INSERT INTO logs"], - connections: ["postgresql://localhost/db", "mongodb://localhost:27017"], - operations: ["find", "insert", "update", "delete"], - }, - api: { - endpoints: ["/api/v1/users", "/health", "/status"], - methods: ["GET", 
"POST", "PUT", "DELETE"], - headers: { - "Content-Type": "application/json", - Authorization: "Bearer token", - }, - }, -}; -``` - -### 4. Error Recovery and Retry Logic - -#### 4.1 Intelligent Retry Strategy - -```typescript -interface RetryStrategy { - maxAttempts: number; - backoffMultiplier: number; - retryableErrors: string[]; - - shouldRetry(error: Error, attempt: number): boolean; - adjustParameters( - params: Record<string, unknown>, - error: Error, - ): Record<string, unknown>; -} -``` - -#### 4.2 Error Classification - -```typescript -enum ErrorClassification { - TRANSPORT_ERROR = "transport", // Connection/network issues - VALIDATION_ERROR = "validation", // Parameter validation failed - PERMISSION_ERROR = "permission", // Authorization/access denied - RESOURCE_ERROR = "resource", // Resource not found/available - TIMEOUT_ERROR = "timeout", // Operation timed out - UNKNOWN_ERROR = "unknown", // Unclassified error -} -``` - -### 5. Performance and Load Testing - -#### 5.1 Concurrent Call Testing - -```typescript -interface ConcurrencyTest { - simultaneousCalls: number; - successRate: number; - averageResponseTime: number; - maxResponseTime: number; - errorRate: number; -} -``` - -#### 5.2 Sustained Load Testing - -- Test tools with repeated calls over time -- Monitor for memory leaks or degradation -- Verify rate limiting behavior - -### 6. Stateful Testing - -#### 6.1 State Verification - -```typescript -interface StatefulTest { - // Test that tools maintain state correctly - setupState(): Promise<void>; - verifyStateChange(): Promise<boolean>; - cleanupState(): Promise<void>; -} -``` - -#### 6.2 Idempotency Testing - -- Verify that repeated identical calls produce consistent results -- Test for unintended side effects -- Validate transaction semantics - -### 7. 
Enhanced Reporting - -#### 7.1 Detailed Failure Analysis - -```typescript -interface FailureAnalysis { - failurePoint: "connection" | "validation" | "execution" | "response"; - rootCause: string; - suggestedFix: string; - workarounds: string[]; - relatedTools: string[]; // Other tools with similar issues -} -``` - -#### 7.2 Confidence Scoring Improvements - -```typescript -interface EnhancedConfidenceScore { - overall: number; // 0-100 - breakdown: { - connectivity: number; // Can connect and call - reliability: number; // Consistent responses - correctness: number; // Produces expected results - performance: number; // Meets performance targets - errorHandling: number; // Handles errors gracefully - }; - factors: string[]; // What influenced the score -} -``` - -## Implementation Priority - -### Phase 1: Core Enhancements (High Priority) - -1. Progressive complexity testing -2. Enhanced response validation -3. Smart parameter generation -4. Error classification - -### Phase 2: Advanced Features (Medium Priority) - -1. Concurrent call testing -2. Stateful testing -3. Domain-specific test data -4. Retry strategies - -### Phase 3: Optimization (Low Priority) - -1. Performance profiling -2. Load testing -3. Interactive parameter discovery -4. ML-based parameter generation - -## Example Enhanced Test Flow - -```typescript -async function enhancedFunctionalityTest( - tool: Tool, -): Promise { - const results = { - connectivity: await testConnectivity(tool), - minimal: await testMinimalCall(tool), - typical: await testTypicalUsage(tool), - edge: await testEdgeCases(tool), - concurrent: await testConcurrency(tool), - stateful: await testStatefulness(tool), - performance: await measurePerformance(tool), - }; - - return { - ...aggregateResults(results), - confidence: calculateConfidence(results), - recommendations: generateRecommendations(results), - }; -} -``` - -## Benefits of These Enhancements - -1. 
**Better Reliability Assessment**: Know exactly when and why tools fail -2. **Improved Debugging**: Pinpoint issues to specific complexity levels or parameter combinations -3. **Performance Insights**: Understand tool performance characteristics -4. **State Management**: Verify tools handle state correctly -5. **Production Readiness**: Assess if tools can handle real-world usage patterns -6. **Actionable Feedback**: Provide specific recommendations for fixing issues - -## Next Steps - -1. Prioritize which enhancements to implement first -2. Create detailed implementation specs for each enhancement -3. Update test scenarios and validation logic -4. Enhance reporting to show additional metrics -5. Add configuration options for test depth/complexity -6. Document new testing capabilities for users - -## Conclusion - -These enhancements will transform the Functionality tests from simple "can I call this tool?" checks to comprehensive assessments that validate: - -- **Callability**: Can the tool be invoked? -- **Reliability**: Does it work consistently? -- **Correctness**: Does it produce expected results? -- **Performance**: Does it meet performance requirements? -- **Robustness**: Does it handle edge cases and errors well? -- **Scalability**: Can it handle concurrent/sustained load? - -This comprehensive approach will provide MCP server developers with actionable insights to improve their implementations and give users confidence in the tools they're using. diff --git a/docs/MIGRATION_SINGLE_MODE.md b/docs/MIGRATION_SINGLE_MODE.md deleted file mode 100644 index 763c8029..00000000 --- a/docs/MIGRATION_SINGLE_MODE.md +++ /dev/null @@ -1,157 +0,0 @@ -# Migration to Single Comprehensive Testing Mode - -**Date**: 2025-10-06 -**Type**: Breaking Change -**Impact**: Medium (configuration change, no functionality loss) - ---- - -## Summary - -The MCP Inspector has been simplified to use **only comprehensive multi-scenario testing** for all functionality assessments. 
The dual-mode system (standard vs comprehensive) has been removed. - -## What Changed - -### Removed Features - -1. **Configuration Option**: `enableEnhancedTesting` has been removed from `AssessmentConfiguration` -2. **UI Control**: The "Run comprehensive tests (slower but more thorough)" checkbox has been removed from the Assessment tab -3. **Code**: The `assessFunctionalitySimple()` method has been removed from the assessment service - -### New Behavior - -- **All testing is now comprehensive** - Every functionality assessment uses multi-scenario validation -- **No configuration required** - Comprehensive testing is always enabled -- **Simplified UI** - One less configuration option to manage - -## Why This Change? - -Based on extensive analysis documented in `COMPREHENSIVE_TESTING_ANALYSIS.md`: - -1. **Quality**: Comprehensive testing provides 80% reduction in false positives through business logic error detection -2. **Accuracy**: Multi-scenario validation with confidence scoring provides far more reliable results -3. **MCP Directory Requirements**: Anthropic's MCP directory submission requires thorough validation - simple "ping tests" are insufficient -4. **User Feedback**: The simple testing mode was primarily kept for "backward compatibility" but didn't provide meaningful value - -## Migration Guide - -### If You Used Default Settings - -✅ **No action required** - The default was already simple testing, which has been replaced with comprehensive testing. Your assessments will now be more thorough and accurate. - -### If You Enabled Comprehensive Testing - -✅ **No action required** - Your preferred mode is now the default and only mode. - -### If You Have Custom Configuration Code - -#### Before - -```typescript -const config: AssessmentConfiguration = { - // ... other settings - enableEnhancedTesting: true, // ❌ This option no longer exists -}; -``` - -#### After - -```typescript -const config: AssessmentConfiguration = { - // ... 
other settings - // No need to specify testing mode - comprehensive is always used -}; -``` - -### If You Have Saved Configuration Files - -If you have saved configuration JSON files that include `enableEnhancedTesting`, they will continue to work - the option will simply be ignored. No errors will be thrown. - -## Performance Implications - -### Testing Time - -- **Previous (simple mode)**: ~5 seconds per tool -- **New (comprehensive mode)**: ~45-70 seconds per tool - -For a typical 10-tool MCP server: - -- **Previous**: ~50 seconds total -- **New**: 4-8 minutes total - -### Why This Is Acceptable - -1. **Quality over Speed**: Comprehensive testing catches issues that simple testing misses -2. **One-Time Cost**: Assessment is typically run once during development/validation, not continuously -3. **Parallelization**: Future optimizations can reduce time without sacrificing coverage -4. **MCP Directory**: Required thoroughness for Anthropic's directory submission - -## Features You Still Have - -All comprehensive testing features remain available: - -- ✅ Multi-scenario validation (Happy Path, Edge Cases, Boundary Testing, Error Cases) -- ✅ Progressive complexity testing (Minimal → Simple) -- ✅ Business logic error detection -- ✅ Confidence scoring (0-100%) -- ✅ Detailed test reports with recommendations -- ✅ Realistic test data generation -- ✅ Response validation with semantic analysis - -## Troubleshooting - -### "My assessments are taking much longer now" - -This is expected behavior. Comprehensive testing is more thorough and takes longer. If you need faster results: - -1. Use the **tool selector dropdown** in Assessment Configuration to select specific tools for error handling tests (uncheck slow/problematic tools) -2. Run assessments less frequently during development -3. 
Use the CLI for quick tool testing during development, full assessment for validation - -**Note**: As of 2025-10-10, you can visually select which tools to test via a multi-select dropdown with checkboxes instead of using the old numeric `maxToolsToTestForErrors` config. - -### "I preferred the quick smoke tests" - -The previous simple testing mode had significant limitations: - -- High false positive rate (marking broken tools as working) -- No validation of actual functionality -- Insufficient for MCP directory submission -- No confidence scoring or detailed analysis - -For quick smoke tests during development, consider: - -- Using the MCP Inspector's **Tools tab** for individual tool testing -- Running the CLI inspector for rapid iteration -- Running full assessments before commits/releases - -### "Can I get the old behavior back?" - -The simple testing mode has been permanently removed. However, if you need lightweight testing: - -1. Use the **tool selector dropdown** (Assessment Configuration → "Select tools for error handling tests:") to choose which tools to test - - Select specific tools by name with checkboxes - - Search/filter for large tool lists - - Use "Deselect All" to skip error handling entirely (fastest option) -2. The core functionality testing is comprehensive but optimized (removed redundant scenarios) -3. Future updates may include additional optimization while maintaining quality - -**Note**: The old numeric `maxToolsToTestForErrors` config is deprecated but still works for backward compatibility. - -## Related Documentation - -- **Technical Details**: `ENHANCED_TESTING_IMPLEMENTATION.md` -- **Analysis That Led to This Change**: `COMPREHENSIVE_TESTING_ANALYSIS.md` -- **Optimization History**: `COMPREHENSIVE_TESTING_OPTIMIZATION_PLAN.md` - -## Support - -If you have questions or concerns about this change: - -1. Open an issue at https://github.com/triepod-ai/inspector-assessment/issues -2. 
Review the comprehensive testing documentation in the `/docs` folder -3. Check the README for updated configuration examples - ---- - -**Bottom Line**: All testing is now comprehensive, accurate, and aligned with MCP directory requirements. The simplification reduces confusion and ensures consistent, high-quality assessments across all use cases. diff --git a/docs/REVIEWER_QUICK_START.md b/docs/REVIEWER_QUICK_START.md index 7fd555ed..6eb92e47 100644 --- a/docs/REVIEWER_QUICK_START.md +++ b/docs/REVIEWER_QUICK_START.md @@ -36,7 +36,7 @@ 1. **Switch to Developer Mode** - Click mode selector and choose "Developer Mode" - - Enables comprehensive testing (all 18 security patterns instead of 3) + - Enables comprehensive testing (all 13 security patterns instead of 3) - Takes 5-10 minutes vs 1-2 minutes for reviewer mode 2. **Re-run Assessment** @@ -100,13 +100,13 @@ **Testing modes**: - **Reviewer Mode**: 3 critical patterns (48 tests for typical 16-tool server) -- **Developer Mode**: 18 comprehensive patterns (900+ tests for typical 16-tool server) +- **Developer Mode**: 13 comprehensive patterns (900+ tests for typical 16-tool server) **Example - PASS**: ``` ✅ Security: 0 vulnerabilities found -- Patterns tested: 3 (Reviewer) or 18 (Developer) +- Patterns tested: 3 (Reviewer) or 13 (Developer) - Risk level: LOW - All tools properly reject malicious inputs ``` @@ -496,7 +496,7 @@ If you encounter problems or have suggestions: | Feature | Reviewer Mode | Developer Mode | | ------------------------ | --------------------------- | -------------------------------- | | **Speed** | 1-2 minutes | 5-10 minutes | -| **Security Patterns** | 3 critical patterns | 18 comprehensive patterns | +| **Security Patterns** | 3 critical patterns | 13 comprehensive patterns | | **Test Scenarios** | 1 per tool | Multiple scenarios per tool | | **Error Handling Tools** | First 3 tools | All tools | | **MCP Spec Compliance** | Not included | Included (informational) | diff --git 
a/docs/mcp_spec_06-2025.md b/docs/mcp_spec_06-2025.md deleted file mode 100644 index 304e51bb..00000000 --- a/docs/mcp_spec_06-2025.md +++ /dev/null @@ -1,274 +0,0 @@ -# Complete Model Context Protocol (MCP) Specification - -## Executive Summary and Latest Version - -The Model Context Protocol (MCP) is an **open protocol enabling seamless integration between LLM applications and external data sources/tools**, functioning as a standardized "USB-C port for AI applications." The **latest version 2025-06-18** introduces structured tool output, enhanced security with OAuth 2.1 Resource Server classification, elicitation support for user input requests, and removes JSON-RPC batching support. MCP eliminates the N×M integration problem by providing a single protocol that all AI applications can use to connect with any MCP-compliant server. - -## Core Architecture and Protocol Foundation - -MCP implements a **client-server architecture using JSON-RPC 2.0** messaging over two conceptual layers. The **transport layer** handles communication channels through stdio (recommended), Streamable HTTP, or custom implementations, while the **data layer** manages JSON-RPC message exchange, lifecycle management, and core primitives. Hosts (LLM applications) initiate connections to servers (services) through managed clients, establishing a clear separation of concerns where AI applications focus on intelligence while delegating data retrieval and tool execution to specialized servers. - -### Protocol Specification and Message Formats - -All MCP messages **must follow JSON-RPC 2.0 specification** with UTF-8 encoding. The protocol defines three message types: **requests** (bidirectional with unique IDs), **responses** (matching request IDs with either result or error), and **notifications** (one-way messages without IDs). 
Messages are structured as follows: - -```json -// Request Example -{ - "jsonrpc": "2.0", - "method": "tools/list", - "id": "unique-request-id" -} - -// Response Example -{ - "jsonrpc": "2.0", - "result": {...}, - "id": "unique-request-id" -} -``` - -The protocol mandates **no ID reuse within sessions**, **no batching support** (removed in 2025-06-18), and requires the `MCP-Protocol-Version` header for HTTP transport. Standard error codes range from parse errors (-32700) to internal errors (-32603), with custom application-specific codes permitted. - -## Transport Mechanisms - -### stdio Transport (Primary Recommendation) - -The **stdio transport** operates by launching the MCP server as a subprocess, with the server reading JSON-RPC messages from stdin and writing responses to stdout. Messages are **newline-delimited** without embedded newlines, while stderr remains available for logging. This transport provides process-level security through the execution context and represents the simplest, most secure implementation path. - -### Streamable HTTP Transport - -Replacing the legacy HTTP+SSE transport, **Streamable HTTP** provides a single endpoint supporting POST for client requests and optional GET for server-to-client streaming. Security requirements include **Origin header validation** to prevent DNS rebinding, localhost binding for local servers, and mandatory authentication implementation. Session management uses the `Mcp-Session-Id` header with backwards compatibility through fallback to SSE endpoints. 
- -## Server and Client Implementation Requirements - -### Initialization and Capability Negotiation - -Servers and clients **must implement** the initialization handshake exchanging protocol versions and capabilities: - -```typescript -interface InitializeRequest { - protocolVersion: string; // e.g., "2025-06-18" - capabilities: ClientCapabilities; - clientInfo: Implementation; -} - -interface ServerCapabilities { - tools?: { listChanged?: boolean }; - resources?: { subscribe?: boolean; listChanged?: boolean }; - prompts?: { listChanged?: boolean }; - logging?: {}; - experimental?: { [key: string]: any }; -} -``` - -The **capabilities object** declares supported features enabling dynamic feature discovery. Servers may include optional **instructions** for LLM guidance during initialization. - -## Resources, Tools, and Prompts - -### Resource System - -Resources provide **structured access to external data** through URIs with content negotiation: - -```typescript -interface Resource { - uri: string; // Unique resource identifier - name: string; // Programmatic identifier - title?: string; // Human-readable name - mimeType?: string; // Content type - size?: number; // Size in bytes -} -``` - -Resources support **listing** with pagination, **reading** with text or binary content, **subscriptions** for change monitoring, and **URI templates** (RFC 6570) for dynamic resource generation. - -### Tool Definitions and Structured Output - -Tools enable **server-side function execution** with JSON Schema validation: - -```typescript -interface Tool { - name: string; // URI-like identifier - inputSchema: object; // JSON Schema for inputs - outputSchema?: object; // NEW: Structured output schema - annotations?: ToolAnnotations; // Behavioral hints -} -``` - -The **2025-06-18 version introduces structured output** through `outputSchema`, enabling type-safe tool results alongside traditional unstructured content. 
Tool annotations provide behavioral hints (readOnlyHint, destructiveHint, idempotentHint) but **must not be trusted** for security decisions. - -### Prompt Templates - -Prompts offer **reusable conversation templates** with variable substitution: - -```typescript -interface Prompt { - name: string; // Unique identifier - arguments?: PromptArgument[]; // Template variables -} - -interface PromptMessage { - role: "user" | "assistant" | "system"; - content: TextContent | ImageContent | ResourceContent; -} -``` - -## Advanced Features - -### Sampling (Server → Client LLM Access) - -Servers can **request LLM generation** through the client using the sampling API: - -```typescript -interface CreateMessageRequest { - messages: SamplingMessage[]; - modelPreferences?: ModelPreferences; // Model selection hints - temperature?: number; - maxTokens: number; - includeContext?: "none" | "thisServer" | "allServers"; -} -``` - -Model preferences enable **intelligent model selection** based on cost, speed, and capability priorities with specific model hints. - -### Elicitation (NEW in 2025-06-18) - -The **elicitation feature** enables servers to request additional user input during interactions through structured data collection with JSON schemas, supporting dynamic workflows requiring human intervention. 
- -### Completion Support - -Argument completion provides **context-aware suggestions** for prompt and resource arguments: - -```typescript -interface CompleteRequest { - ref: { type: "ref/prompt" | "ref/resource"; name?: string }; - argument: { name: string; value: string }; - context?: { arguments?: { [key: string]: string } }; -} -``` - -## Security Architecture - -### OAuth 2.1 Resource Server Model - -MCP servers are **classified as OAuth 2.1 Resource Servers** requiring: - -- **PKCE implementation** for authorization code protection -- **Resource Indicators (RFC 8707)** preventing unauthorized token acquisition -- **Token audience validation** ensuring tokens are issued specifically for the server -- **No token passthrough** to upstream services -- **HTTPS mandatory** for all authorization endpoints - -### Protected Resource Metadata (RFC 9728) - -Servers expose authorization server discovery through: - -```typescript -interface ProtectedResourceMetadata { - authorization_servers: string[]; // Authorization server URLs -} -``` - -### Transport Security Requirements - -**HTTP transports must validate Origin headers**, bind to localhost for local servers, implement proper authentication, and use Bearer tokens in Authorization headers (never in query strings). The protocol mandates **short-lived access tokens** with refresh token rotation for public clients. 
- -## API Endpoints and Operations - -### Core Protocol Endpoints - -**Lifecycle Management:** - -- `initialize` - Connection setup and capability exchange -- `ping` - Health check mechanism -- `notifications/initialized` - Client ready signal - -**Resource Operations:** - -- `resources/list` - Enumerate available resources -- `resources/read` - Retrieve resource content -- `resources/subscribe` - Monitor resource changes -- `resources/templates/list` - Get URI templates - -**Tool Operations:** - -- `tools/list` - Discover available tools -- `tools/call` - Execute tool with arguments - -**Prompt Operations:** - -- `prompts/list` - Get available prompts -- `prompts/get` - Retrieve prompt with substitution - -**Client Features:** - -- `sampling/createMessage` - Request LLM generation -- `elicitation/create` - Request user input -- `roots/list` - Request client root directories - -## Implementation Patterns and Code Examples - -### Basic Server Implementation (TypeScript) - -```typescript -import { Server } from "@modelcontextprotocol/sdk/server/index.js"; -import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; - -const server = new Server({ - name: "example-server", - version: "1.0.0", -}); - -server.registerTool( - { - name: "get_weather", - description: "Get weather for a city", - inputSchema: { - type: "object", - properties: { city: { type: "string" } }, - required: ["city"], - }, - }, - async (args) => ({ - content: [{ type: "text", text: `Weather for ${args.city}` }], - }), -); - -const transport = new StdioServerTransport(); -await server.connect(transport); -``` - -### Client Implementation Pattern - -```typescript -const client = new Client({ name: "example-client", version: "1.0.0" }); -const transport = new StdioClientTransport({ - command: "node", - args: ["server.js"], -}); - -await client.connect(transport); -const tools = await client.request({ method: "tools/list" }); -``` - -## SDK Ecosystem and Language Support - 
-**Official SDKs** are available for **TypeScript** (@modelcontextprotocol/sdk), **Python** (mcp with FastMCP framework), **C#** (Microsoft collaboration), **Go** (Google collaboration), **Ruby** (Shopify collaboration), and **Kotlin** (JetBrains collaboration). Each SDK provides full specification support with type safety, multiple transport implementations, and comprehensive error handling patterns. - -## Registry and Industry Adoption - -The **MCP Registry** (registry.modelcontextprotocol.io) launched in preview September 2025, providing official server discovery and distribution with OpenAPI specifications and sub-registry support. Major adopters include **Anthropic** (Claude Desktop), **OpenAI** (ChatGPT desktop, March 2025), **Microsoft** (Copilot integration), and development tools including Zed, Replit, Codeium, and Sourcegraph. - -## Latest Changes and Migration Guide - -### Breaking Changes in 2025-06-18 - -The latest version **removes JSON-RPC batching support** (PR #416), **requires Protocol Version headers** for HTTP transport (PR #548), and changes lifecycle operations from SHOULD to MUST requirements. Migration requires updating batched requests to individual calls, adding version headers to HTTP implementations, and ensuring lifecycle operation compliance. - -### New Capabilities - -**Structured tool output** enables type-safe results through outputSchema definitions. **OAuth Resource Server classification** strengthens security with protected resource metadata. **Resource Indicators requirement** prevents malicious token acquisition. **Elicitation support** enables dynamic user input requests. **Enhanced security documentation** provides comprehensive implementation guidance. - -## Conclusion and Technical Assessment - -The Model Context Protocol represents a **mature, production-ready specification** for AI-tool integration with robust security, flexible transport options, and comprehensive type safety. 
The protocol's **modular design** allows selective implementation while maintaining interoperability. With **industry-wide adoption** from major AI providers and development platforms, MCP establishes itself as the de facto standard for LLM-external system communication. The **active development cycle** with regular updates and growing SDK ecosystem ensures continued evolution aligned with emerging AI application requirements. diff --git a/docs/mcp_spec_11-2025.md b/docs/mcp_spec_11-2025.md new file mode 100644 index 00000000..c22cd65c --- /dev/null +++ b/docs/mcp_spec_11-2025.md @@ -0,0 +1,215 @@ +# Model Context Protocol (MCP) Specification Reference + +**Protocol Revision**: 2025-11-25 +**Source**: [modelcontextprotocol.io/specification/2025-11-25](https://modelcontextprotocol.io/specification/2025-11-25) +**Schema**: [schema.ts](https://github.com/modelcontextprotocol/specification/blob/main/schema/2025-11-25/schema.ts) + +## What Changed from 2025-06-18 + +The November 2025 release (MCP's 1-year anniversary) adds: + +- **Tasks**: New abstraction for tracking server work, queryable status, results available after creation +- **Enhanced Tool Calling**: Tool calling in sampling requests, server-side agent loops, parallel tool calls +- **Extensions**: Components outside core spec for scenario-specific additions +- **Icons**: Standardized visual identifiers for tools, resources, prompts, implementations +- **`title` field on tools**: Human-readable display name separate from `name` +- **`_meta` property**: Reserved metadata with reverse-DNS key naming +- **MCP-Protocol-Version header**: Required on all HTTP requests after initialization +- **Security Best Practices**: Expanded guidance (session hijacking, token passthrough, scope minimization) + +## Core Architecture + +MCP uses **client-host-server architecture** with JSON-RPC 2.0 messaging. 
+ +- **Hosts**: LLM applications that initiate connections, manage clients, enforce security +- **Clients**: Connectors within host, 1:1 relationship with servers, maintain session isolation +- **Servers**: Provide resources, tools, prompts; operate independently + +### Key Design Principles + +1. Servers should be easy to build (hosts handle orchestration) +2. Servers should be composable (focused functionality, combinable) +3. Servers cannot see the full conversation or other servers +4. Features added progressively via capability negotiation + +## Transports + +### stdio + +- Client launches server as subprocess +- Messages on stdin/stdout, newline-delimited, no embedded newlines +- stderr for logging (MAY write UTF-8 strings) +- Server MUST NOT write non-MCP content to stdout + +### Streamable HTTP (replaces HTTP+SSE from 2024-11-05) + +- Single MCP endpoint supporting POST and GET +- POST for client-to-server messages; GET for server-initiated SSE streams +- Session management via `MCP-Session-Id` header (session IDs must be cryptographically secure) +- `MCP-Protocol-Version` header MUST be included on all requests after init + +**Security Requirements**: + +- MUST validate `Origin` header (DNS rebinding prevention) +- If Origin present and invalid, MUST respond 403 Forbidden +- Local servers SHOULD bind only to localhost (127.0.0.1) +- SHOULD implement proper authentication + +**Resumability**: Servers MAY attach SSE event IDs; clients resume via GET with `Last-Event-ID` + +## Capability Negotiation + +During initialization, clients and servers declare supported features: + +**Server capabilities**: resources, tools, prompts (each with optional `listChanged`) +**Client capabilities**: sampling, roots, elicitation + +Both parties MUST respect declared capabilities throughout the session. + +## Server Features + +### Tools + +Tools are **model-controlled** functions for interacting with external systems.
+ +**Tool Definition**: + +- `name`: Unique identifier (1-128 chars, case-sensitive; ASCII letters and digits plus `_`, `-`, and `.`; no spaces) +- `title`: Optional human-readable display name (new in 2025-11-25) +- `description`: Human-readable description +- `inputSchema`: JSON Schema (2020-12 default) for parameters +- `outputSchema`: Optional JSON Schema for structured output validation +- `annotations`: Optional behavioral hints (see below) +- `icons`: Optional array of icon objects + +**Tool Annotations** (behavioral hints, NOT security guarantees): + +```json +{ + "annotations": { + "title": "Human-readable title", + "readOnlyHint": true, + "destructiveHint": false, + "idempotentHint": true, + "openWorldHint": true + } +} +``` + +- `readOnlyHint`: Tool does not modify its environment (default: false) +- `destructiveHint`: Tool may perform destructive updates (default: true) +- `idempotentHint`: Repeated calls with same args have no additional effect (default: false) +- `openWorldHint`: Tool interacts with external entities (default: true) + +**CRITICAL**: Clients MUST consider annotations untrusted unless from a trusted server. + +**Tool Results**: Can be unstructured (`content` array) or structured (`structuredContent` JSON). +Content types: text, image, audio, resource_link, embedded resource. +If `outputSchema` provided, servers MUST return conforming `structuredContent`. + +**Error Handling**: + +1. Protocol errors: JSON-RPC errors (-32602 for unknown tool, etc.) +2.
Tool execution errors: `isError: true` in result (actionable, model can self-correct) + +### Resources + +- Context and data for users/models +- Application-controlled (attached by client) +- URI-based with subscriptions for updates + +### Prompts + +- Pre-defined templates/instructions +- User-controlled (invoked by user choice) + +## Client Features + +- **Sampling**: Server-initiated LLM interactions (requires explicit user approval) +- **Roots**: Server-initiated queries about filesystem/URI boundaries +- **Elicitation**: Server requests additional user input with structured JSON schemas + +## Authorization (HTTP transports) + +Based on **OAuth 2.1** with: + +- **PKCE**: MUST use S256 code challenge method +- **Resource Indicators (RFC 8707)**: MUST include `resource` parameter in auth/token requests +- **Protected Resource Metadata (RFC 9728)**: Servers MUST implement for auth server discovery +- **Client ID Metadata Documents**: Preferred registration mechanism (HTTPS URLs as client IDs) +- **Token audience validation**: Servers MUST validate tokens were issued specifically for them +- **Token passthrough forbidden**: Servers MUST NOT forward received tokens to upstream APIs + +**Discovery flow**: Client gets 401 -> extracts resource_metadata from WWW-Authenticate -> fetches Protected Resource Metadata -> discovers authorization server -> proceeds with OAuth flow + +## Error Codes + +Standard JSON-RPC error codes: + +- `-32700`: Parse error +- `-32600`: Invalid request +- `-32601`: Method not found +- `-32602`: Invalid params +- `-32603`: Internal error +- `-32000` to `-32099`: Server errors (MCP-specific) +- `-32001`: MCP transport error + +## Security Considerations + +### Key Principles + +1. **User Consent**: Users must explicitly consent to all data access and operations +2. **Data Privacy**: Hosts must not transmit data without user consent +3. **Tool Safety**: Tools represent arbitrary code execution; annotations are untrusted +4. 
**LLM Sampling Controls**: Users must approve sampling requests, control prompts + +### Implementation Requirements + +**Servers MUST**: + +- Validate all tool inputs +- Implement proper access controls +- Rate limit tool invocations +- Sanitize tool outputs + +**Clients SHOULD**: + +- Prompt for user confirmation on sensitive operations +- Show tool inputs before calling server (prevent data exfiltration) +- Validate tool results before passing to LLM +- Implement timeouts for tool calls +- Log tool usage for audit purposes + +### Session Security + +- Session IDs MUST be cryptographically secure +- Clients MUST handle session IDs securely +- Sessions can be terminated by server at any time (404 response) +- Clients SHOULD send DELETE to terminate sessions they no longer need + +## Relevance to Inspector Assessment + +### What the Inspector Should Test + +| Spec Requirement | Inspector Module | Notes | +| ------------------------- | ------------------------- | ---------------------------------------------- | +| Tool input validation | SecurityAssessor | Injection payloads test server-side validation | +| Error response format | ErrorHandlingAssessor | JSON-RPC error codes, isError field | +| Tool annotations accuracy | ToolAnnotationAssessor | readOnlyHint vs actual behavior | +| JSON-RPC compliance | MCPSpecComplianceAssessor | Message format, capability negotiation | +| Tool functionality | FunctionalityAssessor | Tools work with valid inputs | +| Output schema conformance | MCPSpecComplianceAssessor | structuredContent matches outputSchema | + +### What the Spec Says About Security Testing + +- Annotations are **untrusted hints** - cannot rely on them for security decisions +- Tool safety is the **host's responsibility** (not the server's) +- Protocol-level security is about transport (Origin validation, token handling) +- Application-level security is about input validation and access controls +- The spec does NOT define vulnerability categories - that's 
implementation-specific + +### Transport-Aware Testing Implications + +- **stdio servers**: Full security testing (command injection, path traversal relevant) +- **HTTP servers**: Transport security matters (Origin, auth), but path traversal less relevant +- **Both**: Input validation, error handling, annotation accuracy always relevant diff --git a/package-lock.json b/package-lock.json index 6e379d8f..63c59f60 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@bryan-thompson/inspector-assessment", - "version": "1.7.1", + "version": "1.8.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@bryan-thompson/inspector-assessment", - "version": "1.7.1", + "version": "1.8.0", "license": "MIT", "workspaces": [ "client", @@ -51,7 +51,7 @@ }, "cli": { "name": "@bryan-thompson/inspector-assessment-cli", - "version": "1.7.1", + "version": "1.8.0", "license": "MIT", "dependencies": { "@bryan-thompson/inspector-assessment-client": "^1.5.0", @@ -76,7 +76,7 @@ }, "client": { "name": "@bryan-thompson/inspector-assessment-client", - "version": "1.7.1", + "version": "1.8.0", "license": "MIT", "dependencies": { "@modelcontextprotocol/sdk": "^1.23.0", @@ -10756,7 +10756,7 @@ }, "server": { "name": "@bryan-thompson/inspector-assessment-server", - "version": "1.7.1", + "version": "1.8.0", "license": "MIT", "dependencies": { "@modelcontextprotocol/sdk": "^1.23.0", diff --git a/package.json b/package.json index c964aec0..539d88f6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@bryan-thompson/inspector-assessment", - "version": "1.7.1", + "version": "1.8.0", "description": "Enhanced MCP Inspector with comprehensive assessment capabilities for server validation", "license": "MIT", "author": "Bryan Thompson ", diff --git a/scripts/run-security-assessment.ts b/scripts/run-security-assessment.ts index 7f1e9b12..1dcbe519 100755 --- a/scripts/run-security-assessment.ts +++ b/scripts/run-security-assessment.ts @@ 
-230,7 +230,7 @@ async function runSecurityAssessment( // Create assessment context const config: AssessmentConfiguration = { ...DEFAULT_ASSESSMENT_CONFIG, - securityPatternsToTest: 17, // All 17 attack patterns + securityPatternsToTest: 13, // All 13 attack patterns reviewerMode: false, testTimeout: 30000, }; @@ -243,7 +243,7 @@ async function runSecurityAssessment( }; // Run security assessment - console.log(`🛡️ Running security assessment with 17 attack patterns...`); + console.log(`🛡️ Running security assessment with 13 attack patterns...`); const assessor = new SecurityAssessor(config); const results = await assessor.assess(context); diff --git a/server/package.json b/server/package.json index f6dfc30e..4d0b41dc 100644 --- a/server/package.json +++ b/server/package.json @@ -1,6 +1,6 @@ { "name": "@bryan-thompson/inspector-assessment-server", - "version": "1.7.1", + "version": "1.8.0", "description": "Server-side application for the Enhanced MCP Inspector with assessment capabilities", "license": "MIT", "author": "Bryan Thompson ",