diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..c0a5902 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,151 @@ +[run] +source = src +omit = + */tests/* + */venv/* + */env/* + */site-packages/* + */dist-packages/* + */docs/* + */examples/* + */setup.py + +[report] +exclude_lines = + pragma: no cover + def __repr__ + raise NotImplementedError + if __name__ == .__main__.: + pass + raise ImportError + except ImportError + raise AssertionError + except AssertionError + raise ValueError + except ValueError + raise TypeError + except TypeError + raise KeyError + except KeyError + raise IndexError + except IndexError + raise AttributeError + except AttributeError + raise NotImplementedError + except NotImplementedError + raise Exception + except Exception + raise SystemExit + except SystemExit + raise StopIteration + except StopIteration + def __str__ + def __unicode__ + def __repr__ + def __dir__ + def __format__ + def __hash__ + def __eq__ + def __ne__ + def __lt__ + def __le__ + def __gt__ + def __ge__ + def __iter__ + def __next__ + def __enter__ + def __exit__ + def __call__ + def __getitem__ + def __setitem__ + def __delitem__ + def __contains__ + def __len__ + def __bool__ + def __nonzero__ + def __getattr__ + def __setattr__ + def __delattr__ + def __getattribute__ + def __get__ + def __set__ + def __delete__ + def __new__ + def __init__ + def __del__ + def __reduce__ + def __reduce_ex__ + def __getnewargs__ + def __getinitargs__ + def __getstate__ + def __setstate__ + def __copy__ + def __deepcopy__ + def __sizeof__ + def __instancecheck__ + def __subclasscheck__ + def __subclasshook__ + def __missing__ + def __index__ + def __coerce__ + def __bytes__ + def __complex__ + def __int__ + def __float__ + def __round__ + def __trunc__ + def __floor__ + def __ceil__ + def __pos__ + def __neg__ + def __abs__ + def __invert__ + def __add__ + def __sub__ + def __mul__ + def __truediv__ + def __floordiv__ + def __mod__ + def __divmod__ + def __pow__ + 
def __lshift__ + def __rshift__ + def __and__ + def __xor__ + def __or__ + def __radd__ + def __rsub__ + def __rmul__ + def __rtruediv__ + def __rfloordiv__ + def __rmod__ + def __rdivmod__ + def __rpow__ + def __rlshift__ + def __rrshift__ + def __rand__ + def __rxor__ + def __ror__ + def __iadd__ + def __isub__ + def __imul__ + def __itruediv__ + def __ifloordiv__ + def __imod__ + def __ipow__ + def __ilshift__ + def __irshift__ + def __iand__ + def __ixor__ + def __ior__ + def __matmul__ + def __rmatmul__ + def __imatmul__ + def __await__ + def __aiter__ + def __anext__ + def __aenter__ + def __aexit__ + +[html] +directory = coverage_html_report diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..e941ee8 --- /dev/null +++ b/.env.template @@ -0,0 +1,68 @@ +# VeriFact Environment Configuration +# ================================= + +# ===== MODEL ACCESS AND SELECTION ===== + +# OpenRouter API Key for model access [REQUIRED] +OPENROUTER_API_KEY=your-openrouter-api-key-here + +# OpenAI API Key is not needed as we're using OpenRouter +# OPENAI_API_KEY= + +# Default model for general processing and fallback +DEFAULT_MODEL=gpt-4o +# Alternatives: +# DEFAULT_MODEL=qwen/qwen3-8b:free # Better for multilingual, Apache-2 license +# DEFAULT_MODEL=microsoft/phi-4-reasoning:free # Lightweight, MIT licensed + +# Claim Detector model - identifies factual claims from text +CLAIM_DETECTOR_MODEL=gpt-4o-mini # Fast and affordable, for focused tasks + +# Evidence Hunter model - gathers and evaluates evidence +EVIDENCE_HUNTER_MODEL=gpt-4o-mini + +# Verdict Writer model - analyzes evidence and generates verdicts +VERDICT_WRITER_MODEL=gpt-4o-mini # Reasoning model, fast, and cheap + +# Model Parameters +MODEL_TEMPERATURE=0.1 # Lower values: more deterministic +MODEL_MAX_TOKENS=1000 # Maximum response length +MODEL_REQUEST_TIMEOUT=120 # Timeout in seconds + +# ===== SEARCH CONFIGURATION ===== + +# Search Configuration +USE_SERPER=false +SERPER_API_KEY= # Only 
needed if USE_SERPER=true + +# ===== APPLICATION CONFIGURATION ===== + +# API Configuration +HOST=0.0.0.0 # Listen on all interfaces +PORT=8000 +API_KEY_ENABLED=true # Enable API key authentication +API_KEY_HEADER_NAME=X-API-Key # Header name for API keys +DEFAULT_API_KEY=verifact-default-key # Default API key +RATE_LIMIT_ENABLED=true # Enable rate limiting +RATE_LIMIT_REQUESTS=100 # Number of requests per window +RATE_LIMIT_WINDOW=3600 # Rate limit window in seconds + +# Chainlit UI Configuration +CHAINLIT_HOST=0.0.0.0 # Listen on all interfaces +CHAINLIT_PORT=8501 +CHAINLIT_AUTH_ENABLED=false # Set to true to enable authentication +CHAINLIT_AUTH_SECRET= # Required if auth is enabled +CHAINLIT_PERSIST=true # Persist chats in the database + +# ===== ADVANCED CONFIGURATION ===== + +# Embedding Configuration +EMBEDDING_MODEL=text-embedding-3-small # Model for generating embeddings +ENABLE_MODEL_CACHING=true # Cache model responses +MODEL_CACHE_SIZE=1000 # Number of responses to cache + +# Logging Configuration +ENVIRONMENT=development +LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR, CRITICAL +LOG_FORMAT=plain +# LOG_FILE=/path/to/log/file.log # Uncomment to enable file logging diff --git a/=0.0.15 b/=0.0.15 new file mode 100644 index 0000000..5150dcc --- /dev/null +++ b/=0.0.15 @@ -0,0 +1,33 @@ +Requirement already satisfied: pytest in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (8.3.5) +Requirement already satisfied: pytest-cov in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (6.1.1) +Requirement already satisfied: pytest-asyncio in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (0.26.0) +Requirement already satisfied: python-dotenv in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (1.1.0) +Requirement already satisfied: pydantic in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (2.11.4) +Requirement already satisfied: openai in 
/Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (1.82.0) +Requirement already satisfied: openai-agents in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (0.0.16) +Requirement already satisfied: packaging in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pytest) (24.2) +Requirement already satisfied: pluggy<2,>=1.5 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pytest) (1.6.0) +Requirement already satisfied: iniconfig in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pytest) (2.1.0) +Requirement already satisfied: tomli>=1 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pytest) (2.2.1) +Requirement already satisfied: exceptiongroup>=1.0.0rc8 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pytest) (1.2.2) +Requirement already satisfied: coverage[toml]>=7.5 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pytest-cov) (7.8.1) +Requirement already satisfied: typing-extensions>=4.12 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pytest-asyncio) (4.13.1) +Requirement already satisfied: pydantic-core==2.33.2 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pydantic) (2.33.2) +Requirement already satisfied: annotated-types>=0.6.0 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pydantic) (0.7.0) +Requirement already satisfied: typing-inspection>=0.4.0 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from pydantic) (0.4.1) +Requirement already satisfied: sniffio in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai) (1.3.1) +Requirement already satisfied: jiter<1,>=0.4.0 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai) (0.10.0) +Requirement already satisfied: distro<2,>=1.7.0 in 
/Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai) (1.9.0) +Requirement already satisfied: anyio<5,>=3.5.0 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai) (4.9.0) +Requirement already satisfied: tqdm>4 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai) (4.67.1) +Requirement already satisfied: httpx<1,>=0.23.0 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai) (0.28.1) +Requirement already satisfied: requests<3,>=2.0 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai-agents) (2.32.3) +Requirement already satisfied: types-requests<3,>=2.0 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai-agents) (2.32.0.20250515) +Requirement already satisfied: griffe<2,>=1.5.6 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from openai-agents) (1.7.3) +Requirement already satisfied: idna>=2.8 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from anyio<5,>=3.5.0->openai) (3.10) +Requirement already satisfied: colorama>=0.4 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from griffe<2,>=1.5.6->openai-agents) (0.4.6) +Requirement already satisfied: certifi in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from httpx<1,>=0.23.0->openai) (2025.1.31) +Requirement already satisfied: httpcore==1.* in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from httpx<1,>=0.23.0->openai) (1.0.7) +Requirement already satisfied: h11<0.15,>=0.13 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0) +Requirement already satisfied: charset-normalizer<4,>=2 in /Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from requests<3,>=2.0->openai-agents) (3.4.1) +Requirement already satisfied: urllib3<3,>=1.21.1 in 
/Users/stevenhinojosa/Library/Python/3.9/lib/python/site-packages (from requests<3,>=2.0->openai-agents) (2.3.0) diff --git a/run_tests_with_coverage.sh b/run_tests_with_coverage.sh new file mode 100755 index 0000000..605f2cb --- /dev/null +++ b/run_tests_with_coverage.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Activate virtual environment if it exists +if [ -d "venv" ]; then + source venv/bin/activate +elif [ -d "env" ]; then + source env/bin/activate +fi + +# Find Python executable +if command -v python3 &> /dev/null; then + PYTHON=python3 +elif command -v python &> /dev/null; then + PYTHON=python +else + echo "Python not found. Please install Python 3." + exit 1 +fi + +# Install required packages (requirement specifier must be quoted so the shell +# does not treat ">=0.0.15" as an output redirection to a file named "=0.0.15") +$PYTHON -m pip install --user pytest pytest-cov pytest-asyncio python-dotenv pydantic openai "openai-agents>=0.0.15" + +# Run tests with coverage +$PYTHON -m pytest src/tests/ --cov=src --cov-report=term --cov-report=html -v + +# Print coverage report +echo "Coverage report generated in coverage_html_report/" +echo "Open coverage_html_report/index.html in a browser to view the report" + +# Check if coverage is at least 80% +COVERAGE=$($PYTHON -m coverage report | grep TOTAL | awk '{print $4}' | sed 's/%//') +if [ -z "$COVERAGE" ]; then + echo "Could not determine coverage percentage." 
+ exit 1 +elif (( $(echo "$COVERAGE < 80" | bc -l 2>/dev/null) )); then + echo "Coverage is below 80% (${COVERAGE}%)" + exit 1 +else + echo "Coverage is at or above 80% (${COVERAGE}%)" + exit 0 +fi diff --git a/src/api/factcheck.py b/src/api/factcheck.py index cc906c0..edef85a 100644 --- a/src/api/factcheck.py +++ b/src/api/factcheck.py @@ -1,41 +1,76 @@ from fastapi import APIRouter from datetime import datetime import time +import asyncio from models.factcheck import ( FactCheckRequest, FactCheckResponse, Claim, Source ) +from src.verifact_manager_openrouter import VerifactManager router = APIRouter(prefix="/api/v1") @router.post("/factcheck", response_model=FactCheckResponse) async def factcheck(request: FactCheckRequest): start_time = time.time() - - # TODO: Implement actual fact-checking logic here - # This is a placeholder response - response = FactCheckResponse( - claims=[ - Claim( - text="Example claim", - verdict="Mostly True", - confidence=0.89, - explanation="This is a detailed explanation with evidence", - sources=[ + + # Use our OpenRouter-based VerifactManager + manager = VerifactManager() + try: + verdicts = await manager.run(request.text) + + # Convert verdicts to the API response format + claims = [] + for verdict in verdicts: + sources_list = [] + for source_url in verdict.sources: + sources_list.append( Source( - url="source1.com", - credibility=0.95, - quote="Example quote from source" + url=source_url, + credibility=0.9, # Default credibility + quote="Evidence from source" # Default quote ) - ] + ) + + claims.append( + Claim( + text=verdict.claim, + verdict=verdict.verdict, + confidence=verdict.confidence, + explanation=verdict.explanation, + sources=sources_list + ) ) - ], - metadata={ - "processing_time": f"{time.time() - start_time:.1f}s", - "model_version": "1.0.4" - } - ) - - return response \ No newline at end of file + + response = FactCheckResponse( + claims=claims, + metadata={ + "processing_time": f"{time.time() - start_time:.1f}s", + 
"model_version": "1.0.5" + } + ) + except Exception as e: + # Fallback to placeholder response in case of errors + response = FactCheckResponse( + claims=[ + Claim( + text="Error processing request", + verdict="Unverifiable", + confidence=0.0, + explanation=f"Error: {str(e)}", + sources=[] + ) + ], + metadata={ + "processing_time": f"{time.time() - start_time:.1f}s", + "model_version": "1.0.5", + "error": str(e) + } + ) + finally: + # Close the manager's HTTP client + await manager.close() + + return response \ No newline at end of file diff --git a/src/tests/README.md b/src/tests/README.md new file mode 100644 index 0000000..c8bd128 --- /dev/null +++ b/src/tests/README.md @@ -0,0 +1,132 @@ +# VeriFact Tests + +This directory contains tests for the VeriFact project. + +## Test Structure + +- `fixtures/`: Test fixtures with sample data +- `integration/`: Integration tests for the full pipeline +- `unit/`: Unit tests for individual components +- `conftest.py`: Pytest configuration +- `test_*.py`: Test files + +## Running Tests + +### Running All Tests + +```bash +pytest +``` + +### Running Unit Tests Only + +```bash +pytest src/tests/test_*.py +``` + +### Running Integration Tests Only + +```bash +pytest src/tests/integration/ +``` + +### Running End-to-End Tests + +End-to-end tests make real API calls and require API keys to be set in your environment. + +```bash +# Run only end-to-end tests +pytest -m e2e + +# Run all tests except end-to-end tests +pytest -m "not e2e" +``` + +### Running Tests with Coverage + +To run tests with coverage reporting and ensure at least 80% coverage: + +```bash +# Run the coverage script +./run_tests_with_coverage.sh + +# Or manually with pytest +pytest --cov=src --cov-report=term --cov-report=html +``` + +The coverage report will be generated in the `coverage_html_report/` directory. Open `coverage_html_report/index.html` in a browser to view the detailed report. 
+ +## Test Categories + +### Unit Tests + +Unit tests focus on testing individual components in isolation. They use mocks to avoid external dependencies. + +### Integration Tests + +Integration tests verify that different components work together correctly. They still use mocks for external services but test the interactions between internal components. + +### End-to-End Tests + +End-to-end tests verify the entire system works correctly with real external services. These tests make actual API calls and require API keys to be set. + +## Test Fixtures + +The `fixtures/` directory contains sample data for testing: + +- `claims.py`: Sample factual claims +- `evidence.py`: Sample evidence for claims +- `verdicts.py`: Sample verdicts for claims + +See the [fixtures README](fixtures/README.md) for more details. + +## Writing New Tests + +### Unit Tests + +Place unit tests in the root of the `tests/` directory with filenames starting with `test_`. + +```python +# src/tests/test_new_component.py +import pytest +from src.new_component import NewComponent + +def test_new_component_functionality(): + component = NewComponent() + result = component.do_something() + assert result == expected_result +``` + +### Integration Tests + +Place integration tests in the `integration/` directory with filenames starting with `test_`. + +```python +# src/tests/integration/test_new_integration.py +import pytest +from src.component_a import ComponentA +from src.component_b import ComponentB + +@pytest.mark.integration +def test_components_work_together(): + component_a = ComponentA() + component_b = ComponentB(component_a) + result = component_b.process_with_a("input") + assert result == expected_result +``` + +### End-to-End Tests + +Place end-to-end tests in the `integration/` directory with filenames starting with `test_` and mark them with `@pytest.mark.e2e`. 
+ +```python +# src/tests/integration/test_new_e2e.py +import pytest +from src.main import Application + +@pytest.mark.e2e +def test_full_application(): + app = Application() + result = app.process_real_data() + assert result.status == "success" +``` diff --git a/src/tests/conftest.py b/src/tests/conftest.py new file mode 100644 index 0000000..a168e09 --- /dev/null +++ b/src/tests/conftest.py @@ -0,0 +1,40 @@ +"""Pytest configuration for VeriFact tests.""" + +import os +import pytest +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Define custom markers +def pytest_configure(config): + """Configure pytest with custom markers.""" + config.addinivalue_line("markers", "e2e: mark test as an end-to-end test that makes real API calls") + config.addinivalue_line("markers", "integration: mark test as an integration test") + config.addinivalue_line("markers", "unit: mark test as a unit test") + + +@pytest.fixture(scope="session", autouse=True) +def setup_logging(): + """Set up logging for tests.""" + import logging + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + # Reduce noise from third-party libraries + logging.getLogger("httpx").setLevel(logging.WARNING) + logging.getLogger("httpcore").setLevel(logging.WARNING) + + +@pytest.fixture +def mock_openai_key(monkeypatch): + """Mock the OPENAI_API_KEY environment variable.""" + monkeypatch.setenv("OPENAI_API_KEY", "mock-api-key-for-testing") + + +@pytest.fixture +def mock_search_key(monkeypatch): + """Mock the SEARCH_API_KEY environment variable.""" + monkeypatch.setenv("SEARCH_API_KEY", "mock-search-api-key-for-testing") diff --git a/src/tests/fixtures/README.md b/src/tests/fixtures/README.md new file mode 100644 index 0000000..7c3b3eb --- /dev/null +++ b/src/tests/fixtures/README.md @@ -0,0 +1,102 @@ +# VeriFact Test Fixtures + +This directory contains test fixtures for the VeriFact project, providing sample 
data for claims, evidence, and verdicts to use in tests. + +## Available Fixtures + +### Claims (`claims.py`) + +Contains sample factual claims organized by domain: + +- `POLITICAL_CLAIMS`: Claims related to politics and international relations +- `HEALTH_CLAIMS`: Claims related to health and medicine +- `SCIENCE_CLAIMS`: Claims related to science and technology +- `ECONOMIC_CLAIMS`: Claims related to economics and finance +- `ALL_CLAIMS`: Combined list of all claims +- `SAMPLE_TEXTS`: Sample text passages containing multiple claims for testing claim detection + +### Evidence (`evidence.py`) + +Contains sample evidence for claims, organized by topic: + +- `POLITICAL_EVIDENCE`: Evidence for political claims +- `HEALTH_EVIDENCE`: Evidence for health claims +- `SCIENCE_EVIDENCE`: Evidence for science claims +- `ECONOMIC_EVIDENCE`: Evidence for economic claims +- `ALL_EVIDENCE`: Combined dictionary of all evidence + +Each evidence collection is a dictionary where keys are topics and values are lists of `Evidence` objects. + +### Verdicts (`verdicts.py`) + +Contains sample verdicts for claims, organized by domain: + +- `POLITICAL_VERDICTS`: Verdicts for political claims +- `HEALTH_VERDICTS`: Verdicts for health claims +- `SCIENCE_VERDICTS`: Verdicts for science claims +- `ECONOMIC_VERDICTS`: Verdicts for economic claims +- `ALL_VERDICTS`: Combined list of all verdicts + +## Usage Examples + +### Using in Unit Tests + +```python +from src.tests.fixtures.claims import POLITICAL_CLAIMS +from src.tests.fixtures.evidence import POLITICAL_EVIDENCE +from src.tests.fixtures.verdicts import POLITICAL_VERDICTS + +def test_claim_processing(): + # Use a sample claim from fixtures + sample_claim = POLITICAL_CLAIMS[0] + assert sample_claim.text == "The United States has the largest military budget in the world." 
+``` + +### Using with Mock Agents + +```python +from unittest.mock import AsyncMock +from src.tests.fixtures.claims import POLITICAL_CLAIMS +from src.tests.fixtures.evidence import POLITICAL_EVIDENCE + +class MockClaimDetector: + def __init__(self, claims_to_return): + self.claims_to_return = claims_to_return + self.detect_claims = AsyncMock(return_value=claims_to_return) + +# Create a mock claim detector that returns sample claims +mock_detector = MockClaimDetector(claims_to_return=POLITICAL_CLAIMS) +``` + +### Using in Integration Tests + +```python +import pytest +from src.tests.fixtures.claims import SAMPLE_TEXTS +from src.verifact_manager import VerifactManager + +@pytest.mark.asyncio +async def test_end_to_end_factchecking(): + # Create a manager instance + manager = VerifactManager() + + # Process a sample text + sample_text = SAMPLE_TEXTS[0] + results = await manager.process_text(sample_text) + + # Verify results + assert len(results) > 0 +``` + +## Extending the Fixtures + +To add new fixtures: + +1. Add new claims to the appropriate list in `claims.py` +2. Add corresponding evidence in `evidence.py` +3. Add corresponding verdicts in `verdicts.py` +4. 
Run the fixture tests to ensure everything is working correctly: + +```bash +pytest src/tests/test_fixtures.py -v +``` diff --git a/src/tests/fixtures/__init__.py b/src/tests/fixtures/__init__.py new file mode 100644 index 0000000..1e814ad --- /dev/null +++ b/src/tests/fixtures/__init__.py @@ -0,0 +1 @@ +"""Test fixtures for VeriFact.""" diff --git a/src/tests/fixtures/claims.py b/src/tests/fixtures/claims.py new file mode 100644 index 0000000..3176d40 --- /dev/null +++ b/src/tests/fixtures/claims.py @@ -0,0 +1,125 @@ +"""Sample claims for testing.""" + +from src.verifact_agents.claim_detector import Claim + +# Political claims +POLITICAL_CLAIMS = [ + Claim( + text="The United States has the largest military budget in the world.", + context=0.9, + ), + Claim( + text="The European Union has 27 member states.", + context=0.8, + ), + Claim( + text="The United Nations was founded in 1945.", + context=0.85, + ), + Claim( + text="The Paris Climate Agreement was signed in 2016.", + context=0.75, + ), + Claim( + text="China is the world's largest emitter of carbon dioxide.", + context=0.8, + ), +] + +# Health claims +HEALTH_CLAIMS = [ + Claim( + text="Vaccines cause autism.", + context=0.95, + ), + Claim( + text="Drinking eight glasses of water a day is necessary for good health.", + context=0.7, + ), + Claim( + text="Regular exercise reduces the risk of heart disease.", + context=0.85, + ), + Claim( + text="Vitamin C prevents the common cold.", + context=0.75, + ), + Claim( + text="Eating carrots improves night vision.", + context=0.65, + ), +] + +# Science claims +SCIENCE_CLAIMS = [ + Claim( + text="The Earth is flat.", + context=0.95, + ), + Claim( + text="Humans only use 10% of their brains.", + context=0.8, + ), + Claim( + text="The Great Wall of China is visible from space with the naked eye.", + context=0.75, + ), + Claim( + text="Lightning never strikes the same place twice.", + context=0.7, + ), + Claim( + text="The speed of light is approximately 300,000 
kilometers per second.", + context=0.9, + ), +] + +# Economic claims +ECONOMIC_CLAIMS = [ + Claim( + text="The United States has the largest economy in the world.", + context=0.85, + ), + Claim( + text="Bitcoin was the first cryptocurrency.", + context=0.8, + ), + Claim( + text="The Federal Reserve was established in 1913.", + context=0.75, + ), + Claim( + text="The Great Depression began with the stock market crash of 1929.", + context=0.9, + ), + Claim( + text="Amazon is the world's largest online retailer.", + context=0.8, + ), +] + +# All claims combined +ALL_CLAIMS = POLITICAL_CLAIMS + HEALTH_CLAIMS + SCIENCE_CLAIMS + ECONOMIC_CLAIMS + +# Sample claim texts for testing claim detection +SAMPLE_TEXTS = [ + """ + The United States has the largest military budget in the world, spending over $800 billion annually. + Meanwhile, China is the world's largest emitter of carbon dioxide, producing about 30% of global emissions. + """, + + """ + Many people believe that vaccines cause autism, despite numerous scientific studies disproving this claim. + It's also commonly stated that humans only use 10% of their brains, which is a widespread misconception. + """, + + """ + The Great Depression began with the stock market crash of 1929, leading to widespread economic hardship. + During this time, the Federal Reserve, which was established in 1913, failed to prevent the collapse of the banking system. + """, + + """ + The Paris Climate Agreement was signed in 2016 with the goal of limiting global warming. + The United Nations, founded in 1945 after World War II, has been instrumental in coordinating international climate action. 
+ """, +] diff --git a/src/tests/fixtures/evidence.py b/src/tests/fixtures/evidence.py new file mode 100644 index 0000000..7ac0c24 --- /dev/null +++ b/src/tests/fixtures/evidence.py @@ -0,0 +1,171 @@ +"""Sample evidence for testing.""" + +from src.verifact_agents.evidence_hunter import Evidence + +# Evidence for political claims +POLITICAL_EVIDENCE = { + "US military budget": [ + Evidence( + content="According to the Stockholm International Peace Research Institute (SIPRI), the United States had a military budget of $877 billion in 2022, making it the largest in the world.", + source="https://www.sipri.org/research/armament-and-disarmament/arms-and-military-expenditure/military-expenditure", + relevance=0.95, + stance="supporting", + ), + Evidence( + content="The U.S. military budget in 2022 was $877 billion, which is more than the next nine countries combined.", + source="https://www.pgpf.org/chart-archive/0053_defense-comparison", + relevance=0.9, + stance="supporting", + ), + Evidence( + content="China had the second-largest military budget at $292 billion in 2022, still significantly less than the United States.", + source="https://www.sipri.org/sites/default/files/2023-04/2304_fs_milex_2022.pdf", + relevance=0.8, + stance="supporting", + ), + ], + "EU member states": [ + Evidence( + content="The European Union consists of 27 member states: Austria, Belgium, Bulgaria, Croatia, Cyprus, Czechia, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Ireland, Italy, Latvia, Lithuania, Luxembourg, Malta, Netherlands, Poland, Portugal, Romania, Slovakia, Slovenia, Spain and Sweden.", + source="https://european-union.europa.eu/principles-countries-history/country-profiles_en", + relevance=0.95, + stance="supporting", + ), + Evidence( + content="Following Brexit, which was completed on January 31, 2020, the European Union now has 27 member states.", + source="https://www.consilium.europa.eu/en/policies/eu-uk-after-referendum/", + relevance=0.9, + 
stance="supporting", + ), + ], + "UN founding": [ + Evidence( + content="The United Nations officially came into existence on 24 October 1945, when the Charter had been ratified by China, France, the Soviet Union, the United Kingdom, the United States and by a majority of other signatories.", + source="https://www.un.org/en/about-us/history-of-the-un", + relevance=0.95, + stance="supporting", + ), + Evidence( + content="The United Nations was established after World War II with the aim of preventing future wars, succeeding the ineffective League of Nations.", + source="https://www.britannica.com/topic/United-Nations", + relevance=0.85, + stance="supporting", + ), + ], +} + +# Evidence for health claims +HEALTH_EVIDENCE = { + "Vaccines and autism": [ + Evidence( + content="The scientific consensus is that there is no causal link between vaccines and autism. This has been confirmed by numerous large-scale studies involving millions of children.", + source="https://www.cdc.gov/vaccinesafety/concerns/autism.html", + relevance=0.95, + stance="contradicting", + ), + Evidence( + content="A comprehensive review published in the journal Vaccine in 2014 examined 10 studies involving more than 1.2 million children and found no link between vaccines and autism.", + source="https://pubmed.ncbi.nlm.nih.gov/24814559/", + relevance=0.9, + stance="contradicting", + ), + Evidence( + content="The original 1998 study by Andrew Wakefield that suggested a link between the MMR vaccine and autism was retracted by the journal The Lancet due to serious procedural and ethical flaws.", + source="https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(10)60175-4/fulltext", + relevance=0.85, + stance="contradicting", + ), + ], + "Eight glasses of water": [ + Evidence( + content="There is no scientific evidence supporting the claim that everyone needs exactly eight glasses of water per day. 
Water needs vary based on many factors including activity level, climate, and overall health.", + source="https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/water/art-20044256", + relevance=0.9, + stance="contradicting", + ), + Evidence( + content="The Institute of Medicine recommends approximately 3.7 liters (125 ounces) of total water daily for men and 2.7 liters (91 ounces) for women. This includes water from all beverages and foods, not just plain water.", + source="https://www.nationalacademies.org/news/2004/02/report-sets-dietary-intake-levels-for-water-salt-and-potassium-to-maintain-health-and-reduce-chronic-disease-risk", + relevance=0.85, + stance="neutral", + ), + ], +} + +# Evidence for science claims +SCIENCE_EVIDENCE = { + "Flat Earth": [ + Evidence( + content="The Earth is an oblate spheroid, slightly flattened at the poles and bulging at the equator. This has been confirmed by countless observations, measurements, and photographs from space.", + source="https://www.nasa.gov/image-article/blue-marble-image-earth-from-apollo-17/", + relevance=0.95, + stance="contradicting", + ), + Evidence( + content="The ancient Greeks established that the Earth was spherical as early as the 3rd century BCE. 
Eratosthenes even calculated its circumference with remarkable accuracy using shadows and the angle of the sun.", + source="https://www.aps.org/publications/apsnews/200606/history.cfm", + relevance=0.85, + stance="contradicting", + ), + Evidence( + content="Modern evidence for Earth's spherical shape includes ship disappearance over the horizon, time zone differences, the circular shadow during lunar eclipses, and direct observation from space.", + source="https://www.scientificamerican.com/article/earth-is-not-flat-heres-how-to-prove-it-to-flat-earthers/", + relevance=0.9, + stance="contradicting", + ), + ], + "10% of brain": [ + Evidence( + content="The myth that humans only use 10% of their brains is not supported by neuroscience. Brain imaging techniques show that all parts of the brain have active functions, even during sleep.", + source="https://www.scientificamerican.com/article/do-people-only-use-10-percent-of-their-brains/", + relevance=0.95, + stance="contradicting", + ), + Evidence( + content="While not all neurons fire simultaneously (which would cause a seizure), brain scans show that most regions of the brain are active during even simple tasks, and virtually all areas are active over the course of a day.", + source="https://www.bbc.com/future/article/20121112-do-we-only-use-10-of-our-brains", + relevance=0.9, + stance="contradicting", + ), + ], +} + +# Evidence for economic claims +ECONOMIC_EVIDENCE = { + "US largest economy": [ + Evidence( + content="As of 2023, the United States has the largest economy in the world with a GDP of approximately $26.95 trillion, followed by China with approximately $17.7 trillion.", + source="https://www.imf.org/en/Publications/WEO/weo-database/2023/October", + relevance=0.95, + stance="supporting", + ), + Evidence( + content="When measured by purchasing power parity (PPP), China has the largest economy in the world, surpassing the United States in 2017.", + 
source="https://www.worldbank.org/en/research/brief/global-economic-prospects", + relevance=0.9, + stance="contradicting", + ), + ], + "Bitcoin first cryptocurrency": [ + Evidence( + content="Bitcoin was created in 2009 by an unknown person or group using the pseudonym Satoshi Nakamoto and was the first decentralized cryptocurrency.", + source="https://bitcoin.org/bitcoin.pdf", + relevance=0.95, + stance="supporting", + ), + Evidence( + content="While Bitcoin was the first successful and widely adopted cryptocurrency, there were earlier attempts at digital currencies such as DigiCash (1989) and B-Money (1998), though they never gained widespread use.", + source="https://www.investopedia.com/tech/were-there-cryptocurrencies-bitcoin/", + relevance=0.85, + stance="neutral", + ), + ], +} + +# Combined evidence collections +ALL_EVIDENCE = { + **POLITICAL_EVIDENCE, + **HEALTH_EVIDENCE, + **SCIENCE_EVIDENCE, + **ECONOMIC_EVIDENCE, +} diff --git a/src/tests/fixtures/verdicts.py b/src/tests/fixtures/verdicts.py new file mode 100644 index 0000000..098ccaa --- /dev/null +++ b/src/tests/fixtures/verdicts.py @@ -0,0 +1,126 @@ +"""Sample verdicts for testing.""" + +from src.verifact_agents.verdict_writer import Verdict + +# Verdicts for political claims +POLITICAL_VERDICTS = [ + Verdict( + claim="The United States has the largest military budget in the world.", + verdict="true", + confidence=0.95, + explanation="Multiple reliable sources confirm that the United States has the largest military budget in the world. According to the Stockholm International Peace Research Institute (SIPRI), the US military budget was $877 billion in 2022, which is more than the next nine countries combined. 
China had the second-largest budget at $292 billion, significantly less than the US.", + sources=[ + "https://www.sipri.org/research/armament-and-disarmament/arms-and-military-expenditure/military-expenditure", + "https://www.pgpf.org/chart-archive/0053_defense-comparison", + "https://www.sipri.org/sites/default/files/2023-04/2304_fs_milex_2022.pdf", + ], + ), + Verdict( + claim="The European Union has 27 member states.", + verdict="true", + confidence=0.98, + explanation="The European Union currently consists of 27 member states. Following the United Kingdom's departure (Brexit) which was completed on January 31, 2020, the EU membership decreased from 28 to 27 countries. The current member states are: Austria, Belgium, Bulgaria, Croatia, Cyprus, Czechia, Denmark, Estonia, Finland, France, Germany, Greece, Hungary, Ireland, Italy, Latvia, Lithuania, Luxembourg, Malta, Netherlands, Poland, Portugal, Romania, Slovakia, Slovenia, Spain and Sweden.", + sources=[ + "https://european-union.europa.eu/principles-countries-history/country-profiles_en", + "https://www.consilium.europa.eu/en/policies/eu-uk-after-referendum/", + ], + ), + Verdict( + claim="The United Nations was founded in 1945.", + verdict="true", + confidence=0.99, + explanation="The United Nations was indeed founded in 1945. The UN officially came into existence on October 24, 1945, when its Charter was ratified by China, France, the Soviet Union, the United Kingdom, the United States, and a majority of other signatories. The organization was established after World War II with the aim of preventing future wars, succeeding the ineffective League of Nations.", + sources=[ + "https://www.un.org/en/about-us/history-of-the-un", + "https://www.britannica.com/topic/United-Nations", + ], + ), +] + +# Verdicts for health claims +HEALTH_VERDICTS = [ + Verdict( + claim="Vaccines cause autism.", + verdict="false", + confidence=0.98, + explanation="The claim that vaccines cause autism is false. 
The scientific consensus, based on numerous large-scale studies involving millions of children, is that there is no causal link between vaccines and autism. A comprehensive review published in the journal Vaccine in 2014 examined 10 studies involving more than 1.2 million children and found no link. The original 1998 study by Andrew Wakefield that suggested a link between the MMR vaccine and autism was retracted by the journal The Lancet due to serious procedural and ethical flaws.", + sources=[ + "https://www.cdc.gov/vaccinesafety/concerns/autism.html", + "https://pubmed.ncbi.nlm.nih.gov/24814559/", + "https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(10)60175-4/fulltext", + ], + ), + Verdict( + claim="Drinking eight glasses of water a day is necessary for good health.", + verdict="partially true", + confidence=0.75, + explanation="The claim that drinking exactly eight glasses of water per day is necessary for good health is partially true. While adequate hydration is essential for health, there is no scientific evidence supporting the specific recommendation of eight glasses. Water needs vary based on many factors including activity level, climate, and overall health. The Institute of Medicine recommends approximately 3.7 liters (125 ounces) of total water daily for men and 2.7 liters (91 ounces) for women, but this includes water from all beverages and foods, not just plain water. 
The '8x8' rule (eight 8-ounce glasses) may be a useful general guideline for some people, but it's not a scientific requirement for everyone.", + sources=[ + "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/water/art-20044256", + "https://www.nationalacademies.org/news/2004/02/report-sets-dietary-intake-levels-for-water-salt-and-potassium-to-maintain-health-and-reduce-chronic-disease-risk", + ], + ), +] + +# Verdicts for science claims +SCIENCE_VERDICTS = [ + Verdict( + claim="The Earth is flat.", + verdict="false", + confidence=0.99, + explanation="The claim that the Earth is flat is false. The Earth is an oblate spheroid, slightly flattened at the poles and bulging at the equator. This has been confirmed by countless observations, measurements, and photographs from space. The ancient Greeks established that the Earth was spherical as early as the 3rd century BCE, with Eratosthenes even calculating its circumference with remarkable accuracy. Modern evidence for Earth's spherical shape includes ship disappearance over the horizon, time zone differences, the circular shadow during lunar eclipses, and direct observation from space.", + sources=[ + "https://www.nasa.gov/image-article/blue-marble-image-earth-from-apollo-17/", + "https://www.aps.org/publications/apsnews/200606/history.cfm", + "https://www.scientificamerican.com/article/earth-is-not-flat-heres-how-to-prove-it-to-flat-earthers/", + ], + ), + Verdict( + claim="Humans only use 10% of their brains.", + verdict="false", + confidence=0.95, + explanation="The claim that humans only use 10% of their brains is false. This popular myth is not supported by neuroscience. Brain imaging techniques show that all parts of the brain have active functions, even during sleep. 
While not all neurons fire simultaneously (which would cause a seizure), brain scans show that most regions of the brain are active during even simple tasks, and virtually all areas are active over the course of a day. The brain is an energy-intensive organ, consuming about 20% of the body's energy despite being only 2% of its weight, which would be evolutionarily inefficient if 90% were unused.", + sources=[ + "https://www.scientificamerican.com/article/do-people-only-use-10-percent-of-their-brains/", + "https://www.bbc.com/future/article/20121112-do-we-only-use-10-of-our-brains", + ], + ), + Verdict( + claim="There is intelligent alien life in our galaxy.", + verdict="unverifiable", + confidence=0.6, + explanation="The claim that there is intelligent alien life in our galaxy is currently unverifiable. While scientific consensus suggests that the mathematical probability of extraterrestrial life existing somewhere in our vast galaxy is high given the billions of stars and planets, we currently have no direct evidence of intelligent alien civilizations. The search for extraterrestrial intelligence (SETI) has been ongoing for decades without confirmed contact. The Fermi Paradox highlights this contradiction: despite high probability estimates, we have yet to detect any signs of alien intelligence. 
Until we have concrete evidence one way or the other, this claim remains unverifiable with current scientific capabilities.", + sources=[ + "https://www.seti.org/search-extraterrestrial-life", + "https://www.nasa.gov/feature/goddard/2020/are-we-alone-in-the-universe-nasa-s-search-for-life", + "https://www.scientificamerican.com/article/the-search-for-extraterrestrial-intelligence/", + ], + ), +] + +# Verdicts for economic claims +ECONOMIC_VERDICTS = [ + Verdict( + claim="The United States has the largest economy in the world.", + verdict="partially true", + confidence=0.85, + explanation="The claim that the United States has the largest economy in the world is partially true, depending on how economic size is measured. When measured by nominal GDP, the United States does have the largest economy with approximately $26.95 trillion as of 2023, followed by China with approximately $17.7 trillion. However, when measured by purchasing power parity (PPP), which adjusts for price differences between countries, China has the largest economy in the world, having surpassed the United States in 2017. Both metrics are valid ways to measure economic size, but they answer different questions: nominal GDP is better for international comparisons involving trade and financial flows, while PPP is better for comparing living standards.", + sources=[ + "https://www.imf.org/en/Publications/WEO/weo-database/2023/October", + "https://www.worldbank.org/en/research/brief/global-economic-prospects", + ], + ), + Verdict( + claim="Bitcoin was the first cryptocurrency.", + verdict="true", + confidence=0.9, + explanation="The claim that Bitcoin was the first cryptocurrency is true, with some nuance. Bitcoin was created in 2009 by an unknown person or group using the pseudonym Satoshi Nakamoto and was the first decentralized cryptocurrency to gain widespread adoption and use. 
While there were earlier attempts at digital currencies such as DigiCash (1989) and B-Money (1998), these systems either were centralized or remained theoretical and never gained widespread use. Bitcoin was revolutionary because it solved the 'double-spending problem' without requiring a trusted third party, through its blockchain technology and proof-of-work consensus mechanism.", + sources=[ + "https://bitcoin.org/bitcoin.pdf", + "https://www.investopedia.com/tech/were-there-cryptocurrencies-bitcoin/", + ], + ), +] + +# Combined verdicts +ALL_VERDICTS = POLITICAL_VERDICTS + HEALTH_VERDICTS + SCIENCE_VERDICTS + ECONOMIC_VERDICTS diff --git a/src/tests/integration/__init__.py b/src/tests/integration/__init__.py new file mode 100644 index 0000000..728b259 --- /dev/null +++ b/src/tests/integration/__init__.py @@ -0,0 +1 @@ +"""Integration tests for VeriFact.""" diff --git a/src/tests/integration/test_end_to_end.py b/src/tests/integration/test_end_to_end.py new file mode 100644 index 0000000..51e3fe5 --- /dev/null +++ b/src/tests/integration/test_end_to_end.py @@ -0,0 +1,194 @@ +"""End-to-end tests for the VeriFact pipeline. + +These tests use the actual agents and make real API calls. +They are marked with the 'e2e' marker and can be skipped in CI. 
+To run these tests, use: pytest -m e2e +""" + +import os +import pytest +from dotenv import load_dotenv + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.verdict_writer import Verdict + +# Load environment variables +load_dotenv() + +# Skip all tests in this module if the OPENAI_API_KEY is not set +pytestmark = [ + pytest.mark.skipif( + os.getenv("OPENAI_API_KEY") is None, + reason="OPENAI_API_KEY environment variable not set" + ), + pytest.mark.e2e # Mark all tests as end-to-end tests +] + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=2, # Limit to 2 claims to reduce API costs + evidence_per_claim=2, # Limit to 2 evidence pieces per claim + timeout_seconds=60.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=False, + ) + return VerifactManager(config) + + +@pytest.mark.asyncio +async def test_simple_factual_statement(manager): + """Test the pipeline with a simple factual statement.""" + # A simple, verifiable factual statement + text = "The Earth is the third planet from the Sun in our solar system." + + # Run the pipeline + results = await manager.run(text) + + # Verify results + assert len(results) > 0 + assert all(isinstance(verdict, Verdict) for verdict in results) + + # The statement should be true + assert any(verdict.verdict == "true" for verdict in results) + + # Check that explanations and sources are provided + for verdict in results: + assert len(verdict.explanation) > 0 + assert len(verdict.sources) > 0 + + +@pytest.mark.asyncio +async def test_false_statement(manager): + """Test the pipeline with a false statement.""" + # A false statement + text = "The Earth is flat and sits at the center of our solar system." 
+ + # Run the pipeline + results = await manager.run(text) + + # Verify results + assert len(results) > 0 + assert all(isinstance(verdict, Verdict) for verdict in results) + + # The statement should be false + assert any(verdict.verdict == "false" for verdict in results) + + # Check that explanations and sources are provided + for verdict in results: + assert len(verdict.explanation) > 0 + assert len(verdict.sources) > 0 + + +@pytest.mark.asyncio +async def test_multiple_claims(manager): + """Test the pipeline with text containing multiple claims.""" + # Text with multiple claims + text = """ + The United States has the largest military budget in the world. + Vaccines have been proven to cause autism in children. + """ + + # Run the pipeline + results = await manager.run(text) + + # Verify results + assert len(results) > 0 + assert all(isinstance(verdict, Verdict) for verdict in results) + + # Check that we have at least one true and one false verdict + verdicts = [verdict.verdict for verdict in results] + assert any(v == "true" for v in verdicts) or any(v == "partially true" for v in verdicts) + assert any(v == "false" for v in verdicts) + + # Check that explanations and sources are provided + for verdict in results: + assert len(verdict.explanation) > 0 + assert len(verdict.sources) > 0 + + +@pytest.mark.asyncio +async def test_no_claims(manager): + """Test the pipeline with text containing no clear factual claims.""" + # Text with no clear factual claims + text = "I wonder what the weather will be like tomorrow. Maybe I should bring an umbrella just in case." 
+ + # Run the pipeline + results = await manager.run(text) + + # Verify results - should be empty or have unverifiable claims + if results: + assert all(isinstance(verdict, Verdict) for verdict in results) + assert all(verdict.verdict == "unverifiable" for verdict in results) + else: + assert results == [] + + +@pytest.mark.asyncio +async def test_partially_true_claim(manager): + """Test the pipeline with a partially true claim.""" + # A partially true statement + text = "Coffee is the most consumed beverage in the world." + + # Run the pipeline + results = await manager.run(text) + + # Verify results + assert len(results) > 0 + assert all(isinstance(verdict, Verdict) for verdict in results) + + # The statement should be partially true or false (water is actually most consumed) + verdicts = [verdict.verdict for verdict in results] + assert any(v in ["partially true", "false"] for v in verdicts) + + # Check that explanations and sources are provided + for verdict in results: + assert len(verdict.explanation) > 0 + assert len(verdict.sources) > 0 + + +@pytest.mark.asyncio +async def test_unverifiable_claim(manager): + """Test the pipeline with an unverifiable claim.""" + # An unverifiable statement + text = "There is intelligent alien life in our galaxy." + + # Run the pipeline + results = await manager.run(text) + + # Verify results + assert len(results) > 0 + assert all(isinstance(verdict, Verdict) for verdict in results) + + # The statement should be unverifiable or have low confidence + for verdict in results: + assert verdict.verdict == "unverifiable" or verdict.confidence < 0.7 + assert len(verdict.explanation) > 0 + assert len(verdict.sources) > 0 + + +@pytest.mark.asyncio +async def test_subjective_claim(manager): + """Test the pipeline with a subjective claim that should be unverifiable.""" + # A subjective statement + text = "Chocolate ice cream tastes better than vanilla ice cream." 
+ + # Run the pipeline + results = await manager.run(text) + + # Verify results - should be empty or have unverifiable claims + if results: + assert all(isinstance(verdict, Verdict) for verdict in results) + assert all(verdict.verdict == "unverifiable" for verdict in results) + + # Check that explanations mention subjectivity + for verdict in results: + assert any(term in verdict.explanation.lower() for term in ["subjective", "opinion", "preference", "taste"]) + else: + # If no claims were detected, that's also acceptable + assert results == [] diff --git a/src/tests/integration/test_pipeline_integration.py b/src/tests/integration/test_pipeline_integration.py new file mode 100644 index 0000000..e9bf50d --- /dev/null +++ b/src/tests/integration/test_pipeline_integration.py @@ -0,0 +1,195 @@ +"""Integration tests for the VeriFact pipeline.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from src.tests.fixtures.claims import POLITICAL_CLAIMS, HEALTH_CLAIMS, SAMPLE_TEXTS +from src.tests.fixtures.evidence import POLITICAL_EVIDENCE, HEALTH_EVIDENCE +from src.tests.fixtures.verdicts import POLITICAL_VERDICTS, HEALTH_VERDICTS + +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict +from src.verifact_manager import VerifactManager, ManagerConfig + + +class MockRunnerResult: + """Mock for the result returned by Runner.run().""" + + def __init__(self, output_data): + self.output_data = output_data + self.final_output = str(output_data) + + def final_output_as(self, output_type): + """Mock the final_output_as method.""" + return self.output_data + + +class MockClaimDetector: + """Mock claim detector for testing.""" + + def __init__(self, claims_to_return): + self.claims_to_return = claims_to_return + self.detect_claims = AsyncMock(return_value=claims_to_return) + + +class MockEvidenceHunter: + """Mock evidence hunter for testing.""" + 
+ def __init__(self, evidence_to_return): + self.evidence_to_return = evidence_to_return + self.gather_evidence = AsyncMock(return_value=evidence_to_return) + + +class MockVerdictWriter: + """Mock verdict writer for testing.""" + + def __init__(self, verdict_to_return): + self.verdict_to_return = verdict_to_return + self.generate_verdict = AsyncMock(return_value=verdict_to_return) + + +@pytest.mark.asyncio +async def test_pipeline_with_mocks(): + """Test the factcheck pipeline with mock agents.""" + # Sample test data from fixtures + sample_claim = POLITICAL_CLAIMS[0] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + sample_verdict = POLITICAL_VERDICTS[0] + + # Create mock agents + mock_claim_detector = MockClaimDetector(claims_to_return=[sample_claim]) + mock_evidence_hunter = MockEvidenceHunter(evidence_to_return=sample_evidence) + mock_verdict_writer = MockVerdictWriter(verdict_to_return=sample_verdict) + + # Verify that the mocks can be called + detected_claims = await mock_claim_detector.detect_claims(SAMPLE_TEXTS[0]) + assert detected_claims == [sample_claim] + + gathered_evidence = await mock_evidence_hunter.gather_evidence(sample_claim) + assert gathered_evidence == sample_evidence + + verdict = await mock_verdict_writer.generate_verdict(sample_claim, sample_evidence) + assert verdict == sample_verdict + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_full_pipeline_integration(mock_run): + """Test the full pipeline integration with mocked Runner.""" + # Setup test data + sample_claims = POLITICAL_CLAIMS[:2] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + sample_verdict = POLITICAL_VERDICTS[0] + + # Configure mock to return different results for different agent calls + def mock_runner_side_effect(*args, **kwargs): + # Check which agent is being called based on the agent object (first arg) + agent = args[0] + if agent.__dict__.get('name') == 'ClaimDetector': + return 
MockRunnerResult(sample_claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(sample_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(sample_verdict) + return MockRunnerResult([]) + + # Use side_effect to return different values based on input + mock_run.side_effect = mock_runner_side_effect + + # Create manager and run the pipeline + manager = VerifactManager() + results = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results + assert len(results) > 0 + assert isinstance(results[0], Verdict) + assert results[0].claim == sample_verdict.claim + assert results[0].verdict == sample_verdict.verdict + + # Verify the Runner.run was called for each agent + assert mock_run.call_count >= 3 # At least once for each agent + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_pipeline_with_multiple_claims(mock_run): + """Test the pipeline with multiple claims of different types.""" + # Setup test data - mix of political and health claims + mixed_claims = POLITICAL_CLAIMS[:1] + HEALTH_CLAIMS[:1] + + # Configure mock for different agent calls + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult(mixed_claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Return different evidence based on which claim is being processed + if call_count == 0: + call_count += 1 + return MockRunnerResult(POLITICAL_EVIDENCE["US military budget"]) + else: + return MockRunnerResult(HEALTH_EVIDENCE["Vaccines and autism"]) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Return different verdicts based on which claim is being processed + if call_count == 1: + call_count += 1 + return MockRunnerResult(POLITICAL_VERDICTS[0]) + else: + return MockRunnerResult(HEALTH_VERDICTS[0]) + return MockRunnerResult([]) + + mock_run.side_effect = 
mock_runner_side_effect + + # Create manager and run the pipeline + manager = VerifactManager() + results = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results + assert len(results) == 2 + assert all(isinstance(verdict, Verdict) for verdict in results) + + # Verify the Runner.run was called multiple times + # 1 for claim detection + 2 for evidence gathering + 2 for verdict generation + assert mock_run.call_count >= 5 + + +@pytest.mark.asyncio +@patch("src.verifact_agents.claim_detector.claim_detector_agent") +@patch("src.verifact_agents.evidence_hunter.evidence_hunter_agent") +@patch("src.verifact_agents.verdict_writer.verdict_writer_agent") +async def test_agent_integration(mock_claim_detector, mock_evidence_hunter, mock_verdict_writer): + """Test the integration of the three main agents with the VerifactManager.""" + # Setup mock returns + mock_claim_detector_result = MagicMock() + mock_claim_detector_result.final_output_as.return_value = POLITICAL_CLAIMS[:1] + mock_claim_detector.return_value = mock_claim_detector_result + + mock_evidence_hunter_result = MagicMock() + mock_evidence_hunter_result.final_output_as.return_value = POLITICAL_EVIDENCE["US military budget"] + mock_evidence_hunter.return_value = mock_evidence_hunter_result + + mock_verdict_writer_result = MagicMock() + mock_verdict_writer_result.final_output_as.return_value = POLITICAL_VERDICTS[0] + mock_verdict_writer.return_value = mock_verdict_writer_result + + # Create manager and run the pipeline + manager = VerifactManager() + results = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results + assert len(results) == 1 + assert results[0] == POLITICAL_VERDICTS[0] + + # Verify each agent was called + mock_claim_detector.assert_called_once() + mock_evidence_hunter.assert_called_once() + mock_verdict_writer.assert_called_once() + + # Verify the final_output_as method was called for each result + mock_claim_detector_result.final_output_as.assert_called_once() + 
mock_evidence_hunter_result.final_output_as.assert_called_once() + mock_verdict_writer_result.final_output_as.assert_called_once() diff --git a/src/tests/test_claim_edge_cases.py b/src/tests/test_claim_edge_cases.py new file mode 100644 index 0000000..61ab367 --- /dev/null +++ b/src/tests/test_claim_edge_cases.py @@ -0,0 +1,300 @@ +"""Tests for edge cases in claim handling.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + +from src.tests.fixtures.claims import POLITICAL_CLAIMS, SAMPLE_TEXTS +from src.tests.fixtures.evidence import POLITICAL_EVIDENCE +from src.tests.fixtures.verdicts import POLITICAL_VERDICTS + + +class MockRunnerResult: + """Mock for the result returned by Runner.run().""" + + def __init__(self, output_data): + self.output_data = output_data + self.final_output = str(output_data) + + def final_output_as(self, output_type): + """Mock the final_output_as method.""" + return self.output_data + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=False, + ) + return VerifactManager(config) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_no_claims_detected(mock_run, manager): + """Test the pipeline when no claims are detected.""" + # Configure mock to return empty list for claim detection + mock_run.return_value = MockRunnerResult([]) + + # Run the pipeline + results = await manager.run("Text with no factual claims") + + # Verify results + assert results == [] + mock_run.assert_called_once() + + +@pytest.mark.asyncio 
+@patch("src.verifact_manager.Runner.run") +async def test_no_evidence_found(mock_run, manager): + """Test the pipeline when no evidence is found for a claim.""" + # Sample claim + sample_claim = POLITICAL_CLAIMS[0] + + # Configure mock to return different results for different agent calls + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([sample_claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Return empty list for evidence + return MockRunnerResult([]) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results - should be empty since no evidence was found + assert results == [] + assert mock_run.call_count >= 2 # Called for claim detection and evidence gathering + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_evidence_gathering_error(mock_run, manager): + """Test the pipeline when evidence gathering raises an error.""" + # Sample claim + sample_claim = POLITICAL_CLAIMS[0] + + # Configure mock to return different results for different agent calls + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([sample_claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Raise an exception for evidence gathering + raise Exception("Evidence gathering error") + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline and expect an exception + with pytest.raises(Exception): + await manager.run(SAMPLE_TEXTS[0]) + + # Verify the mock was called + assert mock_run.call_count >= 1 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_low_confidence_verdict(mock_run, manager): + """Test the pipeline with a 
verdict that has low confidence.""" + # Sample claim and evidence + sample_claim = POLITICAL_CLAIMS[0] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + + # Create a low confidence verdict + low_confidence_verdict = Verdict( + claim=sample_claim.text, + verdict="partially true", + confidence=0.3, # Low confidence + explanation="This is a low confidence verdict due to limited evidence.", + sources=["https://example.com/source1"], + ) + + # Configure mock to return appropriate results + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([sample_claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(sample_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(low_confidence_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results + assert len(results) == 1 + assert results[0].verdict == "partially true" + assert results[0].confidence == 0.3 + assert "low confidence" in results[0].explanation.lower() + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_conflicting_evidence(mock_run, manager): + """Test the pipeline with conflicting evidence for a claim.""" + # Sample claim + sample_claim = POLITICAL_CLAIMS[0] + + # Create conflicting evidence + conflicting_evidence = [ + Evidence( + content="The United States has the largest military budget in the world, spending over $800 billion annually.", + source="https://example.com/source1", + relevance=0.9, + stance="supporting", + ), + Evidence( + content="China has surpassed the United States in military spending according to alternative metrics.", + source="https://example.com/source2", + relevance=0.8, + stance="contradicting", + ), + ] + + # Create a verdict based on conflicting evidence + 
conflicting_verdict = Verdict( + claim=sample_claim.text, + verdict="partially true", + confidence=0.6, + explanation="There is conflicting evidence about this claim. While traditional metrics show the US has the largest military budget, alternative calculations suggest China may have surpassed it.", + sources=["https://example.com/source1", "https://example.com/source2"], + ) + + # Configure mock to return appropriate results + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([sample_claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(conflicting_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(conflicting_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results + assert len(results) == 1 + assert results[0].verdict == "partially true" + assert "conflicting evidence" in results[0].explanation.lower() + assert len(results[0].sources) == 2 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_claim_with_no_context(mock_run, manager): + """Test the pipeline with a claim that has no context.""" + # Create a claim with no context + claim_no_context = Claim(text="The Moon orbits the Earth.") + + # Sample evidence and verdict + sample_evidence = [ + Evidence( + content="The Moon orbits the Earth at an average distance of 384,400 kilometers.", + source="https://example.com/source1", + relevance=0.95, + stance="supporting", + ), + ] + + sample_verdict = Verdict( + claim=claim_no_context.text, + verdict="true", + confidence=0.99, + explanation="This is a basic astronomical fact that is well-established.", + sources=["https://example.com/source1"], + ) + + # Configure mock to return appropriate results + def mock_runner_side_effect(*args, **kwargs): + 
agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([claim_no_context]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(sample_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(sample_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("The Moon orbits the Earth.") + + # Verify results + assert len(results) == 1 + assert results[0].verdict == "true" + assert results[0].confidence > 0.9 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_multiple_claims_one_fails(mock_run, manager): + """Test the pipeline when one claim fails during evidence gathering.""" + # Sample claims + claim1 = POLITICAL_CLAIMS[0] + claim2 = POLITICAL_CLAIMS[1] + + # Sample evidence and verdict + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + sample_verdict = POLITICAL_VERDICTS[0] + + # Configure mock to return different results for different agent calls + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([claim1, claim2]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # First evidence gathering succeeds, second fails + if call_count == 0: + call_count += 1 + return MockRunnerResult(sample_evidence) + else: + raise Exception("Evidence gathering error for second claim") + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(sample_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline - with raise_exceptions=True, the evidence failure should propagate + with pytest.raises(Exception): + await manager.run(SAMPLE_TEXTS[0]) + + # Verify the mock was called multiple times + assert mock_run.call_count >= 3 diff --git a/src/tests/test_claim_types.py
b/src/tests/test_claim_types.py new file mode 100644 index 0000000..bbe9e46 --- /dev/null +++ b/src/tests/test_claim_types.py @@ -0,0 +1,296 @@ +"""Tests for different claim types in the VeriFact pipeline.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + +from src.tests.fixtures.claims import POLITICAL_CLAIMS, HEALTH_CLAIMS, SCIENCE_CLAIMS +from src.tests.fixtures.evidence import POLITICAL_EVIDENCE, HEALTH_EVIDENCE, SCIENCE_EVIDENCE +from src.tests.fixtures.verdicts import ( + POLITICAL_VERDICTS, + HEALTH_VERDICTS, + SCIENCE_VERDICTS, + ALL_VERDICTS, +) + + +class MockRunnerResult: + """Mock for the result returned by Runner.run().""" + + def __init__(self, output_data): + self.output_data = output_data + self.final_output = str(output_data) + + def final_output_as(self, output_type): + """Mock the final_output_as method.""" + return self.output_data + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=False, + ) + return VerifactManager(config) + + +def get_verdict_by_type(verdict_type): + """Get a verdict of the specified type from the fixtures.""" + for verdict in ALL_VERDICTS: + if verdict.verdict == verdict_type: + return verdict + raise ValueError(f"No verdict of type '{verdict_type}' found in fixtures") + + +def get_claim_and_evidence_for_verdict(verdict): + """Get a matching claim and evidence for the given verdict.""" + # Find a claim that matches the verdict + claim_text = verdict.claim + claim = None + + # Search in all claim collections + for claim_list 
in [POLITICAL_CLAIMS, HEALTH_CLAIMS, SCIENCE_CLAIMS]: + for c in claim_list: + if c.text == claim_text: + claim = c + break + if claim: + break + + if not claim: + # Create a new claim if not found + claim = Claim(text=claim_text, context=0.8) + + # Find or create evidence + evidence = [] + if "United States" in claim_text and "military" in claim_text: + evidence = POLITICAL_EVIDENCE["US military budget"] + elif "vaccine" in claim_text.lower() and "autism" in claim_text.lower(): + evidence = HEALTH_EVIDENCE["Vaccines and autism"] + elif "Earth" in claim_text and "flat" in claim_text: + evidence = SCIENCE_EVIDENCE["Flat Earth"] + elif "brain" in claim_text.lower(): + evidence = SCIENCE_EVIDENCE["10% of brain"] + else: + # Create generic evidence if no specific evidence is found + evidence = [ + Evidence( + content=f"Evidence related to: {claim_text}", + source="https://example.com/evidence", + relevance=0.8, + stance="supporting" if verdict.verdict == "true" else "contradicting", + ) + ] + + return claim, evidence + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_true_claim(mock_run, manager): + """Test the pipeline with a true claim.""" + # Get a true verdict from fixtures + true_verdict = get_verdict_by_type("true") + claim, evidence = get_claim_and_evidence_for_verdict(true_verdict) + + # Configure mock to return appropriate results + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(true_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(f"Text containing the claim: {claim.text}") + + # Verify results + assert len(results) == 1 + assert results[0].verdict == "true" + 
assert results[0].confidence > 0.8 # High confidence for true claims + assert len(results[0].explanation) > 0 + assert len(results[0].sources) > 0 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_false_claim(mock_run, manager): + """Test the pipeline with a false claim.""" + # Get a false verdict from fixtures + false_verdict = get_verdict_by_type("false") + claim, evidence = get_claim_and_evidence_for_verdict(false_verdict) + + # Configure mock to return appropriate results + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(false_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(f"Text containing the claim: {claim.text}") + + # Verify results + assert len(results) == 1 + assert results[0].verdict == "false" + assert results[0].confidence > 0.8 # High confidence for false claims + assert len(results[0].explanation) > 0 + assert len(results[0].sources) > 0 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_partially_true_claim(mock_run, manager): + """Test the pipeline with a partially true claim.""" + # Get a partially true verdict from fixtures + partially_true_verdict = get_verdict_by_type("partially true") + claim, evidence = get_claim_and_evidence_for_verdict(partially_true_verdict) + + # Configure mock to return appropriate results + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return 
MockRunnerResult(partially_true_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(f"Text containing the claim: {claim.text}") + + # Verify results + assert len(results) == 1 + assert results[0].verdict == "partially true" + assert 0.5 <= results[0].confidence <= 0.9 # Moderate confidence for partially true claims + assert len(results[0].explanation) > 0 + assert len(results[0].sources) > 0 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_unverifiable_claim(mock_run, manager): + """Test the pipeline with an unverifiable claim.""" + # Get an unverifiable verdict from fixtures + unverifiable_verdict = get_verdict_by_type("unverifiable") + claim, evidence = get_claim_and_evidence_for_verdict(unverifiable_verdict) + + # Configure mock to return appropriate results + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult([claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockRunnerResult(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockRunnerResult(unverifiable_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(f"Text containing the claim: {claim.text}") + + # Verify results + assert len(results) == 1 + assert results[0].verdict == "unverifiable" + assert results[0].confidence < 0.8 # Lower confidence for unverifiable claims + assert len(results[0].explanation) > 0 + assert len(results[0].sources) > 0 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_mixed_claim_types(mock_run, manager): + """Test the pipeline with a mix of different claim types.""" + # Get verdicts of different types + true_verdict = get_verdict_by_type("true") + false_verdict = get_verdict_by_type("false") + 
partially_true_verdict = get_verdict_by_type("partially true") + unverifiable_verdict = get_verdict_by_type("unverifiable") + + # Get claims and evidence + true_claim, true_evidence = get_claim_and_evidence_for_verdict(true_verdict) + false_claim, false_evidence = get_claim_and_evidence_for_verdict(false_verdict) + partially_true_claim, partially_true_evidence = get_claim_and_evidence_for_verdict(partially_true_verdict) + unverifiable_claim, unverifiable_evidence = get_claim_and_evidence_for_verdict(unverifiable_verdict) + + # All claims to be detected + all_claims = [true_claim, false_claim, partially_true_claim, unverifiable_claim] + + # Configure mock to return appropriate results + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult(all_claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Return different evidence based on which claim is being processed + claim_text = args[1].split("Claim to investigate: ")[1].split("\n")[0] + if claim_text == true_claim.text: + return MockRunnerResult(true_evidence) + elif claim_text == false_claim.text: + return MockRunnerResult(false_evidence) + elif claim_text == partially_true_claim.text: + return MockRunnerResult(partially_true_evidence) + else: + return MockRunnerResult(unverifiable_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Return different verdicts based on which claim is being processed + claim_text = args[1].split("Claim to investigate: ")[1].split("\n")[0] + if claim_text == true_claim.text: + return MockRunnerResult(true_verdict) + elif claim_text == false_claim.text: + return MockRunnerResult(false_verdict) + elif claim_text == partially_true_claim.text: + return MockRunnerResult(partially_true_verdict) + else: + return MockRunnerResult(unverifiable_verdict) + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run 
the pipeline + results = await manager.run("Text containing multiple claims of different types") + + # Verify results + assert len(results) == 4 + + # Check that we have one of each verdict type + verdict_types = [result.verdict for result in results] + assert "true" in verdict_types + assert "false" in verdict_types + assert "partially true" in verdict_types + assert "unverifiable" in verdict_types + + # Verify the Runner.run was called multiple times + # 1 for claim detection + 4 for evidence gathering + 4 for verdict generation + assert mock_run.call_count >= 9 diff --git a/src/tests/test_complex_claims.py b/src/tests/test_complex_claims.py new file mode 100644 index 0000000..768b012 --- /dev/null +++ b/src/tests/test_complex_claims.py @@ -0,0 +1,358 @@ +"""Tests for handling complex claims in the VeriFact pipeline.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + +from src.tests.fixtures.claims import SAMPLE_TEXTS + + +class MockRunnerResult: + """Mock for the result returned by Runner.run().""" + + def __init__(self, output_data): + self.output_data = output_data + self.final_output = str(output_data) + + def final_output_as(self, output_type): + """Mock the final_output_as method.""" + return self.output_data + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=10, # Allow more claims for complex tests + evidence_per_claim=3, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=False, + ) + return VerifactManager(config) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_compound_claim(mock_run, 
manager): + """Test the pipeline with a compound claim that should be broken down.""" + # A compound claim that contains multiple factual assertions + compound_claim = Claim( + text="The Earth is round and orbits the Sun, which is a star that is 93 million miles away.", + context=0.9, + ) + + # Individual claims after breakdown + individual_claims = [ + Claim(text="The Earth is round.", context=0.9), + Claim(text="The Earth orbits the Sun.", context=0.9), + Claim(text="The Sun is a star.", context=0.9), + Claim(text="The Sun is 93 million miles away from Earth.", context=0.9), + ] + + # Evidence for each claim + evidence_sets = [ + [Evidence( + content="The Earth is an oblate spheroid, slightly flattened at the poles and bulging at the equator.", + source="https://example.com/earth-shape", + relevance=0.95, + stance="supporting", + )], + [Evidence( + content="The Earth orbits the Sun once every 365.25 days, completing a full revolution.", + source="https://example.com/earth-orbit", + relevance=0.95, + stance="supporting", + )], + [Evidence( + content="The Sun is a G-type main-sequence star and is the largest object in our solar system.", + source="https://example.com/sun-star", + relevance=0.95, + stance="supporting", + )], + [Evidence( + content="The average distance from the Earth to the Sun is about 93 million miles (150 million kilometers).", + source="https://example.com/sun-distance", + relevance=0.95, + stance="supporting", + )], + ] + + # Verdicts for each claim + verdicts = [ + Verdict( + claim="The Earth is round.", + verdict="true", + confidence=0.99, + explanation="The Earth is indeed round, or more precisely, an oblate spheroid.", + sources=["https://example.com/earth-shape"], + ), + Verdict( + claim="The Earth orbits the Sun.", + verdict="true", + confidence=0.99, + explanation="The Earth completes one orbit around the Sun every 365.25 days.", + sources=["https://example.com/earth-orbit"], + ), + Verdict( + claim="The Sun is a star.", + 
verdict="true", + confidence=0.99, + explanation="The Sun is a G-type main-sequence star at the center of our solar system.", + sources=["https://example.com/sun-star"], + ), + Verdict( + claim="The Sun is 93 million miles away from Earth.", + verdict="true", + confidence=0.95, + explanation="The average distance from Earth to the Sun is approximately 93 million miles.", + sources=["https://example.com/sun-distance"], + ), + ] + + # Configure mock to return appropriate results + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + # Return individual claims instead of the compound claim + return MockRunnerResult(individual_claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Return evidence for the appropriate claim + current_evidence = evidence_sets[call_count % len(evidence_sets)] + call_count += 1 + return MockRunnerResult(current_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Return the verdict for the appropriate claim + claim_text = args[1].split("Claim to investigate: ")[1].split("\n")[0] + for i, claim in enumerate(individual_claims): + if claim.text == claim_text: + return MockRunnerResult(verdicts[i]) + return MockRunnerResult(verdicts[0]) # Fallback + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("The Earth is round and orbits the Sun, which is a star that is 93 million miles away.") + + # Verify results + assert len(results) == 4 + assert all(isinstance(verdict, Verdict) for verdict in results) + assert all(verdict.verdict == "true" for verdict in results) + + # Check that all individual claims were addressed + claim_texts = [verdict.claim for verdict in results] + for claim in individual_claims: + assert claim.text in claim_texts + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def 
test_claim_with_context_dependency(mock_run, manager): + """Test the pipeline with claims that have context dependencies.""" + # Claims with context dependencies + context_claims = [ + Claim(text="The president signed the bill yesterday.", context=0.8), + Claim(text="It will go into effect next month.", context=0.7), + ] + + # Evidence for each claim + evidence_sets = [ + [Evidence( + content="President Biden signed the Infrastructure Investment and Jobs Act on November 15, 2021.", + source="https://example.com/bill-signing", + relevance=0.9, + stance="supporting", + )], + [Evidence( + content="The Infrastructure Investment and Jobs Act provisions will begin implementation in December 2021.", + source="https://example.com/bill-implementation", + relevance=0.8, + stance="supporting", + )], + ] + + # Verdicts for each claim + verdicts = [ + Verdict( + claim="The president signed the bill yesterday.", + verdict="unverifiable", + confidence=0.7, + explanation="This claim is time-dependent and lacks specific context about which president and which bill is being referenced. Without this context, the claim cannot be verified.", + sources=["https://example.com/bill-signing"], + ), + Verdict( + claim="It will go into effect next month.", + verdict="unverifiable", + confidence=0.6, + explanation="This claim is context-dependent and lacks specific information about what 'it' refers to and when 'next month' is. 
Without this context, the claim cannot be verified.", + sources=["https://example.com/bill-implementation"], + ), + ] + + # Configure mock to return appropriate results + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult(context_claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + current_evidence = evidence_sets[call_count % len(evidence_sets)] + call_count += 1 + return MockRunnerResult(current_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + claim_text = args[1].split("Claim to investigate: ")[1].split("\n")[0] + for i, claim in enumerate(context_claims): + if claim.text == claim_text: + return MockRunnerResult(verdicts[i]) + return MockRunnerResult(verdicts[0]) # Fallback + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("The president signed the bill yesterday. 
It will go into effect next month.") + + # Verify results + assert len(results) == 2 + assert all(isinstance(verdict, Verdict) for verdict in results) + assert all(verdict.verdict == "unverifiable" for verdict in results) + + # Check that explanations mention context dependency + for verdict in results: + assert any(term in verdict.explanation.lower() for term in ["context", "specific", "lacks"]) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_claim_with_mixed_verdicts(mock_run, manager): + """Test the pipeline with a text containing claims with different verdicts.""" + # Claims with different verdicts + mixed_claims = [ + Claim(text="Water boils at 100 degrees Celsius at sea level.", context=0.9), + Claim(text="The Great Wall of China is visible from the Moon.", context=0.8), + Claim(text="Humans have explored less than 5% of the ocean.", context=0.85), + Claim(text="There are exactly 1 million species of insects on Earth.", context=0.7), + ] + + # Evidence for each claim + evidence_sets = [ + [Evidence( + content="Water boils at 100 degrees Celsius (212 degrees Fahrenheit) at standard atmospheric pressure at sea level.", + source="https://example.com/water-boiling", + relevance=0.95, + stance="supporting", + )], + [Evidence( + content="The Great Wall of China is not visible from the Moon with the naked eye. 
This is a common misconception.", + source="https://example.com/great-wall-visibility", + relevance=0.9, + stance="contradicting", + )], + [Evidence( + content="According to NOAA, more than 80% of the ocean remains unmapped, unobserved, and unexplored.", + source="https://example.com/ocean-exploration", + relevance=0.85, + stance="supporting", + )], + [Evidence( + content="Scientists have described about 1 million insect species, but estimates of the total number range from 2 million to 30 million species.", + source="https://example.com/insect-species", + relevance=0.8, + stance="contradicting", + )], + ] + + # Verdicts for each claim + verdicts = [ + Verdict( + claim="Water boils at 100 degrees Celsius at sea level.", + verdict="true", + confidence=0.98, + explanation="Water boils at 100 degrees Celsius (212 degrees Fahrenheit) at standard atmospheric pressure at sea level.", + sources=["https://example.com/water-boiling"], + ), + Verdict( + claim="The Great Wall of China is visible from the Moon.", + verdict="false", + confidence=0.95, + explanation="The Great Wall of China is not visible from the Moon with the naked eye. This is a common misconception.", + sources=["https://example.com/great-wall-visibility"], + ), + Verdict( + claim="Humans have explored less than 5% of the ocean.", + verdict="partially true", + confidence=0.85, + explanation="This claim is partially true. While the exact percentage varies by definition of 'explored', NOAA states that more than 80% of the ocean remains unmapped, unobserved, and unexplored.", + sources=["https://example.com/ocean-exploration"], + ), + Verdict( + claim="There are exactly 1 million species of insects on Earth.", + verdict="false", + confidence=0.8, + explanation="This claim is false. 
While scientists have described about 1 million insect species, estimates of the total number range from 2 million to 30 million species.", + sources=["https://example.com/insect-species"], + ), + ] + + # Configure mock to return appropriate results + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockRunnerResult(mixed_claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + current_evidence = evidence_sets[call_count % len(evidence_sets)] + call_count += 1 + return MockRunnerResult(current_evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + claim_text = args[1].split("Claim to investigate: ")[1].split("\n")[0] + for i, claim in enumerate(mixed_claims): + if claim.text == claim_text: + return MockRunnerResult(verdicts[i]) + return MockRunnerResult(verdicts[0]) # Fallback + return MockRunnerResult([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run(""" + Water boils at 100 degrees Celsius at sea level. + The Great Wall of China is visible from the Moon. + Humans have explored less than 5% of the ocean. + There are exactly 1 million species of insects on Earth. 
+ """) + + # Verify results + assert len(results) == 4 + + # Check that we have one of each verdict type + verdict_types = [result.verdict for result in results] + assert "true" in verdict_types + assert "false" in verdict_types + assert "partially true" in verdict_types + + # Check that the verdicts match the expected claims + for verdict in results: + if verdict.claim == "Water boils at 100 degrees Celsius at sea level.": + assert verdict.verdict == "true" + elif verdict.claim == "The Great Wall of China is visible from the Moon.": + assert verdict.verdict == "false" + elif verdict.claim == "Humans have explored less than 5% of the ocean.": + assert verdict.verdict == "partially true" + elif verdict.claim == "There are exactly 1 million species of insects on Earth.": + assert verdict.verdict == "false" diff --git a/src/tests/test_data_flow.py b/src/tests/test_data_flow.py new file mode 100644 index 0000000..33e6979 --- /dev/null +++ b/src/tests/test_data_flow.py @@ -0,0 +1,384 @@ +"""Tests for data flow between agents in the VeriFact pipeline.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch, call + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + +from src.tests.utils.mock_data_factory import MockDataFactory + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=True, + ) + return VerifactManager(config) + + +class DataFlowCaptor: + """Captures data flow between agents.""" + + def __init__(self): + self.claim_detector_inputs = [] + self.evidence_hunter_inputs = [] + self.verdict_writer_inputs = [] + 
self.claim_detector_outputs = [] + self.evidence_hunter_outputs = [] + self.verdict_writer_outputs = [] + + def reset(self): + """Reset all captured data.""" + self.__init__() + + +@pytest.fixture +def data_flow_captor(): + """Create a DataFlowCaptor instance.""" + return DataFlowCaptor() + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_data_flow_integrity(mock_run, manager, data_flow_captor): + """Test the integrity of data flow between agents.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=2) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock to capture inputs and return outputs + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + input_data = args[1] + + if agent.__dict__.get('name') == 'ClaimDetector': + data_flow_captor.claim_detector_inputs.append(input_data) + data_flow_captor.claim_detector_outputs.append(claims) + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + data_flow_captor.evidence_hunter_inputs.append(input_data) + + # Extract claim text from the query + claim_text = next((c.text for c in claims if c.text in input_data), None) + evidence = evidence_map.get(claim_text, []) + + data_flow_captor.evidence_hunter_outputs.append(evidence) + return MockDataFactory.create_runner_result_mock(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + data_flow_captor.verdict_writer_inputs.append(input_data) + + # Extract claim text from the prompt + claim_text = next((c.text for c in claims if c.text in input_data), None) + verdict = next((v for v in verdicts if v.claim == claim_text), verdicts[0]) + + data_flow_captor.verdict_writer_outputs.append(verdict) + return MockDataFactory.create_runner_result_mock(verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = 
mock_runner_side_effect + + # Run the pipeline + input_text = "Test text with claims" + results = await manager.run(input_text) + + # Verify results + assert len(results) == 2 + + # Verify data flow + # 1. Claim detector should receive the original input text + assert len(data_flow_captor.claim_detector_inputs) == 1 + assert data_flow_captor.claim_detector_inputs[0] == input_text + + # 2. Evidence hunter should receive queries containing the claims + assert len(data_flow_captor.evidence_hunter_inputs) == 2 + for i, claim in enumerate(claims): + assert claim.text in data_flow_captor.evidence_hunter_inputs[i] + + # 3. Verdict writer should receive prompts containing claims and evidence + assert len(data_flow_captor.verdict_writer_inputs) == 2 + for i, claim in enumerate(claims): + assert claim.text in data_flow_captor.verdict_writer_inputs[i] + assert "Evidence" in data_flow_captor.verdict_writer_inputs[i] + + # 4. Final results should match verdict writer outputs + for i, result in enumerate(results): + assert result.claim in [v.claim for v in verdicts] + assert result.verdict in [v.verdict for v in verdicts] + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_claim_filtering(mock_run, manager): + """Test that claims are properly filtered based on check-worthiness.""" + # Create test data with varying context scores + claims = [ + Claim(text="High worthiness claim", context=0.9), + Claim(text="Medium worthiness claim", context=0.6), + Claim(text="Low worthiness claim", context=0.3), # Below threshold + ] + + evidence = [ + [Evidence(content="Evidence for high", source="https://example.com/high", relevance=0.9, stance="supporting")], + [Evidence(content="Evidence for medium", source="https://example.com/medium", relevance=0.8, stance="supporting")], + ] + + verdicts = [ + Verdict( + claim="High worthiness claim", + verdict="true", + confidence=0.9, + explanation="High worthiness explanation", + sources=["https://example.com/high"], + 
), + Verdict( + claim="Medium worthiness claim", + verdict="partially true", + confidence=0.7, + explanation="Medium worthiness explanation", + sources=["https://example.com/medium"], + ), + ] + + # Configure mock + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Should only be called for claims above the threshold + assert call_count < 2, "Evidence hunter called too many times" + result = evidence[call_count] + call_count += 1 + return MockDataFactory.create_runner_result_mock(result) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Extract claim text from the prompt + prompt = args[1] + if "High worthiness claim" in prompt: + return MockDataFactory.create_runner_result_mock(verdicts[0]) + else: + return MockDataFactory.create_runner_result_mock(verdicts[1]) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with claims of varying worthiness") + + # Verify results - should only have verdicts for claims above the threshold + assert len(results) == 2 + result_claims = [result.claim for result in results] + assert "High worthiness claim" in result_claims + assert "Medium worthiness claim" in result_claims + assert "Low worthiness claim" not in result_claims + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_evidence_transformation(mock_run, manager): + """Test that evidence is properly transformed between agents.""" + # Create test data + claim = Claim(text="Test claim", context=0.8) + + # Create evidence with different stances + evidence = [ + Evidence(content="Supporting evidence", source="https://example.com/support", relevance=0.9, stance="supporting"), + 
Evidence(content="Contradicting evidence", source="https://example.com/contradict", relevance=0.8, stance="contradicting"), + Evidence(content="Neutral evidence", source="https://example.com/neutral", relevance=0.7, stance="neutral"), + ] + + verdict = Verdict( + claim="Test claim", + verdict="partially true", + confidence=0.7, + explanation="Partially true due to mixed evidence", + sources=["https://example.com/support", "https://example.com/contradict", "https://example.com/neutral"], + ) + + # Configure mock to capture the evidence transformation + verdict_writer_input = None + def mock_runner_side_effect(*args, **kwargs): + nonlocal verdict_writer_input + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock([claim]) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockDataFactory.create_runner_result_mock(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + verdict_writer_input = args[1] + return MockDataFactory.create_runner_result_mock(verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with a claim") + + # Verify results + assert len(results) == 1 + assert results[0].claim == claim.text + assert results[0].verdict == "partially true" + + # Verify evidence transformation + assert verdict_writer_input is not None + + # Check that all evidence is included in the verdict writer input + for e in evidence: + assert e.content in verdict_writer_input + assert e.source in verdict_writer_input + assert e.stance in verdict_writer_input + + # Check that evidence stances are preserved + assert "supporting" in verdict_writer_input + assert "contradicting" in verdict_writer_input + assert "neutral" in verdict_writer_input + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_verdict_schema_compliance(mock_run, manager): + 
"""Test that verdicts comply with the expected schema.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=1) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + + # Create verdicts with different types + verdicts = [ + Verdict( + claim=claims[0].text, + verdict="true", + confidence=0.9, + explanation="True explanation", + sources=["https://example.com/true"], + ), + Verdict( + claim=claims[0].text, + verdict="false", + confidence=0.9, + explanation="False explanation", + sources=["https://example.com/false"], + ), + Verdict( + claim=claims[0].text, + verdict="partially true", + confidence=0.7, + explanation="Partially true explanation", + sources=["https://example.com/partial"], + ), + Verdict( + claim=claims[0].text, + verdict="unverifiable", + confidence=0.5, + explanation="Unverifiable explanation", + sources=["https://example.com/unverifiable"], + ), + ] + + # Test each verdict type + for test_verdict in verdicts: + # Configure mock + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockDataFactory.create_runner_result_mock(evidence_map[claims[0].text]) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockDataFactory.create_runner_result_mock(test_verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with a claim") + + # Verify results + assert len(results) == 1 + assert results[0].claim == claims[0].text + assert results[0].verdict == test_verdict.verdict + assert results[0].confidence == test_verdict.confidence + assert results[0].explanation == test_verdict.explanation + assert results[0].sources == test_verdict.sources + + +@pytest.mark.asyncio 
+@patch("src.verifact_manager.Runner.run") +async def test_max_claims_limit(mock_run, manager): + """Test that the max_claims limit is respected.""" + # Create many claims + claims = [ + Claim(text=f"Claim {i}", context=0.8) + for i in range(10) # More than the max_claims limit of 5 + ] + + # Configure mock + processed_claims = [] + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Record which claims are processed + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + if claim_text: + processed_claims.append(claim_text) + + # Return some evidence + return MockDataFactory.create_runner_result_mock([ + Evidence( + content=f"Evidence for {claim_text}", + source="https://example.com/evidence", + relevance=0.8, + stance="supporting", + ) + ]) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + + # Return a verdict + return MockDataFactory.create_runner_result_mock( + Verdict( + claim=claim_text or "Unknown claim", + verdict="true", + confidence=0.8, + explanation=f"Explanation for {claim_text}", + sources=["https://example.com/evidence"], + ) + ) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with many claims") + + # Verify results - should be limited by max_claims + assert len(results) <= 5 # max_claims from the manager config + assert len(processed_claims) <= 5 + + # Verify that the processed claims are the ones with the highest context scores + # (They should be processed in order of context score) + for claim_text in processed_claims: + assert claim_text in [c.text for c in claims] diff --git 
a/src/tests/test_error_recovery.py b/src/tests/test_error_recovery.py new file mode 100644 index 0000000..6328472 --- /dev/null +++ b/src/tests/test_error_recovery.py @@ -0,0 +1,353 @@ +"""Tests for error recovery in the VeriFact pipeline.""" + +import asyncio +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + +from src.tests.utils.mock_data_factory import MockDataFactory + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=5.0, # Short timeout for testing + enable_fallbacks=True, + retry_attempts=2, + raise_exceptions=False, # Don't raise exceptions for error recovery testing + include_debug_info=True, + ) + return VerifactManager(config) + + +@pytest.fixture +def strict_manager(): + """Create a VerifactManager instance that raises exceptions.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=5.0, + enable_fallbacks=False, + retry_attempts=1, + raise_exceptions=True, # Raise exceptions for error testing + include_debug_info=True, + ) + return VerifactManager(config) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_timeout_recovery(mock_run, manager): + """Test recovery from timeouts in the pipeline.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=2) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock to simulate a timeout for the second evidence gathering + call_count = 0 + def mock_runner_side_effect(*args, **kwargs): + nonlocal call_count + agent = 
args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + call_count += 1 + if call_count == 2: + # Simulate timeout for the second claim + raise asyncio.TimeoutError("Evidence gathering timed out") + return MockDataFactory.create_runner_result_mock(evidence_map[claims[0].text]) + elif agent.__dict__.get('name') == 'VerdictWriter': + return MockDataFactory.create_runner_result_mock(verdicts[0]) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with claims") + + # Verify results - should have one verdict despite the timeout + assert len(results) == 1 + assert results[0].claim == claims[0].text + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_retry_mechanism(mock_run, manager): + """Test the retry mechanism for failed API calls.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=1) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock to fail on first attempt but succeed on retry + attempt_counts = {"evidence": 0, "verdict": 0} + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + attempt_counts["evidence"] += 1 + if attempt_counts["evidence"] == 1: + # Fail on first attempt + raise Exception("Evidence gathering failed") + # Succeed on retry + return MockDataFactory.create_runner_result_mock(evidence_map[claims[0].text]) + elif agent.__dict__.get('name') == 'VerdictWriter': + attempt_counts["verdict"] += 1 + if attempt_counts["verdict"] == 1: + # Fail on first attempt + raise Exception("Verdict generation 
failed") + # Succeed on retry + return MockDataFactory.create_runner_result_mock(verdicts[0]) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with claims") + + # Verify results - should have one verdict after retries + assert len(results) == 1 + assert results[0].claim == claims[0].text + + # Verify retry counts + assert attempt_counts["evidence"] == 2 # Initial attempt + 1 retry + assert attempt_counts["verdict"] == 2 # Initial attempt + 1 retry + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_partial_evidence_failure(mock_run, manager): + """Test handling of partial evidence gathering failures.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=3) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock to fail evidence gathering for the second claim + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Extract claim text from the query + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + + if claim_text == claims[1].text: + # Fail for the second claim + raise Exception("Evidence gathering failed for second claim") + + # Return evidence for other claims + return MockDataFactory.create_runner_result_mock(evidence_map.get(claim_text, [])) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + + # Return verdict for the matching claim + for verdict in verdicts: + if verdict.claim == claim_text: + return MockDataFactory.create_runner_result_mock(verdict) + 
+ return MockDataFactory.create_runner_result_mock(verdicts[0]) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with claims") + + # Verify results - should have verdicts for claims 1 and 3, but not for claim 2 + assert len(results) == 2 + result_claims = [result.claim for result in results] + assert claims[0].text in result_claims + assert claims[2].text in result_claims + assert claims[1].text not in result_claims + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_malformed_evidence(mock_run, manager): + """Test handling of malformed evidence.""" + # Create test data with malformed evidence + scenario = MockDataFactory.create_scenario("error_prone", claim_count=2) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock to return the test data + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Extract claim text from the query + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + + # Return evidence for the claim + return MockDataFactory.create_runner_result_mock(evidence_map.get(claim_text, [])) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + + # Return verdict for the matching claim + for verdict in verdicts: + if verdict.claim == claim_text: + return MockDataFactory.create_runner_result_mock(verdict) + + # If no matching verdict, return a default one + return MockDataFactory.create_runner_result_mock( + Verdict( + claim=claim_text or "Unknown claim", + verdict="unverifiable", + 
confidence=0.5, + explanation="Could not verify due to malformed evidence.", + sources=["https://example.com/source"], + ) + ) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with claims") + + # Verify results - should have at least one verdict + assert len(results) > 0 + + # Check that malformed evidence was handled gracefully + for result in results: + assert isinstance(result, Verdict) + assert result.claim in [claim.text for claim in claims] + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_empty_evidence(mock_run, manager): + """Test handling of empty evidence sets.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=2) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock to return empty evidence for the first claim + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Extract claim text from the query + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + + if claim_text == claims[0].text: + # Return empty evidence for the first claim + return MockDataFactory.create_runner_result_mock([]) + + # Return evidence for other claims + return MockDataFactory.create_runner_result_mock(evidence_map.get(claim_text, [])) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + + # Return verdict for the matching claim + for verdict in verdicts: + if verdict.claim == claim_text: + return MockDataFactory.create_runner_result_mock(verdict) + + return 
MockDataFactory.create_runner_result_mock(verdicts[0]) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with claims") + + # Verify results - should have one verdict (for the second claim) + assert len(results) == 1 + assert results[0].claim == claims[1].text + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_invalid_verdict(mock_run, manager): + """Test handling of invalid verdicts.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=1) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + + # Configure mock to return invalid verdict + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + return MockDataFactory.create_runner_result_mock(evidence_map[claims[0].text]) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Return an invalid verdict (missing required fields) + return MockDataFactory.create_runner_result_mock({ + "claim": claims[0].text, + # Missing verdict, confidence, explanation, sources + }) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline + results = await manager.run("Test text with claims") + + # Verify results - should be empty due to invalid verdict + assert len(results) == 0 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_exception_propagation(mock_run, strict_manager): + """Test that exceptions are properly propagated when raise_exceptions is True.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=1) + claims = scenario["claims"] + + # Configure mock to raise an exception + def 
mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + raise Exception("Test exception") + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Run the pipeline and expect an exception + with pytest.raises(Exception) as excinfo: + await strict_manager.run("Test text with claims") + + # Verify the exception + assert "Test exception" in str(excinfo.value) diff --git a/src/tests/test_fixtures.py b/src/tests/test_fixtures.py new file mode 100644 index 0000000..dc92854 --- /dev/null +++ b/src/tests/test_fixtures.py @@ -0,0 +1,127 @@ +"""Tests for the test fixtures.""" + +import pytest + +from src.tests.fixtures.claims import ( + ALL_CLAIMS, + POLITICAL_CLAIMS, + HEALTH_CLAIMS, + SCIENCE_CLAIMS, + ECONOMIC_CLAIMS, + SAMPLE_TEXTS, +) +from src.tests.fixtures.evidence import ( + ALL_EVIDENCE, + POLITICAL_EVIDENCE, + HEALTH_EVIDENCE, + SCIENCE_EVIDENCE, + ECONOMIC_EVIDENCE, +) +from src.tests.fixtures.verdicts import ( + ALL_VERDICTS, + POLITICAL_VERDICTS, + HEALTH_VERDICTS, + SCIENCE_VERDICTS, + ECONOMIC_VERDICTS, +) + +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + + +def test_claims_fixtures(): + """Test that the claims fixtures are valid.""" + # Check that all claims are instances of the Claim class + for claim in ALL_CLAIMS: + assert isinstance(claim, Claim) + + # Check that the combined list contains all individual lists + assert len(ALL_CLAIMS) == ( + len(POLITICAL_CLAIMS) + + len(HEALTH_CLAIMS) + + len(SCIENCE_CLAIMS) + + len(ECONOMIC_CLAIMS) + ) + + # Check that sample texts are non-empty + for text in SAMPLE_TEXTS: + assert isinstance(text, str) + assert len(text) > 0 + + +def test_evidence_fixtures(): + """Test that 
the evidence fixtures are valid.""" + # Check that all evidence items are instances of the Evidence class + for category, evidence_list in ALL_EVIDENCE.items(): + for evidence_item in evidence_list: + assert isinstance(evidence_item, Evidence) + + # Check that the combined dictionary contains all individual dictionaries + assert len(ALL_EVIDENCE) == ( + len(POLITICAL_EVIDENCE) + + len(HEALTH_EVIDENCE) + + len(SCIENCE_EVIDENCE) + + len(ECONOMIC_EVIDENCE) + ) + + # Check evidence attributes + for category, evidence_list in ALL_EVIDENCE.items(): + for evidence in evidence_list: + assert isinstance(evidence.content, str) + assert isinstance(evidence.source, str) + assert 0.0 <= evidence.relevance <= 1.0 + assert evidence.stance in ["supporting", "contradicting", "neutral"] + + +def test_verdicts_fixtures(): + """Test that the verdicts fixtures are valid.""" + # Check that all verdicts are instances of the Verdict class + for verdict in ALL_VERDICTS: + assert isinstance(verdict, Verdict) + + # Check that the combined list contains all individual lists + assert len(ALL_VERDICTS) == ( + len(POLITICAL_VERDICTS) + + len(HEALTH_VERDICTS) + + len(SCIENCE_VERDICTS) + + len(ECONOMIC_VERDICTS) + ) + + # Check verdict attributes + for verdict in ALL_VERDICTS: + assert isinstance(verdict.claim, str) + assert verdict.verdict in ["true", "false", "partially true", "unverifiable"] + assert 0.0 <= verdict.confidence <= 1.0 + assert isinstance(verdict.explanation, str) + assert len(verdict.sources) > 0 + for source in verdict.sources: + assert isinstance(source, str) + + +def test_fixture_relationships(): + """Test the relationships between fixtures.""" + # Check that there's evidence for at least some claims + for claim in POLITICAL_CLAIMS: + # Find evidence that might match this claim + found_evidence = False + for category, evidence_list in POLITICAL_EVIDENCE.items(): + if claim.text.lower() in category.lower(): + found_evidence = True + break + + # Not all claims need evidence, but 
at least some should have it + if claim.text == "The United States has the largest military budget in the world.": + assert found_evidence, f"No evidence found for key claim: {claim.text}" + + # Check that there's a verdict for at least some claims + for claim in POLITICAL_CLAIMS: + found_verdict = False + for verdict in POLITICAL_VERDICTS: + if claim.text == verdict.claim: + found_verdict = True + break + + # Not all claims need verdicts, but at least some should have them + if claim.text == "The United States has the largest military budget in the world.": + assert found_verdict, f"No verdict found for key claim: {claim.text}" diff --git a/src/tests/test_performance.py b/src/tests/test_performance.py new file mode 100644 index 0000000..43d0ce5 --- /dev/null +++ b/src/tests/test_performance.py @@ -0,0 +1,436 @@ +"""Tests for measuring and optimizing performance of the VeriFact pipeline.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch +import asyncio +import time + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + +from src.tests.utils.mock_data_factory import MockDataFactory +from src.tests.utils.performance_utils import ( + PerformanceTracker, + benchmark_pipeline, + analyze_benchmark_results, +) + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=True, + ) + return VerifactManager(config) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_performance_tracking(mock_run, manager): + """Test that performance tracking works correctly.""" + # Create test data + scenario = 
MockDataFactory.create_scenario("standard", claim_count=2) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock with delays to simulate processing time + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + time.sleep(0.1) # 100ms delay + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + time.sleep(0.2) # 200ms delay + + # Extract claim text from the query + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + evidence = evidence_map.get(claim_text, []) + + return MockDataFactory.create_runner_result_mock(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + time.sleep(0.15) # 150ms delay + + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + verdict = next((v for v in verdicts if v.claim == claim_text), verdicts[0]) + + return MockDataFactory.create_runner_result_mock(verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Create a performance tracker + tracker = PerformanceTracker() + tracker.start() + + # Monkey patch the manager's methods to track performance + original_detect_claims = manager._detect_claims + original_gather_evidence_for_claim = manager._gather_evidence_for_claim + original_generate_verdict_for_claim = manager._generate_verdict_for_claim + + async def timed_detect_claims(text): + return await tracker.timed_operation( + "claim_detection", + original_detect_claims, + text, + ) + + async def timed_gather_evidence_for_claim(claim): + return await tracker.timed_operation( + "evidence_gathering", + original_gather_evidence_for_claim, + claim, + ) + + async def timed_generate_verdict_for_claim(claim, evidence): + return await tracker.timed_operation( + 
"verdict_generation", + original_generate_verdict_for_claim, + claim, + evidence, + ) + + # Apply the monkey patches + manager._detect_claims = timed_detect_claims + manager._gather_evidence_for_claim = timed_gather_evidence_for_claim + manager._generate_verdict_for_claim = timed_generate_verdict_for_claim + + try: + # Run the pipeline + results = await manager.run("Test text with claims") + finally: + # Restore the original methods + manager._detect_claims = original_detect_claims + manager._gather_evidence_for_claim = original_gather_evidence_for_claim + manager._generate_verdict_for_claim = original_generate_verdict_for_claim + + tracker.stop() + report = tracker.generate_report() + + # Verify results + assert len(results) == 2 + + # Verify performance tracking + assert report.total_duration_ms > 0 + assert report.claim_detection_ms >= 100 # At least 100ms + assert report.evidence_gathering_ms >= 400 # At least 2 * 200ms + assert report.verdict_generation_ms >= 300 # At least 2 * 150ms + + # Verify counts + assert report.claim_count == 1 # One call to detect_claims + assert report.evidence_count == 2 # Two calls to gather_evidence_for_claim + assert report.verdict_count == 2 # Two calls to generate_verdict_for_claim + + # Verify parallelism efficiency + # The efficiency should be close to 1.0 because of parallelism in evidence gathering + assert report.parallelism_efficiency > 0.9 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_parallelism_efficiency(mock_run): + """Test the parallelism efficiency of the pipeline with different configurations.""" + # Create test data + scenario = MockDataFactory.create_scenario("standard", claim_count=5) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock with delays to simulate processing time + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + 
time.sleep(0.1) # 100ms delay + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + time.sleep(0.2) # 200ms delay + + # Extract claim text from the query + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + evidence = evidence_map.get(claim_text, []) + + return MockDataFactory.create_runner_result_mock(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + time.sleep(0.15) # 150ms delay + + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + verdict = next((v for v in verdicts if v.claim == claim_text), verdicts[0]) + + return MockDataFactory.create_runner_result_mock(verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Test with different configurations + configs = [ + # Sequential processing + ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=True, + ), + ] + + reports = [] + for config in configs: + manager = VerifactManager(config) + + # Benchmark the pipeline + benchmark_results = await benchmark_pipeline( + manager, + ["Test text with claims"], + iterations=1, + ) + + reports.append(benchmark_results[0]) + + # Verify parallelism efficiency + # The efficiency should be close to 1.0 because of parallelism in evidence gathering + for report in reports: + assert report.parallelism_efficiency > 0.9 + + # Analyze the results + stats = analyze_benchmark_results(reports) + assert "total_duration" in stats + assert "claim_detection" in stats + assert "evidence_gathering" in stats + assert "verdict_generation" in stats + assert "parallelism_efficiency" in stats + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def 
test_high_volume_performance(mock_run): + """Test the performance of the pipeline with a high volume of claims.""" + # Create test data with many claims + scenario = MockDataFactory.create_scenario("high_volume", claim_count=10, evidence_per_claim=5) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock with minimal delays + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + time.sleep(0.05) # 50ms delay + return MockDataFactory.create_runner_result_mock(claims[:10]) # Limit to 10 claims + elif agent.__dict__.get('name') == 'EvidenceHunter': + time.sleep(0.1) # 100ms delay + + # Extract claim text from the query + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + evidence = evidence_map.get(claim_text, []) + + return MockDataFactory.create_runner_result_mock(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + time.sleep(0.1) # 100ms delay + + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + verdict = next((v for v in verdicts if v.claim == claim_text), verdicts[0]) + + return MockDataFactory.create_runner_result_mock(verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Create managers with different max_claims settings + configs = [ + ManagerConfig(max_claims=5, evidence_per_claim=3), + ManagerConfig(max_claims=10, evidence_per_claim=3), + ] + + reports = [] + for config in configs: + manager = VerifactManager(config) + + # Benchmark the pipeline + benchmark_results = await benchmark_pipeline( + manager, + ["Test text with many claims"], + iterations=1, + ) + + reports.append(benchmark_results[0]) + + # Verify that the second configuration processes more claims + assert reports[1].claim_count >= reports[0].claim_count + assert 
reports[1].evidence_count >= reports[0].evidence_count + assert reports[1].verdict_count >= reports[0].verdict_count + + # But it should also take longer + assert reports[1].total_duration_ms >= reports[0].total_duration_ms + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_optimization_suggestions(mock_run, manager): + """Test to identify potential optimization opportunities.""" + # Create test data with varying processing times + scenario = MockDataFactory.create_scenario("standard", claim_count=3) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock with varying delays to simulate bottlenecks + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + time.sleep(0.1) # 100ms delay + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Simulate varying evidence gathering times + query = args[1] + claim_index = next((i for i, c in enumerate(claims) if c.text in query), 0) + + # Make the second claim take much longer + if claim_index == 1: + time.sleep(0.5) # 500ms delay - bottleneck + else: + time.sleep(0.2) # 200ms delay + + claim_text = claims[claim_index].text + evidence = evidence_map.get(claim_text, []) + + return MockDataFactory.create_runner_result_mock(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + time.sleep(0.15) # 150ms delay + + # Extract claim text from the prompt + prompt = args[1] + claim_text = next((c.text for c in claims if c.text in prompt), None) + verdict = next((v for v in verdicts if v.claim == claim_text), verdicts[0]) + + return MockDataFactory.create_runner_result_mock(verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Benchmark the pipeline + benchmark_results = await benchmark_pipeline( + manager, + ["Test text with 
claims"], + iterations=1, + ) + + report = benchmark_results[0] + + # Verify that evidence gathering is the bottleneck + assert report.evidence_gathering_ms > report.claim_detection_ms + assert report.evidence_gathering_ms > report.verdict_generation_ms + + # Check individual evidence gathering times + evidence_timings = [t.duration_ms for t in report.timings if t.operation == "evidence_gathering"] + assert max(evidence_timings) > 2 * min(evidence_timings) # The bottleneck is at least 2x slower + + # The bottleneck should be the second claim + bottleneck_index = evidence_timings.index(max(evidence_timings)) + assert bottleneck_index == 1 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_performance_target(mock_run): + """Test that the pipeline meets the target performance of <30s end-to-end.""" + # Create test data with a realistic number of claims and evidence + scenario = MockDataFactory.create_scenario("standard", claim_count=3, evidence_per_claim=5) + claims = scenario["claims"] + evidence_map = scenario["evidence_map"] + verdicts = scenario["verdicts"] + + # Configure mock with realistic delays based on typical API response times + def mock_runner_side_effect(*args, **kwargs): + agent = args[0] + + if agent.__dict__.get('name') == 'ClaimDetector': + # Claim detection typically takes 2-3 seconds + time.sleep(2.5) + return MockDataFactory.create_runner_result_mock(claims) + elif agent.__dict__.get('name') == 'EvidenceHunter': + # Evidence gathering typically takes 3-5 seconds per claim + time.sleep(4.0) + + # Extract claim text from the query + query = args[1] + claim_text = next((c.text for c in claims if c.text in query), None) + evidence = evidence_map.get(claim_text, []) + + return MockDataFactory.create_runner_result_mock(evidence) + elif agent.__dict__.get('name') == 'VerdictWriter': + # Verdict generation typically takes 2-4 seconds per claim + time.sleep(3.0) + + # Extract claim text from the prompt + prompt = args[1] + 
claim_text = next((c.text for c in claims if c.text in prompt), None) + verdict = next((v for v in verdicts if v.claim == claim_text), verdicts[0]) + + return MockDataFactory.create_runner_result_mock(verdict) + return MockDataFactory.create_runner_result_mock([]) + + mock_run.side_effect = mock_runner_side_effect + + # Create a manager with default settings + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=5, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=False, + ) + manager = VerifactManager(config) + + # Benchmark the pipeline + start_time = time.time() + + # Run the pipeline with a realistic input text + results = await manager.run(""" + The United States has the largest military budget in the world. + The Earth is flat and sits at the center of our solar system. + Regular exercise reduces the risk of heart disease. + """) + + end_time = time.time() + total_duration_seconds = end_time - start_time + + # Verify results + assert len(results) > 0 + + # Verify that the pipeline completes in under 30 seconds + assert total_duration_seconds < 30.0, f"Pipeline took {total_duration_seconds:.2f}s, which exceeds the 30s target" + + # Print the actual duration for reference + print(f"Pipeline completed in {total_duration_seconds:.2f}s") + + # Verify that we processed all claims + assert len(results) == len(claims) diff --git a/src/tests/test_verifact_manager.py b/src/tests/test_verifact_manager.py new file mode 100644 index 0000000..8620fe3 --- /dev/null +++ b/src/tests/test_verifact_manager.py @@ -0,0 +1,244 @@ +"""Unit tests for the VerifactManager class.""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from src.verifact_manager import VerifactManager, ManagerConfig +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + 
+from src.tests.fixtures.claims import POLITICAL_CLAIMS, SAMPLE_TEXTS +from src.tests.fixtures.evidence import POLITICAL_EVIDENCE +from src.tests.fixtures.verdicts import POLITICAL_VERDICTS + + +class MockRunnerResult: + """Mock for the result returned by Runner.run().""" + + def __init__(self, output_data): + self.output_data = output_data + self.final_output = str(output_data) + + def final_output_as(self, output_type): + """Mock the final_output_as method.""" + return self.output_data + + +@pytest.fixture +def manager(): + """Create a VerifactManager instance for testing.""" + config = ManagerConfig( + min_checkworthiness=0.5, + max_claims=5, + evidence_per_claim=3, + timeout_seconds=30.0, + enable_fallbacks=True, + retry_attempts=1, + raise_exceptions=True, + include_debug_info=False, + ) + return VerifactManager(config) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_detect_claims(mock_run, manager): + """Test the _detect_claims method.""" + # Setup mock + sample_claims = POLITICAL_CLAIMS[:2] + mock_run.return_value = MockRunnerResult(sample_claims) + + # Call the method + result = await manager._detect_claims(SAMPLE_TEXTS[0]) + + # Verify results + assert result == sample_claims + mock_run.assert_called_once() + # Verify the text was passed to the agent + assert mock_run.call_args[0][1] == SAMPLE_TEXTS[0] + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_gather_evidence_for_claim(mock_run, manager): + """Test the _gather_evidence_for_claim method.""" + # Setup mock + sample_claim = POLITICAL_CLAIMS[0] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + mock_run.return_value = MockRunnerResult(sample_evidence) + + # Call the method + result = await manager._gather_evidence_for_claim(sample_claim) + + # Verify results + assert result == sample_evidence + mock_run.assert_called_once() + # Verify the claim was included in the query + assert sample_claim.text in 
mock_run.call_args[0][1] + + +@pytest.mark.asyncio +@patch("src.verifact_manager.VerifactManager._gather_evidence_for_claim") +async def test_gather_evidence(mock_gather, manager): + """Test the _gather_evidence method.""" + # Setup mock + sample_claims = POLITICAL_CLAIMS[:2] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + mock_gather.side_effect = [sample_evidence, Exception("Test error")] + + # Call the method + result = await manager._gather_evidence(sample_claims) + + # Verify results + assert len(result) == 2 + assert result[0][0] == sample_claims[0] + assert result[0][1] == sample_evidence + assert result[1][0] == sample_claims[1] + assert result[1][1] is None # Should be None due to the exception + assert mock_gather.call_count == 2 + + +@pytest.mark.asyncio +@patch("src.verifact_manager.Runner.run") +async def test_generate_verdict_for_claim(mock_run, manager): + """Test the _generate_verdict_for_claim method.""" + # Setup mock + sample_claim = POLITICAL_CLAIMS[0] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + sample_verdict = POLITICAL_VERDICTS[0] + mock_run.return_value = MockRunnerResult(sample_verdict) + + # Call the method + result = await manager._generate_verdict_for_claim(sample_claim, sample_evidence) + + # Verify results + assert result == sample_verdict + mock_run.assert_called_once() + # Verify the claim and evidence were included in the prompt + assert sample_claim.text in mock_run.call_args[0][1] + assert "Evidence" in mock_run.call_args[0][1] + + +@pytest.mark.asyncio +@patch("src.verifact_manager.VerifactManager._generate_verdict_for_claim") +async def test_generate_all_verdicts(mock_generate, manager): + """Test the _generate_all_verdicts method.""" + # Setup mock + sample_claims = POLITICAL_CLAIMS[:2] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + sample_verdicts = POLITICAL_VERDICTS[:2] + + # Create claim-evidence pairs + claim_evidence_pairs = [ + (sample_claims[0], sample_evidence), + 
(sample_claims[1], None), # This should be skipped + ] + + mock_generate.return_value = sample_verdicts[0] + + # Call the method + result = await manager._generate_all_verdicts(claim_evidence_pairs) + + # Verify results + assert len(result) == 1 # Only one verdict should be generated (second claim has no evidence) + assert result[0] == sample_verdicts[0] + mock_generate.assert_called_once_with(sample_claims[0], sample_evidence) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.VerifactManager._detect_claims") +@patch("src.verifact_manager.VerifactManager._gather_evidence") +@patch("src.verifact_manager.VerifactManager._generate_all_verdicts") +async def test_run_success(mock_generate_verdicts, mock_gather_evidence, mock_detect_claims, manager): + """Test the run method with successful execution.""" + # Setup mocks + sample_claims = POLITICAL_CLAIMS[:2] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + sample_verdicts = POLITICAL_VERDICTS[:2] + + mock_detect_claims.return_value = sample_claims + mock_gather_evidence.return_value = [(sample_claims[0], sample_evidence)] + mock_generate_verdicts.return_value = sample_verdicts + + # Call the method + result = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results + assert result == sample_verdicts + mock_detect_claims.assert_called_once_with(SAMPLE_TEXTS[0]) + mock_gather_evidence.assert_called_once_with(sample_claims) + mock_generate_verdicts.assert_called_once() + + +@pytest.mark.asyncio +@patch("src.verifact_manager.VerifactManager._detect_claims") +async def test_run_no_claims(mock_detect_claims, manager): + """Test the run method when no claims are detected.""" + # Setup mock + mock_detect_claims.return_value = [] + + # Call the method + result = await manager.run(SAMPLE_TEXTS[0]) + + # Verify results + assert result == [] + mock_detect_claims.assert_called_once_with(SAMPLE_TEXTS[0]) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.VerifactManager._detect_claims") +async def 
test_run_claim_detection_error(mock_detect_claims, manager): + """Test the run method when claim detection raises an exception.""" + # Setup mock + mock_detect_claims.side_effect = Exception("Test error") + + # Call the method and expect an exception + with pytest.raises(Exception): + await manager.run(SAMPLE_TEXTS[0]) + + # Verify the mock was called + mock_detect_claims.assert_called_once_with(SAMPLE_TEXTS[0]) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.VerifactManager._detect_claims") +@patch("src.verifact_manager.VerifactManager._gather_evidence") +async def test_run_evidence_gathering_error(mock_gather_evidence, mock_detect_claims, manager): + """Test the run method when evidence gathering raises an exception.""" + # Setup mocks + sample_claims = POLITICAL_CLAIMS[:2] + mock_detect_claims.return_value = sample_claims + mock_gather_evidence.side_effect = Exception("Test error") + + # Call the method and expect an exception + with pytest.raises(Exception): + await manager.run(SAMPLE_TEXTS[0]) + + # Verify the mocks were called + mock_detect_claims.assert_called_once_with(SAMPLE_TEXTS[0]) + mock_gather_evidence.assert_called_once_with(sample_claims) + + +@pytest.mark.asyncio +@patch("src.verifact_manager.VerifactManager._detect_claims") +@patch("src.verifact_manager.VerifactManager._gather_evidence") +@patch("src.verifact_manager.VerifactManager._generate_all_verdicts") +async def test_run_verdict_generation_error(mock_generate_verdicts, mock_gather_evidence, mock_detect_claims, manager): + """Test the run method when verdict generation raises an exception.""" + # Setup mocks + sample_claims = POLITICAL_CLAIMS[:2] + sample_evidence = POLITICAL_EVIDENCE["US military budget"] + + mock_detect_claims.return_value = sample_claims + mock_gather_evidence.return_value = [(sample_claims[0], sample_evidence)] + mock_generate_verdicts.side_effect = Exception("Test error") + + # Call the method and expect an exception + with pytest.raises(Exception): + await 
manager.run(SAMPLE_TEXTS[0]) + + # Verify the mocks were called + mock_detect_claims.assert_called_once_with(SAMPLE_TEXTS[0]) + mock_gather_evidence.assert_called_once_with(sample_claims) + mock_generate_verdicts.assert_called_once() diff --git a/src/tests/utils/__init__.py b/src/tests/utils/__init__.py new file mode 100644 index 0000000..1e799e5 --- /dev/null +++ b/src/tests/utils/__init__.py @@ -0,0 +1 @@ +"""Utilities for testing the VeriFact pipeline.""" diff --git a/src/tests/utils/mock_data_factory.py b/src/tests/utils/mock_data_factory.py new file mode 100644 index 0000000..9c4baa9 --- /dev/null +++ b/src/tests/utils/mock_data_factory.py @@ -0,0 +1,390 @@ +"""Factory for creating mock data for testing the VeriFact pipeline.""" + +import random +from typing import List, Dict, Any, Optional, Tuple, Union +from datetime import datetime + +from src.verifact_agents.claim_detector import Claim +from src.verifact_agents.evidence_hunter import Evidence +from src.verifact_agents.verdict_writer import Verdict + + +class MockDataFactory: + """Factory for creating mock data for testing.""" + + # Sample domains for claims + DOMAINS = ["politics", "health", "science", "economics", "technology", "history", "sports"] + + # Sample languages for multilingual testing + LANGUAGES = { + "en": "English", + "es": "Spanish", + "fr": "French", + "de": "German", + "zh": "Chinese", + "ja": "Japanese", + "ru": "Russian", + "ar": "Arabic", + } + + # Sample sources for evidence + SOURCES = [ + "https://example.com/source1", + "https://example.com/source2", + "https://example.com/source3", + "https://example.com/source4", + "https://example.com/source5", + "https://en.wikipedia.org/wiki/Example", + "https://news.example.com/article1", + "https://academic.example.edu/paper1", + "https://government.example.gov/report1", + ] + + # Sample stances for evidence + STANCES = ["supporting", "contradicting", "neutral"] + + # Sample verdict types + VERDICT_TYPES = ["true", "false", "partially true", 
"unverifiable"] + + @classmethod + def create_claim( + cls, + text: Optional[str] = None, + domain: Optional[str] = None, + context: Optional[float] = None, + language: Optional[str] = None, + controversial: bool = False, + ) -> Claim: + """Create a mock claim. + + Args: + text: The claim text. If None, a random claim will be generated. + domain: The domain of the claim. If None, a random domain will be chosen. + context: The context score. If None, a random score will be generated. + language: The language of the claim. If None, English will be used. + controversial: Whether the claim should be controversial. + + Returns: + A mock Claim object. + """ + if text is None: + domain = domain or random.choice(cls.DOMAINS) + language_code = language or "en" + language_name = cls.LANGUAGES.get(language_code, "English") + + if domain == "politics": + if controversial: + text = f"The {language_name} government is corrupt and serves only the elite." + else: + text = f"The {language_name} parliament has 500 members." + elif domain == "health": + if controversial: + text = f"Alternative medicine is more effective than conventional medicine in {language_name}-speaking countries." + else: + text = f"Regular exercise reduces the risk of heart disease in {language_name}-speaking populations." + elif domain == "science": + if controversial: + text = f"Climate change is not caused by human activities according to {language_name} scientists." + else: + text = f"Water freezes at 0 degrees Celsius at standard pressure according to {language_name} textbooks." + elif domain == "economics": + if controversial: + text = f"Cryptocurrency will replace traditional banking in {language_name}-speaking countries." + else: + text = f"The GDP of {language_name}-speaking countries grew by 2.5% last year." + else: + if controversial: + text = f"Social media is destroying society in {language_name}-speaking regions." 
+ else: + text = f"The internet was invented in the 1960s according to {language_name} historical records." + + if context is None: + context = round(random.uniform(0.5, 1.0), 2) + + return Claim(text=text, context=context) + + @classmethod + def create_evidence( + cls, + claim: Optional[Claim] = None, + stance: Optional[str] = None, + relevance: Optional[float] = None, + source: Optional[str] = None, + content: Optional[str] = None, + malformed: bool = False, + ) -> Evidence: + """Create mock evidence for a claim. + + Args: + claim: The claim to create evidence for. If None, a random claim will be created. + stance: The stance of the evidence. If None, a random stance will be chosen. + relevance: The relevance score. If None, a random score will be generated. + source: The source URL. If None, a random source will be chosen. + content: The evidence content. If None, content will be generated based on the claim. + malformed: Whether to create malformed evidence (for testing error handling). + + Returns: + A mock Evidence object. 
+ """ + claim = claim or cls.create_claim() + stance = stance or random.choice(cls.STANCES) + relevance = relevance or round(random.uniform(0.5, 1.0), 2) + source = source or random.choice(cls.SOURCES) + + if content is None: + if stance == "supporting": + content = f"Research confirms that {claim.text}" + elif stance == "contradicting": + content = f"Studies have disproven the claim that {claim.text}" + else: + content = f"There is mixed evidence regarding whether {claim.text}" + + if malformed: + # Create malformed evidence for testing error handling + if random.choice([True, False]): + source = "invalid-url" + else: + content = "" + + return Evidence(content=content, source=source, relevance=relevance, stance=stance) + + @classmethod + def create_evidence_set( + cls, + claim: Optional[Claim] = None, + count: int = 3, + mixed_stances: bool = True, + include_malformed: bool = False, + ) -> List[Evidence]: + """Create a set of evidence for a claim. + + Args: + claim: The claim to create evidence for. If None, a random claim will be created. + count: The number of evidence items to create. + mixed_stances: Whether to include evidence with different stances. + include_malformed: Whether to include malformed evidence. + + Returns: + A list of Evidence objects. 
+ """ + claim = claim or cls.create_claim() + evidence_set = [] + + for i in range(count): + if mixed_stances: + stance = cls.STANCES[i % len(cls.STANCES)] + else: + stance = random.choice(cls.STANCES) + + malformed = include_malformed and i == count - 1 + evidence = cls.create_evidence(claim=claim, stance=stance, malformed=malformed) + evidence_set.append(evidence) + + return evidence_set + + @classmethod + def create_verdict( + cls, + claim: Optional[Claim] = None, + evidence: Optional[List[Evidence]] = None, + verdict_type: Optional[str] = None, + confidence: Optional[float] = None, + explanation: Optional[str] = None, + sources: Optional[List[str]] = None, + ) -> Verdict: + """Create a mock verdict for a claim. + + Args: + claim: The claim to create a verdict for. If None, a random claim will be created. + evidence: The evidence for the claim. If None, random evidence will be created. + verdict_type: The type of verdict. If None, a random type will be chosen. + confidence: The confidence score. If None, a score will be generated based on the verdict type. + explanation: The explanation. If None, an explanation will be generated. + sources: The sources. If None, sources will be extracted from the evidence. + + Returns: + A mock Verdict object. 
+ """ + claim = claim or cls.create_claim() + evidence = evidence or cls.create_evidence_set(claim=claim) + verdict_type = verdict_type or random.choice(cls.VERDICT_TYPES) + + if confidence is None: + if verdict_type == "true": + confidence = round(random.uniform(0.8, 1.0), 2) + elif verdict_type == "false": + confidence = round(random.uniform(0.8, 1.0), 2) + elif verdict_type == "partially true": + confidence = round(random.uniform(0.6, 0.9), 2) + else: # unverifiable + confidence = round(random.uniform(0.3, 0.7), 2) + + if sources is None: + sources = [e.source for e in evidence] + + if explanation is None: + if verdict_type == "true": + explanation = f"The claim that {claim.text} is true based on multiple reliable sources." + elif verdict_type == "false": + explanation = f"The claim that {claim.text} is false according to available evidence." + elif verdict_type == "partially true": + explanation = f"The claim that {claim.text} is partially true. While some aspects are accurate, others are not fully supported by evidence." + else: # unverifiable + explanation = f"The claim that {claim.text} cannot be verified with available evidence." + + return Verdict( + claim=claim.text, + verdict=verdict_type, + confidence=confidence, + explanation=explanation, + sources=sources, + ) + + @classmethod + def create_scenario( + cls, + scenario_type: str, + claim_count: int = 3, + evidence_per_claim: int = 3, + ) -> Dict[str, Any]: + """Create a complete test scenario. + + Args: + scenario_type: The type of scenario to create. Options: + - "standard": A mix of different claim types + - "controversial": Controversial claims + - "multilingual": Claims in different languages + - "error_prone": Includes malformed data + - "high_volume": Many claims and evidence + - "time_sensitive": Claims about recent events + claim_count: The number of claims to create. + evidence_per_claim: The number of evidence items per claim. 
+ + Returns: + A dictionary containing claims, evidence, and verdicts. + """ + claims = [] + evidence_map = {} + verdicts = [] + + if scenario_type == "standard": + # Create a mix of different claim types + for i in range(claim_count): + verdict_type = cls.VERDICT_TYPES[i % len(cls.VERDICT_TYPES)] + claim = cls.create_claim(domain=random.choice(cls.DOMAINS)) + evidence = cls.create_evidence_set(claim=claim, count=evidence_per_claim) + verdict = cls.create_verdict(claim=claim, evidence=evidence, verdict_type=verdict_type) + + claims.append(claim) + evidence_map[claim.text] = evidence + verdicts.append(verdict) + + elif scenario_type == "controversial": + # Create controversial claims + for i in range(claim_count): + claim = cls.create_claim(controversial=True) + evidence = cls.create_evidence_set(claim=claim, count=evidence_per_claim, mixed_stances=True) + + # Controversial claims are often partially true or unverifiable + verdict_type = random.choice(["partially true", "unverifiable"]) + verdict = cls.create_verdict(claim=claim, evidence=evidence, verdict_type=verdict_type) + + claims.append(claim) + evidence_map[claim.text] = evidence + verdicts.append(verdict) + + elif scenario_type == "multilingual": + # Create claims in different languages + languages = list(cls.LANGUAGES.keys()) + for i in range(claim_count): + language = languages[i % len(languages)] + claim = cls.create_claim(language=language) + evidence = cls.create_evidence_set(claim=claim, count=evidence_per_claim) + verdict = cls.create_verdict(claim=claim, evidence=evidence) + + claims.append(claim) + evidence_map[claim.text] = evidence + verdicts.append(verdict) + + elif scenario_type == "error_prone": + # Create scenarios with potential errors + for i in range(claim_count): + claim = cls.create_claim() + evidence = cls.create_evidence_set( + claim=claim, + count=evidence_per_claim, + include_malformed=(i % 2 == 0) # Every other claim has malformed evidence + ) + + # Some claims have no verdict (to 
simulate errors) + if i % 3 != 0: # 2/3 of claims have verdicts + verdict = cls.create_verdict(claim=claim, evidence=evidence) + verdicts.append(verdict) + + claims.append(claim) + evidence_map[claim.text] = evidence + + elif scenario_type == "high_volume": + # Create many claims and evidence + high_claim_count = claim_count * 3 + high_evidence_count = evidence_per_claim * 2 + + for i in range(high_claim_count): + claim = cls.create_claim() + evidence = cls.create_evidence_set(claim=claim, count=high_evidence_count) + verdict = cls.create_verdict(claim=claim, evidence=evidence) + + claims.append(claim) + evidence_map[claim.text] = evidence + verdicts.append(verdict) + + elif scenario_type == "time_sensitive": + # Create claims about recent events + current_year = datetime.now().year + + time_claims = [ + f"The Olympics were held in Paris in {current_year}.", + f"The global temperature reached a record high in {current_year}.", + f"The presidential election took place in {current_year}.", + f"The stock market crashed in {current_year}.", + f"A major peace treaty was signed in {current_year}.", + ] + + for i in range(min(claim_count, len(time_claims))): + claim = cls.create_claim(text=time_claims[i]) + evidence = cls.create_evidence_set(claim=claim, count=evidence_per_claim) + + # Time-sensitive claims are often unverifiable or partially true + verdict_type = random.choice(["unverifiable", "partially true"]) + verdict = cls.create_verdict(claim=claim, evidence=evidence, verdict_type=verdict_type) + + claims.append(claim) + evidence_map[claim.text] = evidence + verdicts.append(verdict) + + return { + "claims": claims, + "evidence_map": evidence_map, + "verdicts": verdicts, + "scenario_type": scenario_type, + } + + @classmethod + def create_runner_result_mock(cls, output_data: Any) -> Any: + """Create a mock for the result returned by Runner.run(). + + Args: + output_data: The data to return from final_output_as. 
+ + Returns: + A mock object with a final_output_as method. + """ + class MockRunnerResult: + def __init__(self, data): + self.data = data + self.final_output = str(data) + + def final_output_as(self, output_type): + return self.data + + return MockRunnerResult(output_data) diff --git a/src/tests/utils/performance_utils.py b/src/tests/utils/performance_utils.py new file mode 100644 index 0000000..e5be0bb --- /dev/null +++ b/src/tests/utils/performance_utils.py @@ -0,0 +1,292 @@ +"""Utilities for measuring and analyzing performance of the VeriFact pipeline.""" + +import time +import asyncio +from typing import Dict, List, Any, Callable, Awaitable, Optional, Tuple +import statistics +from dataclasses import dataclass, field + +from src.verifact_manager import VerifactManager + + +@dataclass +class TimingResult: + """Result of a timing measurement.""" + + operation: str + duration_ms: float + metadata: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class PerformanceReport: + """Performance report for a pipeline run.""" + + total_duration_ms: float + claim_detection_ms: float + evidence_gathering_ms: float + verdict_generation_ms: float + claim_count: int + evidence_count: int + verdict_count: int + evidence_per_claim: float + ms_per_claim: float + ms_per_evidence: float + ms_per_verdict: float + parallelism_efficiency: float # 1.0 means perfect parallelism + timings: List[TimingResult] = field(default_factory=list) + + def __str__(self) -> str: + """Return a string representation of the performance report.""" + return f""" +Performance Report: +------------------ +Total Duration: {self.total_duration_ms:.2f}ms ({self.total_duration_ms / 1000:.2f}s) + +Claim Detection: {self.claim_detection_ms:.2f}ms ({self.claim_detection_ms / self.total_duration_ms * 100:.1f}%) +Evidence Gathering: {self.evidence_gathering_ms:.2f}ms ({self.evidence_gathering_ms / self.total_duration_ms * 100:.1f}%) +Verdict Generation: {self.verdict_generation_ms:.2f}ms 
({self.verdict_generation_ms / self.total_duration_ms * 100:.1f}%) + +Counts: +- Claims: {self.claim_count} +- Evidence: {self.evidence_count} +- Verdicts: {self.verdict_count} +- Evidence per claim: {self.evidence_per_claim:.1f} + +Performance Metrics: +- Ms per claim: {self.ms_per_claim:.2f} +- Ms per evidence: {self.ms_per_evidence:.2f} +- Ms per verdict: {self.ms_per_verdict:.2f} +- Parallelism efficiency: {self.parallelism_efficiency:.2f} (1.0 is perfect) +""" + + +class PerformanceTracker: + """Tracks performance metrics for the VeriFact pipeline.""" + + def __init__(self): + self.timings: List[TimingResult] = [] + self.start_time: float = 0 + self.end_time: float = 0 + + def reset(self): + """Reset the tracker.""" + self.timings = [] + self.start_time = 0 + self.end_time = 0 + + def start(self): + """Start tracking performance.""" + self.reset() + self.start_time = time.time() + + def stop(self): + """Stop tracking performance.""" + self.end_time = time.time() + + def add_timing(self, operation: str, duration_ms: float, metadata: Optional[Dict[str, Any]] = None): + """Add a timing measurement.""" + self.timings.append(TimingResult( + operation=operation, + duration_ms=duration_ms, + metadata=metadata or {}, + )) + + async def timed_operation( + self, + operation: str, + func: Callable[..., Awaitable[Any]], + *args, + **kwargs + ) -> Any: + """Time an asynchronous operation and record the result.""" + start_time = time.time() + result = await func(*args, **kwargs) + end_time = time.time() + + duration_ms = (end_time - start_time) * 1000 + self.add_timing(operation, duration_ms, { + "args": str(args), + "kwargs": str(kwargs), + }) + + return result + + def generate_report(self) -> PerformanceReport: + """Generate a performance report.""" + if self.start_time == 0 or self.end_time == 0: + raise ValueError("Performance tracking not started or stopped") + + total_duration_ms = (self.end_time - self.start_time) * 1000 + + # Group timings by operation + 
operation_timings: Dict[str, List[float]] = {} + for timing in self.timings: + if timing.operation not in operation_timings: + operation_timings[timing.operation] = [] + operation_timings[timing.operation].append(timing.duration_ms) + + # Calculate metrics + claim_detection_ms = sum(operation_timings.get("claim_detection", [0])) + evidence_gathering_ms = sum(operation_timings.get("evidence_gathering", [0])) + verdict_generation_ms = sum(operation_timings.get("verdict_generation", [0])) + + claim_count = len(operation_timings.get("claim_detection", [])) + evidence_count = len(operation_timings.get("evidence_gathering", [])) + verdict_count = len(operation_timings.get("verdict_generation", [])) + + evidence_per_claim = evidence_count / claim_count if claim_count > 0 else 0 + + ms_per_claim = claim_detection_ms / claim_count if claim_count > 0 else 0 + ms_per_evidence = evidence_gathering_ms / evidence_count if evidence_count > 0 else 0 + ms_per_verdict = verdict_generation_ms / verdict_count if verdict_count > 0 else 0 + + # Calculate parallelism efficiency + sequential_time = claim_detection_ms + evidence_gathering_ms + verdict_generation_ms + parallelism_efficiency = sequential_time / total_duration_ms if total_duration_ms > 0 else 0 + + return PerformanceReport( + total_duration_ms=total_duration_ms, + claim_detection_ms=claim_detection_ms, + evidence_gathering_ms=evidence_gathering_ms, + verdict_generation_ms=verdict_generation_ms, + claim_count=claim_count, + evidence_count=evidence_count, + verdict_count=verdict_count, + evidence_per_claim=evidence_per_claim, + ms_per_claim=ms_per_claim, + ms_per_evidence=ms_per_evidence, + ms_per_verdict=ms_per_verdict, + parallelism_efficiency=parallelism_efficiency, + timings=self.timings, + ) + + +async def benchmark_pipeline( + manager: VerifactManager, + input_texts: List[str], + iterations: int = 1, +) -> List[PerformanceReport]: + """Benchmark the pipeline with multiple input texts and iterations. 
+ + Args: + manager: The VerifactManager instance to benchmark. + input_texts: List of input texts to process. + iterations: Number of iterations to run for each input text. + + Returns: + List of PerformanceReport objects, one for each iteration of each input text. + """ + reports = [] + + for input_text in input_texts: + for _ in range(iterations): + tracker = PerformanceTracker() + tracker.start() + + # Monkey patch the manager's methods to track performance + original_detect_claims = manager._detect_claims + original_gather_evidence_for_claim = manager._gather_evidence_for_claim + original_generate_verdict_for_claim = manager._generate_verdict_for_claim + + async def timed_detect_claims(text): + return await tracker.timed_operation( + "claim_detection", + original_detect_claims, + text, + ) + + async def timed_gather_evidence_for_claim(claim): + return await tracker.timed_operation( + "evidence_gathering", + original_gather_evidence_for_claim, + claim, + ) + + async def timed_generate_verdict_for_claim(claim, evidence): + return await tracker.timed_operation( + "verdict_generation", + original_generate_verdict_for_claim, + claim, + evidence, + ) + + # Apply the monkey patches + manager._detect_claims = timed_detect_claims + manager._gather_evidence_for_claim = timed_gather_evidence_for_claim + manager._generate_verdict_for_claim = timed_generate_verdict_for_claim + + try: + # Run the pipeline + await manager.run(input_text) + finally: + # Restore the original methods + manager._detect_claims = original_detect_claims + manager._gather_evidence_for_claim = original_gather_evidence_for_claim + manager._generate_verdict_for_claim = original_generate_verdict_for_claim + + tracker.stop() + reports.append(tracker.generate_report()) + + return reports + + +def analyze_benchmark_results(reports: List[PerformanceReport]) -> Dict[str, Any]: + """Analyze benchmark results and return statistics. + + Args: + reports: List of PerformanceReport objects. 
+ + Returns: + Dictionary with statistics. + """ + if not reports: + return {} + + # Extract metrics + total_durations = [report.total_duration_ms for report in reports] + claim_detection_durations = [report.claim_detection_ms for report in reports] + evidence_gathering_durations = [report.evidence_gathering_ms for report in reports] + verdict_generation_durations = [report.verdict_generation_ms for report in reports] + parallelism_efficiencies = [report.parallelism_efficiency for report in reports] + + # Calculate statistics + stats = { + "total_duration": { + "mean": statistics.mean(total_durations), + "median": statistics.median(total_durations), + "min": min(total_durations), + "max": max(total_durations), + "stdev": statistics.stdev(total_durations) if len(total_durations) > 1 else 0, + }, + "claim_detection": { + "mean": statistics.mean(claim_detection_durations), + "median": statistics.median(claim_detection_durations), + "min": min(claim_detection_durations), + "max": max(claim_detection_durations), + "stdev": statistics.stdev(claim_detection_durations) if len(claim_detection_durations) > 1 else 0, + }, + "evidence_gathering": { + "mean": statistics.mean(evidence_gathering_durations), + "median": statistics.median(evidence_gathering_durations), + "min": min(evidence_gathering_durations), + "max": max(evidence_gathering_durations), + "stdev": statistics.stdev(evidence_gathering_durations) if len(evidence_gathering_durations) > 1 else 0, + }, + "verdict_generation": { + "mean": statistics.mean(verdict_generation_durations), + "median": statistics.median(verdict_generation_durations), + "min": min(verdict_generation_durations), + "max": max(verdict_generation_durations), + "stdev": statistics.stdev(verdict_generation_durations) if len(verdict_generation_durations) > 1 else 0, + }, + "parallelism_efficiency": { + "mean": statistics.mean(parallelism_efficiencies), + "median": statistics.median(parallelism_efficiencies), + "min": min(parallelism_efficiencies), + 
"max": max(parallelism_efficiencies), + "stdev": statistics.stdev(parallelism_efficiencies) if len(parallelism_efficiencies) > 1 else 0, + }, + } + + return stats diff --git a/src/utils/openrouter_config.py b/src/utils/openrouter_config.py new file mode 100644 index 0000000..f014a77 --- /dev/null +++ b/src/utils/openrouter_config.py @@ -0,0 +1,28 @@ +"""OpenRouter configuration for OpenAI client. + +This module configures the OpenAI client to use OpenRouter instead of the OpenAI API. +It should be imported before any other imports that use the OpenAI client. +""" + +import os +import openai +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Get API key from environment +OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") + +# Configure OpenAI client to use OpenRouter +openai.api_key = OPENROUTER_API_KEY +openai.base_url = "https://openrouter.ai/api/v1" +openai.default_headers = { + "HTTP-Referer": "https://verifact.ai", # Replace with your site URL + "X-Title": "VeriFact", # Replace with your site name +} + +# Configure OpenAI Agents SDK to use OpenRouter +# This is a workaround until the SDK supports custom base URLs +os.environ["OPENAI_API_KEY"] = OPENROUTER_API_KEY +os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1" diff --git a/src/verifact_agents/claim_detector.py b/src/verifact_agents/claim_detector.py index 821d1ba..5483e44 100644 --- a/src/verifact_agents/claim_detector.py +++ b/src/verifact_agents/claim_detector.py @@ -1,3 +1,6 @@ +# Configure OpenAI client to use OpenRouter +from src.utils.openrouter_config import * + from pydantic import BaseModel from agents import Agent import os diff --git a/src/verifact_agents/evidence_hunter.py b/src/verifact_agents/evidence_hunter.py index 07b8672..3390f6d 100644 --- a/src/verifact_agents/evidence_hunter.py +++ b/src/verifact_agents/evidence_hunter.py @@ -1,3 +1,6 @@ +# Configure OpenAI client to use OpenRouter +from src.utils.openrouter_config import * + from pydantic 
import BaseModel from agents import Agent import os diff --git a/src/verifact_agents/verdict_writer.py b/src/verifact_agents/verdict_writer.py index 81a2fdc..e869830 100644 --- a/src/verifact_agents/verdict_writer.py +++ b/src/verifact_agents/verdict_writer.py @@ -1,3 +1,6 @@ +# Configure OpenAI client to use OpenRouter +from src.utils.openrouter_config import * + import os from agents import Agent from pydantic import BaseModel, Field diff --git a/src/verifact_manager.py b/src/verifact_manager.py index bee72f0..4ab5fc5 100644 --- a/src/verifact_manager.py +++ b/src/verifact_manager.py @@ -9,12 +9,16 @@ and provides both synchronous and asynchronous operation modes. """ +# Configure OpenAI client to use OpenRouter +from src.utils.openrouter_config import * + import asyncio +from typing import Optional, List, Dict, Any from pydantic import BaseModel, Field from agents import Runner, gen_trace_id, trace -from verifact_agents.claim_detector import claim_detector_agent, Claim -from verifact_agents.evidence_hunter import evidence_hunter_agent, Evidence -from verifact_agents.verdict_writer import verdict_writer_agent, Verdict +from src.verifact_agents.claim_detector import claim_detector_agent, Claim +from src.verifact_agents.evidence_hunter import evidence_hunter_agent, Evidence +from src.verifact_agents.verdict_writer import verdict_writer_agent, Verdict import logging logger = logging.getLogger(__name__) @@ -23,7 +27,7 @@ class ManagerConfig(BaseModel): """Configuration options for the factcheck pipeline.""" min_checkworthiness: float = Field(0.5, ge=0.0, le=1.0) - max_claims: int | None = None + max_claims: Optional[int] = None evidence_per_claim: int = Field(5, ge=1) timeout_seconds: float = 120.0 enable_fallbacks: bool = True @@ -36,7 +40,7 @@ class VerifactManager: def __init__(self, config: ManagerConfig = None): self.config = config or ManagerConfig() - async def run(self, query: str) -> None: + async def run(self, query: str) -> List[Verdict]: """Process text 
through the full factchecking pipeline. Args: @@ -76,16 +80,16 @@ async def run(self, query: str) -> None: logger.info("Factchecking pipeline completed. Generated %d verdicts.", len(verdicts)) return verdicts - async def _detect_claims(self, text: str) -> list[Claim]: + async def _detect_claims(self, text: str) -> List[Claim]: logger.info("Detecting claims...") result = await Runner.run(claim_detector_agent, text) - claims = result.final_output_as(list[Claim]) + claims = result.final_output_as(List[Claim]) logger.info(f"Detected {len(claims)} claims") logger.info(f"Claims: {claims}") - return result.final_output_as(list[Claim]) + return claims - async def _gather_evidence_for_claim(self, claim: Claim) -> list[Evidence]: + async def _gather_evidence_for_claim(self, claim: Claim) -> List[Evidence]: logger.info(f"Gathering evidence for claim {claim.text[:50]}...") query = f""" @@ -96,9 +100,9 @@ async def _gather_evidence_for_claim(self, claim: Claim) -> list[Evidence]: result = await Runner.run(evidence_hunter_agent, query) logger.info(f"Evidence gathered for claim: {result}") - return result.final_output_as(list[Evidence]) - - async def _gather_evidence(self, claims: list[Claim]) -> list[tuple[Claim, list[Evidence] | None]]: + return result.final_output_as(List[Evidence]) + + async def _gather_evidence(self, claims: List[Claim]) -> List[tuple[Claim, Optional[List[Evidence]]]]: tasks = [self._gather_evidence_for_claim(claim) for claim in claims] results = await asyncio.gather(*tasks, return_exceptions=True) claim_evidence_pairs = [] @@ -116,7 +120,7 @@ async def _gather_evidence(self, claims: list[Claim]) -> list[tuple[Claim, list[ return claim_evidence_pairs - async def _generate_verdict_for_claim(self, claim: Claim, evidence: list[Evidence]) -> Verdict: + async def _generate_verdict_for_claim(self, claim: Claim, evidence: List[Evidence]) -> Verdict: logger.info(f"Generating verdict for claim {claim.text[:50]}...") # TODO: add formatting
of evidence and citations before creating the prompt @@ -128,7 +132,7 @@ async def _generate_verdict_for_claim(self, claim: Claim, evidence: list[Evidenc result = await Runner.run(verdict_writer_agent, prompt) return result.final_output_as(Verdict) - async def _generate_all_verdicts(self, claims_with_evidence: list[tuple[Claim, list[Evidence]]]) -> list[Verdict]: + async def _generate_all_verdicts(self, claims_with_evidence: List[tuple[Claim, Optional[List[Evidence]]]]) -> List[Verdict]: logger.info("Generating verdicts...") verdicts = [] for claim, evidence in claims_with_evidence: @@ -136,7 +140,7 @@ async def _generate_all_verdicts(self, claims_with_evidence: list[tuple[Claim, l if not evidence: logger.warning(f"Skipping claim - no evidence found") continue - + logger.info(f"Evidence: {evidence} | {type(evidence)}") logger.info("Generating verdict for claim with %d evidence pieces", len(evidence)) verdict = await self._generate_verdict_for_claim(claim, evidence) @@ -145,7 +149,7 @@ async def _generate_all_verdicts(self, claims_with_evidence: list[tuple[Claim, l logger.info("Generated verdict: %s", verdict.verdict) return verdicts - + # testing if __name__ == "__main__": # load env diff --git a/src/verifact_manager_openrouter.py b/src/verifact_manager_openrouter.py new file mode 100644 index 0000000..32ef421 --- /dev/null +++ b/src/verifact_manager_openrouter.py @@ -0,0 +1,313 @@ +"""VeriFact Factcheck Manager using OpenRouter. + +This module provides a unified pipeline that orchestrates the three agents: +1. ClaimDetector: Identifies factual claims in text +2. EvidenceHunter: Gathers evidence for claims +3. VerdictWriter: Analyzes evidence and generates verdicts + +This version uses OpenRouter directly instead of the OpenAI Agents SDK. 
+""" + +import asyncio +import os +import json +import logging +from typing import Optional, List, Any +from pydantic import BaseModel, Field +import httpx +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Get API key from environment +OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" + +# Model configuration +DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "gpt-4o") +CLAIM_DETECTOR_MODEL = os.getenv("CLAIM_DETECTOR_MODEL", "gpt-4o-mini") +EVIDENCE_HUNTER_MODEL = os.getenv("EVIDENCE_HUNTER_MODEL", "gpt-4o-mini") +VERDICT_WRITER_MODEL = os.getenv("VERDICT_WRITER_MODEL", "gpt-4o-mini") +MODEL_TEMPERATURE = float(os.getenv("MODEL_TEMPERATURE", "0.1")) +MODEL_MAX_TOKENS = int(os.getenv("MODEL_MAX_TOKENS", "1000")) + +# Define data models +class Claim(BaseModel): + """A factual claim that requires verification.""" + text: str + context: float = 0.0 + +class Evidence(BaseModel): + """Evidence related to a claim.""" + content: str + source: str + relevance: float = 1.0 + stance: str = "supporting" # supporting, contradicting, neutral + +class Verdict(BaseModel): + """A verdict on a claim based on evidence.""" + claim: str + verdict: str + confidence: float + explanation: str + sources: List[str] + +class ManagerConfig(BaseModel): + """Configuration options for the factcheck pipeline.""" + min_checkworthiness: float = Field(0.5, ge=0.0, le=1.0) + max_claims: Optional[int] = None + evidence_per_claim: int = Field(5, ge=1) + timeout_seconds: float = 120.0 + enable_fallbacks: bool = True + retry_attempts: int = 2 + raise_exceptions: bool = False + include_debug_info: bool = False + +class VerifactManager: + def __init__(self, config: ManagerConfig = None): + self.config = config or ManagerConfig() + self.client = httpx.AsyncClient( + base_url=OPENROUTER_BASE_URL, + headers={ + 
"Authorization": f"Bearer {OPENROUTER_API_KEY}", + "HTTP-Referer": "https://verifact.ai", # Replace with your site URL + "X-Title": "VeriFact", # Replace with your site name + }, + timeout=self.config.timeout_seconds, + ) + + async def run(self, query: str) -> List[Verdict]: + """Process text through the full factchecking pipeline.""" + logger.info("Starting factchecking pipeline...") + + # Step 1: Detect claims + try: + claims = await self._detect_claims(query) + if not claims: + logger.info("No check-worthy claims detected in the text") + return [] + except Exception as e: + logger.error("Error in claim detection: %s", str(e), exc_info=True) + raise + + # Step 2: Gather evidence for each claim (with parallelism) + try: + claim_evidence_pairs = await self._gather_evidence(claims) + except Exception as e: + logger.error("Error in evidence gathering: %s", str(e), exc_info=True) + raise + + # Step 3: Generate verdicts for each claim + try: + verdicts = await self._generate_all_verdicts(claim_evidence_pairs) + except Exception as e: + logger.error("Error in verdict generation: %s", str(e), exc_info=True) + raise + + logger.info("Factchecking pipeline completed. 
Generated %d verdicts.", len(verdicts)) + return verdicts + + async def _call_openrouter(self, prompt: str, model: str = DEFAULT_MODEL) -> str: + """Call OpenRouter API with the given prompt.""" + response = await self.client.post( + "/chat/completions", + json={ + "model": model, + "messages": [ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": prompt} + ], + "temperature": MODEL_TEMPERATURE, + "max_tokens": MODEL_MAX_TOKENS, + }, + ) + response.raise_for_status() + result = response.json() + return result["choices"][0]["message"]["content"] + + async def _detect_claims(self, text: str) -> List[Claim]: + """Detect claims in the given text.""" + logger.info("Detecting claims...") + + prompt = """ + You are a claim detection agent designed to identify factual claims from text that require verification. + Your task is to identify explicit and implicit factual claims from the following text. + + For each claim, return: + 1. The original claim text + 2. A context score (0.0-1.0) + + Format your response as a JSON array of objects with 'text' and 'context' properties. 
+ + Text to analyze: + {text} + """.format(text=text) + + response_text = await self._call_openrouter(prompt, CLAIM_DETECTOR_MODEL) + + try: + # Extract JSON from the response + json_str = response_text + if "```json" in response_text: + json_str = response_text.split("```json")[1].split("```")[0].strip() + elif "```" in response_text: + json_str = response_text.split("```")[1].split("```")[0].strip() + + claims_data = json.loads(json_str) + claims = [Claim(**claim) for claim in claims_data] + logger.info(f"Detected {len(claims)} claims") + return claims + except Exception as e: + logger.error(f"Error parsing claims: {e}") + logger.error(f"Response text: {response_text}") + # Fallback: try to extract claims manually + return [Claim(text=text, context=1.0)] + + async def _gather_evidence_for_claim(self, claim: Claim) -> List[Evidence]: + """Gather evidence for the given claim.""" + logger.info(f"Gathering evidence for claim: {claim.text[:50]}...") + + prompt = """ + You are an evidence gathering agent tasked with finding and evaluating evidence related to factual claims. + + For the following claim, provide evidence that supports or contradicts it. + + For each piece of evidence, provide: + - content: The relevant text passage that addresses the claim + - source: The source URL + - relevance: A score from 0.0 to 1.0 indicating how relevant this evidence is to the claim + - stance: "supporting", "contradicting", or "neutral" based on how the evidence relates to the claim + + Format your response as a JSON array of objects with 'content', 'source', 'relevance', and 'stance' properties. 
+ + Claim to investigate: {claim} + Context of the claim: {context} + """.format(claim=claim.text, context=claim.context) + + response_text = await self._call_openrouter(prompt, EVIDENCE_HUNTER_MODEL) + + try: + # Extract JSON from the response + json_str = response_text + if "```json" in response_text: + json_str = response_text.split("```json")[1].split("```")[0].strip() + elif "```" in response_text: + json_str = response_text.split("```")[1].split("```")[0].strip() + + evidence_data = json.loads(json_str) + evidence = [Evidence(**ev) for ev in evidence_data] + logger.info(f"Gathered {len(evidence)} pieces of evidence") + return evidence + except Exception as e: + logger.error(f"Error parsing evidence: {e}") + logger.error(f"Response text: {response_text}") + # Fallback: return empty evidence + return [] + + async def _gather_evidence(self, claims: List[Claim]) -> List[tuple[Claim, Optional[List[Evidence]]]]: + """Gather evidence for all claims in parallel.""" + tasks = [self._gather_evidence_for_claim(claim) for claim in claims] + results = await asyncio.gather(*tasks, return_exceptions=True) + claim_evidence_pairs = [] + + for claim, result in zip(claims, results): + if isinstance(result, Exception): + logger.error(f"Error gathering evidence for claim: {claim.text[:50]}: {result}", exc_info=True) + claim_evidence_pairs.append((claim, None)) + elif result is None: + logger.warning(f"No evidence found for claim: {claim.text[:50]}") + claim_evidence_pairs.append((claim, None)) + else: + claim_evidence_pairs.append((claim, result)) + + return claim_evidence_pairs + + async def _generate_verdict_for_claim(self, claim: Claim, evidence: List[Evidence]) -> Verdict: + """Generate a verdict for the given claim based on the evidence.""" + logger.info(f"Generating verdict for claim: {claim.text[:50]}...") + + evidence_text = "\n".join([ + f"- Source: {ev.source}\n Content: {ev.content}\n Stance: {ev.stance}\n Relevance: {ev.relevance}" + for ev in evidence + ]) + + prompt 
= """ + You are a verdict writing agent tasked with analyzing evidence and generating verdicts for factual claims. + + Based on the evidence provided, determine whether the claim is true, false, partially true, or unverifiable. + + Provide: + - claim: The claim text + - verdict: "true", "false", "partially true", or "unverifiable" + - confidence: A score from 0.0 to 1.0 indicating your confidence in the verdict + - explanation: A detailed explanation of your reasoning + - sources: A list of sources used to reach the verdict + + Format your response as a JSON object with 'claim', 'verdict', 'confidence', 'explanation', and 'sources' properties. + + Claim to investigate: {claim} + + Evidence: + {evidence} + """.format(claim=claim.text, evidence=evidence_text) + + response_text = await self._call_openrouter(prompt, VERDICT_WRITER_MODEL) + + try: + # Extract JSON from the response + json_str = response_text + if "```json" in response_text: + json_str = response_text.split("```json")[1].split("```")[0].strip() + elif "```" in response_text: + json_str = response_text.split("```")[1].split("```")[0].strip() + + verdict_data = json.loads(json_str) + return Verdict(**verdict_data) + except Exception as e: + logger.error(f"Error parsing verdict: {e}") + logger.error(f"Response text: {response_text}") + # Fallback: return a default verdict + return Verdict( + claim=claim.text, + verdict="unverifiable", + confidence=0.5, + explanation="Unable to determine the veracity of this claim due to processing errors.", + sources=[ev.source for ev in evidence if hasattr(ev, 'source')] + ) + + async def _generate_all_verdicts(self, claims_with_evidence: List[tuple[Claim, Optional[List[Evidence]]]]) -> List[Verdict]: + """Generate verdicts for all claims.""" + logger.info("Generating verdicts...") + verdicts = [] + for claim, evidence in claims_with_evidence: + if not evidence: + logger.warning(f"Skipping claim - no evidence found") + continue + + verdict = await 
self._generate_verdict_for_claim(claim, evidence) + verdicts.append(verdict) + logger.info(f"Generated verdict: {verdict.verdict}") + + return verdicts + + async def close(self): + """Close the HTTP client.""" + await self.client.aclose() + +# Testing +if __name__ == "__main__": + async def main(): + manager = VerifactManager() + query = "The sky is blue and the grass is green" + try: + verdicts = await manager.run(query) + print(json.dumps([verdict.dict() for verdict in verdicts], indent=2)) + finally: + await manager.close() + + asyncio.run(main())