diff --git a/pyproject.toml b/pyproject.toml
index fc62c31..872c42e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ dependencies = [
"cryptography>=46.0.3",
"docling>=2.61.2",
"fastapi>=0.121.2",
- "httpx>=0.28.1",
+ "httpx[http2]>=0.28.1",
"langchain-community>=0.3.17",
"langchain-google-vertexai>=3.2.0",
"langchain-litellm>=0.3.0",
@@ -27,6 +27,7 @@ dependencies = [
"sentence-transformers>=5.2.0",
"sqlalchemy>=2.0.44",
"tiktoken>=0.12.0",
+ "trafilatura>=1.7.0",
"transformers>=4.57.1",
"uvicorn>=0.38.0",
]
diff --git a/ragitect/services/embedding.py b/ragitect/services/embedding.py
index 904ffe5..55a9148 100644
--- a/ragitect/services/embedding.py
+++ b/ragitect/services/embedding.py
@@ -202,6 +202,7 @@ def create_embeddings_model(config: EmbeddingConfig | None = None) -> Embeddings
return OpenAIEmbeddings(
model=config.model,
api_key=SecretStr(config.api_key),
+ dimensions=config.dimension,
)
elif provider == "vertex_ai":
diff --git a/ragitect/services/processor/__init__.py b/ragitect/services/processor/__init__.py
index e69de29..2fe1813 100644
--- a/ragitect/services/processor/__init__.py
+++ b/ragitect/services/processor/__init__.py
@@ -0,0 +1,20 @@
+"""Document processors for RAGitect.
+
+This module provides processors for extracting text/markdown from various sources.
+"""
+
+from ragitect.services.processor.base import BaseDocumentProcessor
+from ragitect.services.processor.simple import SimpleProcessor
+from ragitect.services.processor.web_url_processor import (
+ ContentExtractionError,
+ URLFetchError,
+ WebURLProcessor,
+)
+
+__all__ = [
+ "BaseDocumentProcessor",
+ "ContentExtractionError",
+ "SimpleProcessor",
+ "URLFetchError",
+ "WebURLProcessor",
+]
diff --git a/ragitect/services/processor/web_url_processor.py b/ragitect/services/processor/web_url_processor.py
new file mode 100644
index 0000000..e533466
--- /dev/null
+++ b/ragitect/services/processor/web_url_processor.py
@@ -0,0 +1,193 @@
+"""Web URL Processor - Fetches web pages and converts to clean Markdown.
+
+This processor handles web URL ingestion by:
+1. Fetching HTML content via httpx with proper timeout/HTTP2/redirect configuration
+2. Extracting main article content using trafilatura (removes nav, ads, footers)
+3. Converting to Markdown format for downstream chunking and embedding
+
+Usage:
+ processor = WebURLProcessor()
+ markdown = await processor.process("https://example.com/article")
+
+Note:
+ This processor inherits from BaseDocumentProcessor but overrides with an
+ async signature. The async process(url: str) method is used for URL fetching.
+ The sync process(file_bytes, file_name) method raises NotImplementedError.
+
+ Integration with ProcessorFactory happens in Story 5.5.
+"""
+
+import logging
+from typing import override
+
+import httpx
+import trafilatura
+
+from ragitect.services.processor.base import BaseDocumentProcessor
+
+logger = logging.getLogger(__name__)
+
+
+class URLFetchError(Exception):
+    """Raised when HTTP request fails (timeout, connection error, 4xx/5xx).
+
+    The error message embeds the offending URL and the failure type
+    (timeout, connection error, or HTTP status code); no structured
+    attributes are stored beyond the standard ``Exception.args``.
+    """
+
+    pass
+
+
+class ContentExtractionError(Exception):
+    """Raised when content extraction fails (trafilatura returns no content).
+
+    The error message embeds the URL whose page yielded no extractable
+    main content, so callers can surface it directly; no structured
+    attributes are stored beyond the standard ``Exception.args``.
+    """
+
+    pass
+
+
+class WebURLProcessor(BaseDocumentProcessor):
+ """Processor for fetching web pages and converting to Markdown.
+
+ Inherits from BaseDocumentProcessor but provides an async process(url: str)
+ method instead of the sync process(file_bytes, file_name) method.
+
+ Implements async web page fetching with:
+ - 30 second total timeout, 10 second connect timeout (NFR-P4)
+ - HTTP/2 support for improved performance
+ - Automatic redirect following
+ - Connection pooling (max 20 keepalive connections)
+
+ Content extraction uses trafilatura to:
+ - Extract main article content
+ - Remove navigation, ads, headers, footers
+ - Strip potentially malicious elements (scripts, iframes) for security (NFR-S5)
+ - Capture article metadata (title, author, date) when available
+
+ Example:
+ >>> processor = WebURLProcessor()
+ >>> markdown = await processor.process("https://en.wikipedia.org/wiki/Python")
+ >>> print(markdown[:100])
+ # Python (programming language)
+ ...
+ """
+
+ @override
+ def supported_formats(self) -> list[str]:
+ """Return list of supported file extensions.
+
+ WebURLProcessor is not file-based, so returns empty list.
+ URL-based routing is handled separately from file extension routing.
+
+ Returns:
+ Empty list (not file-based)
+ """
+ return []
+
+ async def process(self, url: str) -> str:
+ """Fetch web page and convert to clean Markdown.
+
+ Args:
+ url: HTTP or HTTPS URL to fetch
+
+ Returns:
+ Markdown string with main article content extracted
+
+ Raises:
+ URLFetchError: If HTTP request fails (timeout, connection error, 4xx/5xx)
+ ContentExtractionError: If content extraction fails (empty page)
+ """
+ logger.info(f"Processing web URL: {url}")
+
+ # Fetch HTML content
+ html_content = await self._fetch_url(url)
+
+ # Extract main content and convert to Markdown
+ markdown = self._extract_content(html_content, url)
+
+ logger.info(f"Successfully processed {url} - {len(markdown)} chars extracted")
+ return markdown
+
+    async def _fetch_url(self, url: str) -> str:
+        """Fetch HTML content from URL with configured httpx client.
+
+        Args:
+            url: URL to fetch
+
+        Returns:
+            HTML content as string
+
+        Raises:
+            URLFetchError: On timeout, connection error, or HTTP error status
+        """
+        # Configure timeout: 30s total, 10s connect (NFR-P4)
+        timeout = httpx.Timeout(30.0, connect=10.0)
+
+        # Configure connection limits for pooling (NFR-R3)
+        limits = httpx.Limits(max_keepalive_connections=20)
+
+        # Set User-Agent to avoid 403 from sites that block automated requests
+        headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; RAGitect/1.0; +https://github.com/bhdai/ragitect)"
+        }
+
+        async with httpx.AsyncClient(
+            timeout=timeout,
+            http2=True,  # Enable HTTP/2 support
+            follow_redirects=True,  # Auto-follow redirects
+            limits=limits,  # Connection pooling
+            headers=headers,  # Default headers for all requests
+        ) as client:
+            try:
+                response = await client.get(url)
+                response.raise_for_status()
+                return response.text
+            except httpx.TimeoutException as e:
+                logger.error(f"Timeout fetching {url}: {e}")
+                raise URLFetchError(f"Timeout fetching {url} (30s limit)") from e
+            except httpx.ConnectError as e:
+                logger.error(f"Connection error fetching {url}: {e}")
+                raise URLFetchError(f"Connection error fetching {url}: {str(e)}") from e
+            except httpx.HTTPStatusError as e:
+                logger.error(f"HTTP {e.response.status_code} fetching {url}")
+                raise URLFetchError(f"HTTP {e.response.status_code} fetching {url}") from e
+
+    def _extract_content(self, html_content: str, url: str) -> str:
+        """Extract main article content and convert to Markdown.
+
+        Uses trafilatura for:
+        - Main content extraction (removes nav, ads, footers)
+        - Script/iframe stripping (NFR-S5 security)
+        - Direct Markdown output
+
+        Args:
+            html_content: Raw HTML content
+            url: Original URL (for error messages)
+
+        Returns:
+            Markdown string with extracted content
+
+        Raises:
+            ContentExtractionError: If extraction returns None/empty
+        """
+        # Extract main content with trafilatura
+        # output_format="markdown" gives us direct Markdown output
+        markdown = trafilatura.extract(
+            html_content,
+            output_format="markdown",
+            include_comments=False,  # Exclude comments
+            include_tables=True,  # Keep tables
+            include_images=True,  # Keep image references
+            include_links=True,  # Keep hyperlinks
+            no_fallback=False,  # Use fallback extraction if main method fails
+        )
+
+        if not markdown:  # None or "" both mean extraction failed (per docstring)
+            logger.error(f"Failed to extract content from {url}")
+            raise ContentExtractionError(f"Failed to extract content from {url}")
+
+        return markdown
diff --git a/tests/services/processor/test_web_url_processor.py b/tests/services/processor/test_web_url_processor.py
new file mode 100644
index 0000000..3ec6986
--- /dev/null
+++ b/tests/services/processor/test_web_url_processor.py
@@ -0,0 +1,448 @@
+"""Tests for WebURLProcessor - Web page fetching and markdown extraction
+
+Red-Green-Refactor TDD: These tests define expected behavior before implementation.
+"""
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import httpx
+
+# Module-level markers as per project-context.md
+pytestmark = [pytest.mark.asyncio]
+
+
+class TestWebURLProcessorInterface:
+ """Test WebURLProcessor class interface and method signatures"""
+
+ def test_class_exists(self):
+ """WebURLProcessor class should be importable"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+ assert processor is not None
+
+ def test_inherits_from_base_document_processor(self):
+ """WebURLProcessor should inherit from BaseDocumentProcessor"""
+ from ragitect.services.processor.base import BaseDocumentProcessor
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+ assert isinstance(processor, BaseDocumentProcessor)
+
+ def test_supported_formats_returns_empty_list(self):
+ """WebURLProcessor is not file-based, returns empty list"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+ formats = processor.supported_formats()
+ assert formats == []
+
+    async def test_process_method_signature_async(self):
+        """process() should be async and accept url string"""
+        from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+        processor = WebURLProcessor()
+        # This will fail until implemented - verifies async signature
+        with patch.object(processor, "_fetch_url", new_callable=AsyncMock) as mock_fetch:
+            mock_fetch.return_value = "<html><body><h1>Test</h1></body></html>"
+            with patch("trafilatura.extract") as mock_extract:
+                mock_extract.return_value = "# Test"
+                result = await processor.process("https://example.com")
+                assert isinstance(result, str)
+
+
+class TestWebURLProcessorFetching:
+ """Test HTTP fetching functionality"""
+
+    async def test_successful_fetch_returns_markdown(self):
+        """Successful fetch and extraction returns markdown string"""
+        from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+        processor = WebURLProcessor()
+
+        mock_html = "<html><body><h1>Test Article</h1><p>Content here.</p></body></html>"
+        expected_markdown = "# Test Article\n\nContent here."
+
+        with patch("httpx.AsyncClient") as mock_client_class:
+            mock_client = AsyncMock()
+            mock_response = MagicMock()
+            mock_response.text = mock_html
+            mock_response.raise_for_status = MagicMock()
+            mock_client.get = AsyncMock(return_value=mock_response)
+            mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client.__aexit__ = AsyncMock(return_value=None)
+            mock_client_class.return_value = mock_client
+
+            with patch("trafilatura.extract") as mock_extract:
+                mock_extract.return_value = expected_markdown
+
+                result = await processor.process("https://example.com/article")
+
+                assert result == expected_markdown
+                mock_client.get.assert_called_once()
+                mock_extract.assert_called_once()
+
+ async def test_httpx_client_configured_with_timeouts(self):
+ """httpx client should have 30s total, 10s connect timeout"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_response = MagicMock()
+ mock_response.text = ""
+ mock_response.raise_for_status = MagicMock()
+ mock_client.get = AsyncMock(return_value=mock_response)
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with patch("trafilatura.extract", return_value="# Test"):
+ await processor.process("https://example.com")
+
+ # Verify client was created with correct config
+ call_kwargs = mock_client_class.call_args.kwargs
+ assert "timeout" in call_kwargs
+ timeout = call_kwargs["timeout"]
+ assert timeout.connect == 10.0
+ assert timeout.read == 30.0 or timeout.pool == 30.0
+
+ async def test_httpx_client_has_http2_enabled(self):
+ """httpx client should have HTTP/2 support enabled"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_response = MagicMock()
+ mock_response.text = ""
+ mock_response.raise_for_status = MagicMock()
+ mock_client.get = AsyncMock(return_value=mock_response)
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with patch("trafilatura.extract", return_value="# Test"):
+ await processor.process("https://example.com")
+
+ call_kwargs = mock_client_class.call_args.kwargs
+ assert call_kwargs.get("http2") is True
+
+ async def test_httpx_client_follows_redirects(self):
+ """httpx client should follow redirects automatically"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_response = MagicMock()
+ mock_response.text = ""
+ mock_response.raise_for_status = MagicMock()
+ mock_client.get = AsyncMock(return_value=mock_response)
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with patch("trafilatura.extract", return_value="# Test"):
+ await processor.process("https://example.com")
+
+ call_kwargs = mock_client_class.call_args.kwargs
+ assert call_kwargs.get("follow_redirects") is True
+
+
+class TestWebURLProcessorContentExtraction:
+ """Test content extraction via trafilatura"""
+
+    async def test_trafilatura_extracts_main_content(self):
+        """trafilatura should be called with html and return markdown"""
+        from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+        processor = WebURLProcessor()
+
+        html_with_nav = """
+        <html>
+        <body>
+        <nav>Site navigation links</nav>
+        <article>
+        <h1>Main Article</h1>
+        <p>This is the main content.</p>
+        </article>
+        <footer>Footer boilerplate</footer>
+        </body>
+        </html>
+        """
+        expected_markdown = "# Main Article\n\nThis is the main content."
+
+        with patch("httpx.AsyncClient") as mock_client_class:
+            mock_client = AsyncMock()
+            mock_response = MagicMock()
+            mock_response.text = html_with_nav
+            mock_response.raise_for_status = MagicMock()
+            mock_client.get = AsyncMock(return_value=mock_response)
+            mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+            mock_client.__aexit__ = AsyncMock(return_value=None)
+            mock_client_class.return_value = mock_client
+
+            with patch("trafilatura.extract") as mock_extract:
+                mock_extract.return_value = expected_markdown
+
+                result = await processor.process("https://example.com")
+
+                # Verify trafilatura was called with html content
+                mock_extract.assert_called_once()
+                call_args = mock_extract.call_args
+                # First positional arg should be the HTML content
+                assert call_args[0][0] == html_with_nav
+
+                # Verify result is the markdown
+                assert result == expected_markdown
+
+
+class TestWebURLProcessorErrorHandling:
+ """Test error handling for various failure scenarios"""
+
+ async def test_timeout_raises_url_fetch_error(self):
+ """Timeout should raise URLFetchError with descriptive message"""
+ from ragitect.services.processor.web_url_processor import (
+ WebURLProcessor,
+ URLFetchError,
+ )
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_client.get = AsyncMock(side_effect=httpx.TimeoutException("Timeout"))
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with pytest.raises(URLFetchError) as exc_info:
+ await processor.process("https://slow-site.com")
+
+ assert "slow-site.com" in str(exc_info.value)
+ assert "timeout" in str(exc_info.value).lower()
+
+ async def test_connection_error_raises_url_fetch_error(self):
+ """Connection error should raise URLFetchError with details"""
+ from ragitect.services.processor.web_url_processor import (
+ WebURLProcessor,
+ URLFetchError,
+ )
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_client.get = AsyncMock(
+ side_effect=httpx.ConnectError("Connection refused")
+ )
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with pytest.raises(URLFetchError) as exc_info:
+ await processor.process("https://unreachable.com")
+
+ assert "unreachable.com" in str(exc_info.value)
+
+ async def test_http_404_raises_url_fetch_error(self):
+ """HTTP 404 should raise URLFetchError with status code"""
+ from ragitect.services.processor.web_url_processor import (
+ WebURLProcessor,
+ URLFetchError,
+ )
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_response = MagicMock()
+ mock_response.status_code = 404
+ mock_request = MagicMock()
+ mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
+ "Not Found", request=mock_request, response=mock_response
+ )
+ mock_client.get = AsyncMock(return_value=mock_response)
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with pytest.raises(URLFetchError) as exc_info:
+ await processor.process("https://example.com/missing")
+
+ assert "404" in str(exc_info.value)
+ assert "example.com" in str(exc_info.value)
+
+ async def test_http_500_raises_url_fetch_error(self):
+ """HTTP 500 should raise URLFetchError with status code"""
+ from ragitect.services.processor.web_url_processor import (
+ WebURLProcessor,
+ URLFetchError,
+ )
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_response = MagicMock()
+ mock_response.status_code = 500
+ mock_request = MagicMock()
+ mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
+ "Internal Server Error", request=mock_request, response=mock_response
+ )
+ mock_client.get = AsyncMock(return_value=mock_response)
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with pytest.raises(URLFetchError) as exc_info:
+ await processor.process("https://example.com/error")
+
+ assert "500" in str(exc_info.value)
+
+ async def test_trafilatura_returns_none_raises_content_extraction_error(self):
+ """Empty extraction should raise ContentExtractionError"""
+ from ragitect.services.processor.web_url_processor import (
+ WebURLProcessor,
+ ContentExtractionError,
+ )
+
+ processor = WebURLProcessor()
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_response = MagicMock()
+ mock_response.text = ""
+ mock_response.raise_for_status = MagicMock()
+ mock_client.get = AsyncMock(return_value=mock_response)
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with patch("trafilatura.extract", return_value=None):
+ with pytest.raises(ContentExtractionError) as exc_info:
+ await processor.process("https://empty-page.com")
+
+ assert "empty-page.com" in str(exc_info.value)
+
+ async def test_exception_messages_contain_url(self):
+ """All exception messages should include the URL for debugging"""
+ from ragitect.services.processor.web_url_processor import (
+ WebURLProcessor,
+ URLFetchError,
+ )
+
+ processor = WebURLProcessor()
+ test_url = "https://test-debugging.example.com/path"
+
+ with patch("httpx.AsyncClient") as mock_client_class:
+ mock_client = AsyncMock()
+ mock_client.get = AsyncMock(side_effect=httpx.TimeoutException("Timeout"))
+ mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client.__aexit__ = AsyncMock(return_value=None)
+ mock_client_class.return_value = mock_client
+
+ with pytest.raises(URLFetchError) as exc_info:
+ await processor.process(test_url)
+
+ # URL should be in the error message for debugging
+ assert "test-debugging.example.com" in str(exc_info.value)
+
+
+class TestWebURLProcessorExceptions:
+ """Test custom exception classes exist and are properly defined"""
+
+ def test_url_fetch_error_exists(self):
+ """URLFetchError exception class should exist"""
+ from ragitect.services.processor.web_url_processor import URLFetchError
+
+ error = URLFetchError("Test error")
+ assert isinstance(error, Exception)
+ assert str(error) == "Test error"
+
+ def test_content_extraction_error_exists(self):
+ """ContentExtractionError exception class should exist"""
+ from ragitect.services.processor.web_url_processor import ContentExtractionError
+
+ error = ContentExtractionError("Test error")
+ assert isinstance(error, Exception)
+ assert str(error) == "Test error"
+
+
+@pytest.mark.integration
+class TestWebURLProcessorIntegration:
+ """Integration tests with real web pages (require network access)"""
+
+ async def test_process_wikipedia_article(self):
+ """Integration test: fetch real Wikipedia page and extract markdown"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+ url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
+
+ # Fetch real page
+ markdown = await processor.process(url)
+
+ # Verify main content present
+ assert "Python" in markdown
+ assert "#" in markdown # Has headings
+ assert len(markdown) > 1000 # Substantial content extracted
+
+ # Verify markdown format
+ assert any(
+ marker in markdown for marker in ["#", "##", "###"]
+ ) # Has markdown headings
+
+ async def test_process_removes_navigation_boilerplate(self):
+ """Integration test: verify navigation/boilerplate is stripped"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+
+ processor = WebURLProcessor()
+ url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
+
+ markdown = await processor.process(url)
+ markdown_lower = markdown.lower()
+
+ # These Wikipedia navigation elements should NOT be in extracted content
+ # Note: Some may appear in actual article text, so we check for common nav patterns
+ navigation_patterns = [
+ "jump to navigation",
+ "jump to search",
+ "personal tools",
+ "[edit]", # Wikipedia edit links
+ ]
+
+ # At least most navigation patterns should be absent
+ nav_found = sum(1 for p in navigation_patterns if p in markdown_lower)
+ assert nav_found <= 1, f"Too many nav patterns found: {nav_found}"
+
+ async def test_markdown_compatible_with_chunking(self):
+ """Integration test: verify markdown works with DocumentProcessor chunking"""
+ from ragitect.services.processor.web_url_processor import WebURLProcessor
+ from ragitect.services.document_processor import split_markdown_document
+
+ processor = WebURLProcessor()
+ url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
+
+ # Fetch and extract
+ markdown = await processor.process(url)
+
+ # Test with existing chunker
+ chunks = split_markdown_document(
+ raw_text=markdown,
+ chunk_size=512,
+ overlap=50,
+ )
+
+ # Verify chunking works
+ assert len(chunks) > 0, "Should produce at least one chunk"
+ assert all(
+ isinstance(chunk, str) for chunk in chunks
+ ), "Chunks should be strings"
+ assert all(
+ len(chunk) > 0 for chunk in chunks
+ ), "Each chunk should have content"
+
diff --git a/uv.lock b/uv.lock
index 5f953d4..23b88ec 100644
--- a/uv.lock
+++ b/uv.lock
@@ -196,6 +196,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" },
]
+[[package]]
+name = "babel"
+version = "2.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
+]
+
[[package]]
name = "beautifulsoup4"
version = "4.14.2"
@@ -394,6 +403,20 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" },
]
+[[package]]
+name = "courlan"
+version = "1.3.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "babel" },
+ { name = "tld" },
+ { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6f/54/6d6ceeff4bed42e7a10d6064d35ee43a810e7b3e8beb4abeae8cff4713ae/courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190", size = 206382, upload-time = "2024-10-29T16:40:20.994Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/8e/ca/6a667ccbe649856dcd3458bab80b016681b274399d6211187c6ab969fc50/courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be", size = 33848, upload-time = "2024-10-29T16:40:18.325Z" },
+]
+
[[package]]
name = "coverage"
version = "7.12.0"
@@ -524,6 +547,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686, upload-time = "2024-06-09T16:20:16.715Z" },
]
+[[package]]
+name = "dateparser"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "python-dateutil" },
+ { name = "pytz" },
+ { name = "regex" },
+ { name = "tzlocal" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a9/30/064144f0df1749e7bb5faaa7f52b007d7c2d08ec08fed8411aba87207f68/dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7", size = 329840, upload-time = "2025-06-26T09:29:23.211Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/87/22/f020c047ae1346613db9322638186468238bcfa8849b4668a22b97faad65/dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482", size = 315453, upload-time = "2025-06-26T09:29:21.412Z" },
+]
+
[[package]]
name = "dill"
version = "0.4.0"
@@ -1110,7 +1148,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" },
{ url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" },
{ url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" },
- { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" },
{ url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" },
{ url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" },
{ url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" },
@@ -1121,7 +1158,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" },
{ url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" },
{ url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" },
- { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" },
{ url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" },
{ url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" },
{ url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" },
@@ -1197,6 +1233,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
]
+[[package]]
+name = "h2"
+version = "4.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "hpack" },
+ { name = "hyperframe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" },
+]
+
[[package]]
name = "hf-xet"
version = "1.2.0"
@@ -1226,6 +1275,31 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
]
+[[package]]
+name = "hpack"
+version = "4.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" },
+]
+
+[[package]]
+name = "htmldate"
+version = "1.9.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "charset-normalizer" },
+ { name = "dateparser" },
+ { name = "lxml" },
+ { name = "python-dateutil" },
+ { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9d/10/ead9dabc999f353c3aa5d0dc0835b1e355215a5ecb489a7f4ef2ddad5e33/htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0", size = 44690, upload-time = "2025-11-04T17:46:44.983Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/a1/bd/adfcdaaad5805c0c5156aeefd64c1e868c05e9c1cd6fd21751f168cd88c7/htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c", size = 31558, upload-time = "2025-11-04T17:46:43.258Z" },
+]
+
[[package]]
name = "httpcore"
version = "1.0.9"
@@ -1254,6 +1328,11 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
+[package.optional-dependencies]
+http2 = [
+ { name = "h2" },
+]
+
[[package]]
name = "httpx-sse"
version = "0.4.3"
@@ -1282,6 +1361,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" },
]
+[[package]]
+name = "hyperframe"
+version = "6.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" },
+]
+
[[package]]
name = "identify"
version = "2.6.15"
@@ -1475,6 +1563,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
]
+[[package]]
+name = "justext"
+version = "3.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "lxml", extra = ["html-clean"] },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521, upload-time = "2025-02-25T20:21:49.934Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
+]
+
[[package]]
name = "langchain"
version = "1.1.3"
@@ -1784,6 +1884,23 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/92/aa/df863bcc39c5e0946263454aba394de8a9084dbaff8ad143846b0d844739/lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", size = 3822205, upload-time = "2025-09-22T04:03:36.249Z" },
]
+[package.optional-dependencies]
+html-clean = [
+ { name = "lxml-html-clean" },
+]
+
+[[package]]
+name = "lxml-html-clean"
+version = "0.4.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "lxml" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d9/cb/c9c5bb2a9c47292e236a808dd233a03531f53b626f36259dcd32b49c76da/lxml_html_clean-0.4.3.tar.gz", hash = "sha256:c9df91925b00f836c807beab127aac82575110eacff54d0a75187914f1bd9d8c", size = 21498, upload-time = "2025-10-02T20:49:24.895Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/10/4a/63a9540e3ca73709f4200564a737d63a4c8c9c4dd032bab8535f507c190a/lxml_html_clean-0.4.3-py3-none-any.whl", hash = "sha256:63fd7b0b9c3a2e4176611c2ca5d61c4c07ffca2de76c14059a81a2825833731e", size = 14177, upload-time = "2025-10-02T20:49:23.749Z" },
+]
+
[[package]]
name = "mako"
version = "1.3.10"
@@ -3278,7 +3395,7 @@ dependencies = [
{ name = "cryptography" },
{ name = "docling" },
{ name = "fastapi" },
- { name = "httpx" },
+ { name = "httpx", extra = ["http2"] },
{ name = "langchain-community" },
{ name = "langchain-google-vertexai" },
{ name = "langchain-litellm" },
@@ -3295,6 +3412,7 @@ dependencies = [
{ name = "sentence-transformers" },
{ name = "sqlalchemy" },
{ name = "tiktoken" },
+ { name = "trafilatura" },
{ name = "transformers" },
{ name = "uvicorn" },
]
@@ -3320,7 +3438,7 @@ requires-dist = [
{ name = "cryptography", specifier = ">=46.0.3" },
{ name = "docling", specifier = ">=2.61.2" },
{ name = "fastapi", specifier = ">=0.121.2" },
- { name = "httpx", specifier = ">=0.28.1" },
+ { name = "httpx", extras = ["http2"], specifier = ">=0.28.1" },
{ name = "langchain-community", specifier = ">=0.3.17" },
{ name = "langchain-google-vertexai", specifier = ">=3.2.0" },
{ name = "langchain-litellm", specifier = ">=0.3.0" },
@@ -3337,6 +3455,7 @@ requires-dist = [
{ name = "sentence-transformers", specifier = ">=5.2.0" },
{ name = "sqlalchemy", specifier = ">=2.0.44" },
{ name = "tiktoken", specifier = ">=0.12.0" },
+ { name = "trafilatura", specifier = ">=1.7.0" },
{ name = "transformers", specifier = ">=4.57.1" },
{ name = "uvicorn", specifier = ">=0.38.0" },
]
@@ -3997,6 +4116,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
]
+[[package]]
+name = "tld"
+version = "0.13.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/df/a1/5723b07a70c1841a80afc9ac572fdf53488306848d844cd70519391b0d26/tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350", size = 462000, upload-time = "2025-05-21T22:18:29.341Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/dc/70/b2f38360c3fc4bc9b5e8ef429e1fde63749144ac583c2dbdf7e21e27a9ad/tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c", size = 274718, upload-time = "2025-05-21T22:18:25.811Z" },
+]
+
[[package]]
name = "tokenizers"
version = "0.22.1"
@@ -4110,6 +4238,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
]
+[[package]]
+name = "trafilatura"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "certifi" },
+ { name = "charset-normalizer" },
+ { name = "courlan" },
+ { name = "htmldate" },
+ { name = "justext" },
+ { name = "lxml" },
+ { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/25/e3ebeefdebfdfae8c4a4396f5a6ea51fc6fa0831d63ce338e5090a8003dc/trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247", size = 253404, upload-time = "2024-12-03T15:23:24.16Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/8a/b6/097367f180b6383a3581ca1b86fcae284e52075fa941d1232df35293363c/trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d", size = 132557, upload-time = "2024-12-03T15:23:21.41Z" },
+]
+
[[package]]
name = "transformers"
version = "4.57.1"
@@ -4299,6 +4445,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
]
+[[package]]
+name = "tzlocal"
+version = "5.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "tzdata", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" },
+]
+
[[package]]
name = "urllib3"
version = "2.5.0"