diff --git a/liveweb_arena/core/cache.py b/liveweb_arena/core/cache.py index 939be47..1260b4c 100644 --- a/liveweb_arena/core/cache.py +++ b/liveweb_arena/core/cache.py @@ -272,6 +272,9 @@ class CacheManager: # Minimum interval between consecutive cache-miss fetches (seconds) _PREFETCH_INTERVAL = 1.0 + # Retry transient _fetch_page failures (timeout, 5xx, connection reset) + _MAX_PAGE_RETRIES = 2 + _PAGE_RETRY_DELAY = 3.0 def __init__(self, cache_dir: Path, ttl: int = None): self.cache_dir = Path(cache_dir) @@ -380,54 +383,30 @@ async def _ensure_single( start = time.time() try: - if need_api: - # Fetch HTML and API data concurrently - page_task = asyncio.ensure_future(self._fetch_page(url, plugin)) - api_task = asyncio.ensure_future(plugin.fetch_api_data(url)) - - # Wait for both, collecting errors - page_result = None - page_error = None - api_data = None - api_error = None - - try: - page_result = await page_task - except Exception as e: - page_error = e - api_task.cancel() + html, accessibility_tree = await self._fetch_page_with_retry( + url, plugin, + ) - if page_error is None: + if need_api: + # If the plugin can derive GT from the already-fetched HTML, + # skip the separate network request. This avoids a redundant + # concurrent fetch to the same URL (e.g. ArXiv listing pages). + api_data = plugin.extract_api_data_from_html(url, html) + if api_data is None: + # Plugin requires an independent API call try: - api_data = await api_task + api_data = await plugin.fetch_api_data(url) except Exception as e: - api_error = e - - if page_error is not None: - raise CacheFatalError( - f"Page fetch failed (browser cannot load): {page_error}", - url=url, - ) - html, accessibility_tree = page_result - - if api_error is not None: - raise CacheFatalError( - f"API data fetch failed (GT will be invalid): {api_error}", - url=url, - ) + raise CacheFatalError( + f"API data fetch failed (GT will be invalid): {e}", + url=url, + ) if not api_data: raise CacheFatalError( f"API data is empty (GT will be invalid)", url=url, ) else: - try: - html, accessibility_tree = await self._fetch_page(url, plugin) - except Exception as e: - raise CacheFatalError( - f"Page fetch failed (browser cannot load): {e}", - url=url, - ) api_data = None cached = CachedPage( @@ -523,6 +502,35 @@ def _save(self, cache_file: Path, cached: CachedPage): with open(cache_file, 'w', encoding='utf-8') as f: json.dump(cached.to_dict(), f, ensure_ascii=False) + async def _fetch_page_with_retry(self, url: str, plugin=None) -> tuple: + """Fetch page HTML with retry for transient failures. + + Retries on timeouts, HTTP 5xx, and connection errors. + Permanent failures (HTTP 4xx, CAPTCHA) are raised immediately. + """ + last_error: Exception = None + for attempt in range(self._MAX_PAGE_RETRIES): + try: + return await self._fetch_page(url, plugin) + except CacheFatalError as e: + msg = str(e) + # Permanent failures — do not retry + if any(s in msg for s in ("CAPTCHA", "HTTP 4")): + raise + last_error = e + except Exception as e: + last_error = e + + if attempt < self._MAX_PAGE_RETRIES - 1: + log("Cache", f"Page fetch retry {attempt + 1} for " + f"{url_display(url)}: {last_error}") + await asyncio.sleep(self._PAGE_RETRY_DELAY) + + raise CacheFatalError( + f"Page fetch failed (browser cannot load): {last_error}", + url=url, + ) + async def _fetch_page(self, url: str, plugin=None) -> tuple: """ Fetch page HTML and accessibility tree using shared Playwright browser. diff --git a/liveweb_arena/plugins/arxiv/api_client.py b/liveweb_arena/plugins/arxiv/api_client.py index 058156e..e480878 100644 --- a/liveweb_arena/plugins/arxiv/api_client.py +++ b/liveweb_arena/plugins/arxiv/api_client.py @@ -171,6 +171,9 @@ async def fetch_listing( session = await _get_session() req_timeout = aiohttp.ClientTimeout(total=timeout) + _RETRYABLE = {429, 500, 502, 503, 504} + last_status = None + for attempt in range(cls.MAX_RETRIES): await cls._rate_limit() try: @@ -178,22 +181,62 @@ async def fetch_listing( if resp.status == 200: text = await resp.text() return parse_listing_html(text) - if resp.status >= 500 and attempt < cls.MAX_RETRIES - 1: - wait = 2 ** attempt + + last_status = resp.status + if resp.status in _RETRYABLE and attempt < cls.MAX_RETRIES - 1: + # Honour Retry-After header when present + retry_after = resp.headers.get("Retry-After") + wait = int(retry_after) if retry_after else 2 ** attempt logger.info(f"ArXiv listing {resp.status}, retry in {wait}s") await asyncio.sleep(wait) continue - logger.warning(f"ArXiv listing error: status={resp.status}") - return [] + + raise APIFetchError( + f"ArXiv listing HTTP {resp.status} for {url}", + source="arxiv", + ) + except APIFetchError: + raise except Exception as e: if attempt < cls.MAX_RETRIES - 1: wait = 2 ** attempt logger.info(f"ArXiv listing failed: {e}, retry in {wait}s") await asyncio.sleep(wait) continue - logger.warning(f"ArXiv listing request failed: {e}") - return [] - return [] + raise APIFetchError( + f"ArXiv listing request failed after {cls.MAX_RETRIES} " + f"attempts: {e}", + source="arxiv", + ) + + raise APIFetchError( + f"ArXiv listing failed: HTTP {last_status} after " + f"{cls.MAX_RETRIES} retries", + source="arxiv", + ) + + +def build_listing_api_data(category: str, papers_list: List[Dict[str, Any]]) -> Dict[str, Any]: + """Build the API data dict from a parsed paper list. + + Raises ``APIFetchError`` when the list is empty (no new submissions). + """ + if not papers_list: + raise APIFetchError( + f"No new papers on listing page for category '{category}'", + source="arxiv", + ) + + papers = {} + for rank, paper in enumerate(papers_list, start=1): + arxiv_id = paper["arxiv_id"] + papers[arxiv_id] = {**paper, "rank": rank} + + return { + "category": category, + "paper_count": len(papers), + "papers": papers, + } async def fetch_listing_api_data(category: str) -> Dict[str, Any]: @@ -224,20 +267,4 @@ async def fetch_listing_api_data(category: str) -> Dict[str, Any]: } """ papers_list = await ArxivClient.fetch_listing(category) - - if not papers_list: - raise APIFetchError( - f"No new papers on listing page for category '{category}'", - source="arxiv", - ) - - papers = {} - for rank, paper in enumerate(papers_list, start=1): - arxiv_id = paper["arxiv_id"] - papers[arxiv_id] = {**paper, "rank": rank} - - return { - "category": category, - "paper_count": len(papers), - "papers": papers, - } + return build_listing_api_data(category, papers_list) diff --git a/liveweb_arena/plugins/arxiv/arxiv.py b/liveweb_arena/plugins/arxiv/arxiv.py index 1e17727..18835a3 100644 --- a/liveweb_arena/plugins/arxiv/arxiv.py +++ b/liveweb_arena/plugins/arxiv/arxiv.py @@ -6,11 +6,11 @@ """ import re -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from urllib.parse import urlparse from liveweb_arena.plugins.base import BasePlugin -from .api_client import fetch_listing_api_data +from .api_client import build_listing_api_data, fetch_listing_api_data, parse_listing_html class ArxivPlugin(BasePlugin): @@ -53,6 +53,20 @@ def needs_api_data(self, url: str) -> bool: return bool(self._extract_category(path)) + def extract_api_data_from_html(self, url: str, html: str) -> Optional[Dict[str, Any]]: + """Parse GT data from already-fetched listing page HTML. + + ArXiv GT is extracted from the same HTML the browser renders, so + there is no need for a separate network request. This eliminates + the redundant concurrent fetch that doubles rate-limit exposure. + """ + parsed = urlparse(url) + category = self._extract_category(parsed.path.strip("/")) + if not category: + return None + papers_list = parse_listing_html(html) + return build_listing_api_data(category, papers_list) + @staticmethod def _extract_category(path: str) -> str: """Extract category from listing path like 'list/cs.AI/new'. diff --git a/liveweb_arena/plugins/base.py b/liveweb_arena/plugins/base.py index 6a9f134..7495f46 100644 --- a/liveweb_arena/plugins/base.py +++ b/liveweb_arena/plugins/base.py @@ -187,6 +187,24 @@ async def setup_page_for_cache(self, page, url: str) -> None: """ pass + def extract_api_data_from_html(self, url: str, html: str) -> Optional[Dict[str, Any]]: + """ + Extract API data directly from already-fetched page HTML. + + Override when the API data is parsed from the same page the browser + fetches. Returning a non-None dict lets the cache manager skip + the separate ``fetch_api_data`` network call, eliminating duplicate + requests to the same URL and reducing rate-limit exposure. + + Args: + url: The page URL + html: Page HTML already retrieved by the browser + + Returns: + API data dict, or None to fall back to ``fetch_api_data``. + """ + return None + async def generate_task(self, seed: int, template_name: str = None, variant: int = None) -> SubTask: """ Generate a task using registered templates. diff --git a/tests/core/test_cache.py b/tests/core/test_cache.py index 8ed1c15..dfaa583 100644 --- a/tests/core/test_cache.py +++ b/tests/core/test_cache.py @@ -1,8 +1,10 @@ """Tests for cache module — CachedPage, normalize_url, url_to_cache_dir, CacheManager helpers.""" +import asyncio import json import time from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -303,3 +305,121 @@ def test_stale_fallback_rejects_captcha(self, tmp_path): mgr = CacheManager(cache_dir=tmp_path, ttl=3600) assert mgr._load_stale(cache_file, need_api=False) is None assert not cache_file.exists() + + +# ── _fetch_page_with_retry ────────────────────────────────────────── + + +class TestFetchPageWithRetry: + """Tests for CacheManager._fetch_page_with_retry.""" + + def _make_manager(self, tmp_path): + mgr = CacheManager(cache_dir=tmp_path, ttl=3600) + mgr._PAGE_RETRY_DELAY = 0 # no waiting in tests + return mgr + + def test_succeeds_on_first_attempt(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock(return_value=("ok", "tree")) + html, tree = asyncio.run(mgr._fetch_page_with_retry("https://x.com")) + assert html == "ok" + assert mgr._fetch_page.call_count == 1 + + def test_retries_on_transient_error(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock(side_effect=[ + TimeoutError("page.goto timed out"), + ("ok", "tree"), + ]) + html, tree = asyncio.run(mgr._fetch_page_with_retry("https://x.com")) + assert html == "ok" + assert mgr._fetch_page.call_count == 2 + + def test_retries_on_http_5xx(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock(side_effect=[ + CacheFatalError("HTTP 503 for https://arxiv.org/list/cs.AI/new"), + ("ok", "tree"), + ]) + html, _ = asyncio.run(mgr._fetch_page_with_retry("https://arxiv.org/list/cs.AI/new")) + assert html == "ok" + assert mgr._fetch_page.call_count == 2 + + def test_does_not_retry_http_4xx(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock( + side_effect=CacheFatalError("HTTP 404 for https://x.com"), + ) + with pytest.raises(CacheFatalError, match="HTTP 404"): + asyncio.run(mgr._fetch_page_with_retry("https://x.com")) + assert mgr._fetch_page.call_count == 1 + + def test_does_not_retry_captcha(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock( + side_effect=CacheFatalError("CAPTCHA/challenge page detected"), + ) + with pytest.raises(CacheFatalError, match="CAPTCHA"): + asyncio.run(mgr._fetch_page_with_retry("https://x.com")) + assert mgr._fetch_page.call_count == 1 + + def test_raises_after_all_retries_exhausted(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock( + side_effect=TimeoutError("page.goto timed out"), + ) + with pytest.raises(CacheFatalError, match="Page fetch failed"): + asyncio.run(mgr._fetch_page_with_retry("https://x.com")) + assert mgr._fetch_page.call_count == mgr._MAX_PAGE_RETRIES + + +# ── _ensure_single with extract_api_data_from_html ────────────────── + + +class TestEnsureSingleHtmlExtraction: + """Tests that _ensure_single uses extract_api_data_from_html when available.""" + + def _make_manager(self, tmp_path): + mgr = CacheManager(cache_dir=tmp_path, ttl=3600) + mgr._PAGE_RETRY_DELAY = 0 + return mgr + + @staticmethod + def _make_plugin(extract_return, fetch_return=None): + """Build a plugin mock with correct sync/async method types.""" + plugin = MagicMock() + # Sync methods + plugin.extract_api_data_from_html.return_value = extract_return + plugin.normalize_url.side_effect = normalize_url + plugin.get_blocked_patterns.return_value = [] + # Async methods + plugin.fetch_api_data = AsyncMock(return_value=fetch_return) + plugin.setup_page_for_cache = AsyncMock() + return plugin + + def test_skips_fetch_api_data_when_html_extraction_works(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock(return_value=("papers", "tree")) + plugin = self._make_plugin( + extract_return={"papers": {"1": {"rank": 1}}}, + ) + + cached = asyncio.run(mgr._ensure_single( + "https://arxiv.org/list/cs.AI/new", plugin, need_api=True, + )) + assert cached.api_data == {"papers": {"1": {"rank": 1}}} + plugin.fetch_api_data.assert_not_called() + + def test_falls_back_to_fetch_api_data_when_html_extraction_returns_none(self, tmp_path): + mgr = self._make_manager(tmp_path) + mgr._fetch_page = AsyncMock(return_value=("page", "tree")) + plugin = self._make_plugin( + extract_return=None, + fetch_return={"coins": {"btc": {"price": 50000}}}, + ) + + cached = asyncio.run(mgr._ensure_single( + "https://www.coingecko.com/en/coins/bitcoin", plugin, need_api=True, + )) + assert cached.api_data == {"coins": {"btc": {"price": 50000}}} + plugin.fetch_api_data.assert_called_once() diff --git a/tests/test_arxiv_integration.py b/tests/test_arxiv_integration.py index f00d698..1063bd9 100644 --- a/tests/test_arxiv_integration.py +++ b/tests/test_arxiv_integration.py @@ -9,8 +9,9 @@ from liveweb_arena.core.validators.base import get_registered_templates from liveweb_arena.plugins import get_all_plugins from liveweb_arena.plugins.base import SubTask +from liveweb_arena.plugins.base_client import APIFetchError from liveweb_arena.plugins.arxiv.arxiv import ArxivPlugin -from liveweb_arena.plugins.arxiv.api_client import parse_listing_html +from liveweb_arena.plugins.arxiv.api_client import build_listing_api_data, parse_listing_html from liveweb_arena.plugins.arxiv.templates.paper_info import ArxivPaperInfoTemplate from liveweb_arena.plugins.arxiv.templates.author_extrema import ArxivAuthorExtremaTemplate from liveweb_arena.plugins.arxiv.templates.multi_author_filter import ArxivMultiAuthorFilterTemplate @@ -1044,3 +1045,67 @@ def test_author_extrema_validation_rules_include_extrema_type(): }) assert "fewest" in rules_fewest assert "5" in rules_fewest + + +# ---------- extract_api_data_from_html ---------- + + +def test_extract_api_data_from_html_parses_listing_page(): + """Regression: GT can be derived from already-fetched HTML without a + separate network request.""" + plugin = ArxivPlugin() + data = plugin.extract_api_data_from_html( + "https://arxiv.org/list/cs.AI/new", SAMPLE_LISTING_HTML, + ) + assert data is not None + assert data["category"] == "cs.AI" + assert data["paper_count"] == 2 + assert "2603.17021" in data["papers"] + assert data["papers"]["2603.17021"]["rank"] == 1 + assert data["papers"]["2603.17063"]["rank"] == 2 + + +def test_extract_api_data_from_html_returns_none_for_non_listing_url(): + plugin = ArxivPlugin() + result = plugin.extract_api_data_from_html( + "https://arxiv.org/abs/2603.17021", SAMPLE_LISTING_HTML, + ) + assert result is None + + +def test_extract_api_data_from_html_raises_when_no_new_papers(): + """Weekend/holiday pages with zero new submissions raise APIFetchError.""" + plugin = ArxivPlugin() + empty_html = "