Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 47 additions & 39 deletions liveweb_arena/core/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ class CacheManager:

# Minimum interval between consecutive cache-miss fetches (seconds)
_PREFETCH_INTERVAL = 1.0
# Retry transient _fetch_page failures (timeout, 5xx, connection reset)
_MAX_PAGE_RETRIES = 2
_PAGE_RETRY_DELAY = 3.0

def __init__(self, cache_dir: Path, ttl: int = None):
self.cache_dir = Path(cache_dir)
Expand Down Expand Up @@ -380,54 +383,30 @@ async def _ensure_single(
start = time.time()

try:
if need_api:
# Fetch HTML and API data concurrently
page_task = asyncio.ensure_future(self._fetch_page(url, plugin))
api_task = asyncio.ensure_future(plugin.fetch_api_data(url))

# Wait for both, collecting errors
page_result = None
page_error = None
api_data = None
api_error = None

try:
page_result = await page_task
except Exception as e:
page_error = e
api_task.cancel()
html, accessibility_tree = await self._fetch_page_with_retry(
url, plugin,
)

if page_error is None:
if need_api:
# If the plugin can derive GT from the already-fetched HTML,
# skip the separate network request. This avoids a redundant
# concurrent fetch to the same URL (e.g. ArXiv listing pages).
api_data = plugin.extract_api_data_from_html(url, html)
if api_data is None:
# Plugin requires an independent API call
try:
api_data = await api_task
api_data = await plugin.fetch_api_data(url)
except Exception as e:
api_error = e

if page_error is not None:
raise CacheFatalError(
f"Page fetch failed (browser cannot load): {page_error}",
url=url,
)
html, accessibility_tree = page_result

if api_error is not None:
raise CacheFatalError(
f"API data fetch failed (GT will be invalid): {api_error}",
url=url,
)
raise CacheFatalError(
f"API data fetch failed (GT will be invalid): {e}",
url=url,
)
if not api_data:
raise CacheFatalError(
f"API data is empty (GT will be invalid)",
url=url,
)
else:
try:
html, accessibility_tree = await self._fetch_page(url, plugin)
except Exception as e:
raise CacheFatalError(
f"Page fetch failed (browser cannot load): {e}",
url=url,
)
api_data = None

cached = CachedPage(
Expand Down Expand Up @@ -523,6 +502,35 @@ def _save(self, cache_file: Path, cached: CachedPage):
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(cached.to_dict(), f, ensure_ascii=False)

async def _fetch_page_with_retry(self, url: str, plugin=None) -> tuple:
    """Fetch page HTML with retry for transient failures.

    Retries on timeouts, HTTP 5xx, and connection errors, sleeping
    ``_PAGE_RETRY_DELAY`` seconds between attempts (up to
    ``_MAX_PAGE_RETRIES`` attempts total). Permanent failures
    (HTTP 4xx, CAPTCHA) are raised immediately without retry.

    Args:
        url: Page URL to fetch.
        plugin: Optional plugin forwarded to ``_fetch_page``.

    Returns:
        Tuple of (html, accessibility_tree) from ``_fetch_page``.

    Raises:
        CacheFatalError: On a permanent failure, or once all retry
            attempts are exhausted (with the last error chained).
    """
    # Fixed annotation: the variable starts as None, so Optional is correct.
    last_error: "Exception | None" = None
    for attempt in range(self._MAX_PAGE_RETRIES):
        try:
            return await self._fetch_page(url, plugin)
        except CacheFatalError as e:
            msg = str(e)
            # Permanent failures — do not retry.
            # NOTE(review): classification relies on message substrings
            # ("CAPTCHA", "HTTP 4") emitted by _fetch_page — confirm those
            # strings stay stable if _fetch_page's messages change.
            if any(s in msg for s in ("CAPTCHA", "HTTP 4")):
                raise
            last_error = e
        except Exception as e:
            last_error = e

        if attempt < self._MAX_PAGE_RETRIES - 1:
            log("Cache", f"Page fetch retry {attempt + 1} for "
                f"{url_display(url)}: {last_error}")
            await asyncio.sleep(self._PAGE_RETRY_DELAY)

    # Chain the underlying error so tracebacks preserve the root cause.
    raise CacheFatalError(
        f"Page fetch failed (browser cannot load): {last_error}",
        url=url,
    ) from last_error

async def _fetch_page(self, url: str, plugin=None) -> tuple:
"""
Fetch page HTML and accessibility tree using shared Playwright browser.
Expand Down
75 changes: 51 additions & 24 deletions liveweb_arena/plugins/arxiv/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,29 +171,72 @@ async def fetch_listing(
session = await _get_session()
req_timeout = aiohttp.ClientTimeout(total=timeout)

_RETRYABLE = {429, 500, 502, 503, 504}
last_status = None

for attempt in range(cls.MAX_RETRIES):
await cls._rate_limit()
try:
async with session.get(url, timeout=req_timeout) as resp:
if resp.status == 200:
text = await resp.text()
return parse_listing_html(text)
if resp.status >= 500 and attempt < cls.MAX_RETRIES - 1:
wait = 2 ** attempt

last_status = resp.status
if resp.status in _RETRYABLE and attempt < cls.MAX_RETRIES - 1:
# Honour Retry-After header when present
retry_after = resp.headers.get("Retry-After")
wait = int(retry_after) if retry_after else 2 ** attempt
logger.info(f"ArXiv listing {resp.status}, retry in {wait}s")
await asyncio.sleep(wait)
continue
logger.warning(f"ArXiv listing error: status={resp.status}")
return []

raise APIFetchError(
f"ArXiv listing HTTP {resp.status} for {url}",
source="arxiv",
)
except APIFetchError:
raise
except Exception as e:
if attempt < cls.MAX_RETRIES - 1:
wait = 2 ** attempt
logger.info(f"ArXiv listing failed: {e}, retry in {wait}s")
await asyncio.sleep(wait)
continue
logger.warning(f"ArXiv listing request failed: {e}")
return []
return []
raise APIFetchError(
f"ArXiv listing request failed after {cls.MAX_RETRIES} "
f"attempts: {e}",
source="arxiv",
)

raise APIFetchError(
f"ArXiv listing failed: HTTP {last_status} after "
f"{cls.MAX_RETRIES} retries",
source="arxiv",
)


def build_listing_api_data(category: str, papers_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Assemble the GT API data dict from a parsed listing-page paper list.

    Each paper keeps all of its parsed fields plus a 1-based ``rank``
    reflecting its position on the listing page.

    Raises ``APIFetchError`` when the list is empty (no new submissions).
    """
    if not papers_list:
        raise APIFetchError(
            f"No new papers on listing page for category '{category}'",
            source="arxiv",
        )

    ranked = {
        entry["arxiv_id"]: {**entry, "rank": position}
        for position, entry in enumerate(papers_list, start=1)
    }

    return {
        "category": category,
        "paper_count": len(ranked),
        "papers": ranked,
    }


async def fetch_listing_api_data(category: str) -> Dict[str, Any]:
Expand Down Expand Up @@ -224,20 +267,4 @@ async def fetch_listing_api_data(category: str) -> Dict[str, Any]:
}
"""
papers_list = await ArxivClient.fetch_listing(category)

if not papers_list:
raise APIFetchError(
f"No new papers on listing page for category '{category}'",
source="arxiv",
)

papers = {}
for rank, paper in enumerate(papers_list, start=1):
arxiv_id = paper["arxiv_id"]
papers[arxiv_id] = {**paper, "rank": rank}

return {
"category": category,
"paper_count": len(papers),
"papers": papers,
}
return build_listing_api_data(category, papers_list)
18 changes: 16 additions & 2 deletions liveweb_arena/plugins/arxiv/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
"""

import re
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from liveweb_arena.plugins.base import BasePlugin
from .api_client import fetch_listing_api_data
from .api_client import build_listing_api_data, fetch_listing_api_data, parse_listing_html


class ArxivPlugin(BasePlugin):
Expand Down Expand Up @@ -53,6 +53,20 @@ def needs_api_data(self, url: str) -> bool:

return bool(self._extract_category(path))

def extract_api_data_from_html(self, url: str, html: str) -> Optional[Dict[str, Any]]:
    """Derive GT data by parsing the listing HTML the browser already fetched.

    ArXiv ground truth lives in the very page the browser renders, so
    reusing that HTML avoids a second network round-trip to the same URL
    (and the extra rate-limit exposure it would bring).

    Returns None for URLs that are not category listings, letting the
    caller fall back to ``fetch_api_data``.

    NOTE(review): an empty listing makes ``build_listing_api_data`` raise
    ``APIFetchError`` out of this method — confirm the cache manager
    handles that the same way as a ``fetch_api_data`` failure.
    """
    listing_path = urlparse(url).path.strip("/")
    category = self._extract_category(listing_path)
    if not category:
        return None
    return build_listing_api_data(category, parse_listing_html(html))

@staticmethod
def _extract_category(path: str) -> str:
"""Extract category from listing path like 'list/cs.AI/new'.
Expand Down
18 changes: 18 additions & 0 deletions liveweb_arena/plugins/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,24 @@ async def setup_page_for_cache(self, page, url: str) -> None:
"""
pass

def extract_api_data_from_html(self, url: str, html: str) -> Optional[Dict[str, Any]]:
    """Hook: derive API data from page HTML that was already fetched.

    The default implementation opts out by returning None, which tells the
    cache manager to call ``fetch_api_data`` instead. Plugins whose
    ground-truth data is embedded in the very page the browser renders
    should override this to skip the duplicate network request (and the
    rate-limit exposure it brings).

    Args:
        url: The page URL.
        html: Page HTML already retrieved by the browser.

    Returns:
        API data dict, or None to fall back to ``fetch_api_data``.
    """
    return None

async def generate_task(self, seed: int, template_name: str = None, variant: int = None) -> SubTask:
"""
Generate a task using registered templates.
Expand Down
Loading