Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 47 additions & 39 deletions liveweb_arena/core/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ class CacheManager:

# Minimum interval between consecutive cache-miss fetches (seconds)
_PREFETCH_INTERVAL = 1.0
# Retry transient _fetch_page failures (timeout, 5xx, connection reset)
_MAX_PAGE_RETRIES = 2
_PAGE_RETRY_DELAY = 3.0

def __init__(self, cache_dir: Path, ttl: int = None):
self.cache_dir = Path(cache_dir)
Expand Down Expand Up @@ -380,54 +383,30 @@ async def _ensure_single(
start = time.time()

try:
if need_api:
# Fetch HTML and API data concurrently
page_task = asyncio.ensure_future(self._fetch_page(url, plugin))
api_task = asyncio.ensure_future(plugin.fetch_api_data(url))

# Wait for both, collecting errors
page_result = None
page_error = None
api_data = None
api_error = None

try:
page_result = await page_task
except Exception as e:
page_error = e
api_task.cancel()
html, accessibility_tree = await self._fetch_page_with_retry(
url, plugin,
)

if page_error is None:
if need_api:
# If the plugin can derive GT from the already-fetched HTML,
# skip the separate network request. This avoids a redundant
# concurrent fetch to the same URL (e.g. ArXiv listing pages).
api_data = plugin.extract_api_data_from_html(url, html)
if api_data is None:
# Plugin requires an independent API call
try:
api_data = await api_task
api_data = await plugin.fetch_api_data(url)
except Exception as e:
api_error = e

if page_error is not None:
raise CacheFatalError(
f"Page fetch failed (browser cannot load): {page_error}",
url=url,
)
html, accessibility_tree = page_result

if api_error is not None:
raise CacheFatalError(
f"API data fetch failed (GT will be invalid): {api_error}",
url=url,
)
raise CacheFatalError(
f"API data fetch failed (GT will be invalid): {e}",
url=url,
)
if not api_data:
raise CacheFatalError(
f"API data is empty (GT will be invalid)",
url=url,
)
else:
try:
html, accessibility_tree = await self._fetch_page(url, plugin)
except Exception as e:
raise CacheFatalError(
f"Page fetch failed (browser cannot load): {e}",
url=url,
)
api_data = None

cached = CachedPage(
Expand Down Expand Up @@ -523,6 +502,35 @@ def _save(self, cache_file: Path, cached: CachedPage):
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(cached.to_dict(), f, ensure_ascii=False)

async def _fetch_page_with_retry(self, url: str, plugin=None) -> tuple:
    """Fetch page HTML with retry for transient failures.

    Retries on timeouts, HTTP 5xx, and connection errors, sleeping
    ``_PAGE_RETRY_DELAY`` seconds between attempts (up to
    ``_MAX_PAGE_RETRIES`` attempts total). Permanent failures
    (HTTP 4xx, CAPTCHA) are raised immediately without retry.

    Args:
        url: Page URL to fetch.
        plugin: Optional plugin forwarded to ``_fetch_page``.

    Returns:
        Tuple of (html, accessibility_tree) from ``_fetch_page``.

    Raises:
        CacheFatalError: On a permanent failure, or once all retry
            attempts are exhausted (with the last error chained).
    """
    # Fixed annotation: the variable starts as None, so Optional is correct.
    last_error: "Exception | None" = None
    for attempt in range(self._MAX_PAGE_RETRIES):
        try:
            return await self._fetch_page(url, plugin)
        except CacheFatalError as e:
            msg = str(e)
            # Permanent failures — do not retry.
            # NOTE(review): classification relies on message substrings
            # ("CAPTCHA", "HTTP 4") emitted by _fetch_page — confirm those
            # strings stay stable if _fetch_page's messages change.
            if any(s in msg for s in ("CAPTCHA", "HTTP 4")):
                raise
            last_error = e
        except Exception as e:
            last_error = e

        if attempt < self._MAX_PAGE_RETRIES - 1:
            log("Cache", f"Page fetch retry {attempt + 1} for "
                f"{url_display(url)}: {last_error}")
            await asyncio.sleep(self._PAGE_RETRY_DELAY)

    # Chain the underlying error so tracebacks preserve the root cause.
    raise CacheFatalError(
        f"Page fetch failed (browser cannot load): {last_error}",
        url=url,
    ) from last_error

async def _fetch_page(self, url: str, plugin=None) -> tuple:
"""
Fetch page HTML and accessibility tree using shared Playwright browser.
Expand Down
75 changes: 51 additions & 24 deletions liveweb_arena/plugins/arxiv/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,29 +171,72 @@ async def fetch_listing(
session = await _get_session()
req_timeout = aiohttp.ClientTimeout(total=timeout)

_RETRYABLE = {429, 500, 502, 503, 504}
last_status = None

for attempt in range(cls.MAX_RETRIES):
await cls._rate_limit()
try:
async with session.get(url, timeout=req_timeout) as resp:
if resp.status == 200:
text = await resp.text()
return parse_listing_html(text)
if resp.status >= 500 and attempt < cls.MAX_RETRIES - 1:
wait = 2 ** attempt

last_status = resp.status
if resp.status in _RETRYABLE and attempt < cls.MAX_RETRIES - 1:
# Honour Retry-After header when present
retry_after = resp.headers.get("Retry-After")
wait = int(retry_after) if retry_after else 2 ** attempt
logger.info(f"ArXiv listing {resp.status}, retry in {wait}s")
await asyncio.sleep(wait)
continue
logger.warning(f"ArXiv listing error: status={resp.status}")
return []

raise APIFetchError(
f"ArXiv listing HTTP {resp.status} for {url}",
source="arxiv",
)
except APIFetchError:
raise
except Exception as e:
if attempt < cls.MAX_RETRIES - 1:
wait = 2 ** attempt
logger.info(f"ArXiv listing failed: {e}, retry in {wait}s")
await asyncio.sleep(wait)
continue
logger.warning(f"ArXiv listing request failed: {e}")
return []
return []
raise APIFetchError(
f"ArXiv listing request failed after {cls.MAX_RETRIES} "
f"attempts: {e}",
source="arxiv",
)

raise APIFetchError(
f"ArXiv listing failed: HTTP {last_status} after "
f"{cls.MAX_RETRIES} retries",
source="arxiv",
)


def build_listing_api_data(category: str, papers_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Assemble the GT API data dict from a parsed listing-page paper list.

    Each paper keeps all of its parsed fields plus a 1-based ``rank``
    reflecting its position on the listing page.

    Raises ``APIFetchError`` when the list is empty (no new submissions).
    """
    if not papers_list:
        raise APIFetchError(
            f"No new papers on listing page for category '{category}'",
            source="arxiv",
        )

    ranked = {
        entry["arxiv_id"]: {**entry, "rank": position}
        for position, entry in enumerate(papers_list, start=1)
    }

    return {
        "category": category,
        "paper_count": len(ranked),
        "papers": ranked,
    }


async def fetch_listing_api_data(category: str) -> Dict[str, Any]:
Expand Down Expand Up @@ -224,20 +267,4 @@ async def fetch_listing_api_data(category: str) -> Dict[str, Any]:
}
"""
papers_list = await ArxivClient.fetch_listing(category)

if not papers_list:
raise APIFetchError(
f"No new papers on listing page for category '{category}'",
source="arxiv",
)

papers = {}
for rank, paper in enumerate(papers_list, start=1):
arxiv_id = paper["arxiv_id"]
papers[arxiv_id] = {**paper, "rank": rank}

return {
"category": category,
"paper_count": len(papers),
"papers": papers,
}
return build_listing_api_data(category, papers_list)
18 changes: 16 additions & 2 deletions liveweb_arena/plugins/arxiv/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
"""

import re
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from liveweb_arena.plugins.base import BasePlugin
from .api_client import fetch_listing_api_data
from .api_client import build_listing_api_data, fetch_listing_api_data, parse_listing_html


class ArxivPlugin(BasePlugin):
Expand Down Expand Up @@ -53,6 +53,20 @@ def needs_api_data(self, url: str) -> bool:

return bool(self._extract_category(path))

def extract_api_data_from_html(self, url: str, html: str) -> Optional[Dict[str, Any]]:
    """Derive GT data by parsing the listing HTML the browser already fetched.

    ArXiv ground truth lives in the very page the browser renders, so
    reusing that HTML avoids a second network round-trip to the same URL
    (and the extra rate-limit exposure it would bring).

    Returns None for URLs that are not category listings, letting the
    caller fall back to ``fetch_api_data``.

    NOTE(review): an empty listing makes ``build_listing_api_data`` raise
    ``APIFetchError`` out of this method — confirm the cache manager
    handles that the same way as a ``fetch_api_data`` failure.
    """
    listing_path = urlparse(url).path.strip("/")
    category = self._extract_category(listing_path)
    if not category:
        return None
    return build_listing_api_data(category, parse_listing_html(html))

@staticmethod
def _extract_category(path: str) -> str:
"""Extract category from listing path like 'list/cs.AI/new'.
Expand Down
18 changes: 18 additions & 0 deletions liveweb_arena/plugins/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,24 @@ async def setup_page_for_cache(self, page, url: str) -> None:
"""
pass

def extract_api_data_from_html(self, url: str, html: str) -> Optional[Dict[str, Any]]:
    """Hook: derive API data from page HTML that was already fetched.

    The default implementation opts out by returning None, which tells the
    cache manager to call ``fetch_api_data`` instead. Plugins whose
    ground-truth data is embedded in the very page the browser renders
    should override this to skip the duplicate network request (and the
    rate-limit exposure it brings).

    Args:
        url: The page URL.
        html: Page HTML already retrieved by the browser.

    Returns:
        API data dict, or None to fall back to ``fetch_api_data``.
    """
    return None

async def generate_task(self, seed: int, template_name: str = None, variant: int = None) -> SubTask:
"""
Generate a task using registered templates.
Expand Down
Loading