Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions liveweb_arena/core/gt_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,14 @@ def _merge_api_data(self, url: str, api_data: Dict[str, Any]) -> Optional[str]:
name = api_data.get("name", f"SN{netuid}")
return f"subnet[{name}]"

elif "hn.algolia.com" in url_lower:
# HN Algolia search data
query = str(api_data.get("query", "")).strip().lower()
page = int(api_data.get("page", 0))
key = f"hn_search:{query}:{page}"
self._collected_api_data[key] = api_data
return f"hn_search[{query}] page={page}"

elif "news.ycombinator.com" in url_lower:
if "stories" in api_data:
# Check if this is a category page (ask, show, jobs) or homepage
Expand All @@ -391,8 +399,8 @@ def _merge_api_data(self, url: str, api_data: Dict[str, Any]) -> Optional[str]:
if added > 0:
return f"+{added} stories"
return None
elif "id" in api_data and "title" in api_data:
# Story detail page: merge with existing data, preserving rank
elif "id" in api_data:
# Item detail page (story/comment): merge with existing data, preserving rank
story_id = str(api_data["id"])
existing = self._collected_api_data.get(story_id, {})
# Copy to avoid mutating cached/shared api_data reference
Expand Down
15 changes: 6 additions & 9 deletions liveweb_arena/core/task_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ class TaskRegistry:
76: ("hackernews", "hackernews_extrema_comparison"),
77: ("hackernews", "hackernews_category_comparison"),
78: ("hackernews", "hackernews_news_summary"),
110: ("hackernews", "hackernews_recent_burst_count"),
111: ("hackernews", "hackernews_comment_tree_focus"),
112: ("hackernews", "hackernews_keyword_scan_rank"),
113: ("hackernews", "hackernews_user_karma_gap"),

# Open Library templates
80: ("openlibrary", "openlibrary_book_stats"),
Expand All @@ -146,21 +150,13 @@ class TaskRegistry:
86: ("openmeteo", "openmeteo_comparison"),
87: ("openmeteo", "openmeteo_hourly_extrema"),
88: ("openmeteo", "openmeteo_forecast_trend"),
99: ("openmeteo", "openmeteo_hourly_threshold"),
100: ("openmeteo", "openmeteo_sunrise_sunset"),
101: ("openmeteo", "openmeteo_hourly_time_of"),

# ArXiv templates
90: ("arxiv", "arxiv_paper_info"),
91: ("arxiv", "arxiv_author_extrema"),
92: ("arxiv", "arxiv_category_comparison"),
94: ("arxiv", "arxiv_multi_author_filter"),
95: ("arxiv", "arxiv_title_length_extrema"),

# Open Library templates — engagement & comparison
96: ("openlibrary", "openlibrary_author_engagement_extrema"),
97: ("openlibrary", "openlibrary_author_comparison"),
98: ("openlibrary", "openlibrary_reading_stats_filter"),
}

# Template versions - each version's combinations come AFTER all previous versions
Expand Down Expand Up @@ -190,10 +186,11 @@ class TaskRegistry:
# Version 6: ArXiv templates
[90, 91, 92, 94, 95],
# Version 7: Open Library engagement & comparison templates (PR #13)
# NOTE: PR #14 (openmeteo IDs 99-101) must use Version 8.
[96, 97, 98],
# Version 8: Additional Open Meteo templates
[99, 100, 101],
# Version 9: Hacker News gap-filling templates
[110, 111, 112, 113],
]

# Combination registry: list of template ID tuples
Expand Down
84 changes: 84 additions & 0 deletions liveweb_arena/plugins/hackernews/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any, Dict, List, Optional

import aiohttp
from urllib.parse import quote_plus

from liveweb_arena.plugins.base_client import APIFetchError, BaseAPIClient, RateLimiter, validate_api_response

Expand All @@ -14,6 +15,7 @@

# Firebase API base URL
HN_API_BASE = "https://hacker-news.firebaseio.com/v0"
HN_ALGOLIA_SEARCH_API = "https://hn.algolia.com/api/v1/search_by_date"


class HackerNewsClient(BaseAPIClient):
Expand Down Expand Up @@ -83,6 +85,14 @@ async def get_top_stories(cls, limit: int = 30) -> List[int]:
return data[:limit]
return []

@classmethod
async def get_new_stories(cls, limit: int = 30) -> List[int]:
    """Return up to ``limit`` story IDs from the /newstories feed, newest first.

    Returns an empty list when the endpoint yields nothing usable
    (missing, empty, or non-list payload).
    """
    ids = await cls.get("/newstories.json")
    if not isinstance(ids, list) or not ids:
        return []
    return ids[:limit]

@classmethod
async def get_ask_stories(cls, limit: int = 30) -> List[int]:
"""Get Ask HN story IDs."""
Expand Down Expand Up @@ -204,6 +214,36 @@ async def fetch_homepage_api_data(limit: int = 30) -> Dict[str, Any]:
return {"stories": stories}


async def fetch_newest_api_data(limit: int = 30) -> Dict[str, Any]:
    """
    Fetch API data for the HN "newest" page.

    Args:
        limit: Maximum number of newest stories to include.

    Returns:
        {
            "category": "newest",
            "stories": {
                "<id>": {
                    ...item fields...,
                    "rank": <1-based newest rank>
                }
            }
        }

    Raises:
        APIFetchError: If the newest-story ID list cannot be fetched.
    """
    story_ids = await HackerNewsClient.get_new_stories(limit=limit)
    if not story_ids:
        raise APIFetchError("Failed to fetch newest stories", source="hackernews")

    items = await HackerNewsClient.get_items_batch(story_ids)
    stories: Dict[str, Any] = {}
    for rank, story_id in enumerate(story_ids, start=1):
        if story_id in items:
            # Copy before annotating with "rank": get_items_batch may hand back
            # cached/shared item dicts, and mutating them in place would leak the
            # rank into other consumers (mirrors the copy-before-merge rule used
            # for api_data in the GT collector).
            story = dict(items[story_id])
            story["rank"] = rank
            stories[str(story_id)] = story

    return {"category": "newest", "stories": stories}


async def fetch_category_api_data(category: str, limit: int = 30) -> Dict[str, Any]:
"""
Fetch API data for a category page (ask, show, jobs).
Expand Down Expand Up @@ -290,3 +330,47 @@ async def fetch_user_api_data(username: str) -> Dict[str, Any]:
"user": user,
"submissions": user.get("submitted", [])[:30], # Limit to recent submissions
}


async def fetch_search_api_data(query: str, page: int = 0, hits_per_page: int = 30) -> Dict[str, Any]:
    """
    Fetch story search results from the HN Algolia API (search_by_date).

    Args:
        query: Search query string (raw text; encoding is handled by aiohttp).
        page: Zero-based results page number.
        hits_per_page: Number of results per page.

    Returns:
        Dict echoing the request parameters plus:
            "hits": list of result objects from Algolia,
            "nb_hits": total match count reported by Algolia (may be None).

    Raises:
        APIFetchError: On non-200 responses, transport failures, or a
            payload missing the required "hits" field.
    """
    await HackerNewsClient._rate_limit()
    # Let aiohttp/yarl build and percent-encode the query string rather than
    # interpolating a pre-quoted value into the URL by hand; this encodes all
    # four parameters uniformly and removes the manual quote_plus step.
    params = {
        "query": query,
        "page": page,
        "hitsPerPage": hits_per_page,
        "tags": "story",
    }
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                HN_ALGOLIA_SEARCH_API,
                params=params,
                timeout=aiohttp.ClientTimeout(total=15.0),
            ) as response:
                if response.status != 200:
                    raise APIFetchError(
                        f"Failed to fetch Algolia search data: status={response.status}",
                        source="hackernews",
                    )
                payload = await response.json()
    except APIFetchError:
        raise
    except Exception as e:
        raise APIFetchError(f"Algolia search request failed: {e}", source="hackernews") from e

    if not validate_api_response(payload, required_fields=["hits"], source="hackernews"):
        raise APIFetchError("Invalid Algolia search payload", source="hackernews")

    return {
        "query": query,
        "page": page,
        "hits_per_page": hits_per_page,
        "hits": payload.get("hits", []),
        "nb_hits": payload.get("nbHits"),
    }
32 changes: 26 additions & 6 deletions liveweb_arena/plugins/hackernews/hackernews.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
from liveweb_arena.plugins.base import BasePlugin
from .api_client import (
fetch_homepage_api_data,
fetch_newest_api_data,
fetch_category_api_data,
fetch_item_api_data,
fetch_user_api_data,
fetch_search_api_data,
)

# Per-evaluation state via contextvars.
Expand Down Expand Up @@ -53,6 +55,7 @@ class HackerNewsPlugin(BasePlugin):

allowed_domains = [
"news.ycombinator.com",
"hn.algolia.com",
]

@classmethod
Expand All @@ -77,7 +80,7 @@ def get_blocked_patterns(self) -> List[str]:
"""Block direct API access to force agents to use the website."""
return [
"*hacker-news.firebaseio.com*", # Block Firebase API
"*hn.algolia.com*", # Block Algolia search API
"*hn.algolia.com/api/*", # Block direct Algolia API calls
]

@classmethod
Expand Down Expand Up @@ -287,6 +290,13 @@ async def fetch_api_data(self, url: str) -> Dict[str, Any]:
parsed = urlparse(url)
host = parsed.netloc.lower()

# Algolia search page
if "hn.algolia.com" in host:
query = parse_qs(parsed.query)
search_q = query.get("q", [""])[0].strip()
page = int(query.get("page", ["0"])[0] or 0)
return await fetch_search_api_data(search_q, page=page, hits_per_page=30)

# Check if this is an external URL (not HN domain)
if "ycombinator.com" not in host:
return self._get_external_url_data(url)
Expand Down Expand Up @@ -318,8 +328,14 @@ async def fetch_api_data(self, url: str) -> Dict[str, Any]:
self._extract_external_urls(data)
return data

# Homepage (including news, newest, front, etc. - all show top stories)
if path in ("", "news", "newest", "front") or not path:
# Newest page
if path == "newest":
data = await fetch_newest_api_data()
self._extract_external_urls(data)
return data

# Homepage (top stories)
if path in ("", "news", "front") or not path:
data = await fetch_homepage_api_data()
self._extract_external_urls(data)
return data
Expand All @@ -340,6 +356,10 @@ def needs_api_data(self, url: str) -> bool:
parsed = urlparse(url)
host = parsed.netloc.lower()

# Algolia search pages need API data
if "hn.algolia.com" in host:
return True

# External URLs (non-HN) need data extraction for GT
if "ycombinator.com" not in host:
return self.is_legitimate_external_url(url)
Expand All @@ -359,7 +379,7 @@ def needs_api_data(self, url: str) -> bool:
if path in ("ask", "show", "jobs"):
return True

# Homepage needs API data
# Newest/homepage need API data
if path in ("", "news", "newest", "front") or not path:
return True

Expand All @@ -381,8 +401,8 @@ def is_url_allowed(self, url: str) -> bool:
parsed = urlparse(url)
host = parsed.netloc.lower()

# Always allow HN domain
if "ycombinator.com" in host:
# Always allow HN and Algolia search domains
if "ycombinator.com" in host or "hn.algolia.com" in host:
return True

# Allow legitimate external URLs from HN stories
Expand Down
8 changes: 8 additions & 0 deletions liveweb_arena/plugins/hackernews/templates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,18 @@
from .extrema_comparison import HackerNewsExtremaComparisonTemplate
from .category_comparison import HackerNewsCategoryComparisonTemplate
from .news_summary import HackerNewsNewsSummaryTemplate
from .recent_burst_count import HackerNewsRecentBurstCountTemplate
from .comment_tree_focus import HackerNewsCommentTreeFocusTemplate
from .keyword_scan_rank import HackerNewsKeywordScanRankTemplate
from .user_karma_gap import HackerNewsUserKarmaGapTemplate

# Public surface of the templates package: one export per template module
# imported above. These names are what `from ...templates import *` exposes.
__all__ = [
    "HackerNewsMultiConditionFilterTemplate",
    "HackerNewsExtremaComparisonTemplate",
    "HackerNewsCategoryComparisonTemplate",
    "HackerNewsNewsSummaryTemplate",
    "HackerNewsRecentBurstCountTemplate",
    "HackerNewsCommentTreeFocusTemplate",
    "HackerNewsKeywordScanRankTemplate",
    "HackerNewsUserKarmaGapTemplate",
]
Loading