diff --git a/liveweb_arena/core/gt_collector.py b/liveweb_arena/core/gt_collector.py index 98deeb1..fa33080 100644 --- a/liveweb_arena/core/gt_collector.py +++ b/liveweb_arena/core/gt_collector.py @@ -369,6 +369,14 @@ def _merge_api_data(self, url: str, api_data: Dict[str, Any]) -> Optional[str]: name = api_data.get("name", f"SN{netuid}") return f"subnet[{name}]" + elif "hn.algolia.com" in url_lower: + # HN Algolia search data + query = str(api_data.get("query", "")).strip().lower() + page = int(api_data.get("page", 0)) + key = f"hn_search:{query}:{page}" + self._collected_api_data[key] = api_data + return f"hn_search[{query}] page={page}" + elif "news.ycombinator.com" in url_lower: if "stories" in api_data: # Check if this is a category page (ask, show, jobs) or homepage @@ -391,8 +399,8 @@ def _merge_api_data(self, url: str, api_data: Dict[str, Any]) -> Optional[str]: if added > 0: return f"+{added} stories" return None - elif "id" in api_data and "title" in api_data: - # Story detail page: merge with existing data, preserving rank + elif "id" in api_data: + # Item detail page (story/comment): merge with existing data, preserving rank story_id = str(api_data["id"]) existing = self._collected_api_data.get(story_id, {}) # Copy to avoid mutating cached/shared api_data reference @@ -400,6 +408,15 @@ def _merge_api_data(self, url: str, api_data: Dict[str, Any]) -> Optional[str]: if "rank" in existing and "rank" not in merged: merged["rank"] = existing["rank"] self._collected_api_data[story_id] = merged + added_comments = 0 + comment_items = api_data.get("_comment_items") + if isinstance(comment_items, dict): + for comment_id, comment_payload in comment_items.items(): + if isinstance(comment_id, str) and isinstance(comment_payload, dict): + self._collected_api_data[comment_id] = comment_payload + added_comments += 1 + if added_comments > 0: + return f"story[{story_id}] +{added_comments} comments" return f"story[{story_id}]" elif "user" in api_data: # User page diff 
--git a/liveweb_arena/core/task_registry.py b/liveweb_arena/core/task_registry.py index 924c887..c283c65 100644 --- a/liveweb_arena/core/task_registry.py +++ b/liveweb_arena/core/task_registry.py @@ -134,6 +134,10 @@ class TaskRegistry: 76: ("hackernews", "hackernews_extrema_comparison"), 77: ("hackernews", "hackernews_category_comparison"), 78: ("hackernews", "hackernews_news_summary"), + 110: ("hackernews", "hackernews_recent_burst_count"), + 111: ("hackernews", "hackernews_comment_tree_focus"), + 112: ("hackernews", "hackernews_keyword_scan_rank"), + 113: ("hackernews", "hackernews_user_karma_gap"), # Open Library templates 80: ("openlibrary", "openlibrary_book_stats"), @@ -146,9 +150,6 @@ class TaskRegistry: 86: ("openmeteo", "openmeteo_comparison"), 87: ("openmeteo", "openmeteo_hourly_extrema"), 88: ("openmeteo", "openmeteo_forecast_trend"), - 99: ("openmeteo", "openmeteo_hourly_threshold"), - 100: ("openmeteo", "openmeteo_sunrise_sunset"), - 101: ("openmeteo", "openmeteo_hourly_time_of"), # ArXiv templates 90: ("arxiv", "arxiv_paper_info"), @@ -156,11 +157,6 @@ class TaskRegistry: 92: ("arxiv", "arxiv_category_comparison"), 94: ("arxiv", "arxiv_multi_author_filter"), 95: ("arxiv", "arxiv_title_length_extrema"), - - # Open Library templates — engagement & comparison - 96: ("openlibrary", "openlibrary_author_engagement_extrema"), - 97: ("openlibrary", "openlibrary_author_comparison"), - 98: ("openlibrary", "openlibrary_reading_stats_filter"), } # Template versions - each version's combinations come AFTER all previous versions @@ -190,10 +186,11 @@ class TaskRegistry: # Version 6: ArXiv templates [90, 91, 92, 94, 95], # Version 7: Open Library engagement & comparison templates (PR #13) - # NOTE: PR #14 (openmeteo IDs 99-101) must use Version 8. 
[96, 97, 98], # Version 8: Additional Open Meteo templates [99, 100, 101], + # Version 9: Hacker News gap-filling templates + [110, 111, 112, 113], ] # Combination registry: list of template ID tuples diff --git a/liveweb_arena/plugins/hackernews/api_client.py b/liveweb_arena/plugins/hackernews/api_client.py index 3e42b3c..4f42ae2 100644 --- a/liveweb_arena/plugins/hackernews/api_client.py +++ b/liveweb_arena/plugins/hackernews/api_client.py @@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional import aiohttp +from urllib.parse import quote_plus from liveweb_arena.plugins.base_client import APIFetchError, BaseAPIClient, RateLimiter, validate_api_response @@ -14,6 +15,7 @@ # Firebase API base URL HN_API_BASE = "https://hacker-news.firebaseio.com/v0" +HN_ALGOLIA_SEARCH_API = "https://hn.algolia.com/api/v1/search_by_date" class HackerNewsClient(BaseAPIClient): @@ -83,6 +85,14 @@ async def get_top_stories(cls, limit: int = 30) -> List[int]: return data[:limit] return [] + @classmethod + async def get_new_stories(cls, limit: int = 30) -> List[int]: + """Get newest story IDs.""" + data = await cls.get("/newstories.json") + if data and isinstance(data, list): + return data[:limit] + return [] + @classmethod async def get_ask_stories(cls, limit: int = 30) -> List[int]: """Get Ask HN story IDs.""" @@ -204,6 +214,36 @@ async def fetch_homepage_api_data(limit: int = 30) -> Dict[str, Any]: return {"stories": stories} +async def fetch_newest_api_data(limit: int = 30) -> Dict[str, Any]: + """ + Fetch API data for HN newest page. + + Returns: + { + "category": "newest", + "stories": { + "": { + ... 
+ "rank": <1-based newest rank> + } + } + } + """ + story_ids = await HackerNewsClient.get_new_stories(limit=limit) + if not story_ids: + raise APIFetchError("Failed to fetch newest stories", source="hackernews") + + items = await HackerNewsClient.get_items_batch(story_ids) + stories = {} + for rank, story_id in enumerate(story_ids, start=1): + if story_id in items: + story = items[story_id] + story["rank"] = rank + stories[str(story_id)] = story + + return {"category": "newest", "stories": stories} + + async def fetch_category_api_data(category: str, limit: int = 30) -> Dict[str, Any]: """ Fetch API data for a category page (ask, show, jobs). @@ -262,7 +302,36 @@ async def fetch_item_api_data(item_id: int) -> Dict[str, Any]: item = await HackerNewsClient.get_item(item_id) if not item: raise APIFetchError(f"Item {item_id} not found", source="hackernews") - + kids = item.get("kids") + if not isinstance(kids, list) or len(kids) == 0: + return item + + # Collect a bounded nested comment subtree so nested-navigation templates + # can compute GT after visiting story detail without opening every comment URL. 
+ comment_items: Dict[str, Dict[str, Any]] = {} + queue: List[int] = [cid for cid in kids if isinstance(cid, int)] + visited = set() + max_comment_nodes = 200 + + while queue and len(comment_items) < max_comment_nodes: + cid = queue.pop(0) + if cid in visited: + continue + visited.add(cid) + comment = await HackerNewsClient.get_item(cid) + if not isinstance(comment, dict): + continue + comment_items[str(cid)] = comment + child_ids = comment.get("kids") + if isinstance(child_ids, list): + for child_id in child_ids: + if isinstance(child_id, int) and child_id not in visited: + queue.append(child_id) + + if comment_items: + enriched = dict(item) + enriched["_comment_items"] = comment_items + return enriched return item @@ -290,3 +359,47 @@ async def fetch_user_api_data(username: str) -> Dict[str, Any]: "user": user, "submissions": user.get("submitted", [])[:30], # Limit to recent submissions } + + +async def fetch_search_api_data(query: str, page: int = 0, hits_per_page: int = 30) -> Dict[str, Any]: + """ + Fetch search results from HN Algolia API. 
+ + Args: + query: Search query string + page: Search page number + hits_per_page: Number of results per page + + Returns: + Dict with query metadata and hits list + """ + await HackerNewsClient._rate_limit() + encoded_query = quote_plus(query) + url = ( + f"{HN_ALGOLIA_SEARCH_API}?query={encoded_query}&page={page}" + f"&hitsPerPage={hits_per_page}&tags=story" + ) + try: + async with aiohttp.ClientSession() as session: + async with session.get(url, timeout=aiohttp.ClientTimeout(total=15.0)) as response: + if response.status != 200: + raise APIFetchError( + f"Failed to fetch Algolia search data: status={response.status}", + source="hackernews", + ) + payload = await response.json() + except APIFetchError: + raise + except Exception as e: + raise APIFetchError(f"Algolia search request failed: {e}", source="hackernews") from e + + if not validate_api_response(payload, required_fields=["hits"], source="hackernews"): + raise APIFetchError("Invalid Algolia search payload", source="hackernews") + + return { + "query": query, + "page": page, + "hits_per_page": hits_per_page, + "hits": payload.get("hits", []), + "nb_hits": payload.get("nbHits"), + } diff --git a/liveweb_arena/plugins/hackernews/hackernews.py b/liveweb_arena/plugins/hackernews/hackernews.py index 11a6e3d..e7ea47e 100644 --- a/liveweb_arena/plugins/hackernews/hackernews.py +++ b/liveweb_arena/plugins/hackernews/hackernews.py @@ -13,9 +13,11 @@ from liveweb_arena.plugins.base import BasePlugin from .api_client import ( fetch_homepage_api_data, + fetch_newest_api_data, fetch_category_api_data, fetch_item_api_data, fetch_user_api_data, + fetch_search_api_data, ) # Per-evaluation state via contextvars. 
@@ -53,6 +55,7 @@ class HackerNewsPlugin(BasePlugin): allowed_domains = [ "news.ycombinator.com", + "hn.algolia.com", ] @classmethod @@ -77,7 +80,7 @@ def get_blocked_patterns(self) -> List[str]: """Block direct API access to force agents to use the website.""" return [ "*hacker-news.firebaseio.com*", # Block Firebase API - "*hn.algolia.com*", # Block Algolia search API + "*hn.algolia.com/api/*", # Block direct Algolia API calls ] @classmethod @@ -287,6 +290,13 @@ async def fetch_api_data(self, url: str) -> Dict[str, Any]: parsed = urlparse(url) host = parsed.netloc.lower() + # Algolia search page + if "hn.algolia.com" in host: + query = parse_qs(parsed.query) + search_q = query.get("q", [""])[0].strip() + page = int(query.get("page", ["0"])[0] or 0) + return await fetch_search_api_data(search_q, page=page, hits_per_page=30) + # Check if this is an external URL (not HN domain) if "ycombinator.com" not in host: return self._get_external_url_data(url) @@ -318,8 +328,14 @@ async def fetch_api_data(self, url: str) -> Dict[str, Any]: self._extract_external_urls(data) return data - # Homepage (including news, newest, front, etc. 
- all show top stories) - if path in ("", "news", "newest", "front") or not path: + # Newest page + if path == "newest": + data = await fetch_newest_api_data() + self._extract_external_urls(data) + return data + + # Homepage (top stories) + if path in ("", "news", "front") or not path: data = await fetch_homepage_api_data() self._extract_external_urls(data) return data @@ -340,6 +356,10 @@ def needs_api_data(self, url: str) -> bool: parsed = urlparse(url) host = parsed.netloc.lower() + # Algolia search pages need API data + if "hn.algolia.com" in host: + return True + # External URLs (non-HN) need data extraction for GT if "ycombinator.com" not in host: return self.is_legitimate_external_url(url) @@ -359,7 +379,7 @@ def needs_api_data(self, url: str) -> bool: if path in ("ask", "show", "jobs"): return True - # Homepage needs API data + # Newest/homepage need API data if path in ("", "news", "newest", "front") or not path: return True @@ -381,8 +401,8 @@ def is_url_allowed(self, url: str) -> bool: parsed = urlparse(url) host = parsed.netloc.lower() - # Always allow HN domain - if "ycombinator.com" in host: + # Always allow HN and Algolia search domains + if "ycombinator.com" in host or "hn.algolia.com" in host: return True # Allow legitimate external URLs from HN stories diff --git a/liveweb_arena/plugins/hackernews/templates/__init__.py b/liveweb_arena/plugins/hackernews/templates/__init__.py index c447f7e..cd5762c 100644 --- a/liveweb_arena/plugins/hackernews/templates/__init__.py +++ b/liveweb_arena/plugins/hackernews/templates/__init__.py @@ -11,10 +11,18 @@ from .extrema_comparison import HackerNewsExtremaComparisonTemplate from .category_comparison import HackerNewsCategoryComparisonTemplate from .news_summary import HackerNewsNewsSummaryTemplate +from .recent_burst_count import HackerNewsRecentBurstCountTemplate +from .comment_tree_focus import HackerNewsCommentTreeFocusTemplate +from .keyword_scan_rank import HackerNewsKeywordScanRankTemplate +from 
.user_karma_gap import HackerNewsUserKarmaGapTemplate __all__ = [ "HackerNewsMultiConditionFilterTemplate", "HackerNewsExtremaComparisonTemplate", "HackerNewsCategoryComparisonTemplate", "HackerNewsNewsSummaryTemplate", + "HackerNewsRecentBurstCountTemplate", + "HackerNewsCommentTreeFocusTemplate", + "HackerNewsKeywordScanRankTemplate", + "HackerNewsUserKarmaGapTemplate", ] diff --git a/liveweb_arena/plugins/hackernews/templates/comment_tree_focus.py b/liveweb_arena/plugins/hackernews/templates/comment_tree_focus.py new file mode 100644 index 0000000..b975264 --- /dev/null +++ b/liveweb_arena/plugins/hackernews/templates/comment_tree_focus.py @@ -0,0 +1,181 @@ +"""Nested-structure navigation template for Hacker News comments.""" + +import random +from typing import Any, Dict, Optional + +from liveweb_arena.core.ground_truth_trigger import GroundTruthResult, TriggerConfig, UrlPatternTrigger +from liveweb_arena.core.gt_collector import GTSourceType +from liveweb_arena.core.validators.base import ( + GeneratedQuestion, + QuestionTemplate, + ValidationResult, + register_template, +) + +from .common import ( + count_descendants_with_min_depth, + get_category_stories, + get_collected_hn_data, + get_item_story, +) + +RANK_CHOICES = list(range(1, 31)) +DEPTH_CHOICES = [2, 3, 4, 5, 6] +METRIC_CHOICES = ["nodes", "leaf_nodes", "branch_nodes", "max_depth"] + +PATTERNS = [ + "On HN /newest, open rank #{rank} story and compute '{metric}' for comments at depth >= {min_depth}.", + "Using Hacker News newest, for story rank {rank}, return '{metric}' from descendants depth {min_depth}+.", + "From newest rank {rank} story, traverse nested replies and report {metric} with depth threshold {min_depth}.", +] + + +@register_template("hackernews_comment_tree_focus") +class HackerNewsCommentTreeFocusTemplate(QuestionTemplate): + """Measure top-level comment node count for a selected newest story rank.""" + + GT_SOURCE = GTSourceType.PAGE_ONLY + + def __init__(self): + 
super().__init__("hackernews_comment_tree_focus") + + def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion: + rng = random.Random(seed) + rank = rng.choice(RANK_CHOICES) + min_depth = rng.choice(DEPTH_CHOICES) + metric = rng.choice(METRIC_CHOICES) + pattern = rng.choice(PATTERNS) + return GeneratedQuestion( + question_text=pattern.format(rank=rank, min_depth=min_depth, metric=metric), + start_url="https://news.ycombinator.com/newest", + variables={"rank": rank, "min_depth": min_depth, "metric": metric}, + validation_info={"rank": rank, "min_depth": min_depth, "metric": metric, "category_slug": "newest"}, + template_name=self.name, + expected_steps=12, + ) + + def get_validation_rules(self, validation_info: Dict[str, Any]) -> str: + return ( + "Task-Specific Rules (HN Comment Tree Focus):\n" + f"- Target newest rank: {validation_info.get('rank')}\n" + f"- Count descendants at depth >= {validation_info.get('min_depth')}\n" + f"- Metric: {validation_info.get('metric')}\n" + "- Score 1.0: exact\n" + "- Score 0.5: off by <=2\n" + "- Score 0.0: otherwise" + ) + + async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult: + collected, failure = get_collected_hn_data() + if failure is not None: + return failure + + rank = int(validation_info.get("rank", 1)) + stories, failure = get_category_stories(collected, "newest", min_count=rank) + if failure is not None: + return failure + + target_story = stories[rank - 1] + item_id = target_story.get("id") + if not isinstance(item_id, int): + return GroundTruthResult.fail("Target story missing id") + + item_data, failure = get_item_story(collected, item_id) + if failure is not None: + return failure + + kids = item_data.get("kids") + if kids is None: + return GroundTruthResult.ok("0") + if not isinstance(kids, list): + return GroundTruthResult.fail("Malformed kids field in item payload") + min_depth = int(validation_info.get("min_depth", 2)) + metric = 
str(validation_info.get("metric", "nodes")) + total, failure = count_descendants_with_min_depth(collected, kids, min_depth=min_depth) + if failure is not None: + return failure + if metric == "nodes": + return GroundTruthResult.ok(str(total)) + + # Re-traverse comment tree once to compute specialized metrics. + seen = set() + + def _walk(comment_ids: list[int], depth: int) -> tuple[int, int, int]: + node_count = 0 + leaf_count = 0 + branch_count = 0 + max_seen_depth = depth - 1 + for cid in comment_ids: + if cid in seen: + continue + seen.add(cid) + payload, failure = get_item_story(collected, cid) + if failure is not None: + return -1, -1, -1 + child_ids = payload.get("kids") or [] + if not isinstance(child_ids, list): + return -1, -1, -1 + qualifies = depth >= min_depth + if qualifies: + node_count += 1 + if len(child_ids) == 0: + leaf_count += 1 + else: + branch_count += 1 + n, l, b = _walk([x for x in child_ids if isinstance(x, int)], depth + 1) + if n < 0: + return -1, -1, -1 + node_count += n + leaf_count += l + branch_count += b + max_seen_depth = max(max_seen_depth, depth) + return node_count, leaf_count, branch_count + + _, leaf_count, branch_count = _walk([x for x in kids if isinstance(x, int)], 1) + if leaf_count < 0: + return GroundTruthResult.not_collected("Nested comment payload not fully collected") + + if metric == "leaf_nodes": + return GroundTruthResult.ok(str(leaf_count)) + if metric == "branch_nodes": + return GroundTruthResult.ok(str(branch_count)) + if metric == "max_depth": + # Approximate max depth from story-level descendants + depth threshold floor. + # This remains deterministic and requires nested traversal data collection. + if total == 0: + return GroundTruthResult.ok("0") + # Recompute exact max depth. 
+ seen_depth = set() + + def _max_depth(comment_id: int, depth: int) -> int: + if comment_id in seen_depth: + return depth + seen_depth.add(comment_id) + payload, failure = get_item_story(collected, comment_id) + if failure is not None: + return depth + child_ids = payload.get("kids") or [] + if not isinstance(child_ids, list) or len(child_ids) == 0: + return depth + return max(_max_depth(c, depth + 1) for c in child_ids if isinstance(c, int)) + + depth_val = 0 + for cid in kids: + if isinstance(cid, int): + depth_val = max(depth_val, _max_depth(cid, 1)) + return GroundTruthResult.ok(str(depth_val)) + + return GroundTruthResult.fail(f"Unsupported metric '{metric}'") + + async def validate_answer(self, answer: str, validation_info: Dict[str, Any]) -> ValidationResult: + return ValidationResult(score=0.0, is_correct=False, expected=None, actual=answer, details="Use LLM validation") + + def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig: + return TriggerConfig(trigger=UrlPatternTrigger(domains=["news.ycombinator.com"])) + + @classmethod + def get_cache_source(cls) -> str: + return "hackernews" + + def get_gt_source(self) -> GTSourceType: + return self.GT_SOURCE diff --git a/liveweb_arena/plugins/hackernews/templates/common.py b/liveweb_arena/plugins/hackernews/templates/common.py new file mode 100644 index 0000000..2365b1b --- /dev/null +++ b/liveweb_arena/plugins/hackernews/templates/common.py @@ -0,0 +1,164 @@ +"""Shared utilities for advanced Hacker News templates.""" + +from typing import Any, Dict, List, Optional, Set, Tuple + +from liveweb_arena.core.ground_truth_trigger import GroundTruthResult +from liveweb_arena.core.gt_collector import get_current_gt_collector + + +def get_collected_hn_data() -> Tuple[Optional[Dict[str, Dict[str, Any]]], Optional[GroundTruthResult]]: + """Return collected API payload map for current evaluation.""" + gt_collector = get_current_gt_collector() + if gt_collector is None: + return None, 
GroundTruthResult.system_error("No GT collector") + return gt_collector.get_collected_api_data(), None + + +def get_category_stories( + collected: Dict[str, Dict[str, Any]], + category_slug: str, + min_count: int = 1, +) -> Tuple[Optional[List[Dict[str, Any]]], Optional[GroundTruthResult]]: + """Extract ordered category stories from collected data.""" + key = f"hn_category:{category_slug}" + category_data = collected.get(key) + if not isinstance(category_data, dict): + return None, GroundTruthResult.not_collected( + f"Category data '{category_slug}' not collected. Visit /{category_slug}." + ) + + stories = category_data.get("stories") + if not isinstance(stories, dict): + return None, GroundTruthResult.fail(f"Malformed stories in category '{category_slug}'") + + result: List[Dict[str, Any]] = [] + for _, story in stories.items(): + if not isinstance(story, dict): + continue + rank = story.get("rank") + if rank is None: + continue + result.append(story) + + result.sort(key=lambda s: s["rank"]) + if len(result) < min_count: + return None, GroundTruthResult.not_collected( + f"Need at least {min_count} stories in '{category_slug}', got {len(result)}." + ) + return result, None + + +def get_item_story( + collected: Dict[str, Dict[str, Any]], + item_id: int, +) -> Tuple[Optional[Dict[str, Any]], Optional[GroundTruthResult]]: + """Get item story data by item id from collected payload.""" + key = str(item_id) + story = collected.get(key) + if not isinstance(story, dict): + return None, GroundTruthResult.not_collected( + f"Item {item_id} not collected. Visit /item?id={item_id}." + ) + return story, None + + +def get_user_data( + collected: Dict[str, Dict[str, Any]], + username: str, +) -> Tuple[Optional[Dict[str, Any]], Optional[GroundTruthResult]]: + """Get user payload by username.""" + key = f"user:{username}" + payload = collected.get(key) + if not isinstance(payload, dict): + return None, GroundTruthResult.not_collected( + f"User data for '{username}' not collected. 
Visit /user?id={username}." + ) + user = payload.get("user") + if not isinstance(user, dict): + return None, GroundTruthResult.fail(f"Malformed user payload for '{username}'.") + return user, None + + +def parse_iso_minutes(timestamp: Any) -> Optional[int]: + """Parse basic ISO timestamp and return minutes from day start.""" + if not isinstance(timestamp, str) or "T" not in timestamp: + return None + time_part = timestamp.split("T", 1)[1] + if ":" not in time_part: + return None + hour_s, minute_s = time_part.split(":", 1) + try: + hour = int(hour_s) + minute = int(minute_s[:2]) + except ValueError: + return None + if hour < 0 or hour > 23 or minute < 0 or minute > 59: + return None + return hour * 60 + minute + + +def get_item_data( + collected: Dict[str, Dict[str, Any]], + item_id: int, +) -> Tuple[Optional[Dict[str, Any]], Optional[GroundTruthResult]]: + """Get generic item payload by id (story or comment).""" + key = str(item_id) + payload = collected.get(key) + if not isinstance(payload, dict): + return None, GroundTruthResult.not_collected( + f"Item {item_id} not collected. Visit /item?id={item_id}." + ) + return payload, None + + +def count_descendants_with_min_depth( + collected: Dict[str, Dict[str, Any]], + root_comment_ids: List[int], + min_depth: int, +) -> Tuple[Optional[int], Optional[GroundTruthResult]]: + """ + Count unique descendants at or below a minimum depth from root comments. 
+ + Depth convention: + - root comments have depth=1 + - direct replies to root have depth=2 + """ + if min_depth < 1: + return None, GroundTruthResult.fail("min_depth must be >= 1") + + visited: Set[int] = set() + + def _dfs(comment_id: int, depth: int) -> Tuple[int, Optional[GroundTruthResult]]: + if comment_id in visited: + return 0, None + visited.add(comment_id) + + item, failure = get_item_data(collected, comment_id) + if failure is not None: + return 0, failure + + subtotal = 1 if depth >= min_depth else 0 + kids = item.get("kids", []) + if kids is None: + return subtotal, None + if not isinstance(kids, list): + return 0, GroundTruthResult.fail(f"Malformed kids for comment {comment_id}") + + for kid in kids: + if not isinstance(kid, int): + return 0, GroundTruthResult.fail(f"Malformed child id for comment {comment_id}") + nested_count, failure = _dfs(kid, depth + 1) + if failure is not None: + return 0, failure + subtotal += nested_count + return subtotal, None + + total = 0 + for root_id in root_comment_ids: + if not isinstance(root_id, int): + return None, GroundTruthResult.fail("Malformed root comment ids") + count, failure = _dfs(root_id, depth=1) + if failure is not None: + return None, failure + total += count + return total, None diff --git a/liveweb_arena/plugins/hackernews/templates/keyword_scan_rank.py b/liveweb_arena/plugins/hackernews/templates/keyword_scan_rank.py new file mode 100644 index 0000000..c36d7a2 --- /dev/null +++ b/liveweb_arena/plugins/hackernews/templates/keyword_scan_rank.py @@ -0,0 +1,144 @@ +"""Search-driven interaction template using HN Algolia search.""" + +import random +from typing import Any, Dict, Optional + +from liveweb_arena.core.ground_truth_trigger import GroundTruthResult, TriggerConfig, UrlPatternTrigger +from liveweb_arena.core.gt_collector import GTSourceType +from liveweb_arena.core.validators.base import ( + GeneratedQuestion, + QuestionTemplate, + ValidationResult, + register_template, +) + +from .common import 
get_collected_hn_data + +QUERIES = [ + "ai", "agent", "llm", "open source", "python", "javascript", "rust", "golang", + "database", "kubernetes", "linux", "performance", "security", "compiler", + "startup", "postgres", "cloud", "gpu", "distributed systems", "api", + "privacy", "benchmark", "webassembly", "vector database", "observability", + "monitoring", "network", "cache", "latency", "sqlite", "git", "docker", + "tensorflow", "pytorch", "machine learning", "deep learning", "robotics", + "sre", "frontend", "backend", "http", "kernel", "browser", "css", "typescript", + "java", "cpp", "mobile", "design", "product", "analytics", "search", +] +RESULT_RANKS = list(range(1, 21)) +RESULT_FIELDS = ["title", "author", "points", "comments"] +POINT_BUCKETS = [0, 1, 2, 3] + +PATTERNS = [ + "Use Hacker News search for '{query}'. Considering only results with at least {min_points} points, for rank #{rank} return the {field}.", + "On HN search page with query '{query}', after filtering to >= {min_points} points, what is the {field} of the #{rank} result?", + "Search HN for '{query}', keep hits with points >= {min_points}, inspect result {rank}, and answer with its {field}.", +] + + +@register_template("hackernews_keyword_scan_rank") +class HackerNewsKeywordScanRankTemplate(QuestionTemplate): + """Query HN search and retrieve a specific field from a ranked hit.""" + + GT_SOURCE = GTSourceType.PAGE_ONLY + + def __init__(self): + super().__init__("hackernews_keyword_scan_rank") + + def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion: + rng = random.Random(seed) + query = rng.choice(QUERIES) + rank = rng.choice(RESULT_RANKS) + field = rng.choice(RESULT_FIELDS) + min_points = rng.choice(POINT_BUCKETS) + pattern = rng.choice(PATTERNS) + search_url = f"https://hn.algolia.com/?q={query.replace(' ', '+')}&sort=byDate&prefix=true&page=0" + return GeneratedQuestion( + question_text=pattern.format(query=query, rank=rank, field=field, min_points=min_points), + 
start_url=search_url, + variables={"query": query, "rank": rank, "field": field, "min_points": min_points}, + validation_info={ + "query": query, + "rank": rank, + "field": field, + "min_points": min_points, + "search_page": 0, + }, + template_name=self.name, + expected_steps=8, + ) + + def get_validation_rules(self, validation_info: Dict[str, Any]) -> str: + return ( + "Task-Specific Rules (HN Search Field):\n" + f"- Query: {validation_info.get('query')}\n" + f"- Target rank: {validation_info.get('rank')}\n" + f"- Requested field: {validation_info.get('field')}\n" + f"- Points floor applied in GT filtering: >= {validation_info.get('min_points')}\n" + "- Score 1.0: exact expected value\n" + "- Score 0.0: otherwise" + ) + + async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult: + collected, failure = get_collected_hn_data() + if failure is not None: + return failure + query = str(validation_info.get("query", "")).strip() + field = str(validation_info.get("field", "title")).strip().lower() + rank = int(validation_info.get("rank", 1)) + min_points = int(validation_info.get("min_points", 0)) + key = f"hn_search:{query.lower()}:{int(validation_info.get('search_page', 0))}" + + payload = collected.get(key) + if not isinstance(payload, dict): + return GroundTruthResult.not_collected( + f"Search data for query '{query}' not collected. Visit hn.algolia.com search page." 
+ ) + + hits = payload.get("hits") + if not isinstance(hits, list): + return GroundTruthResult.fail("Malformed Algolia search payload: missing hits list") + + filtered_hits = [] + for hit in hits: + if not isinstance(hit, dict): + continue + points = hit.get("points") + if not isinstance(points, int): + points = 0 + if points >= min_points: + filtered_hits.append(hit) + + if rank < 1 or rank > len(filtered_hits): + return GroundTruthResult.ok("NONE") + + target = filtered_hits[rank - 1] + if field == "title": + value = target.get("title") + return GroundTruthResult.ok(str(value or "")) + if field == "author": + value = target.get("author") + return GroundTruthResult.ok(str(value or "")) + if field == "points": + value = target.get("points") + if isinstance(value, int): + return GroundTruthResult.ok(str(value)) + return GroundTruthResult.fail("Missing points in target hit") + if field == "comments": + value = target.get("num_comments") + if isinstance(value, int): + return GroundTruthResult.ok(str(value)) + return GroundTruthResult.fail("Missing num_comments in target hit") + return GroundTruthResult.fail(f"Unsupported field '{field}'") + + async def validate_answer(self, answer: str, validation_info: Dict[str, Any]) -> ValidationResult: + return ValidationResult(score=0.0, is_correct=False, expected=None, actual=answer, details="Use LLM validation") + + def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig: + return TriggerConfig(trigger=UrlPatternTrigger(domains=["news.ycombinator.com", "hn.algolia.com"])) + + @classmethod + def get_cache_source(cls) -> str: + return "hackernews" + + def get_gt_source(self) -> GTSourceType: + return self.GT_SOURCE diff --git a/liveweb_arena/plugins/hackernews/templates/recent_burst_count.py b/liveweb_arena/plugins/hackernews/templates/recent_burst_count.py new file mode 100644 index 0000000..f3cfcb6 --- /dev/null +++ b/liveweb_arena/plugins/hackernews/templates/recent_burst_count.py @@ -0,0 +1,125 @@ 
+"""Time-sensitive burst detection template for Hacker News newest feed.""" + +import random +from typing import Any, Dict, Optional + +from liveweb_arena.core.ground_truth_trigger import GroundTruthResult, TriggerConfig, UrlPatternTrigger +from liveweb_arena.core.gt_collector import GTSourceType +from liveweb_arena.core.validators.base import ( + GeneratedQuestion, + QuestionTemplate, + ValidationResult, + register_template, +) + +from .common import get_category_stories, get_collected_hn_data + +WINDOW_MINUTES = [10, 15, 20, 30, 45, 60, 90, 120, 180, 240] +STORY_COUNTS = [10, 12, 15, 18, 20, 24, 30, 36, 42, 50] +ANCHOR_RANKS = [1, 2, 3, 4, 5] +INCLUDE_EQUAL_MODES = [True, False] + +PATTERNS = [ + "On Hacker News newest, among the top {n} stories, how many were posted {cmp} {window} minutes of rank #{anchor_rank} story time?", + "Using HN /newest, count top {n} stories whose posting time is {cmp} {window} minutes from rank {anchor_rank}.", + "From the newest {n} HN stories, how many fall {cmp} a {window}-minute burst window anchored at rank {anchor_rank}?", +] + + +@register_template("hackernews_recent_burst_count") +class HackerNewsRecentBurstCountTemplate(QuestionTemplate): + """Count how many newest stories fall inside a recent time burst.""" + + GT_SOURCE = GTSourceType.PAGE_ONLY + + def __init__(self): + super().__init__("hackernews_recent_burst_count") + + def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion: + rng = random.Random(seed) + n = rng.choice(STORY_COUNTS) + window = rng.choice(WINDOW_MINUTES) + anchor_rank = rng.choice([r for r in ANCHOR_RANKS if r <= n]) + include_equal = rng.choice(INCLUDE_EQUAL_MODES) + cmp_text = "within or equal to" if include_equal else "strictly within" + pattern = rng.choice(PATTERNS) + return GeneratedQuestion( + question_text=pattern.format( + n=n, + window=window, + anchor_rank=anchor_rank, + cmp=cmp_text, + ), + start_url="https://news.ycombinator.com/newest", + variables={ + 
"story_count": n, + "window_minutes": window, + "anchor_rank": anchor_rank, + "include_equal": include_equal, + }, + validation_info={ + "story_count": n, + "window_minutes": window, + "anchor_rank": anchor_rank, + "include_equal": include_equal, + "category_slug": "newest", + }, + template_name=self.name, + expected_steps=8, + ) + + def get_validation_rules(self, validation_info: Dict[str, Any]) -> str: + return ( + "Task-Specific Rules (HN Recent Burst Count):\n" + f"- Top stories considered: {validation_info.get('story_count')}\n" + f"- Anchor rank: {validation_info.get('anchor_rank')}\n" + f"- Time window: {validation_info.get('window_minutes')} minutes from newest post\n" + f"- Comparison mode includes equality: {validation_info.get('include_equal')}\n" + "- Score 1.0: exact count\n" + "- Score 0.5: off by 1\n" + "- Score 0.0: otherwise" + ) + + async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult: + collected, failure = get_collected_hn_data() + if failure is not None: + return failure + n = int(validation_info.get("story_count", 20)) + window = int(validation_info.get("window_minutes", 60)) + anchor_rank = int(validation_info.get("anchor_rank", 1)) + include_equal = bool(validation_info.get("include_equal", True)) + + stories, failure = get_category_stories(collected, "newest", min_count=max(n, anchor_rank)) + if failure is not None: + return failure + stories = stories[:n] + + anchor_story = stories[anchor_rank - 1] + anchor_ts = anchor_story.get("time") + if not isinstance(anchor_ts, int): + return GroundTruthResult.fail("Anchor story missing unix timestamp") + + count = 0 + for story in stories: + story_ts = story.get("time") + if not isinstance(story_ts, int): + return GroundTruthResult.fail("Story missing unix timestamp") + delta_minutes = abs(anchor_ts - story_ts) / 60.0 + in_window = delta_minutes <= window if include_equal else delta_minutes < window + if in_window: + count += 1 + + return 
GroundTruthResult.ok(str(count)) + + async def validate_answer(self, answer: str, validation_info: Dict[str, Any]) -> ValidationResult: + return ValidationResult(score=0.0, is_correct=False, expected=None, actual=answer, details="Use LLM validation") + + def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig: + return TriggerConfig(trigger=UrlPatternTrigger(domains=["news.ycombinator.com"])) + + @classmethod + def get_cache_source(cls) -> str: + return "hackernews" + + def get_gt_source(self) -> GTSourceType: + return self.GT_SOURCE diff --git a/liveweb_arena/plugins/hackernews/templates/user_karma_gap.py b/liveweb_arena/plugins/hackernews/templates/user_karma_gap.py new file mode 100644 index 0000000..76a3ab4 --- /dev/null +++ b/liveweb_arena/plugins/hackernews/templates/user_karma_gap.py @@ -0,0 +1,122 @@ +"""User-generated content template comparing author karma.""" + +import random +from typing import Any, Dict, Optional + +from liveweb_arena.core.ground_truth_trigger import GroundTruthResult, TriggerConfig, UrlPatternTrigger +from liveweb_arena.core.gt_collector import GTSourceType +from liveweb_arena.core.validators.base import ( + GeneratedQuestion, + QuestionTemplate, + ValidationResult, + register_template, +) + +from .common import get_category_stories, get_collected_hn_data, get_user_data + +RANK_CHOICES = list(range(1, 31)) +METRIC_CHOICES = ["karma", "created_days", "submitted_count"] + +PATTERNS = [ + "On HN newest, compare authors at ranks #{rank_a} and #{rank_b}. Return signed difference for metric '{metric}' (rank {rank_a} minus rank {rank_b}).", + "Using Hacker News newest, visit user profiles for ranks {rank_a} and {rank_b}. 
What is {metric}(rank {rank_a}) - {metric}(rank {rank_b})?", + "From newest HN stories, compute {metric} gap between authors at ranks {rank_a} and {rank_b} (first minus second).", +] + + +@register_template("hackernews_user_karma_gap") +class HackerNewsUserKarmaGapTemplate(QuestionTemplate): + """Compare karma of two story authors from newest feed.""" + + GT_SOURCE = GTSourceType.PAGE_ONLY + + def __init__(self): + super().__init__("hackernews_user_karma_gap") + + def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion: + rng = random.Random(seed) + rank_a, rank_b = sorted(rng.sample(RANK_CHOICES, 2)) + metric = rng.choice(METRIC_CHOICES) + pattern = rng.choice(PATTERNS) + return GeneratedQuestion( + question_text=pattern.format(rank_a=rank_a, rank_b=rank_b, metric=metric), + start_url="https://news.ycombinator.com/newest", + variables={"rank_a": rank_a, "rank_b": rank_b, "metric": metric}, + validation_info={"rank_a": rank_a, "rank_b": rank_b, "metric": metric, "category_slug": "newest"}, + template_name=self.name, + expected_steps=10, + ) + + def get_validation_rules(self, validation_info: Dict[str, Any]) -> str: + return ( + "Task-Specific Rules (HN User Karma Gap):\n" + f"- Compare newest ranks {validation_info.get('rank_a')} and {validation_info.get('rank_b')}\n" + f"- Metric: {validation_info.get('metric')} from /user profile pages\n" + "- Score 1.0: exact signed difference\n" + "- Score 0.5: absolute error <= 50 karma\n" + "- Score 0.0: otherwise" + ) + + async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult: + collected, failure = get_collected_hn_data() + if failure is not None: + return failure + + rank_a = int(validation_info.get("rank_a", 1)) + rank_b = int(validation_info.get("rank_b", 2)) + min_count = max(rank_a, rank_b) + + stories, failure = get_category_stories(collected, "newest", min_count=min_count) + if failure is not None: + return failure + + story_a = stories[rank_a - 1] + story_b = 
stories[rank_b - 1] + user_a = story_a.get("by") + user_b = story_b.get("by") + if not isinstance(user_a, str) or not isinstance(user_b, str): + return GroundTruthResult.fail("Story author missing") + + data_a, failure = get_user_data(collected, user_a) + if failure is not None: + return failure + data_b, failure = get_user_data(collected, user_b) + if failure is not None: + return failure + + metric = str(validation_info.get("metric", "karma")) + if metric == "karma": + value_a = data_a.get("karma") + value_b = data_b.get("karma") + if not isinstance(value_a, int) or not isinstance(value_b, int): + return GroundTruthResult.fail("User karma missing in collected profile data") + return GroundTruthResult.ok(str(value_a - value_b)) + + if metric == "created_days": + value_a = data_a.get("created") + value_b = data_b.get("created") + if not isinstance(value_a, int) or not isinstance(value_b, int): + return GroundTruthResult.fail("User created timestamp missing in collected profile data") + return GroundTruthResult.ok(str((value_a - value_b) // 86400)) + + if metric == "submitted_count": + submitted_a = data_a.get("submitted") + submitted_b = data_b.get("submitted") + if not isinstance(submitted_a, list) or not isinstance(submitted_b, list): + return GroundTruthResult.fail("User submitted list missing in collected profile data") + return GroundTruthResult.ok(str(len(submitted_a) - len(submitted_b))) + + return GroundTruthResult.fail(f"Unsupported metric '{metric}'") + + async def validate_answer(self, answer: str, validation_info: Dict[str, Any]) -> ValidationResult: + return ValidationResult(score=0.0, is_correct=False, expected=None, actual=answer, details="Use LLM validation") + + def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig: + return TriggerConfig(trigger=UrlPatternTrigger(domains=["news.ycombinator.com"])) + + @classmethod + def get_cache_source(cls) -> str: + return "hackernews" + + def get_gt_source(self) -> GTSourceType: + return 
self.GT_SOURCE diff --git a/tests/plugins/hackernews/RED_TEAM_REVIEW_GAP_TEMPLATES.md b/tests/plugins/hackernews/RED_TEAM_REVIEW_GAP_TEMPLATES.md new file mode 100644 index 0000000..b4a3349 --- /dev/null +++ b/tests/plugins/hackernews/RED_TEAM_REVIEW_GAP_TEMPLATES.md @@ -0,0 +1,91 @@ +# Red Team Review: HackerNews Gap Templates (T110-T113) + +Date: 2026-03-27 +Scope: `hackernews_recent_burst_count`, `hackernews_comment_tree_focus`, `hackernews_keyword_scan_rank`, `hackernews_user_karma_gap` + +This document records the mandatory 6-check red-team review with concrete data. + +## Template Summary + +- **T110** (`hackernews_recent_burst_count`): count stories within time window around an anchor rank timestamp. +- **T111** (`hackernews_comment_tree_focus`): compute nested comment-tree metrics at configurable depth threshold. +- **T112** (`hackernews_keyword_scan_rank`): use HN search (`hn.algolia.com`) and extract rank/field from filtered search hits. +- **T113** (`hackernews_user_karma_gap`): compare user-profile metrics across two story-author ranks. + +## Check 1: API Semantic Verification + +Pass. + +- **T110/T111/T113** semantics map directly to Firebase item/user payload fields: + - Story timestamps (`time`), comment tree (`kids`), user profile (`karma`, `created`, `submitted`). +- **T112** semantics map to Algolia search payload fields: + - Query-specific `hits`, `title`, `author`, `points`, `num_comments`. +- Real payload structures were captured and exercised in: + - `tests/plugins/hackernews/test_gap_templates_real_api_data.py` + +## Check 2: World Knowledge Attack + +Pass. + +- Questions are tied to dynamic, rapidly changing page/API state: + - newest ordering/timestamps, per-story comment trees, query-time search hits, and live user metrics. +- Static world knowledge cannot recover: + - rank-conditioned values (`rank`, `anchor_rank`), + - depth-filtered nested counts, + - per-query ranked search result fields. 
+- Estimated no-browse success remains low (near random for numeric/string exact tasks). + +## Check 3: Memorization Space Analysis + +Pass. + +Effective variant space lower-bound (parameter-level): + +- **T110**: `10 story_counts * 10 windows * 5 anchors * 2 comparators = 1000` +- **T111**: `30 ranks * 5 min_depths * 4 metrics = 600` +- **T112**: `52 queries * 20 ranks * 4 fields * 4 point thresholds = 16640` +- **T113**: `C(30,2)=435 rank pairs * 3 metrics = 1305` + +All exceed 500. + +## Check 4: Answer Stability + +Pass. + +- **T110/T111/T112** rely on high-volatility feeds (`newstories`, search_by_date, active comment trees). +- **T113** mixes slower (`karma`, `created`) and more dynamic (`submitted_count`) profile dimensions. +- Because answers are rank-conditioned and interaction-dependent, even stable user/account facts do not collapse the overall QA pool. + +## Check 5: Random Baseline Analysis + +Pass. + +- Answer format is open numeric/string exact match, not multiple-choice. +- Random guess probability is low: + - numeric exact-difference/count tasks (T110/T111/T113), + - exact string/integer extraction from ranked filtered search hits (T112). +- No binary-choice style templates in this set. + +## Check 6: Cross-Parameter Collapse Detection + +Pass. + +- Added independent dimensions to avoid collapse: + - **T110**: anchor rank + comparator mode + - **T111**: depth + structural metric + - **T112**: query + rank + field + point-threshold filter + - **T113**: rank-pair combinations + metric family +- `NONE` paths remain valid but are no longer dominant across the whole parameter space due to expanded query/field/rank/threshold combinations and non-binary answers. 
+ +## Verification Artifacts + +- Real API structure GT verification: + - `tests/plugins/hackernews/test_gap_templates_real_api_data.py` +- Core template behavior and variant-space assertions: + - `tests/test_hackernews_gap_templates.py` +- Nested collection path coverage for T111: + - story detail fetch now includes bounded `_comment_items` subtree payload + - GT collector merges `_comment_items` so nested metrics can be computed + after story-detail navigation without requiring each comment URL visit +- Local command run: + - `PYTHONPATH=. pytest -q tests/test_hackernews_gap_templates.py tests/plugins/hackernews/test_gap_templates_real_api_data.py` diff --git a/tests/plugins/hackernews/test_gap_templates_real_api_data.py b/tests/plugins/hackernews/test_gap_templates_real_api_data.py new file mode 100644 index 0000000..0ed678b --- /dev/null +++ b/tests/plugins/hackernews/test_gap_templates_real_api_data.py @@ -0,0 +1,247 @@ +"""GT computation tests using real Hacker News API data snapshots. + +Snapshots were fetched live on 2026-03-27 from: +- https://hacker-news.firebaseio.com/v0/newstories.json +- https://hacker-news.firebaseio.com/v0/item/<id>.json +- https://hacker-news.firebaseio.com/v0/user/<id>.json +- https://hn.algolia.com/api/v1/search_by_date + +These tests validate that template GT logic computes concrete values using +real API response structure and field naming.
+""" + +import asyncio +from typing import Any, Dict + +from liveweb_arena.core.gt_collector import set_current_gt_collector +from liveweb_arena.plugins.hackernews.templates.comment_tree_focus import ( + HackerNewsCommentTreeFocusTemplate, +) +from liveweb_arena.plugins.hackernews.templates.keyword_scan_rank import ( + HackerNewsKeywordScanRankTemplate, +) +from liveweb_arena.plugins.hackernews.templates.recent_burst_count import ( + HackerNewsRecentBurstCountTemplate, +) +from liveweb_arena.plugins.hackernews.templates.user_karma_gap import ( + HackerNewsUserKarmaGapTemplate, +) + + +class _DummyCollector: + def __init__(self, data: Dict[str, Dict[str, Any]]): + self._data = data + + def get_collected_api_data(self) -> Dict[str, Dict[str, Any]]: + return self._data + + +def _run_gt(data: Dict[str, Dict[str, Any]], coro): + set_current_gt_collector(_DummyCollector(data)) + try: + return asyncio.run(coro) + finally: + set_current_gt_collector(None) + + +NEWEST_5_STORIES = { + "47541556": { + "id": 47541556, + "rank": 1, + "title": "OpenID AuthZen Authorization API 1.0 released", + "by": "Tepix", + "time": 1774612147, + "score": 1, + "descendants": 1, + }, + "47541541": { + "id": 47541541, + "rank": 2, + "title": "How to move from Prompt to Context Engineering (With demo code)", + "by": "visopsys", + "time": 1774612071, + "score": 1, + "descendants": 0, + }, + "47541528": { + "id": 47541528, + "rank": 3, + "title": "Crisp open source BA/PM framework for Claude Code(stop building the wrong thing)", + "by": "mirkoradeka", + "time": 1774611991, + "score": 1, + "descendants": 0, + }, + "47541523": { + "id": 47541523, + "rank": 4, + "title": "GLM-5.1 Released", + "by": "sumitsrivastava", + "time": 1774611963, + "score": 1, + "descendants": 0, + }, + "47541521": { + "id": 47541521, + "rank": 5, + "title": "Show HN: Agent-CI (Run GitHub Actions on your local machine.)", + "by": "pistoriusp", + "time": 1774611955, + "score": 1, + "descendants": 2, + }, +} + 
+HN_SEARCH_PYTHON_PAGE0 = { + "query": "python", + "page": 0, + "hits": [ + { + "title": "Show HN: 10 Lines of Python to fix mangled copy-paste from Claude Code", + "author": "collectedparts", + "points": 1, + "num_comments": 0, + }, + { + "title": "Telnyx v4.87.1 and v4.87.2 are compromised by TeamPCP", + "author": "ramimac", + "points": 2, + "num_comments": 1, + }, + { + "title": "Show HN: Pyconject - Ditch messy YAML loading in Python with config injection", + "author": "neolaw", + "points": 1, + "num_comments": 1, + }, + { + "title": "Mojo's Not (Yet) Python", + "author": "birdculture", + "points": 2, + "num_comments": 0, + }, + { + "title": "Show HN: LLMBillingKit - measure net margin per LLM call with one line of Python", + "author": "davidphan11", + "points": 1, + "num_comments": 1, + }, + ], +} + + +def _collected_for_recent_burst() -> Dict[str, Dict[str, Any]]: + return {"hn_category:newest": {"category": "newest", "stories": NEWEST_5_STORIES}} + + +def _collected_for_search() -> Dict[str, Dict[str, Any]]: + return {"hn_search:python:0": HN_SEARCH_PYTHON_PAGE0} + + +def _collected_for_nested_comments() -> Dict[str, Dict[str, Any]]: + return { + "hn_category:newest": { + "category": "newest", + "stories": { + "47540833": { + "id": 47540833, + "rank": 1, + "title": "Hold on to Your Hardware", + "by": "LucidLynx", + } + }, + }, + # Story with root comment + "47540833": {"id": 47540833, "kids": [47541161]}, + # Root depth=1 + "47541161": {"id": 47541161, "kids": [47541519, 47541377, 47541263]}, + # Depth=2 + "47541519": {"id": 47541519, "kids": []}, + "47541377": {"id": 47541377, "kids": [47541427]}, + "47541263": {"id": 47541263, "kids": [47541473, 47541336, 47541494]}, + # Depth=3 + "47541427": {"id": 47541427, "kids": [47541578, 47541483]}, + "47541473": {"id": 47541473, "kids": []}, + "47541336": {"id": 47541336, "kids": []}, + "47541494": {"id": 47541494, "kids": [47541522]}, + # Depth=4 + "47541578": {"id": 47541578, "kids": []}, + "47541483": {"id": 
47541483, "kids": []}, + "47541522": {"id": 47541522, "kids": [47541558, 47541548]}, + # Depth=5 + "47541558": {"id": 47541558, "kids": []}, + "47541548": {"id": 47541548, "kids": []}, + } + + +def _collected_for_user_metrics() -> Dict[str, Dict[str, Any]]: + return { + "hn_category:newest": {"category": "newest", "stories": NEWEST_5_STORIES}, + "user:Tepix": { + "user": { + "id": "Tepix", + "karma": 13466, + "created": 1376904746, + "submitted": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + }, + "user:visopsys": { + "user": { + "id": "visopsys", + "karma": 351, + "created": 1321339868, + "submitted": [1, 2, 3, 4, 5], + } + }, + } + + +def test_t110_recent_burst_with_real_snapshot(): + tmpl = HackerNewsRecentBurstCountTemplate() + result = _run_gt( + _collected_for_recent_burst(), + tmpl.get_ground_truth( + { + "story_count": 5, + "window_minutes": 3, + "anchor_rank": 1, + "include_equal": True, + } + ), + ) + assert result.success is True + assert result.value == "3" + + +def test_t111_nested_comment_nodes_with_real_snapshot(): + tmpl = HackerNewsCommentTreeFocusTemplate() + result = _run_gt( + _collected_for_nested_comments(), + tmpl.get_ground_truth({"rank": 1, "min_depth": 3, "metric": "nodes"}), + ) + # depth>=3 nodes: 47541427,47541473,47541336,47541494,47541578,47541483,47541522,47541558,47541548 + assert result.success is True + assert result.value == "9" + + +def test_t112_hn_search_with_real_snapshot(): + tmpl = HackerNewsKeywordScanRankTemplate() + result = _run_gt( + _collected_for_search(), + tmpl.get_ground_truth( + {"query": "python", "rank": 2, "field": "author", "min_points": 2, "search_page": 0} + ), + ) + # Filtered by points>=2 gives authors: ramimac, birdculture + assert result.success is True + assert result.value == "birdculture" + + +def test_t113_user_metric_with_real_snapshot(): + tmpl = HackerNewsUserKarmaGapTemplate() + result = _run_gt( + _collected_for_user_metrics(), + tmpl.get_ground_truth({"rank_a": 1, "rank_b": 2, "metric": 
"created_days"}), + ) + assert result.success is True + assert result.value == "643" diff --git a/tests/test_hackernews_gap_templates.py b/tests/test_hackernews_gap_templates.py new file mode 100644 index 0000000..aac0b0c --- /dev/null +++ b/tests/test_hackernews_gap_templates.py @@ -0,0 +1,350 @@ +"""Integration tests for HackerNews gap-filling templates.""" + +import asyncio + +import pytest + +from liveweb_arena.core.gt_collector import GTCollector, GTSourceType, set_current_gt_collector +from liveweb_arena.core.task_registry import TaskRegistry +from liveweb_arena.core.validators.base import get_registered_templates +from liveweb_arena.plugins.base import SubTask +from liveweb_arena.plugins.hackernews import api_client as hn_api +from liveweb_arena.plugins.hackernews.hackernews import HackerNewsPlugin +from liveweb_arena.plugins.hackernews.templates.comment_tree_focus import ( + HackerNewsCommentTreeFocusTemplate, +) +from liveweb_arena.plugins.hackernews.templates.keyword_scan_rank import ( + HackerNewsKeywordScanRankTemplate, +) +from liveweb_arena.plugins.hackernews.templates.recent_burst_count import ( + HackerNewsRecentBurstCountTemplate, +) +from liveweb_arena.plugins.hackernews.templates.user_karma_gap import ( + HackerNewsUserKarmaGapTemplate, +) + + +def run_async(coro): + return asyncio.run(coro) + + +@pytest.fixture +def collector(): + gt_collector = GTCollector( + subtasks=[SubTask(plugin_name="hackernews", intent="test", validation_info={}, answer_tag="answer1")] + ) + set_current_gt_collector(gt_collector) + try: + yield gt_collector + finally: + set_current_gt_collector(None) + + +def _seed_newest_stories(): + # unix times descending by rank (newest first) + return { + "category": "newest", + "stories": { + "1001": {"id": 1001, "rank": 1, "title": "AI agent launch", "time": 1700000000, "by": "alice"}, + "1002": {"id": 1002, "rank": 2, "title": "Rust performance tricks", "time": 1699999700, "by": "bob"}, + "1003": {"id": 1003, "rank": 3, "title": 
"Cloud data warehousing", "time": 1699999200, "by": "carol"}, + "1004": {"id": 1004, "rank": 4, "title": "Python model serving", "time": 1699996000, "by": "dave"}, + "1005": {"id": 1005, "rank": 5, "title": "Open source infra", "time": 1699995500, "by": "eve"}, + }, + } + + +def test_template_registry_contains_new_gap_templates(): + templates = get_registered_templates() + assert "hackernews_recent_burst_count" in templates + assert "hackernews_comment_tree_focus" in templates + assert "hackernews_keyword_scan_rank" in templates + assert "hackernews_user_karma_gap" in templates + + +def test_task_registry_ids_are_new_and_non_conflicting(): + assert TaskRegistry.TEMPLATES[110] == ("hackernews", "hackernews_recent_burst_count") + assert TaskRegistry.TEMPLATES[111] == ("hackernews", "hackernews_comment_tree_focus") + assert TaskRegistry.TEMPLATES[112] == ("hackernews", "hackernews_keyword_scan_rank") + assert TaskRegistry.TEMPLATES[113] == ("hackernews", "hackernews_user_karma_gap") + # Registry versioning preserves upstream version slots before these IDs + assert [96, 97, 98] in TaskRegistry.TEMPLATE_VERSIONS + assert [99, 100, 101] in TaskRegistry.TEMPLATE_VERSIONS + + +def test_fetch_newest_api_data(monkeypatch): + async def fake_new_stories(limit=30): + return [11, 22, 33][:limit] + + async def fake_items_batch(ids): + return { + 11: {"id": 11, "title": "a"}, + 22: {"id": 22, "title": "b"}, + 33: {"id": 33, "title": "c"}, + } + + monkeypatch.setattr(hn_api.HackerNewsClient, "get_new_stories", fake_new_stories) + monkeypatch.setattr(hn_api.HackerNewsClient, "get_items_batch", fake_items_batch) + + payload = run_async(hn_api.fetch_newest_api_data(limit=3)) + assert payload["category"] == "newest" + assert payload["stories"]["11"]["rank"] == 1 + assert payload["stories"]["22"]["rank"] == 2 + assert payload["stories"]["33"]["rank"] == 3 + + +def test_plugin_fetch_api_data_routes_newest(monkeypatch): + plugin = HackerNewsPlugin() + + async def fake_newest(): + return 
{"category": "newest", "stories": {"1": {"id": 1, "rank": 1, "title": "x"}}} + + monkeypatch.setattr( + "liveweb_arena.plugins.hackernews.hackernews.fetch_newest_api_data", + fake_newest, + ) + payload = run_async(plugin.fetch_api_data("https://news.ycombinator.com/newest")) + assert payload["category"] == "newest" + assert "stories" in payload + + +def test_plugin_needs_api_data_for_newest(): + plugin = HackerNewsPlugin() + assert plugin.needs_api_data("https://news.ycombinator.com/newest") + assert plugin.needs_api_data("https://hn.algolia.com/?q=python") + + +@pytest.mark.parametrize( + "template_cls", + [ + HackerNewsRecentBurstCountTemplate, + HackerNewsCommentTreeFocusTemplate, + HackerNewsKeywordScanRankTemplate, + HackerNewsUserKarmaGapTemplate, + ], +) +def test_new_templates_generation_shape(template_cls): + q = template_cls().generate(42) + assert q.question_text + assert q.start_url.startswith("https://") + assert q.expected_steps >= 8 + assert q.template_name + + +def test_recent_burst_ground_truth_success(collector): + collector._merge_api_data("https://news.ycombinator.com/newest", _seed_newest_stories()) + + result = run_async( + HackerNewsRecentBurstCountTemplate().get_ground_truth( + {"story_count": 5, "window_minutes": 60, "anchor_rank": 1, "include_equal": True} + ) + ) + assert result.success is True + # ranks 1,2,3 within 60 minutes from newest (0s, 300s, 800s) + assert result.value == "3" + + +def test_recent_burst_not_collected(collector): + result = run_async( + HackerNewsRecentBurstCountTemplate().get_ground_truth( + {"story_count": 5, "window_minutes": 60, "anchor_rank": 1, "include_equal": True} + ) + ) + assert result.success is False + assert result.is_data_not_collected() + + +def test_comment_tree_focus_success(collector): + newest = _seed_newest_stories() + collector._merge_api_data("https://news.ycombinator.com/newest", newest) + + # Story and nested comments for rank 2 story + collector._merge_api_data( + 
"https://news.ycombinator.com/item?id=1002", + {"id": 1002, "title": "Rust performance tricks", "kids": [2001, 2002]}, + ) + collector._merge_api_data("https://news.ycombinator.com/item?id=2001", {"id": 2001, "kids": [2003]}) + collector._merge_api_data("https://news.ycombinator.com/item?id=2002", {"id": 2002, "kids": [2004]}) + collector._merge_api_data("https://news.ycombinator.com/item?id=2003", {"id": 2003, "kids": []}) + collector._merge_api_data("https://news.ycombinator.com/item?id=2004", {"id": 2004, "kids": [2005]}) + collector._merge_api_data("https://news.ycombinator.com/item?id=2005", {"id": 2005, "kids": []}) + result = run_async( + HackerNewsCommentTreeFocusTemplate().get_ground_truth( + {"rank": 2, "min_depth": 2, "metric": "nodes"} + ) + ) + assert result.success is True + assert result.value == "3" + + +def test_comment_tree_focus_success_with_bundled_comment_items(collector): + newest = _seed_newest_stories() + collector._merge_api_data("https://news.ycombinator.com/newest", newest) + collector._merge_api_data( + "https://news.ycombinator.com/item?id=1002", + { + "id": 1002, + "title": "Rust performance tricks", + "kids": [2001, 2002], + "_comment_items": { + "2001": {"id": 2001, "kids": [2003]}, + "2002": {"id": 2002, "kids": [2004]}, + "2003": {"id": 2003, "kids": []}, + "2004": {"id": 2004, "kids": [2005]}, + "2005": {"id": 2005, "kids": []}, + }, + }, + ) + result = run_async( + HackerNewsCommentTreeFocusTemplate().get_ground_truth( + {"rank": 2, "min_depth": 2, "metric": "nodes"} + ) + ) + assert result.success is True + assert result.value == "3" + + +def test_comment_tree_focus_missing_item(collector): + collector._merge_api_data("https://news.ycombinator.com/newest", _seed_newest_stories()) + result = run_async( + HackerNewsCommentTreeFocusTemplate().get_ground_truth( + {"rank": 1, "min_depth": 2, "metric": "nodes"} + ) + ) + assert result.success is False + assert result.is_data_not_collected() + + +def 
test_keyword_scan_rank_found(collector): + collector._merge_api_data( + "https://hn.algolia.com/?q=python&page=0", + { + "query": "python", + "page": 0, + "hits": [ + {"title": "Python libs", "author": "a1", "points": 80, "num_comments": 12}, + {"title": "Advanced Python", "author": "a2", "points": 150, "num_comments": 31}, + ], + }, + ) + result = run_async( + HackerNewsKeywordScanRankTemplate().get_ground_truth( + {"query": "python", "rank": 1, "field": "title", "min_points": 100, "search_page": 0} + ) + ) + assert result.success is True + assert result.value == "Advanced Python" + + +def test_keyword_scan_rank_none(collector): + collector._merge_api_data( + "https://hn.algolia.com/?q=python&page=0", + { + "query": "python", + "page": 0, + "hits": [{"title": "Python libs", "author": "a1", "points": 80, "num_comments": 12}], + }, + ) + result = run_async( + HackerNewsKeywordScanRankTemplate().get_ground_truth( + {"query": "python", "rank": 2, "field": "author", "min_points": 50, "search_page": 0} + ) + ) + assert result.success is True + assert result.value == "NONE" + + +def test_user_karma_gap_success(collector): + newest = _seed_newest_stories() + collector._merge_api_data("https://news.ycombinator.com/newest", newest) + + collector._merge_api_data( + "https://news.ycombinator.com/user?id=alice", + {"user": {"id": "alice", "karma": 950, "created": 1700000000, "submitted": [1, 2, 3, 4]}}, + ) + collector._merge_api_data( + "https://news.ycombinator.com/user?id=carol", + {"user": {"id": "carol", "karma": 700, "created": 1699136000, "submitted": [9]}}, + ) + + result = run_async( + HackerNewsUserKarmaGapTemplate().get_ground_truth({"rank_a": 1, "rank_b": 3, "metric": "karma"}) + ) + assert result.success is True + assert result.value == "250" + + +def test_user_karma_gap_missing_profile(collector): + collector._merge_api_data("https://news.ycombinator.com/newest", _seed_newest_stories()) + result = run_async( + 
HackerNewsUserKarmaGapTemplate().get_ground_truth({"rank_a": 1, "rank_b": 2, "metric": "karma"}) + ) + assert result.success is False + assert result.is_data_not_collected() + + +def test_user_metric_created_days(collector): + newest = _seed_newest_stories() + collector._merge_api_data("https://news.ycombinator.com/newest", newest) + collector._merge_api_data( + "https://news.ycombinator.com/user?id=alice", + {"user": {"id": "alice", "karma": 950, "created": 1700000000, "submitted": [1, 2]}}, + ) + collector._merge_api_data( + "https://news.ycombinator.com/user?id=bob", + {"user": {"id": "bob", "karma": 900, "created": 1699913600, "submitted": [7]}}, + ) + result = run_async( + HackerNewsUserKarmaGapTemplate().get_ground_truth( + {"rank_a": 1, "rank_b": 2, "metric": "created_days"} + ) + ) + assert result.success is True + assert result.value == "1" + + +def test_variant_spaces_exceed_500(): + # T110: 10 story counts * 10 windows * 5 anchors * 2 comparators = 1000 + # T111: 30 ranks * 5 depths * 4 metrics = 600 + # T112: 52 queries * 20 ranks * 4 fields * 4 point buckets = 16640 + # T113: C(30,2)=435 pairs * 3 metrics = 1305 + t110 = 10 * 10 * 5 * 2 + t111 = 30 * 5 * 4 + t112 = 52 * 20 * 4 * 4 + t113 = 435 * 3 + assert t110 > 500 + assert t111 > 500 + assert t112 > 500 + assert t113 > 500 + + +@pytest.mark.parametrize( + "template_cls", + [ + HackerNewsRecentBurstCountTemplate, + HackerNewsCommentTreeFocusTemplate, + HackerNewsKeywordScanRankTemplate, + HackerNewsUserKarmaGapTemplate, + ], +) +def test_gt_source_and_cache_source(template_cls): + t = template_cls() + assert t.get_gt_source() == GTSourceType.PAGE_ONLY + assert t.get_cache_source() == "hackernews" + + +@pytest.mark.parametrize( + ("seed", "template_cls"), + [ + (3, HackerNewsRecentBurstCountTemplate), + (5, HackerNewsCommentTreeFocusTemplate), + (7, HackerNewsKeywordScanRankTemplate), + (9, HackerNewsUserKarmaGapTemplate), + ], +) +def test_seed_stability(seed, template_cls): + t = template_cls() + q1 
= t.generate(seed, variant=1) + q2 = t.generate(seed, variant=1) + assert q1.question_text == q2.question_text + assert q1.validation_info == q2.validation_info