Skip to content

Commit bd7debe

Browse files
committed
fix(version9): align T110–T113 with gap definitions, nested OL work drill-down, red team doc
1 parent a498a39 commit bd7debe

29 files changed

Lines changed: 1230 additions & 1497 deletions

liveweb_arena/core/cache.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,7 @@ def _load_cache(self, cache_file: Path, need_api: bool, allow_stale: bool) -> Op
489489
return None
490490

491491
if not allow_stale and cached.is_expired(self.ttl):
492+
self._delete_cache(cache_file)
492493
return None
493494

494495
# Check if cache is complete

liveweb_arena/core/gt_collector.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -369,14 +369,6 @@ def _merge_api_data(self, url: str, api_data: Dict[str, Any]) -> Optional[str]:
369369
name = api_data.get("name", f"SN{netuid}")
370370
return f"subnet[{name}]"
371371

372-
elif "hn.algolia.com" in url_lower:
373-
# HN Algolia search data
374-
query = str(api_data.get("query", "")).strip().lower()
375-
page = int(api_data.get("page", 0))
376-
key = f"hn_search:{query}:{page}"
377-
self._collected_api_data[key] = api_data
378-
return f"hn_search[{query}] page={page}"
379-
380372
elif "news.ycombinator.com" in url_lower:
381373
if "stories" in api_data:
382374
# Check if this is a category page (ask, show, jobs) or homepage
@@ -408,15 +400,6 @@ def _merge_api_data(self, url: str, api_data: Dict[str, Any]) -> Optional[str]:
408400
if "rank" in existing and "rank" not in merged:
409401
merged["rank"] = existing["rank"]
410402
self._collected_api_data[story_id] = merged
411-
added_comments = 0
412-
comment_items = api_data.get("_comment_items")
413-
if isinstance(comment_items, dict):
414-
for comment_id, comment_payload in comment_items.items():
415-
if isinstance(comment_id, str) and isinstance(comment_payload, dict):
416-
self._collected_api_data[comment_id] = comment_payload
417-
added_comments += 1
418-
if added_comments > 0:
419-
return f"story[{story_id}] +{added_comments} comments"
420403
return f"story[{story_id}]"
421404
elif "user" in api_data:
422405
# User page

liveweb_arena/core/task_registry.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,6 @@ class TaskRegistry:
134134
76: ("hackernews", "hackernews_extrema_comparison"),
135135
77: ("hackernews", "hackernews_category_comparison"),
136136
78: ("hackernews", "hackernews_news_summary"),
137-
110: ("hackernews", "hackernews_recent_burst_count"),
138-
111: ("hackernews", "hackernews_comment_tree_focus"),
139-
112: ("hackernews", "hackernews_keyword_scan_rank"),
140-
113: ("hackernews", "hackernews_user_karma_gap"),
141137

142138
# Open Library templates
143139
80: ("openlibrary", "openlibrary_book_stats"),
@@ -150,13 +146,23 @@ class TaskRegistry:
150146
86: ("openmeteo", "openmeteo_comparison"),
151147
87: ("openmeteo", "openmeteo_hourly_extrema"),
152148
88: ("openmeteo", "openmeteo_forecast_trend"),
149+
96: ("openlibrary", "openlibrary_author_engagement_extrema"),
150+
97: ("openlibrary", "openlibrary_author_comparison"),
151+
98: ("openlibrary", "openlibrary_reading_stats_filter"),
152+
99: ("openmeteo", "openmeteo_hourly_threshold"),
153+
100: ("openmeteo", "openmeteo_sunrise_sunset"),
154+
101: ("openmeteo", "openmeteo_hourly_time_of"),
153155

154156
# ArXiv templates
155157
90: ("arxiv", "arxiv_paper_info"),
156158
91: ("arxiv", "arxiv_author_extrema"),
157159
92: ("arxiv", "arxiv_category_comparison"),
158160
94: ("arxiv", "arxiv_multi_author_filter"),
159161
95: ("arxiv", "arxiv_title_length_extrema"),
162+
110: ("openmeteo", "openmeteo_daily_precip_peak_day"),
163+
111: ("openlibrary", "openlibrary_subject_nested_work_title"),
164+
112: ("arxiv", "arxiv_category_infer_title_substring"),
165+
113: ("arxiv", "arxiv_category_infer_author_filter"),
160166
}
161167

162168
# Template versions - each version's combinations come AFTER all previous versions
@@ -189,7 +195,7 @@ class TaskRegistry:
189195
[96, 97, 98],
190196
# Version 8: Additional Open Meteo templates
191197
[99, 100, 101],
192-
# Version 9: Hacker News gap-filling templates
198+
# Version 9: Cross-site templates (daily precipitation peak day, subject hub nested work, arXiv category feeds)
193199
[110, 111, 112, 113],
194200
]
195201

liveweb_arena/plugins/arxiv/templates/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
from .multi_author_filter import ArxivMultiAuthorFilterTemplate
77
from .title_length_extrema import ArxivTitleLengthExtremaTemplate
88
from .category_comparison import ArxivCategoryComparisonTemplate
9+
from .category_infer_title_substring import ArxivCategoryInferTitleSubstringTemplate
10+
from .category_infer_author_filter import ArxivCategoryInferAuthorFilterTemplate
911

1012
__all__ = [
1113
"ArxivPaperInfoTemplate",
1214
"ArxivAuthorExtremaTemplate",
1315
"ArxivMultiAuthorFilterTemplate",
1416
"ArxivTitleLengthExtremaTemplate",
1517
"ArxivCategoryComparisonTemplate",
18+
"ArxivCategoryInferTitleSubstringTemplate",
19+
"ArxivCategoryInferAuthorFilterTemplate",
1620
]
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
"""Prose hints for locating an arXiv new-submissions stream without naming its official label.
2+
3+
Aligned with CLAUDE.md §3: questions must not embed URLs, selectors, or routing shortcuts.
4+
The agent infers which `/list/<code>/new` page to open from domain knowledge plus browsing.
5+
6+
Keys are arXiv category codes matching `variables.CATEGORIES`.
7+
"""
8+
9+
from typing import Dict
10+
11+
# Maps an arXiv category code to a prose description of that category's
# new-submissions stream. The description is what gets embedded in question
# text, so every value is runtime data: edit wording only deliberately.
CATEGORY_NAVIGATION_HINTS: Dict[str, str] = {
    # --- cs.* ---
    "cs.AI": (
        "The computer-science stream where planning, search, and knowledge "
        "representation papers most often land alongside modern learning-based agents."
    ),
    "cs.CL": (
        "The cs area chiefly concerned with human languages, token sequences, "
        "and machine translation or parsing benchmarks."
    ),
    "cs.CV": (
        "The cs track focused on pixels, cameras, detectors, segmentation masks, "
        "and visual scenes."
    ),
    "cs.LG": (
        "The cs partition most associated with empirical training loops, "
        "generalization, and differentiable models fit to datasets."
    ),
    "cs.SE": (
        "The cs subject for software lifecycle, repositories, testing practice, "
        "and large-scale engineering studies."
    ),
    "cs.CR": (
        "The cs stream covering protocols, adversaries, proofs about secrecy, "
        "and cryptographic constructions."
    ),
    "cs.RO": (
        "The cs listings where manipulation, kinematics, sensing stacks, "
        "and autonomous platforms converge."
    ),
    "cs.DS": (
        "The cs class devoted to asymptotic complexity, classical algorithms, "
        "and combinatorial structures."
    ),
    "cs.HC": (
        "The cs bucket for usability studies, interaction techniques, "
        "and studies of people using interfaces."
    ),
    "cs.IR": (
        "The cs lane for ranking, retrieval metrics, corpora, "
        "and query–document modeling."
    ),
    "cs.GT": (
        "The cs niche treating strategic interaction, equilibria, "
        "and incentives among rational actors."
    ),
    # --- math.* ---
    "math.CO": (
        "The mathematics archive section for enumerative arguments, "
        "graphs as discrete objects, and designs."
    ),
    "math.PR": (
        "The math feed centered on stochastic processes, measure-theoretic limits, "
        "and random structures."
    ),
    "math.OC": (
        "The math stream about variational problems, controllers, "
        "and continuous-time decision systems."
    ),
    "math.NA": (
        "The math area for discretization schemes, floating-point stability, "
        "and iterative linear algebra."
    ),
    "math.AG": (
        "The math subject built around varieties, sheaves, "
        "and geometric invariants of polynomial systems."
    ),
    "math.AP": (
        "The math queue for PDE well-posedness, Sobolev estimates, "
        "and evolution of physical fields."
    ),
    "math.NT": (
        "The math lane for primes, congruences, L-functions, "
        "and arithmetic of integers."
    ),
    "math.DG": (
        "The math topic for curvature, bundles, connections, "
        "and smooth manifolds beyond Euclidean space."
    ),
    "math.GR": (
        "The math column for symmetries, presentations, "
        "and actions of abstract algebraic systems."
    ),
    # --- hep-*, quant-ph, gr-qc ---
    "hep-th": (
        "The high-energy theory feed discussing strings, dualities, quantum fields, "
        "and spacetime models."
    ),
    "hep-ph": (
        "The collider-adjacent phenomenology stream bridging models with signals, "
        "rates, and detectors."
    ),
    "quant-ph": (
        "The quantum archive for qubits, channels, entanglement measures, "
        "and information-theoretic protocols."
    ),
    "gr-qc": (
        "The archive slice merging classical gravitation with quantum expectations "
        "about horizons and cosmology."
    ),
    # --- astro-ph.* ---
    "astro-ph.CO": (
        "The astrophysics bucket for large-scale structure, dark components, "
        "and expansion history."
    ),
    "astro-ph.GA": (
        "The astrophysics lane for stellar populations, galaxies as systems, "
        "and interstellar medium interplay."
    ),
    "astro-ph.HE": (
        "The high-energy astrophysics feed for compact objects, "
        "relativistic outflows, and energetic spectra."
    ),
    "astro-ph.SR": (
        "The stellar astrophysics listings for interiors, magnetism, "
        "and long-lived luminous spheres."
    ),
    "astro-ph.IM": (
        "The astrophysics instrumentation track for telescopes, "
        "calibration pipelines, and survey hardware."
    ),
    # --- cond-mat.* ---
    "cond-mat.str-el": (
        "The condensed-matter stream for strongly correlated lattices, "
        "emergent quasiparticles, and phases."
    ),
    "cond-mat.mes-hall": (
        "The mesoscale condensed-matter area for nanowires, quantum dots, "
        "and low-dimensional transport."
    ),
    "cond-mat.mtrl-sci": (
        "The materials-facing condensed-matter lane for synthesis, "
        "characterization, and structure–property links."
    ),
    "cond-mat.stat-mech": (
        "The many-body equilibrium column for ensembles, phase transitions, "
        "and emergent macroscopic laws."
    ),
    "cond-mat.supr-con": (
        "The low-temperature condensed-matter listings for Cooper pairing, "
        "Meissner physics, and critical fields."
    ),
    "cond-mat.soft": (
        "The soft matter feed for colloids, gels, active grains, "
        "and sluggish thermal motion."
    ),
    # --- physics.* ---
    "physics.optics": (
        "The physics subject lane for interference, coherence, guided waves, "
        "and photonic devices."
    ),
    # --- stat.* ---
    "stat.ML": (
        "The statistics archive where inference meets high-dimensional prediction "
        "and uncertainty for models."
    ),
    "stat.ME": (
        "The statistics methodology lane for estimators, experimental design, "
        "and inferential frameworks."
    ),
    # --- eess.* ---
    "eess.SP": (
        "The electrical-engineering signal stream for filters, spectra, "
        "acquisition chains, and discrete transforms."
    ),
    "eess.SY": (
        "The systems-and-control electrical-engineering feed for stability, "
        "observers, and feedback synthesis."
    ),
    "eess.AS": (
        "The audio-focused electrical-engineering listings for speech, hearing, "
        "and acoustic modeling."
    ),
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"""ArXiv category discovery + author-count threshold (registry T113).
2+
3+
Same navigation pattern as `arxiv_category_infer_title_substring`: prose describes
4+
which new-submissions stream to open—no official label in the question text.
5+
6+
Effective variants: len(CATEGORIES) * len(TOP_N) * len(AUTHOR_THRESHOLDS) * len(PATTERNS) > 500.
7+
"""
8+
9+
import random
10+
from typing import Any, Dict, Optional
11+
12+
from liveweb_arena.core.ground_truth_trigger import GroundTruthResult, TriggerConfig, UrlPatternTrigger
13+
from liveweb_arena.core.gt_collector import GTSourceType
14+
15+
from liveweb_arena.core.validators.base import (
16+
GeneratedQuestion,
17+
QuestionTemplate,
18+
ValidationResult,
19+
register_template,
20+
)
21+
22+
from .category_discovery_hints import CATEGORY_NAVIGATION_HINTS
23+
from .common import get_collected_listing_data, get_papers_from_listing
24+
from .variables import CATEGORIES
25+
26+
# How many leading listing entries a question may ask about (4 through 8).
TOP_N = list(range(4, 9))

# Author-count thresholds; a paper is counted when it lists strictly more
# authors than the chosen threshold (2 through 6).
AUTHOR_THRESHOLDS = list(range(2, 7))

# Question phrasings. Each one takes three placeholders:
#   {nav_hint} - prose description of the target new-submissions stream
#   {n}        - number of leading entries to inspect
#   {k}        - author-count threshold
PATTERNS = [
    "On arXiv, open today's new-submissions listing for the stream best described by: \"{nav_hint}\". "
    "Among the first {n} papers, how many list strictly more than {k} authors?",
    "Locate the arXiv new-submissions page matching this topical description: \"{nav_hint}\". "
    "Considering only the first {n} entries, count papers whose author count is greater than {k}.",
    "Find the daily new papers on arXiv under the area summarized as: \"{nav_hint}\". "
    "In the top {n} items, how many have more than {k} authors named on the listing?",
]
43+
44+
45+
@register_template("arxiv_category_infer_author_filter")
class ArxivCategoryInferAuthorFilterTemplate(QuestionTemplate):
    """Question template: find an arXiv listing from a prose hint, then count many-author papers.

    The question text carries only a topical description of the target
    category (never its code); the expected answer is how many of the first
    N papers in that listing have strictly more than K authors.
    """

    GT_SOURCE = GTSourceType.PAGE_ONLY

    def __init__(self):
        super().__init__("arxiv_category_infer_author_filter")

    def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
        """Build a deterministic question for ``seed``.

        ``variant`` is accepted for interface compatibility but is not
        consulted here.
        """
        picker = random.Random(seed)
        # The draw order below (category, top-N, threshold, phrasing) fixes
        # which question a given seed yields; do not reorder.
        category = picker.choice(CATEGORIES)
        hint_text = CATEGORY_NAVIGATION_HINTS[category.code]
        top_n = picker.choice(TOP_N)
        threshold = picker.choice(AUTHOR_THRESHOLDS)
        phrasing = picker.choice(PATTERNS)

        params = {
            "category": category.code,
            "top_n": top_n,
            "author_threshold": threshold,
        }
        return GeneratedQuestion(
            question_text=phrasing.format(nav_hint=hint_text, n=top_n, k=threshold),
            start_url="https://arxiv.org",
            # Separate copies so the two payloads never alias each other.
            variables=dict(params),
            validation_info=dict(params),
            template_name=self.name,
            expected_steps=14,
        )

    def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
        """Return the human-readable scoring rules for this question."""
        rule_lines = [
            "Task-Specific Rules (ArXiv category discovery + author threshold):",
            f"- Expected listing category code: {validation_info.get('category')}",
            f"- Papers scanned: first {validation_info.get('top_n')}",
            f"- Strict author floor: {validation_info.get('author_threshold')}",
            "- Score 1.0: exact count",
            "- Score 0.0: otherwise",
        ]
        return "\n".join(rule_lines)

    async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
        """Count, among the first ``top_n`` collected papers, those with more than ``author_threshold`` authors.

        Returns the helper's failure result unchanged when the listing or its
        papers are unavailable, a not-collected result when fewer than
        ``top_n`` papers are present, and a failure when any inspected paper
        lacks an ``authors`` list.
        """
        category = str(validation_info.get("category", ""))
        top_n = int(validation_info.get("top_n", 5))
        threshold = int(validation_info.get("author_threshold", 3))

        listing, problem = get_collected_listing_data(category)
        if problem is not None:
            return problem

        papers, problem = get_papers_from_listing(listing)
        if problem is not None:
            return problem

        available = len(papers)
        if available < top_n:
            return GroundTruthResult.not_collected(
                f"Need at least {top_n} papers in listing, have {available}."
            )

        # Validate as we go: the first malformed entry aborts the computation.
        author_lists = []
        for paper in papers[:top_n]:
            names = paper.get("authors")
            if not isinstance(names, list):
                return GroundTruthResult.fail("Paper missing authors list")
            author_lists.append(names)

        qualifying = sum(1 for names in author_lists if len(names) > threshold)
        return GroundTruthResult.ok(str(qualifying))

    async def validate_answer(self, answer: str, validation_info: Dict[str, Any]) -> ValidationResult:
        """Placeholder result; the details string directs callers to LLM validation."""
        return ValidationResult(
            score=0.0,
            is_correct=False,
            expected=None,
            actual=answer,
            details="Use LLM validation",
        )

    def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
        """Return a trigger configured with a URL-pattern match on the arxiv.org domain."""
        arxiv_trigger = UrlPatternTrigger(domains=["arxiv.org"])
        return TriggerConfig(trigger=arxiv_trigger)

    @classmethod
    def get_cache_source(cls) -> str:
        """Return the cache source identifier used by this template."""
        return "arxiv"

    def get_gt_source(self) -> GTSourceType:
        """Return the ground-truth source type declared on the class."""
        return self.GT_SOURCE

0 commit comments

Comments
 (0)