Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ba6f766
refactor(openlibrary): extract author-search helpers to common.py
MkDev11 Mar 23, 2026
fd8dcaa
feat(openlibrary): add author_engagement_extrema template (ID 96)
MkDev11 Mar 23, 2026
d2023db
feat(openlibrary): add author_comparison template (ID 97)
MkDev11 Mar 23, 2026
8eb1f34
feat(openlibrary): add reading_stats_filter template (ID 98)
MkDev11 Mar 23, 2026
216711d
test(openlibrary): add tests for engagement & comparison templates
MkDev11 Mar 23, 2026
2bc5e9d
fix: accept plain-text author queries in find_author_search_entry
MkDev11 Mar 23, 2026
4f1e501
fix(openlibrary): reduce live GT not_collected for author templates
MkDev11 Mar 23, 2026
6a893b4
docs(pr): update description
MkDev11 Mar 23, 2026
6d436d5
fix: address PR #13 review — remove broken authors, drop already_read…
MkDev11 Mar 25, 2026
2490e6a
fix: treat missing engagement metrics as 0 instead of hard-failing
MkDev11 Mar 25, 2026
1608d88
fix: handle non-numeric metric values without TypeError
MkDev11 Mar 25, 2026
c99582f
refactor: extract safe_metric_value helper to reduce duplication
MkDev11 Mar 25, 2026
5dceaec
fix: drop ratings_count from all templates, fail on non-numeric data
MkDev11 Mar 25, 2026
c5fa4e1
fix: docstring drift and add non-numeric regression tests for compari…
MkDev11 Mar 25, 2026
6db81ec
fix: restore ratings_count with targeted exclusions for anti-memoriza…
MkDev11 Mar 25, 2026
050b5d9
fix(openlibrary): expand AUTHOR_POOL and RESULT_COUNTS for T96 varian…
MkDev11 Mar 25, 2026
4e0fc5c
fix(openlibrary): raise search fetch limit to 25 for T96 work_count=25
MkDev11 Mar 25, 2026
3f8e3ea
fix(openlibrary): separate ENGAGEMENT_AUTHOR_POOL, cap lowest RESULT_…
MkDev11 Mar 25, 2026
3913d43
fix(openlibrary): address PR #13 review — deterministic GT, numeric T…
MkDev11 Mar 26, 2026
5320bca
fix(openlibrary): cap ratings_count variants to low N to reduce GT-fa…
MkDev11 Mar 26, 2026
7c9097f
test(openlibrary): verify GT computation with real OL API data
MkDev11 Mar 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions liveweb_arena/core/task_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ class TaskRegistry:
92: ("arxiv", "arxiv_category_comparison"),
94: ("arxiv", "arxiv_multi_author_filter"),
95: ("arxiv", "arxiv_title_length_extrema"),

# Open Library templates — engagement & comparison
96: ("openlibrary", "openlibrary_author_engagement_extrema"),
97: ("openlibrary", "openlibrary_author_comparison"),
98: ("openlibrary", "openlibrary_reading_stats_filter"),
}

# Template versions - each version's combinations come AFTER all previous versions
Expand Down Expand Up @@ -181,6 +186,8 @@ class TaskRegistry:
[85, 86, 87, 88],
# Version 6: ArXiv templates
[90, 91, 92, 94, 95],
# Version 7: Open Library engagement & comparison templates
[96, 97, 98],
]

# Combination registry: list of template ID tuples
Expand Down
3 changes: 2 additions & 1 deletion liveweb_arena/plugins/openlibrary/openlibrary.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ async def fetch_api_data(self, url: str) -> Dict[str, Any]:
sort = parse_qs(parsed.query).get("sort", [None])[0]
mode = parse_qs(parsed.query).get("mode", [None])[0]
if query:
return await fetch_search_api_data(query, limit=20, sort=sort, mode=mode)
# limit=25 to support T96 RESULT_COUNTS up to work_count=25
return await fetch_search_api_data(query, limit=25, sort=sort, mode=mode)
return {}

# Work detail page: /works/OL...W or /works/OL...W/Title
Expand Down
6 changes: 6 additions & 0 deletions liveweb_arena/plugins/openlibrary/templates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,16 @@
from .book_comparison import OpenLibraryBookComparisonTemplate
from .author_editions import OpenLibraryAuthorEditionsTemplate
from .subject_multi_condition import OpenLibrarySubjectMultiConditionTemplate
from .author_engagement_extrema import OpenLibraryAuthorEngagementExtremaTemplate
from .author_comparison import OpenLibraryAuthorComparisonTemplate
from .reading_stats_filter import OpenLibraryReadingStatsFilterTemplate

__all__ = [
"OpenLibraryBookStatsTemplate",
"OpenLibraryBookComparisonTemplate",
"OpenLibraryAuthorEditionsTemplate",
"OpenLibrarySubjectMultiConditionTemplate",
"OpenLibraryAuthorEngagementExtremaTemplate",
"OpenLibraryAuthorComparisonTemplate",
"OpenLibraryReadingStatsFilterTemplate",
]
260 changes: 260 additions & 0 deletions liveweb_arena/plugins/openlibrary/templates/author_comparison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
"""Author comparison template for Open Library - MEDIUM/HARD DIFFICULTY.

RL-friendly design:
- Requires TWO separate author searches and cross-page comparison
- Dynamic data: engagement metrics change continuously as users interact
- Large entity pool: C(81,2)×2 metrics×2 result counts = 12,960 variants
- Computation required: sum metric across N books for each author, compare
"""

import random
from enum import Enum
from typing import Any, Dict, Optional
from urllib.parse import quote_plus

from liveweb_arena.core.ground_truth_trigger import (
GroundTruthResult,
TriggerConfig,
UrlPatternTrigger,
)
from liveweb_arena.core.gt_collector import GTSourceType
from liveweb_arena.core.validators.base import (
GeneratedQuestion,
QuestionTemplate,
ValidationResult,
register_template,
)
from .author_editions import ENGAGEMENT_AUTHOR_POOL
from .common import find_author_search_entry, get_collected_data, safe_metric_value


class AuthorMetric(Enum):
    """Engagement metrics for cross-author comparison.

    Each member's value is a 2-tuple: (Open Library work field name,
    human-readable label interpolated into the question text).
    """
    # Field read via safe_metric_value(); label appears verbatim in questions.
    WANT_TO_READ = ("want_to_read_count", "total want-to-read count")
    RATINGS_COUNT = ("ratings_count", "total number of ratings")


# Number of top search results (per author) over which the metric is summed.
RESULT_COUNTS = [3, 5]

# Question phrasings. Every pattern consumes the same placeholders:
# {author_a}, {author_b}, {n} (result count), and {metric_label}.
PATTERNS = [
    (
        'On Open Library, search for books by "{author_a}" and "{author_b}", '
        "both sorted by most editions. Comparing the first {n} results for each, "
        "which author has a higher {metric_label}? Answer with the author name only."
    ),
    (
        'Compare "{author_a}" and "{author_b}" on Open Library. For each author, '
        "look at the top {n} books (sorted by most editions). "
        "Which author has more {metric_label} in total? Reply with just the name."
    ),
    (
        'Search Open Library for books by "{author_a}" and by "{author_b}" '
        "(most editions). Among each author's top {n} results, which author's books "
        "have a higher combined {metric_label}? Answer with the author name."
    ),
]


@register_template("openlibrary_author_comparison")
class OpenLibraryAuthorComparisonTemplate(QuestionTemplate):
"""Compare aggregate engagement metrics between two authors' top works.

MEDIUM/HARD difficulty: requires two separate author searches, summing
a metric across top N results for each, then comparing the totals.
"""

GT_SOURCE = GTSourceType.PAGE_ONLY

def __init__(self):
super().__init__("openlibrary_author_comparison")

def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
rng = random.Random(seed)

metrics = list(AuthorMetric)
metric = (
metrics[variant % len(metrics)]
if variant is not None
else rng.choice(metrics)
)

(name_a, query_a), (name_b, query_b) = rng.sample(ENGAGEMENT_AUTHOR_POOL, 2)

# Randomly swap order to prevent position bias
if rng.random() > 0.5:
name_a, query_a, name_b, query_b = name_b, query_b, name_a, query_a

count = rng.choice(RESULT_COUNTS)
search_query_a = f'author:"{query_a}"'
search_query_b = f'author:"{query_b}"'

pattern = rng.choice(PATTERNS)
question_text = pattern.format(
author_a=name_a,
author_b=name_b,
n=count,
metric_label=metric.value[1],
)

query_encoded_a = quote_plus(search_query_a)
start_url = (
f"https://openlibrary.org/search?q={query_encoded_a}&sort=editions"
)

return GeneratedQuestion(
question_text=question_text,
start_url=start_url,
variables={
"author_a": name_a,
"author_b": name_b,
"metric": metric.value[0],
"work_count": count,
},
validation_info={
"author_a_name": name_a,
"author_a_query": query_a,
"search_query_a": search_query_a,
"author_b_name": name_b,
"author_b_query": query_b,
"search_query_b": search_query_b,
"sort": "editions",
"work_count": count,
"metric": metric.value[0],
"metric_label": metric.value[1],
},
template_name=self.name,
expected_steps=12,
)

def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
author_a = validation_info.get("author_a_name", "")
author_b = validation_info.get("author_b_name", "")
count = validation_info.get("work_count", "")
metric_label = validation_info.get("metric_label", "")
return f"""Task-Specific Rules (Open Library Author Comparison):
- Compare: "{author_a}" vs "{author_b}"
- Metric: {metric_label} summed across top {count} results
- Score 1.0: Correct winning author name
- Score 0.0: Wrong author or no answer
- Tie rule: alphabetically earlier author name wins"""

async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
collected = get_collected_data()
if not collected:
return GroundTruthResult.fail("No Open Library data collected")

author_a_name = validation_info.get("author_a_name")
author_b_name = validation_info.get("author_b_name")
search_query_a = validation_info.get("search_query_a")
search_query_b = validation_info.get("search_query_b")
sort = validation_info.get("sort")
work_count = validation_info.get("work_count")
metric = validation_info.get("metric")

if (
not isinstance(author_a_name, str)
or not isinstance(author_b_name, str)
or not isinstance(search_query_a, str)
or not isinstance(search_query_b, str)
or not isinstance(sort, str)
or not isinstance(work_count, int)
or not isinstance(metric, str)
):
return GroundTruthResult.fail("Missing or invalid comparison inputs")
if work_count <= 0:
return GroundTruthResult.fail(f"Invalid work_count: {work_count}")

sum_a = self._sum_metric(
collected, author_a_name, search_query_a, sort, work_count, metric,
)
if isinstance(sum_a, GroundTruthResult):
return sum_a

sum_b = self._sum_metric(
collected, author_b_name, search_query_b, sort, work_count, metric,
)
if isinstance(sum_b, GroundTruthResult):
return sum_b

if sum_a == sum_b:
winner = min(author_a_name, author_b_name, key=str.casefold)
elif sum_a > sum_b:
winner = author_a_name
else:
winner = author_b_name

return GroundTruthResult.ok(winner)

@staticmethod
def _sum_metric(
collected: Dict[str, Dict[str, Any]],
author_name: str,
search_query: str,
sort: str,
work_count: int,
metric: str,
) -> "int | GroundTruthResult":
"""Sum a metric across an author's top N search results.

Returns the integer sum on success, or a GroundTruthResult on failure.
"""
data = find_author_search_entry(
collected,
search_query=search_query,
sort=sort,
allow_unsorted_fallback=True,
)
if data is None:
ol_keys = [k for k in collected if k.startswith("ol:")][:5]
return GroundTruthResult.not_collected(
f"Did not collect search data for author '{author_name}' "
f"sorted by '{sort}'. Collected OL keys: {ol_keys}"
)

works_dict = data.get("works")
if not isinstance(works_dict, dict):
return GroundTruthResult.fail(
f"Collected data for '{author_name}' missing works dictionary"
)
if len(works_dict) < work_count:
return GroundTruthResult.fail(
f"Only {len(works_dict)} works collected for '{author_name}', "
f"need {work_count}"
)

ranked = sorted(works_dict.values(), key=lambda w: w.get("rank", 999))
top_n = ranked[:work_count]

total = 0
for work in top_n:
try:
value = safe_metric_value(work, metric)
except ValueError as exc:
return GroundTruthResult.fail(str(exc))
total += int(value)

return total

async def validate_answer(
self,
answer: str,
validation_info: Dict[str, Any],
) -> ValidationResult:
return ValidationResult(
score=0.0,
is_correct=False,
expected=None,
actual=answer,
details="Use LLM validation",
)

def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
trigger = UrlPatternTrigger(domains=["openlibrary.org"])
return TriggerConfig(trigger=trigger)

@classmethod
def get_cache_source(cls) -> str:
return "openlibrary"

def get_gt_source(self) -> GTSourceType:
return self.GT_SOURCE
Loading