diff --git a/liveweb_arena/core/task_registry.py b/liveweb_arena/core/task_registry.py index 26cd235..221f30b 100644 --- a/liveweb_arena/core/task_registry.py +++ b/liveweb_arena/core/task_registry.py @@ -153,6 +153,11 @@ class TaskRegistry: 92: ("arxiv", "arxiv_category_comparison"), 94: ("arxiv", "arxiv_multi_author_filter"), 95: ("arxiv", "arxiv_title_length_extrema"), + + # Open Library templates — engagement & comparison + 96: ("openlibrary", "openlibrary_author_engagement_extrema"), + 97: ("openlibrary", "openlibrary_author_comparison"), + 98: ("openlibrary", "openlibrary_reading_stats_filter"), } # Template versions - each version's combinations come AFTER all previous versions @@ -181,6 +186,9 @@ class TaskRegistry: [85, 86, 87, 88], # Version 6: ArXiv templates [90, 91, 92, 94, 95], + # Version 7: Open Library engagement & comparison templates (PR #13) + # NOTE: PR #14 (openmeteo IDs 99-101) must use Version 8. + [96, 97, 98], ] # Combination registry: list of template ID tuples diff --git a/liveweb_arena/plugins/openlibrary/openlibrary.py b/liveweb_arena/plugins/openlibrary/openlibrary.py index c4bbd32..ee31633 100644 --- a/liveweb_arena/plugins/openlibrary/openlibrary.py +++ b/liveweb_arena/plugins/openlibrary/openlibrary.py @@ -60,7 +60,8 @@ async def fetch_api_data(self, url: str) -> Dict[str, Any]: sort = parse_qs(parsed.query).get("sort", [None])[0] mode = parse_qs(parsed.query).get("mode", [None])[0] if query: - return await fetch_search_api_data(query, limit=20, sort=sort, mode=mode) + # limit=25 to support T96 RESULT_COUNTS up to work_count=25 + return await fetch_search_api_data(query, limit=25, sort=sort, mode=mode) return {} # Work detail page: /works/OL...W or /works/OL...W/Title diff --git a/liveweb_arena/plugins/openlibrary/templates/__init__.py b/liveweb_arena/plugins/openlibrary/templates/__init__.py index 3e5da6a..5f28cf8 100644 --- a/liveweb_arena/plugins/openlibrary/templates/__init__.py +++ 
b/liveweb_arena/plugins/openlibrary/templates/__init__.py @@ -10,10 +10,16 @@ from .book_comparison import OpenLibraryBookComparisonTemplate from .author_editions import OpenLibraryAuthorEditionsTemplate from .subject_multi_condition import OpenLibrarySubjectMultiConditionTemplate +from .author_engagement_extrema import OpenLibraryAuthorEngagementExtremaTemplate +from .author_comparison import OpenLibraryAuthorComparisonTemplate +from .reading_stats_filter import OpenLibraryReadingStatsFilterTemplate __all__ = [ "OpenLibraryBookStatsTemplate", "OpenLibraryBookComparisonTemplate", "OpenLibraryAuthorEditionsTemplate", "OpenLibrarySubjectMultiConditionTemplate", + "OpenLibraryAuthorEngagementExtremaTemplate", + "OpenLibraryAuthorComparisonTemplate", + "OpenLibraryReadingStatsFilterTemplate", ] diff --git a/liveweb_arena/plugins/openlibrary/templates/author_comparison.py b/liveweb_arena/plugins/openlibrary/templates/author_comparison.py new file mode 100644 index 0000000..452aed7 --- /dev/null +++ b/liveweb_arena/plugins/openlibrary/templates/author_comparison.py @@ -0,0 +1,256 @@ +"""Author comparison template for Open Library - MEDIUM/HARD DIFFICULTY. 
"""Author comparison template for Open Library - MEDIUM/HARD DIFFICULTY.

RL-friendly design:
- Requires TWO separate author searches and cross-page comparison
- Dynamic data: engagement metrics change continuously as users interact
- Large entity pool: C(81,2)×2 metrics×2 result counts = 12,960 variants
- Computation required: sum metric across N books for each author, compute difference
- Numeric answer (absolute difference) avoids 50% random baseline of binary choice
"""

import random
from enum import Enum
from typing import Any, Dict, Optional
from urllib.parse import quote_plus

from liveweb_arena.core.ground_truth_trigger import (
    GroundTruthResult,
    TriggerConfig,
    UrlPatternTrigger,
)
from liveweb_arena.core.gt_collector import GTSourceType
from liveweb_arena.core.validators.base import (
    GeneratedQuestion,
    QuestionTemplate,
    ValidationResult,
    register_template,
)
from .author_editions import ENGAGEMENT_AUTHOR_POOL
from .common import find_author_search_entry, get_collected_data, safe_metric_value


class AuthorMetric(Enum):
    """Engagement metrics that can be summed per author and compared.

    Each value is ``(key read from a collected work dict, label used in the
    question text)``.
    """
    WANT_TO_READ = ("want_to_read_count", "total want-to-read count")
    RATINGS_COUNT = ("ratings_count", "total number of ratings")


# How many top-ranked results per author enter the sum.
RESULT_COUNTS = [3, 5]

# Question phrasings; placeholders: author_a, author_b, n, metric_label.
PATTERNS = [
    (
        'On Open Library, search for books by "{author_a}" and "{author_b}", '
        "both sorted by most editions. What is the absolute difference in "
        "{metric_label} between the first {n} results for each author? "
        "Answer with just the number."
    ),
    (
        'Compare "{author_a}" and "{author_b}" on Open Library. For each author, '
        "look at the top {n} books (sorted by most editions) and sum their "
        "{metric_label}. What is the absolute difference between the two totals? "
        "Reply with just a number."
    ),
    (
        'Search Open Library for books by "{author_a}" and by "{author_b}" '
        "(most editions). Sum the {metric_label} across each author's top {n} "
        "results. What is the absolute difference? Answer with the number only."
    ),
]


@register_template("openlibrary_author_comparison")
class OpenLibraryAuthorComparisonTemplate(QuestionTemplate):
    """Ask for the absolute gap between two authors' summed engagement metric.

    For each of two authors the metric is summed over the top N works of an
    editions-sorted author search; the ground truth is the absolute
    difference between the two sums.
    """

    GT_SOURCE = GTSourceType.PAGE_ONLY

    def __init__(self):
        super().__init__("openlibrary_author_comparison")

    def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
        """Build a deterministic question for *seed*.

        When *variant* is given it selects the metric (modulo the number of
        metrics); otherwise the metric is drawn from the seeded RNG. The
        start URL points at the first author's editions-sorted search.
        """
        rng = random.Random(seed)

        metric_choices = list(AuthorMetric)
        if variant is None:
            metric = rng.choice(metric_choices)
        else:
            metric = metric_choices[variant % len(metric_choices)]
        metric_key, metric_label = metric.value

        first, second = rng.sample(ENGAGEMENT_AUTHOR_POOL, 2)
        # Randomly swap order to prevent position bias
        if rng.random() > 0.5:
            first, second = second, first
        (name_a, query_a), (name_b, query_b) = first, second

        count = rng.choice(RESULT_COUNTS)
        search_query_a = f'author:"{query_a}"'
        search_query_b = f'author:"{query_b}"'

        template_text = rng.choice(PATTERNS)
        question_text = template_text.format(
            author_a=name_a,
            author_b=name_b,
            n=count,
            metric_label=metric_label,
        )

        start_url = (
            "https://openlibrary.org/search"
            f"?q={quote_plus(search_query_a)}&sort=editions"
        )

        variables = {
            "author_a": name_a,
            "author_b": name_b,
            "metric": metric_key,
            "work_count": count,
        }
        validation_info = {
            "author_a_name": name_a,
            "author_a_query": query_a,
            "search_query_a": search_query_a,
            "author_b_name": name_b,
            "author_b_query": query_b,
            "search_query_b": search_query_b,
            "sort": "editions",
            "work_count": count,
            "metric": metric_key,
            "metric_label": metric_label,
        }
        return GeneratedQuestion(
            question_text=question_text,
            start_url=start_url,
            variables=variables,
            validation_info=validation_info,
            template_name=self.name,
            expected_steps=12,
        )

    def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
        """Render the scoring rubric text for this question."""
        author_a = validation_info.get("author_a_name", "")
        author_b = validation_info.get("author_b_name", "")
        count = validation_info.get("work_count", "")
        metric_label = validation_info.get("metric_label", "")
        rule_lines = [
            "Task-Specific Rules (Open Library Author Comparison):",
            f'- Compare: "{author_a}" vs "{author_b}"',
            f"- Metric: {metric_label} summed across top {count} results",
            "- Answer: absolute difference between the two totals (a single number)",
            "- Score 1.0: Exact difference",
            "- Score 0.5: Within ±10% of correct difference",
            "- Score 0.0: Wrong value or no answer",
        ]
        return "\n".join(rule_lines)

    async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
        """Compute ``abs(sum_a - sum_b)`` from the collected search data.

        Returns a failed / not-collected result when inputs are malformed or
        either author's editions-sorted search was not collected.
        """
        collected = get_collected_data()
        if not collected:
            return GroundTruthResult.fail("No Open Library data collected")

        sort = validation_info.get("sort")
        work_count = validation_info.get("work_count")
        metric = validation_info.get("metric")
        # (display name, search query) per author, A first then B.
        authors = [
            (
                validation_info.get("author_a_name"),
                validation_info.get("search_query_a"),
            ),
            (
                validation_info.get("author_b_name"),
                validation_info.get("search_query_b"),
            ),
        ]

        text_inputs = [sort, metric]
        for author_name, search_query in authors:
            text_inputs.extend((author_name, search_query))
        inputs_ok = isinstance(work_count, int) and all(
            isinstance(item, str) for item in text_inputs
        )
        if not inputs_ok:
            return GroundTruthResult.fail("Missing or invalid comparison inputs")
        if work_count <= 0:
            return GroundTruthResult.fail(f"Invalid work_count: {work_count}")

        totals = []
        for author_name, search_query in authors:
            outcome = self._sum_metric(
                collected, author_name, search_query, sort, work_count, metric,
            )
            # A GroundTruthResult here is a failure to be propagated as-is.
            if isinstance(outcome, GroundTruthResult):
                return outcome
            totals.append(outcome)

        sum_a, sum_b = totals
        return GroundTruthResult.ok(str(abs(sum_a - sum_b)))

    @staticmethod
    def _sum_metric(
        collected: Dict[str, Dict[str, Any]],
        author_name: str,
        search_query: str,
        sort: str,
        work_count: int,
        metric: str,
    ) -> "int | GroundTruthResult":
        """Total *metric* over an author's *work_count* best-ranked works.

        Returns the integer total, or a GroundTruthResult describing why the
        total could not be computed.
        """
        entry = find_author_search_entry(
            collected,
            search_query=search_query,
            sort=sort,
        )
        if entry is None:
            ol_keys = [key for key in collected if key.startswith("ol:")][:5]
            return GroundTruthResult.not_collected(
                f"Did not collect search data for author '{author_name}' "
                f"sorted by '{sort}'. Collected OL keys: {ol_keys}"
            )

        works_dict = entry.get("works")
        if not isinstance(works_dict, dict):
            return GroundTruthResult.fail(
                f"Collected data for '{author_name}' missing works dictionary"
            )
        if len(works_dict) < work_count:
            return GroundTruthResult.fail(
                f"Only {len(works_dict)} works collected for '{author_name}', "
                f"need {work_count}"
            )

        # Works without a rank sort after ranked ones (default 999).
        by_rank = sorted(works_dict.values(), key=lambda item: item.get("rank", 999))

        running_total = 0
        for work in by_rank[:work_count]:
            try:
                value = safe_metric_value(work, metric)
            except ValueError as exc:
                return GroundTruthResult.fail(str(exc))
            running_total += int(value)

        return running_total

    async def validate_answer(
        self,
        answer: str,
        validation_info: Dict[str, Any],
    ) -> ValidationResult:
        """Return a zero-score placeholder whose details say "Use LLM validation"."""
        return ValidationResult(
            score=0.0,
            is_correct=False,
            expected=None,
            actual=answer,
            details="Use LLM validation",
        )

    def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
        """Trigger ground-truth collection on openlibrary.org URLs."""
        return TriggerConfig(trigger=UrlPatternTrigger(domains=["openlibrary.org"]))

    @classmethod
    def get_cache_source(cls) -> str:
        return "openlibrary"

    def get_gt_source(self) -> GTSourceType:
        return self.GT_SOURCE
f7fdd79..22c0897 100644 --- a/liveweb_arena/plugins/openlibrary/templates/author_editions.py +++ b/liveweb_arena/plugins/openlibrary/templates/author_editions.py @@ -1,7 +1,6 @@ """Author editions aggregation template for Open Library - MEDIUM DIFFICULTY.""" import random -import re from typing import Any, Dict, Optional from urllib.parse import quote_plus @@ -17,7 +16,7 @@ ValidationResult, register_template, ) -from .common import get_collected_data, parse_numeric +from .common import find_author_search_entry, get_collected_data, parse_numeric AUTHOR_POOL = [ # --- Original pool (20) --- @@ -94,6 +93,49 @@ ("Chimamanda Ngozi Adichie", "chimamanda ngozi adichie"), ] +# Separate pool for engagement templates (T96/T97/T98). +# Derived from AUTHOR_POOL minus authors with poor engagement data, plus +# 25 engagement-specific additions. Referencing AUTHOR_POOL prevents drift. +_ENGAGEMENT_EXCLUDED = frozenset({ + "Herman Melville", "Nathaniel Hawthorne", "Philip K. Dick", + "James Joyce", "Ralph Waldo Emerson", "Emily Bronte", + "Leo Tolstoy", "Walt Whitman", "Emily Dickinson", + "Rabindranath Tagore", "Fyodor Dostoevsky", "Haruki Murakami", + "Anton Chekhov", "Octavia Butler", +}) + +_ENGAGEMENT_ADDITIONS = [ + ("Charlotte Bronte", "charlotte bronte"), + ("Jack London", "jack london"), + ("Daniel Defoe", "daniel defoe"), + ("C.S. 
Lewis", "c s lewis"), + ("James Baldwin", "james baldwin"), + ("Sylvia Plath", "sylvia plath"), + ("Maya Angelou", "maya angelou"), + ("Zora Neale Hurston", "zora neale hurston"), + ("Gustave Flaubert", "gustave flaubert"), + ("Bram Stoker", "bram stoker"), + ("Lewis Carroll", "lewis carroll"), + ("Beatrix Potter", "beatrix potter"), + ("Enid Blyton", "enid blyton"), + ("Judy Blume", "judy blume"), + ("Beverly Cleary", "beverly cleary"), + ("Philip Roth", "philip roth"), + ("Milan Kundera", "milan kundera"), + ("Daphne du Maurier", "daphne du maurier"), + ("Wilkie Collins", "wilkie collins"), + ("Thomas Mann", "thomas mann"), + ("Hermann Hesse", "hermann hesse"), + ("E.M. Forster", "e m forster"), + ("Somerset Maugham", "somerset maugham"), + ("Anne Rice", "anne rice"), + ("Dan Brown", "dan brown"), +] + +ENGAGEMENT_AUTHOR_POOL = [ + entry for entry in AUTHOR_POOL if entry[0] not in _ENGAGEMENT_EXCLUDED +] + _ENGAGEMENT_ADDITIONS + RESULT_COUNTS = [3, 5, 7, 10] SORT_OPTIONS = [ @@ -193,7 +235,7 @@ async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruth if not search_query: search_query = f'author:"{author_query}"' - data = self._find_author_search_entry( + data = find_author_search_entry( collected, search_query=search_query, sort=sort, ) if data is None: @@ -230,75 +272,6 @@ async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruth return GroundTruthResult.ok(str(total_editions)) - @staticmethod - def _normalize_author_fragment(value: str) -> str: - """Normalize author text by stripping punctuation and collapsing whitespace.""" - return " ".join(re.findall(r"[a-z0-9]+", value.lower())) - - @classmethod - def _extract_author_filter(cls, query: str) -> Optional[str]: - """ - Extract normalized author text from author-filter queries. - - Accepts query forms like: - - author:"mark twain" - - AUTHOR: "Mark Twain" - - author:'h.g. 
wells' - """ - cleaned = query.strip().lower() - if not cleaned: - return None - - match = re.match(r"^author\s*:\s*(.+)$", cleaned) - if not match: - return None - - rhs = match.group(1).strip() - if len(rhs) >= 2 and rhs[0] == rhs[-1] and rhs[0] in {'"', "'"}: - rhs = rhs[1:-1].strip() - - normalized = cls._normalize_author_fragment(rhs) - return normalized or None - - @classmethod - def _find_author_search_entry( - cls, - collected: Dict[str, Dict[str, Any]], - *, - search_query: str, - sort: str, - ) -> Optional[Dict[str, Any]]: - """ - Find search data for an author-filtered search query. - - We intentionally require author-filter syntax to keep page semantics - aligned with the question ("books by "). - """ - target_author = cls._extract_author_filter(search_query) - if not target_author: - return None - - matched_entry: Optional[Dict[str, Any]] = None - - for key, entry in collected.items(): - if not key.startswith("ol:") or not isinstance(entry, dict): - continue - works = entry.get("works") - if not isinstance(works, dict): - continue - if entry.get("sort") != sort: - continue - - entry_query = str(entry.get("query", "")) - if not entry_query.strip(): - continue - - entry_author = cls._extract_author_filter(entry_query) - if entry_author == target_author: - matched_entry = entry - - return matched_entry - async def validate_answer( self, answer: str, diff --git a/liveweb_arena/plugins/openlibrary/templates/author_engagement_extrema.py b/liveweb_arena/plugins/openlibrary/templates/author_engagement_extrema.py new file mode 100644 index 0000000..86fc78b --- /dev/null +++ b/liveweb_arena/plugins/openlibrary/templates/author_engagement_extrema.py @@ -0,0 +1,261 @@ +"""Author engagement extrema template for Open Library - MEDIUM DIFFICULTY. 
"""Author engagement extrema template for Open Library - MEDIUM DIFFICULTY.

RL-friendly design:
- Requires searching for an author and scanning multiple results
- Dynamic data: want_to_read counts and ratings change continuously
- Entity pool: 81 authors × (highest-wtr: 7 + highest-rc: 2 + lowest-wtr: 3) = 972 variants
- Computation required: must compare values across N books to find extremum
- Strict sort matching: GT only accepts data from sort=editions pages (no unsorted fallback)
- Missing ratings_count causes GT failure; only want_to_read_count defaults to 0 when absent
- ratings_count variants capped to N∈{3,5} to limit GT-fail from sparse OL data
"""

import random
from enum import Enum
from typing import Any, Dict, Optional
from urllib.parse import quote_plus

from liveweb_arena.core.ground_truth_trigger import (
    GroundTruthResult,
    TriggerConfig,
    UrlPatternTrigger,
)
from liveweb_arena.core.gt_collector import GTSourceType
from liveweb_arena.core.validators.base import (
    GeneratedQuestion,
    QuestionTemplate,
    ValidationResult,
    register_template,
)
from .author_editions import ENGAGEMENT_AUTHOR_POOL
from .common import find_author_search_entry, get_collected_data, safe_metric_value


class ExtremaType(Enum):
    """Direction of the extremum the question asks for."""
    HIGHEST = "highest"
    LOWEST = "lowest"


class EngagementMetric(Enum):
    """Reader engagement metrics: ``(work-dict key, label used in the question)``."""
    WANT_TO_READ = ("want_to_read_count", "want-to-read count")
    RATINGS_COUNT = ("ratings_count", "number of ratings")


# LOWEST questions never use ratings_count: the OL API omits the field for
# unrated works, and missing-as-zero would always "win" lowest.
_LOWEST_METRICS = [EngagementMetric.WANT_TO_READ]

# Result counts for HIGHEST want-to-read questions.
RESULT_COUNTS = [3, 5, 7, 10, 15, 20, 25]

# LOWEST questions stay at small N. At work_count >= 10 many authors have
# missing want_to_read_count entries that coerce to 0, which would make the
# GT answer simply the alphabetically first zero-count book.
_LOWEST_RESULT_COUNTS = [3, 5, 7]

# ratings_count is sparse in OL data (20-40% of top-N missing at N≥7), so it
# is capped to small N where coverage is highest, limiting GT-fail exposure.
_RATINGS_RESULT_COUNTS = [3, 5]

# Question phrasings per direction; placeholders: author, n, metric_label.
PATTERNS = {
    ExtremaType.HIGHEST: [
        (
            'Search Open Library for books by "{author}" sorted by most editions. '
            "Among the first {n} results, which book has the highest {metric_label}? "
            "Answer with the book title only."
        ),
        (
            'On Open Library, look up books by "{author}" (most editions). '
            "Of the top {n} results, which has the most {metric_label}? "
            "Reply with just the title."
        ),
    ],
    ExtremaType.LOWEST: [
        (
            'Search Open Library for books by "{author}" sorted by most editions. '
            "Among the first {n} results, which book has the lowest {metric_label}? "
            "Answer with the book title only."
        ),
        (
            'On Open Library, look up books by "{author}" (most editions). '
            "Of the top {n} results, which has the fewest {metric_label}? "
            "Reply with just the title."
        ),
    ],
}


@register_template("openlibrary_author_engagement_extrema")
class OpenLibraryAuthorEngagementExtremaTemplate(QuestionTemplate):
    """Ask which of an author's top N works has the highest/lowest metric.

    The ground truth scans the top N works of an editions-sorted author
    search and picks the extremum, breaking ties by case-folded title.
    """

    GT_SOURCE = GTSourceType.PAGE_ONLY

    def __init__(self):
        super().__init__("openlibrary_author_engagement_extrema")

    def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
        """Build a deterministic question for *seed*.

        Author, direction, metric and phrasing come from the seeded RNG. When
        *variant* is given it selects the result count (modulo the size of
        the applicable count table); otherwise the count is drawn too.
        """
        rng = random.Random(seed)

        author_name, author_query = rng.choice(ENGAGEMENT_AUTHOR_POOL)
        extrema = rng.choice(list(ExtremaType))
        wants_lowest = extrema == ExtremaType.LOWEST

        metric_pool = _LOWEST_METRICS if wants_lowest else list(EngagementMetric)
        metric = rng.choice(metric_pool)
        metric_key, metric_label = metric.value

        # Pick the count table that applies to this direction/metric pair.
        if wants_lowest:
            counts = _LOWEST_RESULT_COUNTS
        elif metric == EngagementMetric.RATINGS_COUNT:
            counts = _RATINGS_RESULT_COUNTS
        else:
            counts = RESULT_COUNTS

        if variant is None:
            count = rng.choice(counts)
        else:
            count = counts[variant % len(counts)]

        search_query = f'author:"{author_query}"'
        template_text = rng.choice(PATTERNS[extrema])
        question_text = template_text.format(
            author=author_name,
            n=count,
            metric_label=metric_label,
        )
        start_url = (
            "https://openlibrary.org/search"
            f"?q={quote_plus(search_query)}&sort=editions"
        )

        variables = {
            "author": author_name,
            "work_count": count,
            "extrema": extrema.value,
            "metric": metric_key,
        }
        validation_info = {
            "author_name": author_name,
            "author_query": author_query,
            "search_query": search_query,
            "sort": "editions",
            "work_count": count,
            "extrema": extrema.value,
            "metric": metric_key,
            "metric_label": metric_label,
        }
        return GeneratedQuestion(
            question_text=question_text,
            start_url=start_url,
            variables=variables,
            validation_info=validation_info,
            template_name=self.name,
            expected_steps=7,
        )

    def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
        """Render the scoring rubric text for this question."""
        author = validation_info.get("author_name", "")
        count = validation_info.get("work_count", "")
        extrema = validation_info.get("extrema", "")
        metric_label = validation_info.get("metric_label", "")
        rule_lines = [
            "Task-Specific Rules (Open Library Author Engagement Extrema):",
            f'- Author: "{author}"',
            f"- Find the {extrema} {metric_label} among the first {count} results",
            "- Score 1.0: Correct book title",
            "- Score 0.0: Wrong title or no answer",
            "- Tie rule: alphabetically earlier title wins",
        ]
        return "\n".join(rule_lines)

    async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
        """Return the title of the extremal work among the top N collected works.

        Fails when inputs are malformed, fewer than N works were collected,
        a work has no string title, or a metric value cannot be read.
        """
        collected = get_collected_data()
        if not collected:
            return GroundTruthResult.fail("No Open Library data collected")

        author_name = validation_info.get("author_name")
        search_query = validation_info.get("search_query")
        sort = validation_info.get("sort")
        work_count = validation_info.get("work_count")
        extrema = validation_info.get("extrema")
        metric = validation_info.get("metric")

        text_inputs = (author_name, search_query, sort, extrema, metric)
        inputs_ok = isinstance(work_count, int) and all(
            isinstance(item, str) for item in text_inputs
        )
        if not inputs_ok:
            return GroundTruthResult.fail("Missing or invalid extrema inputs")
        if work_count <= 0:
            return GroundTruthResult.fail(f"Invalid work_count: {work_count}")

        entry = find_author_search_entry(
            collected,
            search_query=search_query,
            sort=sort,
        )
        if entry is None:
            ol_keys = [key for key in collected if key.startswith("ol:")][:5]
            return GroundTruthResult.not_collected(
                f"Did not collect search data for author '{author_name}' "
                f"sorted by '{sort}'. Collected OL keys: {ol_keys}"
            )

        works_dict = entry.get("works")
        if not isinstance(works_dict, dict):
            return GroundTruthResult.fail("Collected search data missing works dictionary")
        if len(works_dict) < work_count:
            return GroundTruthResult.fail(
                f"Only {len(works_dict)} works collected, need {work_count}"
            )

        # Works without a rank sort after ranked ones (default 999).
        by_rank = sorted(works_dict.values(), key=lambda item: item.get("rank", 999))

        best_title: Optional[str] = None
        best_value: Optional[float] = None
        for work in by_rank[:work_count]:
            title = work.get("title")
            if not isinstance(title, str):
                return GroundTruthResult.fail("Work missing title field")
            try:
                value = safe_metric_value(work, metric)
            except ValueError as exc:
                return GroundTruthResult.fail(str(exc))

            if best_value is None:
                replaces_best = True
            elif extrema == "highest" and value > best_value:
                replaces_best = True
            elif extrema == "lowest" and value < best_value:
                replaces_best = True
            else:
                # Tie: the case-folded alphabetically earlier title wins.
                replaces_best = (
                    value == best_value
                    and title.casefold() < best_title.casefold()
                )

            if replaces_best:
                best_title = title
                best_value = value

        if best_title is None:
            return GroundTruthResult.fail("No works with valid metric values found")

        return GroundTruthResult.ok(best_title)

    async def validate_answer(
        self,
        answer: str,
        validation_info: Dict[str, Any],
    ) -> ValidationResult:
        """Return a zero-score placeholder whose details say "Use LLM validation"."""
        return ValidationResult(
            score=0.0,
            is_correct=False,
            expected=None,
            actual=answer,
            details="Use LLM validation",
        )

    def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
        """Trigger ground-truth collection on openlibrary.org URLs."""
        return TriggerConfig(trigger=UrlPatternTrigger(domains=["openlibrary.org"]))

    @classmethod
    def get_cache_source(cls) -> str:
        return "openlibrary"

    def get_gt_source(self) -> GTSourceType:
        return self.GT_SOURCE
import re
from typing import Any, Dict, Optional

# Metrics whose absence in a work dict is read as zero rather than an error.
_ZERO_DEFAULTABLE_METRICS = frozenset({"want_to_read_count"})

# Compiled once at import: the author matcher applies these to every
# collected entry on each ground-truth lookup.
_AUTHOR_TOKEN_RE = re.compile(r"[a-z0-9]+")
_AUTHOR_FILTER_RE = re.compile(r"^author\s*:\s*(.+)$")


def safe_metric_value(work: Dict[str, Any], metric: str) -> float:
    """Read an engagement metric from a work dict.

    For metrics in ``_ZERO_DEFAULTABLE_METRICS`` (currently only
    ``want_to_read_count``), absent values are treated as ``0.0``
    because the OL API omits that field when no one has marked the
    book. For all other metrics (e.g. ``ratings_count``), absence
    raises ``ValueError`` — the data is too sparse to assume zero
    is semantically correct.

    Non-null values that cannot be parsed as a number always raise
    ``ValueError`` so callers can surface a proper GT failure.

    Args:
        work: Collected work record.
        metric: Key of the metric to read from *work*.

    Returns:
        The parsed numeric value, or ``0.0`` for an absent zero-defaultable
        metric.

    Raises:
        ValueError: The metric is absent (and not zero-defaultable) or its
            value is not numeric.
    """
    raw = work.get(metric)
    if raw is None:
        if metric in _ZERO_DEFAULTABLE_METRICS:
            return 0.0
        title = work.get("title", "")
        raise ValueError(
            f"Missing '{metric}' for work '{title}'"
        )
    # parse_numeric is the numeric parser defined earlier in this module.
    parsed = parse_numeric(raw)
    if parsed is None:
        title = work.get("title", "")
        raise ValueError(
            f"Non-numeric '{metric}' value {raw!r} for work '{title}'"
        )
    return parsed


def normalize_author_fragment(value: str) -> str:
    """Normalize author text by stripping punctuation and collapsing whitespace.

    The text is lower-cased and reduced to its ``[a-z0-9]+`` runs joined by
    single spaces, so ``"H.G.  Wells"`` becomes ``"h g wells"``.
    """
    return " ".join(_AUTHOR_TOKEN_RE.findall(value.lower()))


def extract_author_filter(query: str) -> Optional[str]:
    """
    Extract normalized author text from author-filter queries.

    Accepts query forms like:
    - author:"mark twain"
    - AUTHOR: "Mark Twain"
    - author:'h.g. wells'

    Returns None if the query is not an author-filter query (including
    when it is not a string at all, or normalizes to nothing).
    """
    # Collected entries and validation info come from outside this module;
    # a non-string query is "not an author filter", not a crash.
    if not isinstance(query, str):
        return None

    cleaned = query.strip().lower()
    if not cleaned:
        return None

    match = _AUTHOR_FILTER_RE.match(cleaned)
    if not match:
        return None

    rhs = match.group(1).strip()
    # Drop one pair of matching surrounding quotes, if present.
    if len(rhs) >= 2 and rhs[0] == rhs[-1] and rhs[0] in {'"', "'"}:
        rhs = rhs[1:-1].strip()

    normalized = normalize_author_fragment(rhs)
    return normalized or None


def find_author_search_entry(
    collected: Dict[str, Dict[str, Any]],
    *,
    search_query: str,
    sort: str,
    allow_unsorted_fallback: bool = False,
) -> Optional[Dict[str, Any]]:
    """
    Find search data for an author-filtered search query.

    The *search_query* passed by the template always uses ``author:"name"``
    syntax, but the agent may have typed a plain-text query like
    ``agatha christie`` instead. To handle both cases the collected entry's
    query is first checked for ``author:`` syntax; if that is absent the
    raw query text is normalized and compared directly.

    By default, this matcher is strict about sort order. When
    ``allow_unsorted_fallback=True``, it first prefers an exact sort match and
    only falls back to entries with no sort parameter when no exact match was
    collected.

    A ``sort`` of ``None`` is treated like an empty string, i.e. a request
    for unsorted results, instead of raising ``AttributeError``.

    Only entries whose key starts with ``"ol:"`` and that carry a ``works``
    dict are considered. When several entries qualify, the one encountered
    last while iterating *collected* is returned.

    Returns:
        The matching collected entry, or ``None`` when *search_query* is not
        an author-filter query or nothing matches.
    """
    target_author = extract_author_filter(search_query)
    if not target_author:
        return None

    # Coerce the requested sort exactly like the entry-side sort below so the
    # two are compared on equal terms and a missing sort cannot crash.
    requested_sort = str(sort).strip() if sort is not None else ""
    matched_exact: Optional[Dict[str, Any]] = None
    matched_unsorted: Optional[Dict[str, Any]] = None

    for key, entry in collected.items():
        if not key.startswith("ol:") or not isinstance(entry, dict):
            continue
        if not isinstance(entry.get("works"), dict):
            continue

        entry_query = str(entry.get("query", ""))
        if not entry_query.strip():
            continue

        # Prefer author: syntax; otherwise compare the plain-text query.
        entry_author = extract_author_filter(entry_query)
        if entry_author is None:
            entry_author = normalize_author_fragment(entry_query)
        if entry_author != target_author:
            continue

        sort_value = entry.get("sort")
        entry_sort = str(sort_value).strip() if sort_value is not None else ""

        if entry_sort == requested_sort:
            matched_exact = entry
            continue

        if allow_unsorted_fallback and not entry_sort:
            matched_unsorted = entry

    return matched_exact if matched_exact is not None else matched_unsorted
"""Reading stats filter template for Open Library - HARD DIFFICULTY.

RL-friendly design:
- Requires searching for an author and scanning engagement metrics per book
- Dynamic data: want_to_read counts and ratings change continuously
- Entity pool: 81 authors × (wtr: 4 thresholds × 3 counts + rc: 4 thresholds × 1 count) = 1,296 variants
- Counting task: agent must check each book against a threshold (no single-sort shortcut)
- ratings_count variants capped to N=5 to limit GT-fail from sparse OL data
"""

import random
from enum import Enum
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus

from liveweb_arena.core.ground_truth_trigger import (
    GroundTruthResult,
    TriggerConfig,
    UrlPatternTrigger,
)
from liveweb_arena.core.gt_collector import GTSourceType
from liveweb_arena.core.validators.base import (
    GeneratedQuestion,
    QuestionTemplate,
    ValidationResult,
    register_template,
)
from .author_editions import ENGAGEMENT_AUTHOR_POOL
from .common import find_author_search_entry, get_collected_data, safe_metric_value


class ReaderMetric(Enum):
    """Reader engagement metrics: ``(work-dict key, label used in the question)``.

    Thresholds for each metric live in ``THRESHOLDS``.
    """
    WANT_TO_READ = ("want_to_read_count", "people who want to read them")
    RATINGS_COUNT = ("ratings_count", "ratings")


# Candidate "more than X" cut-offs per metric.
THRESHOLDS: Dict[ReaderMetric, List[int]] = {
    ReaderMetric.WANT_TO_READ: [100, 200, 500, 1000],
    ReaderMetric.RATINGS_COUNT: [30, 50, 100, 200],
}

# Result counts for want-to-read questions.
RESULT_COUNTS = [5, 10, 15]

# ratings_count is sparse in OL data (22% of authors missing at N=5, 57% at
# N=10), so ratings questions are capped to N=5 to keep GT-fail exposure
# under ~11%.
_RATINGS_RESULT_COUNTS = [5]

# Question phrasings; placeholders: author, n, threshold, metric_label.
PATTERNS = [
    (
        'Search Open Library for books by "{author}" sorted by most editions. '
        "Among the first {n} results, how many have more than {threshold} "
        "{metric_label}?"
    ),
    (
        'On Open Library, look up books by "{author}" (most editions). '
        "Of the top {n} results, count how many have over {threshold} "
        "{metric_label}."
    ),
    (
        'Find books by "{author}" on Open Library (most editions). '
        "How many of the first {n} results have more than {threshold} "
        "{metric_label}?"
    ),
]


@register_template("openlibrary_reading_stats_filter")
class OpenLibraryReadingStatsFilterTemplate(QuestionTemplate):
    """Ask how many of an author's top N works exceed an engagement threshold.

    The ground truth scans the top N works of an editions-sorted author
    search and counts those whose metric is strictly above the threshold.
    The results are ordered by editions while the threshold applies to a
    different metric, so every work has to be checked individually.
    """

    GT_SOURCE = GTSourceType.PAGE_ONLY

    def __init__(self):
        super().__init__("openlibrary_reading_stats_filter")

    def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
        """Build a deterministic question for *seed*.

        When *variant* is given it selects the metric (modulo the number of
        metrics); author, result count, threshold and phrasing always come
        from the seeded RNG.
        """
        rng = random.Random(seed)

        metric_choices = list(ReaderMetric)
        if variant is None:
            metric = rng.choice(metric_choices)
        else:
            metric = metric_choices[variant % len(metric_choices)]
        metric_key, metric_label = metric.value

        author_name, author_query = rng.choice(ENGAGEMENT_AUTHOR_POOL)

        if metric == ReaderMetric.RATINGS_COUNT:
            counts = _RATINGS_RESULT_COUNTS
        else:
            counts = RESULT_COUNTS
        count = rng.choice(counts)
        threshold = rng.choice(THRESHOLDS[metric])

        search_query = f'author:"{author_query}"'
        template_text = rng.choice(PATTERNS)
        question_text = template_text.format(
            author=author_name,
            n=count,
            threshold=threshold,
            metric_label=metric_label,
        )
        start_url = (
            "https://openlibrary.org/search"
            f"?q={quote_plus(search_query)}&sort=editions"
        )

        variables = {
            "author": author_name,
            "work_count": count,
            "metric": metric_key,
            "threshold": threshold,
        }
        validation_info = {
            "author_name": author_name,
            "author_query": author_query,
            "search_query": search_query,
            "sort": "editions",
            "work_count": count,
            "metric": metric_key,
            "metric_label": metric_label,
            "threshold": threshold,
        }
        return GeneratedQuestion(
            question_text=question_text,
            start_url=start_url,
            variables=variables,
            validation_info=validation_info,
            template_name=self.name,
            expected_steps=8,
        )

    def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
        """Render the scoring rubric text for this question."""
        author = validation_info.get("author_name", "")
        count = validation_info.get("work_count", "")
        metric_label = validation_info.get("metric_label", "")
        threshold = validation_info.get("threshold", "")
        rule_lines = [
            "Task-Specific Rules (Open Library Reading Stats Filter):",
            f'- Author: "{author}"',
            f"- Count books among top {count} with > {threshold} {metric_label}",
            "- Score 1.0: Exact count match",
            "- Score 0.5: Count within ±1 of correct answer",
            "- Score 0.0: Wrong count or no answer",
        ]
        return "\n".join(rule_lines)

    async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
        """Count the top N collected works whose metric exceeds the threshold.

        Fails when inputs are malformed, fewer than N works were collected,
        or a metric value cannot be read for one of the top N works.
        """
        collected = get_collected_data()
        if not collected:
            return GroundTruthResult.fail("No Open Library data collected")

        author_name = validation_info.get("author_name")
        search_query = validation_info.get("search_query")
        sort = validation_info.get("sort")
        work_count = validation_info.get("work_count")
        metric = validation_info.get("metric")
        threshold = validation_info.get("threshold")

        text_inputs = (author_name, search_query, sort, metric)
        inputs_ok = (
            all(isinstance(item, str) for item in text_inputs)
            and isinstance(work_count, int)
            and isinstance(threshold, int)
        )
        if not inputs_ok:
            return GroundTruthResult.fail("Missing or invalid filter inputs")
        if work_count <= 0:
            return GroundTruthResult.fail(f"Invalid work_count: {work_count}")

        entry = find_author_search_entry(
            collected,
            search_query=search_query,
            sort=sort,
        )
        if entry is None:
            ol_keys = [key for key in collected if key.startswith("ol:")][:5]
            return GroundTruthResult.not_collected(
                f"Did not collect search data for author '{author_name}' "
                f"sorted by '{sort}'. Collected OL keys: {ol_keys}"
            )

        works_dict = entry.get("works")
        if not isinstance(works_dict, dict):
            return GroundTruthResult.fail("Collected search data missing works dictionary")
        if len(works_dict) < work_count:
            return GroundTruthResult.fail(
                f"Only {len(works_dict)} works collected for '{author_name}', "
                f"need {work_count}"
            )

        # Works without a rank sort after ranked ones (default 999).
        by_rank = sorted(works_dict.values(), key=lambda item: item.get("rank", 999))

        # Read every value first; the first unreadable one aborts the GT.
        values = []
        for work in by_rank[:work_count]:
            try:
                values.append(safe_metric_value(work, metric))
            except ValueError as exc:
                return GroundTruthResult.fail(str(exc))

        match_count = sum(1 for value in values if int(value) > threshold)
        return GroundTruthResult.ok(str(match_count))

    async def validate_answer(
        self,
        answer: str,
        validation_info: Dict[str, Any],
    ) -> ValidationResult:
        """Return a zero-score placeholder whose details say "Use LLM validation"."""
        return ValidationResult(
            score=0.0,
            is_correct=False,
            expected=None,
            actual=answer,
            details="Use LLM validation",
        )

    def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
        """Trigger ground-truth collection on openlibrary.org URLs."""
        return TriggerConfig(trigger=UrlPatternTrigger(domains=["openlibrary.org"]))

    @classmethod
    def get_cache_source(cls) -> str:
        return "openlibrary"

    def get_gt_source(self) -> GTSourceType:
        return self.GT_SOURCE
OpenLibraryAuthorComparisonTemplate, +) +from liveweb_arena.plugins.openlibrary.templates.author_engagement_extrema import ( + EngagementMetric, + OpenLibraryAuthorEngagementExtremaTemplate, +) +from liveweb_arena.plugins.openlibrary.templates.author_editions import ENGAGEMENT_AUTHOR_POOL +from liveweb_arena.plugins.openlibrary.templates.common import ( + extract_author_filter, + find_author_search_entry, + normalize_author_fragment, +) +from liveweb_arena.plugins.openlibrary.templates.reading_stats_filter import ( + OpenLibraryReadingStatsFilterTemplate, + ReaderMetric, +) + + +class _DummyCollector: + def __init__(self, data: Dict[str, Dict[str, Any]]): + self._data = data + + def get_collected_api_data(self) -> Dict[str, Dict[str, Any]]: + return self._data + + +def _run_gt(data: Dict[str, Dict[str, Any]], coro): + set_current_gt_collector(_DummyCollector(data)) + try: + return asyncio.run(coro) + finally: + set_current_gt_collector(None) + + +def _make_search_entry( + query: str, sort: Optional[str], works: List[Dict[str, Any]], +) -> Dict[str, Any]: + return { + "query": query, + "sort": sort, + "works": {work["key"]: work for work in works}, + } + + +# ── 5. 
reading_stats_filter GT behavior ─────────────────────────────── + + +def test_filter_counts_above_threshold(): + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 10000}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "want_to_read_count": 2000}, + {"key": "/works/OL3W", "rank": 3, "title": "Misery", "want_to_read_count": 2500}, + {"key": "/works/OL4W", "rank": 4, "title": "The Shining", "want_to_read_count": 150}, + {"key": "/works/OL5W", "rank": 5, "title": "Salem's Lot", "want_to_read_count": 50}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Stephen King", "author_query": "stephen king", + "search_query": 'author:"stephen king"', "sort": "editions", + "work_count": 5, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 200, + })) + assert result.success is True + assert result.value == "3" # It(10000), Carrie(2000), Misery(2500) > 200 + + +def test_filter_returns_zero_when_none_match(): + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:poe": _make_search_entry('author:"edgar allan poe"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "The Raven", "want_to_read_count": 10}, + {"key": "/works/OL2W", "rank": 2, "title": "Annabel Lee", "want_to_read_count": 5}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Edgar Allan Poe", "author_query": "edgar allan poe", + "search_query": 'author:"edgar allan poe"', "sort": "editions", + "work_count": 2, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 500, + })) + assert result.success is True + assert result.value == "0" + + +def test_filter_exact_threshold_not_counted(): + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:poe": 
_make_search_entry('author:"edgar allan poe"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "The Raven", "want_to_read_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "Annabel Lee", "want_to_read_count": 101}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Edgar Allan Poe", "author_query": "edgar allan poe", + "search_query": 'author:"edgar allan poe"', "sort": "editions", + "work_count": 2, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 100, + })) + assert result.success is True + assert result.value == "1" # only 101 > 100, not 100 > 100 + + +def test_filter_rejects_unsorted_data(): + """GT must require sort=editions; unsorted data should produce not_collected.""" + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:king": _make_search_entry("stephen king", None, [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 10000}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "want_to_read_count": 2000}, + {"key": "/works/OL3W", "rank": 3, "title": "Misery", "want_to_read_count": 2500}, + {"key": "/works/OL4W", "rank": 4, "title": "The Shining", "want_to_read_count": 150}, + {"key": "/works/OL5W", "rank": 5, "title": "Salem's Lot", "want_to_read_count": 50}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Stephen King", "author_query": "stephen king", + "search_query": 'author:"stephen king"', "sort": "editions", + "work_count": 5, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 200, + })) + assert result.success is False + assert result.is_data_not_collected() + + +def test_filter_not_collected_wrong_author(): + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:poe": _make_search_entry('author:"edgar allan poe"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "X", "want_to_read_count": 100}, 
+ ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Mark Twain", "author_query": "mark twain", + "search_query": 'author:"mark twain"', "sort": "editions", + "work_count": 5, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 100, + })) + assert result.success is False + assert result.is_data_not_collected() + + +def test_filter_missing_wtr_treated_as_zero(): + """OL API omits want_to_read_count when the value is zero; GT treats absent as 0.""" + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:poe": _make_search_entry('author:"edgar allan poe"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "The Raven", "want_to_read_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "Annabel Lee"}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Edgar Allan Poe", "author_query": "edgar allan poe", + "search_query": 'author:"edgar allan poe"', "sort": "editions", + "work_count": 2, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 50, + })) + assert result.success is True + assert result.value == "1" # only The Raven (100) > 50; Annabel Lee (0) is not + + +def test_filter_missing_ratings_count_fails_gt(): + """Missing ratings_count should cause GT failure (not default to 0).""" + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:poe": _make_search_entry('author:"edgar allan poe"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "The Raven", "ratings_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "Annabel Lee"}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Edgar Allan Poe", "author_query": "edgar allan poe", + "search_query": 'author:"edgar allan poe"', "sort": "editions", + "work_count": 2, "metric": "ratings_count", + "metric_label": "ratings", "threshold": 50, + })) + assert result.success is 
False + + +def test_filter_non_numeric_metric_causes_gt_failure(): + """Non-null non-numeric metric values should cause a GT fail via safe_metric_value.""" + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:poe": _make_search_entry('author:"edgar allan poe"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "The Raven", "want_to_read_count": "N/A"}, + {"key": "/works/OL2W", "rank": 2, "title": "Annabel Lee", "want_to_read_count": 100}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Edgar Allan Poe", "author_query": "edgar allan poe", + "search_query": 'author:"edgar allan poe"', "sort": "editions", + "work_count": 2, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 50, + })) + assert result.success is False + + +def test_filter_no_collected_data(): + tmpl = OpenLibraryReadingStatsFilterTemplate() + result = _run_gt({}, tmpl.get_ground_truth({ + "author_name": "X", "author_query": "x", + "search_query": 'author:"x"', "sort": "editions", + "work_count": 5, "metric": "want_to_read_count", + "metric_label": "people who want to read them", "threshold": 100, + })) + assert result.success is False + + +def test_filter_ratings_count_gt(): + """Verify GT works correctly with ratings_count metric (not just want_to_read).""" + tmpl = OpenLibraryReadingStatsFilterTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "ratings_count": 500}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "ratings_count": 80}, + {"key": "/works/OL3W", "rank": 3, "title": "Misery", "ratings_count": 20}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Stephen King", "author_query": "stephen king", + "search_query": 'author:"stephen king"', "sort": "editions", + "work_count": 3, "metric": "ratings_count", + "metric_label": "ratings", 
"threshold": 50, + })) + assert result.success is True + assert result.value == "2" # It(500) and Carrie(80) > 50; Misery(20) is not + + +# ── 6. Task registry ────────────────────────────────────────────────── + + +def test_task_registry_new_template_ids(): + assert TaskRegistry.TEMPLATES[96] == ( + "openlibrary", "openlibrary_author_engagement_extrema", + ) + assert TaskRegistry.TEMPLATES[97] == ( + "openlibrary", "openlibrary_author_comparison", + ) + assert TaskRegistry.TEMPLATES[98] == ( + "openlibrary", "openlibrary_reading_stats_filter", + ) + + +def test_task_registry_version_7_entry(): + found = any(sorted(v) == [96, 97, 98] for v in TaskRegistry.TEMPLATE_VERSIONS) + assert found, "No TEMPLATE_VERSIONS entry for [96, 97, 98]" + + +# ── 7. Shared helper refactoring ────────────────────────────────────── + + +def test_normalize_author_fragment(): + assert normalize_author_fragment("Mark Twain") == "mark twain" + assert normalize_author_fragment("H.G. Wells") == "h g wells" + assert normalize_author_fragment("J.K. Rowling") == "j k rowling" + assert normalize_author_fragment("") == "" + + +def test_extract_author_filter_standard(): + assert extract_author_filter('author:"mark twain"') == "mark twain" + assert extract_author_filter("AUTHOR: \"Mark Twain\"") == "mark twain" + assert extract_author_filter("author:'h.g. 
wells'") == "h g wells" + + +def test_extract_author_filter_rejects_plain_text(): + assert extract_author_filter("mark twain") is None + assert extract_author_filter("") is None + + +def test_find_author_search_entry_matches(): + collected = { + "ol:search:twain": _make_search_entry('author:"mark twain"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "X"}, + ]), + } + result = find_author_search_entry( + collected, search_query='author:"mark twain"', sort="editions", + ) + assert result is not None + assert result["query"] == 'author:"mark twain"' + + +def test_find_author_search_entry_rejects_wrong_sort(): + collected = { + "ol:search:twain": _make_search_entry('author:"mark twain"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "X"}, + ]), + } + result = find_author_search_entry( + collected, search_query='author:"mark twain"', sort="new", + ) + assert result is None + + +def test_find_author_search_entry_unsorted_fallback_disabled_by_default(): + collected = { + "ol:search:christie": _make_search_entry("agatha christie", None, [ + {"key": "/works/OL1W", "rank": 1, "title": "Styles"}, + ]), + } + result = find_author_search_entry( + collected, search_query='author:"agatha christie"', sort="editions", + ) + assert result is None + + +def test_find_author_search_entry_matches_unsorted_when_fallback_enabled(): + collected = { + "ol:search:christie": _make_search_entry("agatha christie", None, [ + {"key": "/works/OL1W", "rank": 1, "title": "Styles"}, + ]), + } + result = find_author_search_entry( + collected, + search_query='author:"agatha christie"', + sort="editions", + allow_unsorted_fallback=True, + ) + assert result is not None + assert result["query"] == "agatha christie" + + +def test_find_author_search_entry_prefers_exact_sort_over_unsorted_fallback(): + collected = { + "ol:search:unsorted": _make_search_entry("agatha christie", None, [ + {"key": "/works/OL1W", "rank": 1, "title": "Unsorted"}, + ]), + "ol:search:sorted": 
_make_search_entry("agatha christie", "editions", [ + {"key": "/works/OL2W", "rank": 1, "title": "Sorted"}, + ]), + } + result = find_author_search_entry( + collected, + search_query='author:"agatha christie"', + sort="editions", + allow_unsorted_fallback=True, + ) + assert result is not None + assert result["sort"] == "editions" + assert result["query"] == "agatha christie" + + +def test_find_author_search_entry_matches_plain_text_query(): + """Agent typed 'agatha christie' instead of 'author:\"agatha christie\"'.""" + collected = { + "ol:search:christie": _make_search_entry("agatha christie", "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "Styles"}, + ]), + } + result = find_author_search_entry( + collected, search_query='author:"agatha christie"', sort="editions", + ) + assert result is not None + assert result["query"] == "agatha christie" + + +def test_find_author_search_entry_plain_text_wrong_author_no_match(): + """Plain-text fallback must still reject a different author.""" + collected = { + "ol:search:king": _make_search_entry("stephen king", "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It"}, + ]), + } + result = find_author_search_entry( + collected, search_query='author:"agatha christie"', sort="editions", + ) + assert result is None + + +def test_comparison_matches_when_second_author_uses_plain_text(): + """Regression: author_comparison must not return not_collected when the + agent searches for the second author using plain text.""" + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 500}, + ]), + "ol:search:christie": _make_search_entry("agatha christie", "editions", [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles", "want_to_read_count": 100}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + 
"author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 1, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is True + assert result.value == "400" # abs(500 - 100) + + +# ── 8. Cross-template consistency ───────────────────────────────────── + + +@pytest.mark.parametrize("cls", [ + OpenLibraryAuthorEngagementExtremaTemplate, + OpenLibraryAuthorComparisonTemplate, + OpenLibraryReadingStatsFilterTemplate, +]) +def test_gt_source_is_page_only(cls): + assert cls().get_gt_source() == GTSourceType.PAGE_ONLY + + +@pytest.mark.parametrize("cls", [ + OpenLibraryAuthorEngagementExtremaTemplate, + OpenLibraryAuthorComparisonTemplate, + OpenLibraryReadingStatsFilterTemplate, +]) +def test_cache_source_is_openlibrary(cls): + assert cls.get_cache_source() == "openlibrary" + + +def test_engagement_extrema_metrics_use_confirmed_visible_fields(): + metric_names = {m.value[0] for m in EngagementMetric} + assert metric_names == {"want_to_read_count", "ratings_count"} + + +def test_author_comparison_metrics_use_confirmed_visible_fields(): + metric_names = {m.value[0] for m in AuthorMetric} + assert metric_names == {"want_to_read_count", "ratings_count"} + + +def test_reading_filter_metrics_use_confirmed_visible_fields(): + metric_names = {m.value[0] for m in ReaderMetric} + assert metric_names == {"want_to_read_count", "ratings_count"} + + +def test_all_new_templates_reuse_engagement_pool(): + from liveweb_arena.plugins.openlibrary.templates.author_engagement_extrema import ENGAGEMENT_AUTHOR_POOL as EX_POOL + from liveweb_arena.plugins.openlibrary.templates.author_comparison import ENGAGEMENT_AUTHOR_POOL as CMP_POOL + from liveweb_arena.plugins.openlibrary.templates.reading_stats_filter import ENGAGEMENT_AUTHOR_POOL as FLT_POOL + assert EX_POOL 
is ENGAGEMENT_AUTHOR_POOL + assert CMP_POOL is ENGAGEMENT_AUTHOR_POOL + assert FLT_POOL is ENGAGEMENT_AUTHOR_POOL + + +def test_all_validation_info_values_are_serializable(): + templates = [ + OpenLibraryAuthorEngagementExtremaTemplate(), + OpenLibraryAuthorComparisonTemplate(), + OpenLibraryReadingStatsFilterTemplate(), + ] + for tmpl in templates: + q = tmpl.generate(seed=1) + for key, val in q.validation_info.items(): + assert isinstance(val, (str, int, float, bool, type(None))), ( + f"{tmpl.name}.validation_info['{key}'] = {type(val).__name__} " + f"(not JSON-serializable)" + ) + + +# ── 9. Author pool invariants ───────────────────────────────────────── + + +def test_engagement_author_pool_size(): + assert len(ENGAGEMENT_AUTHOR_POOL) == 81, f"Expected 81 authors, got {len(ENGAGEMENT_AUTHOR_POOL)}" + + +def test_engagement_author_pool_no_duplicates(): + names = [name for name, _ in ENGAGEMENT_AUTHOR_POOL] + queries = [query for _, query in ENGAGEMENT_AUTHOR_POOL] + assert len(names) == len(set(names)), "Duplicate author names in ENGAGEMENT_AUTHOR_POOL" + assert len(queries) == len(set(queries)), "Duplicate author queries in ENGAGEMENT_AUTHOR_POOL" + + +def test_extrema_highest_ratings_count_gt(): + """Verify GT works with ratings_count metric for highest extrema.""" + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "ratings_count": 500}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "ratings_count": 200}, + {"key": "/works/OL3W", "rank": 3, "title": "Misery", "ratings_count": 300}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Stephen King", "author_query": "stephen king", + "search_query": 'author:"stephen king"', "sort": "editions", + "work_count": 3, "extrema": "highest", "metric": "ratings_count", + "metric_label": "number of ratings", + })) + assert result.success 
is True + assert result.value == "It" + + +def test_extrema_gt_succeeds_with_25_works(): + """Regression: work_count=25 must succeed when collector fetches ≥25 works.""" + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + works = [{"key": f"/works/OL{i}W", "rank": i, "title": f"Book {i}", + "want_to_read_count": 1000 - i * 10} for i in range(1, 26)] + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", works), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Stephen King", "author_query": "stephen king", + "search_query": 'author:"stephen king"', "sort": "editions", + "work_count": 25, "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert result.value == "Book 1" # highest want_to_read_count = 990 diff --git a/tests/plugins/openlibrary/test_engagement_real_api_data.py b/tests/plugins/openlibrary/test_engagement_real_api_data.py new file mode 100644 index 0000000..1d49719 --- /dev/null +++ b/tests/plugins/openlibrary/test_engagement_real_api_data.py @@ -0,0 +1,312 @@ +"""End-to-end GT computation tests using REAL Open Library API data. + +Data fetched live on March 26, 2026 via: + curl "https://openlibrary.org/search.json?q=author%3A%22{author}%22&sort=editions&limit=10&fields=key,title,want_to_read_count,ratings_count,edition_count" + +These tests verify CLAUDE.md §5 item 1: "GT must return a concrete value." +They inject real API response structure (field names, nesting, types) into the +GT collector and confirm each template computes a concrete answer. 
+""" + +import asyncio +from typing import Any, Dict, List, Optional + +from liveweb_arena.core.gt_collector import set_current_gt_collector +from liveweb_arena.plugins.openlibrary.templates.author_comparison import ( + OpenLibraryAuthorComparisonTemplate, +) +from liveweb_arena.plugins.openlibrary.templates.author_engagement_extrema import ( + OpenLibraryAuthorEngagementExtremaTemplate, +) +from liveweb_arena.plugins.openlibrary.templates.reading_stats_filter import ( + OpenLibraryReadingStatsFilterTemplate, +) + + +class _DummyCollector: + def __init__(self, data: Dict[str, Dict[str, Any]]): + self._data = data + + def get_collected_api_data(self) -> Dict[str, Dict[str, Any]]: + return self._data + + +def _run_gt(data: Dict[str, Dict[str, Any]], coro): + set_current_gt_collector(_DummyCollector(data)) + try: + return asyncio.run(coro) + finally: + set_current_gt_collector(None) + + +def _make_search_entry( + query: str, sort: Optional[str], works: List[Dict[str, Any]], +) -> Dict[str, Any]: + return { + "query": query, + "sort": sort, + "works": {work["key"]: work for work in works}, + } + + +# ── Real API data (fetched March 26, 2026) ──────────────────────────── + +AGATHA_CHRISTIE_TOP10 = [ + {"key": "/works/OL472715W", "rank": 1, "title": "The Mysterious Affair at Styles", "ratings_count": 84, "want_to_read_count": 620}, + {"key": "/works/OL471789W", "rank": 2, "title": "The Secret Adversary", "ratings_count": 33, "want_to_read_count": 295}, + {"key": "/works/OL472073W", "rank": 3, "title": "Murder on the Links", "ratings_count": 22, "want_to_read_count": 340}, + {"key": "/works/OL471576W", "rank": 4, "title": "Murder on the Orient Express", "ratings_count": 116, "want_to_read_count": 1355}, + {"key": "/works/OL471932W", "rank": 5, "title": "The Murder of Roger Ackroyd", "ratings_count": 76, "want_to_read_count": 699}, + {"key": "/works/OL471940W", "rank": 6, "title": "Poirot investigates", "ratings_count": 16, "want_to_read_count": 290}, + {"key": 
"/works/OL471565W", "rank": 7, "title": "And Then There Were None", "ratings_count": 164, "want_to_read_count": 1728}, + {"key": "/works/OL472549W", "rank": 8, "title": "The Man in the Brown Suit", "ratings_count": 18, "want_to_read_count": 228}, + {"key": "/works/OL471724W", "rank": 9, "title": "Death on the Nile", "ratings_count": 24, "want_to_read_count": 677}, + {"key": "/works/OL471509W", "rank": 10, "title": "The A.B.C. Murders", "ratings_count": 59, "want_to_read_count": 1056}, +] + +STEPHEN_KING_TOP10 = [ + {"key": "/works/OL81626W", "rank": 1, "title": "Carrie", "ratings_count": 160, "want_to_read_count": 2341}, + {"key": "/works/OL81632W", "rank": 2, "title": "\u2018Salem\u2019s Lot", "ratings_count": 93, "want_to_read_count": 1349}, + {"key": "/works/OL81634W", "rank": 3, "title": "Misery", "ratings_count": 135, "want_to_read_count": 2504}, + {"key": "/works/OL81633W", "rank": 4, "title": "The Shining", "ratings_count": 273, "want_to_read_count": 2874}, + {"key": "/works/OL81613W", "rank": 5, "title": "It", "ratings_count": 488, "want_to_read_count": 10362}, + {"key": "/works/OL81628W", "rank": 6, "title": "The Gunslinger", "ratings_count": 62, "want_to_read_count": 1061}, + {"key": "/works/OL81631W", "rank": 7, "title": "Pet Sematary", "ratings_count": 171, "want_to_read_count": 2238}, + {"key": "/works/OL81629W", "rank": 8, "title": "The Green Mile", "ratings_count": 104, "want_to_read_count": 1378}, + {"key": "/works/OL81630W", "rank": 9, "title": "The Dead Zone", "ratings_count": 46, "want_to_read_count": 524}, + {"key": "/works/OL81618W", "rank": 10, "title": "The Stand", "ratings_count": 85, "want_to_read_count": 1184}, +] + +NEIL_GAIMAN_TOP10 = [ + {"key": "/works/OL679358W", "rank": 1, "title": "Coraline", "ratings_count": 196, "want_to_read_count": 2546}, + {"key": "/works/OL453936W", "rank": 2, "title": "Good Omens", "ratings_count": 87, "want_to_read_count": 1038}, + {"key": "/works/OL679360W", "rank": 3, "title": "American Gods", 
"ratings_count": 56, "want_to_read_count": 665}, + {"key": "/works/OL15833328W", "rank": 4, "title": "Stardust", "ratings_count": 81, "want_to_read_count": 427}, + {"key": "/works/OL16804661W", "rank": 5, "title": "The Ocean at the End of the Lane", "ratings_count": 114, "want_to_read_count": 417}, + {"key": "/works/OL679333W", "rank": 6, "title": "Neverwhere", "ratings_count": 122, "want_to_read_count": 297}, + {"key": "/works/OL679266W", "rank": 7, "title": "Anansi Boys", "ratings_count": 71, "want_to_read_count": 174}, + {"key": "/works/OL679348W", "rank": 8, "title": "The Graveyard Book", "ratings_count": 121, "want_to_read_count": 509}, + {"key": "/works/OL679359W", "rank": 9, "title": "Fragile Things", "ratings_count": 8, "want_to_read_count": 109}, + {"key": "/works/OL101948W", "rank": 10, "title": "The swords of Lankhmar", "ratings_count": 2, "want_to_read_count": 14}, +] + + +def _christie_collected(): + return { + "ol:search:christie": _make_search_entry( + 'author:"agatha christie"', "editions", AGATHA_CHRISTIE_TOP10, + ), + } + + +def _king_collected(): + return { + "ol:search:king": _make_search_entry( + 'author:"stephen king"', "editions", STEPHEN_KING_TOP10, + ), + } + + +def _gaiman_collected(): + return { + "ol:search:gaiman": _make_search_entry( + 'author:"neil gaiman"', "editions", NEIL_GAIMAN_TOP10, + ), + } + + +# ── T96: author_engagement_extrema with real data ───────────────────── + + +class TestT96RealData: + """GT computation for T96 using real OL API data.""" + + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + + def test_highest_want_to_read_top5_christie(self): + result = _run_gt(_christie_collected(), self.tmpl.get_ground_truth({ + "author_name": "Agatha Christie", + "author_query": "agatha christie", + "search_query": 'author:"agatha christie"', + "sort": "editions", "work_count": 5, + "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert 
result.value == "Murder on the Orient Express" # 1355 + + def test_highest_ratings_count_top5_christie(self): + result = _run_gt(_christie_collected(), self.tmpl.get_ground_truth({ + "author_name": "Agatha Christie", + "author_query": "agatha christie", + "search_query": 'author:"agatha christie"', + "sort": "editions", "work_count": 5, + "extrema": "highest", "metric": "ratings_count", + "metric_label": "number of ratings", + })) + assert result.success is True + assert result.value == "Murder on the Orient Express" # 116 + + def test_lowest_want_to_read_top5_king(self): + result = _run_gt(_king_collected(), self.tmpl.get_ground_truth({ + "author_name": "Stephen King", + "author_query": "stephen king", + "search_query": 'author:"stephen king"', + "sort": "editions", "work_count": 5, + "extrema": "lowest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert result.value == "\u2018Salem\u2019s Lot" # 1349 + + def test_highest_ratings_count_top3_gaiman(self): + result = _run_gt(_gaiman_collected(), self.tmpl.get_ground_truth({ + "author_name": "Neil Gaiman", + "author_query": "neil gaiman", + "search_query": 'author:"neil gaiman"', + "sort": "editions", "work_count": 3, + "extrema": "highest", "metric": "ratings_count", + "metric_label": "number of ratings", + })) + assert result.success is True + assert result.value == "Coraline" # 196 + + def test_highest_want_to_read_top7_king(self): + result = _run_gt(_king_collected(), self.tmpl.get_ground_truth({ + "author_name": "Stephen King", + "author_query": "stephen king", + "search_query": 'author:"stephen king"', + "sort": "editions", "work_count": 7, + "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert result.value == "It" # 10362 + + +# ── T97: author_comparison with real data ────────────────────────────── + + +class TestT97RealData: + """GT computation for T97 
using real OL API data.""" + + tmpl = OpenLibraryAuthorComparisonTemplate() + + def test_want_to_read_difference_christie_vs_king_top5(self): + collected = {**_christie_collected(), **_king_collected()} + result = _run_gt(collected, self.tmpl.get_ground_truth({ + "author_a_name": "Agatha Christie", + "author_a_query": "agatha christie", + "search_query_a": 'author:"agatha christie"', + "author_b_name": "Stephen King", + "author_b_query": "stephen king", + "search_query_b": 'author:"stephen king"', + "sort": "editions", "work_count": 5, + "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is True + # Christie top 5 wtr: 620+295+340+1355+699 = 3309 + # King top 5 wtr: 2341+1349+2504+2874+10362 = 19430 + assert result.value == str(abs(3309 - 19430)) # "16121" + + def test_ratings_count_difference_christie_vs_gaiman_top3(self): + collected = {**_christie_collected(), **_gaiman_collected()} + result = _run_gt(collected, self.tmpl.get_ground_truth({ + "author_a_name": "Agatha Christie", + "author_a_query": "agatha christie", + "search_query_a": 'author:"agatha christie"', + "author_b_name": "Neil Gaiman", + "author_b_query": "neil gaiman", + "search_query_b": 'author:"neil gaiman"', + "sort": "editions", "work_count": 3, + "metric": "ratings_count", + "metric_label": "total number of ratings", + })) + assert result.success is True + # Christie top 3 rc: 84+33+22 = 139 + # Gaiman top 3 rc: 196+87+56 = 339 + assert result.value == str(abs(139 - 339)) # "200" + + def test_want_to_read_difference_king_vs_gaiman_top3(self): + collected = {**_king_collected(), **_gaiman_collected()} + result = _run_gt(collected, self.tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Neil Gaiman", + "author_b_query": "neil gaiman", + "search_query_b": 'author:"neil gaiman"', + "sort": "editions", "work_count": 3, + "metric": 
"want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is True + # King top 3 wtr: 2341+1349+2504 = 6194 + # Gaiman top 3 wtr: 2546+1038+665 = 4249 + assert result.value == str(abs(6194 - 4249)) # "1945" + + +# ── T98: reading_stats_filter with real data ─────────────────────────── + + +class TestT98RealData: + """GT computation for T98 using real OL API data.""" + + tmpl = OpenLibraryReadingStatsFilterTemplate() + + def test_want_to_read_above_500_top5_christie(self): + result = _run_gt(_christie_collected(), self.tmpl.get_ground_truth({ + "author_name": "Agatha Christie", + "author_query": "agatha christie", + "search_query": 'author:"agatha christie"', + "sort": "editions", "work_count": 5, + "metric": "want_to_read_count", + "metric_label": "people who want to read them", + "threshold": 500, + })) + assert result.success is True + # Styles=620>500 ✓, Adversary=295 ✗, Links=340 ✗, Orient=1355>500 ✓, Ackroyd=699>500 ✓ + assert result.value == "3" + + def test_ratings_count_above_50_top5_king(self): + result = _run_gt(_king_collected(), self.tmpl.get_ground_truth({ + "author_name": "Stephen King", + "author_query": "stephen king", + "search_query": 'author:"stephen king"', + "sort": "editions", "work_count": 5, + "metric": "ratings_count", + "metric_label": "ratings", + "threshold": 50, + })) + assert result.success is True + # Carrie=160>50 ✓, Salem=93>50 ✓, Misery=135>50 ✓, Shining=273>50 ✓, It=488>50 ✓ + assert result.value == "5" + + def test_want_to_read_above_1000_top10_gaiman(self): + result = _run_gt(_gaiman_collected(), self.tmpl.get_ground_truth({ + "author_name": "Neil Gaiman", + "author_query": "neil gaiman", + "search_query": 'author:"neil gaiman"', + "sort": "editions", "work_count": 10, + "metric": "want_to_read_count", + "metric_label": "people who want to read them", + "threshold": 1000, + })) + assert result.success is True + # Coraline=2546>1000 ✓, Good Omens=1038>1000 ✓, rest < 1000 + assert 
result.value == "2" + + def test_ratings_count_above_100_top5_gaiman(self): + result = _run_gt(_gaiman_collected(), self.tmpl.get_ground_truth({ + "author_name": "Neil Gaiman", + "author_query": "neil gaiman", + "search_query": 'author:"neil gaiman"', + "sort": "editions", "work_count": 5, + "metric": "ratings_count", + "metric_label": "ratings", + "threshold": 100, + })) + assert result.success is True + # Coraline=196>100 ✓, Good Omens=87 ✗, American Gods=56 ✗, Stardust=81 ✗, Ocean=114>100 ✓ + assert result.value == "2" diff --git a/tests/plugins/openlibrary/test_engagement_templates.py b/tests/plugins/openlibrary/test_engagement_templates.py new file mode 100644 index 0000000..3398a39 --- /dev/null +++ b/tests/plugins/openlibrary/test_engagement_templates.py @@ -0,0 +1,533 @@ +"""Tests for Open Library engagement & comparison templates (part 1). + +Covers: +1. Template registration and generation invariants +2. author_engagement_extrema GT behavior and edge cases +3. author_comparison GT behavior and edge cases + +Part 2 (reading_stats_filter, helpers, registry, consistency) is in +test_engagement_filter_and_helpers.py. 
+""" + +import asyncio +from typing import Any, Dict, List, Optional + +import pytest + +from liveweb_arena.core.gt_collector import set_current_gt_collector +from liveweb_arena.core.validators.base import get_registered_templates +from liveweb_arena.plugins.openlibrary.templates.author_comparison import ( + OpenLibraryAuthorComparisonTemplate, +) +from liveweb_arena.plugins.openlibrary.templates.author_engagement_extrema import ( + OpenLibraryAuthorEngagementExtremaTemplate, +) +from liveweb_arena.plugins.openlibrary.templates.reading_stats_filter import ( + OpenLibraryReadingStatsFilterTemplate, +) + + +class _DummyCollector: + def __init__(self, data: Dict[str, Dict[str, Any]]): + self._data = data + + def get_collected_api_data(self) -> Dict[str, Dict[str, Any]]: + return self._data + + +def _run_gt(data: Dict[str, Dict[str, Any]], coro): + set_current_gt_collector(_DummyCollector(data)) + try: + return asyncio.run(coro) + finally: + set_current_gt_collector(None) + + +def _make_search_entry( + query: str, sort: Optional[str], works: List[Dict[str, Any]], +) -> Dict[str, Any]: + return { + "query": query, + "sort": sort, + "works": {work["key"]: work for work in works}, + } + + +# ── 1. Template registration ────────────────────────────────────────── + +SEEDS = [1, 42, 100, 999, 12345] + + +@pytest.mark.parametrize("name", [ + "openlibrary_author_engagement_extrema", + "openlibrary_author_comparison", + "openlibrary_reading_stats_filter", +]) +def test_template_registered(name): + templates = get_registered_templates() + assert name in templates, f"template '{name}' not registered" + + +# ── 2. 
Generation invariants ────────────────────────────────────────── + + +@pytest.mark.parametrize("seed", SEEDS) +def test_engagement_extrema_generate(seed): + q = OpenLibraryAuthorEngagementExtremaTemplate().generate(seed) + assert q.question_text + assert "openlibrary.org" in q.start_url + assert q.template_name == "openlibrary_author_engagement_extrema" + assert q.validation_info["extrema"] in {"highest", "lowest"} + assert q.validation_info["metric"] in { + "want_to_read_count", "ratings_count", + } + if q.validation_info["extrema"] == "lowest": + assert q.validation_info["work_count"] in {3, 5, 7} + elif q.validation_info["metric"] == "ratings_count": + assert q.validation_info["work_count"] in {3, 5} + else: + assert q.validation_info["work_count"] in {3, 5, 7, 10, 15, 20, 25} + assert "q=author%3A%22" in q.start_url + assert "sort=editions" in q.start_url + + +@pytest.mark.parametrize("seed", SEEDS) +def test_author_comparison_generate(seed): + q = OpenLibraryAuthorComparisonTemplate().generate(seed) + assert q.question_text + assert "openlibrary.org" in q.start_url + assert q.template_name == "openlibrary_author_comparison" + assert q.validation_info["author_a_name"] != q.validation_info["author_b_name"] + assert q.validation_info["metric"] in { + "want_to_read_count", "ratings_count", + } + assert q.validation_info["work_count"] in {3, 5} + + +@pytest.mark.parametrize("seed", SEEDS) +def test_reading_stats_filter_generate(seed): + q = OpenLibraryReadingStatsFilterTemplate().generate(seed) + assert q.question_text + assert "openlibrary.org" in q.start_url + assert q.template_name == "openlibrary_reading_stats_filter" + assert q.validation_info["metric"] in { + "want_to_read_count", "ratings_count", + } + if q.validation_info["metric"] == "ratings_count": + assert q.validation_info["work_count"] in {5} + else: + assert q.validation_info["work_count"] in {5, 10, 15} + assert isinstance(q.validation_info["threshold"], int) + + +def 
test_author_comparison_distinct_authors_all_seeds(): + tmpl = OpenLibraryAuthorComparisonTemplate() + for seed in range(1, 30): + q = tmpl.generate(seed) + assert q.validation_info["author_a_name"] != q.validation_info["author_b_name"], ( + f"seed={seed}: same author selected twice" + ) + + +def test_author_comparison_position_swap_occurs(): + tmpl = OpenLibraryAuthorComparisonTemplate() + pairs = set() + for seed in range(1, 50): + q = tmpl.generate(seed) + pairs.add((q.validation_info["author_a_name"], q.validation_info["author_b_name"])) + assert len(pairs) > 10, "Position bias: too few unique ordered pairs" + + +def test_extrema_lowest_excludes_ratings_count(): + """ratings_count is excluded from lowest extrema to avoid missing-as-zero bias.""" + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + lowest_metrics = set() + highest_metrics = set() + for seed in range(200): + q = tmpl.generate(seed) + if q.validation_info["extrema"] == "lowest": + lowest_metrics.add(q.validation_info["metric"]) + else: + highest_metrics.add(q.validation_info["metric"]) + assert lowest_metrics == {"want_to_read_count"}, ( + f"lowest should only use want_to_read_count, got {lowest_metrics}" + ) + assert "ratings_count" in highest_metrics, ( + "highest should include ratings_count" + ) + + +# ── 3. 
author_engagement_extrema GT behavior ────────────────────────── + + +def test_extrema_finds_highest_want_to_read(): + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 10000}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "want_to_read_count": 2000}, + {"key": "/works/OL3W", "rank": 3, "title": "Misery", "want_to_read_count": 2500}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Stephen King", "author_query": "stephen king", + "search_query": 'author:"stephen king"', "sort": "editions", + "work_count": 3, "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert result.value == "It" + + +def test_extrema_finds_lowest_want_to_read(): + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:austen": _make_search_entry('author:"jane austen"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "Sense and Sensibility", "want_to_read_count": 50}, + {"key": "/works/OL2W", "rank": 2, "title": "Pride and Prejudice", "want_to_read_count": 500}, + {"key": "/works/OL3W", "rank": 3, "title": "Emma", "want_to_read_count": 200}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Jane Austen", "author_query": "jane austen", + "search_query": 'author:"jane austen"', "sort": "editions", + "work_count": 3, "extrema": "lowest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert result.value == "Sense and Sensibility" + + +def test_extrema_rejects_unsorted_data(): + """GT must require sort=editions; unsorted data should produce not_collected.""" + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:austen": _make_search_entry("jane 
austen", None, [ + {"key": "/works/OL1W", "rank": 1, "title": "Sense and Sensibility", "want_to_read_count": 50}, + {"key": "/works/OL2W", "rank": 2, "title": "Pride and Prejudice", "want_to_read_count": 500}, + {"key": "/works/OL3W", "rank": 3, "title": "Emma", "want_to_read_count": 200}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Jane Austen", "author_query": "jane austen", + "search_query": 'author:"jane austen"', "sort": "editions", + "work_count": 3, "extrema": "lowest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is False + assert result.is_data_not_collected() + + +def test_extrema_tie_breaks_alphabetically(): + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:dickens": _make_search_entry('author:"charles dickens"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "Oliver Twist", "want_to_read_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "David Copperfield", "want_to_read_count": 100}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Charles Dickens", "author_query": "charles dickens", + "search_query": 'author:"charles dickens"', "sort": "editions", + "work_count": 2, "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert result.value == "David Copperfield" # alphabetically earlier + + +def test_extrema_not_collected_wrong_author(): + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:dickens": _make_search_entry('author:"charles dickens"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "X", "want_to_read_count": 100}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Jane Austen", "author_query": "jane austen", + "search_query": 'author:"jane austen"', "sort": "editions", + "work_count": 3, "extrema": "highest", 
"metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is False + assert result.is_data_not_collected() + + +def test_extrema_missing_wtr_treated_as_zero(): + """OL API omits want_to_read_count when the value is zero; GT treats absent as 0.""" + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:dickens": _make_search_entry('author:"charles dickens"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "Oliver Twist", "want_to_read_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "David Copperfield"}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Charles Dickens", "author_query": "charles dickens", + "search_query": 'author:"charles dickens"', "sort": "editions", + "work_count": 2, "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is True + assert result.value == "Oliver Twist" # 100 > 0 (missing wtr treated as 0) + + +def test_extrema_missing_ratings_count_fails_gt(): + """Missing ratings_count should cause GT failure (not default to 0).""" + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:dickens": _make_search_entry('author:"charles dickens"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "Oliver Twist", "ratings_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "David Copperfield"}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Charles Dickens", "author_query": "charles dickens", + "search_query": 'author:"charles dickens"', "sort": "editions", + "work_count": 2, "extrema": "highest", "metric": "ratings_count", + "metric_label": "number of ratings", + })) + assert result.success is False + + +def test_extrema_non_numeric_metric_causes_gt_failure(): + """Non-null non-numeric metric values (e.g. 
'N/A') should cause a GT fail, + not be silently treated as 0 — this signals unexpected data.""" + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + collected = { + "ol:search:dickens": _make_search_entry('author:"charles dickens"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "Oliver Twist", "want_to_read_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "David Copperfield", "want_to_read_count": "N/A"}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_name": "Charles Dickens", "author_query": "charles dickens", + "search_query": 'author:"charles dickens"', "sort": "editions", + "work_count": 2, "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is False + + +def test_extrema_no_collected_data(): + tmpl = OpenLibraryAuthorEngagementExtremaTemplate() + result = _run_gt({}, tmpl.get_ground_truth({ + "author_name": "X", "author_query": "x", + "search_query": 'author:"x"', "sort": "editions", + "work_count": 3, "extrema": "highest", "metric": "want_to_read_count", + "metric_label": "want-to-read count", + })) + assert result.success is False + + +# ── 4. 
author_comparison GT behavior ────────────────────────────────── + + +def test_comparison_returns_absolute_difference(): + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 500}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "want_to_read_count": 200}, + ]), + "ol:search:christie": _make_search_entry('author:"agatha christie"', "editions", [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles", "want_to_read_count": 100}, + {"key": "/works/OL4W", "rank": 2, "title": "Adversary", "want_to_read_count": 50}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 2, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is True + assert result.value == "550" # abs(700 - 150) + + +def test_comparison_difference_is_commutative(): + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 100}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "want_to_read_count": 50}, + ]), + "ol:search:christie": _make_search_entry('author:"agatha christie"', "editions", [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles", "want_to_read_count": 800}, + {"key": "/works/OL4W", "rank": 2, "title": "Adversary", "want_to_read_count": 300}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + 
"author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 2, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is True + assert result.value == "950" # abs(150 - 1100) + + +def test_comparison_equal_totals_yield_zero_difference(): + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 300}, + ]), + "ol:search:christie": _make_search_entry('author:"agatha christie"', "editions", [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles", "want_to_read_count": 300}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 1, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is True + assert result.value == "0" + + +def test_comparison_rejects_unsorted_data(): + """GT must require sort=editions; unsorted data should produce not_collected.""" + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry("stephen king", None, [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 500}, + {"key": "/works/OL2W", "rank": 2, "title": "Carrie", "want_to_read_count": 200}, + ]), + "ol:search:christie": _make_search_entry("agatha christie", None, [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles", "want_to_read_count": 100}, + {"key": "/works/OL4W", "rank": 2, "title": "Adversary", "want_to_read_count": 50}, + ]), + } + result = _run_gt(collected, 
tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 2, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is False + assert result.is_data_not_collected() + + +def test_comparison_not_collected_missing_author(): + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "want_to_read_count": 500}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 1, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is False + assert result.is_data_not_collected() + + +def test_comparison_no_collected_data(): + tmpl = OpenLibraryAuthorComparisonTemplate() + result = _run_gt({}, tmpl.get_ground_truth({ + "author_a_name": "A", "author_a_query": "a", + "search_query_a": 'author:"a"', + "author_b_name": "B", "author_b_query": "b", + "search_query_b": 'author:"b"', + "sort": "editions", "work_count": 1, "metric": "want_to_read_count", + "metric_label": "x", + })) + assert result.success is False + + +def test_comparison_missing_wtr_treated_as_zero(): + """OL API omits want_to_read_count when the value is zero; GT treats absent as 0.""" + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, 
"title": "It", "want_to_read_count": 500}, + ]), + "ol:search:christie": _make_search_entry('author:"agatha christie"', "editions", [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles"}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 1, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is True + assert result.value == "500" # abs(500 - 0) + + +def test_comparison_missing_ratings_count_fails_gt(): + """Missing ratings_count should cause GT failure (not default to 0).""" + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", "ratings_count": 500}, + ]), + "ol:search:christie": _make_search_entry('author:"agatha christie"', "editions", [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles"}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 1, "metric": "ratings_count", + "metric_label": "total number of ratings", + })) + assert result.success is False + + +def test_comparison_non_numeric_metric_causes_gt_failure(): + """Non-null non-numeric metric values should cause a GT fail via safe_metric_value.""" + tmpl = OpenLibraryAuthorComparisonTemplate() + collected = { + "ol:search:king": _make_search_entry('author:"stephen king"', "editions", [ + {"key": "/works/OL1W", "rank": 1, "title": "It", 
"want_to_read_count": "N/A"}, + ]), + "ol:search:christie": _make_search_entry('author:"agatha christie"', "editions", [ + {"key": "/works/OL3W", "rank": 1, "title": "Styles", "want_to_read_count": 100}, + ]), + } + result = _run_gt(collected, tmpl.get_ground_truth({ + "author_a_name": "Stephen King", + "author_a_query": "stephen king", + "search_query_a": 'author:"stephen king"', + "author_b_name": "Agatha Christie", + "author_b_query": "agatha christie", + "search_query_b": 'author:"agatha christie"', + "sort": "editions", "work_count": 1, "metric": "want_to_read_count", + "metric_label": "total want-to-read count", + })) + assert result.success is False diff --git a/tests/plugins/openlibrary/test_new_templates.py b/tests/plugins/openlibrary/test_new_templates.py index bebe1da..b72dafa 100644 --- a/tests/plugins/openlibrary/test_new_templates.py +++ b/tests/plugins/openlibrary/test_new_templates.py @@ -296,7 +296,9 @@ def test_author_editions_matches_punctuated_author_filter(): assert result.value == "1000" -def test_author_editions_rejects_plain_text_query_without_author_filter(): +def test_author_editions_matches_plain_text_query_via_normalization(): + """Plain-text queries like 'mark twain' should match when they normalize + to the same author fragment as the template's author:\"mark twain\" query.""" tmpl = OpenLibraryAuthorEditionsTemplate() collected = { "ol:search:wells": _make_search_entry("mark twain", "editions", [ @@ -309,8 +311,8 @@ def test_author_editions_rejects_plain_text_query_without_author_filter(): "search_query": 'author:"mark twain"', "sort": "editions", "work_count": 2, })) - assert result.success is False - assert result.is_data_not_collected() + assert result.success is True + assert result.value == "1000" def test_author_editions_not_collected_wrong_author():