diff --git a/liveweb_arena/core/task_registry.py b/liveweb_arena/core/task_registry.py index 221f30b..924c887 100644 --- a/liveweb_arena/core/task_registry.py +++ b/liveweb_arena/core/task_registry.py @@ -146,6 +146,9 @@ class TaskRegistry: 86: ("openmeteo", "openmeteo_comparison"), 87: ("openmeteo", "openmeteo_hourly_extrema"), 88: ("openmeteo", "openmeteo_forecast_trend"), + 99: ("openmeteo", "openmeteo_hourly_threshold"), + 100: ("openmeteo", "openmeteo_sunrise_sunset"), + 101: ("openmeteo", "openmeteo_hourly_time_of"), # ArXiv templates 90: ("arxiv", "arxiv_paper_info"), @@ -189,6 +192,8 @@ class TaskRegistry: # Version 7: Open Library engagement & comparison templates (PR #13) # NOTE: PR #14 (openmeteo IDs 99-101) must use Version 8. [96, 97, 98], + # Version 8: Additional Open Meteo templates + [99, 100, 101], ] # Combination registry: list of template ID tuples diff --git a/liveweb_arena/plugins/openmeteo/openmeteo.py b/liveweb_arena/plugins/openmeteo/openmeteo.py index 97c42a0..2281a66 100644 --- a/liveweb_arena/plugins/openmeteo/openmeteo.py +++ b/liveweb_arena/plugins/openmeteo/openmeteo.py @@ -122,14 +122,22 @@ def _build_data_html(data: dict) -> str: t_max = daily.get("temperature_2m_max", []) t_min = daily.get("temperature_2m_min", []) p_max = daily.get("precipitation_probability_max", []) + sr = daily.get("sunrise", []) + ss = daily.get("sunset", []) for i, t in enumerate(times): mx = t_max[i] if i < len(t_max) else "N/A" mn = t_min[i] if i < len(t_min) else "N/A" pp = p_max[i] if i < len(p_max) else "N/A" - rows.append(f"{t}{mx} C{mn} C{pp}%") + sunrise = sr[i] if i < len(sr) else "N/A" + sunset = ss[i] if i < len(ss) else "N/A" + rows.append( + f"{t}{mx} C{mn} C" + f"{pp}%{sunrise}{sunset}" + ) parts.append( "

Daily Forecast

" - "" + "" + "" + "".join(rows) + "
DateMax TempMin TempPrecip Prob
DateMax TempMin TempPrecip ProbSunriseSunset
" ) diff --git a/liveweb_arena/plugins/openmeteo/templates/__init__.py b/liveweb_arena/plugins/openmeteo/templates/__init__.py index 9c3a246..821b732 100644 --- a/liveweb_arena/plugins/openmeteo/templates/__init__.py +++ b/liveweb_arena/plugins/openmeteo/templates/__init__.py @@ -4,10 +4,16 @@ from .comparison import OpenMeteoComparisonTemplate from .hourly_extrema import OpenMeteoHourlyExtremaTemplate from .forecast_trend import OpenMeteoForecastTrendTemplate +from .hourly_threshold import OpenMeteoHourlyThresholdTemplate +from .sunrise_sunset import OpenMeteoSunriseSunsetTemplate +from .hourly_time_of import OpenMeteoHourlyTimeOfTemplate __all__ = [ "OpenMeteoCurrentWeatherTemplate", "OpenMeteoComparisonTemplate", "OpenMeteoHourlyExtremaTemplate", "OpenMeteoForecastTrendTemplate", + "OpenMeteoHourlyThresholdTemplate", + "OpenMeteoSunriseSunsetTemplate", + "OpenMeteoHourlyTimeOfTemplate", ] diff --git a/liveweb_arena/plugins/openmeteo/templates/common.py b/liveweb_arena/plugins/openmeteo/templates/common.py index 9acfbc8..cad4931 100644 --- a/liveweb_arena/plugins/openmeteo/templates/common.py +++ b/liveweb_arena/plugins/openmeteo/templates/common.py @@ -28,13 +28,14 @@ def get_collected_location_data( return data, None -def get_today_hourly_series( +def get_today_hourly_pairs( data: Dict[str, Any], field_name: str, -) -> Tuple[Optional[List[float]], Optional[GroundTruthResult]]: - """Extract today's hourly values for the given field from API data. +) -> Tuple[Optional[List[Tuple[str, float]]], Optional[GroundTruthResult]]: + """Extract today's hourly (time_str, value) pairs for the given field. - Returns (values, None) on success, or (None, failure_result) on error. + Returns a list of (ISO time string, numeric value) tuples for today, + or (None, failure_result) on error. 
""" hourly = data.get("hourly") if not hourly: @@ -70,29 +71,37 @@ def get_today_hourly_series( if not today: today = str(times[0]).split("T", 1)[0] - values: List[float] = [] + pairs: List[Tuple[str, float]] = [] for time_str, val in zip(times, series): if not isinstance(time_str, str) or not time_str.startswith(today): continue if val is None: continue try: - values.append(float(val)) + pairs.append((time_str, float(val))) except (TypeError, ValueError): return None, GroundTruthResult.fail( f"Non-numeric value in hourly {field_name}: {val!r}" ) - if not values: + if not pairs: return None, GroundTruthResult.fail( f"No hourly {field_name} data found for today ({today})" ) - return values, None + return pairs, None -def get_today_hourly_temperatures( +def get_today_hourly_series( data: Dict[str, Any], + field_name: str, ) -> Tuple[Optional[List[float]], Optional[GroundTruthResult]]: - """Extract today's hourly temperatures from a collected API payload.""" - return get_today_hourly_series(data, "temperature_2m") + """Extract today's hourly values for the given field from API data. + + Thin wrapper around get_today_hourly_pairs that discards the timestamps. + Returns (values, None) on success, or (None, failure_result) on error. + """ + pairs, failure = get_today_hourly_pairs(data, field_name) + if failure is not None: + return None, failure + return [val for _, val in pairs], None diff --git a/liveweb_arena/plugins/openmeteo/templates/hourly_threshold.py b/liveweb_arena/plugins/openmeteo/templates/hourly_threshold.py new file mode 100644 index 0000000..2f9215e --- /dev/null +++ b/liveweb_arena/plugins/openmeteo/templates/hourly_threshold.py @@ -0,0 +1,198 @@ +"""Hourly threshold counting template for Open Meteo - MEDIUM DIFFICULTY. + +Asks how many hours today a given metric is above or below a threshold +in a given city. The agent starts on the generic docs page, finds the city, +then counts qualifying hours from the hourly forecast table. 
+ +Dynamic data: hourly forecasts update continuously. +Time-sensitive: asks about "today" which changes daily. +Computation required: agent must count hours, not read a single value. + +SFT defense: +- Threshold includes a seed-derived offset (±2.0 for temp, scaled for others), + so the exact threshold is never a memorizable constant. +- Strict scoring: exact count only for 1.0, off-by-1 for 0.5. + On a 0-24 range, SFT with climate priors may guess close but rarely exact. + +Effective variants: 170 cities x 4 metrics x ~8 base thresholds x continuous offset + x 2 directions → effectively continuous. +""" + +import random +from typing import Any, Dict, Optional + +from liveweb_arena.core.validators.base import ( + QuestionTemplate, GeneratedQuestion, ValidationResult, register_template, +) +from liveweb_arena.core.ground_truth_trigger import ( + UrlPatternTrigger, TriggerConfig, GroundTruthResult, +) +from liveweb_arena.core.gt_collector import GTSourceType + +from .common import DOCS_HOME_URL, get_collected_location_data, get_today_hourly_series +from .variables import CITIES, HourlyMetric, HOURLY_THRESHOLDS + +# Per-metric jitter half-range applied to each base threshold. +# Prevents SFT from memorising fixed threshold→count mappings. 
+_THRESHOLD_JITTER = { + "temperature_2m": 2.0, # ±2 °C + "relative_humidity_2m": 5.0, # ±5 % + "wind_speed_10m": 3.0, # ±3 km/h + "precipitation_probability": 5.0, # ±5 % +} + + +PATTERNS_ABOVE = { + HourlyMetric.TEMPERATURE: [ + "According to Open-Meteo, how many hours today will the temperature in {city} be above {threshold}{unit}?", + "Using Open-Meteo, count the hours today when {city}'s temperature exceeds {threshold}{unit}.", + "On Open-Meteo, for how many hours today is {city}'s temperature forecast above {threshold}{unit}?", + ], + HourlyMetric.HUMIDITY: [ + "According to Open-Meteo, how many hours today will the relative humidity in {city} be above {threshold}{unit}?", + "Using Open-Meteo, count the hours today when {city}'s humidity exceeds {threshold}{unit}.", + ], + HourlyMetric.WIND_SPEED: [ + "According to Open-Meteo, how many hours today will the wind speed in {city} be above {threshold} {unit}?", + "Using Open-Meteo, count the hours today when {city}'s wind speed exceeds {threshold} {unit}.", + ], + HourlyMetric.PRECIP_PROBABILITY: [ + "According to Open-Meteo, how many hours today will the precipitation probability in {city} be above {threshold}{unit}?", + "Using Open-Meteo, count the hours today when {city}'s precipitation probability exceeds {threshold}{unit}.", + ], +} + +PATTERNS_BELOW = { + HourlyMetric.TEMPERATURE: [ + "According to Open-Meteo, how many hours today will the temperature in {city} be below {threshold}{unit}?", + "Using Open-Meteo, count the hours today when {city}'s temperature is below {threshold}{unit}.", + "On Open-Meteo, for how many hours today is {city}'s temperature forecast below {threshold}{unit}?", + ], + HourlyMetric.HUMIDITY: [ + "According to Open-Meteo, how many hours today will the relative humidity in {city} be below {threshold}{unit}?", + "Using Open-Meteo, count the hours today when {city}'s humidity is below {threshold}{unit}.", + ], + HourlyMetric.WIND_SPEED: [ + "According to Open-Meteo, how many hours 
today will the wind speed in {city} be below {threshold} {unit}?", + "Using Open-Meteo, count the hours today when {city}'s wind speed is below {threshold} {unit}.", + ], + HourlyMetric.PRECIP_PROBABILITY: [ + "According to Open-Meteo, how many hours today will the precipitation probability in {city} be below {threshold}{unit}?", + "Using Open-Meteo, count the hours today when {city}'s precipitation probability is below {threshold}{unit}.", + ], +} + + +@register_template("openmeteo_hourly_threshold") +class OpenMeteoHourlyThresholdTemplate(QuestionTemplate): + """ + MEDIUM: Count hours above/below a jittered threshold for a metric today. + + Requires scanning hourly forecast data and counting qualifying entries. + Threshold includes a seed-derived random offset so SFT cannot memorise + fixed threshold-to-count mappings. Scoring is strict: exact = 1.0, + off-by-1 = 0.5, off-by->1 = 0.0. + """ + + GT_SOURCE = GTSourceType.PAGE_ONLY + + def __init__(self): + super().__init__("openmeteo_hourly_threshold") + + def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion: + rng = random.Random(seed) + + metrics = list(HourlyMetric) + metric = metrics[variant % len(metrics)] if variant is not None else rng.choice(metrics) + + base_thresholds = HOURLY_THRESHOLDS[metric.api_field] + base = rng.choice(base_thresholds) + jitter_range = _THRESHOLD_JITTER[metric.api_field] + offset = rng.uniform(-jitter_range, jitter_range) + # Round to 1 decimal so the question reads naturally + threshold = round(base + offset, 1) + + is_above = rng.choice([True, False]) + + city = rng.choice(CITIES) + patterns = PATTERNS_ABOVE[metric] if is_above else PATTERNS_BELOW[metric] + question_text = rng.choice(patterns).format( + city=city.display_name, + threshold=threshold, + unit=metric.unit, + ) + + return GeneratedQuestion( + question_text=question_text, + start_url=DOCS_HOME_URL, + variables={"city": city.name, "metric": metric.name, "threshold": threshold, "is_above": 
is_above}, + validation_info={ + "city_name": city.name, + "coord_key": city.coord_key, + "metric_field": metric.api_field, + "metric_label": metric.display_name, + "unit": metric.unit, + "threshold": threshold, + "is_above": is_above, + }, + template_name=self.name, + expected_steps=7, + ) + + def get_validation_rules(self, validation_info: Dict[str, Any]) -> str: + city = validation_info.get("city_name", "") + label = validation_info.get("metric_label", "hourly temperature") + unit = validation_info.get("unit", "°C") + threshold = validation_info.get("threshold", 0) + is_above = validation_info.get("is_above", True) + direction = "above" if is_above else "below" + return f"""Task-Specific Rules (Open Meteo Hourly Threshold Count): +- City: {city} +- Count hours today where {label} is strictly {direction} {threshold}{unit} +- Answer should be a whole number (0-24) +- Score 1.0: Exact count +- Score 0.5: Off by exactly 1 hour +- Score 0.0: Off by more than 1 hour or no numeric answer +- Use the hourly forecast for today's local date""" + + async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult: + coord_key = validation_info.get("coord_key", "") + city_name = validation_info.get("city_name", "") + metric_field = validation_info.get("metric_field", "temperature_2m") + threshold = validation_info.get("threshold", 0) + is_above = validation_info.get("is_above", True) + + data, failure = get_collected_location_data(coord_key, city_name) + if failure is not None: + return failure + + values, val_failure = get_today_hourly_series(data, metric_field) + if val_failure is not None: + return val_failure + + if is_above: + count = sum(1 for v in values if v > threshold) + else: + count = sum(1 for v in values if v < threshold) + + return GroundTruthResult.ok(str(count)) + + async def validate_answer( + self, answer: str, validation_info: Dict[str, Any] + ) -> ValidationResult: + """Not used — the pipeline uses LLM-based validation via 
get_validation_rules().""" + return ValidationResult( + score=0.0, is_correct=False, expected=None, actual=answer, + details="Use LLM validation", + ) + + def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig: + trigger = UrlPatternTrigger(domains=["open-meteo.com"]) + return TriggerConfig(trigger=trigger) + + @classmethod + def get_cache_source(cls) -> str: + return "openmeteo" + + def get_gt_source(self) -> GTSourceType: + return self.GT_SOURCE diff --git a/liveweb_arena/plugins/openmeteo/templates/hourly_time_of.py b/liveweb_arena/plugins/openmeteo/templates/hourly_time_of.py new file mode 100644 index 0000000..89f9ba7 --- /dev/null +++ b/liveweb_arena/plugins/openmeteo/templates/hourly_time_of.py @@ -0,0 +1,205 @@ +"""Hourly time-of-extremum template for Open Meteo - MEDIUM DIFFICULTY. + +Asks at what time today a city will reach its peak or lowest hourly value +for a given metric. The agent starts on the generic docs page, finds the +city, then scans the hourly forecast to find the argmax/argmin time. + +Dynamic data: hourly forecasts update continuously. +Time-sensitive: asks about "today" which changes daily. +Computation required: agent must find the extremum AND report its time. + +SFT defense: +- Temperature is EXCLUDED because its diurnal cycle is a textbook fixed + pattern (peak ~14:00, min ~05:00) that SFT can memorise. +- Remaining metrics (humidity, wind speed, precipitation probability) have + weather-dependent patterns that vary significantly by day and location. +- Humidity has a weak inverse-temperature pattern (~40% reliable) but the + exact hour varies enough that ±1h accuracy is hard without reading data. + +Effective variants: 170 cities x 3 metrics x 2 (max/min) = 1,020 (>500). 
+""" + +import random +from typing import Any, Dict, List, Optional + +from liveweb_arena.core.validators.base import ( + QuestionTemplate, GeneratedQuestion, ValidationResult, register_template, +) +from liveweb_arena.core.ground_truth_trigger import ( + UrlPatternTrigger, TriggerConfig, GroundTruthResult, +) +from liveweb_arena.core.gt_collector import GTSourceType + +from .common import DOCS_HOME_URL, get_collected_location_data, get_today_hourly_pairs +from .variables import CITIES, HourlyMetric + +# Exclude TEMPERATURE — its diurnal cycle (peak ~14:00, min ~05:00) is a +# fixed pattern that SFT can exploit for easy partial credit. +TIME_OF_METRICS: List[HourlyMetric] = [ + HourlyMetric.HUMIDITY, + HourlyMetric.WIND_SPEED, + HourlyMetric.PRECIP_PROBABILITY, +] + +PATTERNS_MAX = { + HourlyMetric.HUMIDITY: [ + "At what time today will {city} reach its peak hourly relative humidity according to Open-Meteo?", + "Using Open-Meteo, find the hour today when {city}'s humidity is highest.", + "On Open-Meteo, what time today does {city} hit its maximum hourly humidity?", + ], + HourlyMetric.WIND_SPEED: [ + "At what time today will {city} reach its peak hourly wind speed according to Open-Meteo?", + "Using Open-Meteo, find the hour today when {city}'s wind speed is highest.", + "On Open-Meteo, what time today does {city} hit its maximum hourly wind speed?", + ], + HourlyMetric.PRECIP_PROBABILITY: [ + "At what time today will {city} reach its peak hourly precipitation probability according to Open-Meteo?", + "Using Open-Meteo, find the hour today when {city}'s precipitation probability is highest.", + "On Open-Meteo, what time today does {city} hit its maximum precipitation probability?", + ], +} + +PATTERNS_MIN = { + HourlyMetric.HUMIDITY: [ + "At what time today will {city} reach its lowest hourly relative humidity according to Open-Meteo?", + "Using Open-Meteo, find the hour today when {city}'s humidity is lowest.", + "On Open-Meteo, what time today does {city} hit its 
minimum hourly humidity?", + ], + HourlyMetric.WIND_SPEED: [ + "At what time today will {city} reach its lowest hourly wind speed according to Open-Meteo?", + "Using Open-Meteo, find the hour today when {city}'s wind speed is lowest.", + "On Open-Meteo, what time today does {city} hit its minimum hourly wind speed?", + ], + HourlyMetric.PRECIP_PROBABILITY: [ + "At what time today will {city} reach its lowest hourly precipitation probability according to Open-Meteo?", + "Using Open-Meteo, find the hour today when {city}'s precipitation probability is lowest.", + "On Open-Meteo, what time today does {city} hit its minimum hourly precipitation probability?", + ], +} + + +@register_template("openmeteo_hourly_time_of") +class OpenMeteoHourlyTimeOfTemplate(QuestionTemplate): + """ + MEDIUM: Find the time of the hourly peak or low for a metric today. + + Requires scanning hourly forecast data to find argmax/argmin. + Tie-breaking: first (earliest) hour wins. + Temperature excluded (fixed diurnal pattern exploitable by SFT). + 170 cities x 3 metrics x 2 (max/min) = 1,020 effective variants. 
+ """ + + GT_SOURCE = GTSourceType.PAGE_ONLY + + def __init__(self): + super().__init__("openmeteo_hourly_time_of") + + def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion: + rng = random.Random(seed) + + metric = ( + TIME_OF_METRICS[variant % len(TIME_OF_METRICS)] + if variant is not None + else rng.choice(TIME_OF_METRICS) + ) + is_max = rng.choice([True, False]) + + city = rng.choice(CITIES) + patterns = PATTERNS_MAX[metric] if is_max else PATTERNS_MIN[metric] + question_text = rng.choice(patterns).format(city=city.display_name) + + return GeneratedQuestion( + question_text=question_text, + start_url=DOCS_HOME_URL, + variables={"city": city.name, "is_max": is_max, "metric": metric.name}, + validation_info={ + "city_name": city.name, + "coord_key": city.coord_key, + "is_max": is_max, + "metric_field": metric.api_field, + "metric_label": metric.display_name, + "unit": metric.unit, + }, + template_name=self.name, + expected_steps=7, + ) + + def get_validation_rules(self, validation_info: Dict[str, Any]) -> str: + city = validation_info.get("city_name", "") + is_max = validation_info.get("is_max", True) + label = validation_info.get("metric_label", "hourly wind speed") + extrema = "peak (highest)" if is_max else "lowest (minimum)" + return f"""Task-Specific Rules (Open Meteo Hourly Time Of Extremum): +- City: {city} +- Looking for: time of {extrema} {label} today +- Answer should be a time (e.g. 
"14:00", "2 PM", "14h") +- Score 1.0: Exact hour match +- Score 0.5: Within ±1 hour +- Score 0.0: Off by more than 1 hour or no answer +- If multiple hours tie, the earliest hour is correct +- Use the hourly forecast for today's local date""" + + async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult: + coord_key = validation_info.get("coord_key", "") + is_max = validation_info.get("is_max", True) + city_name = validation_info.get("city_name", "") + metric_field = validation_info.get("metric_field", "wind_speed_10m") + + data, failure = get_collected_location_data(coord_key, city_name) + if failure is not None: + return failure + + pairs, pair_failure = get_today_hourly_pairs(data, metric_field) + if pair_failure is not None: + return pair_failure + + # Degenerate case: all values identical (e.g., precip=0 for arid cities). + # argmax/argmin would always return 00:00, which SFT can memorize. + values = [v for _, v in pairs] + if len(set(values)) == 1: + return GroundTruthResult.fail( + f"All {len(values)} hourly {metric_field} values are identical " + f"({values[0]}) — degenerate case, no meaningful extremum" + ) + + # Find argmax/argmin — first occurrence wins ties + if is_max: + best_time, best_val = pairs[0] + for time_str, val in pairs[1:]: + if val > best_val: + best_val = val + best_time = time_str + else: + best_time, best_val = pairs[0] + for time_str, val in pairs[1:]: + if val < best_val: + best_val = val + best_time = time_str + + # Extract time portion: "2026-03-20T14:00" -> "14:00" + if "T" in best_time: + time_part = best_time.split("T", 1)[1] + else: + time_part = best_time + + return GroundTruthResult.ok(time_part) + + async def validate_answer( + self, answer: str, validation_info: Dict[str, Any] + ) -> ValidationResult: + """Not used — the pipeline uses LLM-based validation via get_validation_rules().""" + return ValidationResult( + score=0.0, is_correct=False, expected=None, actual=answer, + details="Use LLM 
validation", + ) + + def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig: + trigger = UrlPatternTrigger(domains=["open-meteo.com"]) + return TriggerConfig(trigger=trigger) + + @classmethod + def get_cache_source(cls) -> str: + return "openmeteo" + + def get_gt_source(self) -> GTSourceType: + return self.GT_SOURCE diff --git a/liveweb_arena/plugins/openmeteo/templates/sunrise_sunset.py b/liveweb_arena/plugins/openmeteo/templates/sunrise_sunset.py new file mode 100644 index 0000000..c6436d6 --- /dev/null +++ b/liveweb_arena/plugins/openmeteo/templates/sunrise_sunset.py @@ -0,0 +1,190 @@ +"""Daylight duration template for Open Meteo - MEDIUM DIFFICULTY. + +Asks how long the daylight period is in a city on a given day. +The agent starts on the generic docs page, finds the city, then reads +BOTH sunrise AND sunset from the daily forecast table and computes the +duration (sunset - sunrise). + +Dynamic data: sunrise/sunset shift by ~1-4 minutes daily. +Computation required: read two time values and compute the difference. +Multi-value: requires both sunrise AND sunset — not a single-value read. + +SFT defense: +- Answer is in "Xh Ym" format with minute-level precision. +- Scoring is tight: ±3 min for 1.0, ±10 min for 0.5. +- An LLM can estimate daylight ≈ f(latitude, date) to ±15-30 min, but + rarely within ±3 min. The API uses its own atmospheric refraction model, + so exact minute differs from astronomical tables. +- Near equinoxes (~12h), SFT's "12h 0m" guess is off by 5-20 min for + most cities (exact duration depends on latitude + refraction). + +Effective variants: 170 cities x 3 days x 3 patterns = 1,530 (>500). 
+""" + +import random +from typing import Any, Dict, Optional + +from liveweb_arena.core.validators.base import ( + QuestionTemplate, GeneratedQuestion, ValidationResult, register_template, +) +from liveweb_arena.core.ground_truth_trigger import ( + UrlPatternTrigger, TriggerConfig, GroundTruthResult, +) +from liveweb_arena.core.gt_collector import GTSourceType + +from .common import DOCS_HOME_URL, get_collected_location_data +from .variables import CITIES + + +DAY_OPTIONS = [ + (0, "today"), + (1, "tomorrow"), + (2, "the day after tomorrow"), +] + +PATTERNS = [ + "According to Open-Meteo, how long is the daylight period in {city} {day_label}? Answer in hours and minutes.", + "Using Open-Meteo, what is the duration from sunrise to sunset in {city} {day_label}?", + "On Open-Meteo, how many hours and minutes of daylight does {city} get {day_label}?", +] + + +@register_template("openmeteo_sunrise_sunset") +class OpenMeteoSunriseSunsetTemplate(QuestionTemplate): + """ + MEDIUM: Compute daylight duration from sunrise and sunset times. + + Requires reading TWO values (sunrise + sunset) from the daily forecast + table and computing the time difference. This is multi-step computation, + not a single-value read — satisfying §4 gate 1 (non-trivial) and + gate 3 (computation required). + + Tight scoring (±3 min for 1.0) prevents SFT from exploiting the + latitude→daylight approximation. + + 170 cities x 3 days x 3 patterns = 1,530 effective variants. 
+ """ + + GT_SOURCE = GTSourceType.PAGE_ONLY + + def __init__(self): + super().__init__("openmeteo_sunrise_sunset") + + def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion: + rng = random.Random(seed) + + city = rng.choice(CITIES) + day_idx, day_label = rng.choice(DAY_OPTIONS) + pattern = rng.choice(PATTERNS) + + question_text = pattern.format( + city=city.display_name, + day_label=day_label, + ) + + return GeneratedQuestion( + question_text=question_text, + start_url=DOCS_HOME_URL, + variables={"city": city.name, "day_idx": day_idx}, + validation_info={ + "city_name": city.name, + "coord_key": city.coord_key, + "day_idx": day_idx, + "day_label": day_label, + }, + template_name=self.name, + expected_steps=7, + ) + + def get_validation_rules(self, validation_info: Dict[str, Any]) -> str: + city = validation_info.get("city_name", "") + day_label = validation_info.get("day_label", "today") + return f"""Task-Specific Rules (Open Meteo Daylight Duration): +- City: {city} +- Day: {day_label} +- Read both sunrise and sunset times, compute the difference +- Answer should be in hours and minutes (e.g. 
"12h 18m", "12 hours 18 minutes") +- Score 1.0: Within ±3 minutes of correct duration +- Score 0.5: Within ±10 minutes +- Score 0.0: Off by more than 10 minutes or no answer +- Use the daily forecast table on Open-Meteo""" + + async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult: + coord_key = validation_info.get("coord_key", "") + city_name = validation_info.get("city_name", "") + day_idx = validation_info.get("day_idx", 0) + + data, failure = get_collected_location_data(coord_key, city_name) + if failure is not None: + return failure + + daily = data.get("daily") + if not daily: + return GroundTruthResult.fail("No daily data in API response") + + sunrise_list = daily.get("sunrise") + sunset_list = daily.get("sunset") + if not sunrise_list or not sunset_list: + return GroundTruthResult.fail("No sunrise/sunset data in daily forecast") + + if len(sunrise_list) <= day_idx or len(sunset_list) <= day_idx: + return GroundTruthResult.fail( + f"Need at least {day_idx + 1} days of sunrise/sunset data" + ) + + sunrise_str = sunrise_list[day_idx] + sunset_str = sunset_list[day_idx] + + # Polar regions may have null sunrise/sunset + if not sunrise_str or not sunset_str: + return GroundTruthResult.fail( + f"Sunrise or sunset is null for {city_name} on day {day_idx} " + "(possible polar day/night)" + ) + + # Parse ISO timestamps: "2026-03-20T06:15" format + try: + sr_parts = str(sunrise_str).split("T", 1) + ss_parts = str(sunset_str).split("T", 1) + sr_time = sr_parts[1] if len(sr_parts) == 2 else sr_parts[0] + ss_time = ss_parts[1] if len(ss_parts) == 2 else ss_parts[0] + + sr_h, sr_m = int(sr_time.split(":")[0]), int(sr_time.split(":")[1]) + ss_h, ss_m = int(ss_time.split(":")[0]), int(ss_time.split(":")[1]) + except (ValueError, IndexError) as e: + return GroundTruthResult.fail( + f"Failed to parse sunrise/sunset times: {sunrise_str}, {sunset_str}: {e}" + ) + + total_sr = sr_h * 60 + sr_m + total_ss = ss_h * 60 + ss_m + duration_min = 
total_ss - total_sr + + if duration_min < 0: + return GroundTruthResult.fail( + f"Sunset before sunrise: {sunrise_str} / {sunset_str}" + ) + + hours = duration_min // 60 + minutes = duration_min % 60 + return GroundTruthResult.ok(f"{hours}h {minutes}m") + + async def validate_answer( + self, answer: str, validation_info: Dict[str, Any] + ) -> ValidationResult: + """Not used — the pipeline uses LLM-based validation via get_validation_rules().""" + return ValidationResult( + score=0.0, is_correct=False, expected=None, actual=answer, + details="Use LLM validation", + ) + + def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig: + trigger = UrlPatternTrigger(domains=["open-meteo.com"]) + return TriggerConfig(trigger=trigger) + + @classmethod + def get_cache_source(cls) -> str: + return "openmeteo" + + def get_gt_source(self) -> GTSourceType: + return self.GT_SOURCE diff --git a/liveweb_arena/plugins/openmeteo/templates/variables.py b/liveweb_arena/plugins/openmeteo/templates/variables.py index aa39e05..72eb52c 100644 --- a/liveweb_arena/plugins/openmeteo/templates/variables.py +++ b/liveweb_arena/plugins/openmeteo/templates/variables.py @@ -287,3 +287,14 @@ def display_name(self) -> str: @property def unit(self) -> str: return self.value[2] + + +# Thresholds for hourly_threshold template, keyed by HourlyMetric.api_field. +# Each list covers the practical range for the metric so that most cities +# will have a non-trivial count (neither 0 nor 24). 
+HOURLY_THRESHOLDS: Dict[str, List[float]] = {
+    "temperature_2m": [-10.0, -5.0, 0.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0],
+    "relative_humidity_2m": [30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0],
+    "wind_speed_10m": [5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 40.0],
+    "precipitation_probability": [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0],
+}
diff --git a/tests/test_openmeteo_integration.py b/tests/test_openmeteo_integration.py
index 47bfa31..688f53e 100644
--- a/tests/test_openmeteo_integration.py
+++ b/tests/test_openmeteo_integration.py
@@ -16,6 +16,9 @@
 from liveweb_arena.plugins.openmeteo.templates.current_weather import OpenMeteoCurrentWeatherTemplate
 from liveweb_arena.plugins.openmeteo.templates.forecast_trend import OpenMeteoForecastTrendTemplate
 from liveweb_arena.plugins.openmeteo.templates.hourly_extrema import OpenMeteoHourlyExtremaTemplate
+from liveweb_arena.plugins.openmeteo.templates.hourly_threshold import OpenMeteoHourlyThresholdTemplate
+from liveweb_arena.plugins.openmeteo.templates.sunrise_sunset import OpenMeteoSunriseSunsetTemplate
+from liveweb_arena.plugins.openmeteo.templates.hourly_time_of import OpenMeteoHourlyTimeOfTemplate
 from liveweb_arena.plugins.openmeteo.templates.variables import CITIES
 
 
@@ -43,6 +46,9 @@ def test_plugin_and_templates_registered():
         "openmeteo_comparison",
         "openmeteo_hourly_extrema",
         "openmeteo_forecast_trend",
+        "openmeteo_hourly_threshold",
+        "openmeteo_sunrise_sunset",
+        "openmeteo_hourly_time_of",
     ]:
         assert name in templates
 
@@ -73,6 +79,9 @@ def test_coordinate_extraction_and_cache_keys():
         (OpenMeteoCurrentWeatherTemplate, {"city_name", "coord_key", "metric_field", "unit"}),
         (OpenMeteoHourlyExtremaTemplate, {"city_name", "coord_key", "is_max"}),
         (OpenMeteoForecastTrendTemplate, {"city_name", "coord_key"}),
+        (OpenMeteoHourlyThresholdTemplate, {"city_name", "coord_key", "threshold", "is_above"}),
+        (OpenMeteoSunriseSunsetTemplate, {"city_name", "coord_key", "day_idx"}),
+        (OpenMeteoHourlyTimeOfTemplate, {"city_name", "coord_key", "is_max"}),
     ],
 )
 def test_interaction_first_templates_start_from_generic_docs(template_cls, expected_fields):
@@ -232,12 +241,16 @@ def test_registry_contains_openmeteo_templates():
         86: ("openmeteo", "openmeteo_comparison"),
         87: ("openmeteo", "openmeteo_hourly_extrema"),
         88: ("openmeteo", "openmeteo_forecast_trend"),
+        99: ("openmeteo", "openmeteo_hourly_threshold"),
+        100: ("openmeteo", "openmeteo_sunrise_sunset"),
+        101: ("openmeteo", "openmeteo_hourly_time_of"),
     }
     for template_id, template_info in expected.items():
         assert TaskRegistry.TEMPLATES[template_id] == template_info
 
     TaskRegistry._ensure_initialized()
     assert (85,) in TaskRegistry._combinations
+    assert (99,) in TaskRegistry._combinations
 
 
 def test_city_docs_urls_are_unique_and_parseable():
@@ -258,3 +271,299 @@ def test_openmeteo_templates_expose_page_only_gt_source():
     assert OpenMeteoComparisonTemplate().get_gt_source() == GTSourceType.PAGE_ONLY
     assert OpenMeteoHourlyExtremaTemplate().get_gt_source() == GTSourceType.PAGE_ONLY
     assert OpenMeteoForecastTrendTemplate().get_gt_source() == GTSourceType.PAGE_ONLY
+    assert OpenMeteoHourlyThresholdTemplate().get_gt_source() == GTSourceType.PAGE_ONLY
+    assert OpenMeteoSunriseSunsetTemplate().get_gt_source() == GTSourceType.PAGE_ONLY
+    assert OpenMeteoHourlyTimeOfTemplate().get_gt_source() == GTSourceType.PAGE_ONLY
+
+
+def test_hourly_threshold_counts_correctly(collector):
+    collector._merge_api_data(
+        "https://open-meteo.com/en/docs?latitude=35.68&longitude=139.65",
+        {
+            "_location_key": "35.68,139.65",
+            "current_weather": {"temperature": 12.5, "time": "2026-03-17T09:00"},
+            "daily": {"time": ["2026-03-17"]},
+            "hourly": {
+                "time": [
+                    "2026-03-17T00:00",
+                    "2026-03-17T06:00",
+                    "2026-03-17T12:00",
+                    "2026-03-17T18:00",
+                ],
+                "temperature_2m": [5.0, 10.0, 20.0, 15.0],
+            },
+        },
+    )
+
+    tmpl = OpenMeteoHourlyThresholdTemplate()
+
+    # Above 10: 20.0, 15.0 → 2
+    result_above = run_async(
+        tmpl.get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "metric_field": "temperature_2m", "threshold": 10.0, "is_above": True,
+        })
+    )
+    assert result_above.success is True
+    assert result_above.value == "2"
+
+    # Below 10: 5.0 → 1
+    result_below = run_async(
+        tmpl.get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "metric_field": "temperature_2m", "threshold": 10.0, "is_above": False,
+        })
+    )
+    assert result_below.success is True
+    assert result_below.value == "1"
+
+
+def test_hourly_threshold_uses_jittered_thresholds():
+    """Verify that different seeds produce different (non-round) thresholds."""
+    tmpl = OpenMeteoHourlyThresholdTemplate()
+    thresholds = set()
+    for seed in range(50):
+        q = tmpl.generate(seed)
+        thresholds.add(q.validation_info["threshold"])
+    # With jitter, we should get many distinct values (not just the base list)
+    assert len(thresholds) > 20
+
+
+def test_daylight_duration_computes_correctly(collector):
+    collector._merge_api_data(
+        "https://open-meteo.com/en/docs?latitude=35.68&longitude=139.65",
+        {
+            "_location_key": "35.68,139.65",
+            "current_weather": {"temperature": 12.5, "time": "2026-03-17T09:00"},
+            "daily": {
+                "time": ["2026-03-17", "2026-03-18", "2026-03-19"],
+                "sunrise": ["2026-03-17T06:03", "2026-03-18T05:58", "2026-03-19T05:56"],
+                "sunset": ["2026-03-17T18:05", "2026-03-18T18:06", "2026-03-19T18:07"],
+            },
+        },
+    )
+
+    tmpl = OpenMeteoSunriseSunsetTemplate()
+
+    # Day 0: 06:03 → 18:05 = 12h 2m
+    result = run_async(
+        tmpl.get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "day_idx": 0, "day_label": "today",
+        })
+    )
+    assert result.success is True
+    assert result.value == "12h 2m"
+
+    # Day 1: 05:58 → 18:06 = 12h 8m
+    result_d1 = run_async(
+        tmpl.get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "day_idx": 1, "day_label": "tomorrow",
+        })
+    )
+    assert result_d1.success is True
+    assert result_d1.value == "12h 8m"
+
+
+def test_daylight_duration_handles_null_polar(collector):
+    collector._merge_api_data(
+        "https://open-meteo.com/en/docs?latitude=68.97&longitude=33.09",
+        {
+            "_location_key": "68.97,33.09",
+            "current_weather": {"temperature": -5.0, "time": "2026-06-21T12:00"},
+            "daily": {
+                "time": ["2026-06-21"],
+                "sunrise": [None],
+                "sunset": [None],
+            },
+        },
+    )
+
+    result = run_async(
+        OpenMeteoSunriseSunsetTemplate().get_ground_truth({
+            "city_name": "Murmansk", "coord_key": "68.97,33.09",
+            "day_idx": 0, "day_label": "today",
+        })
+    )
+    assert result.success is False
+
+
+def test_hourly_time_of_finds_extremum_time(collector):
+    collector._merge_api_data(
+        "https://open-meteo.com/en/docs?latitude=35.68&longitude=139.65",
+        {
+            "_location_key": "35.68,139.65",
+            "current_weather": {"temperature": 12.5, "time": "2026-03-17T09:00"},
+            "daily": {"time": ["2026-03-17"]},
+            "hourly": {
+                "time": [
+                    "2026-03-17T00:00",
+                    "2026-03-17T06:00",
+                    "2026-03-17T12:00",
+                    "2026-03-17T14:00",
+                    "2026-03-17T18:00",
+                ],
+                "wind_speed_10m": [5.0, 12.0, 8.0, 12.0, 3.0],
+            },
+        },
+    )
+
+    tmpl = OpenMeteoHourlyTimeOfTemplate()
+
+    # Max wind = 12.0 at 06:00 (first occurrence wins over 14:00)
+    max_result = run_async(
+        tmpl.get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "is_max": True, "metric_field": "wind_speed_10m",
+        })
+    )
+    assert max_result.success is True
+    assert max_result.value == "06:00"
+
+    # Min wind = 3.0 at 18:00
+    min_result = run_async(
+        tmpl.get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "is_max": False, "metric_field": "wind_speed_10m",
+        })
+    )
+    assert min_result.success is True
+    assert min_result.value == "18:00"
+
+
+def test_hourly_time_of_excludes_temperature():
+    """Template 101 must not generate temperature questions (diurnal cycle is a fixed pattern)."""
+    tmpl = OpenMeteoHourlyTimeOfTemplate()
+    for seed in range(100):
+        q = tmpl.generate(seed)
+        assert q.validation_info["metric_field"] != "temperature_2m", (
+            f"seed {seed} generated temperature question — should be excluded"
+        )
+
+
+def test_hourly_time_of_rejects_degenerate_all_same(collector):
+    """All-zero precip (arid cities) must fail GT, not return 00:00."""
+    collector._merge_api_data(
+        "https://open-meteo.com/en/docs?latitude=33.45&longitude=-112.07",
+        {
+            "_location_key": "33.45,-112.07",
+            "current_weather": {"temperature": 35.0, "time": "2026-03-17T12:00"},
+            "daily": {"time": ["2026-03-17"]},
+            "hourly": {
+                "time": [f"2026-03-17T{h:02d}:00" for h in range(24)],
+                "precipitation_probability": [0] * 24,
+            },
+        },
+    )
+
+    result = run_async(
+        OpenMeteoHourlyTimeOfTemplate().get_ground_truth({
+            "city_name": "Phoenix", "coord_key": "33.45,-112.07",
+            "is_max": True, "metric_field": "precipitation_probability",
+        })
+    )
+    assert result.success is False
+
+
+def test_hourly_threshold_requires_city_visit():
+    result = run_async(
+        OpenMeteoHourlyThresholdTemplate().get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "metric_field": "temperature_2m", "threshold": 20.0, "is_above": True,
+        })
+    )
+    assert result.success is False
+    assert result.is_data_not_collected()
+
+
+def test_gt_with_real_api_data(collector):
+    """Verify GT returns concrete values using real Open-Meteo API data (Tokyo, 2026-03-26)."""
+    # Real API response snapshot — fetched from:
+    #   https://api.open-meteo.com/v1/forecast?latitude=35.68&longitude=139.65
+    #   &current_weather=true&hourly=temperature_2m,relative_humidity_2m,
+    #   wind_speed_10m,precipitation_probability
+    #   &daily=temperature_2m_max,temperature_2m_min,precipitation_probability_max,
+    #   sunrise,sunset&timezone=auto&forecast_days=3
+    real_data = {
+        "_location_key": "35.68,139.65",
+        "current_weather": {"time": "2026-03-26T17:00", "temperature": 11.3, "windspeed": 5.1, "winddirection": 352},
+        "hourly": {
+            "time": [f"2026-03-26T{h:02d}:00" for h in range(24)] + [f"2026-03-27T{h:02d}:00" for h in range(24)],
+            "temperature_2m": [8.4,8.1,8.0,8.6,8.5,8.2,8.5,8.6,8.8,10.1,10.4,10.8,11.7,11.8,11.5,11.2,11.5,11.3,10.8,10.5,10.1,10.0,9.9,9.8,
+                               9.7,9.5,9.3,9.1,8.8,8.6,8.4,8.8,9.7,10.9,12.3,13.5,14.9,15.9,16.3,16.5,16.1,15.3,14.3,12.8,11.8,11.2,10.9,10.7],
+            "relative_humidity_2m": [99,99,99,98,98,98,98,98,98,96,95,94,92,92,91,92,90,90,93,92,95,96,97,96,
+                                     93,93,93,93,94,95,94,91,86,81,76,73,67,61,57,56,59,64,71,83,89,92,92,93],
+            "wind_speed_10m": [2.4,2.4,2.6,3.2,3.2,3.3,4.0,4.7,4.3,4.5,5.4,5.4,4.7,5.9,6.2,6.5,4.3,5.1,4.7,4.7,4.4,4.5,5.2,5.6,
+                               6.2,5.8,5.4,5.1,4.4,4.0,4.1,4.0,4.3,3.6,3.3,3.1,3.8,4.3,4.5,5.1,5.4,4.8,5.6,5.4,4.6,3.9,4.5,3.7],
+            "precipitation_probability": [100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,95,90,85,63,40,20,13,10,
+                                          5,0,3,3,0,0,0,0,0,0,0,0,0,0,0,3,5,20,15,25,23,45,50,53],
+        },
+        "daily": {
+            "time": ["2026-03-26", "2026-03-27", "2026-03-28"],
+            "temperature_2m_max": [11.8, 16.5, 17.8],
+            "temperature_2m_min": [8.0, 8.4, 8.8],
+            "precipitation_probability_max": [100, 53, 63],
+            "sunrise": ["2026-03-26T05:37", "2026-03-27T05:35", "2026-03-28T05:34"],
+            "sunset": ["2026-03-26T17:57", "2026-03-27T17:58", "2026-03-28T17:59"],
+        },
+    }
+
+    collector._merge_api_data(
+        "https://open-meteo.com/en/docs?latitude=35.68&longitude=139.65",
+        real_data,
+    )
+
+    # T99: count hours above 10.0°C today
+    # Today's temps: [8.4,8.1,8.0,8.6,8.5,8.2,8.5,8.6,8.8,10.1,10.4,10.8,11.7,11.8,11.5,11.2,11.5,11.3,10.8,10.5,10.1,10.0,9.9,9.8]
+    # Strictly above 10.0: indices 9-20 minus those <=10.0 → 10.1,10.4,10.8,11.7,11.8,11.5,11.2,11.5,11.3,10.8,10.5,10.1 = 12
+    result_t99 = run_async(
+        OpenMeteoHourlyThresholdTemplate().get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "metric_field": "temperature_2m", "threshold": 10.0, "is_above": True,
+        })
+    )
+    assert result_t99.success is True
+    assert result_t99.value == "12"
+
+    # T100: daylight duration day 0 → sunrise 05:37, sunset 17:57 = 12h 20m
+    result_t100 = run_async(
+        OpenMeteoSunriseSunsetTemplate().get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "day_idx": 0, "day_label": "today",
+        })
+    )
+    assert result_t100.success is True
+    assert result_t100.value == "12h 20m"
+
+    # T101: peak wind speed today
+    # Wind: [2.4,2.4,2.6,3.2,3.2,3.3,4.0,4.7,4.3,4.5,5.4,5.4,4.7,5.9,6.2,6.5,4.3,5.1,4.7,4.7,4.4,4.5,5.2,5.6]
+    # Max = 6.5 at index 15 → 15:00
+    result_t101 = run_async(
+        OpenMeteoHourlyTimeOfTemplate().get_ground_truth({
+            "city_name": "Tokyo", "coord_key": "35.68,139.65",
+            "is_max": True, "metric_field": "wind_speed_10m",
+        })
+    )
+    assert result_t101.success is True
+    assert result_t101.value == "15:00"
+
+
+def test_build_data_html_includes_sunrise_sunset():
+    plugin = OpenMeteoPlugin()
+    html = plugin._build_data_html({
+        "current_weather": {"temperature": 12.5, "windspeed": 5.0, "winddirection": 180},
+        "daily": {
+            "time": ["2026-03-17"],
+            "temperature_2m_max": [16.0],
+            "temperature_2m_min": [9.0],
+            "precipitation_probability_max": [30],
+            "sunrise": ["2026-03-17T06:00"],
+            "sunset": ["2026-03-17T18:05"],
+        },
+        "hourly": {"time": [], "temperature_2m": []},
+    })
+    assert "Sunrise" in html
+    assert "Sunset" in html
+    assert "2026-03-17T06:00" in html
+    assert "2026-03-17T18:05" in html