Skip to content
5 changes: 5 additions & 0 deletions liveweb_arena/core/task_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ class TaskRegistry:
86: ("openmeteo", "openmeteo_comparison"),
87: ("openmeteo", "openmeteo_hourly_extrema"),
88: ("openmeteo", "openmeteo_forecast_trend"),
99: ("openmeteo", "openmeteo_hourly_threshold"),
100: ("openmeteo", "openmeteo_sunrise_sunset"),
101: ("openmeteo", "openmeteo_hourly_time_of"),

# ArXiv templates
90: ("arxiv", "arxiv_paper_info"),
Expand Down Expand Up @@ -189,6 +192,8 @@ class TaskRegistry:
# Version 7: Open Library engagement & comparison templates (PR #13)
# NOTE: openmeteo IDs 99-101 (PR #14) are registered as Version 8 below.
[96, 97, 98],
# Version 8: Additional Open Meteo templates
[99, 100, 101],
]

# Combination registry: list of template ID tuples
Expand Down
12 changes: 10 additions & 2 deletions liveweb_arena/plugins/openmeteo/openmeteo.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,22 @@ def _build_data_html(data: dict) -> str:
t_max = daily.get("temperature_2m_max", [])
t_min = daily.get("temperature_2m_min", [])
p_max = daily.get("precipitation_probability_max", [])
sr = daily.get("sunrise", [])
ss = daily.get("sunset", [])
for i, t in enumerate(times):
mx = t_max[i] if i < len(t_max) else "N/A"
mn = t_min[i] if i < len(t_min) else "N/A"
pp = p_max[i] if i < len(p_max) else "N/A"
rows.append(f"<tr><td>{t}</td><td>{mx} C</td><td>{mn} C</td><td>{pp}%</td></tr>")
sunrise = sr[i] if i < len(sr) else "N/A"
sunset = ss[i] if i < len(ss) else "N/A"
rows.append(
f"<tr><td>{t}</td><td>{mx} C</td><td>{mn} C</td>"
f"<td>{pp}%</td><td>{sunrise}</td><td>{sunset}</td></tr>"
)
parts.append(
"<h2>Daily Forecast</h2><table>"
"<tr><th>Date</th><th>Max Temp</th><th>Min Temp</th><th>Precip Prob</th></tr>"
"<tr><th>Date</th><th>Max Temp</th><th>Min Temp</th>"
"<th>Precip Prob</th><th>Sunrise</th><th>Sunset</th></tr>"
+ "".join(rows) + "</table>"
)

Expand Down
6 changes: 6 additions & 0 deletions liveweb_arena/plugins/openmeteo/templates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@
from .comparison import OpenMeteoComparisonTemplate
from .hourly_extrema import OpenMeteoHourlyExtremaTemplate
from .forecast_trend import OpenMeteoForecastTrendTemplate
from .hourly_threshold import OpenMeteoHourlyThresholdTemplate
from .sunrise_sunset import OpenMeteoSunriseSunsetTemplate
from .hourly_time_of import OpenMeteoHourlyTimeOfTemplate

# Public API of the openmeteo templates package: the template classes
# re-exported from the sibling modules imported above.
__all__ = [
    "OpenMeteoCurrentWeatherTemplate",
    "OpenMeteoComparisonTemplate",
    "OpenMeteoHourlyExtremaTemplate",
    "OpenMeteoForecastTrendTemplate",
    "OpenMeteoHourlyThresholdTemplate",
    "OpenMeteoSunriseSunsetTemplate",
    "OpenMeteoHourlyTimeOfTemplate",
]
31 changes: 20 additions & 11 deletions liveweb_arena/plugins/openmeteo/templates/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ def get_collected_location_data(
return data, None


def get_today_hourly_series(
def get_today_hourly_pairs(
data: Dict[str, Any],
field_name: str,
) -> Tuple[Optional[List[float]], Optional[GroundTruthResult]]:
"""Extract today's hourly values for the given field from API data.
) -> Tuple[Optional[List[Tuple[str, float]]], Optional[GroundTruthResult]]:
"""Extract today's hourly (time_str, value) pairs for the given field.

Returns (values, None) on success, or (None, failure_result) on error.
Returns a list of (ISO time string, numeric value) tuples for today,
or (None, failure_result) on error.
"""
hourly = data.get("hourly")
if not hourly:
Expand Down Expand Up @@ -70,29 +71,37 @@ def get_today_hourly_series(
if not today:
today = str(times[0]).split("T", 1)[0]

values: List[float] = []
pairs: List[Tuple[str, float]] = []
for time_str, val in zip(times, series):
if not isinstance(time_str, str) or not time_str.startswith(today):
continue
if val is None:
continue
try:
values.append(float(val))
pairs.append((time_str, float(val)))
except (TypeError, ValueError):
return None, GroundTruthResult.fail(
f"Non-numeric value in hourly {field_name}: {val!r}"
)

if not values:
if not pairs:
return None, GroundTruthResult.fail(
f"No hourly {field_name} data found for today ({today})"
)

return values, None
return pairs, None


def get_today_hourly_temperatures(
def get_today_hourly_series(
data: Dict[str, Any],
field_name: str,
) -> Tuple[Optional[List[float]], Optional[GroundTruthResult]]:
"""Extract today's hourly temperatures from a collected API payload."""
return get_today_hourly_series(data, "temperature_2m")
"""Extract today's hourly values for the given field from API data.

Thin wrapper around get_today_hourly_pairs that discards the timestamps.
Returns (values, None) on success, or (None, failure_result) on error.
"""
pairs, failure = get_today_hourly_pairs(data, field_name)
if failure is not None:
return None, failure
return [val for _, val in pairs], None
198 changes: 198 additions & 0 deletions liveweb_arena/plugins/openmeteo/templates/hourly_threshold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
"""Hourly threshold counting template for Open Meteo - MEDIUM DIFFICULTY.

Asks how many hours today a given metric is above or below a threshold
in a given city. The agent starts on the generic docs page, finds the city,
then counts qualifying hours from the hourly forecast table.

Dynamic data: hourly forecasts update continuously.
Time-sensitive: asks about "today" which changes daily.
Computation required: agent must count hours, not read a single value.

SFT defense:
- Threshold includes a seed-derived offset (±2.0 for temp, scaled for others),
so the exact threshold is never a memorizable constant.
- Strict scoring: exact count only for 1.0, off-by-1 for 0.5.
On a 0-24 range, SFT with climate priors may guess close but rarely exact.

Effective variants: 170 cities x 4 metrics x ~8 base thresholds x jitter offset
(the jittered threshold is rounded to 0.1) x 2 directions → a very large question space.
"""

import random
from typing import Any, Dict, Optional

from liveweb_arena.core.validators.base import (
QuestionTemplate, GeneratedQuestion, ValidationResult, register_template,
)
from liveweb_arena.core.ground_truth_trigger import (
UrlPatternTrigger, TriggerConfig, GroundTruthResult,
)
from liveweb_arena.core.gt_collector import GTSourceType

from .common import DOCS_HOME_URL, get_collected_location_data, get_today_hourly_series
from .variables import CITIES, HourlyMetric, HOURLY_THRESHOLDS

# Per-metric jitter half-range applied to each base threshold.
# Prevents SFT from memorising fixed threshold→count mappings.
_THRESHOLD_JITTER = {
"temperature_2m": 2.0, # ±2 °C
"relative_humidity_2m": 5.0, # ±5 %
"wind_speed_10m": 3.0, # ±3 km/h
"precipitation_probability": 5.0, # ±5 %
}


# Question phrasings for the "above threshold" direction, keyed by metric.
# Each string is a str.format template with {city}, {threshold} and {unit}
# placeholders. Wind-speed phrasings put a space before the unit; the other
# metrics attach the unit directly to the number.
PATTERNS_ABOVE = {
    HourlyMetric.TEMPERATURE: [
        (
            "According to Open-Meteo, how many hours today will the temperature "
            "in {city} be above {threshold}{unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s temperature "
            "exceeds {threshold}{unit}."
        ),
        (
            "On Open-Meteo, for how many hours today is {city}'s temperature "
            "forecast above {threshold}{unit}?"
        ),
    ],
    HourlyMetric.HUMIDITY: [
        (
            "According to Open-Meteo, how many hours today will the relative humidity "
            "in {city} be above {threshold}{unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s humidity "
            "exceeds {threshold}{unit}."
        ),
    ],
    HourlyMetric.WIND_SPEED: [
        (
            "According to Open-Meteo, how many hours today will the wind speed "
            "in {city} be above {threshold} {unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s wind speed "
            "exceeds {threshold} {unit}."
        ),
    ],
    HourlyMetric.PRECIP_PROBABILITY: [
        (
            "According to Open-Meteo, how many hours today will the precipitation "
            "probability in {city} be above {threshold}{unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s precipitation "
            "probability exceeds {threshold}{unit}."
        ),
    ],
}

# Question phrasings for the "below threshold" direction, keyed by metric.
# Each string is a str.format template with {city}, {threshold} and {unit}
# placeholders. Wind-speed phrasings put a space before the unit; the other
# metrics attach the unit directly to the number.
PATTERNS_BELOW = {
    HourlyMetric.TEMPERATURE: [
        (
            "According to Open-Meteo, how many hours today will the temperature "
            "in {city} be below {threshold}{unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s temperature "
            "is below {threshold}{unit}."
        ),
        (
            "On Open-Meteo, for how many hours today is {city}'s temperature "
            "forecast below {threshold}{unit}?"
        ),
    ],
    HourlyMetric.HUMIDITY: [
        (
            "According to Open-Meteo, how many hours today will the relative humidity "
            "in {city} be below {threshold}{unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s humidity "
            "is below {threshold}{unit}."
        ),
    ],
    HourlyMetric.WIND_SPEED: [
        (
            "According to Open-Meteo, how many hours today will the wind speed "
            "in {city} be below {threshold} {unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s wind speed "
            "is below {threshold} {unit}."
        ),
    ],
    HourlyMetric.PRECIP_PROBABILITY: [
        (
            "According to Open-Meteo, how many hours today will the precipitation "
            "probability in {city} be below {threshold}{unit}?"
        ),
        (
            "Using Open-Meteo, count the hours today when {city}'s precipitation "
            "probability is below {threshold}{unit}."
        ),
    ],
}


@register_template("openmeteo_hourly_threshold")
class OpenMeteoHourlyThresholdTemplate(QuestionTemplate):
    """MEDIUM: count today's hours above/below a jittered threshold.

    The question names a city, an hourly metric, a direction (above/below)
    and a threshold. The threshold is a base value plus a seed-derived random
    offset, so fixed threshold-to-count mappings cannot be memorised.

    Scoring, as stated in the validation rules: exact count = 1.0,
    off by exactly 1 = 0.5, off by more than 1 = 0.0.
    """

    GT_SOURCE = GTSourceType.PAGE_ONLY

    def __init__(self):
        """Register the template under its fixed name."""
        super().__init__("openmeteo_hourly_threshold")

    def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
        """Build one question deterministically from ``seed``.

        Args:
            seed: Seeds the private RNG that drives every random choice.
            variant: When given, selects the metric by index (modulo the
                number of metrics) instead of drawing it from the RNG.

        Returns:
            The generated question, starting at the docs home page, with the
            chosen city, metric, threshold and direction in validation_info.
        """
        rng = random.Random(seed)

        # NOTE: the order of RNG draws below determines the question produced
        # for a given seed; do not reorder them.
        all_metrics = list(HourlyMetric)
        if variant is None:
            metric = rng.choice(all_metrics)
        else:
            metric = all_metrics[variant % len(all_metrics)]

        api_field = metric.api_field
        base_value = rng.choice(HOURLY_THRESHOLDS[api_field])
        half_range = _THRESHOLD_JITTER[api_field]
        # One decimal keeps the threshold readable in the question text.
        threshold = round(base_value + rng.uniform(-half_range, half_range), 1)

        is_above = rng.choice([True, False])
        city = rng.choice(CITIES)

        pattern_table = PATTERNS_ABOVE if is_above else PATTERNS_BELOW
        chosen_pattern = rng.choice(pattern_table[metric])
        question_text = chosen_pattern.format(
            city=city.display_name,
            threshold=threshold,
            unit=metric.unit,
        )

        variables = {
            "city": city.name,
            "metric": metric.name,
            "threshold": threshold,
            "is_above": is_above,
        }
        validation_info = {
            "city_name": city.name,
            "coord_key": city.coord_key,
            "metric_field": api_field,
            "metric_label": metric.display_name,
            "unit": metric.unit,
            "threshold": threshold,
            "is_above": is_above,
        }
        return GeneratedQuestion(
            question_text=question_text,
            start_url=DOCS_HOME_URL,
            variables=variables,
            validation_info=validation_info,
            template_name=self.name,
            expected_steps=7,
        )

    def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
        """Render the task-specific scoring rules for this question.

        Missing keys fall back to defaults: empty city, "hourly temperature",
        "°C", threshold 0 and the "above" direction.
        """
        city = validation_info.get("city_name", "")
        label = validation_info.get("metric_label", "hourly temperature")
        unit = validation_info.get("unit", "°C")
        threshold = validation_info.get("threshold", 0)
        if validation_info.get("is_above", True):
            direction = "above"
        else:
            direction = "below"
        return f"""Task-Specific Rules (Open Meteo Hourly Threshold Count):
- City: {city}
- Count hours today where {label} is strictly {direction} {threshold}{unit}
- Answer should be a whole number (0-24)
- Score 1.0: Exact count
- Score 0.5: Off by exactly 1 hour
- Score 0.0: Off by more than 1 hour or no numeric answer
- Use the hourly forecast for today's local date"""

    async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
        """Count today's hourly values strictly above/below the threshold.

        Returns the count as a string in an ok result. If the location data
        or the hourly series cannot be obtained, the helper's failure result
        is returned unchanged.
        """
        data, data_failure = get_collected_location_data(
            validation_info.get("coord_key", ""),
            validation_info.get("city_name", ""),
        )
        if data_failure is not None:
            return data_failure

        metric_field = validation_info.get("metric_field", "temperature_2m")
        values, series_failure = get_today_hourly_series(data, metric_field)
        if series_failure is not None:
            return series_failure

        threshold = validation_info.get("threshold", 0)
        # Comparisons are strict: a value equal to the threshold is not counted.
        if validation_info.get("is_above", True):
            qualifying = [v for v in values if v > threshold]
        else:
            qualifying = [v for v in values if v < threshold]

        return GroundTruthResult.ok(str(len(qualifying)))

    async def validate_answer(
        self, answer: str, validation_info: Dict[str, Any]
    ) -> ValidationResult:
        """Placeholder; the pipeline uses LLM-based validation via get_validation_rules().

        Always returns a zero score that echoes ``answer`` back.
        """
        return ValidationResult(
            score=0.0,
            is_correct=False,
            expected=None,
            actual=answer,
            details="Use LLM validation",
        )

    def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
        """Return a trigger config matching URLs on the open-meteo.com domain."""
        return TriggerConfig(trigger=UrlPatternTrigger(domains=["open-meteo.com"]))

    @classmethod
    def get_cache_source(cls) -> str:
        """Return the cache source identifier for this template."""
        return "openmeteo"

    def get_gt_source(self) -> GTSourceType:
        """Return the ground-truth source type declared on the class."""
        return self.GT_SOURCE
Loading