Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions liveweb_arena/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

__version__ = "0.1.0"

# Core components
# Core components that do not require optional browser runtime deps.
from .core.models import BrowserObservation, BrowserAction, CompositeTask, TrajectoryStep
from .core.browser import BrowserEngine, BrowserSession
from .plugins.base import BasePlugin, SubTask, ValidationResult

__all__ = [
Expand All @@ -22,3 +21,12 @@
"SubTask",
"ValidationResult",
]


def __getattr__(name: str):
    """Resolve the browser classes on first access (module-level hook).

    Keeps the browser module out of the package's import path so base
    imports work without Playwright; it is only imported when one of the
    two browser names is actually requested.

    Args:
        name: Attribute name that was not found on the module.

    Returns:
        ``BrowserEngine`` or ``BrowserSession``, matching ``name``.

    Raises:
        AttributeError: For any other attribute name.
    """
    # Guard first: unknown names must fail without touching the browser module.
    if name not in ("BrowserEngine", "BrowserSession"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred import so the optional runtime dependency is only required here.
    from .core.browser import BrowserEngine, BrowserSession

    return BrowserEngine if name == "BrowserEngine" else BrowserSession
8 changes: 8 additions & 0 deletions liveweb_arena/core/task_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,12 @@ class TaskRegistry:
86: ("openmeteo", "openmeteo_comparison"),
87: ("openmeteo", "openmeteo_hourly_extrema"),
88: ("openmeteo", "openmeteo_forecast_trend"),
96: ("openmeteo", "openmeteo_daily_range"),
97: ("openmeteo", "openmeteo_precip_window_count"),
98: ("openmeteo", "openmeteo_humidity_band_hours"),
99: ("openmeteo", "openmeteo_wind_shift"),
100: ("openmeteo", "openmeteo_city_pair_forecast_gap"),
101: ("openmeteo", "openmeteo_comfort_index"),

# ArXiv templates
90: ("arxiv", "arxiv_paper_info"),
Expand Down Expand Up @@ -181,6 +187,8 @@ class TaskRegistry:
[85, 86, 87, 88],
# Version 6: ArXiv templates
[90, 91, 92, 94, 95],
# Version 7: Additional Open Meteo templates
[96, 97, 98, 99, 100, 101],
]

# Combination registry: list of template ID tuples
Expand Down
14 changes: 13 additions & 1 deletion liveweb_arena/plugins/openmeteo/templates/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
"""Open Meteo question templates"""
"""Open Meteo question templates."""

from .current_weather import OpenMeteoCurrentWeatherTemplate
from .comparison import OpenMeteoComparisonTemplate
from .hourly_extrema import OpenMeteoHourlyExtremaTemplate
from .forecast_trend import OpenMeteoForecastTrendTemplate
from .daily_range import OpenMeteoDailyRangeTemplate
from .precip_window_count import OpenMeteoPrecipWindowCountTemplate
from .humidity_band_hours import OpenMeteoHumidityBandHoursTemplate
from .wind_shift import OpenMeteoWindShiftTemplate
from .city_pair_forecast_gap import OpenMeteoCityPairForecastGapTemplate
from .comfort_index import OpenMeteoComfortIndexTemplate

# Public API of the templates package: one template class per Open Meteo
# question module imported above.
__all__ = [
    "OpenMeteoCurrentWeatherTemplate",
    "OpenMeteoComparisonTemplate",
    "OpenMeteoHourlyExtremaTemplate",
    "OpenMeteoForecastTrendTemplate",
    "OpenMeteoDailyRangeTemplate",
    "OpenMeteoPrecipWindowCountTemplate",
    "OpenMeteoHumidityBandHoursTemplate",
    "OpenMeteoWindShiftTemplate",
    "OpenMeteoCityPairForecastGapTemplate",
    "OpenMeteoComfortIndexTemplate",
]
136 changes: 136 additions & 0 deletions liveweb_arena/plugins/openmeteo/templates/city_pair_forecast_gap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""Two-city daily forecast comparison template - HARD difficulty."""

import random
from typing import Any, Dict, Optional

from liveweb_arena.core.ground_truth_trigger import (
GroundTruthResult,
TriggerConfig,
UrlPatternTrigger,
)
from liveweb_arena.core.gt_collector import GTSourceType
from liveweb_arena.core.validators.base import (
GeneratedQuestion,
QuestionTemplate,
ValidationResult,
register_template,
)

from .common import DOCS_HOME_URL, get_collected_location_data, get_daily_value
from .variables import CITIES, DailyMetric

# (day offset, human-readable label) pairs; the offset is used as the index
# into the daily forecast and the label is substituted into the question text.
DAY_CHOICES = list(enumerate(("today", "tomorrow", "the day after tomorrow")))

# Question phrasings. Every variant asks for the signed value city1 - city2 and
# uses the placeholders {metric_label}, {day_label}, {city1} and {city2}.
PATTERNS = [
    (
        "Using Open-Meteo, what is the signed difference in {metric_label} "
        "for {day_label} between {city1} and {city2} "
        "(answer as {city1} minus {city2})?"
    ),
    (
        "On Open-Meteo, compare {day_label}'s {metric_label} in {city1} vs {city2}. "
        "Report {city1} - {city2} with unit."
    ),
    (
        "According to Open-Meteo forecast data, by how much is {city1}'s "
        "{day_label} {metric_label} above or below {city2}'s? "
        "(signed {city1} - {city2})"
    ),
]


@register_template("openmeteo_city_pair_forecast_gap")
class OpenMeteoCityPairForecastGapTemplate(QuestionTemplate):
    """Compare a same-day daily metric across two cities (signed city1 - city2).

    ``generate`` picks two distinct cities, one daily metric and one forecast
    day; ``get_ground_truth`` looks both values up through the helpers from
    ``.common`` and reports their difference with the metric's unit appended.
    """

    GT_SOURCE = GTSourceType.PAGE_ONLY

    def __init__(self):
        super().__init__("openmeteo_city_pair_forecast_gap")

    def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
        """Build a question deterministically from ``seed``.

        Args:
            seed: Seed for the private ``random.Random`` instance; the same
                seed always yields the same cities, metric, day and phrasing.
            variant: Accepted for interface compatibility; not used here.

        Returns:
            The generated question, with everything ``get_ground_truth`` and
            ``get_validation_rules`` read stored in ``validation_info``.
        """
        rng = random.Random(seed)
        # NOTE: the order of these draws determines what a given seed produces;
        # reordering them would change existing seed -> question mappings.
        city1, city2 = rng.sample(CITIES, 2)
        metric = rng.choice(list(DailyMetric))
        day_idx, day_label = rng.choice(DAY_CHOICES)
        pattern = rng.choice(PATTERNS)

        return GeneratedQuestion(
            question_text=pattern.format(
                metric_label=metric.display_name,
                day_label=day_label,
                city1=city1.display_name,
                city2=city2.display_name,
            ),
            start_url=DOCS_HOME_URL,
            variables={
                "city1": city1.name,
                "city2": city2.name,
                "metric": metric.name,
                "day_idx": day_idx,
            },
            validation_info={
                "city1_name": city1.name,
                "city1_coord_key": city1.coord_key,
                "city2_name": city2.name,
                "city2_coord_key": city2.coord_key,
                "metric_field": metric.api_field,
                "metric_label": metric.display_name,
                "unit": metric.unit,
                "day_idx": day_idx,
                "day_label": day_label,
            },
            template_name=self.name,
            expected_steps=10,
        )

    def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
        """Return the scoring rubric text for this question.

        Args:
            validation_info: The ``validation_info`` produced by ``generate``;
                missing keys fall back to empty or placeholder text.

        Returns:
            A multi-line rules string naming the day, metric, subtraction
            order and the score thresholds.
        """
        return (
            "Task-Specific Rules (Open Meteo City Pair Forecast Gap):\n"
            f"- Day: {validation_info.get('day_label', '')}\n"
            f"- Metric: {validation_info.get('metric_label', '')}\n"
            f"- Signed difference must be {validation_info.get('city1_name', 'city1')} - {validation_info.get('city2_name', 'city2')}\n"
            "- Score 1.0: signed value within ±1.0 unit\n"
            "- Score 0.5: absolute magnitude close but sign wrong OR error <=3.0 units\n"
            "- Score 0.0: otherwise"
        )

    async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
        """Compute ``city1 - city2`` for the chosen metric and day.

        Args:
            validation_info: The ``validation_info`` produced by ``generate``.

        Returns:
            ``GroundTruthResult.ok`` with the difference formatted to one
            decimal place followed by the unit, or the first failure result
            handed back by ``get_collected_location_data`` / ``get_daily_value``.
        """
        day_idx = int(validation_info.get("day_idx", 0))
        metric_field = validation_info.get("metric_field", "temperature_2m_max")
        unit = validation_info.get("unit", "")

        # Each helper returns a (value, failure) pair; stop at the first failure.
        city1_data, failure = get_collected_location_data(
            validation_info.get("city1_coord_key", ""),
            validation_info.get("city1_name", ""),
        )
        if failure is not None:
            return failure
        city2_data, failure = get_collected_location_data(
            validation_info.get("city2_coord_key", ""),
            validation_info.get("city2_name", ""),
        )
        if failure is not None:
            return failure

        value1, failure = get_daily_value(city1_data, metric_field, day_idx)
        if failure is not None:
            return failure
        value2, failure = get_daily_value(city2_data, metric_field, day_idx)
        if failure is not None:
            return failure

        diff_text = f"{value1 - value2:.1f}"
        # A tiny negative difference rounds to "-0.0"; report plain zero so the
        # ground truth does not imply a sign for an effectively equal pair.
        if diff_text == "-0.0":
            diff_text = "0.0"
        return GroundTruthResult.ok(f"{diff_text}{unit}")

    async def validate_answer(self, answer: str, validation_info: Dict[str, Any]) -> ValidationResult:
        """Return a zero-score placeholder result.

        No scoring happens here: the result always has ``score=0.0`` and
        details ``"Use LLM validation"``.
        NOTE(review): presumably an LLM judge applies ``get_validation_rules``
        instead - confirm against the caller.
        """
        return ValidationResult(
            score=0.0,
            is_correct=False,
            expected=None,
            actual=answer,
            details="Use LLM validation",
        )

    def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
        """Return a URL-pattern trigger for the ``open-meteo.com`` domain."""
        return TriggerConfig(trigger=UrlPatternTrigger(domains=["open-meteo.com"]))

    @classmethod
    def get_cache_source(cls) -> str:
        """Return the cache source identifier, ``"openmeteo"``."""
        return "openmeteo"

    def get_gt_source(self) -> GTSourceType:
        """Return the ground-truth source type declared in ``GT_SOURCE``."""
        return self.GT_SOURCE
133 changes: 133 additions & 0 deletions liveweb_arena/plugins/openmeteo/templates/comfort_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Comfort index template for Open Meteo - HARD difficulty."""

import random
from typing import Any, Dict, Optional

from liveweb_arena.core.ground_truth_trigger import (
GroundTruthResult,
TriggerConfig,
UrlPatternTrigger,
)
from liveweb_arena.core.gt_collector import GTSourceType
from liveweb_arena.core.validators.base import (
GeneratedQuestion,
QuestionTemplate,
ValidationResult,
register_template,
)

from .common import DOCS_HOME_URL, get_collected_location_data
from .variables import CITIES

# Question phrasings for the comfort-index task; {city} is the only placeholder.
PATTERNS = [
    (
        "For {city} on Open-Meteo, compute the comfort index: "
        "temperature_2m - 0.2 * wind_speed_10m - 0.05 * relative_humidity_2m. "
        "What is the value?"
    ),
    (
        "Using Open-Meteo current weather for {city}, calculate CI = T - 0.2W - 0.05H, "
        "where T is temperature (°C), W is wind speed (km/h), H is humidity (%)."
    ),
    (
        "According to Open-Meteo, what is {city}'s comfort index defined as "
        "T - 0.2W - 0.05H from current weather values?"
    ),
]


@register_template("openmeteo_comfort_index")
class OpenMeteoComfortIndexTemplate(QuestionTemplate):
    """Compute a deterministic index from three current-weather fields.

    The index is ``CI = T - 0.2 * W - 0.05 * H``. Temperature and wind speed
    are read from ``current_weather``; humidity is read from the hourly
    ``relative_humidity_2m`` series at the slot closest to the current time.
    """

    GT_SOURCE = GTSourceType.PAGE_ONLY

    def __init__(self):
        super().__init__("openmeteo_comfort_index")

    def generate(self, seed: int, variant: Optional[int] = None) -> GeneratedQuestion:
        """Build a question deterministically from ``seed``.

        Args:
            seed: Seed for the private ``random.Random`` instance; the same
                seed always yields the same city and phrasing.
            variant: Accepted for interface compatibility; not used here.

        Returns:
            The generated question, with the city name and coordinate key
            stored in ``validation_info`` for ``get_ground_truth``.
        """
        rng = random.Random(seed)
        # NOTE: draw order fixes what a given seed produces; keep city first.
        city = rng.choice(CITIES)
        pattern = rng.choice(PATTERNS)

        return GeneratedQuestion(
            question_text=pattern.format(city=city.display_name),
            start_url=DOCS_HOME_URL,
            variables={"city": city.name},
            validation_info={
                "city_name": city.name,
                "coord_key": city.coord_key,
                "formula": "T - 0.2W - 0.05H",
                "unit": "index-points",
            },
            template_name=self.name,
            expected_steps=8,
        )

    def get_validation_rules(self, validation_info: Dict[str, Any]) -> str:
        """Return the scoring rubric text for this question.

        Args:
            validation_info: The ``validation_info`` produced by ``generate``;
                a missing city name falls back to an empty string.

        Returns:
            A multi-line rules string naming the city, the formula and the
            score thresholds.
        """
        return (
            "Task-Specific Rules (Open Meteo Comfort Index):\n"
            f"- City: {validation_info.get('city_name', '')}\n"
            "- Formula: CI = temperature - 0.2*wind_speed - 0.05*humidity\n"
            "- Use current_weather values from Open-Meteo\n"
            "- Score 1.0: within ±0.8 index-points\n"
            "- Score 0.5: within ±2.0 index-points\n"
            "- Score 0.0: otherwise"
        )

    @staticmethod
    def _nearest_hourly_value(current_time: Any, times: list, values: list) -> Any:
        """Pick the non-null hourly value whose timestamp is closest to ``current_time``.

        Args:
            current_time: ISO-8601 timestamp string of the current reading.
            times: Hourly timestamp strings, parallel to ``values``.
            values: Hourly readings; ``None`` entries are ignored.

        Returns:
            The selected reading, or ``None`` when ``current_time`` is not a
            parseable string or no hourly entry is usable. Ties go to the
            earlier entry.
        """
        from datetime import datetime

        if not isinstance(current_time, str):
            return None

        # Fast path: the current timestamp is itself one of the hourly slots.
        if current_time in times:
            exact = values[times.index(current_time)]
            if exact is not None:
                return exact

        try:
            target = datetime.fromisoformat(current_time)
        except ValueError:
            return None

        best_value = None
        best_gap = None
        for stamp, value in zip(times, values):
            if value is None or not isinstance(stamp, str):
                continue
            try:
                gap = abs((datetime.fromisoformat(stamp) - target).total_seconds())
            except (TypeError, ValueError):
                # Unparseable stamp, or naive/aware mismatch: skip this slot.
                continue
            if best_gap is None or gap < best_gap:
                best_gap, best_value = gap, value
        return best_value

    async def get_ground_truth(self, validation_info: Dict[str, Any]) -> GroundTruthResult:
        """Compute the comfort index for the chosen city.

        Args:
            validation_info: The ``validation_info`` produced by ``generate``.

        Returns:
            ``GroundTruthResult.ok`` with the index formatted to two decimal
            places, the failure handed back by ``get_collected_location_data``,
            or ``GroundTruthResult.fail`` when a required field is missing or
            non-numeric.
        """
        data, failure = get_collected_location_data(
            validation_info.get("coord_key", ""),
            validation_info.get("city_name", ""),
        )
        if failure is not None:
            return failure

        current = data.get("current_weather")
        hourly = data.get("hourly")
        if not isinstance(current, dict):
            return GroundTruthResult.fail("No current_weather in API response")
        if not isinstance(hourly, dict):
            return GroundTruthResult.fail("No hourly data in API response")

        temp_raw = current.get("temperature")
        wind_raw = current.get("windspeed")
        if temp_raw is None or wind_raw is None:
            return GroundTruthResult.fail("Missing temperature/windspeed in current_weather")
        try:
            temp = float(temp_raw)
            wind = float(wind_raw)
        except (TypeError, ValueError):
            return GroundTruthResult.fail("Non-numeric temperature/windspeed")

        # Humidity may not be in current_weather; use hourly value nearest current time.
        times = hourly.get("time")
        humidity = hourly.get("relative_humidity_2m")
        if not isinstance(times, list) or not isinstance(humidity, list) or len(times) != len(humidity):
            return GroundTruthResult.fail("Invalid hourly humidity arrays")

        # The current timestamp need not coincide with an hourly slot, so an
        # exact-match-only lookup would silently drop to the first hour of the
        # series; pick the closest slot instead.
        humidity_value = self._nearest_hourly_value(current.get("time"), times, humidity)
        if humidity_value is None and humidity:
            # Last resort when the current time is missing or unparseable.
            humidity_value = humidity[0]

        try:
            hum = float(humidity_value)
        except (TypeError, ValueError):
            return GroundTruthResult.fail("Non-numeric humidity value")

        ci_text = f"{temp - 0.2 * wind - 0.05 * hum:.2f}"
        # Avoid reporting a signed zero for values that round to nothing.
        if ci_text == "-0.00":
            ci_text = "0.00"
        return GroundTruthResult.ok(ci_text)

    async def validate_answer(self, answer: str, validation_info: Dict[str, Any]) -> ValidationResult:
        """Return a zero-score placeholder result.

        No scoring happens here: the result always has ``score=0.0`` and
        details ``"Use LLM validation"``.
        NOTE(review): presumably an LLM judge applies ``get_validation_rules``
        instead - confirm against the caller.
        """
        return ValidationResult(
            score=0.0,
            is_correct=False,
            expected=None,
            actual=answer,
            details="Use LLM validation",
        )

    def get_ground_truth_trigger(self, validation_info: dict) -> TriggerConfig:
        """Return a URL-pattern trigger for the ``open-meteo.com`` domain."""
        return TriggerConfig(trigger=UrlPatternTrigger(domains=["open-meteo.com"]))

    @classmethod
    def get_cache_source(cls) -> str:
        """Return the cache source identifier, ``"openmeteo"``."""
        return "openmeteo"

    def get_gt_source(self) -> GTSourceType:
        """Return the ground-truth source type declared in ``GT_SOURCE``."""
        return self.GT_SOURCE
Loading