-
Notifications
You must be signed in to change notification settings - Fork 6
feat: Implement ingested_forecast_length utility and integrate with GFS (#412) #421
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6332ae1
6d5f762
e79f998
6f0270c
75f169c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| from collections.abc import Mapping, Sequence | ||
| from typing import Protocol | ||
|
|
||
| import xarray as xr | ||
|
|
||
| from reformatters.common.logging import get_logger | ||
| from reformatters.common.types import Timedelta, Timestamp | ||
|
|
||
| log = get_logger(__name__) | ||
|
|
||
|
|
||
class DeterministicForecastSourceFileCoord(Protocol):
    """Structural type for a deterministic-forecast source file coordinate.

    Any object exposing an ``init_time`` and a ``lead_time`` satisfies this
    protocol; ``update_ingested_forecast_length`` reads only these two fields.
    """

    # Forecast initialization time of the source file.
    init_time: Timestamp
    # Lead time (offset from init_time) covered by the source file.
    lead_time: Timedelta
def update_ingested_forecast_length(
    template_ds: xr.Dataset,
    results_coords: Mapping[str, Sequence[DeterministicForecastSourceFileCoord]],
) -> xr.Dataset:
    """
    Update the 'ingested_forecast_length' coordinate in the template dataset.

    The maximum processed lead time across all variables is set as the
    ingested_forecast_length. This can hide the nuance of a specific variable
    having fewer lead times processed than others.

    Mutates ``template_ds`` in place and also returns it for convenience.
    """
    assert "ingested_forecast_length" in template_ds.coords

    # Longest lead time observed per init_time, pooled across every variable.
    longest_lead: dict[Timestamp, Timedelta] = {}
    for coord_seq in results_coords.values():
        for file_coord in coord_seq:
            best_so_far = longest_lead.get(file_coord.init_time)
            if best_so_far is None or file_coord.lead_time > best_so_far:
                longest_lead[file_coord.init_time] = file_coord.lead_time

    # Only init_times that actually appear in results are written; all other
    # entries of the coordinate are left untouched.
    for init_time, lead in longest_lead.items():
        template_ds["ingested_forecast_length"].loc[{"init_time": init_time}] = lead
    return template_ds
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| from collections.abc import Mapping | ||
|
|
||
| import pandas as pd | ||
| import xarray as xr | ||
|
|
||
| from reformatters.common.ingest_stats import update_ingested_forecast_length | ||
| from reformatters.common.region_job import CoordinateValueOrRange, SourceFileCoord | ||
| from reformatters.common.types import Dim, Timedelta, Timestamp | ||
|
|
||
|
|
||
class MockSourceFileCoord(SourceFileCoord):
    """Minimal SourceFileCoord stub carrying only the fields that
    ``update_ingested_forecast_length`` reads."""

    # Forecast initialization time for this fake source file.
    init_time: Timestamp
    # Lead time for this fake source file.
    lead_time: Timedelta

    def out_loc(self) -> Mapping[Dim, CoordinateValueOrRange]:
        # The output location is irrelevant for these tests.
        return {}
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We are missing a test that checks that the existing values in the array are not modified. |
||
def test_update_ingested_forecast_length_simple() -> None:
    """The max lead time per init_time is written, and init_times that
    receive no results are left unmodified (still NaT)."""
    init_times = [
        pd.Timestamp("2025-01-01 12:00"),
        pd.Timestamp("2025-01-01 18:00"),
        # Receives no results below; its NaT value must be preserved.
        pd.Timestamp("2025-01-02 00:00"),
    ]

    empty_deltas = pd.to_timedelta([pd.NaT, pd.NaT, pd.NaT]).to_numpy()  # type: ignore[call-overload]

    ds = xr.Dataset(
        coords={
            "init_time": init_times,
            "ingested_forecast_length": (("init_time",), empty_deltas),
        }
    )

    coord1 = MockSourceFileCoord(
        init_time=pd.Timestamp("2025-01-01 12:00"),
        lead_time=pd.Timedelta(hours=6),
    )
    coord2 = MockSourceFileCoord(
        init_time=pd.Timestamp("2025-01-01 18:00"),
        lead_time=pd.Timedelta(hours=48),
    )

    results = {"var1": [coord1, coord2]}
    ds = update_ingested_forecast_length(ds, results)

    assert ds["ingested_forecast_length"].sel(
        init_time="2025-01-01 12:00"
    ).values == pd.Timedelta(hours=6)
    assert ds["ingested_forecast_length"].sel(
        init_time="2025-01-01 18:00"
    ).values == pd.Timedelta(hours=48)
    # Existing values for init_times absent from results must not be modified.
    assert pd.isnull(
        ds["ingested_forecast_length"].sel(init_time="2025-01-02 00:00").values
    )
def test_update_ingested_forecast_length_update_existing() -> None:
    """A newly ingested, longer lead time overwrites a previously recorded one."""
    init_time = pd.Timestamp("2025-01-01 12:00")

    ds = xr.Dataset(
        coords={
            "init_time": [init_time],
            "ingested_forecast_length": (("init_time",), [pd.Timedelta(hours=6)]),
        }
    )

    new_coord = MockSourceFileCoord(
        init_time=init_time,
        lead_time=pd.Timedelta(hours=12),
    )

    updated = update_ingested_forecast_length(ds, {"var1": [new_coord]})

    recorded = updated["ingested_forecast_length"].sel(init_time=init_time).values
    assert recorded == pd.Timedelta(hours=12)
Uh oh!
There was an error while loading. Please reload this page.