diff --git a/src/reformatters/noaa/gefs/analysis/templates/latest.zarr/geopotential_height_500hpa/zarr.json b/src/reformatters/noaa/gefs/analysis/templates/latest.zarr/geopotential_height_500hpa/zarr.json new file mode 100644 index 000000000..6519281d7 --- /dev/null +++ b/src/reformatters/noaa/gefs/analysis/templates/latest.zarr/geopotential_height_500hpa/zarr.json @@ -0,0 +1,84 @@ +{ + "shape": [ + 1, + 721, + 1440 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 2880, + 384, + 384 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": [ + 1440, + 32, + 32 + ], + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "index_codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "crc32c" + } + ], + "index_location": "end" + } + } + ], + "attributes": { + "long_name": "Geopotential height", + "short_name": "gh", + "standard_name": "geopotential_height", + "units": "m", + "step_type": "instant", + "coordinates": "spatial_ref", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "time", + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] +} \ No newline at end of file diff --git a/src/reformatters/noaa/gefs/analysis/templates/latest.zarr/zarr.json b/src/reformatters/noaa/gefs/analysis/templates/latest.zarr/zarr.json index 88f864bb8..7981e106b 100644 --- a/src/reformatters/noaa/gefs/analysis/templates/latest.zarr/zarr.json +++ b/src/reformatters/noaa/gefs/analysis/templates/latest.zarr/zarr.json @@ -521,6 +521,90 @@ "node_type": "array", "storage_transformers": [] }, + "geopotential_height_500hpa": { + "shape": [ + 1, + 721, + 1440 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 2880, + 384, + 384 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": [ + 1440, + 32, + 32 + ], + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "index_codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "crc32c" + } + ], + "index_location": "end" + } + } + ], + "attributes": { + "long_name": "Geopotential height", + "short_name": "gh", + "standard_name": "geopotential_height", + "units": "m", + "step_type": "instant", + "coordinates": "spatial_ref", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "time", + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, "geopotential_height_cloud_ceiling": { "shape": [ 1, diff --git a/src/reformatters/noaa/gefs/common_gefs_template_config.py b/src/reformatters/noaa/gefs/common_gefs_template_config.py index 394428df8..1150340de 100644 --- a/src/reformatters/noaa/gefs/common_gefs_template_config.py +++ b/src/reformatters/noaa/gefs/common_gefs_template_config.py @@ -474,6 +474,25 @@ def get_shared_data_var_configs( keep_mantissa_bits=8, ), ), + GEFSDataVar( + name="geopotential_height_500hpa", + encoding=encoding_float32, + attrs=DataVarAttrs( + short_name="gh", + long_name="Geopotential height", + units="m", + step_type="instant", + standard_name="geopotential_height", + ), + internal_attrs=GEFSInternalAttrs( + grib_element="HGT", + grib_description='50000[Pa] ISBL="Isobaric surface"', + grib_index_level="500 mb", + gefs_file_type="a", + index_position=31, + keep_mantissa_bits=11, + ), + ), GEFSDataVar( name="downward_short_wave_radiation_flux_surface", encoding=encoding_float32, diff --git a/src/reformatters/noaa/gefs/forecast_35_day/templates/latest.zarr/geopotential_height_500hpa/zarr.json b/src/reformatters/noaa/gefs/forecast_35_day/templates/latest.zarr/geopotential_height_500hpa/zarr.json new file mode 100644 index 000000000..2690235a2 --- /dev/null +++ b/src/reformatters/noaa/gefs/forecast_35_day/templates/latest.zarr/geopotential_height_500hpa/zarr.json @@ -0,0 +1,92 @@ +{ + "shape": [ + 1, + 31, + 181, + 721, + 1440 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 1, + 31, + 192, + 374, + 368 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": [ + 1, + 31, + 64, + 17, + 16 + ], + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "index_codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "crc32c" + } + ], + "index_location": "end" + } + } + ], + "attributes": { + "long_name": "Geopotential height", + "short_name": "gh", + "standard_name": "geopotential_height", + "units": "m", + "step_type": "instant", + "coordinates": "expected_forecast_length ingested_forecast_length spatial_ref valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "init_time", + "ensemble_member", + "lead_time", + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] +} \ No newline at end of file diff --git a/src/reformatters/noaa/gefs/forecast_35_day/templates/latest.zarr/zarr.json b/src/reformatters/noaa/gefs/forecast_35_day/templates/latest.zarr/zarr.json index 426858024..a50d6f4b4 100644 --- a/src/reformatters/noaa/gefs/forecast_35_day/templates/latest.zarr/zarr.json +++ b/src/reformatters/noaa/gefs/forecast_35_day/templates/latest.zarr/zarr.json @@ -676,6 +676,98 @@ "node_type": "array", "storage_transformers": [] }, + "geopotential_height_500hpa": { + "shape": [ + 1, + 31, + 181, + 721, + 1440 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 1, + 31, + 192, + 374, + 368 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": [ + 1, + 31, + 64, + 17, + 16 + ], + "codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "blosc", + "configuration": { + "typesize": 4, + "cname": "zstd", + "clevel": 3, + "shuffle": "shuffle", + "blocksize": 0 + } + } + ], + "index_codecs": [ + { + "name": "bytes", + "configuration": { + "endian": "little" + } + }, + { + "name": "crc32c" + } + ], + "index_location": "end" + } + } + ], + "attributes": { + "long_name": "Geopotential height", + "short_name": "gh", + "standard_name": "geopotential_height", + "units": "m", + "step_type": "instant", + "coordinates": "expected_forecast_length ingested_forecast_length spatial_ref valid_time", + "_FillValue": "AAAAAAAA+H8=" + }, + "dimension_names": [ + "init_time", + "ensemble_member", + "lead_time", + "latitude", + "longitude" + ], + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] + }, "geopotential_height_cloud_ceiling": { "shape": [ 1, diff --git a/src/reformatters/noaa/gefs/gefs_config_models.py b/src/reformatters/noaa/gefs/gefs_config_models.py index eb5af310b..2fd625de2 100644 --- a/src/reformatters/noaa/gefs/gefs_config_models.py +++ b/src/reformatters/noaa/gefs/gefs_config_models.py @@ -60,6 +60,7 @@ class GEFSDataVar(DataVar[GEFSInternalAttrs]): "2 m above ground": "2m", "10 m above ground": "hgt", "100 m above ground": "hgt", + "500 mb": "pres_abv700mb", } GEFS_REFORECAST_GRIB_ELEMENT_RENAME = { "PRMSL": "PRES", # In the reforecast, PRMSL is PRES with level "mean sea level" diff --git a/src/reformatters/noaa/gefs/read_data.py b/src/reformatters/noaa/gefs/read_data.py index b0936ed7f..7536e1fd2 100644 --- a/src/reformatters/noaa/gefs/read_data.py +++ b/src/reformatters/noaa/gefs/read_data.py @@ -91,7 +91,14 @@ def read_rasterio( result: Array2D[np.float32] match true_gefs_file_type: - case "a" | "b": + case "s": + # Confirm the arguments we use to resample 1.0/0.5 degree data + # to 0.25 degree grid below match the source 0.25 degree data. + assert reader.shape == out_spatial_shape + assert reader.transform == out_transform + assert reader.crs.to_dict() == out_crs + return reader.read(rasterio_band_index, out_dtype=np.float32) + case "a" | "b" | "reforecast": # Interpolate 1.0/0.5 degree data to the 0.25 degree grid. # Every 2nd (0.5 deg) or every 4th (1.0 deg) 0.25 degree pixel's center aligns exactly # with a 1.0/0.5 degree pixel's center. @@ -102,6 +109,11 @@ def read_rasterio( # Note: having the .read() call interpolate gives very slightly shifted results # so we pay for an extra memory allocation and use reproject to do the interpolation instead. raw = reader.read(rasterio_band_index, out_dtype=np.float32) + if reader.shape == out_spatial_shape: + # Some reforecast files are already 0.25° - no reprojection needed. + assert reader.transform == out_transform + assert reader.crs.to_dict() == out_crs + return raw result, _ = rasterio.warp.reproject( raw, np.full(out_spatial_shape, np.nan, dtype=np.float32), @@ -116,17 +128,5 @@ def read_rasterio( step = 2 if is_v12(coord.init_time) else 4 assert np.array_equal(raw, result[::step, ::step]) return result - case "s" | "reforecast": - # Confirm the arguments we use to resample 1.0/0.5 degree data - # to 0.25 degree grid above match the source 0.25 degree data. - assert reader.shape == out_spatial_shape - assert reader.transform == out_transform - assert reader.crs.to_dict() == out_crs - - result = reader.read( - rasterio_band_index, - out_dtype=np.float32, - ) - return result case _ as unreachable: assert_never(unreachable) diff --git a/tests/noaa/gefs/analysis/region_job_test.py b/tests/noaa/gefs/analysis/region_job_test.py index 3a9535810..97334f8af 100644 --- a/tests/noaa/gefs/analysis/region_job_test.py +++ b/tests/noaa/gefs/analysis/region_job_test.py @@ -1,4 +1,5 @@ import tempfile +from concurrent.futures import ThreadPoolExecutor from datetime import datetime from pathlib import Path from unittest.mock import MagicMock, Mock, patch @@ -23,6 +24,7 @@ GefsAnalysisRegionJob, GefsAnalysisSourceFileCoord, ) +from reformatters.noaa.gefs.analysis.template_config import GefsAnalysisTemplateConfig from reformatters.noaa.gefs.gefs_config_models import ( GEFSDataVar, GEFSInternalAttrs, @@ -769,3 +771,122 @@ def test_download_file_fallback_permission_denied_converts_to_file_not_found( # Verify it's a FileNotFoundError with PermissionDeniedError as cause assert exc_info.value.__cause__ is not None assert isinstance(exc_info.value.__cause__, PermissionDeniedError) + + +def _make_analysis_region_job() -> GefsAnalysisRegionJob: + template_config = GefsAnalysisTemplateConfig() + return GefsAnalysisRegionJob.model_construct( + tmp_store=Mock(), + template_ds=template_config.get_template(pd.Timestamp.now()), + data_vars=template_config.data_vars, + append_dim=template_config.append_dim, + region=slice(0, 1), + reformat_job_name="test", + ) + + +# Variables not present in the GEFSv12 reforecast archive (file doesn't exist or not in GRIB index) +_REFORECAST_MISSING_VARS = frozenset( + { + "relative_humidity_2m", + "pressure_reduced_to_mean_sea_level", + "categorical_snow_surface", + "categorical_ice_pellets_surface", + "categorical_freezing_rain_surface", + "categorical_rain_surface", + "percent_frozen_precipitation_surface", + } +) + +# Variables not present in pre-v12 GEFS files (2020-01-01 to 2020-09-23) +_PRE_V12_MISSING_VARS = frozenset( + { + "geopotential_height_cloud_ceiling", # not in pre-v12 b-files + } +) + + +@pytest.mark.slow +def test_download_and_read_all_vars_reforecast() -> None: + """Download and read all vars from GEFS v12 reforecast (2000-2019), one var per file. + + Analysis source_groups returns all vars in one group; we iterate one var at a time + to match how the reforecast is accessed (one file per variable). + """ + template_config = GefsAnalysisTemplateConfig() + region_job = _make_analysis_region_job() + init_time = pd.Timestamp("2018-01-01T00:00") + lead_time = pd.Timedelta(hours=24) + data_vars = [ + dv + for dv in template_config.data_vars + if dv.name not in _REFORECAST_MISSING_VARS + ] + + def _download_and_check(data_var: GEFSDataVar) -> None: + coord = GefsAnalysisSourceFileCoord( + init_time=init_time, + lead_time=lead_time, + data_vars=[data_var], + ) + coord = replace(coord, downloaded_path=region_job.download_file(coord)) + data = region_job.read_data(coord, data_var) + assert np.all(np.isfinite(data)), f"Non-finite values for {data_var.name}" + + with ThreadPoolExecutor() as pool: + list(pool.map(_download_and_check, data_vars)) + + +@pytest.mark.slow +def test_download_and_read_all_vars_pre_v12() -> None: + """Download and read all vars from pre-GEFS v12 period (2020-01-01 to 2020-09-23). + + Analysis source_groups returns all vars in one group; we iterate one var at a time + to match actual processing (max_vars_per_download_group=1). + """ + template_config = GefsAnalysisTemplateConfig() + region_job = _make_analysis_region_job() + init_time = pd.Timestamp("2020-06-01T00:00") + lead_time = pd.Timedelta(hours=6) + data_vars = [ + dv for dv in template_config.data_vars if dv.name not in _PRE_V12_MISSING_VARS + ] + + def _download_and_check(data_var: GEFSDataVar) -> None: + coord = GefsAnalysisSourceFileCoord( + init_time=init_time, + lead_time=lead_time, + data_vars=[data_var], + ) + coord = replace(coord, downloaded_path=region_job.download_file(coord)) + data = region_job.read_data(coord, data_var) + assert np.all(np.isfinite(data)), f"Non-finite values for {data_var.name}" + + with ThreadPoolExecutor() as pool: + list(pool.map(_download_and_check, data_vars)) + + +@pytest.mark.slow +def test_download_and_read_all_vars_current_early_lead() -> None: + """Download and read all vars from current GEFS analysis archive (s-files, lead = 6h). + + Analysis source_groups returns all vars in one group; we iterate one var at a time + to match actual processing (max_vars_per_download_group=1). + """ + template_config = GefsAnalysisTemplateConfig() + region_job = _make_analysis_region_job() + init_time = pd.Timestamp("2024-01-01T00:00") + lead_time = pd.Timedelta(hours=6) + + def _download_and_check(data_var: GEFSDataVar) -> None: + coord = GefsAnalysisSourceFileCoord( + init_time=init_time, + lead_time=lead_time, + data_vars=[data_var], + ) + coord = replace(coord, downloaded_path=region_job.download_file(coord)) + data = region_job.read_data(coord, data_var) + assert np.all(np.isfinite(data)), f"Non-finite values for {data_var.name}" + + with ThreadPoolExecutor() as pool: + list(pool.map(_download_and_check, template_config.data_vars)) diff --git a/tests/noaa/gefs/forecast_35_day/region_job_test.py b/tests/noaa/gefs/forecast_35_day/region_job_test.py index 466ffb1f8..e654a81ef 100644 --- a/tests/noaa/gefs/forecast_35_day/region_job_test.py +++ b/tests/noaa/gefs/forecast_35_day/region_job_test.py @@ -1,4 +1,5 @@ import tempfile +from concurrent.futures import ThreadPoolExecutor from datetime import datetime from pathlib import Path from unittest.mock import MagicMock, Mock, patch @@ -22,6 +23,9 @@ GefsForecast35DayRegionJob, GefsForecast35DaySourceFileCoord, ) +from reformatters.noaa.gefs.forecast_35_day.template_config import ( + GefsForecast35DayTemplateConfig, +) from reformatters.noaa.gefs.gefs_config_models import ( GEFSDataVar, GefsEnsembleSourceFileCoord, @@ -704,3 +708,46 @@ def test_download_file_fallback_permission_denied_converts_to_file_not_found( # Verify it's a FileNotFoundError with PermissionDeniedError as cause assert exc_info.value.__cause__ is not None assert isinstance(exc_info.value.__cause__, PermissionDeniedError) + + +def _make_forecast_region_job() -> GefsForecast35DayRegionJob: + template_config = GefsForecast35DayTemplateConfig() + return GefsForecast35DayRegionJob.model_construct( + tmp_store=Mock(), + template_ds=template_config.get_template(pd.Timestamp.now()), + data_vars=template_config.data_vars, + append_dim=template_config.append_dim, + region=slice(0, 1), + reformat_job_name="test", + ) + + +@pytest.mark.slow +@pytest.mark.parametrize( + "lead_time", + [ + pd.Timedelta(hours=6), # s-files (lead <= 240h) + pd.Timedelta(hours=252), # a/b-files (lead > 240h) + ], +) +def test_download_and_read_all_vars_current(lead_time: pd.Timedelta) -> None: + """Download and read all vars from current GEFS archive at early and later lead times.""" + template_config = GefsForecast35DayTemplateConfig() + region_job = _make_forecast_region_job() + init_time = pd.Timestamp("2024-01-01T00:00") + + def _download_and_check(group: list[GEFSDataVar]) -> None: + coord = GefsForecast35DaySourceFileCoord( + init_time=init_time, + lead_time=lead_time, + ensemble_member=0, + data_vars=group, + ) + coord = replace(coord, downloaded_path=region_job.download_file(coord)) + for data_var in group: + data = region_job.read_data(coord, data_var) + assert np.all(np.isfinite(data)), f"Non-finite values for {data_var.name}" + + groups = list(GefsForecast35DayRegionJob.source_groups(template_config.data_vars)) + with ThreadPoolExecutor() as pool: + list(pool.map(_download_and_check, groups))