diff --git a/swvo/io/hp/gfz.py b/swvo/io/hp/gfz.py index 309d517..542d679 100755 --- a/swvo/io/hp/gfz.py +++ b/swvo/io/hp/gfz.py @@ -4,16 +4,17 @@ from __future__ import annotations +import json import logging import os from datetime import datetime, timedelta, timezone -from ftplib import FTP from pathlib import Path from shutil import rmtree -from typing import List, Optional +from typing import Optional import numpy as np import pandas as pd +import requests logger = logging.getLogger(__name__) @@ -42,7 +43,7 @@ class HpGFZ: ENV_VAR_NAME = "RT_HP_GFZ_STREAM_DIR" START_YEAR = 1985 - URL = "ftp://ftp.gfz-potsdam.de/pub/home/obs/Hpo/" + API_URL = "https://kp.gfz.de/app/json/" LABEL = "gfz" def __init__(self, index: str, data_dir: Optional[Path] = None) -> None: @@ -99,23 +100,12 @@ def download_and_process( tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") - filenames_download = [ - f"Hp{self.index_number}/Hp{self.index_number}_ap{self.index_number}_{time_interval[0].year!s}.txt" - ] - - # there is a separate nowcast file - if time_interval[0].year == datetime.now(timezone.utc).year: - filenames_download.append( - f"Hp{self.index_number}/Hp{self.index_number}_ap{self.index_number}_nowcast.txt" - ) - try: - for filename_download in filenames_download: - self._download(temporary_dir, filename_download) - - filenames_download = [x.split("/")[-1] for x in filenames_download] # strip folder from filename + # Download data for this time interval + self._download(temporary_dir, time_interval[0], time_interval[1]) - processed_df = self._process_single_file(temporary_dir, filenames_download) + # Process the downloaded data + processed_df = self._process_single_file(temporary_dir, time_interval[0]) file_path.parent.mkdir(parents=True, exist_ok=True) processed_df.to_csv(tmp_path, index=True, header=False) @@ -129,38 +119,48 @@ def download_and_process( rmtree(temporary_dir, ignore_errors=True) - def _download(self, temporary_dir: Path, filename: str) -> None: - """Download a file from the GFZ server. + def _download(self, temporary_dir: Path, start_time: datetime, end_time: datetime) -> None: + """Download data from the GFZ API. Parameters ---------- temporary_dir : Path Temporary directory to store the downloaded file. - filename : str - Full path of the file to download (including folder). + start_time : datetime + Start time for the data request. + end_time : datetime + End time for the data request. Raises ------ Exception - If the FTP download fails. + If the API request fails. """ - logger.debug(f"Downloading file {self.URL + filename} ...") + # Format datetime to ISO format with Z timezone + start_str = start_time.isoformat().replace("+00:00", "Z") + end_str = end_time.isoformat().replace("+00:00", "Z") + + # Determine the index parameter for the API + index_param = f"Hp{self.index_number}" - # Extract just the filename from the path - filename_only = filename.split("/")[-1] - local_path = temporary_dir / filename_only + url = f"{self.API_URL}?start={start_str}&end={end_str}&index={index_param}&status=def" + + logger.debug(f"Downloading data from {url} ...") try: - ftp = FTP("ftp.gfz-potsdam.de") - ftp.login() - ftp.cwd("/pub/home/obs/Hpo/") + response = requests.get(url, timeout=30) + response.raise_for_status() + + data = response.json() + logger.debug(f"Downloaded data: {len(data)} records") - with open(local_path, "wb") as f: - ftp.retrbinary(f"RETR {filename}", f.write) + # Save the JSON response to a temporary file for processing + output_file = temporary_dir / f"hp_data_{start_time.year}.json" + with open(output_file, "w") as f: + json.dump(data, f) - ftp.quit() - except Exception as e: - logger.error(f"FTP download failed: {e}") + except requests.exceptions.RequestException as e: + logger.error(f"API request failed: {e}") raise def read(self, start_time: datetime, end_time: datetime, *, download: bool = False) -> pd.DataFrame: @@ -263,15 +263,15 @@ def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> return file_paths, time_intervals - def _process_single_file(self, temp_dir: Path, filenames: List[str]) -> pd.DataFrame: - """Process HpGFZ file to a DataFrame. + def _process_single_file(self, temp_dir: Path, start_time: datetime) -> pd.DataFrame: + """Process HpGFZ data from JSON response to a DataFrame. Parameters ---------- temp_dir : Path - Temporary directory to store the file. - filenames : List[str] - List of filenames to process. + Temporary directory containing the JSON file. + start_time : datetime + Start time to identify the correct JSON file. Returns ------- @@ -280,48 +280,20 @@ def _process_single_file(self, temp_dir: Path, filenames: List[str]) -> pd.DataF """ data_total = pd.DataFrame() + json_file = temp_dir / f"hp_data_{start_time.year}.json" - # combine nowcast and yearly file - for filename in filenames: - data = {self.index: [], "timestamp": []} - - with open(temp_dir / filename) as f: # noqa: PTH123 - for line in f: - if line[0] == "#": - continue - line = line.split(" ") - line = [x for x in line if x != ""] - - year = line[0] - month = line[1] - day = line[2] - hour = line[3][0:2] - - if int(line[3][3:4]) == 0: - minute = 0 - elif int(line[3][3:4]) == 5: - minute = 30 - else: - msg = "Value for minute not expected" - raise ValueError(msg) - data["timestamp"] += [ - datetime( - int(year), - int(month), - int(day), - int(hour), - minute, - tzinfo=timezone.utc, - ) - ] - data[self.index] += [float(line[7])] - - data = pd.DataFrame(data) - data.index = data["timestamp"] - data = data.drop(labels=["timestamp"], axis=1) - data.loc[data[self.index] == -1, self.index] = np.nan - - data_total = data_total.combine_first(data) + if not json_file.exists(): + logger.warning(f"JSON file {json_file} not found") + return data_total + + with open(json_file) as f: + json_data = json.load(f) + + data_total = pd.DataFrame( + {f"Hp{self.index_number}": json_data[f"Hp{self.index_number}"]}, + index=pd.to_datetime(json_data["datetime"], utc=True), + ) + data_total.index = data_total.index.tz_convert("UTC") return data_total diff --git a/tests/io/hp/test_hp.py b/tests/io/hp/test_hp.py index 79e9a34..5a9fac3 100644 --- a/tests/io/hp/test_hp.py +++ b/tests/io/hp/test_hp.py @@ -4,9 +4,10 @@ # ruff: noqa: S101 +import json from datetime import datetime, timezone from pathlib import Path -from unittest.mock import mock_open, patch +from unittest.mock import Mock, patch import pandas as pd import pytest @@ -63,8 +64,8 @@ def test_missing_env_var(self): Hp30GFZ() def test_read_with_download(self, hp30gfz, mocker, mock_hp_data): - end_time = datetime(2020, 12, 31) # noqa: DTZ001 - start_time = datetime(2020, 1, 1) # noqa: DTZ001 + end_time = datetime(2020, 12, 31) + start_time = datetime(2020, 1, 1) mocker.patch("pathlib.Path.exists", return_value=False) mocker.patch.object(hp30gfz, "download_and_process") @@ -77,8 +78,8 @@ def test_read_with_download(self, hp30gfz, mocker, mock_hp_data): assert "hp30" in df.columns def test_start_year_behind(self, hp30gfz, mocker, mock_hp_data): - start_time = datetime(1980, 1, 1) # noqa: DTZ001 - end_time = datetime(2020, 12, 31) # noqa: DTZ001 + start_time = datetime(1980, 1, 1) + end_time = datetime(2020, 12, 31) mocker.patch("pathlib.Path.exists", return_value=True) mocker.patch.object(hp30gfz, "_read_single_file", return_value=mock_hp_data) @@ -90,25 +91,29 @@ def test_start_year_behind(self, hp30gfz, mocker, mock_hp_data): " available mission files..." ) - def test_process_single_file(self, hp30gfz, tmp_path, mocker): - file_content = """# Header - 2020 01 01 0000 0.0 0.0 0.0 15.0 - 2020 01 01 0030 0.0 0.0 0.0 16.0 - 2020 01 01 0100 0.0 0.0 0.0 17.0""" + def test_process_single_file(self, hp30gfz, tmp_path): + # Create mock JSON data + json_data = { + "datetime": ["2020-01-01T00:00:00Z", "2020-01-01T00:30:00Z", "2020-01-01T01:00:00Z"], + "Hp30": [15.0, 16.0, 17.0], + } - m = mock_open(read_data=file_content) + # Create temporary JSON file + json_file = tmp_path / "hp_data_2020.json" + with open(json_file, "w") as f: + json.dump(json_data, f) - mocker.patch("builtins.open", m) - df = hp30gfz._process_single_file(tmp_path, ["test_file.txt"]) + start_time = datetime(2020, 1, 1, tzinfo=timezone.utc) + df = hp30gfz._process_single_file(tmp_path, start_time) assert len(df) == 3 assert df.index[0] == pd.Timestamp("2020-01-01 00:00:00", tzinfo=timezone.utc) - assert df.iloc[0]["hp30"] == 15.0 - assert "hp30" in df.columns + assert df.iloc[0]["Hp30"] == 15.0 + assert "Hp30" in df.columns def test_get_processed_file_list(self, hp30gfz): - start_time = datetime(2020, 1, 1) # noqa: DTZ001 - end_time = datetime(2021, 12, 31) # noqa: DTZ001 + start_time = datetime(2020, 1, 1) + end_time = datetime(2021, 12, 31) file_paths, time_intervals = hp30gfz._get_processed_file_list(start_time, end_time) @@ -119,29 +124,45 @@ def test_get_processed_file_list(self, hp30gfz): assert time_intervals[1][0].year == 2021 def test_invalid_time_range(self, hp30gfz): - end_time = datetime(2020, 1, 1) # noqa: DTZ001 - start_time = datetime(2020, 12, 31) # noqa: DTZ001 + end_time = datetime(2020, 1, 1) + start_time = datetime(2020, 12, 31) with pytest.raises(AssertionError): hp30gfz.read(start_time, end_time) def test_download_and_process(self, hp30gfz, mocker): - start_time = datetime(2020, 1, 1) # noqa: DTZ001 - end_time = datetime(2020, 12, 31) # noqa: DTZ001 + start_time = datetime(2020, 1, 1) + end_time = datetime(2020, 12, 31) - # Mock FTP operations - mock_ftp = mocker.Mock() - mock_ftp.retrbinary = mocker.Mock() - mocker.patch("swvo.io.hp.gfz.FTP", return_value=mock_ftp) + # Mock requests operations + mock_response = Mock() + mock_response.json.return_value = {"datetime": ["2020-01-01T00:00:00Z"], "Hp30": [15.0]} + mock_response.raise_for_status = Mock() + mock_get = mocker.patch("swvo.io.hp.gfz.requests.get", return_value=mock_response) - mocker.patch("shutil.rmtree") + mocker.patch("swvo.io.hp.gfz.rmtree") mocker.patch.object(hp30gfz, "_process_single_file", return_value=pd.DataFrame()) hp30gfz.download_and_process(start_time, end_time) + mock_get.assert_called() + + def test_download_api_url_construction(self, hp30gfz, tmp_path, mocker): + start_time = datetime(2020, 1, 1, tzinfo=timezone.utc) + end_time = datetime(2020, 12, 31, tzinfo=timezone.utc) + + # Mock requests + mock_response = Mock() + mock_response.json.return_value = {"datetime": ["2020-01-01T00:00:00Z"], "Hp30": [15.0]} + mock_response.raise_for_status = Mock() + mock_get = mocker.patch("swvo.io.hp.gfz.requests.get", return_value=mock_response) - mock_ftp.login.assert_called() - mock_ftp.retrbinary.assert_called() - mock_ftp.quit.assert_called() + hp30gfz._download(tmp_path, start_time, end_time) + + # Verify the URL was constructed correctly + expected_url = ( + "https://kp.gfz.de/app/json/?start=2020-01-01T00:00:00Z&end=2020-12-31T00:00:00Z&index=Hp30&status=def" + ) + mock_get.assert_called_once_with(expected_url, timeout=30) @pytest.fixture def sample_csv_data(self):