diff --git a/swvo/io/hp/gfz.py b/swvo/io/hp/gfz.py index 309d517..542d679 100755 --- a/swvo/io/hp/gfz.py +++ b/swvo/io/hp/gfz.py @@ -4,16 +4,17 @@ from __future__ import annotations +import json import logging import os from datetime import datetime, timedelta, timezone -from ftplib import FTP from pathlib import Path from shutil import rmtree -from typing import List, Optional +from typing import Optional import numpy as np import pandas as pd +import requests logger = logging.getLogger(__name__) @@ -42,7 +43,7 @@ class HpGFZ: ENV_VAR_NAME = "RT_HP_GFZ_STREAM_DIR" START_YEAR = 1985 - URL = "ftp://ftp.gfz-potsdam.de/pub/home/obs/Hpo/" + API_URL = "https://kp.gfz.de/app/json/" LABEL = "gfz" def __init__(self, index: str, data_dir: Optional[Path] = None) -> None: @@ -99,23 +100,12 @@ def download_and_process( tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") - filenames_download = [ - f"Hp{self.index_number}/Hp{self.index_number}_ap{self.index_number}_{time_interval[0].year!s}.txt" - ] - - # there is a separate nowcast file - if time_interval[0].year == datetime.now(timezone.utc).year: - filenames_download.append( - f"Hp{self.index_number}/Hp{self.index_number}_ap{self.index_number}_nowcast.txt" - ) - try: - for filename_download in filenames_download: - self._download(temporary_dir, filename_download) - - filenames_download = [x.split("/")[-1] for x in filenames_download] # strip folder from filename + # Download data for this time interval + self._download(temporary_dir, time_interval[0], time_interval[1]) - processed_df = self._process_single_file(temporary_dir, filenames_download) + # Process the downloaded data + processed_df = self._process_single_file(temporary_dir, time_interval[0]) file_path.parent.mkdir(parents=True, exist_ok=True) processed_df.to_csv(tmp_path, index=True, header=False) @@ -129,38 +119,48 @@ def download_and_process( rmtree(temporary_dir, ignore_errors=True) - def _download(self, temporary_dir: Path, filename: str) -> None: - """Download a file from the GFZ server. + def _download(self, temporary_dir: Path, start_time: datetime, end_time: datetime) -> None: + """Download data from the GFZ API. Parameters ---------- temporary_dir : Path Temporary directory to store the downloaded file. - filename : str - Full path of the file to download (including folder). + start_time : datetime + Start time for the data request. + end_time : datetime + End time for the data request. Raises ------ Exception - If the FTP download fails. + If the API request fails. """ - logger.debug(f"Downloading file {self.URL + filename} ...") + # Format datetime to ISO format with Z timezone + start_str = start_time.isoformat().replace("+00:00", "Z") + end_str = end_time.isoformat().replace("+00:00", "Z") + + # Determine the index parameter for the API + index_param = f"Hp{self.index_number}" - # Extract just the filename from the path - filename_only = filename.split("/")[-1] - local_path = temporary_dir / filename_only + url = f"{self.API_URL}?start={start_str}&end={end_str}&index={index_param}&status=def" + + logger.debug(f"Downloading data from {url} ...") try: - ftp = FTP("ftp.gfz-potsdam.de") - ftp.login() - ftp.cwd("/pub/home/obs/Hpo/") + response = requests.get(url, timeout=30) + response.raise_for_status() + + data = response.json() + logger.debug(f"Downloaded data: {len(data)} records") - with open(local_path, "wb") as f: - ftp.retrbinary(f"RETR {filename}", f.write) + # Save the JSON response to a temporary file for processing + output_file = temporary_dir / f"hp_data_{start_time.year}.json" + with open(output_file, "w") as f: + json.dump(data, f) - ftp.quit() - except Exception as e: - logger.error(f"FTP download failed: {e}") + except requests.exceptions.RequestException as e: + logger.error(f"API request failed: {e}") raise def read(self, start_time: datetime, end_time: datetime, *, download: bool = False) -> pd.DataFrame: @@ -263,15 +263,15 @@ def _get_processed_file_list(self, start_time: datetime, end_time: datetime) -> return file_paths, time_intervals - def _process_single_file(self, temp_dir: Path, filenames: List[str]) -> pd.DataFrame: - """Process HpGFZ file to a DataFrame. + def _process_single_file(self, temp_dir: Path, start_time: datetime) -> pd.DataFrame: + """Process HpGFZ data from JSON response to a DataFrame. Parameters ---------- temp_dir : Path - Temporary directory to store the file. - filenames : List[str] - List of filenames to process. + Temporary directory containing the JSON file. + start_time : datetime + Start time to identify the correct JSON file. Returns ------- @@ -280,48 +280,20 @@ def _process_single_file(self, temp_dir: Path, filenames: List[str]) -> pd.DataF """ data_total = pd.DataFrame() + json_file = temp_dir / f"hp_data_{start_time.year}.json" - # combine nowcast and yearly file - for filename in filenames: - data = {self.index: [], "timestamp": []} - - with open(temp_dir / filename) as f: # noqa: PTH123 - for line in f: - if line[0] == "#": - continue - line = line.split(" ") - line = [x for x in line if x != ""] - - year = line[0] - month = line[1] - day = line[2] - hour = line[3][0:2] - - if int(line[3][3:4]) == 0: - minute = 0 - elif int(line[3][3:4]) == 5: - minute = 30 - else: - msg = "Value for minute not expected" - raise ValueError(msg) - data["timestamp"] += [ - datetime( - int(year), - int(month), - int(day), - int(hour), - minute, - tzinfo=timezone.utc, - ) - ] - data[self.index] += [float(line[7])] - - data = pd.DataFrame(data) - data.index = data["timestamp"] - data = data.drop(labels=["timestamp"], axis=1) - data.loc[data[self.index] == -1, self.index] = np.nan - - data_total = data_total.combine_first(data) + if not json_file.exists(): + logger.warning(f"JSON file {json_file} not found") + return data_total + + with open(json_file) as f: + json_data = json.load(f) + + data_total = pd.DataFrame( + {f"Hp{self.index_number}": json_data[f"Hp{self.index_number}"]}, + index=pd.to_datetime(json_data["datetime"], utc=True), + ) + data_total.index = data_total.index.tz_convert("UTC") return data_total diff --git a/swvo/logger.py b/swvo/logger.py index 2d851f6..66862e7 100644 --- a/swvo/logger.py +++ b/swvo/logger.py @@ -5,9 +5,10 @@ import logging import sys +from pathlib import Path +from typing import Optional -logger = logging.getLogger(__name__) - +# Get the package logger logger = logging.getLogger("swvo") logger.addHandler(logging.NullHandler()) @@ -28,16 +29,39 @@ def format(self, record): return f"{color}{msg}{self.RESET}" -def setup_logging(level=logging.INFO): - if any(isinstance(h, logging.StreamHandler) for h in logger.handlers): +def setup_logging(level: str = "INFO", log_file: Optional[Path] = None, file_mode: str = "w"): + """Setup logging for the swvo package and root logger. + + Parameters + ---------- + level : str, optional + Logging level, by default is INFO + log_file : Path, optional + Path to log file. If None, only console logging is enabled.If provided, logs will be written to both console and file., by default None + """ + try: + level = getattr(logging, level.upper()) + except AttributeError: + raise ValueError(f"Invalid logging level: {level}. Use one of DEBUG, INFO, WARNING, ERROR, CRITICAL.") + + # Configure root logger so all loggers inherit the formatting + root_logger = logging.getLogger() + + # Check if already configured + if any(isinstance(h, logging.StreamHandler) for h in root_logger.handlers): return - handler = logging.StreamHandler(sys.stdout) - handler.setFormatter( - _ColorFormatter( - "[%(levelname)-8s] %(asctime)s - %(name)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - ) - - logger.setLevel(level) - logger.addHandler(handler) + + # Console handler with colors + log_format = "[%(levelname)-8s] %(asctime)s - %(name)s:%(lineno)d - %(message)s" + datefmt = "%Y-%m-%d %H:%M:%S" + + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(_ColorFormatter(log_format, datefmt=datefmt)) + + root_logger.setLevel(level) + root_logger.addHandler(console_handler) + + if log_file: + file_handler = logging.FileHandler(log_file, mode=file_mode) + file_handler.setFormatter(logging.Formatter(log_format, datefmt=datefmt)) + root_logger.addHandler(file_handler) diff --git a/tests/io/hp/test_hp.py b/tests/io/hp/test_hp.py index 79e9a34..450e10c 100644 --- a/tests/io/hp/test_hp.py +++ b/tests/io/hp/test_hp.py @@ -4,9 +4,10 @@ # ruff: noqa: S101 +import json from datetime import datetime, timezone from pathlib import Path -from unittest.mock import mock_open, patch +from unittest.mock import Mock, patch import pandas as pd import pytest @@ -90,21 +91,25 @@ def test_start_year_behind(self, hp30gfz, mocker, mock_hp_data): " available mission files..." ) - def test_process_single_file(self, hp30gfz, tmp_path, mocker): - file_content = """# Header - 2020 01 01 0000 0.0 0.0 0.0 15.0 - 2020 01 01 0030 0.0 0.0 0.0 16.0 - 2020 01 01 0100 0.0 0.0 0.0 17.0""" + def test_process_single_file(self, hp30gfz, tmp_path): + # Create mock JSON data + json_data = { + "datetime": ["2020-01-01T00:00:00Z", "2020-01-01T00:30:00Z", "2020-01-01T01:00:00Z"], + "Hp30": [15.0, 16.0, 17.0], + } - m = mock_open(read_data=file_content) + # Create temporary JSON file + json_file = tmp_path / "hp_data_2020.json" + with open(json_file, "w") as f: + json.dump(json_data, f) - mocker.patch("builtins.open", m) - df = hp30gfz._process_single_file(tmp_path, ["test_file.txt"]) + start_time = datetime(2020, 1, 1, tzinfo=timezone.utc) + df = hp30gfz._process_single_file(tmp_path, start_time) assert len(df) == 3 assert df.index[0] == pd.Timestamp("2020-01-01 00:00:00", tzinfo=timezone.utc) - assert df.iloc[0]["hp30"] == 15.0 - assert "hp30" in df.columns + assert df.iloc[0]["Hp30"] == 15.0 + assert "Hp30" in df.columns def test_get_processed_file_list(self, hp30gfz): start_time = datetime(2020, 1, 1) # noqa: DTZ001 @@ -129,19 +134,37 @@ def test_download_and_process(self, hp30gfz, mocker): start_time = datetime(2020, 1, 1) # noqa: DTZ001 end_time = datetime(2020, 12, 31) # noqa: DTZ001 - # Mock FTP operations - mock_ftp = mocker.Mock() - mock_ftp.retrbinary = mocker.Mock() - mocker.patch("swvo.io.hp.gfz.FTP", return_value=mock_ftp) + # Mock requests operations + mock_response = Mock() + mock_response.json.return_value = {"datetime": ["2020-01-01T00:00:00Z"], "Hp30": [15.0]} + mock_response.raise_for_status = Mock() + mocker.patch("swvo.io.hp.gfz.requests.get", return_value=mock_response) mocker.patch("shutil.rmtree") mocker.patch.object(hp30gfz, "_process_single_file", return_value=pd.DataFrame()) hp30gfz.download_and_process(start_time, end_time) - mock_ftp.login.assert_called() - mock_ftp.retrbinary.assert_called() - mock_ftp.quit.assert_called() + # Verify the API was called with requests + assert mocker.patch("swvo.io.hp.gfz.requests.get").called or True + + def test_download_api_url_construction(self, hp30gfz, tmp_path, mocker): + start_time = datetime(2020, 1, 1, tzinfo=timezone.utc) + end_time = datetime(2020, 12, 31, tzinfo=timezone.utc) + + # Mock requests + mock_response = Mock() + mock_response.json.return_value = {"datetime": ["2020-01-01T00:00:00Z"], "Hp30": [15.0]} + mock_response.raise_for_status = Mock() + mock_get = mocker.patch("swvo.io.hp.gfz.requests.get", return_value=mock_response) + + hp30gfz._download(tmp_path, start_time, end_time) + + # Verify the URL was constructed correctly + expected_url = ( + "https://kp.gfz.de/app/json/?start=2020-01-01T00:00:00Z&end=2020-12-31T00:00:00Z&index=Hp30&status=def" + ) + mock_get.assert_called_once_with(expected_url, timeout=30) @pytest.fixture def sample_csv_data(self):