diff --git a/MANIFEST.in b/MANIFEST.in index 81ab62ed..36ebec26 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ -include pydartdiags/obs_sequence/composite_types.yaml +include src/pydartdiags/obs_sequence/composite_types.yaml diff --git a/docs/api/example_data.rst b/docs/api/example_data.rst new file mode 100644 index 00000000..047159a6 --- /dev/null +++ b/docs/api/example_data.rst @@ -0,0 +1,9 @@ +.. example_data: + +===================== +module: data +===================== + +.. automodule:: data + :members: + :member-order: bysource \ No newline at end of file diff --git a/docs/api/index.rst b/docs/api/index.rst index 6ae26367..64670ce3 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -12,3 +12,4 @@ API Guide stats matplots plots + example_data diff --git a/examples/01_manipulating/plot_duplicates.py b/examples/01_manipulating/plot_duplicates.py index 041f3bbb..6d71189b 100644 --- a/examples/01_manipulating/plot_duplicates.py +++ b/examples/01_manipulating/plot_duplicates.py @@ -9,15 +9,14 @@ ########################################### # Import the obs_sequence module -import os import pydartdiags.obs_sequence.obs_sequence as obsq +from pydartdiags.data import get_example_data ########################################### # Read in the observation sequence file. In this example we'll use a real obs_seq file, -# the NCEP+ACARS.201303_6H.obs_seq2013030306 file that comes with the pyDARTdiags package. +# the NCEP+ACARS.201303_6H.obs_seq2013030306 file. # This is 6 hours of observations from March 3, 2013. 
-data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "NCEP+ACARS.201303_6H.obs_seq2013030306") +data_file = get_example_data("NCEP+ACARS.201303_6H.obs_seq2013030306") obs_seq = obsq.ObsSequence(data_file) diff --git a/examples/01_manipulating/plot_external_fo.py b/examples/01_manipulating/plot_external_fo.py index 72902fc2..46740596 100644 --- a/examples/01_manipulating/plot_external_fo.py +++ b/examples/01_manipulating/plot_external_fo.py @@ -16,16 +16,15 @@ ########################################### # Import the obs_sequence module, and numpy -import os import pydartdiags.obs_sequence.obs_sequence as obsq +from pydartdiags.data import get_example_data import numpy as np ########################################### # Read in the observation sequence file. In this example we'll use the -# obs_seq.out.tracer file that comes with the pyDARTdiags package. -# This file only has two observations. -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.out.tracer") +# obs_seq.out.tracer file. +# This file only has two observations. +data_file = get_example_data("obs_seq.out.tracer") obs_seq = obsq.ObsSequence(data_file) ########################################### diff --git a/examples/01_manipulating/plot_join_obs_seq.py b/examples/01_manipulating/plot_join_obs_seq.py index 1cb5c703..0269a4d6 100644 --- a/examples/01_manipulating/plot_join_obs_seq.py +++ b/examples/01_manipulating/plot_join_obs_seq.py @@ -10,15 +10,12 @@ ########################################### # Import the obs_sequence module. import pydartdiags.obs_sequence.obs_sequence as obsq +from pydartdiags.data import get_example_data ########################################### # Chose the first obs_seq file to read. -# In this example, we are using a small obs_seq file "obs_seq.final.1000" -# that comes with the pyDARTdiags package -# in the data directory, so we ``import os`` to get the path to the file. 
-import os -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file1 = os.path.join(data_dir, "obs_seq.final.1000") +# In this example, we are using a small obs_seq file "obs_seq.final.1000". +data_file1 = get_example_data("obs_seq.final.1000") ########################################### # Read the obs_seq file into an obs_seq object. @@ -30,7 +27,7 @@ ########################################### # Chose the second obs_seq file to read. -data_file2 = os.path.join(data_dir, "obs_seq.final.ascii.small") +data_file2 = get_example_data("obs_seq.final.ascii.small") obs_seq2 = obsq.ObsSequence(data_file2) print('obs_seq2 has assimilation info:', obs_seq2.has_assimilation_info()) diff --git a/examples/01_manipulating/plot_remove_obs.py b/examples/01_manipulating/plot_remove_obs.py index 5e9ee832..f6d7c909 100644 --- a/examples/01_manipulating/plot_remove_obs.py +++ b/examples/01_manipulating/plot_remove_obs.py @@ -10,15 +10,12 @@ ########################################### # Import the obs_sequence module import pydartdiags.obs_sequence.obs_sequence as obsq +from pydartdiags.data import get_example_data ########################################### # Chose an obs_seq file to read. -# In this example, we are using a small obs_seq file "obs_seq.final.medium" -# that comes with the pyDARTdiags package -# in the data directory, so we ``import os`` to get the path to the file. -import os -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.final.ascii.medium") +# In this example, we are using a small obs_seq file "obs_seq.final.ascii.medium". +data_file = get_example_data("obs_seq.final.ascii.medium") ########################################### # Read the obs_seq file into an obs_seq object. 
diff --git a/examples/02_visualizing/plot_qc_hover.py b/examples/02_visualizing/plot_qc_hover.py index 5dc8c799..019bd040 100644 --- a/examples/02_visualizing/plot_qc_hover.py +++ b/examples/02_visualizing/plot_qc_hover.py @@ -13,20 +13,17 @@ ########################################### # Import the modules -import os import plotly.express as px import pydartdiags.obs_sequence.obs_sequence as obsq +from pydartdiags.data import get_example_data # sphinx_gallery_thumbnail_path = '_static/geo_thumb.png' ########################################### # Read the obs_seq file into an obs_seq object. -# In this example, we use a small obs_seq file "obs_seq.final.ascii.medium" -# that comes with the pyDARTdiags package -# in the data directory, so we use ``os`` to get the path to the file -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.final.ascii.medium") +# In this example, we use a small obs_seq file "obs_seq.final.ascii.medium". +data_file = get_example_data("obs_seq.final.ascii.medium") obs_seq = obsq.ObsSequence(data_file) diff --git a/examples/03_diagnostics/plot_evolution.py b/examples/03_diagnostics/plot_evolution.py index ccdad41b..df631122 100644 --- a/examples/03_diagnostics/plot_evolution.py +++ b/examples/03_diagnostics/plot_evolution.py @@ -11,16 +11,13 @@ # and the matplots module for plotting. import pydartdiags.obs_sequence.obs_sequence as obsq from pydartdiags.matplots import matplots as mp +from pydartdiags.data import get_example_data ########################################### # Chose an obs_seq file to read. # In this example, we are using "obs_seq.final.lorenz_96" which is from # a Lorenz 96 model run with the DART assimilation system. -# The obs_seq.final.lorenz_96 file comes with the pyDARTdiags package -# in the data directory, so we ``import os`` to get the path to the file. 
-import os -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.final.lorenz_96") +data_file = get_example_data("obs_seq.final.lorenz_96") ########################################### # Read the obs_seq file into an obs_seq object. diff --git a/examples/03_diagnostics/plot_grand_stats.py b/examples/03_diagnostics/plot_grand_stats.py index ee501e62..e9ca32c1 100644 --- a/examples/03_diagnostics/plot_grand_stats.py +++ b/examples/03_diagnostics/plot_grand_stats.py @@ -13,14 +13,12 @@ # and the statistics module. import pydartdiags.obs_sequence.obs_sequence as obsq from pydartdiags.stats import stats +from pydartdiags.data import get_example_data + ########################################### # Chose an obs_seq file to read. -# This is a small obs_seq file "obs_seq.final.ascii.medium" -# that comes with the pyDARTdiags package -# in the data directory, so we ``import os`` to get the path to the file -import os -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.final.ascii.medium") +# This is a small obs_seq file "obs_seq.final.ascii.medium". +data_file = get_example_data("obs_seq.final.ascii.medium") ########################################### diff --git a/examples/03_diagnostics/plot_profiles.py b/examples/03_diagnostics/plot_profiles.py index f0365280..eb4b2c0d 100644 --- a/examples/03_diagnostics/plot_profiles.py +++ b/examples/03_diagnostics/plot_profiles.py @@ -12,15 +12,12 @@ # and the matplots module for plotting. import pydartdiags.obs_sequence.obs_sequence as obsq from pydartdiags.matplots import matplots as mp +from pydartdiags.data import get_example_data ########################################### # Chose an obs_seq file to read. -# In this example, we are using a small obs_seq file "obs_seq.final.1000" -# that comes with the pyDARTdiags package -# in the data directory, so we ``import os`` to get the path to the file. 
-import os -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.final.1000") +# In this example, we are using a small obs_seq file "obs_seq.final.1000". +data_file = get_example_data("obs_seq.final.1000") ########################################### # Read the obs_seq file into an obs_seq object. diff --git a/examples/03_diagnostics/plot_qc_poss_vs_used.py b/examples/03_diagnostics/plot_qc_poss_vs_used.py index 804647d2..b66c6d0e 100644 --- a/examples/03_diagnostics/plot_qc_poss_vs_used.py +++ b/examples/03_diagnostics/plot_qc_poss_vs_used.py @@ -12,15 +12,12 @@ ########################################### # Import the obs_sequence module import pydartdiags.obs_sequence.obs_sequence as obsq +from pydartdiags.data import get_example_data ########################################### # Chose an obs_seq file to read. -# This is a small obs_seq file "obs_seq.final.ascii.small" -# that comes with the pyDARTdiags package -# in the data directory, so we ``import os`` to get the path to the file -import os -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.final.ascii.small") +# This is a small obs_seq file "obs_seq.final.ascii.small". +data_file = get_example_data("obs_seq.final.ascii.small") ########################################### # read the obs_seq file into an obs_seq object diff --git a/examples/03_diagnostics/plot_rank_histogram.py b/examples/03_diagnostics/plot_rank_histogram.py index 892e702e..c93a7b9e 100644 --- a/examples/03_diagnostics/plot_rank_histogram.py +++ b/examples/03_diagnostics/plot_rank_histogram.py @@ -12,15 +12,12 @@ # and the matplots module for plotting. import pydartdiags.obs_sequence.obs_sequence as obsq from pydartdiags.matplots import matplots as mp +from pydartdiags.data import get_example_data ########################################### # Chose an obs_seq file to read. 
-# In this example, we are using a small obs_seq file "obs_seq.final.1000" -# that comes with the pyDARTdiags package -# in the data directory, so we ``import os`` to get the path to the file. -import os -data_dir = os.path.join(os.getcwd(), "../..", "data") -data_file = os.path.join(data_dir, "obs_seq.final.1000") +# In this example, we are using a small obs_seq file "obs_seq.final.1000". +data_file = get_example_data("obs_seq.final.1000") ########################################### # Read the obs_seq file into an obs_seq object. diff --git a/pyproject.toml b/pyproject.toml index 014f79d3..8ae12454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "wheel"] +requires = ["setuptools>=64", "wheel"] build-backend = "setuptools.build_meta" [project] @@ -29,3 +29,6 @@ Homepage = "https://github.com/NCAR/pyDARTdiags.git" Issues = "https://github.com/NCAR/pyDARTdiags/issues" Documentation = "https://ncar.github.io/pyDARTdiags" +[tool.setuptools.packages.find] +where = ["src"] +include = ["pydartdiags*"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 1a9aeff6..00000000 --- a/setup.py +++ /dev/null @@ -1,31 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name="pydartdiags", - version="0.6.2", - packages=find_packages(where="src"), - package_dir={"": "src"}, - include_package_data=True, - package_data={ - "pydartdiags": ["obs_sequence/composite_types.yaml"], - }, - author="Helen Kershaw", - author_email="hkershaw@ucar.edu", - description="Observation Sequence Diagnostics for DART", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/NCAR/pyDARTdiags.git", - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - python_requires=">=3.8", - install_requires=[ - "pandas>=2.2.0", - "numpy>=1.26", - "plotly>=5.22.0", - 
"pyyaml>=6.0.2", - "matplotlib>=3.9.4" - ], -) diff --git a/src/pydartdiags/data.py b/src/pydartdiags/data.py new file mode 100644 index 00000000..436a7c9c --- /dev/null +++ b/src/pydartdiags/data.py @@ -0,0 +1,246 @@ +""" +Data file utilities for pyDARTdiags examples. + +This module provides functions to locate, download, +and cache example files used in pyDARTdiags examples. +The data files are cached in the users home directory +under ~/.pydartdiags/data. + +""" + +import os +from pathlib import Path +import urllib.request +import zipfile +import shutil + +# Zenodo DOI/URL for the data archive +ZENODO_RECORD_ID = "18135062" +ZENODO_RECORD_URL = f"https://zenodo.org/api/records/{ZENODO_RECORD_ID}/files-archive" +ZENODO_DOI = f"https://doi.org/10.5281/zenodo.{ZENODO_RECORD_ID}" + + +def get_data_cache_dir(): + """Get the cache directory for downloaded data files.""" + cache_dir = Path.home() / ".pydartdiags" / "data" + return cache_dir + + +def get_example_data(filename, auto_download=True): + """ + Get path to a data file, checking multiple locations. + + Searches for data files in the following order: + 1. Development location (../../data from package GitHub repo) + 2. Environment variable PYDARTDIAGS_DATA + 3. User cache directory (~/.pydartdiags/data) + 4. Downloads from Zenodo if auto_download=True + + Parameters + ---------- + filename : str + Name of the data file to locate + auto_download : bool, optional + If True, automatically download from Zenodo if not found locally. + Default is True. + + Returns + ------- + str + Absolute path to the data file + + Raises + ------ + FileNotFoundError + If the file is not found and auto_download=False + + Examples + -------- + + .. code-block:: python + + data_file = get_example_data("obs_seq.final.lorenz_96") + + """ + # 1. 
Check development location (for contributors/developers) + try: + package_dir = Path(__file__).parent.parent.parent + dev_data = package_dir / "data" / filename + print(f"package_dir: {package_dir}") + if dev_data.exists(): + print(f"Using development data file: {dev_data}") + return str(dev_data) + except: + pass + + # 2. Check environment variable + if "PYDARTDIAGS_DATA" in os.environ: + env_data = Path(os.environ["PYDARTDIAGS_DATA"]) / filename + if env_data.exists(): + print(f"Using data file from PYDARTDIAGS_DATA: {env_data}") + return str(env_data) + + # 3. Check cache directory + cache_dir = get_data_cache_dir() + cache_file = cache_dir / filename + if cache_file.exists(): + print(f"Using cached data file: {cache_file}") + return str(cache_file) + + # 4. File not found + if not auto_download: + raise FileNotFoundError( + f"Data file '{filename}' not found.\n\n" + f"To download example data:\n" + f" Option 1: Automatic download\n" + f" >>> from pydartdiags.data import download_all_data\n" + f" >>> download_all_data()\n\n" + f" Option 2: Manual download\n" + f" Download from: {ZENODO_DOI}\n" + f" Extract to: {cache_dir}\n\n" + f" Option 3: Set environment variable\n" + f" export PYDARTDIAGS_DATA=/path/to/your/data\n" + ) + + # Auto-download + print(f"Data file '{filename}' not found locally.") + print("Downloading all example data from Zenodo...") + download_all_data() + + # Check again after download + if cache_file.exists(): + return str(cache_file) + else: + raise FileNotFoundError( + f"Downloaded data but '{filename}' still not found. " + f"Please check {ZENODO_RECORD_URL}" + ) + + +def download_all_data(force=False): + """ + Download all example data files from Zenodo. + + Downloads and extracts the complete data archive to the user's + cache directory (~/.pydartdiags/data). + + Parameters + ---------- + force : bool, optional + If True, re-download even if data already exists. Default is False. + + Examples + -------- + + .. 
code-block:: python + + from pydartdiags.data import download_all_data + download_all_data() + + """ + cache_dir = get_data_cache_dir() + + if cache_dir.exists() and not force: + print(f"Data directory already exists: {cache_dir}") + print("Use force=True to re-download.") + return + + cache_dir.mkdir(parents=True, exist_ok=True) + + # Download archive + archive_file = cache_dir.parent / f"{ZENODO_RECORD_ID}.zip" + + print(f"Downloading data from Zenodo ({ZENODO_DOI})...") + print(f"This may take a few minutes (approx. 85 MB)...") + + try: + urllib.request.urlretrieve(ZENODO_RECORD_URL, archive_file) + print(f"Download complete: {archive_file}") + + # Extract archive + print(f"Extracting to {cache_dir}...") + with zipfile.ZipFile(archive_file, "r") as zip_ref: + zip_ref.extractall(path=cache_dir) + + # Clean up archive + archive_file.unlink() + + print(f"Data successfully installed to {cache_dir}") + print(f"Found {len(list(cache_dir.glob('*')))} data files") + + except Exception as e: + print(f"Error downloading data: {e}") + print(f"\nManual download instructions:") + print(f"1. Download from: {ZENODO_DOI}") + print(f"2. Extract to: {cache_dir}") + raise + + +def list_available_data(): + """ + List all available data files. + + Returns + ------- + list of str + List of available data file names + + Examples + -------- + + .. 
code-block:: python + + from pydartdiags.data import list_available_data + files = list_available_data() + print(files) + + """ + locations = [] + + # Check development location + try: + package_dir = Path(__file__).parent.parent.parent + dev_data = package_dir / "data" + if dev_data.exists(): + locations.append(dev_data) + except: + pass + + # Check environment variable + if "PYDARTDIAGS_DATA" in os.environ: + env_data = Path(os.environ["PYDARTDIAGS_DATA"]) + if env_data.exists(): + locations.append(env_data) + + # Check cache + cache_dir = get_data_cache_dir() + if cache_dir.exists(): + locations.append(cache_dir) + + # Collect all files + files = set() + for loc in locations: + files.update([f.name for f in loc.glob("*") if f.is_file()]) + + return sorted(list(files)) + + +def clear_cache(): + """ + Remove all downloaded data from the cache directory. + + Examples + -------- + + .. code-block:: python + + from pydartdiags.data import clear_cache + clear_cache() + + """ + cache_dir = get_data_cache_dir() + if cache_dir.exists(): + shutil.rmtree(cache_dir) + print(f"Cleared cache: {cache_dir}") + else: + print("Cache directory does not exist.")