From 1c95743e953630b7702bb3d42d39215222c7c701 Mon Sep 17 00:00:00 2001 From: Ford Date: Wed, 30 Oct 2024 21:56:19 -0600 Subject: [PATCH] NSRDB parallel API calls :) --- pvdeg/store.py | 2 - pvdeg/weather.py | 82 +- scripts/load_nsrdb_distributed.ipynb | 1485 +++++++++++++++++++++++++- 3 files changed, 1455 insertions(+), 114 deletions(-) diff --git a/pvdeg/store.py b/pvdeg/store.py index 7875aa2..cc7644f 100644 --- a/pvdeg/store.py +++ b/pvdeg/store.py @@ -6,8 +6,6 @@ import zarr import os -from pvdeg.weather import pvgis_hourly_empty_weather_ds - from pvdeg import METOROLOGICAL_DOWNLOAD_PATH def get(group): diff --git a/pvdeg/weather.py b/pvdeg/weather.py index f3051e2..a567869 100644 --- a/pvdeg/weather.py +++ b/pvdeg/weather.py @@ -1034,7 +1034,13 @@ def _weather_distributed_vec( return weather_ds, meta_dict, None -def emtpy_weather_ds(gids_size, periodicity, database): +# THE NSRDB shapes could be moved to their own definition +# organization style question? +def emtpy_weather_ds( + gids_size, + periodicity, + database + )->xr.Dataset: """ Create an empty weather dataframe for generalized input. 
@@ -1060,8 +1066,7 @@ def emtpy_weather_ds(gids_size, periodicity, database): import dask.array as da - # pvgis default shapes - shapes = { + pvgis_shapes = { "temp_air": ("gid", "time"), "relative_humidity": ("gid", "time"), "ghi": ("gid", "time"), @@ -1073,15 +1078,22 @@ def emtpy_weather_ds(gids_size, periodicity, database): "pressure": ("gid", "time"), } - # additional results from NSRDB - nsrdb_extra_shapes = { + nsrdb_shapes = { 'Year': ("gid", "time"), 'Month': ("gid", "time"), 'Day': ("gid", "time"), 'Hour': ("gid", "time"), 'Minute': ("gid", "time"), + 'temp_air':("gid", "time"), 'dew_point': ("gid", "time"), - 'albedo': ("gid", "time") + 'dhi': ("gid", "time"), + 'dni': ("gid", "time"), + 'ghi': ("gid", "time"), + 'albedo': ("gid", "time"), + 'pressure': ("gid", "time"), + 'wind_direction': ("gid", "time"), + 'wind_speed' : ("gid", "time"), + 'relative_humidity': ("gid", "time"), } attrs = {} @@ -1091,63 +1103,20 @@ def emtpy_weather_ds(gids_size, periodicity, database): dims_size = {'time': TIME_PERIODICITY_MAP[periodicity], 'gid': gids_size} if database == "NSRDB" or database == "PSM3": - shapes = shapes | nsrdb_extra_shapes - - weather_ds = xr.Dataset( - data_vars={ - var: (dim, da.empty([dims_size[d] for d in dim]), attrs.get(var)) - for var, dim in shapes.items() - }, - coords={'time': pd.date_range("2022-01-01", freq=periodicity, periods=TIME_PERIODICITY_MAP[periodicity]), - 'gid': np.linspace(0, gids_size-1, gids_size, dtype=int)}, - attrs=global_attrs, - ) - - return weather_ds - - - -def pvgis_hourly_empty_weather_ds(gids_size): - """ - Create an empty weather dataset for pvgis hourly TMY data - - Parameters - ---------- - gids_size: int - number of gids, equivalent to number of unique locations - - Returns - ------- - weather_ds: xarray.Dataset - Weather dataset of the same format/shapes given by a `pvdeg.weather.get` geospatial call or `pvdeg.weather.weather_distributed` call or `GeosptialScenario.get_geospatial_data`. 
- """ - import dask.array as da - - - - shapes = { - "temp_air": ("gid", "time"), - "relative_humidity": ("gid", "time"), - "ghi": ("gid", "time"), - "dni": ("gid", "time"), - "dhi": ("gid", "time"), - "IR(h)": ("gid", "time"), - "wind_speed": ("gid", "time"), - "wind_direction": ("gid", "time"), - "pressure": ("gid", "time"), - } - attrs = {} - global_attrs = {} + # shapes = shapes | nsrdb_extra_shapes + shapes = nsrdb_shapes + elif database == "PVGIS": + shapes = pvgis_shapes + else: + raise ValueError(f"database must be PVGIS, NSRDB, PSM3 not {database}") - dims = {'gid', 'time'} - dims_size = {'time': 8760, 'gid': gids_size} weather_ds = xr.Dataset( data_vars={ var: (dim, da.empty([dims_size[d] for d in dim]), attrs.get(var)) for var, dim in shapes.items() }, - coords={'time': pd.date_range("2022-01-01", freq="h", periods=365 * 24), + coords={'time': pd.date_range("2022-01-01", freq=periodicity, periods=TIME_PERIODICITY_MAP[periodicity]), 'gid': np.linspace(0, gids_size-1, gids_size, dtype=int)}, attrs=global_attrs, ) @@ -1162,6 +1131,7 @@ def pvgis_hourly_empty_weather_ds(gids_size): # TODO: implement rate throttling so we do not make too many requests. # TODO: multiple API keys to get around NSRDB key rate limit. 
2 key, email pairs means twice the speed ;) +# TODO: this overwrites NSRDB GIDS when database == "PSM3" def weather_distributed( database: str, coords: list[tuple], diff --git a/scripts/load_nsrdb_distributed.ipynb b/scripts/load_nsrdb_distributed.ipynb index b959886..2a3639b 100644 --- a/scripts/load_nsrdb_distributed.ipynb +++ b/scripts/load_nsrdb_distributed.ipynb @@ -6,26 +6,71 @@ "metadata": {}, "outputs": [], "source": [ + "from dask.distributed import LocalCluster, Client\n", + "from dotenv import load_dotenv\n", "import pvdeg\n", - "import os\n", - "from dotenv import load_dotenv" + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting Up\n", + "\n", + "As in [load_pvgis_distributed.ipynb](./load_pvgis_distributed.ipynb) we need to get ready to make our parallelized API calls. The notebook linked here goes through the process in more detail but we need to import our API key and email. This cell will not work for you unless you replace the `api_key` and `email` with your personal NSRDB api keys. [REQUEST A KEY](https://developer.nrel.gov/signup/).\n", + "\n", + "We also need to initialize a dask client. `pvdeg.weather.weather_distributed` will not work without it. It will fail silently and not populate any of the results in the resulting `weather_ds` called `geo_weather` in the example below. It is hard to recognize that this has occurred so be careful. Make sure to initialize a dask client first. Visiting the link takes you to a daskboard that shows what dask is doing." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://127.0.0.1:8787/status\n" + ] + } + ], "source": [ "load_dotenv()\n", "\n", - "api_key = os.getenv(\"API_KEY\")\n", - "email = os.getenv(\"EMAIL\")" + "### REPLACE WITH YOUR API KEY AND EMAIL ###\n", + "api_key = os.getenv(\"api_key\")\n", + "email = os.getenv(\"email\")\n", + "###########################################\n", + "\n", + "workers = 4\n", + "\n", + "cluster = LocalCluster(\n", + " n_workers=workers,\n", + " processes=True, \n", + " )\n", + "\n", + "client = Client(cluster)\n", + "\n", + "print(\"Daskboard link\")\n", + "print(client.dashboard_link)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Requesting Weather\n", + "\n", + "As with the other script [load_pvgis_distributed.ipynb](./load_pvgis_distributed.ipynb), we will create a list of tuple (latitude, longitude) pairs and call the function on all of them at once. `failed` will represent a list of failed gids, unique location IDs that correspond to points in space on the NSRDB. These are different from those on PVGIS, where they are arbitrary indexes that do NOT correspond to a spatial location on earth.\n", + "\n", + "We will request \"PSM3\" data from the Physical Solar Model that represents a typical meteorological year (TMY) from the NSRDB. We will have to supply the API key and email from above here. Refer to the linked script to see this in further detail. The only difference between the scripts lies in the NSRDB/PSM3 data requiring API keys." 
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -37,9 +82,20 @@ "geo_weather, geo_meta, failed = pvdeg.weather.weather_distributed(database=\"PSM3\", coords=coords, api_key=api_key, email=email)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Viewing Results\n", + "\n", + "Same as in the other tutorial, our results are stored in an xarray dataset with a dask backend so you will have to use `.compute()` on the dataset to inspect the individual values of the dask arrays. \n", + "\n", + "Click on the `Data variables` dropdown to expand the dataset viewer." + ] + }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -409,33 +465,853 @@ " stroke: currentColor;\n", " fill: currentColor;\n", "}\n", - "
<xarray.Dataset> Size: 701kB\n",
-       "Dimensions:            (time: 8760)\n",
+       "
<xarray.Dataset> Size: 2MB\n",
+       "Dimensions:            (gid: 2, time: 8760)\n",
        "Coordinates:\n",
        "  * time               (time) datetime64[ns] 70kB 2022-01-01 ... 2022-12-31T2...\n",
-       "    gid                int32 4B 0\n",
-       "Data variables:\n",
-       "    temp_air           (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    relative_humidity  (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    ghi                (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    dni                (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    dhi                (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    IR(h)              (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    wind_speed         (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    wind_direction     (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n",
-       "    pressure           (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314
  • gid
    PandasIndex
    PandasIndex(Index([0, 1], dtype='int32', name='gid'))
  • " ], "text/plain": [ - " Size: 701kB\n", - "Dimensions: (time: 8760)\n", + " Size: 2MB\n", + "Dimensions: (gid: 2, time: 8760)\n", "Coordinates:\n", " * time (time) datetime64[ns] 70kB 2022-01-01 ... 2022-12-31T2...\n", - " gid int32 4B 0\n", - "Data variables:\n", - " temp_air (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " relative_humidity (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " ghi (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " dni (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " dhi (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " IR(h) (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " wind_speed (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " wind_direction (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314\n", - " pressure (time) float64 70kB 6.366e-314 6.366e-314 ... 6.366e-314" + " * gid (gid) int32 8B 0 1\n", + "Data variables: (12/15)\n", + " Year (gid, time) float64 140kB dask.array\n", + " Month (gid, time) float64 140kB dask.array\n", + " Day (gid, time) float64 140kB dask.array\n", + " Hour (gid, time) float64 140kB dask.array\n", + " Minute (gid, time) float64 140kB dask.array\n", + " temp_air (gid, time) float64 140kB dask.array\n", + " ... ...\n", + " ghi (gid, time) float64 140kB dask.array\n", + " albedo (gid, time) float64 140kB dask.array\n", + " pressure (gid, time) float64 140kB dask.array\n", + " wind_direction (gid, time) float64 140kB dask.array\n", + " wind_speed (gid, time) float64 140kB dask.array\n", + " relative_humidity (gid, time) float64 140kB dask.array" ] }, - "execution_count": 13, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "geo_weather.isel(gid=0).compute()" + "geo_weather" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    <xarray.Dataset> Size: 2MB\n",
    +       "Dimensions:            (gid: 2, time: 8760)\n",
    +       "Coordinates:\n",
    +       "  * time               (time) datetime64[ns] 70kB 2022-01-01 ... 2022-12-31T2...\n",
    +       "  * gid                (gid) int32 8B 0 1\n",
    +       "Data variables: (12/15)\n",
    +       "    Year               (gid, time) float64 140kB 2.005e+03 ... 2.002e+03\n",
    +       "    Month              (gid, time) float64 140kB 1.0 1.0 1.0 ... 12.0 12.0 12.0\n",
    +       "    Day                (gid, time) float64 140kB 1.0 1.0 1.0 ... 31.0 31.0 31.0\n",
    +       "    Hour               (gid, time) float64 140kB 0.0 1.0 2.0 ... 21.0 22.0 23.0\n",
    +       "    Minute             (gid, time) float64 140kB 30.0 30.0 30.0 ... 30.0 30.0\n",
    +       "    temp_air           (gid, time) float64 140kB 22.0 21.0 21.0 ... 22.0 22.0\n",
    +       "    ...                 ...\n",
    +       "    ghi                (gid, time) float64 140kB 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0\n",
    +       "    albedo             (gid, time) float64 140kB 0.1 0.1 0.1 ... 0.01 0.01 0.01\n",
    +       "    pressure           (gid, time) float64 140kB 1.02e+03 1.02e+03 ... 1.01e+03\n",
    +       "    wind_direction     (gid, time) float64 140kB 94.0 93.0 92.0 ... 103.0 108.0\n",
    +       "    wind_speed         (gid, time) float64 140kB 6.4 6.1 5.9 5.6 ... 7.9 7.9 7.8\n",
    +       "    relative_humidity  (gid, time) float64 140kB 73.29 77.92 ... 73.29 73.29
    " + ], + "text/plain": [ + " Size: 2MB\n", + "Dimensions: (gid: 2, time: 8760)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 70kB 2022-01-01 ... 2022-12-31T2...\n", + " * gid (gid) int32 8B 0 1\n", + "Data variables: (12/15)\n", + " Year (gid, time) float64 140kB 2.005e+03 ... 2.002e+03\n", + " Month (gid, time) float64 140kB 1.0 1.0 1.0 ... 12.0 12.0 12.0\n", + " Day (gid, time) float64 140kB 1.0 1.0 1.0 ... 31.0 31.0 31.0\n", + " Hour (gid, time) float64 140kB 0.0 1.0 2.0 ... 21.0 22.0 23.0\n", + " Minute (gid, time) float64 140kB 30.0 30.0 30.0 ... 30.0 30.0\n", + " temp_air (gid, time) float64 140kB 22.0 21.0 21.0 ... 22.0 22.0\n", + " ... ...\n", + " ghi (gid, time) float64 140kB 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0\n", + " albedo (gid, time) float64 140kB 0.1 0.1 0.1 ... 0.01 0.01 0.01\n", + " pressure (gid, time) float64 140kB 1.02e+03 1.02e+03 ... 1.01e+03\n", + " wind_direction (gid, time) float64 140kB 94.0 93.0 92.0 ... 103.0 108.0\n", + " wind_speed (gid, time) float64 140kB 6.4 6.1 5.9 5.6 ... 7.9 7.9 7.8\n", + " relative_humidity (gid, time) float64 140kB 73.29 77.92 ... 73.29 73.29" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geo_weather.compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Spot Check\n", + "\n", + "We can plot the entire TMY air_temperature to check that our data has loaded correctly. \n", + "\n", + "Explanation of steps\n", + "\n", + "geo_weather is our weather xarray dataset. We can index into the first entry at the 0th index by using isel (index-select). This will grab the data from the first gid. Then we pick the air temperature attribute. This can be replaced with bracket notation so `.temp_air` becomes `[\"temp_air\"]. \n", + "\n", + "This selects a single array from the dataset that is labeled as \"temp_air\". 
This array will be a dask array so the values will be stored out of memory, we would have to load it using `.compute()` to directly inspect it but when plotting with matplotlib it will load the array for us." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.plot(geo_weather.isel(gid=0).temp_air)" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "weather_arg = {\n", - " 'api_key': api_key,\n", - " 'email': email,\n", - " 'names': 'tmy',\n", - " 'attributes': [],\n", - " 'map_variables': True\n", - "}\n", + "### Next Steps\n", "\n", - "weather, meta = pvdeg.weather.get(database=\"PSM3\", id=(25.783388, -80.189029), **weather_arg)" + "Now we have data ready to use for geospatial calculations. This is shown in the other distributed script [load_pvgis_distributed.ipynb](./load_pvgis_distributed.ipynb). You can also see how to do this in [Geospatial Templates.ipynb](../tutorials_and_tools/tutorials_and_tools/Geospatial%20Templates.ipynb)" ] } ],