|
| 1 | +import requests, datetime, io, gzip |
| 2 | +from dweather_client.ipfs_errors import * |
| 3 | +from dweather_client.utils import listify_period |
| 4 | +import dweather_client.ipfs_datasets |
| 5 | +import ipfshttpclient |
| 6 | +import json |
| 7 | + |
| 8 | +MM_TO_INCHES = 0.0393701 |
| 9 | +RAINFALL_PRECISION = 5 |
| 10 | +GATEWAY_URL = 'https://gateway.arbolmarket.com' |
| 11 | + |
| 12 | + |
def get_heads(url=GATEWAY_URL, timeout=30):
    """
    Get heads.json for a given IPFS gateway.

    Args:
        url (str): base url of the IPFS gateway
        timeout (float): seconds to wait for the gateway before raising
            requests.exceptions.Timeout. Without a timeout, a silent
            gateway would hang this call forever.
    Returns (example heads.json):
        {
            'chirps_05-daily': 'Qm...',
            'chirps_05-monthly': 'Qm...',
            'chirps_25-daily': 'Qm...',
            'chirps_25-monthly': 'Qm...',
            'cpc_us-daily': 'Qm...',
            'cpc_us-monthly': 'Qm...'
        }
    Raises:
        requests.exceptions.HTTPError: if the gateway returns an error status
    """
    hashes_url = url + "/climate/hashes/heads.json"
    r = requests.get(hashes_url, timeout=timeout)
    r.raise_for_status()
    return r.json()
| 32 | + |
| 33 | + |
def cat_metadata(hash_str, client=None):
    """
    Get the metadata file for a given hash.

    Args:
        hash_str (str): the hash of the ipfs dataset
        client: an open ipfshttpclient connection to reuse; if None, a
            temporary connection is created and closed for this call
    Returns (example metadata.json):
        {
            'date range': [
                '1981/01/01',
                '2019/07/31'
            ],
            'entry delimiter': ',',
            'latitude range': [
                -49.975, 49.975
            ],
            'longitude range': [
                -179.975, 179.975
            ],
            'name': 'CHIRPS .05 Daily Full Set Uncompressed',
            'period': 'daily',
            'precision': 0.01,
            'resolution': 0.05,
            'unit of measurement': 'mm',
            'year delimiter': '\n'
        }
    """
    path = hash_str + "/metadata.json"
    # Deduplicated: both branches fetch and parse the same file; the only
    # difference is whether we own the client connection's lifetime.
    if client is not None:
        return json.loads(client.cat(path))
    with ipfshttpclient.connect() as owned_client:
        return json.loads(owned_client.cat(path))
| 69 | + |
| 70 | + |
def cat_hash_cell(hash_str, coord_str, client=None):
    """
    Read a grid cell text file from IPFS.

    Args:
        hash_str (str): the hash of the ipfs dataset
        coord_str (str): the text file coordinate name e.g. 45.000_-96.000
        client: an open ipfshttpclient connection to reuse; if None, a
            temporary connection is created and closed for this call
    Returns:
        the raw bytes of the cell file
    """
    path = hash_str + '/' + coord_str
    # Deduplicated: identical cat() in both branches, only connection
    # ownership differs.
    if client is not None:
        return client.cat(path)
    with ipfshttpclient.connect() as owned_client:
        return owned_client.cat(path)
| 77 | + |
def cat_zipped_hash_cell(url, hash_str, coord_str, client=None):
    """
    Read a text file on the ipfs server compressed with gzip.

    Args:
        url (str): the url of the ipfs server. NOTE(review): this argument
            is unused (the fetch goes through ipfshttpclient); it is kept
            to preserve the existing call signature.
        hash_str (str): the hash of the dataset
        coord_str (str): the text file coordinate name e.g. 45.000_-96.000
        client: an open ipfshttpclient connection to reuse; if None, a
            temporary connection is created and closed for this call
    Returns:
        the contents of the file as a string
    """
    path = hash_str + '/' + coord_str + ".gz"
    # Deduplicated fetch: only the connection's lifetime differs by branch.
    if client is not None:
        compressed = client.cat(path)
    else:
        with ipfshttpclient.connect() as owned_client:
            compressed = owned_client.cat(path)
    with gzip.GzipFile(fileobj=io.BytesIO(compressed)) as zip_data:
        return zip_data.read().decode("utf-8")
| 97 | + |
| 98 | + |
def cat_dataset_cell(lat, lon, dataset_revision, client=None):
    """
    Retrieve the text of a grid cell data file for a given lat lon and dataset.

    Args:
        lat (float): the latitude of the grid cell, to 3 decimals
        lon (float): the longitude of the grid cell, to 3 decimals
        dataset_revision (str): the dataset revision name as on heads.json
        client: an open ipfshttpclient connection to reuse; if None, each
            fetch opens and closes its own connection
    Returns:
        A tuple (json, str) of the dataset metadata file and the grid cell data text
    Raises:
        DatasetError: If no matching dataset found on server
        InputOutOfRangeError: If the lat/lon is outside the dataset range in metadata
        CoordinateNotFoundError: If the lat/lon coordinate is not found on server
    """
    all_hashes = get_heads()
    if dataset_revision in all_hashes:
        dataset_hash = all_hashes[dataset_revision]
    else:
        raise DatasetError('{} not found on server'.format(dataset_revision))

    metadata = cat_metadata(dataset_hash, client)
    min_lat, max_lat = sorted(metadata["latitude range"])
    min_lon, max_lon = sorted(metadata["longitude range"])
    if lat < min_lat or lat > max_lat:
        raise InputOutOfRangeError("Latitude {} out of dataset revision range [{:.3f}, {:.3f}] for {}".format(lat, min_lat, max_lat, dataset_revision))
    if lon < min_lon or lon > max_lon:
        raise InputOutOfRangeError("Longitude {} out of dataset revision range [{:.3f}, {:.3f}] for {}".format(lon, min_lon, max_lon, dataset_revision))
    coord_str = "{:.3f}_{:.3f}".format(lat, lon)
    try:
        if "compression" in metadata and metadata["compression"] == "gzip":
            text_data = cat_zipped_hash_cell(GATEWAY_URL, dataset_hash, coord_str, client=client)
        else:
            text_data = cat_hash_cell(dataset_hash, coord_str, client=client)
        return metadata, text_data
    # Bug fix: the cell fetch goes through ipfshttpclient, which raises
    # ipfshttpclient.exceptions.ErrorResponse for a missing path — not
    # requests.exceptions.HTTPError — so the documented
    # CoordinateNotFoundError was never raised. Catch both.
    except (requests.exceptions.HTTPError, ipfshttpclient.exceptions.ErrorResponse) as e:
        raise CoordinateNotFoundError('Coordinate ({}, {}) not found on ipfs in dataset revision {}'.format(lat, lon, dataset_revision)) from e
| 134 | + |
| 135 | + |
| 136 | + |
def cat_rainfall_dict(lat, lon, dataset_revision, return_metadata=False, client=None):
    """
    Build a dict of rainfall data for a given grid cell.

    Args:
        lat (float): the latitude of the grid cell, to 3 decimals
        lon (float): the longitude of the grid cell, to 3 decimals
        dataset_revision (str): the dataset revision name as on heads.json
        return_metadata (bool): if True, also return the dataset metadata
        client: an open ipfshttpclient connection to reuse, or None
    Returns:
        a dict ({datetime.date: float}) of datetime dates and the corresponding rainfall in mm for that date
        (prefixed by the metadata dict when return_metadata is True);
        missing values are stored as None
    Raises:
        DatasetError: If no matching dataset found on server
        InputOutOfRangeError: If the lat/lon is outside the dataset range in metadata
        CoordinateNotFoundError: If the lat/lon coordinate is not found on server
        DataMalformedError: If the grid cell file can't be parsed as rainfall data
    """
    metadata, rainfall_text = cat_dataset_cell(lat, lon, dataset_revision, client=client)
    dataset_start_date = datetime.datetime.strptime(metadata['date range'][0], "%Y/%m/%d").date()
    dataset_end_date = datetime.datetime.strptime(metadata['date range'][1], "%Y/%m/%d").date()
    timedelta = dataset_end_date - dataset_start_date
    days_in_record = timedelta.days + 1  # we have both the start and end date in the dataset so its the difference + 1
    # The cell may arrive as bytes (uncompressed fetch) or str (gzip path).
    # An explicit type check replaces the old bare `except: pass`, which
    # silently swallowed genuine decode errors.
    if isinstance(rainfall_text, bytes):
        rainfall_text = rainfall_text.decode()
    day_strs = rainfall_text.replace(',', ' ').split()
    if len(day_strs) != days_in_record:
        raise DataMalformedError("Number of days in data file does not match the provided metadata")
    rainfall_dict = {}
    for i in range(days_in_record):
        date_iter = dataset_start_date + datetime.timedelta(days=i)
        if day_strs[i] == metadata["missing value"]:
            rainfall_dict[date_iter] = None
        else:
            rainfall_dict[date_iter] = float(day_strs[i])
    if return_metadata:
        return metadata, rainfall_dict
    else:
        return rainfall_dict
| 173 | + |
| 174 | + |
def cat_rev_rainfall_dict(lat, lon, dataset, desired_end_date, latest_rev):
    """
    Build a dictionary of rainfall data, preferring the most accurate final data.

    Starts from the most accurate revision of the dataset, then layers in
    data from more recent / less accurate revisions until the desired end
    date is covered or the revisions run out. Does not raise if no revision
    has data; simply returns whatever was found.

    Args:
        lat (float): the grid cell latitude
        lon (float): the grid cell longitude
        dataset (str): the name of the dataset, e.g., "chirps_05-daily" on hashes.json
        desired_end_date (datetime.date): the last day of data needed.
        latest_rev (str): the least accurate revision of the dataset that is considered final
    Returns:
        tuple:
            a dict ({datetime.date: float}) of datetime dates and the corresponding rainfall in mm for that date
            bool is_final: if all data up to desired end date is final, this will be true
    """
    all_rainfall = {}
    is_final = True
    with ipfshttpclient.connect() as client:
        # Walk revisions from most to least accurate, keeping any value we
        # already have and filling gaps from the newer revision.
        for revision in dweather_client.ipfs_datasets.datasets[dataset]:
            revision_rainfall = cat_rainfall_dict(lat, lon, revision, client=client)
            for day, rainfall in revision_rainfall.items():
                all_rainfall.setdefault(day, rainfall)
            # Done as soon as the requested end date is covered.
            if desired_end_date in all_rainfall:
                return all_rainfall, is_final
            # Anything pulled after the latest final revision is preliminary.
            if revision == latest_rev:
                is_final = False

    # Ran out of revisions before reaching the end date; return what we have.
    return all_rainfall, is_final
| 209 | + |
| 210 | + |
def cat_temperature_dict(lat, lon, dataset_revision, return_metadata=False, client=None):
    """
    Build a dict of temperature data for a given grid cell.

    Args:
        lat (float): the latitude of the grid cell, to 3 decimals
        lon (float): the longitude of the grid cell, to 3 decimals
        dataset_revision (str): the dataset revision name as on heads.json
        return_metadata (bool): if True, also return the dataset metadata
        client: an open ipfshttpclient connection to reuse, or None
    Returns:
        tuple (highs, lows) of dicts
            highs: dict ({datetime.date: float}) of datetime dates and the corresponding high temperature in degress F
            lows: dict ({datetime.date: float}) of datetime dates and the corresponding low temperature in degress F
        (prefixed by the metadata dict when return_metadata is True)
    Raises:
        DatasetError: If no matching dataset_revision found on server
        InputOutOfRangeError: If the lat/lon is outside the dataset_revision range in metadata
        CoordinateNotFoundError: If the lat/lon coordinate is not found on server
        DataMalformedError: If the grid cell file can't be parsed as temperature data
    """
    metadata, temp_text = cat_dataset_cell(lat, lon, dataset_revision, client=client)
    dataset_start_date = datetime.datetime.strptime(metadata['date range'][0], "%Y/%m/%d").date()
    dataset_end_date = datetime.datetime.strptime(metadata['date range'][1], "%Y/%m/%d").date()
    timedelta = dataset_end_date - dataset_start_date
    days_in_record = timedelta.days + 1  # we have both the start and end date in the dataset_revision so its the difference + 1
    # The cell may arrive as bytes (uncompressed fetch) or str (gzip path).
    # An explicit type check replaces the old bare `except: pass`, which
    # silently swallowed genuine decode errors.
    if isinstance(temp_text, bytes):
        temp_text = temp_text.decode()
    day_strs = temp_text.replace(',', ' ').split()
    if len(day_strs) != days_in_record:
        raise DataMalformedError("Number of days in data file does not match the provided metadata")
    highs = {}
    lows = {}
    for i in range(days_in_record):
        # Each day's entry is "low/high".
        low, high = map(float, day_strs[i].split('/'))
        date_iter = dataset_start_date + datetime.timedelta(days=i)
        highs[date_iter] = high
        lows[date_iter] = low
    if return_metadata:
        return metadata, highs, lows
    else:
        return highs, lows
| 250 | + |
| 251 | + |
def cat_rev_temperature_dict(lat, lon, dataset, desired_end_date, latest_rev):
    """
    Build dictionaries of temperature data, preferring final data.

    Includes as much final data as possible; if the desired end date is not
    in the final dataset, appends as much prelim data as needed.

    Args:
        lat (float): the latitude of the grid cell, to 3 decimals
        lon (float): the longitude of the grid cell, to 3 decimals
        dataset (str): the dataset name as on hashes.json
        desired_end_date (datetime.date): don't include prelim data after this point if not needed
        latest_rev (str): The least accurate revision that is still considered 'final'
    returns:
        tuple (highs, lows) of dicts and a bool
            highs: dict ({datetime.date: float}) of datetime dates and the corresponding high temperature in degress F
            lows: dict ({datetime.date: float}) of datetime dates and the corresponding low temperature in degress F
            is_final: True if all data is from final dataset, false if prelim included
    """
    highs = {}
    lows = {}
    is_final = True

    with ipfshttpclient.connect() as client:
        # Walk revisions from most to least accurate, keeping values we
        # already have and filling gaps from the newer revision.
        for revision in dweather_client.ipfs_datasets.datasets[dataset]:
            revision_highs, revision_lows = cat_temperature_dict(lat, lon, revision, client=client)
            for day, high in revision_highs.items():
                highs.setdefault(day, high)
            for day, low in revision_lows.items():
                lows.setdefault(day, low)
            # Done as soon as the requested end date is covered.
            if desired_end_date in highs:
                return highs, lows, is_final

            # Anything pulled after the latest final revision is preliminary.
            if revision == latest_rev:
                is_final = False

    # Ran out of revisions before reaching the end date; return what we have.
    return highs, lows, is_final
| 289 | + |
| 290 | + |
def cat_station_csv(station_id, client=None):
    """
    Retrieve the contents of a station data csv file.

    Args:
        station_id (str): the id of the weather station
        client: an open ipfshttpclient connection to reuse; if None, a
            temporary connection is created and closed for this call
    returns:
        the contents of the station csv file as a string
    """
    all_hashes = get_heads()
    dataset_hash = all_hashes["ghcnd"]
    csv_hash = dataset_hash + '/' + station_id + ".csv.gz"
    # Bug fix: the original decompressed an undefined name `cell` (a
    # copy-paste from cat_zipped_hash_cell), which raised NameError; we
    # must decompress the bytes we actually fetched.
    if client is not None:
        gz_bytes = client.cat(csv_hash)
    else:
        with ipfshttpclient.connect() as owned_client:
            gz_bytes = owned_client.cat(csv_hash)
    with gzip.GzipFile(fileobj=io.BytesIO(gz_bytes)) as zip_data:
        return zip_data.read().decode("utf-8")
| 311 | + |
0 commit comments