From fff906ff00eec44ea001d08567151e9436a2d89b Mon Sep 17 00:00:00 2001 From: nitrag Date: Wed, 27 May 2020 13:29:13 -0400 Subject: [PATCH 1/7] Faster tile downloads --- planetutils/elevation_tile_downloader.py | 73 +++++++++++++++++------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/planetutils/elevation_tile_downloader.py b/planetutils/elevation_tile_downloader.py index c03613e..c1d3359 100644 --- a/planetutils/elevation_tile_downloader.py +++ b/planetutils/elevation_tile_downloader.py @@ -1,12 +1,17 @@ #!/usr/bin/env python from __future__ import absolute_import, unicode_literals + +import concurrent import os -import subprocess import math +from retry import retry +from concurrent import futures +from multiprocessing.pool import ThreadPool +from urllib.request import urlopen -from . import download -from . import log -from .bbox import validate_bbox +from planetutils import download +from planetutils import log +from planetutils.bbox import validate_bbox def makedirs(path): try: @@ -15,6 +20,8 @@ def makedirs(path): pass class ElevationDownloader(object): + zoom = 0 + def __init__(self, outpath='.'): self.outpath = outpath @@ -24,36 +31,46 @@ def download_planet(self): def download_bboxes(self, bboxes): for name, bbox in bboxes.items(): self.download_bbox(bbox) - + def download_bbox(self, bbox, bucket='elevation-tiles-prod', prefix='geotiff'): tiles = self.get_bbox_tiles(bbox) found = set() download = set() - for z,x,y in tiles: + for z, x, y in tiles: od = self.tile_path(z, x, y) op = os.path.join(self.outpath, *od) if self.tile_exists(op): - found.add((x,y)) + found.add((x, y)) else: - download.add((x,y)) - log.info("found %s tiles; %s to download"%(len(found), len(download))) - for x,y in sorted(download): - self.download_tile(bucket, prefix, z, x, y) + download.add((x, y)) + log.info("found %s tiles; %s to download" % (len(found), len(download))) + tasks = {self._tile_url_path(bucket, prefix, self.zoom, x, y) for x, y in sorted(download)} + + with futures.ThreadPoolExecutor() as executor: + # Start the load operations and mark each future with its URL + future_to_url = { + executor.submit(self._download_multi, url_op): url_op for url_op in tasks + } + for future in futures.as_completed(future_to_url): + try: + future.result() + except Exception as exc: + log.error('generated an exception: %s' % exc) + pass def tile_exists(self, op): if os.path.exists(op): return True - def download_tile(self, bucket, prefix, z, x, y, suffix=''): + def _tile_url_path(self, bucket, prefix, z, x, y, suffix=''): od = self.tile_path(z, x, y) op = os.path.join(self.outpath, *od) makedirs(os.path.join(self.outpath, *od[:-1])) if prefix: od = [prefix]+od - url = 'http://s3.amazonaws.com/%s/%s%s'%(bucket, '/'.join(od), suffix) - log.info("downloading %s to %s"%(url, op)) - self._download(url, op) - + url = 'http://s3.amazonaws.com/%s/%s%s' % (bucket, '/'.join(od), suffix) + return url, op + def tile_path(self, z, x, y): raise NotImplementedError @@ -63,6 +80,19 @@ def get_bbox_tiles(self, bbox): def _download(self, url, op): download.download(url, op) + @staticmethod + @retry(exceptions=Exception, tries=5, delay=2, backoff=2, logger=log) + def _download_multi(url_op): + url, op = url_op + log.info("downloading %s to %s" % (url, op)) + request = urlopen(url, timeout=10) + with open(op, 'wb') as f: + try: + f.write(request.read()) + except Exception as exc: + raise Exception("Error downloading %r - %s", (url, exc)) + + class ElevationGeotiffDownloader(ElevationDownloader): def __init__(self, *args, **kwargs): self.zoom = kwargs.pop('zoom', 0) @@ -80,7 +110,7 @@ def get_bbox_tiles(self, bbox): size = 2**self.zoom xt = lambda x:int((x + 180.0) / 360.0 * size) yt = lambda y:int((1.0 - math.log(math.tan(math.radians(y)) + (1 / math.cos(math.radians(y)))) / math.pi) / 2.0 * size) - tiles = [] + tiles = [] for x in range(xt(left), xt(right)+1): for y in range(yt(top), yt(bottom)+1): tiles.append([self.zoom, x, y]) @@ -89,9 +119,10 @@ def get_bbox_tiles(self, bbox): def tile_path(self, z, x, y): return list(map(str, [z, x, str(y)+'.tif'])) + class ElevationSkadiDownloader(ElevationDownloader): HGT_SIZE = (3601 * 3601 * 2) - + def get_bbox_tiles(self, bbox): left, bottom, right, top = validate_bbox(bbox) min_x = int(math.floor(left)) @@ -104,13 +135,13 @@ def get_bbox_tiles(self, bbox): for y in range(min_y, max_y): tiles.add((0, x, y)) return tiles - + def tile_exists(self, op): - if os.path.exists(op) and os.stat(op).st_size == self.HGT_SIZE: + if os.path.exists(op) and os.stat(op).st_size == self.HGT_SIZE: return True def download_tile(self, bucket, prefix, z, x, y, suffix=''): - super(ElevationSkadiDownloader, self).download_tile(bucket, 'skadi', z, x, y, suffix='.gz') + super(ElevationSkadiDownloader, self)._tile_url_path(bucket, 'skadi', z, x, y, suffix='.gz') def tile_path(self, z, x, y): ns = lambda i:'S%02d'%abs(i) if i < 0 else 'N%02d'%abs(i) From b7d50ac862593cdf5c852f7b63a1d9c49f8ef4e8 Mon Sep 17 00:00:00 2001 From: nitrag Date: Wed, 27 May 2020 13:50:02 -0400 Subject: [PATCH 2/7] Add retry package --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1aa90dd..9358114 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ author_email='ian@interline.io', license='MIT', packages=find_packages(exclude=['contrib', 'docs', 'tests']), - install_requires=['future', 'requests'], #, 'osmium', 'boto3' + install_requires=['future', 'requests', 'retry'], #, 'osmium', 'boto3' tests_require=['nose'], test_suite = 'nose.collector', entry_points={ From 64d8aeae8ae10f2c619865ed118197427f489d3b Mon Sep 17 00:00:00 2001 From: nitrag Date: Wed, 27 May 2020 14:58:34 -0400 Subject: [PATCH 3/7] connection pool testing performance of connection pool --- planetutils/elevation_tile_downloader.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/planetutils/elevation_tile_downloader.py b/planetutils/elevation_tile_downloader.py index c1d3359..968277e 100644 --- a/planetutils/elevation_tile_downloader.py +++ b/planetutils/elevation_tile_downloader.py @@ -1,13 +1,14 @@ #!/usr/bin/env python from __future__ import absolute_import, unicode_literals -import concurrent import os import math + +import urllib3 from retry import retry from concurrent import futures -from multiprocessing.pool import ThreadPool -from urllib.request import urlopen + +from urllib3 import Timeout from planetutils import download from planetutils import log @@ -21,6 +22,8 @@ def makedirs(path): class ElevationDownloader(object): zoom = 0 + timeout = Timeout(connect=3.0, read=7.0) + http = urllib3.PoolManager(maxsize=50, timeout=timeout) def __init__(self, outpath='.'): self.outpath = outpath @@ -80,15 +83,14 @@ def get_bbox_tiles(self, bbox): def _download(self, url, op): download.download(url, op) - @staticmethod @retry(exceptions=Exception, tries=5, delay=2, backoff=2, logger=log) - def _download_multi(url_op): + def _download_multi(self, url_op): url, op = url_op log.info("downloading %s to %s" % (url, op)) - request = urlopen(url, timeout=10) + request = self.http.request('GET', url) with open(op, 'wb') as f: try: - f.write(request.read()) + f.write(request.data) except Exception as exc: raise Exception("Error downloading %r - %s", (url, exc)) From a8542717e07d77e453a57b17abb9f6e2cc8d4742 Mon Sep 17 00:00:00 2001 From: Ryan Date: Wed, 8 Jul 2020 00:13:09 -0400 Subject: [PATCH 4/7] add ability to check if files exist on another filesystem; useful if you need to download in batches because of storage limitations --- .gitignore | 1 + planetutils/elevation_tile_download.py | 3 ++- planetutils/elevation_tile_downloader.py | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 199c7c6..f4f44bf 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,4 @@ data # PyTest .pytest_cache/ virtualenv +data/ diff --git a/planetutils/elevation_tile_download.py b/planetutils/elevation_tile_download.py index 0959087..746c4fc 100644 --- a/planetutils/elevation_tile_download.py +++ b/planetutils/elevation_tile_download.py @@ -10,6 +10,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('--outpath', help='Output path for elevation tiles.', default='.') + parser.add_argument('--existpath', help='Directory to check for existing files.', default=None) parser.add_argument('--csv', help='Path to CSV file with bounding box definitions.') parser.add_argument('--bbox', help='Bounding box for extract file. Format for coordinates: left,bottom,right,top') parser.add_argument('--verbose', help="Verbose output", action='store_true') @@ -22,7 +23,7 @@ def main(): log.set_verbose() if args.format == 'geotiff': - p = ElevationGeotiffDownloader(args.outpath, zoom=args.zoom) + p = ElevationGeotiffDownloader(args.outpath, zoom=args.zoom, exist_path=args.existpath) elif args.format == 'skadi': p = ElevationSkadiDownloader(args.outpath) else: diff --git a/planetutils/elevation_tile_downloader.py b/planetutils/elevation_tile_downloader.py index 968277e..52d8ad2 100644 --- a/planetutils/elevation_tile_downloader.py +++ b/planetutils/elevation_tile_downloader.py @@ -25,8 +25,9 @@ class ElevationDownloader(object): timeout = Timeout(connect=3.0, read=7.0) http = urllib3.PoolManager(maxsize=50, timeout=timeout) - def __init__(self, outpath='.'): + def __init__(self, outpath='.', exist_path=None): self.outpath = outpath + self.exist_path = exist_path def download_planet(self): self.download_bbox([-180, -90, 180, 90]) @@ -40,9 +41,11 @@ def download_bbox(self, bbox, bucket='elevation-tiles-prod', prefix='geotiff'): found = set() download = set() for z, x, y in tiles: + exist_dir = self.exist_path if self.exist_path else self.outpath od = self.tile_path(z, x, y) op = os.path.join(self.outpath, *od) - if self.tile_exists(op): + cp = os.path.join(exist_dir, *od) + if self.tile_exists(cp): found.add((x, y)) else: download.add((x, y)) From a357a8a366ed265cd0b238bb09bcd96ac997b2f0 Mon Sep 17 00:00:00 2001 From: Ryan Gartin Date: Wed, 8 Jul 2020 11:51:04 -0400 Subject: [PATCH 5/7] Improve existing/found tiles search --- planetutils/elevation_tile_download.py | 3 ++- planetutils/elevation_tile_downloader.py | 13 ++++++++----- planetutils/log.py | 6 +++++- setup.py | 3 ++- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/planetutils/elevation_tile_download.py b/planetutils/elevation_tile_download.py index 0959087..9b7d720 100644 --- a/planetutils/elevation_tile_download.py +++ b/planetutils/elevation_tile_download.py @@ -7,6 +7,7 @@ from .bbox import load_features_csv, load_feature_string from .elevation_tile_downloader import ElevationGeotiffDownloader, ElevationSkadiDownloader + def main(): parser = argparse.ArgumentParser() parser.add_argument('--outpath', help='Output path for elevation tiles.', default='.') @@ -26,7 +27,7 @@ def main(): elif args.format == 'skadi': p = ElevationSkadiDownloader(args.outpath) else: - print("Unknown format: %s"%args.format) + print("Unknown format: %s" % args.format) sys.exit(1) if args.csv: diff --git a/planetutils/elevation_tile_downloader.py b/planetutils/elevation_tile_downloader.py index 968277e..bd7632b 100644 --- a/planetutils/elevation_tile_downloader.py +++ b/planetutils/elevation_tile_downloader.py @@ -14,12 +14,14 @@ from planetutils import log from planetutils.bbox import validate_bbox + def makedirs(path): try: os.makedirs(path) except OSError as e: pass + class ElevationDownloader(object): zoom = 0 timeout = Timeout(connect=3.0, read=7.0) @@ -39,12 +41,13 @@ def download_bbox(self, bbox, bucket='elevation-tiles-prod', prefix='geotiff'): tiles = self.get_bbox_tiles(bbox) found = set() download = set() + for root, dirs, files in os.walk(self.outpath): + path = root.split(os.sep) + for file in files: + if '.tif' in file: + found.add("%s/%s/%s" % (path[-2], path[-1], file.split('.')[0])) for z, x, y in tiles: - od = self.tile_path(z, x, y) - op = os.path.join(self.outpath, *od) - if self.tile_exists(op): - found.add((x, y)) - else: + if '%s/%s/%s' % (z, y, x) not in found: download.add((x, y)) log.info("found %s tiles; %s to download" % (len(found), len(download))) tasks = {self._tile_url_path(bucket, prefix, self.zoom, x, y) for x, y in sorted(download)} diff --git a/planetutils/log.py b/planetutils/log.py index 89e7bde..337471f 100644 --- a/planetutils/log.py +++ b/planetutils/log.py @@ -2,18 +2,22 @@ logging.basicConfig(format='[%(levelname)s] %(message)s') logger = logging.getLogger(__name__) + def set_quiet(): logger.setLevel(logging.ERROR) + def set_verbose(): logger.setLevel(logging.DEBUG) + def set_default(): logger.setLevel(logging.INFO) + set_default() info = logger.info debug = logger.debug warning = logger.warning -error = logger.error \ No newline at end of file +error = logger.error diff --git a/setup.py b/setup.py index 9358114..e70a7d3 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ author_email='ian@interline.io', license='MIT', packages=find_packages(exclude=['contrib', 'docs', 'tests']), - install_requires=['future', 'requests', 'retry'], #, 'osmium', 'boto3' + install_requires=['future', 'requests', 'retry'], #, 'osmium', 'boto3', 'gdal' tests_require=['nose'], test_suite = 'nose.collector', entry_points={ @@ -28,6 +28,7 @@ 'osm_extract_download=planetutils.osm_extract_download:main', 'elevation_tile_download=planetutils.elevation_tile_download:main', 'elevation_tile_merge=planetutils.elevation_tile_merge:main', + 'elevation_tile_terrain=planetutils.elevation_tile_terrain:main', 'valhalla_tilepack_download=planetutils.tilepack_download:main', 'valhalla_tilepack_list=planetutils.tilepack_list:main' ], From 688f8b0462b75e8f2d5efabff616f9328c4b6c9a Mon Sep 17 00:00:00 2001 From: Ryan Gartin Date: Fri, 10 Jul 2020 01:32:04 -0400 Subject: [PATCH 6/7] Fix Z/X/Y format --- planetutils/elevation_tile_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/planetutils/elevation_tile_downloader.py b/planetutils/elevation_tile_downloader.py index d98f31c..2137501 100644 --- a/planetutils/elevation_tile_downloader.py +++ b/planetutils/elevation_tile_downloader.py @@ -49,7 +49,7 @@ def download_bbox(self, bbox, bucket='elevation-tiles-prod', prefix='geotiff'): if '.tif' in file: found.add("%s/%s/%s" % (path[-2], path[-1], file.split('.')[0])) for z, x, y in tiles: - if '%s/%s/%s' % (z, y, x) not in found: + if '%s/%s/%s' % (z, x, y) not in found: download.add((x, y)) log.info("found %s tiles; %s to download" % (len(found), len(download))) tasks = {self._tile_url_path(bucket, prefix, self.zoom, x, y) for x, y in sorted(download)} From fb20337570cc16ed6e1830f501869d8cefa7b00c Mon Sep 17 00:00:00 2001 From: Ryan Gartin Date: Fri, 10 Jul 2020 01:36:13 -0400 Subject: [PATCH 7/7] efficiency --- planetutils/elevation_tile_downloader.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/planetutils/elevation_tile_downloader.py b/planetutils/elevation_tile_downloader.py index 2137501..c5d6ecb 100644 --- a/planetutils/elevation_tile_downloader.py +++ b/planetutils/elevation_tile_downloader.py @@ -38,7 +38,7 @@ def download_bboxes(self, bboxes): for name, bbox in bboxes.items(): self.download_bbox(bbox) - def download_bbox(self, bbox, bucket='elevation-tiles-prod', prefix='geotiff'): + def filter_needed(self, bbox): tiles = self.get_bbox_tiles(bbox) found = set() download = set() @@ -52,7 +52,11 @@ def download_bbox(self, bbox, bucket='elevation-tiles-prod', prefix='geotiff'): if '%s/%s/%s' % (z, x, y) not in found: download.add((x, y)) log.info("found %s tiles; %s to download" % (len(found), len(download))) - tasks = {self._tile_url_path(bucket, prefix, self.zoom, x, y) for x, y in sorted(download)} + return download + + def download_bbox(self, bbox, bucket='elevation-tiles-prod', prefix='geotiff'): + download = self.filter_needed(bbox) + tasks = {self._tile_url_path(bucket, prefix, self.zoom, x, y) for x, y in download} with futures.ThreadPoolExecutor() as executor: # Start the load operations and mark each future with its URL