diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4223611 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +*.pyc +.coverage +TEST*.xml +build/* +dist/* +*.swp +.DS_Store +*.orig +*.egg-info +MANIFEST +Vagrantfile +.vagrant/* diff --git a/README.md b/README.md new file mode 100644 index 0000000..86df251 --- /dev/null +++ b/README.md @@ -0,0 +1,63 @@ +# python-raster-stats + +Summary statistics of raster dataset values based on vector geometries. + +Docs (API, Topics) +Examples +Build status +Test coverage + +## Quickstart + +You've got a raster dataset representing elevation and a vector dataset representing county boundaries. +Show the average, min and max elevation for each county +Show percentage of land cover by county +Output to csv (via CLI) +Pass geometries via wkt or wkb (single & list) +Integrate with other python packages via __geo_interface__ + +## Features + +* Raster data support: + * Any continuous raster band supported by GDAL +* Vector data support: + * OGR layer +* Python module (returns built in python data structures - list of dicts) +* Depends on GDAL, GEOS, shapely and numpy +* Full coverage unit testing + +## Issues +To report a bug via github issues: provide smallest possible raster, vector and code to reproduce it + +## Docs + +## TODO +* respects null/no-data values +* covering edge cases for input datasets +* command line interface which returns csv data and optionally copies over original vector attrs +* supports categorical +* Vector data: + * Points, Lines, Polygon and Multi-* geometries + * Can specify: + * OGR layer + * single geoms or list of geometries represented by wkts, wkbs or any object that supports the __geo_interface__ +* projection support +* pip installable +* python 2 & 3 support +* buildthedocs OR use some sort of literate programming +* travis-ci and https://coveralls.io/info/features +* Examples for PyShp, GeoDjango, Fiona, Path to OGR resource, ArcPy (as Ipython notebooks?) 
+* reproject on the fly using projection metadata +* heavily profiled using a wide range of input data. The resulting heuristics are used to automatically configure for optimal performance. Optimization heuristic for determining global_src_extent - number of features - extent of features - available system memory vs raster_extent +* CLI: pivots for categorical +* support parallel processing on multiple CPUs via the `multiprocessing` approach +* zonal majority (http://stackoverflow.com/questions/6252280/find-the-most-frequent-number-in-a-numpy-vector) + +## Alternatives + +Grass r.stats +R spatialdataframe +starspan +zonal statistics arcpy +QGIS + diff --git a/scripts/coverage.sh b/scripts/coverage.sh new file mode 100644 index 0000000..9d15f67 --- /dev/null +++ b/scripts/coverage.sh @@ -0,0 +1 @@ +coverage run setup.py test && coverage report -m --include=*rasterstats* diff --git a/scripts/provision.sh b/scripts/provision.sh new file mode 100644 index 0000000..da8db18 --- /dev/null +++ b/scripts/provision.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Provision base software required for running raster_stats + +#apt-get install -y python-software-properties +#add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable + +apt-get update + +apt-get install -y libgdal-dev gdal-bin \ + python-gdal python-pip python-numpy \ + libspatialindex-dev libspatialindex1 \ + build-essential git atop python-dev python-dateutil + +cd /usr/local/src/python-raster-stats +sudo pip install shapely pytest coverage +python setup.py develop diff --git a/scripts/rasterstats b/scripts/rasterstats new file mode 100644 index 0000000..ac941a1 --- /dev/null +++ b/scripts/rasterstats @@ -0,0 +1,47 @@ +#!/bin/env python +""" +rasterstats + +Usage: + rasterstats COMMAND DATASET [ARGS ...] + rasterstats (-h | --help) + rasterstats --version + +Options: + -h --help Show this screen. + --version Show version. 
+""" +import sys +from docopt import docopt +from rasterstats import raster_stats + + +def guess_type(thing): + try: + return float(thing) + except: + pass + return str(thing) + + +if __name__ == '__main__': + opt = docopt(__doc__, version='geofu 0.0.1') + if opt['DATASET'] == "-": + # use stdin + opt['DATASET'] = sys.stdin.readlines()[-1].strip() + lyr = load(opt['DATASET']) + args = [] + kwargs = {} + for arg in opt['ARGS']: + if "=" in arg: + k, v = arg.split("=") + v = guess_type(v) + kwargs[k] = v + else: + arg = guess_type(arg) + args.append(arg) + res = getattr(lyr, opt['COMMAND'])(*args, **kwargs) + if isinstance(res, lyr.__class__): + print res.path + else: + print res diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..875240f --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +# content of setup.cfg +[pytest] +norecursedirs = examples* src* scripts* +addopts = -rf -v +#addopts = --maxfail=1 --pdb -rf -v diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..de722db --- /dev/null +++ b/setup.py @@ -0,0 +1,49 @@ +import os +import sys +from setuptools import setup +from setuptools.command.test import test as TestCommand + + +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + + +class PyTest(TestCommand): + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_args = [] + self.test_suite = True + + def run_tests(self): + import pytest + errno = pytest.main(self.test_args) + sys.exit(errno) + + +setup( + name="rasterstats", + version="0.1a", + author="Matthew Perry", + author_email="perrygeo@gmail.com", + description=("Summarize geospatial raster datasets based on vector geometries"), + license="BSD", + keywords="gis geospatial geographic raster vector zonal statistics", + url="https://github.com/perrygeo/python-raster-stats", + package_dir={'': 'src'}, + packages=['rasterstats'], + long_description=read('README.md'), + install_requires=[ + 'shapely', + 'numpy', + 'GDAL', 
def raster_stats(vector, raster, layer_num=0, band_num=1, nodata_value=None,
                 global_src_extent=False):
    """Summarize raster cell values within each feature of a vector layer.

    Parameters
    ----------
    vector : str
        Path or connection string readable by ``ogr.Open``.
    raster : str
        Path readable by ``gdal.Open``.
    layer_num : int
        Index of the layer to read from the vector datasource.
    band_num : int
        1-based index of the raster band to summarize.
    nodata_value : float or None
        Value to treat as nodata; masked out of every statistic.
    global_src_extent : bool
        If True, read the raster once over the whole vector extent
        instead of one small window per feature.

    Returns
    -------
    list of dict
        One dict per feature with keys 'min', 'mean', 'max', 'std',
        'sum', 'count' and 'fid'.
    """
    rds = gdal.Open(raster, GA_ReadOnly)
    # NOTE(review): assert is stripped under `python -O`; an explicit
    # raise on a failed open would be safer -- TODO.
    assert(rds)
    rb = rds.GetRasterBand(band_num)
    rgt = rds.GetGeoTransform()

    if nodata_value is not None:
        nodata_value = float(nodata_value)
        rb.SetNoDataValue(nodata_value)

    vds = ogr.Open(vector, GA_ReadOnly)
    assert(vds)
    vlyr = vds.GetLayer(layer_num)

    # create an in-memory numpy array of the source raster data
    # covering the whole extent of the vector layer
    if global_src_extent:
        # use global source extent
        # useful only when disk IO or raster scanning inefficiencies are your limiting factor
        # advantage: reads raster data in one pass
        # disadvantage: large vector extents may have big memory requirements
        src_offset = bbox_to_pixel_offsets(rgt, vlyr.GetExtent())
        src_array = rb.ReadAsArray(*src_offset)

        # calculate new geotransform of the layer subset
        new_gt = (
            (rgt[0] + (src_offset[0] * rgt[1])),
            rgt[1],
            0.0,
            (rgt[3] + (src_offset[1] * rgt[5])),
            0.0,
            rgt[5]
        )

    # 'Memory' (OGR) holds the temporary per-feature vector layer;
    # 'MEM' (GDAL) holds the per-feature rasterized mask.
    mem_drv = ogr.GetDriverByName('Memory')
    driver = gdal.GetDriverByName('MEM')

    stats = []

    # Loop through geometries
    feat = vlyr.GetNextFeature()
    while feat:
        if not global_src_extent:
            # use local source extent
            # fastest option when you have fast disks and well indexed raster (ie tiled Geotiff)
            # advantage: each feature uses the smallest raster chunk
            # disadvantage: lots of reads on the source raster
            src_offset = bbox_to_pixel_offsets(rgt, feat.geometry().GetEnvelope())
            src_array = rb.ReadAsArray(*src_offset)

            # calculate new geotransform of the feature subset
            new_gt = (
                (rgt[0] + (src_offset[0] * rgt[1])),
                rgt[1],
                0.0,
                (rgt[3] + (src_offset[1] * rgt[5])),
                0.0,
                rgt[5]
            )

        # Create a temporary vector layer in memory
        mem_ds = mem_drv.CreateDataSource('out')
        mem_layer = mem_ds.CreateLayer('poly', None, ogr.wkbPolygon)
        mem_layer.CreateFeature(feat.Clone())

        # Rasterize it: burn 1 into every pixel covered by the feature
        rvds = driver.Create('', src_offset[2], src_offset[3], 1, gdal.GDT_Byte)
        rvds.SetGeoTransform(new_gt)
        gdal.RasterizeLayer(rvds, [1], mem_layer, burn_values=[1])
        rv_array = rvds.ReadAsArray()

        # Mask the source data array with our current feature
        # we take the logical_not to flip 0<->1 to get the correct mask effect
        # we also mask out nodata values explicitly
        # (when nodata_value is None the equality test contributes
        # nothing to the mask, so only the rasterized shape applies)
        masked = np.ma.MaskedArray(
            src_array,
            mask=np.logical_or(
                src_array == nodata_value,
                np.logical_not(rv_array)
            )
        )

        # NOTE(review): a feature overlapping no unmasked pixels
        # (count == 0) makes min()/max() return a masked constant and
        # float() of that likely errors -- confirm with such a fixture.
        feature_stats = {
            'min': float(masked.min()),
            'mean': float(masked.mean()),
            'max': float(masked.max()),
            'std': float(masked.std()),
            'sum': float(masked.sum()),
            'count': int(masked.count()),
            'fid': int(feat.GetFID())}

        stats.append(feature_stats)

        # dropping the references lets GDAL/OGR close the in-memory datasets
        rvds = None
        mem_ds = None
        feat = vlyr.GetNextFeature()

    # close the source datasets before returning
    vds = None
    rds = None
    return stats
def bbox_to_pixel_offsets(gt, bbox):
    """Convert a georeferenced bounding box into a raster pixel window.

    Parameters
    ----------
    gt : sequence
        GDAL geotransform: (origin_x, pixel_w, rot, origin_y, rot, pixel_h).
        pixel_h is negative for the usual north-up rasters.
    bbox : sequence
        (xmin, xmax, ymin, ymax), as returned by OGR's GetEnvelope().

    Returns
    -------
    tuple
        (x_offset, y_offset, x_size, y_size) in pixel coordinates,
        suitable for unpacking into Band.ReadAsArray().
    """
    origin_x, pixel_w = gt[0], gt[1]
    origin_y, pixel_h = gt[3], gt[5]

    # Left/right columns: truncate the left edge, widen the right by one.
    left = int((bbox[0] - origin_x) / pixel_w)
    right = int((bbox[1] - origin_x) / pixel_w) + 1

    # Top/bottom rows: ymax maps to the smaller row index because
    # pixel_h is negative for north-up rasters.
    top = int((bbox[3] - origin_y) / pixel_h)
    bottom = int((bbox[2] - origin_y) / pixel_h) + 1

    return (left, top, right - left, bottom - top)
84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]] diff --git a/tests/data/lines.shp b/tests/data/lines.shp new file mode 100644 index 0000000..dfe374a Binary files /dev/null and b/tests/data/lines.shp differ diff --git a/tests/data/lines.shx b/tests/data/lines.shx new file mode 100644 index 0000000..a52806c Binary files /dev/null and b/tests/data/lines.shx differ diff --git a/tests/data/points.dbf b/tests/data/points.dbf new file mode 100644 index 0000000..3aee283 Binary files /dev/null and b/tests/data/points.dbf differ diff --git a/tests/data/points.prj b/tests/data/points.prj new file mode 100644 index 0000000..932909e --- /dev/null +++ b/tests/data/points.prj @@ -0,0 +1,16 @@ +PROJCS["unnamed", + GEOGCS["GRS 1980(IUGG, 1980)", + DATUM["unknown", + SPHEROID["GRS80",6378137,298.257222101]], + PRIMEM["Greenwich",0], + UNIT["degree",0.0174532925199433]], + PROJECTION["Albers_Conic_Equal_Area"], + PARAMETER["standard_parallel_1",43], + PARAMETER["standard_parallel_2",48], + PARAMETER["latitude_of_center",34], + PARAMETER["longitude_of_center",-120], + PARAMETER["false_easting",600000], + PARAMETER["false_northing",0], + UNIT["metre",1, + AUTHORITY["EPSG","9001"]]] + diff --git a/tests/data/points.qpj b/tests/data/points.qpj new file mode 100644 index 0000000..5fbc831 --- /dev/null +++ b/tests/data/points.qpj @@ -0,0 +1 @@ +GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]] diff --git a/tests/data/points.shp b/tests/data/points.shp new file mode 100644 index 0000000..3fc12bd Binary files /dev/null and b/tests/data/points.shp differ diff --git a/tests/data/points.shx b/tests/data/points.shx new file mode 
100644 index 0000000..740f6b2 Binary files /dev/null and b/tests/data/points.shx differ diff --git a/tests/data/points_noproj.dbf b/tests/data/points_noproj.dbf new file mode 100644 index 0000000..3aee283 Binary files /dev/null and b/tests/data/points_noproj.dbf differ diff --git a/tests/data/points_noproj.shp b/tests/data/points_noproj.shp new file mode 100644 index 0000000..3fc12bd Binary files /dev/null and b/tests/data/points_noproj.shp differ diff --git a/tests/data/points_noproj.shx b/tests/data/points_noproj.shx new file mode 100644 index 0000000..740f6b2 Binary files /dev/null and b/tests/data/points_noproj.shx differ diff --git a/tests/data/polygons.dbf b/tests/data/polygons.dbf new file mode 100644 index 0000000..ac84578 Binary files /dev/null and b/tests/data/polygons.dbf differ diff --git a/tests/data/polygons.prj b/tests/data/polygons.prj new file mode 100644 index 0000000..932909e --- /dev/null +++ b/tests/data/polygons.prj @@ -0,0 +1,16 @@ +PROJCS["unnamed", + GEOGCS["GRS 1980(IUGG, 1980)", + DATUM["unknown", + SPHEROID["GRS80",6378137,298.257222101]], + PRIMEM["Greenwich",0], + UNIT["degree",0.0174532925199433]], + PROJECTION["Albers_Conic_Equal_Area"], + PARAMETER["standard_parallel_1",43], + PARAMETER["standard_parallel_2",48], + PARAMETER["latitude_of_center",34], + PARAMETER["longitude_of_center",-120], + PARAMETER["false_easting",600000], + PARAMETER["false_northing",0], + UNIT["metre",1, + AUTHORITY["EPSG","9001"]]] + diff --git a/tests/data/polygons.qpj b/tests/data/polygons.qpj new file mode 100644 index 0000000..5fbc831 --- /dev/null +++ b/tests/data/polygons.qpj @@ -0,0 +1 @@ +GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]] diff --git a/tests/data/polygons.shp b/tests/data/polygons.shp new file mode 100644 index 0000000..80d58e8 
# Zonal statistics tests: run raster_stats() over the fixture polygon
# shapefile and slope GeoTIFF under tests/data and check per-feature results.
import os
from rasterstats import raster_stats

DATA = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def test_zonal():
    shp = os.path.join(DATA, 'polygons.shp')
    tif = os.path.join(DATA, 'slope.tif')
    results = raster_stats(shp, tif)
    # Every feature record carries the full set of summary statistics.
    assert set(results[0].keys()) == {
        'std', 'count', 'min', 'max', 'sum', 'fid', 'mean'}
    # Two polygons, with known unmasked pixel counts.
    assert [r['count'] for r in results] == [75, 50]


def test_zonal_global_extent():
    shp = os.path.join(DATA, 'polygons.shp')
    tif = os.path.join(DATA, 'slope.tif')
    # Reading the raster in one global pass must not change the results.
    results = raster_stats(shp, tif, global_src_extent=True)
    assert [r['count'] for r in results] == [75, 50]


def test_zonal_nodata():
    shp = os.path.join(DATA, 'polygons.shp')
    tif = os.path.join(DATA, 'slope.tif')
    # A nodata value of 0 leaves the counts for this fixture unchanged.
    results = raster_stats(shp, tif, global_src_extent=True, nodata_value=0)
    assert [r['count'] for r in results] == [75, 50]