perrygeo · sgoodm · Nov 21, 2016 · Nov 21, 2016 · Jan 11, 2017 · Mar 3, 2017
diff --git a/src/rasterstats/main.py b/src/rasterstats/main.py
@@ -7,7 +7,8 @@
 from shapely.geometry import shape
 from .io import read_features, Raster
 from .utils import (rasterize_geom, get_percentile, check_stats,
-                    remap_categories, key_assoc_val, boxify_points)
+                    remap_categories, key_assoc_val, boxify_points,
+                    rasterize_pctcover_geom)
 
 
 def raster_stats(*args, **kwargs):
@@ -36,6 +37,9 @@ def gen_zonal_stats(
         affine=None,
         stats=None,
         all_touched=False,
+        percent_cover_selection=None,
+        percent_cover_weighting=False,
+        percent_cover_scale=None,
         categorical=False,
         category_map=None,
         add_stats=None,
@@ -80,6 +84,29 @@ def gen_zonal_stats(
         those having a center point within the polygon.
         defaults to `False`
 
+    percent_cover_selection: float, optional
+        Include only raster cells that have at least the given percent
+        covered by the vector feature. Use the percent_cover_scale argument
+        to set the scale at which percent coverage estimates are generated;
+        if it is omitted, a default of 10 is used and a warning is issued.
+
+    percent_cover_weighting: bool, optional
+        whether or not to use percent coverage of cells during calculations
+        to adjust stats (only applies to mean, count and sum)
+
+    percent_cover_scale: int, optional
+        Scale used when generating percent coverage estimates of each
+        raster cell by vector feature. Percent coverage is generated by
+        rasterizing the feature at a finer resolution than the raster
+        (based on percent_cover_scale value) then using a summation to aggregate
+        to the raster resolution and dividing by the square of percent_cover_scale
+        to get percent coverage value for each cell. Increasing percent_cover_scale
+        will increase the accuracy of percent coverage values; three orders of
+        magnitude finer resolution (percent_cover_scale=1000) is usually enough to
+        get coverage estimates with <1% error in individual edge cells coverage
+        estimates, though much smaller values (e.g., percent_cover_scale=10) are often
+        sufficient (<10% error) and require less memory.
+
     categorical: bool, optional
 
     category_map: dict
@@ -139,20 +166,71 @@ def gen_zonal_stats(
         warnings.warn("Use `band` to specify band number", DeprecationWarning)
         band = band_num
 
+    # check inputs related to percent coverage
+    percent_cover = False
+    if percent_cover_weighting or percent_cover_selection is not None:
+        percent_cover = True
+        if percent_cover_scale is None:
+            warnings.warn('No value for `percent_cover_scale` was given. '
+                          'Using default value of 10.')
+            percent_cover_scale = 10
+
+        try:
+            if percent_cover_scale != int(percent_cover_scale):
+                warnings.warn('Value for `percent_cover_scale` given ({0}) '
+                              'was converted to int ({1}) but does not '
+                              'match original value'.format(
+                                percent_cover_scale, int(percent_cover_scale)))
+
+            percent_cover_scale = int(percent_cover_scale)
+
+            if percent_cover_scale <= 1:
+                raise Exception('Value for `percent_cover_scale` must be '
+                                'greater than one ({0})'.format(
+                                    percent_cover_scale))
+
+        except:
+            raise Exception('Invalid value for `percent_cover_scale` '
+                            'provided ({0}). Must be type int.'.format(
+                                percent_cover_scale))
+
+        if percent_cover_selection is not None:
+            try:
+                percent_cover_selection = float(percent_cover_selection)
+            except:
+                raise Exception('Invalid value for `percent_cover_selection` '
+                                'provided ({0}). Must be able to be converted '
+                                'to a float.'.format(percent_cover_selection))
+
+        if not all_touched:
+            warnings.warn('`all_touched` was not enabled but an option requiring '
+                          'percent_cover calculations was selected. Automatically '
+                          'enabling `all_touched`.')
+        all_touched = True
+
+
     with Raster(raster, affine, nodata, band) as rast:
         features_iter = read_features(vectors, layer)
         for _, feat in enumerate(features_iter):
             geom = shape(feat['geometry'])
 
             if 'Point' in geom.type:
                 geom = boxify_points(geom, rast)
+                percent_cover = False
 
             geom_bounds = tuple(geom.bounds)
 
             fsrc = rast.read(bounds=geom_bounds)
 
             # rasterized geometry
-            rv_array = rasterize_geom(geom, like=fsrc, all_touched=all_touched)
+            if percent_cover:
+                rv_array = rasterize_pctcover_geom(
+                    geom, shape=fsrc.shape, affine=fsrc.affine,
+                    scale=percent_cover_scale)
+            else:
+                rv_array = rasterize_geom(
+                    geom, shape=fsrc.shape, affine=fsrc.affine,
+                    all_touched=all_touched)
 
             # nodata mask
             isnodata = (fsrc.array == fsrc.nodata)
@@ -164,9 +242,14 @@ def gen_zonal_stats(
 
             # Mask the source data array
             # mask everything that is not a valid value or not within our geom
-            masked = np.ma.MaskedArray(
-                fsrc.array,
-                mask=(isnodata | ~rv_array))
+            if percent_cover_selection is not None:
+                masked = np.ma.MaskedArray(
+                    fsrc.array,
+                    mask=(isnodata | ~rv_array | percent_cover > percent_cover_selection))
+            else:
+                masked = np.ma.MaskedArray(
+                    fsrc.array,
+                    mask=(isnodata | ~rv_array))
 
             # execute zone_func on masked zone ndarray
             if zone_func is not None:
@@ -187,7 +270,6 @@ def gen_zonal_stats(
                     pixel_count = dict(zip([np.asscalar(k) for k in keys],
                                            [np.asscalar(c) for c in counts]))
 
-
                 if categorical:
                     feature_stats = dict(pixel_count)
                     if category_map:
@@ -200,12 +282,23 @@ def gen_zonal_stats(
                 if 'max' in stats:
                     feature_stats['max'] = float(masked.max())
                 if 'mean' in stats:
-                    feature_stats['mean'] = float(masked.mean())
+                    if percent_cover_weighting:
+                        feature_stats['mean'] = float(
+                            np.sum(masked * rv_array) /
+                            np.sum(~masked.mask * rv_array))
+                    else:
+                        feature_stats['mean'] = float(masked.mean())
                 if 'count' in stats:
-                    feature_stats['count'] = int(masked.count())
+                    if percent_cover_weighting:
+                        feature_stats['count'] = float(np.sum(~masked.mask * rv_array))
+                    else:
+                        feature_stats['count'] = int(masked.count())
                 # optional
                 if 'sum' in stats:
-                    feature_stats['sum'] = float(masked.sum())
+                    if percent_cover_weighting:
+                        feature_stats['sum'] = float(np.sum(masked * rv_array))
+                    else:
+                        feature_stats['sum'] = float(masked.sum())
                 if 'std' in stats:
                     feature_stats['std'] = float(masked.std())
                 if 'median' in stats:

diff --git a/src/rasterstats/utils.py b/src/rasterstats/utils.py
@@ -3,6 +3,8 @@
 from __future__ import division
 import sys
 from rasterio import features
+from affine import Affine
+from numpy import min_scalar_type
 from shapely.geometry import box, MultiPolygon
 from .io import window_bounds
 
@@ -25,12 +27,13 @@ def get_percentile(stat):
     return q
 
 
-def rasterize_geom(geom, like, all_touched=False):
+def rasterize_geom(geom, shape, affine, all_touched=False):
     """
     Parameters
     ----------
     geom: GeoJSON geometry
-    like: raster object with desired shape and transform
+    shape: desired shape
+    affine: desired transform
     all_touched: rasterization strategy
 
     Returns
@@ -40,15 +43,55 @@ def rasterize_geom(geom, like, all_touched=False):
     geoms = [(geom, 1)]
     rv_array = features.rasterize(
         geoms,
-        out_shape=like.shape,
-        transform=like.affine,
+        out_shape=shape,
+        transform=affine,
         fill=0,
         dtype='uint8',
         all_touched=all_touched)
 
     return rv_array.astype(bool)
 
 
+# Sum equal blocks of 2-D `a` down to `shape` (each dim of `a` must be an
+# exact multiple). Adapted from https://stackoverflow.com/a/8090605
+def rebin_sum(a, shape, dtype):
+    dims = (shape[0], a.shape[0]//shape[0], shape[1], a.shape[1]//shape[1])
+    return a.reshape(dims).sum(axis=3, dtype=dtype).sum(axis=1, dtype=dtype)
+
+
+def rasterize_pctcover_geom(geom, shape, affine, scale=None):
+    """Estimate the fraction of each raster cell covered by `geom`.
+
+    Rasterizes `geom` (all_touched) on a grid `scale` times finer per axis,
+    then sums each scale x scale block and divides by scale**2.
+
+    Parameters
+    ----------
+    geom: GeoJSON geometry
+    shape: desired shape
+    affine: desired transform
+    scale: scale at which to generate percent cover estimate (default 10)
+
+    Returns
+    -------
+    ndarray: float32
+    """
+    if scale is None:
+        scale = 10
+
+    min_dtype = min_scalar_type(scale**2)  # must hold a full cell's count
+
+    # Keep the origin; divide each x and y term by `scale` on its own so
+    # non-square, south-up or rotated grids stay aligned with `affine`.
+    new_affine = Affine(affine[0]/scale, affine[1]/scale, affine[2],
+                        affine[3]/scale, affine[4]/scale, affine[5])
+    new_shape = (shape[0]*scale, shape[1]*scale)
+
+    rv_array = rasterize_geom(geom, new_shape, new_affine, True)
+    rv_array = rebin_sum(rv_array, shape, min_dtype)
+    return rv_array.astype('float32') / (scale**2)
+
+
 def stats_to_csv(stats):
     if sys.version_info[0] >= 3:
         from io import StringIO as IO  # pragma: no cover

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,6 +1,7 @@
 import sys
 import os
 import pytest
+import numpy as np
 from shapely.geometry import LineString
 from rasterstats.utils import \
     stats_to_csv, get_percentile, remap_categories, boxify_points
@@ -63,3 +64,20 @@ def test_boxify_non_point():
     line = LineString([(0, 0), (1, 1)])
     with pytest.raises(ValueError):
         boxify_points(line, None)
+
+
+def test_rebin_sum():
+    """Each 2x2 block of the 4x4 input should collapse to its sum."""
+    # imported here: the visible module-level import list omits rebin_sum
+    from rasterstats.utils import rebin_sum
+
+    test_input = np.array([
+        [1, 1, 2, 2],
+        [1, 1, 2, 2],
+        [3, 3, 4, 4],
+        [3, 3, 4, 4],
+    ])
+
+    test_output = rebin_sum(test_input, (2, 2), np.int32)
+
+    assert np.array_equal(test_output, np.array([[4, 8], [12, 16]]))