Skip to content

Commit

Permalink
Merge pull request #22 from osm-search/filter-postcodes
Browse files Browse the repository at this point in the history
Improve postcode computation by filtering outliers
  • Loading branch information
lonvia authored Apr 13, 2022
2 parents 7d38126 + 8fe429a commit 42edacb
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 28 deletions.
68 changes: 47 additions & 21 deletions calculate_postcode_centroids.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,26 @@
00586;18.343681;-67.028427
00601;18.181632;-66.757545
"""

from collections import defaultdict
from statistics import mean, median
from math import sqrt
import sys
import csv
import re
import logging

LOG = logging.getLogger()

postcode_summary = {}
postcode_summary = defaultdict(list)

reader = csv.DictReader(sys.stdin, delimiter=';')

def dist(p1, p2):
    """Return the planar Euclidean distance between two points.

    Only the first two components of each point are read (index 0 and
    index 1). No geodesic correction is applied, so the result is in the
    same units as the input coordinates.
    """
    return sqrt((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2)

LOG.warning("Reading Postcodes")

cnt = 0
for row in reader:

postcode = row['postcode']
Expand All @@ -33,14 +44,6 @@
if not re.match(r'^\d\d\d\d\d$', postcode):
continue

if postcode not in postcode_summary:
postcode_summary[postcode] = {
'coord_count': 0,
'lat_sum': 0,
'lon_sum': 0
}


# If you 'cat *.csv' then you might end up with multiple CSV header lines.
# Skip those
if row['geometry'] == 'geometry':
Expand All @@ -52,20 +55,43 @@
# our scripts that created them.
assert result

for coord_pair in result[1].split(','):
[lon, lat] = coord_pair.split(' ')
points = result[1].split(',')
postcode_summary[postcode].append([float(p) for p in points[int(len(points)/2)].split(' ')])

cnt += 1

if cnt % 1000000 == 0:
LOG.warning("Processed %s lines.", cnt)

postcode_summary[postcode]['coord_count'] += 1
postcode_summary[postcode]['lat_sum'] += float(lat)
postcode_summary[postcode]['lon_sum'] += float(lon)
LOG.warning("%s lines read.", cnt)

writer = csv.DictWriter(sys.stdout, delimiter=';', fieldnames=['postcode', 'lat', 'lon'])
writer.writeheader()

# NOTE(review): this span comes from a rendered diff whose indentation and
# +/- markers were lost. The `summary[...]` writerow below looks like the
# pre-change code and the `points`/`centroid` logic like its replacement --
# confirm against the real file before relying on it.
#
# Distance thresholds tried in list order (smallest first) when filtering
# outliers. Same units as the stored coordinates (presumably degrees -- confirm).
maxdists = [0.1, 0.3, 0.5, 0.9]

# Emit one row per postcode, in sorted postcode order.
for postcode in sorted(postcode_summary):
summary = postcode_summary[postcode]
writer.writerow({
'postcode': postcode,
'lat': round(summary['lat_sum'] / summary['coord_count'], 6),
'lon': round(summary['lon_sum'] / summary['coord_count'], 6)
})
points = postcode_summary[postcode]

# Initial centre estimate: per-coordinate median over all points.
centroid = [median(p) for p in zip(*points)]

for mxd in maxdists:
# Keep only the points strictly closer than mxd to the median centre.
filtered = [p for p in points if dist(centroid, p) < mxd]

# Fewer than 70% of the points survive this threshold: try the next one.
if len(filtered) < 0.7 * len(points):
continue

# Some points were rejected: log how many and continue with the survivors.
if len(filtered) < len(points):
LOG.warning("%s: Found %d outliers in %d points.", postcode, - len(filtered) + len(points), len(points))
points = filtered

# Final centroid: per-coordinate mean of the remaining points.
centroid = [mean(p) for p in zip(*points)]

# Index 0 is written as lon and index 1 as lat (presumably points are
# stored as [lon, lat] -- confirm against the geometry parsing).
writer.writerow({
'postcode': postcode,
'lat': round(centroid[1], 6),
'lon': round(centroid[0], 6)
})
break
# for/else: reached only when no threshold triggered the `break`; the
# postcode is logged as dropped and this loop writes no filtered row for it.
else:
LOG.warning("%s: Dropped.", postcode)
14 changes: 7 additions & 7 deletions tests/fixtures/expected_us_postcodes.csv
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
postcode;lat;lon
27919;36.306266;-76.52476
27932;36.100279;-76.525517
27944;36.160013;-76.406708
27946;36.355292;-76.544549
27980;36.239719;-76.57298
27985;36.217409;-76.467033
29744;36.224882;-76.320694
27919;36.307466;-76.521998
27932;36.100681;-76.526519
27944;36.140285;-76.425022
27946;36.355092;-76.549201
27980;36.243052;-76.573513
27985;36.219077;-76.46647
29744;36.224978;-76.320802

0 comments on commit 42edacb

Please sign in to comment.