Merge pull request #11 from osm-search/calculate-postcode-centroids

new script calculate_postcode_centroids.py
osm-search · Apr 11, 2022 · 7d38126 · 7d38126
2 parents 9ec428a + fe987d7
commit 7d38126
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 0 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -28,3 +28,5 @@ jobs:
           mkdir output
           ./tiger_address_convert.py tests/fixtures/tl_2020_37143_edges/tl_2020_37143_edges.shp output/37143.csv
           diff output/37143.csv tests/fixtures/expected_37143.csv
+          cat output/37143.csv | ./calculate_postcode_centroids.py > output/us_postcodes.csv
+          diff output/us_postcodes.csv tests/fixtures/expected_us_postcodes.csv
diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@ Convert [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo
 dataset of the US Census Bureau to CSV files which can be imported by Nominatim. In Nominatim the created
 tables are separate from OpenStreetMap tables and get queried at search time separately.
 
+
 The dataset gets updated once per year. Downloading is prone to be slow (can take a full day) and converting
 them can take hours as well. There's a mirror on https://downloads.opencagedata.com/public/
 
@@ -30,6 +31,13 @@ Replace '2021' with the current year throughout.
         tar -czf tiger2021-nominatim-preprocessed.tar.gz tiger
 
 
+US Postcodes
+------------
+Additionally create a `us_postcodes.csv.gz` file with centroid coordinates.
+
+    cat output-path/*.csv | ./calculate_postcode_centroids.py | gzip -9 > us_postcodes.csv.gz
+
+
 License
 -------
 The source code is available under a GPLv2 license.
diff --git a/calculate_postcode_centroids.py b/calculate_postcode_centroids.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+"""
+Input from STDIN is expected to be a semicolon-delimited CSV file with at
+least the columns 'postcode' and 'geometry'
+
+from;to;interpolation;street;city;state;postcode;geometry
+98;88;all;Sherman Rd;Putnam;NY;10541;LINESTRING(-73.790533 41.390289,-73.790590 41.390301,...
+
+For each postcode a center point gets calculated.
+
+Output to STDOUT is one line per postcode
+
+postcode;lat;lon
+00535;43.0893;-72.61368
+00586;18.343681;-67.028427
+00601;18.181632;-66.757545
+"""
+
+import sys
+import csv
+import re
+
+postcode_summary = {}
+
+reader = csv.DictReader(sys.stdin, delimiter=';')
+
+for row in reader:
+
+    postcode = row['postcode']
+
+    # Skip rows without a five-digit postcode (in rare cases it is empty)
+    if not re.match(r'^\d\d\d\d\d$', postcode):
+        continue
+
+    if postcode not in postcode_summary:
+        postcode_summary[postcode] = {
+            'coord_count': 0,
+            'lat_sum': 0,
+            'lon_sum': 0
+        }
+
+
+    # If you 'cat *.csv' then you might end up with multiple CSV header lines.
+    # Skip those
+    if row['geometry'] == 'geometry':
+        continue
+
+    result = re.match(r'LINESTRING\((.+)\)$', row['geometry'])
+
+    # Fail if geometry can't be parsed. Shouldn't happen because it's one of
+    # our scripts that created them.
+    assert result
+
+    for coord_pair in result[1].split(','):
+        [lon, lat] = coord_pair.split(' ')
+
+        postcode_summary[postcode]['coord_count'] += 1
+        postcode_summary[postcode]['lat_sum'] += float(lat)
+        postcode_summary[postcode]['lon_sum'] += float(lon)
+
+writer = csv.DictWriter(sys.stdout, delimiter=';', fieldnames=['postcode', 'lat', 'lon'])
+writer.writeheader()
+
+for postcode in sorted(postcode_summary):
+    summary = postcode_summary[postcode]
+    writer.writerow({
+        'postcode': postcode,
+        'lat': round(summary['lat_sum'] / summary['coord_count'], 6),
+        'lon': round(summary['lon_sum'] / summary['coord_count'], 6)
+    })