# DistanceBetweenSimilarNamedPlaces.py
import pandas as pd
from tqdm import tqdm
import haversine as hs
import csv
#############
# VARIABLES #
#############
# Value substituted into both file names below, so it selects which
# cities<pop_limit>.txt dataset is read and which output_<pop_limit>.csv
# file is written.
pop_limit = 500
# Passed to different_by_n_letters() as the number of character positions
# in which two names have to differ.
n_characters_difference = 1
# Adjust these two paths to match the local machine.
input_file = fr'C:\Users\andys\OneDrive\Documents\1. HOME\Programming\DistanceBetweenPlaces\cities{pop_limit}.txt'
output_file = fr'C:\Users\andys\OneDrive\Documents\1. HOME\Programming\DistanceBetweenPlaces\output_{pop_limit}.csv'
#############
# CONSTANTS #
#############
# Header row written at the top of the output CSV.
headers = [
    'place1',
    'lat1',
    'lon1',
    'place2',
    'lat2',
    'lon2',
    'haversinedistance',
]
# Column labels applied, in order, to the tab-separated input file.
colnames = [
    'geonameid',
    'name',
    'asciiname',
    'alternatenames',
    'latitude',
    'longitude',
    'featureclass',
    'featurecode',
    'countrycode',
    'cc2',
    'admin1code',
    'admin2code',
    'admin3code',
    'admin4code',
    'population',
    'elevation',
    'dem',
    'timezone',
    'modificationdate',
]
#############
# FUNCTIONS #
#############
def different_by_n_letters(str1, str2, n):
    """Return True when two equal-length names differ in exactly ``n`` positions.

    Args:
        str1: First name to compare.
        str2: Second name to compare.
        n: Required number of positions at which the characters differ.

    Returns:
        True if both values have the same length and exactly ``n`` of their
        aligned characters differ. False otherwise, including when the
        lengths differ or when either value has no length at all (for
        example a float standing in for a missing name).
    """
    try:
        if len(str1) != len(str2):
            return False
        # Lengths are equal here, so zip() pairs up every position.
        mismatches = sum(1 for a, b in zip(str1, str2) if a != b)
    except TypeError:
        # Best-effort: values that are not sized/iterable never match.
        # Only TypeError is caught so that KeyboardInterrupt and genuine
        # bugs are no longer silently swallowed.
        return False
    return mismatches == n
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance between two latitude/longitude points.

    The points are handed to ``hs.haversine`` as ``(lat, lon)`` tuples and
    its result is returned unchanged, so the unit is whatever that call
    uses by default (presumably kilometres - confirm against the library).
    """
    return hs.haversine((lat1, lon1), (lat2, lon2))
########
# MAIN #
########
def main():
    """Write every pair of similarly named places, with its distance, to CSV.

    Loads the tab-separated file at ``input_file`` using ``colnames`` as the
    column labels, then compares the ``asciiname`` of each place with itself
    and with every later place. For each pair accepted by
    ``different_by_n_letters`` (using ``n_characters_difference``) one row is
    written to ``output_file``: both names, both coordinate pairs and the
    value returned by ``haversine_distance``. The file starts with the
    ``headers`` row and is overwritten on every run.
    """
    df = pd.read_table(input_file, names=colnames)
    ascii_names = df['asciiname'].tolist()
    latitudes = df['latitude'].tolist()
    longitudes = df['longitude'].tolist()
    place_count = len(ascii_names)

    # A single handle serves both the header and the data rows; reopening the
    # same path in append mode is unnecessary and makes the write order depend
    # on when the first handle gets flushed.
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(headers)
        for first in tqdm(range(place_count)):
            name1 = ascii_names[first]
            # The inner range starts at `first`, so a place is also compared
            # with itself; each unordered pair is visited exactly once.
            for second in range(first, place_count):
                name2 = ascii_names[second]
                if not different_by_n_letters(name1, name2, n_characters_difference):
                    continue
                distance = haversine_distance(
                    latitudes[first], longitudes[first],
                    latitudes[second], longitudes[second],
                )
                writer.writerow([
                    name1, latitudes[first], longitudes[first],
                    name2, latitudes[second], longitudes[second],
                    distance,
                ])
###############
# ENTRY POINT #
###############
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()