-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextprocess.py
More file actions
151 lines (135 loc) · 5.39 KB
/
textprocess.py
File metadata and controls
151 lines (135 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# The most interesting part is the function "checkTLS".
import re
import requests
import time
import urllib3
# Shared urllib3 connection pool; checkCity() uses it to download pages.
pool = urllib3.PoolManager()
# Request headers sent by testLink(): present ourselves as a desktop browser
# (the User-Agent below is a Chrome-on-macOS string).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}
def processMain(mainList):
    """Check every municipality row of the list page, grouped by canton.

    mainList is the HTML text of the municipality list. It is cut at every
    table-row start marker; the piece in front of the first row is dropped
    and each remaining piece is handed to processEntry().

    Returns a dict that maps a canton (element 1 of the processEntry()
    result) to the list of result tuples
    (municipality, canton, HTTP, HTTPS, people) for that canton.
    """
    # Total number of municipalities, only used for the progress message.
    total = getAll(mainList)
    rows = re.split('<tr>\n<td', mainList)[1:]
    by_canton = {}
    for done, row in enumerate(rows, start=1):
        result = processEntry(row)
        by_canton.setdefault(result[1], []).append(result)
        if done % 10:
            continue
        print('Processed ' + str(done) + ' of ' + str(total))
        # Pause after each batch of ten so the crawl does not hammer the
        # servers it is querying.
        time.sleep(10)
    return by_canton
def processEntry(municipality):
    """Parse one table row of the municipality list and test its website.

    municipality is the HTML that follows the row's opening '<td', for
    example (attributes abridged):

        data-sort-value="Aadorf"><a href="/wiki/Aadorf" title="Aadorf">Aadorf </a>
        </td>
        <td><span style="display:none;">Kanton Thurgau</span><a href="/wiki/Datei:Wappen_Thurgau_matt.svg" class="image"><img alt="Kanton Thurgau" /></a> <a href="/wiki/Kanton_Thurgau" title="Kanton Thurgau">TG</a>
        </td>
        <td style="text-align:right;">4551
        </td>
        <td style="text-align:right;" data-sort-value="8838">8838
        </td>
        ...

    Returns the tuple (city, canton, http, https, people) where http and
    https are whatever checkCity() reports for the municipality's Wikipedia
    article and people is the integer taken from the row's second
    data-sort-value attribute.
    """
    # City name: the text inside the first pair of double quotes.
    remainder = municipality.split('"', 1)[1]
    city, remainder = remainder.split('"', 1)

    # Relative link to the municipality's Wikipedia article.
    remainder = remainder.split('href="', 1)[1]
    wiki_path, remainder = remainder.split('"', 1)
    http_ok, https_ok = checkCity('https://de.wikipedia.org' + wiki_path)

    # Canton abbreviation: the last two characters in front of the third
    # closing anchor tag.
    canton = remainder.split('</a>')[2][-2:]

    # Population: value of the next data-sort-value attribute.
    remainder = remainder.split('data-sort-value="')[1]
    people = int(remainder.split('"')[0])

    return (city, canton, http_ok, https_ok, people)
# Get the municipality's website link from its Wikipedia page and test it.
def checkCity(cityWikiLink):
    """Find the municipality's website on its Wikipedia page and test it.

    Downloads cityWikiLink through the shared pool, looks for the first
    external link that sits directly in a table cell and passes its target
    to checkTLS().

    Returns the (HTTP, HTTPS) tuple from checkTLS(), or (True, True) when the
    page contains no such external link. If the download itself raises, the
    error is printed and the program exits.
    """
    try:
        response = pool.request('GET', cityWikiLink)  # fetch the article
    except Exception as ex:
        print(ex)
        print('Internet not working')
        quit()
    page = response.data.decode('UTF-8')

    marker = '<td><a rel="nofollow" class="external text" href="'
    if marker not in page:
        # No external link found on the page.
        print('Municipality has no website: ' + cityWikiLink)
        return (True, True)

    # The link target runs from the end of the marker to the next quote.
    after_marker = page.partition(marker)[2]
    website = after_marker.split('"', 1)[0]
    return checkTLS(website)
# Checks whether the municipality's website is reachable over TLS (HTTPS).
def checkTLS(cityLink):
    """Probe a website over http and https and report what answered.

    cityLink is the website URL as found on Wikipedia (http or https).

    Returns (HTTP, HTTPS):
      HTTP  - first element of testLink() for the http:// form of the URL,
              i.e. True when the response to the http request carries an
              https URL.
      HTTPS - True when testLink() reports an https response URL for the
              https:// form of cityLink or, failing that, for the https://
              form of the URL the http request ended up at.
    """
    # Always start from the plain-http form of the address.
    if cityLink[:5] == 'https':
        cityLink = 'http' + cityLink[5:]
    http_result, responseLink = testLink(cityLink, True)

    # Switch both candidate URLs to the https scheme.
    if responseLink is not None and responseLink[:5] != 'https':
        responseLink = 'https' + responseLink[4:]
    if cityLink[:5] != 'https':
        cityLink = 'https' + cityLink[4:]

    # The second candidate is only requested when the first one fails.
    https_result = testLink(cityLink, False)[0]
    if not https_result:
        https_result = testLink(responseLink, False)[0]
    return (http_result, https_result)
def testLink(cityLink, MODE): #MODE==False->HTTPS, MODE==True->HTTP
#no response from http testLink
if cityLink is None:
return (False, None)
try:
r = requests.get(
cityLink, headers=headers, timeout=20) #get the actual site
except requests.exceptions.SSLError as ex: #SSL incorrect on serverside
if MODE: #there are too many https sites with bad Certs
print('SSLError: ' + cityLink)
return (False, None)
except requests.exceptions.ConnectionError as ex:
if MODE: #https doesn't exist
print('ConnectionError: ' + cityLink)
return (False, None)
except requests.exceptions.Timeout as ex:
print('Timeout: ' + cityLink)
return (False, None)
except Exception as ex:
print(cityLink)
print(ex)
print('Internet not working!')
quit()
#check if TLS
responseLink = r.url
if responseLink[:5] == 'https':
return (True, responseLink)
else:
return (False, responseLink)
def getAll(mainList):
    """Return the municipality count that follows the first '(BFS) ' marker.

    Args:
        mainList: text of the municipality list page.

    Returns:
        The run of digits directly after the first occurrence of '(BFS) ',
        as an int. The digits may run up to the very end of the text.

    Raises:
        IndexError: if the text does not contain '(BFS) '.
        ValueError: if the marker is not directly followed by a digit.
    """
    # Raw string: '\(' in a normal literal is an invalid escape sequence.
    tail = re.split(r'\(BFS\) ', mainList, maxsplit=1)[1]
    # Take the leading digits in one step instead of re-slicing the whole
    # page once per digit; this also copes with digits at end of text.
    digits = re.match(r'\d*', tail).group()
    return int(digits)