-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscript.py
571 lines (427 loc) · 16.5 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
# ## Missing Persons DB Webscrape
#
# https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf
#
# Search with no criteria will bring up all results.
#
# The links to the missing persons pages, can be appended to https://www.services.rcmp-grc.gc.ca/
# ### Code
# ------
# #### Import packages
import datetime
import glob
import json
import re
import time
from collections import defaultdict
from functools import cache
from itertools import product
from multiprocessing import Pool
from os.path import join
from typing import Optional
from urllib.parse import urljoin
import dateparser
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from tenacity import (before_sleep_log, retry, stop_after_attempt, wait_fixed,
wait_random)
from webdriver_manager.chrome import ChromeDriverManager
setattr(dateparser, "parse", cache(dateparser.parse))
MAX_WEIGHT_DIFFERENCE = 10 # kg
MAX_HEIGHT_DIFFERENCE = 15 # cm
HAIR_SCORE = 0.7
BIO_SCORE = 0.7
start_time = datetime.datetime.now()
# This function takes in a string of <dl> ... </dl> and converts it into a dictionary
def dl_to_dict(dl_str):
soup = bs(dl_str, "html.parser")
dl = soup.find("dl")
if not dl:
return {}
data = defaultdict(list)
k = ""
for c in dl.contents:
is_real = (
bool(str(c).strip()) and len(c.contents) > 0
) # real element, not whitespace
if not is_real:
continue
if c.name == "dt":
k = c.contents[0].strip()
elif c.name == "dd":
data[k].append(c.contents[0].strip())
return dict(data)
# #### link to search results + install selenium if it is not already installed
link = "https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf"
# Get a chrome driver if there isn't one locally
service = Service(ChromeDriverManager().install())
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920,1080")
# #### loop through all the search result pages and collect the URLs
browser = webdriver.Chrome(service=service, options=chrome_options)
browser.get(link)
time.sleep(2)
# find search and click it to reach data
try:
search = browser.find_element(By.NAME, "searchForm:j_idt158")
search.click()
except Exception:
print("could not find Search")
time.sleep(3)
# to store all the URLs
URLs = set()
# go through each page and get all URLs
while True:
time.sleep(2) # wait more just in case
page = browser.page_source
pageSoup = bs(page, "html.parser")
# get all the links on the page and add them to array
pageURLs = pageSoup.find_all("a", class_="wet-newwindow")
# take each link on the page and add if not a dupe
for link in pageURLs:
new_url = urljoin(
"https://www.services.rcmp-grc.gc.ca", link.get("href"))
if new_url not in URLs:
URLs.add(new_url)
time.sleep(2) # wait a little
# are we on the last page
try:
# click the next button at the bottom of the page
next_page = browser.find_element(By.XPATH, '//a[text()="Next"]')
print("Found next button to press.")
next_page.click()
time.sleep(2) # wait for next page to load
except Exception:
# should not have a next button on the last page
print("Last page or no next button found!")
break
browser.quit()
# ## Collect all the data from all the detailed case pages
base_url = r"https://www.services.rcmp-grc.gc.ca"
# #### To Avoid Running the URL Collector Again - Run Code Below
# clean the elements
URLs = [u.strip() for u in URLs if u]
# #### Looping through and scraping the data into a json file
@retry(
reraise=True,
wait=wait_fixed(3) + wait_random(0, 3),
stop=stop_after_attempt(5),
)
def scrape_page(page_url):
# page dict
page_dict = {}
# this is where all the person info will go
page_sections = []
to_print = "==============================================\n"
to_print += f"Case URL: {page_url}\n"
# request the html
page = requests.get(page_url, timeout=10)
# structure the page content for parsing
soup = bs(page.content, "html.parser")
# First we have to pull out the content area
content_area = soup.find("main", {"property": "mainContentOfPage"})
try:
# the case reference number
_case_ref = content_area.find("h1")
page_dict["CaseRef"] = " ".join(_case_ref.text.split())
# the main section
sections = content_area.section
# the description
desc = sections.div.p
page_dict["CaseDesc"] = desc.text.strip()
# the category
case_type = sections.h2
page_dict["CaseType"] = " ".join(case_type.text.split())
except:
to_print += "page base info collection error\n"
page_dict["CaseURL"] = page_url
# get the first section with all the persons
persons_section = sections.section
# how many people are we looking through
persons_names = persons_section.find_all("h3")
num_persons = len(persons_names)
# all the blocks within the section
persons_blocks = persons_section.find_all("div", {"class": "row"})
# loop through all the person sections to collect their data
# assigned to their names
for i in range(num_persons):
to_print += f"Person(s) in Case: {i+1}\n"
block = {} # stores the individuals info, some pages have 1+
block["Name"] = " ".join(persons_names[i].text.split())
# select the current persion
current_person = persons_blocks[i]
# dict to save all the individual dl sections
dl_sections = {}
# takes all the DL sections out and saves them
for dl in current_person.find_all("dl"):
# call the dl formatting function
dl_sections |= dl_to_dict(str(dl))
# append the formatted sectins array to the block
block["InfoSection"] = dl_sections
# a list to store references to images for each individual
imageLinks = []
try:
for image in current_person.find_all("img"):
image_src = image["src"]
# check if this matches the no photo image
if not re.search("noPhoto\.png", image_src):
imageLinks.append(
"https://www.services.rcmp-grc.gc.ca/missing-disparus/" + image_src
)
# add the images section
# add to the main dict
block["Images"] = imageLinks
except Exception:
print("no images")
# add the block to the page sections
page_sections.append(block)
to_print += f"{block['Name']}\n"
print(to_print.strip())
# write the section to the dict
page_dict["PersonsData"] = page_sections
# write it all to the main DB
return (page_dict["CaseRef"], page_dict)
def scrape_database(urls_list: list, iteration: int):
# loop through all the URLs
with Pool(10) as p:
complete_db = dict(p.map(scrape_page, urls_list))
# fine name to write to
filename = join("data", f"RCMP_Data_Part_{iteration:03d}.json")
# write JSON to a file
with open(filename, "w") as outfile:
json.dump(complete_db, outfile, indent=2)
print(
f"======================= Done Part {iteration} =======================")
# number of cases in each subfile
page_size = 100
# for testing
# split the list into smaller parts
paginated_list = [URLs[i: i + page_size]
for i in range(0, len(URLs), page_size)]
# loop through the divided list and output files
for count, list_section in enumerate(paginated_list):
scrape_database(list_section, count)
# get all the json files in the folder
json_files = list(sorted(glob.glob(join("data", "RCMP*.json"))))
# create a dict to store data
main_cases = []
for file in json_files:
with open(file) as f:
data = json.load(f)
main_cases.extend(data.values())
unique_attributes = set()
for row in main_cases:
for person in row["PersonsData"]:
unique_attributes.update(list(person["InfoSection"].keys()))
# The data comes in as a list of cases with multiple people per case
# This method will convert the cases to a list of people with a unique key for each person as the index
# Use a list of people to do logic with, and then convert back to cases for the front end
def flatten_cases_to_people(cases):
people = {}
for case in cases:
case_without_persons = case.copy()
case_without_persons.pop("PersonsData")
for person in case["PersonsData"]:
key = case["CaseRef"] + "%%" + person["Name"]
people[key] = (
person["InfoSection"]
| {"Images": person["Images"]}
| case_without_persons
)
people[key]["Name"] = person["Name"]
return people
persons_data = flatten_cases_to_people(main_cases)
missing_persons = [
persons_data[p] | {"PersonID": p}
for p in persons_data
if persons_data[p]["CaseType"] == "Missing"
]
unidentified_remains = [
persons_data[p]
for p in persons_data
if persons_data[p]["CaseType"] == "Unidentified"
]
def format_age_range(raw_age_range_string):
result = re.search(r"(\d+)[^\d]+?(\d+)", raw_age_range_string, re.M)
if result is None or len(result.groups()) != 2:
return None
g = list(result.groups())
return [int(g[0]), int(g[1])]
def format_weight(raw_weight_string):
result = re.search(r"(\d+)kg", raw_weight_string, re.M)
if result is None or len(result.groups()) != 1:
return None
return int(result.groups()[0])
# Missing people have a percise height, unidentified remains have a range or a percise height
def format_height(raw_height_string):
result = re.search(r"(\d+)cm", raw_height_string, re.M)
if result is None or len(result.groups()) != 1:
return None
return int(result.groups()[0])
def format_height_range(raw_height_string):
result = re.search(r"(\d+)cm.* (\d+)cm", raw_height_string, re.M)
if result is None or len(result.groups()) != 2:
return None
g = list(result.groups())
return [int(g[0]), int(g[1])]
def was_found_after_reported(unidentified, missing) -> bool:
unidentified_found = dateparser.parse(unidentified["Discovered on"][0])
missing_missing = dateparser.parse(missing["Missing since"][0])
return unidentified_found > missing_missing
def potential_gender_match(unidentified, missing) -> bool:
try:
unidentified_gender = unidentified["Gender"][0]
missing_gender = missing["Gender"][0]
if unidentified_gender == missing_gender:
return True
elif unidentified_gender.lower() not in [
"male",
"female",
] or missing_gender.lower() not in ["male", "female"]:
return True
except Exception:
return True
return False
def age_approximately_as_expected(u, m) -> bool:
unidentified_age_est = format_age_range(u["Est. age"][0])
if unidentified_age_est is None:
return False
# tweak the range to be a bit more generous
unidentified_age_est = [
unidentified_age_est[0] * 0.8,
unidentified_age_est[1] * 1.2,
]
# this range goes from the youngest possible age, the age at disappearance,
# to the oldest, which is how old they'd be when the unidentified remains
# were found
missing_age_range = (
int(m["Age at disappearance"][0]),
(
dateparser.parse(u["Discovered on"][0])
- dateparser.parse(m["Year of birth"][0])
).days
/ 365,
)
# return if there's overlap between the two ranges
return (
unidentified_age_est[1] > missing_age_range[0]
and unidentified_age_est[0] < missing_age_range[1]
)
def weight_approximately_as_expected(u, m) -> Optional[float]:
# ignore if no weight value
if not ("Weight" in u and "Weight" in m):
return 0.0
u_weight = format_weight(u["Weight"])
m_weight = format_weight(m["Weight"])
# ignore if somehow improperly formatted
if u_weight is None or m_weight is None:
return 0.0
weight_closeness = abs(u_weight - m_weight)
if weight_closeness > MAX_WEIGHT_DIFFERENCE:
return None
return 1 - (weight_closeness / MAX_WEIGHT_DIFFERENCE)
# TODO: change weight range as a function of time. e.g. if they have been
# missing for 2 days, use tight range if missing for years have a more generous
# range
def height_approximately_as_expected(u, m) -> Optional[float]:
# ignore if no weight value
if not ("Height" in u and "Height" in m):
return 0.0
m_height = format_height(m["Weight"])
u_height = format_height_range(u["Weight"])
if u_height is None:
u_height = format_height(u["Weight"])
else:
# make sure height is not a range
u_height = (u_height[0] + u_height[1]) / 2
# ignore if can't parse
if m_height is None or u_height is None:
return 0.0
# number of cm difference
height_closeness = abs(u_height - m_height)
if height_closeness > MAX_HEIGHT_DIFFERENCE:
return None
return 1 - (height_closeness / MAX_HEIGHT_DIFFERENCE)
# naive check to see if either both or neither have a "Tattoo" key
def tattoo_matching(u, m) -> bool:
return ("Tattoo" in u) == ("Tattoo" in m)
def potential_hair_match(unidentified, missing) -> Optional[float]:
try:
unidentified_hair = unidentified["Hair"][0].lower()
missing_hair = missing["Hair"][0].lower()
if unidentified_hair == missing_hair:
return HAIR_SCORE
except Exception:
# if we can't pull any hair data from either it stays a potential match
return 0.0
return None
def potential_bio_group_match(unidentified, missing) -> Optional[float]:
try:
unidentified_bio_group = unidentified["Bio group"][0].lower()
missing_bio_group = missing["Bio group"][0].lower()
if unidentified_bio_group == missing_bio_group:
return BIO_SCORE
except Exception:
return BIO_SCORE
return None
potential_matches_complete_objects: list = []
comparison_functions = [
was_found_after_reported,
potential_gender_match,
age_approximately_as_expected,
potential_hair_match,
tattoo_matching,
potential_bio_group_match,
weight_approximately_as_expected,
height_approximately_as_expected,
]
def get_match_score(unidentified, missing) -> Optional[float]:
match_score = 0.0
for f in comparison_functions:
try:
comparison_score = f(unidentified, missing)
if comparison_score is None or comparison_score is False:
return None
else:
match_score += comparison_score
except Exception:
return None
return match_score
potential_matches = 0
export_unidentified = {
u["CaseRef"]: u | {"MatchedRemains": []} for u in unidentified_remains
}
export_missing = {
mp["PersonID"]: mp | {"MatchedUnidentified": []} for mp in missing_persons
}
for unidentified, missing in product(unidentified_remains, missing_persons):
# do comparisons of missing and unidentified using all known comparison functions
# If it's a possible match, append to the list of potential matches
match_score = get_match_score(unidentified, missing)
if match_score is not None:
potential_matches += 1
export_unidentified[unidentified["CaseRef"]]["MatchedRemains"].append(
missing["PersonID"]
)
export_missing[missing["PersonID"]]["MatchedUnidentified"].append(
unidentified["CaseRef"]
)
possible_total_matches = len(unidentified_remains) * len(missing_persons)
print(
f"Reduced to {potential_matches} out of a possible {possible_total_matches} matches ({int(100*potential_matches/possible_total_matches)}%)"
)
meta = {
"updated_at": datetime.datetime.now().isoformat(),
"time_running": str(datetime.datetime.now() - start_time)
}
with open("missing-persons-viewer-vite/src/shared/data/meta.json", "w") as f:
json.dump(meta, f, indent=4)
with open("missing-persons-viewer-vite/src/shared/data/missing_persons.json", "w") as f:
json.dump(export_missing, f, indent=4)
with open("missing-persons-viewer-vite/src/shared/data/unidentified_persons.json", "w") as f:
json.dump(export_unidentified, f, indent=4)