# LIMS_scraper.py
import requests
import re
import json
from subprocess import PIPE, run, check_call, CalledProcessError
from unipath import Path
import os.path
import pytesseract
from PIL import Image
import glob
from PyPDF2 import PdfFileReader
import inquirer
import datetime
from math import floor
# get memorandum doctype: http://lims.dccouncil.us/Download/30232/PR20-0142_Memorandum.pdf
# RC20-0039 has some kind of corruption in the PDF - repaired with ghostscript and then manually extracted text:
# https://superuser.com/questions/278562/how-can-i-fix-repair-a-corrupted-pdf-file/282056#282056
# gs \
# -o repaired.pdf \
# -sDEVICE=pdfwrite \
# -dPDFSETTINGS=/prepress \
# corrupted.pdf
##Get pr20-0142-ENROLLMENT by hand
# need to go through and make sure all criteria choice options work right - subcategoryId = 0 doesn't seem to work when a category is selected
# need to make sure there aren't other criteria that use DisplayOrder instead of ID to search
def convertToRegex(searchTerm):
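"""Turn a plain-text search term into a case-insensitive regex that tolerates
variable whitespace between words (so matches survive line breaks in extracted PDF text)."""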
words = searchTerm.split(' ')
regExTerm = '(?i)'
for word in words:
regExTerm += re.escape(word) + r'\s*'
return re.compile(regExTerm)
# to check if user inputs valid date criteria
def validate(dateText):
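"""Raise ValueError unless dateText is empty or in MM/DD/YYYY format."""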
if dateText:
try:
datetime.datetime.strptime(dateText, '%m/%d/%Y')
except ValueError:
raise ValueError('Date format should be MM/DD/YYYY')
else:
pass
# This only downloads introductions - they are the only doc types with a URL in the search result json.
# Would need to iterate through all possible file names (bill #, doctype combinations) to get other kinds of docs
def downloadToText(r,path,docTypes,urlsToDownload=None):
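"""Download each bill's PDFs (the introduction URL plus every docTypes variant of it),
save them under <path>pdfs/, extract their text into <path>text/ (pdftotext, falling back
to tesseract OCR via extractText), and return the list of locations that search() later
uses to find the extracted .txt files."""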
## To download files, convert PDF to text
path1 = path+'text/'
path = path+'pdfs/'
locations = []
if not os.path.isdir(path):
os.mkdir(path)
if not os.path.isdir(path1):
os.mkdir(path1)
if urlsToDownload:
for url in urlsToDownload:
urls = [url]
for docType in docTypes:
urls.append('-'.join(url.split('-')[:-1])+'-'+docType+'.pdf')
loc = path+url.split('/')[-1]
# check if introduction has been downloaded - if it has, don't test other doc types
#if not os.path.isfile(loc): - add this back in to final
for url in urls:
print(url)
docs = '-'.join(url.split('-')[:-1])
# store locations so we can reuse them when extracting text
loc = path+url.split('/')[-1]
# location for pulled downloadToText
loc1 = path1+url.split('/')[-1]
locations.append(loc1)
# only download file if it doesnt exist yet
if not os.path.isfile(loc):
response = requests.get(url)
# for files that don't exist, the longest downloaded 'pdf' I've seen is 8478 bytes; the shortest existing PDF I've seen is 8474 bytes (PR20-0142-INTRODUCTION), hence the 8473-byte cutoff - there must be a better way to test whether the file exists
# for PR20-0142, the ENROLLMENT is located in a different folder than the INTRODUCTION (Download/15219/, Download/30232/)
# People at LIMS think this is a fluke, due to transitioning to a new system during council period 20
if len(response.content)>8473:
with open(loc, 'wb') as f:
f.write(response.content)
# if file is already OCR'd, just use pdftotext to pull text
run(['pdftotext', '-enc', 'UTF-8','-layout' , loc,loc1[:-3]+'txt'], stdout=PIPE)
with open(loc1[:-3]+'txt','r') as f:
# small files (i've seen length up to 116 bytes when read) are created when we call pdftotext on a non-OCR'd pdf
if len(f.read())<1000:
# if file is not OCR'd (older PDFs and some angled scans), convert PDF pages to png (smaller and higher quality than tiff) and run tesseract OCR
# (specifying type grayscale breaks pytesseract for some images, so extractText falls back to color)
text = extractText(loc,path)
with open(loc1[:-3]+'txt','w') as f:
f.write(text)
else:
for bill in r.json():
# some 'bills' (e.g. public hearings) do not have documents associated w them
if bill['DocumentUrl']:
# documentUrl field only includes URL for introduction (not Signed Act or intermediate documents)
# is it safe to assume that if there is no Introduction there are no other documents?
url = bill['DocumentUrl']
urls = [bill['DocumentUrl']]
# what if not all docs are from same council period? need to iterate over at least next period..
for docType in docTypes:
urls.append('-'.join(url.split('-')[:-1])+'-'+docType+'.pdf')
loc = path+url.split('/')[-1]
# check if introduction has been downloaded - if it has, don't test other doc types
#if not os.path.isfile(loc): - add this back in to final
for url in urls:
print(url)
docs = '-'.join(url.split('-')[:-1])
# store locations so we can reuse them when extracting text
loc = path+url.split('/')[-1]
# location for pulled downloadToText
loc1 = path1+url.split('/')[-1]
locations.append(loc1)
# only download file if it doesnt exist yet
if not os.path.isfile(loc):
response = requests.get(url)
#for files that don't exist, the longest downloaded 'pdf' i've seen is 8478 bytes - shortest existing pdf is 8879 bytes - there must be a better way to test if the file exists
if len(response.content)>8500:
with open(loc, 'wb') as f:
f.write(response.content)
# if file is already OCR'd, just use pdftotext to pull text
run(['pdftotext', '-enc', 'UTF-8','-layout' , loc,loc1[:-3]+'txt'], stdout=PIPE)
with open(loc1[:-3]+'txt','r') as f:
# small files (i've seen length up to 116 bytes when read) are created when we call pdftotext on a non-OCR'd pdf
if len(f.read())<1000:
# if file is not OCR'd (older PDFs and some angled scans), convert PDF pages to png (smaller and higher quality than tiff) and run tesseract OCR
# (specifying type grayscale breaks pytesseract for some images, so extractText falls back to color)
text = extractText(loc,path)
with open(loc1[:-3]+'txt','w') as f:
f.write(text)
return(locations)
def convertGrayscale(imageLocation,i):
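"""Convert page i of the PDF at imageLocation to a 300 dpi grayscale png.
'file.pdf[i]' is ImageMagick's syntax for selecting page i of the input."""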
params = ['convert', '-density', '300', '-units', 'PixelsPerInch', '-type', 'Grayscale', imageLocation + '[' + str(i) + ']', imageLocation[:-4] + '-' + str(i) + '.png']
check_call(params)
def convertColor(imageLocation,i):
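"""Same as convertGrayscale but without forcing grayscale (fallback for images where
the grayscale conversion breaks pytesseract)."""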
params = ['convert', '-density', '300', '-units', 'PixelsPerInch', imageLocation + '[' + str(i) + ']', imageLocation[:-4] + '-' + str(i) + '.png']
check_call(params)
# This doesn't deal with encrypted PDFs (why are there encrypted docs?) - if there turn out to be more than just the one, I will have to figure something out
def extractText(imageLocation,path):
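"""OCR a PDF that pdftotext couldn't extract text from: convert each page to a png with
ImageMagick, run tesseract on each page image, delete the pngs, and return the concatenated text."""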
# convert each page to png separately; PdfFileReader gives the page count, so we know how many page images will be created (named -0.png, -1.png, ... -(n-1).png)
text = ''
with open(imageLocation, 'rb') as pdfFile:
    numPages = PdfFileReader(pdfFile).getNumPages()
for i in range(numPages):
try:
convertGrayscale(imageLocation,i)
text += pytesseract.image_to_string(Image.open(imageLocation[:-4] + '-' + str(i) + '.png'))
except Exception:
    # grayscale conversion (or OCR on the grayscale image) failed; retry in color
convertColor(imageLocation,i)
text += pytesseract.image_to_string(Image.open(imageLocation[:-4] + '-' + str(i) + '.png'))
# delete pngs after text extraction: they are pretty large
for file in glob.glob(os.path.join(path, imageLocation.split('/')[-1][:-4]+'-*.png')):
os.remove(file)
return(text)
def search(loc,searchTerm):
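"""Search the extracted .txt files for the compiled searchTerm regex and return a dict keyed
by bill ID, mapping each document type (introduction, signed act, etc.) to the list of
cleaned-up paragraphs containing the term."""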
parSplit = re.compile('\n\n')
paragraphs = {}
for file in loc:
try:
with open(file[:-3]+'txt') as doc:
content = doc.read()
docParagraphs = []
for paragraph in re.split(parSplit,content):
if searchTerm.findall(paragraph):
# remove extra whitespace
paragraph = re.sub(r'\s+', ' ', paragraph).strip()
# remove newline chars
paragraph = paragraph.replace('\n',' ')
# strip _ chars
paragraph = paragraph.replace('_','')
# strip tab chars (so they don't mess with tsv output)
paragraph = paragraph.replace('\t','')
docParagraphs.append(paragraph)
# maintain dict of dicts showing which paragraphs came from which document associated with the bill (e.g. introduction, signed act)
key = '-'.join(file.split('/')[-1].split('-')[:2])
docType = str(file.split('/')[-1].split('-')[-1][:-4])
# use setdefault so a bill's earlier documents aren't wiped out when another of its documents is processed
paragraphs.setdefault(key, {})[docType] = docParagraphs
except FileNotFoundError:
pass
return paragraphs
# could add a spellchecker to output to fix words where OCR missed a letter or something - would need to make sure it does more good than harm
def saveSearchResults(results,path):
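"""Write the search results to <path>searchResults.tsv, one row per bill document, with the
matching paragraphs in tab-separated columns."""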
# write to tsv so we don't get messed up by commas in original doc (are there tabs that mess us up?)
with open(path+'searchResults.tsv','w') as outFile:
keys = sorted(results.keys())
outFile.write('Bill ID'+'\t'+'Document Type'+'\t'+'paragraphs where search term was found'+'\n')
for key in keys:
docTypes = results[key].keys()
for docType in docTypes:
values = results[key][docType]
outFile.write(key+'\t'+docType+'\t'+'\t'.join(values)+'\n')
def downloadAndSearch(criteria,path,r,searchTerm,docTypes,urlsToDownload):
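"""Run the full pipeline: download/convert the documents, search the extracted text for the
search term, and save the results as a tsv."""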
# download files, convert PDFs to text, create list of file locations
loc = downloadToText(r,path,docTypes,urlsToDownload)
# search downloaded and converted files, pull paragraphs containing search term
results = search(loc,searchTerm)
# save search results to file
saveSearchResults(results,path)
def getSearchCriteria():
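"""Interactively build the advanced-search criteria dict (keyword, category, subcategory,
introducer, co-sponsor, committees, status, council period, date range), plus the download
path and the compiled regex to search the downloaded documents for."""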
keyword = inquirer.prompt([inquirer.Text('Keyword',
message="Search by Legislation number or Legislation title")])
# master lists to find codes for search criteria
# types master list
categories = requests.get(base+'/api/v1/masters/LegislationCategories').json()
legislationCategories = ["All"]
categoryIdLookup = {}
for i in range(len(categories)):
legislationCategories.append(categories[i]['LegislationCategory'])
categoryIdLookup[categories[i]['LegislationCategory']]=categories[i]['Id']
category = inquirer.prompt([inquirer.List('CategoryId',
message="Choose a category of Legislation to search",
choices=legislationCategories)])
# need Id for search criteria; set to 0 if all selected
if category['CategoryId']!="All":
category['CategoryId'] = categoryIdLookup[category['CategoryId']]
else:
category['CategoryId'] = 0
# subcategories belong to each category - I could implement this but is it worth the time? ask David about it
# same is true for Introduced/co-sponsored by/At the request of/Referred to (committee)/Referred to (committee with comments) - these depend on which council period is selected
# master category: subcategories dictionary
subcategories = requests.get(base+'/api/v1/masters/LegislationTypes').json()
legislationSubcategories = {}
for i in categoryIdLookup.values():
legislationSubcategories[i] = []
subcatIdLookup = {}
for i in subcategories:
subcatIdLookup[i['LegislationType']] = i['DisplayOrder']
for i in range(len(subcategories)):
legislationSubcategories[categoryIdLookup[subcategories[i]['LegislationCategory']]].append(subcategories[i]['LegislationType'])
# now need to offer subcat choice based on cat choice (only some have subcategories)
haveSubcats = [categoryIdLookup['Report'],categoryIdLookup['Bill'],categoryIdLookup['Resolution']]
if category['CategoryId'] in haveSubcats:
subcatChoices=["All"]
for subcat in legislationSubcategories[category['CategoryId']]:
subcatChoices.append(subcat)
subcategory = inquirer.prompt([inquirer.List('SubCategoryId',
message="Choose a subcategory of Legislation to search",
choices=subcatChoices)])
# need Id for search criteria; set to 0 if all selected
if subcategory['SubCategoryId']!="All":
subcategory['SubCategoryId']=subcatIdLookup[subcategory['SubCategoryId']]
else:
subcategory['SubCategoryId'] = 0
# Get council period choice
CouncilPeriod = 0
validChoices = list(range(8,23))
validChoices.append("All")
while CouncilPeriod not in validChoices:
CouncilPeriod = input("Please select a Council Period (8 - 22; default is all periods): ") or "All"
if CouncilPeriod !="All":
try:
CouncilPeriod=int(CouncilPeriod)
except ValueError:
print("That wasn't an integer")
if CouncilPeriod == "All":
CouncilPeriod = 0
## can't do at the request of field - there is no API to access options, and they change every council period.
## Could do it by hand but it would take forever
## there are also tons of typos in the lists of options, seems like it doesn't work as intended anyway
# next fill out introduced by and co-sponsored by fields - don't think it is possible (or desirable, you'd have a 100-member dropdown)
# to replicate the active/inactive list of all current/former members when you select All Council Periods in advanced search
members = requests.get(base+'/api/v1/masters/Members/'+str(CouncilPeriod)).json()
names = ["All"]
nameIdLookup = {}
for i in range(len(members)):
names.append(members[i]['Name'])
nameIdLookup[members[i]['Name']]=members[i]['Id']
introducedBy = inquirer.prompt([inquirer.List('Introducer',
message="Search for legislation introduced by",
choices=names)])
# So you can't choose the same person as introducer and co-sponsor
if introducedBy['Introducer']!="All":
names.remove(introducedBy['Introducer'])
introducedBy['Introducer']=nameIdLookup[introducedBy['Introducer']]
else:
introducedBy['Introducer'] = 0
cosponsor = inquirer.prompt([inquirer.List('CoSponsor',
message="Search for legislation co-sponsored by",
choices=names)])
if cosponsor['CoSponsor']!="All":
cosponsor['CoSponsor']=nameIdLookup[cosponsor['CoSponsor']]
else:
cosponsor['CoSponsor'] = 0
# committee criteria - referred to and referred to (w/ comments)
# master list
committees = requests.get(base+'/api/v1/masters/Committees/'+str(CouncilPeriod)).json()
referredToOptions = ["All"]
committeeIdLookup = {}
for i in range(len(committees)):
referredToOptions.append(committees[i]['Name'])
committeeIdLookup[committees[i]['Name']]=committees[i]['Id']
referredTo = inquirer.prompt([inquirer.List('CommitteeId',
message="Search for legislation referred to the",
choices=referredToOptions)])
if referredTo['CommitteeId']!="All":
    # remove the chosen committee from the options before converting its name to an Id,
    # so the same committee can't also be picked for the referred-with-comments criterion
    referredToOptions.remove(referredTo['CommitteeId'])
    referredTo['CommitteeId'] = committeeIdLookup[referredTo['CommitteeId']]
else:
referredTo['CommitteeId'] = 0
try:
referredToOptions.remove('Retained by the Council')
except ValueError:
pass
referredToComments = inquirer.prompt([inquirer.List('CommitteeCommentsId',
message="Search for legislation referred with comments to the",
choices=referredToOptions)])
if referredToComments['CommitteeCommentsId']!="All":
referredToComments['CommitteeCommentsId'] = committeeIdLookup[referredToComments['CommitteeCommentsId']]
else:
referredToComments['CommitteeCommentsId'] = 0
# statuses master list
statuses = requests.get(base+'/api/v1/masters/LegislationStatus').json()
LegislationStatuses = ["All"]
statusIdLookup = {}
for i in range(len(statuses)):
LegislationStatuses.append(statuses[i]['Name'])
statusIdLookup[statuses[i]['Name']] = statuses[i]["DisplayOrder"]
legislationStatus=inquirer.prompt([inquirer.List('LegislationStatus',
message='Choose the status of Legislation to search',
choices=LegislationStatuses)])
if legislationStatus['LegislationStatus']!="All":
legislationStatus['LegislationStatus'] = statusIdLookup[legislationStatus['LegislationStatus']]
else:
legislationStatus['LegislationStatus'] = 0
validDates = 0
validStart=0
validEnd =0
while not validDates:
while not (validStart):
StartDate = input('Enter the beginning of the date range to search in MM/DD/YYYY format: ')
try:
validate(StartDate)
validStart = 1
except ValueError:
print("Date format should be MM/DD/YYYY. Leave blank to include all dates")
pass
while not (validEnd):
EndDate = input('Enter the end of the date range to search in MM/DD/YYYY format: ')
try:
validate(EndDate)
validEnd = 1
except ValueError:
print("Date format should be MM/DD/YYYY. Leave blank to include all dates")
pass
if not (StartDate or EndDate):
validDates = 1
else:
    if StartDate and EndDate:
        if datetime.datetime.strptime(StartDate, '%m/%d/%Y') < datetime.datetime.strptime(EndDate, '%m/%d/%Y'):
            validDates = 1
        else:
            print('End date must be after start date.')
            # re-prompt for both dates rather than looping forever on the stale values
            validStart = 0
            validEnd = 0
    else:
        validDates = 1
# advanced search criteria
# possible categories for advanced search listed at http://lims.dccouncil.us/api/Help/Api/POST-v1-Legislation-AdvancedSearch-rowLimit-offSet
# need to make sure my criteria names match up with API format
# need to change names/text values to their corresponding codes
# CategoryId needs to be CategoryID - Category should be numeric
# If choice is all, need to omit criteria
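# Illustrative example of an assembled payload (the values here are hypothetical):
# {'Keyword': '', 'CategoryId': 1, 'SubCategoryId': 0, 'Introducer': 0, 'CoSponsor': 0,
#  'CommitteeId': 0, 'CommitteeCommentsId': 0, 'LegislationStatus': 0,
#  'CouncilPeriod': 22, 'StartDate': '', 'EndDate': ''}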
if not keyword['Keyword']:
try:
criteria = {**category,**subcategory,**introducedBy,**cosponsor,**referredTo,**referredToComments,**legislationStatus}
# if we didn't pick a category that has subcategories
except NameError:
criteria = {**category,**introducedBy,**cosponsor,**referredTo,**referredToComments,**legislationStatus}
else:
try:
criteria = {**keyword,**category,**subcategory,**introducedBy,**cosponsor,**referredTo,**referredToComments,**legislationStatus}
except NameError:
criteria = {**keyword,**category,**introducedBy,**cosponsor,**referredTo,**referredToComments,**legislationStatus}
# only include the optional criteria that were actually provided
if CouncilPeriod:
    criteria['CouncilPeriod'] = CouncilPeriod
if StartDate:
    criteria['StartDate'] = StartDate
if EndDate:
    criteria['EndDate'] = EndDate
### do the following even if already downloaded
searchTerm = convertToRegex(input("What would you like to search the downloaded documents for? "))
path = Path(input("Where would you like to save downloaded PDFs and extracted text? (default is current working directory): ") or os.getcwd()+'/')
#path = Path('/Users/joshuakaplan/Documents/Georgetown/Spring 2017/Dr Bailey/Scraping/Signed Acts/')
return(criteria, path, searchTerm)
def docTypeList():
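"""Build the list of document-type suffixes to try for each bill (signed act, engrossment,
enrollment, committee reports, amendments, hearing records/notices), since the search API
only returns a URL for the introduction."""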
# once we know this part is pulling every doc, we only need to check for the existence of the introduction
docTypes = ['SignedAct','Engrossment','Enrollment','CommitteeReport','ENROLLMENT']
# special case: how many are like this: http://lims.dccouncil.us/Download/36248/B21-0837-Amendments11.pdf
# amendments go up to 17 (they seem to be numbered wrong for b21-0415), and committee reports go up to 12 (also seem numbered wrong for b20-0198)
for i in range(1,13):
docTypes.append('CommitteeReport'+str(i))
for i in range(1,18):
docTypes.append('Amendment'+str(i))
for i in range(1,4):
for docType in ['HearingRecord','HearingNotice']:
docTypes.append(docType+str(i))
return docTypes
# this doesn't work right; need to compare to the number of bills that actually have URLs/docs
# 57 of the first 1000 results don't have any docs linked.
urlsToDownload = []
# this only checks for introductions and agendas, will miss other files if we have the introduction but nothing else
def checkDownloaded(r,councilPeriod):
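"""Check whether every introduction/agenda PDF in this page of search results is already in
the pdfs/ directory; queue any missing ones in urlsToDownload and return 1 if everything is
present, 0 otherwise."""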
# (?i) has to come at the start of the pattern (newer Python versions require it); it makes the whole match case-insensitive
countTerm1 = re.compile('(?i)([A-Z]+' + str(councilPeriod) + r'-\d*-Introduction)')
countTerm2 = re.compile('(?i)([A-Z]+' + str(councilPeriod) + r'-\d*-Agenda)')
# will we miss recent updates if we start with an offset?
# offset works in blocks of 1000 records i.e. offset of 1 will start at 1001st record.
urls = {}
for bill in r.json():
if bill['DocumentUrl'] and bill['DocumentUrl'].split('/')[-1][:-4] != '':
urls[bill['DocumentUrl'].split('/')[-1][:-4]] = bill['DocumentUrl']
if os.path.isdir('pdfs'):
files = os.listdir('pdfs')
downloaded = countTerm1.findall(' '.join(files))+countTerm2.findall(' '.join(files))
if all(url in downloaded for url in urls.keys()):
return 1
else:
toDownload = [url for url in urls.keys() if url not in downloaded]
for bill in toDownload:
urlsToDownload.append(urls[bill])
return 0
if __name__ == '__main__':
# base url
base = 'http://lims.dccouncil.us'
criteria,path,searchTerm = getSearchCriteria()
print(criteria)
# there seems to be a hard limit of 1000 on the number of search results returned - maybe David can talk to them about raising it
# we can count the number of documents with both [Letters]21 and Introduction in their name to determine the row offset parameter, so we don't have to check that we downloaded everything
# if i want to download more than 1000 in a run i need to make this a loop
offset = 0
r = requests.post(base+'/api/v1/Legislation/AdvancedSearch/1000/'+str(offset),json=criteria)
if criteria.get('CouncilPeriod'):
# can only check 1000 bills at once
# need to add a cutoff to incrementing offset - otherwise runs forever when there are no docs left
while checkDownloaded(r,criteria['CouncilPeriod']):
offset+=1
r = requests.post(base+'/api/v1/Legislation/AdvancedSearch/1000/'+str(offset),json=criteria)
docTypes = docTypeList()
downloadAndSearch(criteria,path,r,searchTerm,docTypes,urlsToDownload)