#!/usr/local/bin/python3
"""Match pubs in a set of alerts with pubs in a library and generate a
processed list of pubs from the alerts.
The processed list is then used to help add pubs to the target library.
"""
import argparse
import urllib.parse

import alert
import alert_sources
import email_alert
import lib_types
import known_pub_db
import html_report
import pub_match

DOT_PROXY_OPTION = "dot"
DASH_PROXY_OPTION = "dash"
PROXY_SEPARATOR_OPTIONS = {
    DOT_PROXY_OPTION: ".",
    DASH_PROXY_OPTION: "-",
}

# Globals
args = None
lib_module = None
input_lib = None


def get_args():
    """Parse and return the command line arguments."""
    arg_parser = argparse.ArgumentParser(
        description=(
            "Match newly reported pubs with pubs in a library, and/or "
            + "pubs from earlier runs, and generate an actionable report "
            + "that can be used to add the new pubs to a library."))
    arg_parser.add_argument(
        "--libtype", required=True,
        help=(
            "The type of library we are reading in, and that the "
            + "actionable report will be used to update. Options are "
            + lib_types.get_lib_types_as_text_list() + "."))
    arg_parser.add_argument(
        "--inputlibpath", required=True,
        help="Path to the library.")
    arg_parser.add_argument(
        "--onlineliburl", required=True,
        help=(
            "Base URL of the online version of the library. Used to "
            + "generate links."))
    arg_parser.add_argument(
        "--email", required=False,
        help="Email account to pull notifications from.")
    arg_parser.add_argument(
        "--mailbox", required=False,
        help=(
            "Optional mailbox within the email account to limit "
            + "notifications from."))
    arg_parser.add_argument(
        "--imaphost", required=False,
        help=(
            "Address of --email's IMAP server. For GMail this is "
            + "imap.gmail.com."))
    arg_parser.add_argument(
        "--since", required=True,
        help=("Only look at alerts from after this date."
              + " Format: DD-Mon-YYYY. Example: 01-Dec-2014."))
    arg_parser.add_argument(
        "--before", required=False,
        help=("Optional. Only look at alerts before this date."
              + " Format: DD-Mon-YYYY. Example: 01-Jan-2015."))
    arg_parser.add_argument(
        # This is mapped to a list before returning to caller.
        "--sources", required=True,
        help=(
            "Which alert sources to process. Either 'all' or a "
            + "comma-separated list (no spaces) drawn from these sources: "
            + alert_sources.get_alert_sources_as_text_list() + "."))
    arg_parser.add_argument(
        "--proxy", required=False,
        help=(
            "String to insert in URLs to access pubs through your paywall. "
            + "For Johns Hopkins, for example, this is: "
            + "'.proxy1.library.jhu.edu'"))
    arg_parser.add_argument(
        "--proxyseparator", required=False,
        help=(
            "Some proxies replace dots in the original pub URL with dashes. "
            + "Default is dot."),
        choices=PROXY_SEPARATOR_OPTIONS.keys(), default=DOT_PROXY_OPTION)
    arg_parser.add_argument(
        "--customsearchurl", required=False,
        help=(
            "URL to use for custom searches. The title of the publication "
            + "will be added to the end of this URL. "
            + "For Johns Hopkins, for example, this is: "
            + "'https://catalyst.library.jhu.edu/"
            + "?utf8=%E2%9C%93&search_field=title&'"))
    # TODO: This should accept multiple custom search URLs in a
    # comma-separated list.
    # TODO: Should also accept search URL strings where the title can be
    # embedded anywhere in the string.
    arg_parser.add_argument(
        "--knownpubsin", required=False,
        help=(
            "Read in curation history, most likely from a previous run. "
            + "Tells you what you've seen before."))
    arg_parser.add_argument(
        "--knownpubsout", required=False,
        help="Create a history of this run in TSV format as well.")
    arg_parser.add_argument(
        "--okduplicatetitles", required=False,
        help=(
            "Text file containing duplicate titles that have been reviewed "
            + "and are in fact not duplicate titles. These will not get "
            + "reported as duplicates."))
    arg_parser.add_argument(
        "--excludesearches", required=False,
        help=(
            "Exclude searches look for matches that we want to exclude "
            + "from our results. These are useful because it is sometimes "
            + "easier to list each exclude in its own separate search "
            + "than to include all the excludes in every search (and "
            + "sometimes we can't make the search that long)."))
    arg_parser.add_argument(
        "--curationpage", required=True,
        help="Where to put the HTML page listing all the pubs.")
    # Disabled for now because it generates too much noise:
    """
    arg_parser.add_argument(
        "--verify1stauthors", required=False,
        action="store_true",
        help="When we have a paper from more than one source, check that"
             + " the two sources have the same first author. This is noisy.")
    """
    args = arg_parser.parse_args()
    # Split comma-separated list of sources.
    args.sources = args.sources.split(",")
    return args
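# For example (hypothetical source names): "--sources ncbi,wos" arrives as
# the single string "ncbi,wos" and leaves get_args() as ["ncbi", "wos"];
# "--sources all" becomes ["all"], which match_pubs() later expands to
# every source in alert_sources.ALERT_SOURCES.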


def get_pub_proxy_url(pub_url, proxy, proxy_separator):
    """Given the URL to a pub in its native habitat, return a URL that links
    to the pub through the given proxy.

    If the pub's URL is, say:
        "https://thisandthat.org/paper/etc"
    then this function will return
        "https://thisandthat.org" + proxy + "/paper/etc"
    or
        "https://thisandthat-org" + proxy + "/paper/etc"

    If the pub does not have a URL, then None is returned.
    """
    if pub_url:
        url_parts = pub_url.split("/")
        # url_parts[2] is the host name; swap its dots for the proxy's
        # separator, then splice the proxy in right after it.
        url_parts[2] = url_parts[2].replace(".", proxy_separator)
        proxy_url = (
            "/".join(url_parts[0:3]) + proxy + "/" + "/".join(url_parts[3:]))
    else:
        proxy_url = None
    return proxy_url
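# Illustrative behavior of get_pub_proxy_url, using the proxy string from
# the --proxy help text (the pub URL is made up):
#
#   get_pub_proxy_url("https://thisandthat.org/paper/etc",
#                     ".proxy1.library.jhu.edu", ".")
#   -> "https://thisandthat.org.proxy1.library.jhu.edu/paper/etc"
#
#   get_pub_proxy_url("https://thisandthat.org/paper/etc",
#                     ".proxy1.library.jhu.edu", "-")
#   -> "https://thisandthat-org.proxy1.library.jhu.edu/paper/etc"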


def pub_match_link_list_html(pub_match):
    """Generate a list of links in HTML format for a PubMatch that can be
    used to help curate it.

    This is used as a callback from routines that print information about
    a pub_match. It's here because this module has most of the knowledge
    about what goes in those links.
    """
    global lib_module, args, input_lib
    output = []
    output.append('<p>Links</p>')
    output.append('<ul>')
    pub_url = pub_match.get_pub_url()
    if pub_match.is_new():
        if pub_url:
            # Generate link to add pub to library.
            add_link = lib_module.gen_add_pub_html_link(pub_url)
            if add_link:    # Not all libraries support URLs to add a pub.
                output.append('<li> {0} </li>'.format(add_link))
    elif pub_match.is_lib_pub():
        # Pub is already in library. Link to that entry.
        output.append(
            '<li> <a href="{0}" target="zot"> See pub at {1}</a></li>'.format(
                input_lib.gen_pub_url_in_lib(
                    pub_match.get_lib_pub()), lib_module.SERVICE_NAME))
    if pub_url:
        # Link to pub in its native habitat.
        output.append(
            '<li> <a href="{0}" target="nativepub">See pub</a></li>'.format(
                pub_url))
    if args.proxy and pub_url:
        # Link to pub behind proxy.
        output.append(
            ('<li> <a href="{0}" target="proxypub">'
             + 'See pub via proxy</a></li>').format(
                get_pub_proxy_url(
                    pub_url, args.proxy,
                    PROXY_SEPARATOR_OPTIONS[args.proxyseparator])))
    # Search for pub in several places.
    pub_title = pub_match.get_pub_title()
    # URL-encode the title once; yields a string like "q=Some+Pub+Title".
    pub_title_urlencoded = urllib.parse.urlencode({"q": pub_title})
    # Quote the bare title too, so spaces and punctuation survive when it
    # is embedded directly in a search URL.
    pub_title_quoted = urllib.parse.quote_plus(pub_title)
    if args.customsearchurl:
        # Search for title at custom search site.
        custom_search_site = "/".join(args.customsearchurl.split("/")[0:3])
        output.append(
            ('<li> <a href="{0}" target="custom-search">'
             + 'Search for pub at {1} </a></li>').format(
                args.customsearchurl + pub_title_urlencoded,
                custom_search_site))
    # Search for title at Google.
    output.append(
        ('<li> <a href="https://www.google.com/search?q={0}" '
         + 'target="googletitlesearch">Search Google</a></li>').format(
            pub_title_quoted))
    # Search for title at Google Scholar.
    output.append(
        ('<li> <a href="https://scholar.google.com/scholar?q={0}" '
         + 'target="googlescholarsearch">'
         + 'Search Google Scholar</a></li>').format(pub_title_quoted))
    # Search PubMed.
    output.append(
        ('<li> <a href="https://www.ncbi.nlm.nih.gov/pubmed/?term={0}" '
         + 'target="pubmedtitlesearch">Search PubMed</a></li>').format(
            pub_title_quoted))
    output.append('</ul>')
    output.append('</div>')    # Closes a div opened by the caller.
    return output
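# A sketch of what this can emit for a brand-new pub that has a URL (the
# add link and URLs below are illustrative, not literal output):
#
#   <p>Links</p>
#   <ul>
#   <li> <a href="...">(library-specific add-pub link)</a> </li>
#   <li> <a href="https://thisandthat.org/paper/etc" target="nativepub">See pub</a></li>
#   <li> <a href="https://www.google.com/search?q=Some+Title" target="googletitlesearch">Search Google</a></li>
#   ...
#   </ul>
#   </div>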


def match_pubs(command_line_args):
    """Match up pubs across all provided sources."""
    global lib_module, input_lib, args
    args = command_line_args

    # Read in publication library, from, for example, Zotero.
    lib_module = lib_types.get_lib_module(args.libtype)
    input_lib = lib_module.PubLibrary(args.inputlibpath, args.onlineliburl)

    # Get the annotation history database. This includes pubs we added to the
    # library, pubs we decided to ignore, and pubs we are still deciding on.
    known_pubs_db = None
    if args.knownpubsin:
        known_pubs_db = known_pub_db.KnownPubDB(args.knownpubsin)

    # Read in the list of exclude searches; will be empty if there are none.
    exclude_db = alert.ExcludeAlertsDB(args.excludesearches)

    # Get all alerts.
    pub_alerts = []

    # Open connection to alert source.
    email_connection = None
    if args.email:
        email_connection = email_alert.AlertSource(
            account=args.email, imaphost=args.imaphost)

    # Go through each source and pull in all alerts.
    source_ids = args.sources
    if source_ids[0] == 'all':
        source_ids = alert_sources.ALERT_SOURCES
    for source_id in source_ids:
        source_module = alert_sources.get_alert_source_module(source_id)
        if source_module.IS_EMAIL_SOURCE:
            connection = email_connection
        else:
            connection = None
        if connection is None:
            # Either a non-email source (none are implemented yet) or
            # --email was not given; nothing to pull from.
            continue
        connection.module = source_module
        # Get every pub_alert from that source.
        pub_alerts += (
            connection.get_pub_alerts(
                connection.module.SENDERS, mailbox=args.mailbox,
                since=args.since, before=args.before))

    # Identify which pub_alerts are exclude alerts, and label them as such.
    for pa in pub_alerts:
        pa.alert.exclude = exclude_db.is_an_exclude_alert(pa.alert)

    ok_dup_titles = []    # List of titles that it's ok to have duplicates of.
    if command_line_args.okduplicatetitles:
        with open(command_line_args.okduplicatetitles, 'r') as dup_titles_file:
            for title in dup_titles_file:
                # Strip the trailing newline so titles compare cleanly.
                ok_dup_titles.append(title.strip())

    # We now have a library, a list of pubs we have seen before, and new pub
    # alerts to match against each other. Create a matchup DB.
    pub_matchups = pub_match.PubMatchDB(
        pub_library=input_lib, pub_alerts=pub_alerts,
        known_pubs_db=known_pubs_db, ok_dup_titles=ok_dup_titles)

    # Print out any matchups that have new pub_alerts.
    with open(command_line_args.curationpage, 'w') as curation_page:
        curation_page.write(html_report.gen_header())
        curation_page.write(
            pub_matchups.matchups_with_pub_alerts_to_html(
                exclude_db, pub_match_link_list_html))
        curation_page.write(html_report.gen_footer())

    if args.knownpubsout:
        # Update known pubs DB to include any pubs we didn't know about
        # before. What didn't we know about before? Anything without a
        # known pub.
        if not known_pubs_db:
            known_pubs_db = known_pub_db.KnownPubDB()
        new_pubs = pub_matchups.get_matchups_without_known_pub()
        known_pubs_db.add_pubs_from_matchups(new_pubs)
        known_pubs_db.write_db_to_file(args.knownpubsout)

    return None


# MAIN
if __name__ == '__main__':
    command_line_args = get_args()
    match_pubs(command_line_args)
# And we be done.