-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathproxy-ebooks.py
executable file
·119 lines (99 loc) · 3.5 KB
/
proxy-ebooks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
"""
given MARC file:
- proxy certain vendor 856$u URLs
- delete other weird URLs that we don't need
- delete all 856$z subfields (make sense in our OPAC)
outputs to new MARC file. File paths are hardcoded in the "books"
and "processed" variables below.
"""
from pymarc import MARCReader, MARCWriter
# MARC transmission files are binary data, so both handles are opened in
# binary mode; text mode can alter bytes on platforms that translate newlines
# and is rejected outright for byte-oriented readers/writers on Python 3.
books = MARCReader(open('MARC/2013-12-13-full-catalog.MRC', 'rb'),
                   to_unicode=True, force_utf8=True, utf8_handling='ignore')
# open() replaces the Python 2-only file() builtin
processed = MARCWriter(open('TEST.MRC', 'wb'))
# cap on how many records get processed, so that test runs finish quickly
limit = 300000
# number of records written so far; compared against limit in the main loop
i = 0
# running totals, reported once processing has finished
num_total_books = num_proxied_ebooks = num_fields_removed = 0
# Hosts that may appear in an 856 $u and correspond to an ebook subscription
# we have. Matching is a case-sensitive substring test against the URL, which
# is why netLibrary is listed in two spellings.
subscription_domains = [
    'hdl.handle.net',  # ACLS Humanities
    'galenet.galegroup.com',  # GVRL
    'find.galegroup.com',  # Gale (correct spelling of the host)
    'find.galengroup.com',  # misspelled entry kept so matching stays a superset
    'www.netlibrary.com',  # NetLibrary / EBSCO
    'www.netLibrary.com',  # case sensitive
    'search.ebscohost.com',
    'web.ebscohost.com',
    'online.statref.com',  # STAT Ref
    'ebooks.greenwood.com',  # ABC-CLIO / Greenwood
    'ebooks.abc-clio.com',
    'historyreferenceonline.abc-clio.com',
    'dx.doi.org',  # Springer
    'www.credoreference.com',  # Credo
    'ovidsp.ovid.com',  # Ovid
]
# URL fragments (a host, sometimes followed by a path prefix) marking links
# that are weird or useless to us; an 856 whose $u contains any of these is
# removed from the record entirely.
weird_domains = [
    # Library of Congress: TOCs, bios, descriptions, etc.
    'www.loc.gov/catdir/',
    'lcweb.loc.gov/catdir/',
    'catdir.loc.gov/catdir/',
    # ebrary: we don't subscribe, these look like previews
    'site.ebrary.com/lib/',
    'www.ebrary.com',
    # spam from publisher
    'www.josseybass.com',
    # book reviews
    'www.e-streams.com',
    # login links
    'www.nursespdr.com',
    'firstsearch.oclc.org',
    # book reviews
    'bvbr.bib-bvb.de:8991',
    # broken links
    'lib.myilibrary.com',
    'www.myilibrary.com',
    # one broken link
    'fermat.nap.edu',
    # unneeded previews
    'books.google.com',
    # ToCs from Ebook library
    'public.eblib.com',
    # book reviews
    'www.h-net.org',
    # broken links
    'library2.simpsonuniversity.edu',
    # preview content
    'www.contentreserve.com',
    'images.contentreserve.com',
    # book reviews
    'edrev.asu.edu',
    # book cover
    'catalogimages.wiley.com',
]
# update 856 $u with proxy prefix
def prefix(field):
    """Put the proxy login prefix in front of the field's first $u value.

    field -- an 856 field object offering get_subfields(), delete_subfield()
             and add_subfield()

    Nothing happens when the field has no $u, or when its first $u already
    starts with the proxy prefix, so calling this repeatedly on the same
    field cannot stack prefixes. The rewritten $u is re-added with
    add_subfield(), exactly as before. Returns None.
    """
    proxy = "http://ccproxy.idm.oclc.org/login?url="
    urls = field.get_subfields('u')
    if not urls:
        # no URL to proxy; avoids concatenating the prefix with a missing value
        return
    url = urls[0]
    if url.startswith(proxy):
        # already proxied, leave untouched
        return
    field.delete_subfield('u')
    field.add_subfield('u', proxy + url)
# strip a $z subfield from an 856 field
def deleteZ(field):
    """Ask field to delete a 'z' subfield via a single delete_subfield call.

    The main loop invokes this once for every $z value it finds on the
    field. Nothing is returned.
    """
    subfield_code = 'z'
    field.delete_subfield(subfield_code)
def _matches_any(urls, domains):
    """Return True when any string in urls contains any string in domains."""
    return any(domain in url for url in urls for domain in domains)


# Walk the catalog: drop unwanted 856 fields, proxy subscription URLs, strip
# every 856 $z, and write each processed record to the output file.
try:
    for rec in books:
        if i >= limit:
            # testing cap reached; stop instead of reading the rest of the file
            break
        num_total_books += 1
        # get_fields() is called once up front, so removing fields from rec
        # below does not disturb this iteration
        for field in rec.get_fields('856'):
            urls = field.get_subfields('u')
            # An 856 is useless without $u, and unwanted when a $u points at
            # one of the weird domains. Either way remove it exactly once,
            # count it once, and do no further work on it.
            if not urls or _matches_any(urls, weird_domains):
                rec.remove_field(field)
                num_fields_removed += 1
                continue
            # Proxy and count once per field, however many URL/domain pairs
            # match. NOTE(review): prefix() acts on a single $u of the field;
            # confirm that is sufficient for 856s carrying several $u values.
            if _matches_any(urls, subscription_domains):
                print(rec.title())
                prefix(field)
                num_proxied_ebooks += 1
            # deleteZ() issues one delete per call, so call it once per $z
            for _ in field.get_subfields('z'):
                deleteZ(field)
        processed.write(rec)
        i += 1
finally:
    # close the output even if a record fails part-way through
    processed.close()
# Summary of the run. Each line is built as one string and passed to print in
# parentheses, which produces the same output under the Python 2 print
# statement and the Python 3 print function.
print("\n")
print("Total Records Processed: %s" % num_total_books)
print("Ebooks Proxied: %s" % num_proxied_ebooks)
print("856s Deleted: %s" % num_fields_removed)
print("\n")