-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgutenberg_cleanup.py
210 lines (154 loc) · 6.81 KB
/
gutenberg_cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import glob, re, os
from bs4 import BeautifulSoup
#
# `get_formatted_number` and `RdfParser` are from
# https://github.com/kiwix/gutenberg
#
def get_formatted_number(num):
    """
    Tag a negative year string with a trailing 'BC'.

    `num` is a string that may or may not actually contain a number.

    returns: None when `num` is empty or None; `num` followed by ' BC'
             when it contains a '-' and, once every '-' and the surrounding
             whitespace are removed, consists of digits only; `num`
             unchanged in every other case.
    """
    if not num:
        return None

    # Both checks are evaluated up front (no short-circuit).
    has_minus = '-' in num
    digits_only = num.replace('-', '').strip().isdigit()

    if not (has_minus and digits_only):
        return num
    return ' '.join((num, 'BC'))
class RdfParser():
    """
    Extract book metadata from the RDF record of a Gutenberg book.

    Adapted from https://github.com/kiwix/gutenberg.
    Create an instance with the RDF markup and the book ID, then call
    `parse()`; the results are stored as attributes on the instance.
    """

    def __init__(self, rdf_data, gid):
        """
        Store the inputs; nothing is parsed until `parse()` is called.
        Inputs:
        `rdf_data`: the RDF markup of the book record (handed to BeautifulSoup)
        `gid`: the Gutenberg book ID
        """
        self.rdf_data = rdf_data
        self.gid = gid

        self.author_id = None
        self.author_name = None
        self.first_name = None
        self.last_name = None

    @staticmethod
    def _split_title(raw_title):
        """Split a raw title on new-lines into a (title, subtitle) pair."""
        title_lines = raw_title.split('\n')
        return title_lines[0], ' '.join(title_lines[1:])

    @staticmethod
    def _join_first_names(name_parts):
        """Join every comma-separated part after the first one, in reverse order."""
        stripped = (part.strip() for part in name_parts[:0:-1])
        return ' '.join(part for part in stripped if part)

    @staticmethod
    def _text_or_none(node):
        """Return the text of a parsed node, or None when the node is missing."""
        return node.text if node is not None else None

    def parse(self):
        """
        Parse `self.rdf_data` and fill in the metadata attributes.

        Sets `title`, `subtitle`, `author`, `author_id`, `author_name`,
        `first_name`, `last_name`, `birth_year`, `death_year`, `language`,
        `downloads`, `license` and `file_types`. Fields whose node is
        missing from the record are left as None.

        returns: the parser itself, so calls can be chained.
        """
        soup = BeautifulSoup(self.rdf_data, 'lxml', from_encoding='utf-8')

        # The title of the book: this may or may not be divided
        # into a new-line-separated title and subtitle.
        # The split is done on the full text, so that the subtitle
        # is not lost when the title is cut down to its first line.
        title_node = soup.find('dcterms:title')
        raw_title = title_node.text if title_node is not None else '- No Title -'
        self.title, self.subtitle = self._split_title(raw_title)

        self.author_id = None
        # Parsing the name of the Author. Sometimes it's the name of
        # an organization or the name is not known and therefore
        # the <dcterms:creator> or <marcrel:com> node only return
        # "anonymous" or "unknown". For the case that it's only one word
        # `self.first_name` will be null.
        # Because of a rare edge case that the field of the parsed author's name
        # has more than one comma we will join the first name in reverse, starting
        # with the second item (the first item is the last name).
        self.author = soup.find('dcterms:creator') or soup.find('marcrel:com')
        if self.author:
            agent = self.author.find('pgterms:agent')
            agent_attrs = getattr(agent, 'attrs', {})
            if 'rdf:about' in agent_attrs:
                self.author_id = agent_attrs['rdf:about'].split('/')[-1]

            name_node = self.author.find('pgterms:name')
            if name_node:
                self.author_name = name_node.text.split(',')
                if len(self.author_name) > 1:
                    self.first_name = self._join_first_names(self.author_name)
                self.last_name = self.author_name[0]

        # Parsing the birth and (death, if the case) year of the author.
        # These values are likely to be null.
        self.birth_year = get_formatted_number(
            self._text_or_none(soup.find('pgterms:birthdate')))
        self.death_year = get_formatted_number(
            self._text_or_none(soup.find('pgterms:deathdate')))

        # ISO 639-3 language codes that consist of 2 or 3 letters
        try:
            self.language = soup.find('dcterms:language').find('rdf:value').text
        except AttributeError:
            self.language = None

        # The download count of the books on www.gutenberg.org.
        # This will be used to determine the popularity of the book.
        # None when the record carries no download count.
        self.downloads = self._text_or_none(soup.find('pgterms:downloads'))

        # The book might be licensed under GPL, public domain
        # or might be copyrighted. None when the record states no rights.
        self.license = self._text_or_none(soup.find('dcterms:rights'))

        # Finding out all the file types this book is available in,
        # keyed by file name; zip archives are left out.
        self.file_types = {}
        for file_node in soup.find_all('pgterms:file'):
            mime_type = file_node.find('rdf:value').text
            if not mime_type.endswith('application/zip'):
                file_name = file_node.attrs['rdf:about'].split('/')[-1]
                self.file_types[file_name] = mime_type

        return self
# Patterns used by `get_text`, compiled once at import time.
_TAG_RE = re.compile(r'<[^>]*>')             # one tag at a time, may span lines
_NEWLINE_RE = re.compile(r'\r?\n|\r')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_PUNCTUATION_RE = re.compile(r'[^a-zA-Z0-9\s]')
_NUMBERS_RE = re.compile(r'[0-9]+')
_EMPTY_SPACE_RE = re.compile(r'\s+')

# Lower-case umlauts and their two-letter transliteration
# (the text is lower-cased before these are applied).
_UMLAUTS = (('\u00e4', 'ae'), ('\u00f6', 'oe'), ('\u00fc', 'ue'))


def get_text(html_path):
    """Clean HTML tags, escape characters, special unicode, punctuation, numbers and empty spaces from the raw html

    The file is decoded as UTF-8; if that fails it is decoded as Latin-1.

    Inputs:
    `html_path`: path to the raw html Gutenberg project book file
    Outputs:
    a string of cleaned, lower-case text
    """
    with open(html_path, 'rb') as f:
        raw = f.read()
    try:
        source = raw.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this cannot fail
        source = raw.decode('latin-1')

    # lower case
    cleaned = source.lower()

    # remove tags; each tag is matched on its own so that the text
    # between two tags on the same line is kept
    cleaned = _TAG_RE.sub('', cleaned)

    # remove escape characters
    cleaned = _NEWLINE_RE.sub(' ', cleaned)

    # replace umlauts (must happen before the non-ascii removal)
    for umlaut, replacement in _UMLAUTS:
        cleaned = cleaned.replace(umlaut, replacement)

    # remove all non-ascii
    cleaned = _NON_ASCII_RE.sub(' ', cleaned)

    # remove punctuation
    cleaned = _PUNCTUATION_RE.sub('', cleaned)

    # remove numbers
    cleaned = _NUMBERS_RE.sub('', cleaned)

    # remove mdash (what is left of `&mdash;` once the punctuation is gone);
    # done before collapsing whitespace so that no double spaces remain
    cleaned = cleaned.replace('mdash', ' ')

    cleaned = _EMPTY_SPACE_RE.sub(' ', cleaned)

    # when returning, remove also the left and right space padding
    return cleaned.strip()
def get_gid(html_path):
    """
    Work out the Gutenberg book ID (`gid`) from the name of a raw html file.

    The ID is the first run of digits found in the file name, with the
    directory part and the extension left out.

    Input:
    `html_path`: path to the html Gutenberg book file
    Output:
    `gid`: the ID of the Gutenberg book, as an integer
    """
    file_name = os.path.basename(html_path)
    stem, _extension = os.path.splitext(file_name)
    digit_runs = re.findall('[0-9]+', stem)
    first_run = digit_runs[0]
    return int(first_run)
def get_metadata(gid, rdf_path='/cluster/work/sdid/roskarr/gutenberg/rdf-files/'):
    """Extract the metadata from the Gutenberg book represented by the gid.

    The RDF record is the first file (in sorted order) found in the
    directory `<rdf_path>/<gid>/`.

    Inputs:
    `gid`: the Gutenberg project book ID
    Optional Keywords:
    `rdf_path`: path to the Gutenberg project RDF projects, with or
                without a trailing path separator
    Outputs:
    dictionary containing the book metadata, with the keys `gid`, `title`,
    `first_name`, `last_name`, `birth_year`, `death_year`, `lang` and
    `downloads`
    Raises:
    IndexError if the directory of the book holds no file
    """
    gid = str(gid)

    # os.path.join copes with `rdf_path` given with or without a trailing
    # separator; sorting makes the choice deterministic if several files exist
    candidates = sorted(glob.glob(os.path.join(rdf_path, gid, '*')))
    rdf_file = candidates[0]

    # read raw bytes: RdfParser tells BeautifulSoup the encoding is UTF-8,
    # which is only honoured for undecoded input
    with open(rdf_file, 'rb') as f:
        rdf_data = f.read()

    rp = RdfParser(rdf_data, gid).parse()

    return {'gid': int(rp.gid), 'title': rp.title,
            'first_name': rp.first_name, 'last_name': rp.last_name,
            'birth_year': rp.birth_year, 'death_year': rp.death_year,
            'lang': rp.language, 'downloads': rp.downloads}