-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert.py
47 lines (39 loc) · 1.15 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import copy
import json
import os
import re
import string
import sys
# Load every entry of ./raw_books into memory as (file name, full text) pairs,
# in the order os.listdir returns them.  Each file is read inside a `with`
# block so its handle is closed immediately instead of being left for the
# garbage collector.
# NOTE(review): the paragraph splitting further down keys on '\r\n'; text-mode
# reads on Python 3 translate line endings by default -- confirm the target
# interpreter/platform keeps them as the rest of the script expects.
raw_books = []
for _book_name in os.listdir('./raw_books'):
    with open(os.path.join('raw_books', _book_name), 'r') as _book_file:
        raw_books.append((_book_name, _book_file.read()))
# Markers delimiting the body of a book: a line starting with "***" followed
# by an optional single whitespace character and "START" / "END".  With re.M
# the match runs to the end of that line, so splitting on these patterns
# removes the whole marker line.  Raw strings keep the backslashes intact for
# the regex engine instead of relying on unrecognised string escapes.
header = re.compile(r'\*\*\*\s?START.*$', re.M)
footer = re.compile(r'\*\*\*\s?END.*$', re.M)

# Field name -> pattern whose first group captures the rest of a
# "Title: ..." / "Author: ..." line.  The capture may carry a trailing '\r'
# on CRLF text; callers are expected to strip it.
metadata = {
    'title': re.compile(r'Title:\s?(.*)$', re.M),
    'author': re.compile(r'Author:\s?(.*)$', re.M),
}
# Paragraph and line separators.  Both accept CRLF as well as bare LF, so the
# split still works when the text was read with newline translation; on pure
# CRLF input they behave exactly like splitting/replacing on '\r\n'.
_PARAGRAPH_BREAK = re.compile(r'(?:\r?\n){2}')
_LINE_BREAK = re.compile(r'\r?\n')

# Paragraphs whose raw (unstripped) length is outside this inclusive range
# are left out of the output.
_MIN_PARAGRAPH_CHARS = 100
_MAX_PARAGRAPH_CHARS = 500

# Convert each loaded book into gae/books/<id>.json containing the metadata
# found in the text, the retained paragraphs and their count.  One summary
# line "<id>: <kept> <skipped>" is printed per book.
for fname, book in raw_books:
    data = {}

    # Record whichever metadata fields are present; missing ones are omitted.
    for field, pattern in metadata.items():
        match = pattern.search(book)
        if match:
            data[field] = match.group(1).strip()

    # The id is the file name minus its first two and last four characters
    # (presumably a two-letter prefix and a ".txt"-style extension -- confirm
    # against the contents of raw_books).  A stray file that does not fit is
    # reported and skipped rather than aborting the whole run.
    try:
        data['id'] = int(fname[2:-4])
    except ValueError:
        sys.stderr.write('skipping %s: no numeric id in file name\n' % (fname,))
        continue

    # Keep only the text between the START marker line and the END marker
    # line.  Without a START marker there is no body to extract.
    sections = header.split(book)
    if len(sections) < 2:
        sys.stderr.write('skipping %s: no START marker found\n' % (fname,))
        continue
    content = footer.split(sections[1])[0]

    data['data'] = []
    skipped = 0  # paragraphs rejected for being too short OR too long
    for raw_paragraph in _PARAGRAPH_BREAK.split(content.strip()):
        if not _MIN_PARAGRAPH_CHARS <= len(raw_paragraph) <= _MAX_PARAGRAPH_CHARS:
            skipped += 1
            continue
        # Join the hard-wrapped lines of the paragraph into a single line.
        text = _LINE_BREAK.sub(' ', raw_paragraph.strip())
        data['data'].append({
            'text': text,
            'characters': len(text),
            # Number of '.'-separated pieces, i.e. count of '.' plus one.
            'sentences': len(text.split('.')),
        })
    data['paragraphs'] = len(data['data'])

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('%s: %s %s' % (data['id'], data['paragraphs'], skipped))

    # json.dumps returns text, so the file is opened in text mode; the `with`
    # block closes it even if the write fails.
    with open("gae/books/%s.json" % (data['id'],), 'w') as fobj:
        fobj.write(json.dumps(data))