-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
81 lines (61 loc) · 2.16 KB
/
preprocess.py
File metadata and controls
81 lines (61 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
import re
import smart_open
# Base directory: input folders are globbed under it and all output files
# are written into it.
dirname = './'

# Switch every locale category to the portable "C" locale.
# NOTE(review): presumably so text handling does not depend on the host
# locale -- confirm before removing.
locale.setlocale(locale.LC_ALL, 'C')

# Characters that are replaced by a space when documents are read.
# U+0085 (NEXT LINE) is one of the characters unicode splitlines() breaks on,
# which is presumably why it is removed before lines are split.
# Compare the numeric major version; comparing the sys.version *string*
# is lexicographic and would misclassify a major version of 10 or more.
if sys.version_info[0] >= 3:
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]  # noqa: F821 -- Python 2 only
def normalize_text(text):
    """Lower-case *text* and pad punctuation with spaces.

    The text is lower-cased first, so ``<BR />`` is matched as well as
    ``<br />``; each such marker becomes a single space. Every occurrence
    of ``. " , ( ) ! ? ; :`` is then surrounded by one space on each side,
    so that splitting on whitespace yields the mark as its own item.
    Nothing is removed and runs of spaces are not collapsed.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    for char in ('.', '"', ',', '(', ')', '!', '?', ';', ':'):
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text
import time

# time.clock() was removed in Python 3.8. Use perf_counter() where it exists
# and fall back to clock() only on interpreters that lack it (Python 2).
_timer = getattr(time, 'perf_counter', None) or time.clock
start = _timer()

# The outputs are always regenerated; the disabled cache check was:
# if not os.path.isfile('alldata-id.txt'):

# Concat and normalize test/train data
# folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
folders = ['train']

# The first run of digits in a file name is used as the document id.
_id_pattern = re.compile(r'\d+')

normalized_parts = []
for fol in folders:
    # One output file per input folder, e.g. 'train/pos' -> 'train-pos.txt'.
    output = fol.replace('/', '-') + '.txt'
    doc_lines = []
    # Is there a better pattern to use?
    # Sorted because glob returns matches in arbitrary order; this keeps the
    # line order of the output files reproducible between runs.
    txt_files = sorted(glob.glob(os.path.join(dirname, fol, '*.txt')))
    for txt in txt_files:
        # Look at the file name only, so digits in dirname or the folder
        # name can never be mistaken for the document id.
        id_match = _id_pattern.search(os.path.basename(txt))
        if id_match is None:
            # Without an id the line could not be tagged; report and move on
            # instead of aborting the whole run with an IndexError.
            sys.stderr.write("skipping %s: no numeric id in file name\n" % txt)
            continue
        doc_id = id_match.group(0)
        with smart_open.smart_open(txt, "rb") as t:
            t_clean = t.read().decode("utf-8")
        for c in control_chars:
            t_clean = t_clean.replace(c, ' ')
        # Collapse the document onto one line: one document per output line.
        line = t_clean.strip().replace("\n", " ").replace("\r", " ")
        if line:
            # Each non-empty document becomes "_*<id> <text>\n".
            doc_lines.append(u"_*" + doc_id + u" " + line + u"\n")
    # Joined once instead of growing a string with += per document.
    temp_norm = normalize_text(u''.join(doc_lines))
    with smart_open.smart_open(os.path.join(dirname, output), "wb") as n:
        n.write(temp_norm.encode("utf-8"))
    normalized_parts.append(temp_norm)

# Normalized text of every folder, in folder order.
alldata = u''.join(normalized_parts)

with smart_open.smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:
    for line in alldata.splitlines():
        # num_line = u"_*{0} {1}\n".format(idx, line)
        num_line = u"{0}\n".format(line)
        f.write(num_line.encode("utf-8"))

end = _timer()
print ("total running time: ", end-start)