preprocess.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Processes texts of any kind to build a dictionary of the stems of the
# words found, together with their frequency of occurrence.
#
# Everything that is not a word of a minimum length is discarded, as are
# words that carry no lexical content (stopwords).
#
# Author: Jean-Francois Kener
#
# 2017
#
from __future__ import print_function  # keep print() usable on Python 2 and 3
from six.moves import cPickle as pickle
import os, glob, re
from xml.sax.saxutils import unescape  # resolves HTML entities (&...;)
from collections import defaultdict

# Configuration
verbose = True
dataPath = './Data'
stopwordsFile = './External/spanish_stopwords.txt'
minWordLen = 3
generateTrainData = False
generateTestData = True
testInput = './Test/test.txt'
testOutput = './Test/test.pickle'

def CleanText(texto):
    # Strip HTML tags
    re_clean_html = re.compile('<.*?>')
    output = re.sub(re_clean_html, ' ', texto)
    output = unescape(output)  # resolve HTML entities (&...;)
    # Replace everything that is not a letter with a space
    output = re.sub(r'[^a-zA-ZÁÉÍÓÚáéíóúñç]', ' ', output)
    # Collapse runs of spaces into a single space
    output = re.sub(r'[\ ]{2,}', ' ', output)
    # Lowercase everything and strip accents from vowels
    output = output.lower()
    acentos = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u'}
    for key in acentos:
        output = output.replace(key, acentos[key])
    return output
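
# Illustrative sketch, not part of the original script: assuming Python 3 unicode
# strings, CleanText is expected to behave roughly like this hypothetical call:
#   CleanText("<b>Educación</b> & ciencia")  ->  " educacion ciencia"
#   (tags and entities removed, non-letters replaced by spaces, lowercased, accents stripped)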

def Lemmatizador(tokens):
    # Crude rule-based stemmer for Spanish: strips attached pronouns and
    # common derivational/inflectional suffixes from each token.
    global minWordLen
    output = []
    for palabra in tokens:
        corregida = palabra
        # Rule 1: drop enclitic pronouns attached to gerunds/infinitives
        # (e.g. "-ándose", "-arlo"), keeping at least minWordLen characters
        regla1 = r'((i[eé]ndo|[aá]ndo|[aáeéií]r|[^u]yendo)(sel[ao]s?|l[aeo]s?|nos|se|me))'
        step1 = re.search(regla1, corregida)
if step1:
if (len(palabra)-len(step1.group(1))) >= minWordLen:
corregida = corregida[:-len(step1.group(1))]
elif (len(palabra)-len(step1.group(3))) >= minWordLen:
corregida = corregida[:-len(step1.group(3))]
        # Rule 2: map derivational suffixes to their replacement (applied in sequence)
        regla2 = {
'(anzas?|ic[oa]s?|ismos?|[ai]bles?|istas?|os[oa]s?|[ai]mientos?)$' : '',
'((ic)?(adora?|ación|ador[ae]s|aciones|antes?|ancias?))$' : '',
'(log[íi]as?)$' : 'log',
'(ución|uciones)$' : 'u',
'(encias?)$' : 'ente',
'((os|ic|ad|(at)?iv)amente)$' : '',
'(amente)$' : '',
'((ante|[ai]ble)?mente)$' : '',
'((abil|ic|iv)?idad(es)?)$' : '',
'((at)?iv[ao]s?)$' : '',
'(ad[ao])$' : '',
'(ando)$' : '',
'(aci[óo]n)$' : '',
'(es)$' : ''
}
for key in regla2:
tmp = re.sub(key, regla2[key], corregida)
if tmp!=corregida and len(tmp)>=minWordLen:
corregida = tmp
        # Rule 3: residual verb endings and vowel suffixes, stripped outright.
        # Kept as a list so the patterns are applied in a fixed, repeatable order.
        regla3 = [
'(y[ae]n?|yeron|yendo|y[oó]|y[ae]s|yais|yamos)$',
'(en|es|éis|emos)$',
'(([aei]ría|ié(ra|se))mos)$',
'(([aei]re|á[br]a|áse)mos)$',
'([aei]ría[ns]|[aei]réis|ie((ra|se)[ns]|ron|ndo)|a[br]ais|aseis|íamos)$',
'([aei](rá[ns]|ría)|a[bdr]as|id[ao]s|íais|([ai]m|ad)os|ie(se|ra)|[ai]ste|aban|ar[ao]n|ase[ns]|ando)$',
'([aei]r[áé]|a[bdr]a|[ai]d[ao]|ía[ns]|áis|ase)$',
'(í[as]|[aei]d|a[ns]|ió|[aei]r)$',
'(os|a|o|á|í|ó)$',
'(u?é|u?e)$',
'(ual)$',
'([áa]tic[oa]?)$'
        ]
for pattern in regla3:
tmp = re.sub(pattern, '', corregida)
if tmp!=corregida and len(tmp)>=minWordLen:
corregida = tmp
output.append(corregida)
return output
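
# Illustrative sketch, not part of the original script: with the rules above, related
# forms are expected to collapse onto a shared stem, e.g. (hypothetical, unverified):
#   Lemmatizador(['clasificacion', 'clasificado'])  ->  ['clasific', 'clasific']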

def GetCorpusFromFolder(folder):
    # Concatenate every .txt file in the folder into a single string
    text = ""
    for filename in glob.glob(folder + '/*.txt'):
        with open(filename, "r") as f:
            text = text + f.read() + " "
    return text

# Preprocess every corpus folder under dataPath and write one frequency
# dictionary per folder (<label>.pickle)
def PreprocessAllCorpus():
    global dataPath, verbose, stopwordsFile, minWordLen
    # Load corpus folders (skip the root entry of os.walk)
    dataFolders = list(os.walk(dataPath))[1:]
    labels = [os.path.basename(os.path.normpath(x[0])) for x in dataFolders]
#Load stopwords
with open(stopwordsFile, 'r') as f:
stopwords = set(f.read().splitlines())
#Clean corpus, tokenize, remove stopwords and lemmatize, and create freq dictionary
total_n_tokens = 0
total_n_tokens_unique = 0
for i, folder in enumerate(dataFolders):
corpus = GetCorpusFromFolder(folder[0])
tokens = CleanText(corpus).split(' ')
tokens = [word for word in tokens if word not in stopwords]
tokens = Lemmatizador(tokens)
tokens = [token for token in tokens if len(token) >= minWordLen]
#Get number of tokens
n_tokens = len(tokens)
n_tokens_unique = len(set(tokens))
total_n_tokens += n_tokens
total_n_tokens_unique += n_tokens_unique
#Obtain sorted dictionary with frequencies
        fq = defaultdict(float)
for w in tokens:
fq[w] += 1
oc_dictionary = sorted(fq.items(), key=lambda item: item[1], reverse=True)
freq_dictionary = [(word, freq/n_tokens) for (word, freq) in oc_dictionary]
with open(dataPath+'/'+labels[i]+".pickle", "wb") as f:
pickle.dump(freq_dictionary, f)
        if verbose:
            print("Corpus " + labels[i] + "\t(" + str(n_tokens) + " tokens, " + str(n_tokens_unique) + " unique)")

# Preprocess a single test document (testInput) and write its frequency
# dictionary to testOutput
def PreprocessTestCorpus():
    global verbose, stopwordsFile, minWordLen
    global testInput, testOutput
#Load stopwords
with open(stopwordsFile, 'r') as f:
stopwords = set(f.read().splitlines())
    # Start processing
    with open(testInput, 'r') as f:
        corpus = f.read()
tokens = CleanText(corpus).split(' ')
tokens = [word for word in tokens if word not in stopwords]
tokens = Lemmatizador(tokens)
tokens = [token for token in tokens if len(token) >= minWordLen]
#Get number of tokens
n_tokens = len(tokens)
n_tokens_unique = len(set(tokens))
#Obtain sorted dictionary with frequencies
    fq = defaultdict(float)
for w in tokens:
fq[w] += 1
oc_dictionary = sorted(fq.items(), key=lambda item: item[1], reverse=True)
    # Raw counts are kept here; divide by n_tokens to normalise as in PreprocessAllCorpus
    freq_dictionary = [(word, freq) for (word, freq) in oc_dictionary]
with open(testOutput, "wb") as f:
pickle.dump(freq_dictionary, f)
    if verbose:
        print("Test generated (" + str(n_tokens) + " tokens, " + str(n_tokens_unique) + " unique)")

if __name__ == '__main__':
    if generateTrainData:
        PreprocessAllCorpus()
    if generateTestData:
        PreprocessTestCorpus()
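
# Usage sketch (derived from the flags above, not stated in the original): with the
# defaults, running `python preprocess.py` reads ./Test/test.txt and writes
# ./Test/test.pickle; set generateTrainData = True to build one <label>.pickle per
# folder under ./Data.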