-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathtext_preprocess.py
51 lines (42 loc) · 1.49 KB
/
text_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
from functools import reduce
class Tfidf:
    """Minimal tf-idf vectorizer for a corpus of already-tokenized documents.

    Each document is a sequence of tokens (e.g. a list of words).
    ``fit_transform`` learns the vocabulary and the inverse document
    frequencies; ``transform`` re-uses them to weight further documents.
    """

    @property
    def word_dictionary(self):
        """Sorted array of the unique tokens learned by ``fit_transform``."""
        return self.__word_dictionary

    def __create_idf(self, corpus):
        """Compute the inverse document frequency of every dictionary word.

        Parameters
        ----------
        corpus : shape (n_corpus, text_length)
            Tokenized documents the frequencies are computed from

        Returns
        -------
        idf : shape (dictionary_length,)
            ``log(n_corpus / (document_frequency + 1))`` for each word
        """
        n_words = np.zeros(len(self.__word_dictionary), dtype=int)
        for row in corpus:
            # intersect1d de-duplicates both operands, so a word is counted
            # at most once per document no matter how often it occurs.
            _, indexes, _ = np.intersect1d(
                self.__word_dictionary, row, return_indices=True
            )
            n_words[indexes] += 1
        # The +1 keeps the denominator non-zero; as a consequence a word that
        # appears in every document receives a slightly negative weight.
        return np.log(len(corpus) / (n_words + 1))

    def fit_transform(self, X):
        """Learn the dictionary and idf weights from X, then transform X.

        Parameters
        ----------
        X : shape (n_corpus, text_length)
            Training corpus

        Returns
        -------
        Tf-idf matrix : shape (n_corpus, dictionary_length)
            Tf-idf-weighted document-term matrix

        Raises
        ------
        ValueError
            If the corpus contains no documents
        """
        if len(X) == 0:
            raise ValueError("Cannot fit Tfidf on an empty corpus")
        # Seed the fold with the de-duplicated first document: without an
        # initial value, reduce() hands back X[0] untouched for a
        # single-document corpus (unsorted, with duplicates, not an array).
        self.__word_dictionary = reduce(np.union1d, X, np.unique(X[0]))
        self.__idf = self.__create_idf(X)
        return self.transform(X)

    def transform(self, X):
        """Weight the term counts of X with the learned idf values.

        Tokens that are not part of the learned dictionary are ignored.

        Parameters
        ----------
        X : shape (n_corpus, text_length)
            Predicting corpus

        Returns
        -------
        Tf-idf matrix : shape (n_corpus, dictionary_length)
            Tf-idf-weighted document-term matrix
        """
        # tolist() yields native Python objects, giving plain dict keys for
        # the word -> column lookup.
        column_of = {
            word: col for col, word in enumerate(self.__word_dictionary.tolist())
        }
        tf = np.zeros((len(X), len(column_of)))
        # One pass over every document instead of one list.count() scan per
        # (document, dictionary word) pair.
        for row, document in enumerate(X):
            for token in document:
                col = column_of.get(token)
                if col is not None:
                    tf[row, col] += 1
        return tf * self.__idf