nb_train.py
import pandas as pd
import string
import re
import nltk
import random
import pickle
from nltk.corpus import stopwords  # requires the NLTK 'stopwords' corpus: nltk.download('stopwords')
from zipfile import ZipFile
DATA = 'movie.csv'
FEATURE_0 = 'text'
LABEL = 'label'
LABEL_0 = 0
LABEL_1 = 1
NEGATIVE_SENTIMENT = 'neg'
POSITIVE_SENTIMENT = 'pos'
DATA_LENGTH = 2000           # rows kept from the full dataset
STOPWORD_TYPE = 'english'
TRAIN_DATA_LENGTH = 1500     # first 1500 featuresets train the model; the rest are held out for testing
NUM_OF_FEATURES = 5          # number of most informative features to display
FILE_NAME = 'nb_classifier'  # path the pickled model is written to
class NaiveBayesClassifier:
def __init__(self, classifier):
self.classifier = classifier
@classmethod
def from_pickled(cls, path):
'''Unpickles the file given by path, returns an instance of the class initialized with the classifier object.'''
        with open(path, 'rb') as f:
            classifier = pickle.load(f)
return cls(classifier)
    def test(self, test_data):
        '''Evaluates the model on test data and returns the accuracy.'''
        return nltk.classify.accuracy(self.classifier, test_data)
    def predict(self, str_input, feature_words):
        '''Takes in a string, splits it, and returns a prediction for the sentiment of that string.
        Note: common_words defined in main() is not visible here, so the feature words
        used at training time must be passed in explicitly.'''
        words = str_input.split(' ')
        return self.classifier.classify(find_features(words, feature_words))
def store(self, path):
'''Accepts a file path and stores the model at that file path using the pickle library.'''
with open(path,'wb') as f:
            pickle.dump(self.classifier, f)
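# A minimal usage sketch for the class (illustrative; assumes a model was
# previously trained and written with store(), and that the feature words from
# training were saved separately, e.g. in a hypothetical 'feature_words' pickle):
#
#   model = NaiveBayesClassifier.from_pickled('nb_classifier')
#   with open('feature_words', 'rb') as f:
#       feature_words = pickle.load(f)
#   print(model.predict('what a wonderful film', feature_words))  # -> 'pos' or 'neg'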
def clean(text):
    '''Converts the input text to lower case; removes <br> tags, punctuation, extra whitespace, and stopwords.
    Returns a list of the words that remain.
    Args:
        text(str): input text'''
    text = re.sub(r'<br\s*/?>', ' ', text)  # drop the whole <br>/<br /> tag, not just the '<br' prefix
    text = ''.join([w.lower() for w in text if w not in string.punctuation])
    text = re.sub(r'\s+', ' ', text)
    stop_words = set(stopwords.words(STOPWORD_TYPE))  # build the set once; membership tests are O(1)
    return [word for word in text.split() if word not in stop_words]
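# For example (illustrative input):
#   clean('This movie was GREAT!<br /><br />Loved it.')  ->  ['movie', 'great', 'loved']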
def make_text_label_tuples(df):
'''Make tuples that pair processed strings with their corresponding labels. Returns the tuples in a list.
Args:
df: dataframe'''
tup_list = []
for i in range(len(df)):
word_list = clean(df.iloc[i][FEATURE_0])
sentiment = df.iloc[i][LABEL]
tup_list.append((word_list, sentiment))
return tup_list
def extract_common_words(data, count):
    '''Selects the specified number of most common words in data, returns the words in a list.
    Args:
        data(list): a list of words
        count(int): the number of most common words to extract.'''
    word_frequency = nltk.FreqDist(w.lower() for w in data)
    # FreqDist is a Counter subclass, so plain iteration follows insertion order,
    # not frequency; most_common() gives the top words by count.
    common_words = [word for word, _ in word_frequency.most_common(count)]
    return common_words
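# For example (illustrative):
#   extract_common_words(['good', 'Good', 'bad', 'good'], 1)  ->  ['good']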
def find_features(word_list, feature_words):
    '''Builds a feature dictionary from a list of words, mapping each feature word to a boolean.
    Args:
        word_list(list): a list of words split from a sentence
        feature_words(list): a list of common words
    Returns:
        features(dict): a dictionary with each word in feature_words as a key and a boolean value
        (True if the word appears in word_list, False otherwise).'''
words = set(word_list)
features = {}
for w in feature_words:
features[f'contains({w})'] = (w in words)
return features
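# For example (illustrative):
#   find_features(['great', 'movie'], ['great', 'boring'])
#   ->  {'contains(great)': True, 'contains(boring)': False}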
def train_model(data):
'''Trains the model, returns the model.
Args:
data(list): a list of tuples where the first element of each tuple is a dict of features, the second element is the sentiment.'''
classifier = nltk.NaiveBayesClassifier.train(data)
return classifier
def main():
    with ZipFile('imdb ratings.zip') as zf:
        with zf.open(DATA) as f:
            df = pd.read_csv(f)
smalldf = df[:DATA_LENGTH]
smalldf = smalldf.drop_duplicates(ignore_index = True)
smalldf.loc[smalldf[LABEL] == LABEL_0, LABEL] = NEGATIVE_SENTIMENT
smalldf.loc[smalldf[LABEL] == LABEL_1, LABEL] = POSITIVE_SENTIMENT
# Pair each string to its corresponding sentiment.
str_to_sentiment = make_text_label_tuples(smalldf)
random.shuffle(str_to_sentiment) # randomly shuffle the list of tuples to randomize the contents of train and test data.
nested_words = [t[0] for t in str_to_sentiment]
words_flatlist = [word for wl in nested_words for word in wl]
common_words = extract_common_words(words_flatlist, 2000)
    # Note: feature words are selected from the full dataset, so test-set vocabulary
    # leaks into the features; selecting them from the train split only would be
    # stricter and would avoid inflating the measured accuracy.
featuresets = [(find_features(word_list, common_words), label) for (word_list, label) in str_to_sentiment]
train_set, test_set = featuresets[:TRAIN_DATA_LENGTH], featuresets[TRAIN_DATA_LENGTH:]
    # Train once and reuse the classifier for evaluation, inspection, and saving.
    model = NaiveBayesClassifier(train_model(train_set))
    print('NB model accuracy:', model.test(test_set))
    model.classifier.show_most_informative_features(NUM_OF_FEATURES)  # prints its own output; returns None
    # Save model.
    model.store(FILE_NAME)
if __name__ == '__main__':
main()
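# After a run, the stored model can be reused without retraining, e.g.
# (a minimal sketch; the sentence is illustrative):
#
#   model = NaiveBayesClassifier.from_pickled(FILE_NAME)
#   model.predict('an absolute masterpiece', common_words)
#
# predict() needs the same feature words that were used at training time, so
# common_words would have to be recomputed or pickled alongside the classifier.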