-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_vector.py
220 lines (185 loc) · 7.43 KB
/
feature_vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import argparse
import json
import re
import sys
from collections import Counter

import nltk

import data_helper
import word_category_counter
def concat_str(ngram):
    """Serialize a feature dict into a single "key:value " string.

    Helper for fwrite_feature_vectors.

    :param ngram: mapping of feature name -> numeric value
    :return: concatenation of "key:value " for every item, in dict order;
             note each item keeps its trailing space (matches the on-disk
             format the writers expect).
    """
    # str.join builds the result in one pass; the original repeated
    # ``out += ...`` is quadratic in the number of features.
    return "".join("{}:{} ".format(key, value) for key, value in ngram.items())
def fwrite_feature_vectors(filename, posi_word_ngram, neg_word_ngram, posi_pos_ngram, neg_pos_ngram, posi_liwc_feat, neg_liwc_feat):
    """Write the three feature files (word, word+POS, word+POS+LIWC).

    The dataset tag is sliced out of *filename* as ``filename[11:-5]``; if a
    file other than the training/development/testing ones is used, confirm
    that slice still yields the intended tag.

    :param filename: input data file name the dataset tag is derived from
    :param posi_word_ngram / neg_word_ngram: word-ngram feature dicts
    :param posi_pos_ngram / neg_pos_ngram: POS-ngram feature dicts
    :param posi_liwc_feat / neg_liwc_feat: LIWC feature dicts
    """
    dataset = filename[11:-5]
    # Render every feature dict into its "key:value " text form up front.
    word_pos_txt = concat_str(posi_word_ngram)
    word_neg_txt = concat_str(neg_word_ngram)
    pos_pos_txt = concat_str(posi_pos_ngram)
    pos_neg_txt = concat_str(neg_pos_ngram)
    liwc_pos_txt = concat_str(posi_liwc_feat)
    liwc_neg_txt = concat_str(neg_liwc_feat)
    # (path, content) pairs; the content strings reproduce the original
    # layout exactly (including the space after '\n' in files 2 and 3).
    outputs = [
        ('word_features-' + dataset + '-features.txt',
         "positive %s\nnegative %s" % (word_pos_txt, word_neg_txt)),
        ('word_pos_features-' + dataset + '-features.txt',
         "positive %s %s\n negative %s %s" % (word_pos_txt, pos_pos_txt, word_neg_txt, pos_neg_txt)),
        ('word_pos_liwc_features-' + dataset + '-features.txt',
         "positive %s %s %s\n negative %s %s %s" % (word_pos_txt, pos_pos_txt, liwc_pos_txt, word_neg_txt, pos_neg_txt, liwc_neg_txt)),
    ]
    for path, content in outputs:
        with open(path, "w", encoding="utf-8") as fout:
            fout.write(content)
    return
def get_liwc_features(words):
    """Derive a single binary LIWC sentiment feature from a token list.

    :param words: list of word tokens
    :return: {"liwc:positive": 1} if the weighted positive score wins,
             otherwise {"liwc:negative": 1}
    """
    liwc_scores = word_category_counter.score_text(" ".join(words))
    # Weighted category sums; the *2 and /6 scalars rebalance categories whose
    # raw scores ran too high. All possible score keys start on line 269 of
    # word_category_counter.py.
    negative_score = (
        liwc_scores["Negative Emotion"] * 2
        + liwc_scores["Anger"]
        + liwc_scores["Anxiety"]
        + liwc_scores["Sadness"]
        + liwc_scores["Metaphysical issues"]
        + liwc_scores["Death"]
    )
    positive_score = (
        liwc_scores["Positive Emotion"] / 6
        + liwc_scores["Optimism and energy"]
        + liwc_scores["Achievement"]
        + liwc_scores["Future Tense"]
    )
    if positive_score > negative_score:
        return {"liwc:positive": 1}
    return {"liwc:negative": 1}
def get_ngram_features(tokens):
    """Build relative-frequency unigram and bigram features.

    :param tokens: list of string tokens
    :return: dict mapping "UNI_<tok>" -> count/total_unigrams and
             "BIGRAM_<t1>_<t2>" -> count/total_bigrams
    """
    feature_vectors = {}
    # collections.Counter gives identical counts to nltk.FreqDist, and
    # zip(tokens, tokens[1:]) yields the same consecutive pairs as
    # nltk.bigrams — same behavior with one less third-party dependency.
    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    uni_total = sum(unigram_counts.values())  # == FreqDist.N()
    bi_total = sum(bigram_counts.values())
    for token, freq in unigram_counts.items():
        feature_vectors["UNI_{}".format(token)] = float(freq) / uni_total
    for (b1, b2), freq in bigram_counts.items():
        feature_vectors["BIGRAM_{}_{}".format(b1, b2)] = float(freq) / bi_total
    return feature_vectors
def get_pos(text):
    """POS-tag *text* and return only the tag sequence.

    :param text: raw review text
    :return: list of POS tag strings, one per normalized token
    """
    # Tokenize/normalize with the same pipeline as the word features,
    # then keep just the tag of each (word, tag) pair.
    tokens = get_words(text)
    return [tag for _word, tag in nltk.pos_tag(tokens)]
def get_words(text):
    """Extract lowercase, normalized word tokens from review text.

    - sentence-tokenize the text
    - lowercase each sentence and word-tokenize it
    - keep only the \\w+ runs of each token (punctuation-only tokens vanish;
      a token like "don't" expands to ["don", "t"])

    :param text: raw review text
    :return: flat list of normalized word strings
    """
    word_pattern = re.compile(r'(?:\w)+')
    raw_tokens = []
    for sentence in nltk.sent_tokenize(text):
        raw_tokens.extend(nltk.word_tokenize(sentence.lower()))
    normalized_words = []
    for token in raw_tokens:
        normalized_words.extend(word_pattern.findall(token))
    return normalized_words
def features_stub(filename):
    """End-to-end feature extraction for one review data file.

    Reads the file via data_helper, splits reviews into positive/negative
    lists, builds word-ngram, POS-ngram and LIWC features per category, and
    writes the three feature files via fwrite_feature_vectors.

    :param filename: path to the review .data file (e.g. restaurant-training.data)
    """
    raw_data = data_helper.read_file(filename)
    positive_texts, negative_texts = data_helper.get_reviews(raw_data)
    positive_toks = []
    positive_pos_toks = []
    negative_toks = []
    negative_pos_toks = []
    print( 'begin tokenize')
    # Collect word and POS tokens per category. Not the most efficient
    # (get_pos re-tokenizes each document) but easier to trace.
    for document in positive_texts:
        positive_toks += get_words(document)
        positive_pos_toks += get_pos(document)
    for document in negative_texts:
        negative_toks += get_words(document)
        negative_pos_toks += get_pos(document)
    print( 'tokenizing compl' )
    # ngram features over the pooled tokens of each category
    print( 'begin word ngram' )
    posi_word_ngram = get_ngram_features(positive_toks)
    print( 'all positive word ngram completed')
    print( 'begin negative word ngram' )
    neg_word_ngram = get_ngram_features(negative_toks)
    print( 'all negative word ngram completed')
    print( 'end word ngram' )
    print( 'begin pos ngram' )
    posi_pos_ngram = get_ngram_features(positive_pos_toks)
    print( 'all pos pos ngram completed')
    print( 'begin negative ngram' )
    neg_pos_ngram = get_ngram_features(negative_pos_toks)
    print( 'all negative pos ngram completed')
    print( 'end pos ngram')
    print('begin liwc')
    # one binary LIWC feature per category
    posi_liwc_feat = get_liwc_features(positive_toks)
    neg_liwc_feat = get_liwc_features(negative_toks)
    print('end liwc')
    print('begin file write')
    print( posi_liwc_feat )
    print( neg_liwc_feat )
    fwrite_feature_vectors(filename, posi_word_ngram, neg_word_ngram, posi_pos_ngram, neg_pos_ngram, posi_liwc_feat, neg_liwc_feat)
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the data file to process.
    if len(sys.argv) < 2:
        print( "error: feature_vector.py called with too few args")
        print( "usage: python feature_vector.py [single_file_to_process]" )
        print( "note: you also want LIWC files set up as in project description")
        # exit() is for the interactive interpreter and returns status 0;
        # a usage error should signal failure with a nonzero exit code.
        sys.exit(1)
    print( "%s processing"%( sys.argv[1]) )
    filename = sys.argv[1]
    features_stub( filename )
    print( "completed" )