-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess2.py
More file actions
62 lines (57 loc) · 2.34 KB
/
Copy pathprocess2.py
File metadata and controls
62 lines (57 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import cPickle as pickle
import numpy as np
import json
import nltk
# Token cap applied to each tokenized review / consensus before the closing
# '<EOR>' label is appended; processInputs also sizes the combined per-movie
# input from it (MAX_SENTENCE*20).
MAX_SENTENCE = 30
# Module-level accumulator: convertToOutput appends one 0/1 mask row per call,
# and processInputs saves the collected rows to the file 'outputMask2'.
outputMask = []
def convertToLabels(labels, review):
    """Tokenize one review and map its tokens to label ids.

    Args:
        labels: Mapping from token string to label id. It must provide an
            '<EOR>' entry, and an '<Unk>' entry for tokens it does not know.
        review: Raw review text, tokenized with nltk.tokenize.word_tokenize.

    Returns:
        A list of label ids ending with labels['<EOR>']. If the labelled
        list (including '<EOR>') is longer than MAX_SENTENCE, it is cut to
        its first MAX_SENTENCE entries followed by a closing '<EOR>'.

    Raises:
        KeyError: if '<EOR>' is missing from labels, or '<Unk>' is missing
            when an unknown token is encountered.
    """
    tokensToLabels = []
    for token in nltk.tokenize.word_tokenize(review):
        # Test membership with the same lower-cased key that is looked up.
        # Testing the raw token while indexing with token.lower() raised
        # KeyError when only the cased form was in the vocabulary and sent
        # capitalised known words to '<Unk>'.
        key = token.lower()
        tokensToLabels.append(labels[key] if key in labels else labels['<Unk>'])
    tokensToLabels.append(labels['<EOR>'])

    if len(tokensToLabels) > MAX_SENTENCE:
        tokensToLabels = tokensToLabels[:MAX_SENTENCE] + [labels['<EOR>']]
    return tokensToLabels
def convertToOutput(labels, review):
    """Convert a target text into a fixed-length label row and record its mask.

    The text is tokenized with nltk.tokenize.word_tokenize, mapped to label
    ids, terminated with '<EOR>', and then padded with '<EOR>' or truncated
    so that the result always has MAX_SENTENCE + 1 entries.

    Side effect: appends one 0/1 mask row of the same length to the
    module-level outputMask list. Padding positions are 0; real positions are
    1 unless their label id equals 30001.

    Args:
        labels: Mapping from token string to label id. It must provide an
            '<EOR>' entry, and an '<Unk>' entry for tokens it does not know.
        review: Raw text to convert.

    Returns:
        A list of MAX_SENTENCE + 1 label ids.

    Raises:
        KeyError: if '<EOR>' is missing from labels, or '<Unk>' is missing
            when an unknown token is encountered.
    """
    tokensToLabels = []
    for token in nltk.tokenize.word_tokenize(review):
        # Use one lower-cased key for both the membership test and the lookup;
        # mixing the raw token with token.lower() could raise KeyError or
        # wrongly map capitalised known words to '<Unk>'.
        key = token.lower()
        tokensToLabels.append(labels[key] if key in labels else labels['<Unk>'])

    endOfReview = labels['<EOR>']
    if len(tokensToLabels) < MAX_SENTENCE:
        tokensToLabels.append(endOfReview)
        padding = MAX_SENTENCE + 1 - len(tokensToLabels)
        # NOTE(review): 30001 is a hard-coded label id excluded from the mask;
        # presumably one of the special tokens - confirm against labelsDict.
        mask = [1 if label != 30001 else 0 for label in tokensToLabels]
        mask += [0] * padding
        tokensToLabels += [endOfReview] * padding
    else:
        tokensToLabels = tokensToLabels[:MAX_SENTENCE] + [endOfReview]
        mask = [1 if label != 30001 else 0 for label in tokensToLabels]

    # list.append mutates the module-level list in place, so no `global`
    # declaration is needed.
    outputMask.append(mask)
    return tokensToLabels
def processInputs():
    """Build the input, output and mask arrays and save them to disk.

    Reads the pickled label dictionary from 'labelsDict' and the movie list
    from '../../opinion_abstracts/rottentomatoes.json'. For every movie, the
    labels of all entries in movie['_critics'] are concatenated and brought to
    MAX_SENTENCE*20 + 1 entries (left-padded with 30003, or truncated and
    closed with '<EOR>'); movie['_critic_consensus'] is converted with
    convertToOutput, which also appends a row to the module-level outputMask.

    The resulting arrays are written with np.save to the files 'X2', 'Y2'
    and 'outputMask2' in the current directory, and their shapes are printed.
    """
    # NOTE(review): the original read mode is kept for the pickle; if
    # 'labelsDict' was written with a binary pickle protocol it should be
    # opened with 'rb'. Only unpickle files from a trusted source.
    with open('labelsDict') as labelsFile:
        labelsDict = pickle.load(labelsFile)
    with open('../../opinion_abstracts/rottentomatoes.json') as reviewsFile:
        reviews = json.load(reviewsFile)

    inputLength = MAX_SENTENCE * 20
    inputs = []
    outputs = []
    print('here')
    for count, movie in enumerate(reviews, 1):
        critics = movie['_critics']
        bigRev = []
        for reviewKey in critics:
            bigRev += convertToLabels(labelsDict, critics[reviewKey])

        if len(bigRev) < inputLength:
            # NOTE(review): 30003 is a hard-coded filler id, presumably the
            # padding label - confirm against labelsDict.
            bigRev = [30003] * (inputLength - len(bigRev) + 1) + bigRev
        else:
            bigRev = bigRev[:inputLength] + [labelsDict['<EOR>']]
        inputs.append(bigRev)
        outputs.append(convertToOutput(labelsDict, movie['_critic_consensus']))

        # Progress indicator every 100 movies.
        if count % 100 == 0:
            print(count)

    X2 = np.array(inputs)
    Y2 = np.array(outputs)
    print(X2.shape)
    print(Y2.shape)
    # np.save emits binary data, so the targets are opened in 'wb' (text mode
    # can corrupt the output where newlines are translated). File objects are
    # passed rather than names so the files keep their exact names with no
    # '.npy' suffix added, and `with` closes each handle deterministically.
    with open('X2', 'wb') as inputsFile:
        np.save(inputsFile, X2)
    with open('Y2', 'wb') as outputsFile:
        np.save(outputsFile, Y2)

    mask = np.array(outputMask)
    print(mask.shape)
    with open('outputMask2', 'wb') as maskFile:
        np.save(maskFile, mask)
# Script entry point: the call is unguarded, so the full preprocessing pass
# (file reads and writes included) runs whenever this module is executed or imported.
processInputs()