-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmovieReviewLog.py
160 lines (114 loc) · 4.11 KB
/
movieReviewLog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
# coding: utf-8
# In[1]:
from collections import defaultdict
import os
import pandas as pd
# Word tables shared by the whole script.
# uniqueDict only ever receives keys (its list values stay empty); its length
# is later used as the vocabulary size.
uniqueDict = defaultdict(list)
# Per-class tables mapping word -> number of occurrences in the training files.
dicPos, dicNeg = defaultdict(int), defaultdict(int)
# Count word occurrences in the positive training reviews.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, which would make the train/test split differ between runs/machines.
filesPos = sorted(os.listdir("./pos"))
print("lehgth of total files in positive is", len(filesPos), ". So training data is till 600.")
print()
# Slicing (instead of indexing 0..599) tolerates a directory holding fewer
# than 600 entries rather than raising IndexError.
for fileName in filesPos[:600]:
    if fileName.startswith('.'):
        # Hidden entries such as ".DS_Store" are not reviews.
        continue
    # Open the entry under its real name (names with extra dots stay intact)
    # and let the context manager close the handle.
    with open(os.path.join("./pos", fileName), encoding="utf-8", errors='ignore') as data:
        df = data.read()
    for word in df.split():
        if word not in uniqueDict:
            uniqueDict[word] = []  # register the word in the shared vocabulary
        dicPos[word] += 1  # defaultdict(int): unseen words start at 0
# Count word occurrences in the negative training reviews.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, which would make the train/test split differ between runs/machines.
filesNeg = sorted(os.listdir("./neg"))
print("lehgth of total files in negative is", len(filesNeg), ". So training data is till 600.")
print()
# Slicing (instead of indexing 0..599) tolerates a directory holding fewer
# than 600 entries rather than raising IndexError.
for fileName in filesNeg[:600]:
    if fileName.startswith('.'):
        # Hidden entries such as ".DS_Store" are not reviews.
        continue
    # Open the entry under its real name (names with extra dots stay intact)
    # and let the context manager close the handle.
    with open(os.path.join("./neg", fileName), encoding="utf-8", errors='ignore') as data:
        df = data.read()
    for word in df.split():
        if word not in uniqueDict:
            uniqueDict[word] = []  # register the word in the shared vocabulary
        dicNeg[word] += 1  # defaultdict(int): unseen words start at 0
# Distinct-word counts: overall vocabulary first, then each class.
vocab = len(uniqueDict)
pos, neg = len(dicPos), len(dicNeg)
print("words in +ve", pos, "; in -ve", neg, "and total unique words", vocab)
print()
# Naive Bayes training on the per-class word counts.
# Total number of word occurrences (duplicates included) in each class.
totalPos = sum(dicPos.values())
totalNeg = sum(dicNeg.values())
# Add-one estimate for every word seen in a class:
# (1 + count) / (vocabulary size + total occurrences in that class).
probPos = defaultdict(
    float,
    {term: (1 + seen) / (vocab + totalPos) for term, seen in dicPos.items()},
)
probNeg = defaultdict(
    float,
    {term: (1 + seen) / (vocab + totalNeg) for term, seen in dicNeg.items()},
)
# In[2]:
# Evaluate the classifier on the held-out reviews: entries 600..692 of the
# positive and of the negative directory listing.
import math


def _log_scores(text):
    """Return the (positive, negative) log10 scores of one review.

    Every distinct word of *text* contributes once. Words absent from the
    training vocabulary carry no class evidence and are skipped. A word that
    was seen in training but not in a given class gets the add-one estimate
    1 / (vocab + total occurrences of that class), which is the zero-count
    case of the formula used to build probPos / probNeg.
    """
    # Equal class priors, expressed in log space like the rest of the score.
    positive = negative = math.log10(0.5)
    # dict.fromkeys() de-duplicates while keeping first-occurrence order.
    for word in dict.fromkeys(text.split()):
        if word not in uniqueDict:
            continue
        positive += math.log10(
            probPos[word] if word in probPos else 1 / (vocab + totalPos)
        )
        negative += math.log10(
            probNeg[word] if word in probNeg else 1 / (vocab + totalNeg)
        )
    return positive, negative


def _evaluate(directory, fileNames, wantPositive):
    """Classify each review in *fileNames* and return (correct, tested).

    directory    -- folder that contains the files
    fileNames    -- directory entries to classify; hidden entries are skipped
    wantPositive -- True when the files are positive reviews, False otherwise
    """
    correct = tested = 0
    for fileName in fileNames:
        if fileName.startswith('.'):
            continue
        # The context manager closes each file as soon as it has been read.
        with open(os.path.join(directory, fileName), encoding="utf-8", errors='ignore') as data:
            positive, negative = _log_scores(data.read())
        tested += 1
        # Strict comparison: a tie is counted as a miss for either class.
        hit = positive > negative if wantPositive else positive < negative
        if hit:
            correct += 1
    return correct, tested


# Slices never raise IndexError when a directory holds fewer than 693 entries.
count, testedPos = _evaluate("./pos", filesPos[600:693], True)
# In[3]:
countN, testedNeg = _evaluate("./neg", filesNeg[600:693], False)
# In[4]:
# Divide by the number of reviews actually classified (93 for a full data
# set) so skipped entries do not distort the percentage.
acc = count * 100 / testedPos if testedPos else 0.0
accN = countN * 100 / testedNeg if testedNeg else 0.0
print("accuracy for negative", testedNeg, "files", accN)
print("accuracy for positive", testedPos, "files", acc)
# In[ ]: