forked from sp00/genderPredictor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenderPredictor.py
96 lines (74 loc) · 2.85 KB
/
genderPredictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
# encoding: utf-8
"""
genderPredictor.py
"""
from nltk import NaiveBayesClassifier,classify
import USSSALoader
import random
class genderPredictor(object):
    """Naive Bayes classifier that predicts 'M' or 'F' from a first name.

    Training data comes from ``USSSALoader.getNameList()``, which is unpacked
    into a list of male name tuples and a list of female name tuples.  Each
    tuple is read as ``(name, count_1, count_2)``; ``_getProbDistr`` treats
    index 1 as the male count and index 2 as the female count.

    ``self.classifier`` only exists after ``train()`` (or ``trainAndTest()``)
    has been called; ``classify()``, ``test()`` and
    ``getMostInformativeFeatures()`` require it.
    """

    def getFeatures(self):
        """Build the labelled feature set for every loaded name.

        Returns a list of ``(features, label)`` pairs, male names first
        (label ``'M'``) followed by female names (label ``'F'``).  Each
        feature dict holds the suffix features from ``_nameFeatures`` plus
        ``'male_prob'`` and ``'female_prob'`` from ``_getProbDistr``.
        """
        maleNames, femaleNames = self._loadNames()
        featureset = []
        # Both groups are processed identically; only the label differs.
        for names, label in ((maleNames, 'M'), (femaleNames, 'F')):
            for nameTuple in names:
                features = self._nameFeatures(nameTuple[0])
                male_prob, female_prob = self._getProbDistr(nameTuple)
                features['male_prob'] = male_prob
                features['female_prob'] = female_prob
                featureset.append((features, label))
        return featureset

    def trainAndTest(self, trainingPercent=0.80):
        """Shuffle the feature set, train on a prefix and test on the rest.

        ``trainingPercent`` is the fraction of the shuffled feature set used
        for training (truncated to a whole number of items).  Returns the
        accuracy reported by ``test()`` on the remaining items.
        """
        featureset = self.getFeatures()
        random.shuffle(featureset)

        name_count = len(featureset)
        cut_point = int(name_count * trainingPercent)

        train_set = featureset[:cut_point]
        test_set = featureset[cut_point:]

        self.train(train_set)
        return self.test(test_set)

    def classify(self, name):
        """Return the trained classifier's label for ``name``.

        Requires ``train()`` to have been called first.
        """
        # NOTE(review): only the suffix features are supplied here, while
        # getFeatures() also trains on 'male_prob'/'female_prob'.  Confirm
        # that this mismatch between training and prediction is intended.
        feats = self._nameFeatures(name)
        return self.classifier.classify(feats)

    def train(self, train_set):
        """Train a NaiveBayesClassifier on ``train_set``, store and return it."""
        self.classifier = NaiveBayesClassifier.train(train_set)
        return self.classifier

    def test(self, test_set):
        """Return the stored classifier's accuracy on ``test_set``."""
        return classify.accuracy(self.classifier, test_set)

    def _getProbDistr(self, nameTuple):
        """Return ``(male_prob, female_prob)`` for one name tuple.

        ``male_prob`` is ``nameTuple[1] / (nameTuple[1] + nameTuple[2])``.
        Exact 1.0 and 0.0 are replaced by 0.99 and 0.01 so neither
        probability is ever reported as certain.  When both counts are zero
        there is no evidence either way, so ``(0.5, 0.5)`` is returned
        instead of raising ZeroDivisionError.
        """
        male_count = nameTuple[1]
        female_count = nameTuple[2]
        total = male_count + female_count
        if total == 0:
            return (0.5, 0.5)

        # float() keeps this a true division on Python 2 as well.
        male_prob = float(male_count) / total
        if male_prob == 1.0:
            male_prob = 0.99
        elif male_prob == 0.0:
            male_prob = 0.01

        female_prob = 1.0 - male_prob
        return (male_prob, female_prob)

    def getMostInformativeFeatures(self, n=5):
        """Return the classifier's ``n`` most informative features."""
        return self.classifier.most_informative_features(n)

    def _loadNames(self):
        """Return the name lists provided by ``USSSALoader.getNameList()``."""
        return USSSALoader.getNameList()

    def _nameFeatures(self, name):
        """Return the suffix-based feature dict for ``name``.

        The name is upper-cased first.  Features are the last one, two and
        three characters plus whether the last character is one of
        ``AEIOUY``.  An empty name yields empty-string suffixes and
        ``last_is_vowel=False`` rather than raising IndexError.
        """
        name = name.upper()
        # Slicing (not indexing) so an empty name does not raise.
        last_letter = name[-1:]
        return {
            'last_letter': last_letter,
            'last_two': name[-2:],
            'last_three': name[-3:],
            # '' in 'AEIOUY' is True, so the empty case is excluded explicitly.
            'last_is_vowel': last_letter != '' and last_letter in 'AEIOUY',
        }
if __name__ == "__main__":
    # Command-line demo: train on a random split, report accuracy and the
    # most informative features, then classify one name typed by the user.
    # print(...) is called with a single pre-formatted string so the output
    # is identical on Python 2 and Python 3.
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3: raw_input was renamed to input

    gp = genderPredictor()
    accuracy = gp.trainAndTest()
    print('Accuracy: %f' % accuracy)
    print('Most Informative Features')
    feats = gp.getMostInformativeFeatures(10)
    for feat in feats:
        # Each entry is formatted as a two-item tuple.
        print('\t%s = %s' % feat)
    name = read_line('Enter name to classify: ')
    print('\n%s is classified as %s' % (name, gp.classify(name)))