# tfidf.py
from __future__ import division  # float division on Python 2; harmless on Python 3
import os
import sys
import math
import nltk
import utilities  # local module providing the Tokenizer class

tokenizer = utilities.Tokenizer()
# English stop words plus punctuation and a few extra tokens to ignore.
stopwords = nltk.corpus.stopwords.words('english') + ['.', ',', '?', '(', ')', ':', '"', '-', '{', '}', '\'', '--', '\'s', '\'re', 'the', 'you']
# Penn Treebank tags accepted as keyword candidates: adjectives (JJ*), nouns (NN*),
# adverbs (RB*), particles (RP), symbols (SYM), and verbs (VB*).
pos = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
def prepare(path2corpus):
    """Read every file in the corpus directory into an nltk.Text of lowercased tokens."""
    filelists = []
    for filename in os.listdir(path2corpus):
        with open(os.path.join(path2corpus, filename), 'r') as f:
            filelist = [term.lower() for term in f.read().split()]
        filelists.append(nltk.Text(filelist))
    return filelists
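# A brief illustration (hypothetical filenames): given a directory corpus/ holding
# a.txt ("The Cat sat") and b.txt ("dogs bark"), prepare('corpus') returns two
# nltk.Text objects built from the lowercased whitespace tokens
# ['the', 'cat', 'sat'] and ['dogs', 'bark'].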
def cal_term_freq(text):
    """Map each non-stopword term of text to its relative frequency."""
    sentences = nltk.tokenize.sent_tokenize(text)
    terms = [term.lower() for sentence in sentences for term in tokenizer.tokenize(sentence)]
    length = len(terms)
    freqdist = nltk.FreqDist(terms)
    for key in list(freqdist.keys()):  # copy the keys: deleting while iterating fails on Python 3
        if key in stopwords:  # drop stop words
            del freqdist[key]
        else:
            freqdist[key] = freqdist[key] / length  # raw count -> relative frequency
    return freqdist
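# Worked example for cal_term_freq: in a 200-token text where 'network' appears
# 6 times (and is not a stop word), its value is 6 / 200 = 0.03.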
def cal_idoc_freq(dist2cal, path2corpus):
    """Map each term in dist2cal to its smoothed inverse document frequency over the corpus."""
    filelists = prepare(path2corpus)
    idoc_freq = {}  # build a new dict; mutating dist2cal in place would clobber the TF values
    for key in dist2cal.keys():
        count = 0
        for filelist in filelists:  # linear scan over the corpus; a precomputed index would speed this up
            if filelist.count(key) != 0:
                count = count + 1
        idoc_freq[key] = math.log10(len(filelists) / (count + 1))  # +1 smoothing avoids division by zero
    return idoc_freq
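# Worked example for cal_idoc_freq: with 10 corpus files and a term occurring
# in 4 of them, idf = log10(10 / (4 + 1)) = log10(2) ≈ 0.301.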
def gen_keywords(text, path2corpus, n):
    """Return the n highest-scoring TF-IDF terms of text whose POS tag is in the pos whitelist."""
    term_freq = cal_term_freq(text)
    idoc_freq = cal_idoc_freq(term_freq, path2corpus)
    tfidf = {}
    for key in term_freq.keys():
        tfidf[key] = term_freq[key] * idoc_freq[key]
    ranked = sorted(tfidf, key=tfidf.get)  # ascending by TF-IDF score
    tagged = [nltk.pos_tag([term]) for term in ranked]  # tag each term in isolation
    keywords = []
    for term in tagged:
        if term[0][1] in pos:  # keep only whitelisted parts of speech
            keywords.append(term[0][0])
    return keywords[-int(n):]  # the n highest-scoring keywords
if __name__ == '__main__':
    # CLI: python tfidf.py <text-file> <corpus-dir> <n>
    print(gen_keywords(open(sys.argv[1], 'r').read(), sys.argv[2], sys.argv[3]))
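# Typical invocation, assuming corpus/ is a directory of plain-text files
# (the file names and n below are illustrative):
#
#   python tfidf.py document.txt corpus/ 10
#
# prints the ten highest-scoring keywords of document.txt relative to the corpus.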