import argparse
import json
from collections import defaultdict
from math import log
import os
from typing import Iterable, Tuple, Dict

from nltk.tokenize import TreebankWordTokenizer
from nltk import FreqDist

kUNK = "<UNK>"


def log10(x):
    return log(x) / log(10)


def lower(s):
    return s.lower()


class TfIdf:
    """Class that builds a vocabulary and then computes tf-idf scores
    given a corpus.
    """

    def __init__(self, vocab_size=10000,
                 tokenize_function=TreebankWordTokenizer().tokenize,
                 normalize_function=lower, unk_cutoff=2):
        self._vocab_size = vocab_size
        self._total_docs = 0
        self._vocab_final = False
        self._vocab = {}
        self._unk_cutoff = unk_cutoff
        self._tokenizer = tokenize_function
        self._normalizer = normalize_function

        # Add your code here!
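
        # One plausible set of bookkeeping structures (illustrative
        # assumptions, not required attribute names): raw counts gathered
        # while building the vocabulary, plus the statistics tf-idf needs,
        # e.g.
        #
        #   self._seen_counts = defaultdict(int)   # word -> times seen in training
        #   self._term_counts = defaultdict(int)   # vocab id -> total occurrences
        #   self._doc_freq = defaultdict(int)      # vocab id -> documents containing it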

    def train_seen(self, word: str, count: int=1):
        """Tells the language model that a word has been seen @count times.  This
        will be used to build the final vocabulary.

        word -- The string representation of the word.  After we
        finalize the vocabulary, we'll be able to create more
        efficient integer representations, but we can't do that just
        yet.

        count -- How many times we've seen this word (by default, this is one).
        """

        assert not self._vocab_final, \
            "Trying to add new words to finalized vocab"

        # Add your code here!
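
        # A minimal sketch, assuming the hypothetical self._seen_counts
        # counter suggested in __init__ above:
        #
        #   self._seen_counts[self._normalizer(word)] += count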

    def add_document(self, text: str):
        """
        Tokenize a piece of text and add the entries to the class's counts.

        text -- The raw string containing a document
        """

        for word in self.tokenize(text):
            # Add your code here!
            pass
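
        # Whatever counts you keep, this is also the natural place to bump
        # self._total_docs (initialized in __init__) once per document,
        # since inv_docfreq needs to know how many documents have been seen.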

    def tokenize(self, sent: str) -> Iterable[int]:
        """Return a generator over the tokens in the sentence; if the
        vocabulary has been finalized, yield each token's vocabulary id,
        otherwise yield the raw token string.

        sent -- A string
        """

        # You don't need to modify this code.
        for ii in self._tokenizer(sent):
            if self._vocab_final:
                yield self.vocab_lookup(ii)
            else:
                yield ii

    def doc_tfidf(self, doc: str) -> Dict[Tuple[int, str], float]:
        """Given a document, create a dictionary representation of its tfidf vector

        doc -- raw string of the document"""

        counts = FreqDist(self.tokenize(doc))
        d = {}
        for ii in self._tokenizer(doc):
            ww = self.vocab_lookup(ii)
            # Key on (vocab id, raw token) so the output stays readable.
            d[(ww, ii)] = counts.freq(ww) * self.inv_docfreq(ww)

        return d

    def term_freq(self, word: int) -> float:
        """Return the frequency of a word if it's in the vocabulary, zero otherwise.

        word -- The integer lookup of the word.
        """

        return 0.0
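
        # A common convention (an assumption, not dictated by the starter
        # code): the word's relative frequency over all training tokens,
        # e.g. self._term_counts[word] / total tokens, with 0.0 for words
        # that were never counted.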

    def inv_docfreq(self, word: int) -> float:
        """Compute the inverse document frequency of a word.  Return 0.0 if
        the word has never been seen.

        Keyword arguments:
        word -- The word whose document frequency to look up.
        """

        return 0.0
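
        # A common convention (again an assumption): idf(word) =
        # log10(self._total_docs / self._doc_freq[word]), using the log10
        # helper defined above; returning 0.0 for unseen words avoids a
        # division by zero.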

    def vocab_lookup(self, word: str) -> int:
        """
        Given a word, provides a vocabulary integer representation.  Words under the
        cutoff threshold should have the same value.  All words with counts
        greater than or equal to the cutoff should be unique and consistent.

        This is useful for turning words into features in later homeworks.

        word -- The word to lookup
        """

        assert self._vocab_final, \
            "Vocab must be finalized before looking up words"

        if word in self._vocab:
            return self._vocab[word]
        else:
            return self._vocab[kUNK]

    def finalize(self):
        """
        Fixes the vocabulary as static, preventing additional vocabulary
        from being added.
        """

        # Add code to generate the vocabulary that the vocab lookup
        # function can use!
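
        # One plausible sketch, assuming the hypothetical self._seen_counts
        # counter from __init__ above: give each word at or above
        # self._unk_cutoff its own id and reserve one id for kUNK, e.g.
        #
        #   keepers = [ww for ww, cc in self._seen_counts.items()
        #              if cc >= self._unk_cutoff]
        #   self._vocab = {ww: ii for ii, ww in enumerate(keepers)}
        #   self._vocab[kUNK] = len(self._vocab)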

        self._vocab_final = True


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--root_dir", help="Root directory of the QB dataset",
                           type=str, default='../',
                           required=False)
    argparser.add_argument("--train_dataset", help="QB Dataset for training",
                           type=str, default='data/qanta.train.json',
                           required=False)
    argparser.add_argument("--example", help="What answer we use for testing",
                           type=str, default='Australia',
                           required=False)
    argparser.add_argument("--test_dataset", help="QB Dataset for test",
                           type=str, default='qanta.dev.json',
                           required=False)
    argparser.add_argument("--limit", help="Number of training documents",
                           type=int, default=-1, required=False)
    args = argparser.parse_args()

    vocab = TfIdf()

    # First pass over the training questions: count words so the vocabulary
    # can be built, then finalize it and add the documents.
    with open(os.path.join(args.root_dir, args.train_dataset)) as infile:
        data = json.load(infile)["questions"]

    if args.limit > 0:
        data = data[:args.limit]

    for ii in data:
        for word in vocab.tokenize(ii["text"]):
            vocab.train_seen(word)
    vocab.finalize()

    for ii in data:
        vocab.add_document(ii["text"])

    # Collect every training question about the example answer into a
    # single document and print its top tf-idf terms.
    with open(os.path.join(args.root_dir, args.train_dataset)) as infile:
        data = json.load(infile)["questions"]

    example = ""
    for ii in data:
        if ii["page"] == args.example:
            example += " %s " % ii["text"]

    vector = vocab.doc_tfidf(example)
    for word, tfidf in sorted(vector.items(), key=lambda kv: kv[1], reverse=True)[:50]:
        print("%s:%i\t%f" % (word[1], word[0], tfidf))
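
# Example invocation (a sketch; paths follow the argparse defaults above,
# which assume the QANTA json files live under ../data):
#
#   python tfidf.py --root_dir ../ --train_dataset data/qanta.train.json \
#       --example Australia --limit 1000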