-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate-embedding.py
executable file
·67 lines (56 loc) · 3.03 KB
/
evaluate-embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
"""
Tool for evaluating the coherence of topic models stored in one or more PKL files, using measures based on word embeddings.
Sample usage:
python evaluate-embedding.py -b -t 10 -m wikipedia2016-w2v-cbow-d100.bin -o results/bbc-coherence.csv data/bbc/nmf_k05/*rank*
"""
import os, os.path, sys
import logging as log
from optparse import OptionParser
import gensim
import unsupervised.rankings, unsupervised.util
import validation.embedding, validation.util
# --------------------------------------------------------------
def _embedding_vocabulary(model):
    """Return the set of words known to a loaded gensim embedding.

    A full ``Word2Vec`` model keeps its vectors on ``model.wv`` in newer gensim
    releases, whereas a ``KeyedVectors`` object (or a very old model) carries
    the vocabulary itself.  gensim 4 replaced the ``vocab`` mapping with
    ``key_to_index``, so that attribute is preferred when it is present.
    """
    vectors = getattr(model, "wv", model)
    if hasattr(vectors, "key_to_index"):
        return set(vectors.key_to_index)
    return set(vectors.vocab)


def main():
    """Command-line entry point: score topic-model rankings with a word embedding.

    Parses the options, loads the Word2Vec embedding named by ``-m`` (in binary
    word2vec format when ``-b`` is given, otherwise as a saved gensim model),
    truncates the term rankings read from every positional ranking file to the
    top ``-t`` terms, evaluates them with the "tc-w2v" measure, logs the
    resulting table and, when ``-o`` is given, writes the table to that path.

    Exits through ``parser.error`` (status 2) when no ranking file or no model
    path is supplied.
    """
    parser = OptionParser(usage="usage: %prog [options] ranking_file1 ranking_file2 ...")
    parser.add_option("-m", "--model", action="store", type="string", dest="model_path",
                      help="path to Word2Vec model", default=None)
    parser.add_option("-t", "--top", action="store", type="int", dest="top",
                      help="number of top terms to include for each topic", default=10)
    # "--summmary" was the original (misspelt) long option; it is kept as an
    # alias so that existing command lines continue to work.
    # NOTE(review): options.summary is parsed but never consulted below.
    parser.add_option("-s", "--summary", "--summmary", action="store_true", dest="summary",
                      help="display summary results only")
    parser.add_option("-o", "--output", action="store", type="string", dest="out_path",
                      help="path for CSV output file", default=None)
    parser.add_option("-b", "--binary", action="store_true", dest="read_binary",
                      help="read a Word2Vec file in binary format")
    options, args = parser.parse_args()
    if not args:
        parser.error("Must specify at least one topic model ranking file")
    if options.model_path is None:
        parser.error("Must specify path to Word2Vec model file")
    log.basicConfig(level=log.INFO, format="%(message)s")
    top = options.top

    # Load the word2vec model; report the model path, not the first ranking file.
    log.info("Loading embedding model from %s ...", options.model_path)
    if options.read_binary:
        model = gensim.models.KeyedVectors.load_word2vec_format(options.model_path, binary=True)
    else:
        model = gensim.models.Word2Vec.load(options.model_path)
    vocab = _embedding_vocabulary(model)
    log.info("Embedding has vocabulary of size %d", len(vocab))

    # Create coherence measures (only the "tc-w2v" coherence measure is enabled).
    measures = {"tc-w2v": validation.embedding.EmbeddingCoherence(model)}
    scores = validation.util.CoherenceScoreCollection(measures)

    # Process each topic model results file.
    log.info("Processing %d topic models ...", len(args))
    for in_path in args:
        log.debug("Processing topics from %s using top %d terms", in_path, top)
        term_rankings, labels = unsupervised.util.load_term_rankings(in_path)
        log.debug("Truncating terms rankings to top %d terms", top)
        # The rankings are truncated by length only; they are not filtered
        # against the embedding vocabulary.
        truncated_rankings = unsupervised.rankings.truncate_term_rankings(term_rankings, top)
        scores.evaluate(in_path, truncated_rankings)

    # Display a summary of the results.
    tab = scores.create_table(include_stats=True, precision=3)
    log.info(tab)

    # Write results to CSV?
    if options.out_path is not None:
        log.info("Writing results to %s", options.out_path)
        scores.write_table(options.out_path, include_stats=True, precision=4)
# --------------------------------------------------------------
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()