metadataBased.py
""" Analysis metadata of datasets with doc2vec,
each metadata has been saved as a single json file,
including dataset name, descriptions, etc.
Author: Jingchao Yang
Date: Mar 27 2018
"""
from os import listdir
from os.path import isfile, join
import json
import gensim

# LabeledSentence is the older gensim name for TaggedDocument (deprecated since gensim 1.0, removed in 4.0)
LabeledSentence = gensim.models.doc2vec.LabeledSentence
""" Doc2Vec is using two things when training your model,
labels and the actual data. The labels can be anything,
but to make it easier each document file name will be its’ label.
"""
# collect the names of all json files in the metadata folder
docLabels = [f for f in listdir("/Users/YJccccc/doc2vec/RawMetadata/") if f.endswith('.json')]
print(docLabels)
data = [] # storing target text contents all together
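# The loop below assumes each metadata JSON file has roughly this structure
# (illustrative placeholder values, not copied from a real file):
#   {
#       "Dataset-LongName-Full": "<full dataset name>",
#       "Dataset-Description": "<free-text description>",
#       "Dataset-Metadata": ["<keyword one>", "<keyword two>"]   # optional
#   }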
for doc in docLabels:
    # read each json file
    desc = json.load(open("/Users/YJccccc/doc2vec/RawMetadata/" + doc))
    # get Dataset-LongName-Full and Dataset-Description
    dName = desc['Dataset-LongName-Full']
    dDesc = desc['Dataset-Description']
    # first check whether the json has the Dataset-Metadata key
    if desc.get('Dataset-Metadata'):
        mData = desc['Dataset-Metadata']
        dMetaArr = []
        # loop over Dataset-Metadata and replace white space with underscores
        for i in mData:
            dMetaArr.append(i.replace(" ", "_"))
        # convert list to string
        dMeta = ', '.join(dMetaArr)
    else:  # if there is no Dataset-Metadata key, use a blank to hold the position in the list
        dMeta = " "
    data.append('Full Name: ' + dName + '\n' + 'Description: ' + dDesc + '\n' + 'Metadata: ' + dMeta)
# print(data[3])
# for cont in data:
# print(cont,"\n")
""" Preparing the data for Gensim Doc2vec
Gensim Doc2Vec needs model training data in an LabeledSentence iterator object
"""
class LabeledLineSentence(object):
    # supply both the raw data and the list of labels
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        # loop through all the docs, using each document's file name as its label
        for idx, doc in enumerate(self.doc_list):
            yield LabeledSentence(words=doc.split(), tags=[self.labels_list[idx]])
"""Training the model"""
it = LabeledLineSentence(data, docLabels) # create the iter object
model = gensim.models.Doc2Vec(vector_size=300, window=10, min_count=5, workers=11, alpha=0.025,
min_alpha=0.025) # use fixed learning rate
model.build_vocab(it)
for epoch in range(10):
model.train(it, total_examples=model.corpus_count, epochs=model.epochs)
model.alpha -= 0.002 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no deca
# model.train(it,total_examples=model.corpus_count,epochs=model.epochs)
# save the model
model.save("/Users/YJccccc/doc2vec/doc2vec.model")
# """Testing the model"""
# print (model.most_similar("Tropical"))
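# A minimal sketch of how the saved model could be loaded and queried later
# (assumes gensim 3.x; the query text and docLabels[0] are just illustrative inputs):
# loaded = gensim.models.Doc2Vec.load("/Users/YJccccc/doc2vec/doc2vec.model")
# print(loaded.docvecs.most_similar(docLabels[0]))               # documents most similar to one metadata file
# vec = loaded.infer_vector("sea surface temperature".split())   # embed unseen text
# print(loaded.docvecs.most_similar([vec]))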