-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword2vec.py
33 lines (26 loc) · 1.08 KB
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
import json
from gensim.models.word2vec import LineSentence, Word2Vec
def func(fin, fout):
for line in fin:
line = line.strip()
if not line:
continue
sentence = json.loads(line)
sentence = sentence["sentText"].strip().strip('"').lower()
fout.write(sentence + '\n')
def make_corpus():
#print("-------------haha")
with open('data/NYT_CoType/corpus.txt', 'wt', encoding='utf-8') as fout:
with open('data/NYT_CoType/train.json', 'rt', encoding='utf-8') as fin:
func(fin, fout)
with open('data/NYT_CoType/test.json', 'rt', encoding='utf-8') as fin:
func(fin, fout)
if __name__ == "__main__":
if not os.path.exists('data/NYT_CoType/corpus.txt'):
make_corpus()
sentences = LineSentence('data/NYT_CoType/corpus.txt')
model = Word2Vec(sentences, sg=1, size=300, workers=4, iter=8, negative=8)
word_vectors = model.wv
word_vectors.save('data/NYT_CoType/word2vec')
word_vectors.save_word2vec_format('data/NYT_CoType/word2vec.txt', fvocab='data/NYT_CoType/vocab.txt')