-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_lm.py
73 lines (64 loc) · 2.51 KB
/
data_lm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import re
import pickle
import copy
import numpy
import torch
import nltk
# Penn Treebank-style part-of-speech tags that label ordinary word tokens.
word_tags = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN',
    'JJ', 'JJR', 'JJS',
    'LS', 'MD',
    'NN', 'NNS', 'NNP', 'NNPS',
    'PDT', 'POS', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP',
    'SYM', 'TO', 'UH',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'WDT', 'WP', 'WP$', 'WRB',
]

# Currency symbols; the name suggests they occur both as tags and as words.
currency_tags_words = [
    '#', '$', 'C$', 'A$',
]

# Markers for empty elements / traces in treebank annotation
# (NOTE(review): presumably filtered out by callers -- not used in this file's visible code).
ellipsis = [
    '*', '*?*', '0', '*T*', '*ICH*',
    '*U*', '*RNR*', '*EXP*', '*PPA*', '*NOT*',
]

# Tags assigned to punctuation tokens.
punctuation_tags = [
    '.', ',', ':',
    '-LRB-', '-RRB-',
    '\'\'', '``',
]

# Surface forms of punctuation tokens: the punctuation tags above plus
# additional dash, terminal and bracket symbols.
punctuation_words = [
    '.', ',', ':',
    '-LRB-', '-RRB-',
    '\'\'', '``',
    '--', ';', '-', '?', '!', '...',
    '-LCB-', '-RCB-',
]
class Corpus(object):
    """Test corpus read from a file of bracketed binary parses.

    Each non-blank line of the file holds one sentence written as
    whitespace-separated tokens, e.g. ``( ( the cat ) sat )``:

    * a token starting with ``(`` is ignored,
    * a token equal to ``)`` is a *reduce* (merge the two topmost subtrees),
    * any other token is a word and is a *shift*.

    After construction the instance exposes:

    * ``test`` -- list of ``torch.LongTensor`` with the word ids of each
      sentence, wrapped in ``<EOS>`` markers,
    * ``test_sens`` -- list of word lists (also wrapped in ``<EOS>``),
    * ``test_trees`` -- list of nested two-element lists (binary trees),
    * ``test_nltktrees`` -- the same tree objects as ``test_trees``.
    """

    # Original hard-coded location of the evaluation file; kept as the
    # default so existing ``Corpus(dictionary)`` callers behave as before.
    DEFAULT_PATH = "/Users/anhadmohananey/Downloads/stanford-corenlp-full-2018-02-27/toronto_dev.ps"

    def __init__(self, dictionary, file_path=None):
        """Load and tokenize the corpus.

        Args:
            dictionary: mapping from word to integer id. It must support
                ``in`` and ``[]`` and contain ``"@@UNKNOWN@@"`` whenever an
                out-of-vocabulary word can occur.
            file_path: path of the parse file; ``None`` selects
                ``DEFAULT_PATH``.
        """
        self.dictionary = dictionary
        if file_path is None:
            file_path = self.DEFAULT_PATH
        (self.test,
         self.test_sens,
         self.test_trees,
         self.test_nltktrees) = self.tokenize(file_path)

    def tokenize(self, fileid, lowercase=True):
        """Parse the file ``fileid`` into ids, words and binary trees.

        Args:
            fileid: path of the parse file to read.
            lowercase: when true (the default), words are lower-cased.

        Returns:
            A tuple ``(sens_idx, sens, trees, nltk_trees)`` of four lists
            with one entry per non-blank line.

        Raises:
            IndexError: if a line contains a ``)`` without two subtrees to
                merge, or contains no word at all.
            KeyError: if a word and ``"@@UNKNOWN@@"`` are both missing from
                the dictionary.
        """
        sens_idx = []
        sens = []
        trees = []
        nltk_trees = []
        # The context manager closes the file even when parsing raises.
        with open(fileid, "r") as parse_file:
            for line in parse_file:
                # split() without arguments drops the trailing newline and
                # never yields empty strings, so consecutive spaces are safe.
                pieces = line.split()
                if not pieces:
                    # Blank line: there is no sentence to read.
                    continue
                tokens, transitions = self._read_transitions(pieces, lowercase)
                words = ['<EOS>'] + tokens + ['<EOS>']
                sens.append(words)
                tree = self._build_tree(tokens, transitions)
                # Look up the unknown id lazily so that a dictionary without
                # "@@UNKNOWN@@" still works when every word is known.
                idx = [
                    self.dictionary[word] if word in self.dictionary
                    else self.dictionary["@@UNKNOWN@@"]
                    for word in words
                ]
                sens_idx.append(torch.LongTensor(idx))
                trees.append(tree)
                nltk_trees.append(tree)
        return sens_idx, sens, trees, nltk_trees

    @staticmethod
    def _read_transitions(pieces, lowercase):
        """Split parse tokens into words and shift(0)/reduce(1) actions."""
        tokens = []
        transitions = []
        for piece in pieces:
            if piece.startswith("("):
                continue
            if piece == ")":
                transitions.append(1)
            else:
                # Downcase all words to match GloVe.
                tokens.append(piece.lower() if lowercase else piece)
                transitions.append(0)
        return tokens, transitions

    @staticmethod
    def _build_tree(tokens, transitions):
        """Replay shift/reduce actions and return the bottom stack item."""
        # There is exactly one shift per word, so the iterator cannot run dry.
        pending = iter(tokens)
        stack = []
        for action in transitions:
            if action == 0:
                # shift
                stack.append(next(pending))
            else:
                # reduce: merge the two topmost subtrees, keeping their order
                right = stack.pop()
                left = stack.pop()
                stack.append([left, right])
        return stack[0]