"""
debug process pipeline
"""
import sys
import pickle
from io import StringIO

import numpy as np
import torch
import networkx as nx
from nltk import sent_tokenize

from data import *
from utils import *

# Alternative: parse on the fly with the stanfordnlp package.
#import stanfordnlp
#nlp = stanfordnlp.Pipeline()
# Each line of the toy file is "source sentence<#####>...": keep the source side.
sents = open("toy_data/test.txt").readlines()
orig_sents = [s.split("<#####>")[0] for s in sents]

# Pre-computed Stanford CoreNLP plain-text output for the same file; the
# dependency triples start right after the section header below.
lines = open("/Users/sailormoon/Downloads/stanford-corenlp-4.1.0/test.txt.out").readlines()
loc = lines.index("Dependency Parse (enhanced plus plus dependencies):\n")
dep_lines = lines[loc+1:]
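# Each dependency line has the form arc(governor-i, dependent-j), e.g.:
#   nsubj:pass(born-3, Sokuhi-1)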
"""
if stanford nlp
"""
def GetDeps(dep_lines):
    """Parse CoreNLP lines of the form arc(gov-i, dep-j) into (src, tgt, arc)
    triples plus the token vocabulary, skipping punct and root arcs."""
    tups = []
    words = []
    for t in dep_lines:
        if "(" not in t or ")" not in t:
            continue  # skip blank lines and anything that is not a dependency
        arc = t.split("(")[0]
        tokens = t.split("(")[-1].split(")")[:-1][0].split(",")
        if arc == "punct" or arc == "root":
            continue
        if len(tokens) > 2:
            # the dependent token itself contains a comma; re-attach it
            src = tokens[0].strip()
            tgt = ',' + tokens[-1].strip()
        else:
            src = tokens[0].strip()
            tgt = tokens[1].strip()
        for i in [src, tgt]:
            if i not in words and i != '':
                words.append(i)
        if src == '' or tgt == '':
            continue
        tups.append((src, tgt, arc))
    return tups, words
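# Traced by hand on a single line:
#   GetDeps(["nsubj:pass(born-3, Sokuhi-1)\n"])
#   -> ([('born-3', 'Sokuhi-1', 'nsubj:pass')], ['born-3', 'Sokuhi-1'])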
tmp, words = GetDeps(dep_lines)
vtov, vtoi = wordict(words)
itov = {v: k for k, v in vtoi.items()}  # invert vtoi: index -> token
adjs, conn_adjs = BuildAdj(vtoi, tmp)  # adjs: nodes linked by dependency arcs; conn_adjs: connected nodes by neighbor
connA, AG = BuildGraph(conn_adjs, mode="Conn")  # returns connA, the connected adjacency matrix (sorted by node), and AG, the graph object
all_pairs = BuildPairs(adjs)
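# Rough shape of these structures (the real BuildAdj/BuildPairs live in utils,
# so details may differ): adjs maps each node index to the node indices it
# shares an arc with, and all_pairs enumerates candidate (src, tgt) arcs.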
from encoder import *
enc = BLSTMEncoder(300, 300, torch.device("cpu"))  # 300-d embeddings, 300-d hidden, CPU
sent = torch.randn(15, 1, 300)  # L x B x H: 15 tokens, batch of 1, 300-dim embeddings
out = enc((sent, np.array([15])))  # the encoder takes an (embeddings, lengths) pair
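# out is the encoded sentence; its exact shape (per-token states vs. a pooled
# vector) depends on BLSTMEncoder's implementation in encoder.py.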
from dep_arcs import DEPARCS
arcs = DEPARCS  # inventory of dependency arc labels
label_oh, labels = EncodeOnehot(arcs)  # one-hot encode the arc label inventory
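# A minimal sketch of the presumed encoding (EncodeOnehot's actual return
# layout lives elsewhere in this repo and may differ):
#   idx = {arc: i for i, arc in enumerate(arcs)}
#   oh = np.eye(len(arcs))  # row i is the one-hot vector for label i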
from trainer import Trainer
cfg = None  # no config object needed for this smoke test
bot = Trainer(cfg, 300, 300, 60, 2, 0.1, False, torch.device("cpu"))
bot.init_onehot()
h_src, h_tgt, h_arc = bot.extract(sent[:, 0, :], all_pairs)  # per-pair features, nodes x H
out = bot.gat(h_src, h_tgt, h_arc)  # run the GAT over (source, target, arc) features
new_arc = out * h_arc.squeeze(1)  # scale the arc features by the GAT output
from gat import *
clsf = Classifier(60)  # decision head over the 60-dim arc representations
decision = clsf(new_arc.float())
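# decision presumably holds one score per candidate arc; the exact semantics
# live in gat.Classifier.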
tmp = "Sokuhi was born in Fuzhou , Fujian , China. Sokuhi was ordained at 17 by Feiyin Tongrong."
from nltk import sent_tokenize
golds = sent_tokenize(tmp)
from distant_superv import *
new_temp, cut_temp, del_temp = KEEP(all_pairs, golds, itov, tmp)
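# KEEP presumably aligns each candidate pair against the gold split sentences
# and buckets it into kept / cut / deleted templates; see distant_superv.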
"""
if spacy
"""
#nlp = spacy.load("en_core_web_sm")
#doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# for token in doc:
# print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
# token.shape_, token.is_alpha, token.is_stop)
"""
import spacy
nlp = spacy.load("en_core_web_sm")
def spacy_deps(doc):
tups = []
for tki, token in enumerate(doc):
dep = token.text +"_"+str(tki)
head = token.head.text+"_" +str(token.head.i)
arc = token.dep_
tups.append((dep, head, arc))
return tups
def AddEdges(depG, deps):
for tup in deps:
# dependent, head, arc
depG.add_edge(tup[0], tup[1], label=tup[2])
deps = []
for sent in orig_sents:
doc = nlp(sent)
orig_dep = spacy_deps(doc)
deps.append(orig_dep)
"""
# Scratch: build a networkx graph from the (src, tgt, arc) triples.
#tmpG = nx.Graph()  # undirected; DiGraph(directed=False) does not do this
#AddEdges(tmpG, tmp)
#A = AdjGraph(tmpG)
#e = (2, 3, {'weight': 7})  # networkx edge-tuple format, for reference