-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparsing_utils.py
151 lines (119 loc) · 4.59 KB
/
parsing_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import visualisation_utils as vs
import extraction as ex
import pandas as pd
import stanza as st
from nltk.tree import Tree
from datetime import datetime
import torch
# The stanza pipeline is not created in this module: the caller must build it once
# and pass it as the `nlp` argument to every function below that takes one, e.g.
# nlp = st.Pipeline('en', verbose = False)
def get_string_constituency(string_text, nlp):
    """ GET the parse trees of the sentences found in a piece of text.
    Input: string_text - text handed to the pipeline (it may extend over multiple sentences)
           nlp - callable pipeline; its result must expose `.sentences`,
                 and every sentence a `.constituency` attribute
    Output: List[ParseTree] - one tree per sentence, in sentence order
    Side effect: the parsed document is printed to stdout.
    """
    parsed = nlp(string_text)
    print(parsed)
    # To pretty print one of the returned trees, e.g. the first one:
    #   Tree.fromstring(str(trees[0])).pretty_print()
    return [sent.constituency for sent in parsed.sentences]
def get_post_constituency(post_text, nlp):
    """ GET the parse trees of post sentences (singular post)
    Input: post_text - text of one post
           nlp - callable pipeline; its result must expose `.sentences`,
                 and every sentence a `.constituency` attribute
    Output: List[ParseTree] - one tree per sentence, in sentence order
    """
    parsed = nlp(post_text)
    # To pretty print one of the returned trees, e.g. the first one:
    #   Tree.fromstring(str(trees[0])).pretty_print()
    return [sent.constituency for sent in parsed.sentences]
def get_posts_constituencies(post_texts,nlp):
    """ GET the parse trees of post sentences (all posts)
    Input: post_texts - iterable with the text of every post
           nlp - pipeline forwarded unchanged to get_post_constituency
    Output: List [List[ParseTree]] - the outer list holds one entry per post,
            each inner list holds the trees returned for that post.
    Side effect: prints a running 1-based counter after each post is parsed.
    """
    parsed_posts = []
    for position, text in enumerate(post_texts, start=1):
        parsed_posts.append(get_post_constituency(text, nlp))
        # progress indicator: number of posts parsed so far
        print(position)
    return parsed_posts
# Recursively collects the text (leaf labels) covered by a node.
# The caller supplies the accumulator list, e.g.: superList = []
def parse_node(node, superList):
    """ GET all the nodes of a tree together with the text they cover.
    Input: node - tree node exposing `.label` and `.children`
           superList - list that receives one {label, leaves} dict per node
                       that has at least one child
    Output: List of leaf labels below `node`, in left-to-right order.
            A node without children is treated as a leaf: its own label is
            returned and nothing is added to superList.
    Dicts are appended only after all children were visited, so the entries
    of a subtree precede the entry of its parent.
    """
    if len(node.children) == 0:
        return [node.label]

    covered = []
    for child in node.children:
        covered.extend(parse_node(child, superList))
    superList.append({"label": node.label, "leaves": covered})
    return covered
def get_posts_constituency_spans(constituencies_all, save_path='constituency_spans.pt'):
    """ GET the spans of all nodes of post sentences (all posts)
    Input: constituencies_all - List [List[ParseTree]] (all posts)
           save_path - file the result is written to with torch.save;
                       pass None to skip saving
    Output: List [List[List[{label,leaves}]]]
            inner list  - the {label,leaves} dicts parse_node collected for one sentence
            second list - container for the sentences of one post
            outer list  - container for all posts
    Side effect: prints progress ("Post: i", "Sentence j", "Post: i ready at HH:MM:SS").
    """
    all_spans = []
    for post_no, post in enumerate(constituencies_all, start=1):
        print("Post: " + str(post_no))
        post_spans = []
        for sentence_no, tree in enumerate(post, start=1):
            print("Sentence " + str(sentence_no))
            sentence_spans = []
            # parse_node fills sentence_spans in place; its return value is not needed here
            parse_node(tree, sentence_spans)
            post_spans.append(sentence_spans)
        all_spans.append(post_spans)
        # report the post that has just finished (same number as the "Post: i" line above)
        print("Post: " + str(post_no) + " ready at " + datetime.now().strftime("%H:%M:%S"))
    if save_path is not None:
        torch.save(all_spans, save_path)
    return all_spans
def filter_constituency_spans(constituency_spans,
                              required_nodes=("S", "SBARQ", "SQ", "SBAR", "VP"),
                              save_path='new_constituency_spans.pt'):
    """ KEEP only the spans whose node label is one of the required labels.
    Input: constituency_spans - List [List[List[{label,leaves}]]] (posts > sentences > spans)
           required_nodes - labels to keep (to be potentially extended)
           save_path - file the result is written to with torch.save;
                       pass None to skip saving
    Output: same nesting as the input; every sentence list contains only the
            dicts whose "label" is in required_nodes. The kept dicts are the
            original objects, not copies; sentences with no match stay as [].
    Side effect: prints "Post: i ready at HH:MM:SS" after each post.
    """
    wanted = frozenset(required_nodes)
    filtered_posts = []
    for post_no, post in enumerate(constituency_spans, start=1):
        filtered_posts.append(
            [[span for span in sentence if span["label"] in wanted] for sentence in post]
        )
        # report the post that has just been filtered, numbered from 1
        print("Post: " + str(post_no) + " ready at " + datetime.now().strftime("%H:%M:%S"))
    if save_path is not None:
        torch.save(filtered_posts, save_path)
    return filtered_posts
def create_post_constituents_dictionary():
    """ CREATE a dictionary with the post text as id and its constituents as value.
    Reads the posts with ex.read_posts('st1_public_data/st1_train_inc_text.csv')
    and the filtered spans (post > sentence > {label, leaves} dicts) from
    'new_constituency_spans.pt'. Entry i of the spans is assigned to post i.
    Output: Dict[post text -> spans of that post]; also saved to 'post_spans.pt'.
    NOTE: posts with identical text share one key, so a later post overwrites
          an earlier one.
    """
    posts = ex.read_posts('st1_public_data/st1_train_inc_text.csv')
    # Filtered spans:
    # Post, sentence, dictionaries(label, array)
    constituency_spans = torch.load('new_constituency_spans.pt')

    if len(posts) != len(constituency_spans):
        # Pairing is positional, so a length mismatch means the inputs may be misaligned.
        print("Warning: " + str(len(posts)) + " posts but "
              + str(len(constituency_spans)) + " span entries; pairing the common prefix only")

    # zip pairs every post with its spans; iterating over range(len(posts) - 1)
    # would silently leave the last post out of the dictionary.
    post_spans = {post.text: spans for post, spans in zip(posts, constituency_spans)}
    torch.save(post_spans, 'post_spans.pt')
    return post_spans