-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_parse_helper.py
297 lines (250 loc) · 11.1 KB
/
data_parse_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
import numpy as np
import spacy
from spacy.attrs import LOWER, LIKE_URL, LIKE_EMAIL, IS_OOV
import dateparser
import pandas as pd
import re
import operator
##Testing this library for reverse name lookups
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Official building name -> list of aliases used in announcements.
# The first alias of every entry is the building's three-letter code;
# the remaining aliases are informal names. Entry order is significant
# to any code that returns the first matching building.
nickname_lookup = {
    "George Eastman Hall": ["EAS", "Eastman"],
    "George H. Clark Gymnasium": ["CLK", "Clark Gym", "Gym"],
    "Student Alumni Union": [
        "SAU",
        "Campus Center",
        "Student Center",
        "Davis Room",
        "Fireside Lounge",
    ],
    "Wallace Library": ["WAL", "Wallace Center", "Library"],
    "Liberal Arts Hall": ["LBR", "COLA"],
    "James E. Booth Hall": ["BOO", "Booth"],
    "James E. Gleason Hall": ["GLE", "KGCOE", "Gleason"],
    "Golisano Hall": ["GOL", "Golisano", "GCCIS"],
    "Frank E. Gannett Hall": ["GAN", "Gannett"],
    "Thomas Gosnell Hall": [
        "GOS",
        "Gosnell",
        "Gosnell Atrium",
        "College of Science Atrium",
    ],
    "Lewis P. Ross Hall": ["ROS", "Ross"],
    "Max Lowenthal Hall": [
        "LOW",
        "Saunders",
        "College of Business",
        "COB",
        "Lowenthal",
    ],
    "Simone Center for Innovation and Entrepreneurship": [
        "SIH",
        "Simone Center",
        "Fish Bowl",
        "Toilet Bowl",
        "Innovation Hall",
        "Entrepreneurship Hall",
        "Student Innovation Hall",
    ],
}
def make_lookup_table(path="RIT_Buildings_Rooms4.csv"):
    """Load the building/room lookup table from a headerless CSV.

    The file is read as eight unnamed columns, of which only the first two
    are kept and exposed as ``building`` and ``room``. Malformed rows are
    skipped instead of aborting the load.

    Args:
        path: File path (or file-like object) handed to ``pandas.read_csv``.
            Defaults to the CSV file name this module has always used.

    Returns:
        A DataFrame with exactly the columns ``building`` and ``room``.
    """
    read_options = dict(
        header=None,
        names=["building", "room", "c", "d", "e", "f", "g", "h"],
        usecols=["building", "room"],
        encoding="latin",
    )
    try:
        # Current spelling; ``error_bad_lines`` was deprecated in pandas 1.3
        # and removed in pandas 2.0.
        return pd.read_csv(path, on_bad_lines="skip", **read_options)
    except TypeError:
        # Older pandas releases do not accept ``on_bad_lines``.
        return pd.read_csv(path, error_bad_lines=False, **read_options)
##Takes our lookup table df and list of noun phrases to run distance measure on
def run_room_lookup(df, phrases):
    """Fuzzy-match each phrase against the known buildings and rooms.

    Room names that contain a word of three or more letters are shortened
    by removing digits and dashes (e.g. ``"1400 - Reading Room"`` becomes
    ``"Reading Room"``) before matching; the shortened form replaces the
    original in the candidate list.

    Args:
        df: Lookup table with ``building`` and ``room`` columns.
        phrases: Iterable of text snippets to match.

    Returns:
        A ``(building_pairs, room_pairs, lookup_shortened_text)`` tuple.
        The two ``*_pairs`` lists hold one ``process.extractOne`` result per
        phrase; ``lookup_shortened_text`` maps each shortened room name back
        to the original room string.
    """
    # Compiled once, as raw strings, so "\-" is not an invalid str escape.
    has_word = re.compile(r"[a-zA-Z]{3,}")
    digits_and_dashes = re.compile(r"[0-9\-]")

    buildings = df.building.drop_duplicates().tolist()
    rooms = df.room.drop_duplicates().tolist()

    lookup_shortened_text = {}
    for i, room in enumerate(rooms):
        # Blank cells load as NaN (a float); anything that is not text
        # cannot be searched with a regex, so leave it untouched.
        if not isinstance(room, str):
            continue
        if has_word.search(room):
            shortened = digits_and_dashes.sub("", room).strip()
            rooms[i] = shortened
            lookup_shortened_text[shortened] = room

    building_pairs = [process.extractOne(p, buildings) for p in phrases]
    room_pairs = [process.extractOne(p, rooms) for p in phrases]
    return building_pairs, room_pairs, lookup_shortened_text
def get_info(texts, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """Run spaCy over one text and collect pizza, date, time and place hints.

    Args:
        texts: A single text string; it is wrapped in a one-element list
            before being handed to ``nlp.pipe``.
        skip: Unused; kept for interface compatibility.
        attr: Unused; kept for interface compatibility.
        merge: Unused; kept for interface compatibility.
        nlp: A loaded spaCy pipeline. When ``None``, ``en_core_web_lg`` is
            loaded (and a notice is printed).
        **kwargs: Forwarded to ``nlp.pipe``.

    Returns:
        A ``(pizza_loc, dates, time, buildings)`` tuple where
        ``pizza_loc`` is a list of character offsets of "pizza" tokens,
        ``dates`` and ``time`` are lists of ``[text, start_char, end_char]``
        for DATE and TIME entities, and ``buildings`` is a list of
        noun-chunk texts whose root is a prepositional object.
    """
    if nlp is None:
        print("loading spacy model")
        # Load english tokenizer, tagger, parser, NER and word vectors
        nlp = spacy.load('en_core_web_lg')
    texts = [texts]

    pizza_loc = []
    dates = []
    time = []
    buildings = []
    for doc in nlp.pipe(texts, **kwargs):
        # Noun chunks that are objects of a preposition ("in GOL-1400")
        # are the location candidates.
        for chunk in doc.noun_chunks:
            if chunk.root.dep_ == "pobj":
                buildings.append(chunk.text)
        for token in doc:
            if token.lower_ == "pizza":
                pizza_loc.append(token.idx)
        for ent in doc.ents:
            entity_record = [ent.text, ent.start_char, ent.end_char]
            # ``label_`` is the string label; ``label`` (no underscore) is
            # an integer ID and never equals "TIME".
            if ent.label_ == "DATE":
                dates.append(entity_record)
            elif ent.label_ == "TIME":
                time.append(entity_record)
    return pizza_loc, dates, time, buildings
def building_room_pair_search(df, building_room_pair):
    """Resolve the building code of the first code/room match to its name.

    Args:
        df: Lookup table; not used here.
        building_room_pair: List of regex match tuples whose element 0 is
            the building code and element 2 is the room.

    Returns:
        The official building name whose first alias in ``nickname_lookup``
        equals the code, or ``None`` when no entry matches.
    """
    ##TODO this is only going to get one room...very dirty :(
    first_match = building_room_pair[0]
    code = first_match[0]
    _room = first_match[2]  # read but not used yet (see TODO above)
    ##Going to lookup using nickname lookup
    official_names = (
        name for name, aliases in nickname_lookup.items()
        if aliases[0] == code
    )
    return next(official_names, None)
##Secret Sauce?
def dirty_boye_building_search(df, text):
    """Find the first building whose nickname appears in *text*.

    The comparison is case-insensitive substring matching. Nicknames of
    three characters or fewer are ignored.

    Args:
        df: Lookup table; not used here.
        text: The announcement text to scan.

    Returns:
        The official building name of the first matching nickname in
        ``nickname_lookup`` order, or ``None`` when nothing matches.
    """
    for official_name, aliases in nickname_lookup.items():
        for alias in aliases:
            if len(alias) <= 3:
                continue
            if alias.lower() in text.lower():
                return official_name
    return None
def dirty_boye_room_search(df, text, building_name):
    """Return the first room of *building_name* that occurs verbatim in *text*.

    Args:
        df: Lookup table with ``building`` and ``room`` columns.
        text: The announcement text to scan.
        building_name: Official building name used to filter ``df``.

    Returns:
        The matching room string, or the string ``"None"`` when no room of
        that building appears in the text.
    """
    candidate_rooms = df.loc[df.building == building_name, "room"].tolist()
    for room in candidate_rooms:
        # Blank cells load as NaN (a float) and ``float in str`` raises
        # TypeError, so only textual room values are tested.
        if isinstance(room, str) and room in text:
            return room
    return "None"
"""
for lack of a better solution,
we will employ this time_finder method that
will be extremely gross and hard to understand
to parse times correctly. Sorry.
"""
def time_finder(text):
    """Return the first clock time in *text* that carries an AM/PM marker.

    A time is one or two digits, optional ``:MM`` minutes, an optional
    single whitespace character and one of ``AM``, ``PM``, ``am``, ``pm``.
    The marker must end at a word boundary so that ordinary words such as
    "amazing" are not mistaken for it.

    Args:
        text: The announcement text to scan.

    Returns:
        The time with any whitespace and trailing dash removed
        (e.g. ``"5:30pm"``), or ``"Unsure of time"`` when none is found.
    """
    time_pattern = re.compile(
        r"(\d{1,2})(:\d{2})?\s?(?:(AM|PM|am|pm)\b)?(-)?"
    )
    # TODO
    # In some cases times come in as 11-12pm...
    # In that case we need to ditch the "-" and take the number before it.
    # The trouble is deciding whether the time before the dash is am or pm
    # by looking at the time after the dash: in the example above you
    # couldn't just take the pm from 12 and use it on 11, because that
    # would make it 11pm.
    for hour, minutes, meridiem, _dash in time_pattern.findall(text):
        # Bare numbers (dates, room numbers) have no marker; skip them.
        if meridiem:
            return hour + minutes + meridiem
    return "Unsure of time"
def get_datelocation(text, nlp=None):
    """Extract the event date, building, room and start time from *text*.

    The location is resolved with three strategies, tried in order:

    1. A regex for an explicit ``CODE-ROOM`` / ``CODE ROOM`` pair; the code
       is resolved through ``building_room_pair_search``.
    2. A case-insensitive nickname search (``dirty_boye_building_search``)
       followed by a verbatim room search in that building.
    3. Fuzzy matching of the text's noun phrases against the lookup table
       (``run_room_lookup``).

    Args:
        text: The announcement text.
        nlp: Optional loaded spaCy pipeline, forwarded to ``get_info``.

    Returns:
        A ``(parsed_date, building_name, room_name, time_to_return)`` tuple.
        ``room_name`` is the string ``"None"`` when no room was found.
        When no date entity is found, or the fuzzy search yields nothing,
        the tuple ``("None", "None", "None", "None")`` is returned.
    """
    no_result = ("None", "None", "None", "None")

    pizza_loc, dates, time, phrases = get_info(text, nlp=nlp)
    df = make_lookup_table()

    # Explicit default instead of relying on an unbound-variable error at
    # the final return when no room could be determined.
    room_name = "None"

    # Try regex search to see if there is a simple building room pair
    building_room_pair = re.findall(
        r"([A-Z]{3})(\-|\s)(\w\d{3}|\d{4})", text
    )
    if building_room_pair:
        # NOTE(review): only the first pair is considered, and an unknown
        # code leaves building_name as None without trying the fallbacks.
        building_name = building_room_pair_search(df, building_room_pair)
        room_code = building_room_pair[0][2]
        building_rows = df[df.building == building_name].values.tolist()
        if any(room_code in row for row in building_rows):
            room_name = room_code
    else:
        # Nickname-based search over the raw text
        building_name = dirty_boye_building_search(df, text)
        if building_name is not None:
            # Room search off found building name
            room_name = dirty_boye_room_search(df, text, building_name)
        else:
            b, r, lookup_shortend_text = run_room_lookup(df, phrases)
            ##Adds up results from b and r (building and room)
            sums = [
                room_hit[1] + building_hit[1]
                for building_hit, room_hit in zip(b, r)
            ]
            if not sums:
                return no_result
            index, _ = max(enumerate(sums), key=operator.itemgetter(1))

            ##Where you choose a room or a building, whichever scored higher
            if r[index][1] > b[index][1]:
                column, temp_name = "room", r[index][0]
            else:
                column, temp_name = "building", b[index][0]
            if temp_name not in nickname_lookup:
                # Map a shortened room name back to its original spelling.
                temp_name = lookup_shortend_text.get(temp_name, temp_name)
            full_loc_name = df[df[column] == temp_name]
            try:
                building_name, room_name = full_loc_name.values[0]
            except (IndexError, ValueError):
                # No row in the lookup table matched the fuzzy result.
                return no_result

    ##This is the naive way of getting closest date...take most verbose
    ##and leave the rest...
    dates_as_text = [d[0] for d in dates]
    if not dates_as_text:
        return no_result
    date_to_return = max(dates_as_text, key=len)
    if "tomorrow," in date_to_return:
        date_to_return = date_to_return.split("tomorrow,")[1].strip()
    parsed_date = dateparser.parse(date_to_return)

    # NOTE(review): get_info stores each TIME entity as
    # [text, start_char, end_char], so ``time[0]`` is a list rather than a
    # string; presumably this branch therefore always ends up in the
    # time_finder fallback. Left as-is to keep the returned value's type
    # unchanged for callers - confirm the intended behaviour before fixing.
    try:
        time_to_return = time[0]
        if "-" in time_to_return:
            time_split = time_to_return.split()
            ##Only taking the first time split by "-", its the start time
            time_to_return = dateparser.parse(time_split[0])
        else:
            time_to_return = dateparser.parse(time_to_return)
    except Exception:
        # Best effort: any failure above falls back to the regex finder.
        time_to_return = time_finder(text)

    return parsed_date, building_name, room_name, time_to_return
# Sample announcement loaded at import time. The context manager makes sure
# the file handle is closed as soon as the contents have been read.
with open("pizza_test.txt") as sample_file:
    text = sample_file.read()
# Sample announcement: a short notice with a dash-separated time range
# ("11-1pm") and an explicit building-code/room pair ("GOL-1400").
# Note that the closing triple quote comes after the get_datelocation line,
# so that line is part of the string and is not executed.
text2 = """CS1 Exam Review
Sunday, November 12th, 11-1pm in GOL-1400
Come review for the second CS1 exam. Pizza will be provided.
date, location = get_datelocation(text)
"""
'''
The following commented-out code is the beginning of
the logic for how we will get the dates closest to each other
in the text, using a naive/simple method of just idx distance.
Leave it for now.
'''
# Sample announcement: a longer e-mail style message containing a date
# ("tomorrow, Friday, January 19th"), a time range ("12PM-1PM"), an explicit
# building-code/room pair ("LOW-1225") and a building nickname.
text3 = """
The Simone Center for Innovation and Entrepreneurship invites you to join us tomorrow, Friday, January 19th from 12PM-1PM for Scott A. Snyder, PH.D.’s talk on Technology and Entrepreneurship in LOW-1225. All students, faculty and alumni are welcome.\n
Dr. Snyder is currently the Senior Vice President, Managing Director and Chief Technology and Innovation officer for Safeguard Scientifics, Inc. He has vast experience in the growth of technology related businesses with a particular focus on IoT, AI and enhanced security. He is also an accomplished author and a Senior Fellow at the Warton School as well as adjunct faculty at the Moore School of Engineering at the University of Pennsylvania.
Pizza will be provided. Please email Stephen Burke at [email protected] in the Simone Center for Innovation and Entrepreneurship with any questions.
In addition, requests for interpreters can be submitted to myaccess.rit.edu. Interpreters can be provided upon request and are subject to availability.
"""
##for date in dates:
## date_text = date[0]
## distance = min(abs(date[1]-pizza_loc[0]),
## abs(date[2]-pizza_loc[0]))