-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnewCorruption.py
executable file
·98 lines (74 loc) · 2.24 KB
/
newCorruption.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import spacy
import numpy as np
import pandas as pd
import utils
nlp = spacy.load("en_core_web_sm")
def test(x):
#due to the mapping between words and integers the DAE does not deal
#well with numbers it has not seen before.
#in the training phase, the prices have been split into '£' and the value
#while doing NER.
#this is just a quick fix, but there has to be a real fix for the test phase at least
try:
return vocab_to_int[x]
except:
try:
return vocab_to_int['£'+x]
except:
print(x)
count = 0
def newCorrupt(sentence):
#get names - propernounse and sometimes nouns.
#get adjectives
#NER will get the cardinal values i.e., numbers.
#get everything in NER
#finally make the whole thing a set.
#using NER doesn't add much to it right now
global count
count += 1
print(count)
sentence = nlp(sentence)
toString = set()
for ent in sentence.ents:
try:
x = float(ent.text)
num = True
except:
num = False
if num == False:
for word in ent.text.split(' '):
toString.add((word))
else:
toString.add((ent.text))
for token in sentence:
pos = token.pos_
add = True
if pos == 'PROPN':
toString.add((token.text))
if pos == "ADJ" or pos == "ADV":
toString.add((token.text))
if pos == "NOUN":
add = np.random.choice([1, 0, 0, 1])
if add:
toString.add((token.text))
curroptedS = ''
for string in toString:
curroptedS += ' ' + string
return curroptedS
if __name__ == "__main__":
trainset = pd.read_csv('./data/questions.csv')
cor = trainset['ref'].apply(lambda x: newCorrupt(x))
trainset = trainset.assign(corrupted=cor)
as_tokens = trainset['corrupted'].apply(lambda x: [test(each) for each in x.split()])
trainset = trainset.assign(tokenized_corrupted=as_tokens)
trainset.to_csv('./data/processedTrainset_ques.csv')
#
#
# sentence = ""
# import spacy
#
# nlp = spacy.load("en_core_web_sm")
# doc = nlp(sentence)
#
# for ent in doc.ents:
# print(ent.text, ent.start_char, ent.end_char, ent.label_)