forked from svs192219/ner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnerdata.py
186 lines (154 loc) · 6.12 KB
/
nerdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# nerdata.py
# Abstraction to bundle words with POS and chunks for featurization
class Token:
    """A single word bundled with its POS tag and syntactic chunk tag.

    Attributes:
        word: surface form of the token.
        pos: part-of-speech tag of the token.
        chunk: syntactic chunk tag of the token.
    """

    def __init__(self, word, pos, chunk):
        self.word, self.pos, self.chunk = word, pos, chunk

    def __repr__(self):
        # Tokens display as their bare word so token lists stay readable.
        surface_form = self.word
        return surface_form
# Thin wrapper around a start and end index coupled with a label, representing,
# e.g., a chunk PER over the span (3,5). Indices are semi-inclusive, so (3,5)
# contains tokens 3 and 4 (0-based indexing).
class Chunk:
    """A labeled span of tokens, e.g. the chunk PER over the span (3, 5).

    The span is half-open: ``start_idx`` is included and ``end_idx`` is
    excluded, so (3, 5) covers tokens 3 and 4 (0-based indexing).

    Chunks compare equal when start, end and label all match, and they are
    hashable so they can be stored in sets and used as dict keys.
    """

    def __init__(self, start_idx, end_idx, label):
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.label = label

    def __repr__(self):
        return "(" + repr(self.start_idx) + ", " + repr(self.end_idx) + ", " + self.label + ")"

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return (self.start_idx == other.start_idx
                    and self.end_idx == other.end_idx
                    and self.label == other.label)
        return False

    def __ne__(self, other):
        # Needed explicitly on Python 2, where != is not derived from __eq__.
        return not self.__eq__(other)

    def __hash__(self):
        # Hash the fields as a tuple. Summing the individual hashes (the
        # previous approach) is order-insensitive, so spans such as (3, 5)
        # and (5, 3), or (2, 6) and (3, 5), always collided for equal labels.
        return hash((self.start_idx, self.end_idx, self.label))
# Thin wrapper over a sequence of Tokens representing a sentence and an optional set of chunks
# representing NER labels, which are also stored as BIO tags
class LabeledSentence:
    """A sentence (sequence of tokens) with optional NER chunks.

    Attributes:
        tokens: the tokens of the sentence.
        chunks: the NER chunks over the sentence, or None if unlabeled.
        bio_tags: one BIO tag per token derived from ``chunks`` via
            ``bio_tags_from_chunks``, or None if ``chunks`` is None.
    """

    def __init__(self, tokens, chunks=None):
        self.tokens = tokens
        self.chunks = chunks
        if chunks is None:
            self.bio_tags = None
        else:
            self.bio_tags = bio_tags_from_chunks(self.chunks, len(self.tokens))

    def __repr__(self):
        token_reprs = [repr(tok) for tok in self.tokens]
        # chunks is optional; iterating None here used to raise TypeError,
        # making unlabeled sentences impossible to print or debug.
        if self.chunks is None:
            chunk_reprs = None
        else:
            chunk_reprs = [repr(chunk) for chunk in self.chunks]
        return repr(token_reprs) + "\n" + repr(chunk_reprs)

    def __len__(self):
        # Sentence length is measured in tokens.
        return len(self.tokens)

    def get_bio_tags(self):
        """Return the per-token BIO tags, or None for an unlabeled sentence."""
        return self.bio_tags
# We store NER tags as strings, but they contain two pieces:
# a coarse tag type (BIO) and a label (PER), e.g. B-PER
def isB(ner_tag):
    """Return True if ner_tag starts with "B", i.e. it begins a chunk (e.g. B-PER)."""
    begins_chunk = ner_tag.startswith("B")
    return begins_chunk
def isI(ner_tag):
    """Return True if ner_tag starts with "I", i.e. it continues a chunk (e.g. I-PER)."""
    continues_chunk = ner_tag.startswith("I")
    return continues_chunk
def isO(ner_tag):
    """Return True if ner_tag is exactly "O", the tag for tokens outside any chunk."""
    outside_any_chunk = (ner_tag == "O")
    return outside_any_chunk
# Gets the label component of the NER tag: e.g., returns PER for B-PER
def get_tag_label(ner_tag):
    """Return the label part of an NER tag, e.g. "PER" for "B-PER".

    The first two characters (the coarse B/I type and the separator) are
    dropped. Tags of length two or less carry no label, so None is returned
    for them (e.g. "O").
    """
    has_label = len(ner_tag) > 2
    return ner_tag[2:] if has_label else None
# Convert BIO tags to (start, end, label) chunk representations
# (start, end) are semi-inclusive, meaning that in the sentence
# He met Barack Obama yesterday
# Barack Obama has the span (2, 4)
# N.B. this method only works because chunks are non-overlapping in this data
def chunks_from_bio_tag_seq(bio_tags):
    """Convert a sequence of BIO tags into a list of Chunk spans.

    Args:
        bio_tags: iterable of tag strings such as "B-PER", "I-PER", "O".

    Returns:
        A list of Chunk objects in left-to-right order. Spans are half-open:
        (start, end) covers tokens start .. end - 1.

    A B tag always starts a new chunk (closing any open one), an O tag
    closes the open chunk, and an I tag neither opens nor closes anything:
    its label is not checked against the open chunk, and an I tag with no
    open chunk is silently ignored.
    """
    chunks = []
    curr_tok_start = -1
    curr_tok_label = ""  # "" means no chunk is currently open
    for idx, tag in enumerate(bio_tags):
        if isB(tag):
            # Close the chunk in progress (if any) right before this token,
            # then open a new chunk starting here.
            if curr_tok_label != "":
                chunks.append(Chunk(curr_tok_start, idx, curr_tok_label))
            curr_tok_label = get_tag_label(tag)
            curr_tok_start = idx
        elif isI(tag):
            # Continuation tag: the open chunk (if any) simply keeps growing.
            continue
        else:  # isO(tag)
            if curr_tok_label != "":
                chunks.append(Chunk(curr_tok_start, idx, curr_tok_label))
            curr_tok_label = ""
            curr_tok_start = -1
    # A chunk that runs through the last tag is never followed by a B or O
    # tag to close it, so flush it here; previously it was dropped.
    # idx is always bound here: an open chunk implies at least one iteration.
    if curr_tok_label != "":
        chunks.append(Chunk(curr_tok_start, idx + 1, curr_tok_label))
    return chunks
# Converts a chunk representation back to BIO tags
def bio_tags_from_chunks(chunks, sent_len):
    """Convert chunk spans back into one BIO tag per token.

    Args:
        chunks: sequence of objects with ``start_idx``, ``end_idx`` (half-open
            span) and a string ``label``.
        sent_len: number of tokens in the sentence.

    Returns:
        A list of ``sent_len`` tags: "B-<label>" on the first token of a
        chunk, "I-<label>" on its remaining tokens, and "O" elsewhere. If
        several chunks cover a token, the first one in ``chunks`` wins.
    """
    tags = []
    # range and a generator (not xrange / len(filter(...))) keep this working
    # on both Python 2 and Python 3, where filter() returns a lazy iterator.
    for i in range(sent_len):
        covering = next(
            (chunk for chunk in chunks if chunk.start_idx <= i < chunk.end_idx),
            None)
        if covering is None:
            tags.append("O")
        elif i == covering.start_idx:
            tags.append("B-" + covering.label)
        else:
            tags.append("I-" + covering.label)
    return tags
# Reads a dataset in the CoNLL format from a file
# The format is one token per line:
# [word] [POS] [syntactic chunk] *potential junk column* [NER tag]
# One blank line appears after each sentence
def read_data(file):
    """Read a CoNLL-format dataset and return its sentences.

    Each non-blank line holds one token as space-separated fields:
    word, POS tag, syntactic chunk, an optional extra column, and the NER
    tag (always the last field). Non-blank lines that do not have 4 or 5
    fields are skipped. A blank line ends the current sentence.

    Args:
        file: path of the file to read.

    Returns:
        A list of LabeledSentence objects, one per sentence, in file order.
    """
    sentences = []
    curr_tokens = []
    curr_bio_tags = []
    # The with-block guarantees the handle is closed, even if parsing raises.
    with open(file) as f:
        for line in f:
            stripped = line.strip()
            if stripped != "":
                fields = stripped.split(" ")
                if len(fields) == 4 or len(fields) == 5:
                    curr_tokens.append(Token(fields[0], fields[1], fields[2]))
                    # N.B. fields[-1] because there are weird extra fields in .train and .testa
                    curr_bio_tags.append(fields[-1])
            elif len(curr_tokens) > 0:
                sentences.append(LabeledSentence(curr_tokens, chunks_from_bio_tag_seq(curr_bio_tags)))
                curr_tokens = []
                curr_bio_tags = []
    # A file that does not end with a blank line leaves its final sentence
    # pending; emit it instead of silently dropping it.
    if len(curr_tokens) > 0:
        sentences.append(LabeledSentence(curr_tokens, chunks_from_bio_tag_seq(curr_bio_tags)))
    return sentences
# Evaluates the guess sentences with respect to the gold sentences
def print_evaluation(gold_sentences, guess_sentences):
    """Print chunk-level F1, precision and recall of guesses against gold.

    Sentences are paired up in order. A predicted chunk counts as correct
    only if an equal chunk appears in the gold sentence it is paired with.

    Args:
        gold_sentences: sequence of sentences exposing a ``chunks`` collection
            of hashable chunk objects (the reference annotation).
        guess_sentences: sequence of predicted sentences, same shape.

    Returns:
        None; the scores are written to standard output as one line.
    """
    correct = 0
    num_pred = 0
    num_gold = 0
    for gold, guess in zip(gold_sentences, guess_sentences):
        correct += len(set(guess.chunks) & set(gold.chunks))
        num_pred += len(guess.chunks)
        num_gold += len(gold.chunks)
    # Guard each ratio against an empty denominator.
    prec = correct / float(num_pred) if num_pred != 0 else 0
    rec = correct / float(num_gold) if num_gold != 0 else 0
    if prec == 0 and rec == 0:
        f1 = 0
    else:
        f1 = 2 * prec * rec / (prec + rec)
    message = ("Labeled F1: " + "{0:.2f}".format(f1 * 100)
               + ", precision: " + repr(correct) + "/" + repr(num_pred)
               + " = " + "{0:.2f}".format(prec * 100)
               + ", recall: " + repr(correct) + "/" + repr(num_gold)
               + " = " + "{0:.2f}".format(rec * 100))
    # print(<single string>) behaves identically as a Python 2 statement and
    # a Python 3 function call; the bare print statement broke Python 3.
    print(message)
# Writes labeled_sentences to outfile in the CoNLL format
def print_output(labeled_sentences, outfile):
    """Write labeled sentences to a file in the CoNLL format.

    Each token becomes one line, "word pos chunk bio_tag", and every sentence
    is followed by a blank line.

    Args:
        labeled_sentences: iterable of sentences exposing ``tokens`` and
            ``get_bio_tags()``; every token has ``word``, ``pos`` and ``chunk``.
        outfile: path of the file to write; it is created or overwritten.
    """
    # The with-block closes the file even if a write raises part-way through.
    with open(outfile, 'w') as f:
        for sentence in labeled_sentences:
            bio_tags = sentence.get_bio_tags()
            # enumerate replaces the Python 2 only xrange index loop; indexing
            # bio_tags still raises if there are fewer tags than tokens.
            for i, tok in enumerate(sentence.tokens):
                f.write(tok.word + " " + tok.pos + " " + tok.chunk + " " + bio_tags[i] + "\n")
            f.write("\n")