Skip to content

Commit 8f0450f

Browse files
committed
Replace PTB3 escapes with characters in the parsing.
1 parent beb06d1 commit 8f0450f

File tree

2 files changed: +27 -7 lines changed

corenlp/corenlp.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,11 +146,11 @@ def parse_bracketed(s):
146146
# Load key-value pairs, substituting as necessary
147147
for attr, val in re.findall(r"([^=\s]*)=([^\s]*)", s):
148148
if val in temp:
149-
val = temp[val]
149+
val = remove_escapes(temp[val])
150150
if attr == 'Text':
151-
word = val
151+
word = remove_escapes(val)
152152
else:
153-
attrs[attr] = val
153+
attrs[attr] = remove_escapes(val)
154154
return (word, attrs)
155155

156156

@@ -171,7 +171,7 @@ def parse_parser_results(text):
171171
state = STATE_TEXT
172172

173173
elif state == STATE_TEXT:
174-
sentence['text'] = line
174+
sentence['text'] = remove_escapes(line)
175175
state = STATE_WORDS
176176

177177
elif state == STATE_WORDS:
@@ -186,7 +186,7 @@ def parse_parser_results(text):
186186
state = STATE_DEPENDENCY
187187
sentence['parsetree'] = " ".join(sentence['parsetree'])
188188
else:
189-
sentence['parsetree'].append(line)
189+
sentence['parsetree'].append(remove_escapes(line))
190190

191191
elif state == STATE_DEPENDENCY:
192192
if len(line) == 0:
@@ -197,7 +197,9 @@ def parse_parser_results(text):
197197
rel, left, leftindex, right, rightindex = split_entry
198198
leftindex = re.sub("[^0-9]", "", leftindex)
199199
rightindex = re.sub("[^0-9]", "", rightindex)
200-
sentence['dependencies'].append(tuple([rel, left, leftindex, right, rightindex]))
200+
sentence['dependencies'].append(tuple([rel,
201+
remove_escapes(left), leftindex, remove_escapes(right),
202+
rightindex]))
201203

202204
elif state == STATE_COREFERENCE:
203205
if "Coreference set" in line:
@@ -474,6 +476,24 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
474476

475477
return parse_xml_output(input_folder, corenlp_path, memory, raw_output=raw_output)
476478

479+
# PTB3 bracket escape tokens mapped to the literal characters they stand for.
# Only the all-upper-case and all-lower-case spellings are recognised;
# mixed-case variants (e.g. "-Lrb-") are deliberately left untouched.
_PTB3_ESCAPES = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LSB-": "[",
    "-RSB-": "]",
    "-LCB-": "{",
    "-RCB-": "}",
}
_PTB3_ESCAPES.update(
    dict((token.lower(), char) for token, char in list(_PTB3_ESCAPES.items()))
)

# Compiled once at import time instead of on every call.
_PTB3_ESCAPE_RE = re.compile(
    "|".join(re.escape(token) for token in _PTB3_ESCAPES)
)


def remove_escapes(text):
    """Replace PTB3 bracket escape tokens in ``text`` with literal brackets.

    The tokens ``-LRB-``, ``-RRB-``, ``-LSB-``, ``-RSB-``, ``-LCB-`` and
    ``-RCB-`` (and their all-lower-case forms) are replaced by ``(``, ``)``,
    ``[``, ``]``, ``{`` and ``}`` respectively, wherever they occur.

    :param text: the string to clean up; may be empty or ``None``.
    :returns: ``text`` with every escape token replaced.  Falsy input
        (``""`` or ``None``) is returned unchanged, so an empty string
        stays an empty string rather than turning into ``None``.
    """
    if not text:
        # Nothing to substitute; hand the value back as-is so callers that
        # store the result never see "" silently become None.
        return text
    return _PTB3_ESCAPE_RE.sub(
        lambda match: _PTB3_ESCAPES[match.group()], text
    )
477497

478498
if __name__ == '__main__':
479499
"""

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
AUTHOR = "Hiroyoshi Komatsu, Dustin Smith, Aditi Muralidharan"
77
AUTHOR_EMAIL = "[email protected]"
88
URL = "https://github.com/Wordseer/stanford-corenlp-python"
9-
VERSION = "3.3.5-0"
9+
VERSION = "3.3.6-0"
1010

1111
setup(
1212
name=NAME,

0 commit comments

Comments
 (0)