@@ -146,11 +146,11 @@ def parse_bracketed(s):
146146 # Load key-value pairs, substituting as necessary
147147 for attr , val in re .findall (r"([^=\s]*)=([^\s]*)" , s ):
148148 if val in temp :
149- val = temp [val ]
149+ val = remove_escapes ( temp [val ])
150150 if attr == 'Text' :
151- word = val
151+ word = remove_escapes ( val )
152152 else :
153- attrs [attr ] = val
153+ attrs [attr ] = remove_escapes ( val )
154154 return (word , attrs )
155155
156156
@@ -171,7 +171,7 @@ def parse_parser_results(text):
171171 state = STATE_TEXT
172172
173173 elif state == STATE_TEXT :
174- sentence ['text' ] = line
174+ sentence ['text' ] = remove_escapes ( line )
175175 state = STATE_WORDS
176176
177177 elif state == STATE_WORDS :
@@ -186,7 +186,7 @@ def parse_parser_results(text):
186186 state = STATE_DEPENDENCY
187187 sentence ['parsetree' ] = " " .join (sentence ['parsetree' ])
188188 else :
189- sentence ['parsetree' ].append (line )
189+ sentence ['parsetree' ].append (remove_escapes ( line ) )
190190
191191 elif state == STATE_DEPENDENCY :
192192 if len (line ) == 0 :
@@ -197,7 +197,9 @@ def parse_parser_results(text):
197197 rel , left , leftindex , right , rightindex = split_entry
198198 leftindex = re .sub ("[^0-9]" , "" , leftindex )
199199 rightindex = re .sub ("[^0-9]" , "" , rightindex )
200- sentence ['dependencies' ].append (tuple ([rel , left , leftindex , right , rightindex ]))
200+ sentence ['dependencies' ].append (tuple ([rel ,
201+ remove_escapes (left ), leftindex , remove_escapes (right ),
202+ rightindex ]))
201203
202204 elif state == STATE_COREFERENCE :
203205 if "Coreference set" in line :
@@ -474,6 +476,24 @@ def batch_parse(input_folder, corenlp_path=DIRECTORY, memory="3g", raw_output=Fa
474476
475477 return parse_xml_output (input_folder , corenlp_path , memory , raw_output = raw_output )
476478
# Penn Treebank (PTB3) bracket escape tokens and the literal characters they
# stand for. Only the all-lowercase and all-uppercase spellings are recognised.
_PTB3_ESCAPES = {
    "-lrb-": "(",
    "-rrb-": ")",
    "-lsb-": "[",
    "-rsb-": "]",
    "-lcb-": "{",
    "-rcb-": "}",
    "-LRB-": "(",
    "-RRB-": ")",
    "-LSB-": "[",
    "-RSB-": "]",
    "-LCB-": "{",
    "-RCB-": "}",
}

# Compiled once at import time: remove_escapes() is called for every token and
# every output line, so rebuilding the alternation on each call is wasted work.
_PTB3_ESCAPE_RE = re.compile("|".join(re.escape(key) for key in _PTB3_ESCAPES))


def remove_escapes(text):
    """Replace PTB3 bracket escape tokens in ``text`` with literal brackets.

    Every occurrence of ``-LRB-``, ``-RRB-``, ``-LSB-``, ``-RSB-``, ``-LCB-``
    or ``-RCB-`` (or its all-lowercase spelling) is replaced by the matching
    ``(``, ``)``, ``[``, ``]``, ``{`` or ``}`` character.

    Falsy input (``""`` or ``None``) is returned unchanged, so an empty string
    stays an empty string instead of silently turning into ``None``.
    """
    if not text:
        # Nothing to substitute; hand the value back as-is so callers that
        # later join or index the result still get what they passed in.
        return text
    return _PTB3_ESCAPE_RE.sub(lambda match: _PTB3_ESCAPES[match.group()], text)
477497
478498if __name__ == '__main__' :
479499 """
0 commit comments