@@ -240,87 +240,101 @@ def __init__(
240
240
241
241
def __call__(self, obj, tokenizer=None):
    """Convert a dict-like annotated document into a tokenized ``Doc``.

    Parameters
    ----------
    obj : dict
        Source record. Keys read here: ``"text"`` (required, may be None),
        ``"doc_id"`` (falls back to ``FILENAME``), ``"entities"`` and
        ``"relations"`` (both optional). Each entity provides
        ``"fragments"`` (dicts with ``"begin"``/``"end"`` character
        offsets), ``"label"``, ``"entity_id"`` and optionally
        ``"attributes"`` (a mapping, or a list of ``{"label", "value"}``
        records) and ``"notes"`` (records with a ``"value"`` key).
    tokenizer : callable, optional
        Tokenizer used for this call. Falls back to ``self.tokenizer``,
        then to ``get_current_tokenizer()``.

    Returns
    -------
    The document produced by the tokenizer, with its spans set through
    ``set_spans`` and its ``note_id`` extension filled in.

    Raises
    ------
    ValueError
        If anything goes wrong while converting the record. The message
        carries the note id and the original exception is chained as the
        cause.
    """
    note_id = obj.get("doc_id", obj.get(FILENAME))
    try:
        tok = tokenizer or self.tokenizer or get_current_tokenizer()
        doc = tok(obj["text"] or "")
        doc._.note_id = note_id

        # entity_id -> spans created for that entity (used to wire relations)
        entities = {}
        spans = []

        # Make sure every destination attribute exists as a Span extension
        # before any value is assigned to it.
        for dst in (
            *(
                ()
                if self.span_attributes is None
                else self.span_attributes.values()
            ),
            *self.default_attributes,
        ):
            if not Span.has_extension(dst):
                Span.set_extension(dst, default=None)

        for ent in obj.get("entities") or ():
            # Without split_fragments, a discontinuous entity is collapsed
            # into one span covering all of its fragments.
            fragments = (
                [
                    {
                        "begin": min(f["begin"] for f in ent["fragments"]),
                        "end": max(f["end"] for f in ent["fragments"]),
                    }
                ]
                if not self.split_fragments
                else ent["fragments"]
            )

            # Normalise the attributes to a private dict once per entity:
            # it does not depend on the fragment, and working on a copy
            # keeps the caller's record untouched.
            raw_attributes = ent.get("attributes") or {}
            if isinstance(raw_attributes, list):
                attributes = {a["label"]: a["value"] for a in raw_attributes}
            else:
                attributes = dict(raw_attributes)

            # Store the notes in the normalised mapping (not in
            # ent["attributes"], which may be missing or a list).
            notes = ent.get("notes")
            if self.notes_as_span_attribute and notes:
                attributes[self.notes_as_span_attribute] = "|".join(
                    note["value"] for note in notes
                )

            for fragment in fragments:
                span = doc.char_span(
                    fragment["begin"],
                    fragment["end"],
                    label=ent["label"],
                    alignment_mode="expand",
                )
                for label, value in attributes.items():
                    # With an explicit mapping, unmapped labels resolve to
                    # None and are skipped below.
                    new_name = (
                        self.span_attributes.get(label, None)
                        if self.span_attributes is not None
                        else label
                    )
                    if self.span_attributes is None and not Span.has_extension(
                        new_name
                    ):
                        Span.set_extension(new_name, default=None)

                    if new_name:
                        # A valueless attribute is treated as a flag.
                        value = True if value is None else value
                        if not self.keep_raw_attribute_values:
                            value = (
                                True
                                if value in ("True", "true")
                                else False
                                if value in ("False", "false")
                                else value
                            )
                        span._.set(new_name, value)

                entities.setdefault(ent["entity_id"], []).append(span)
                spans.append(span)

        set_spans(doc, spans, span_setter=self.span_setter)

        # Fill in defaults only where the annotation did not set a value.
        for attr, value in self.default_attributes.items():
            for span in spans:
                if span._.get(attr) is None:
                    span._.set(attr, value)

        for relation in obj.get("relations") or ():
            relation_label = (
                relation["relation_label"]
                if "relation_label" in relation
                else relation["label"]
            )
            from_entity_id = relation["from_entity_id"]
            to_entity_id = relation["to_entity_id"]

            # Relations pointing at unknown entity ids are ignored.
            # NOTE(review): assumes a dict-valued `rel` Span extension is
            # registered elsewhere - confirm.
            for head in entities.get(from_entity_id, ()):
                for tail in entities.get(to_entity_id, ()):
                    head._.rel.setdefault(relation_label, set()).add(tail)
    except Exception as e:
        # Re-raise with the note id so the failing record can be located,
        # keeping the root cause explicitly chained.
        raise ValueError(f"Error when processing {note_id}") from e

    return doc
326
340
0 commit comments