@@ -405,19 +405,53 @@ def _train_supervised_for_project(self,
405405 devalue_others )
406406
def _prepare_doc_with_anns(
        self, doc: MutableDocument, ann_doc: MedCATTrainerExportDocument,
        anns: list[MedCATTrainerExportAnnotation]) -> None:
    """Overwrite the document's NER and linked entities from annotations.

    For every annotation the tokens covering ``[start, end]`` are looked
    up in ``doc`` and turned into an entity through the pipeline. An
    annotation whose entity cannot be built (the pipeline raises
    ``ValueError``) is reported via ``_warn_on_error`` and left out.

    Args:
        doc: The document whose ``ner_ents`` and ``linked_ents`` lists
            are replaced in place.
        ann_doc: The exported document the annotations belong to; only
            its ``'id'`` and ``'name'`` are read, for error reporting.
        anns: The annotations to convert; ``'start'``, ``'end'``,
            ``'cui'`` and ``'value'`` are read from each.

    Raises:
        ValueError: Propagated from ``_warn_on_error`` when it decides to
            raise instead of logging.
    """
    resolved = []
    for annotation in anns:
        span_tokens = doc.get_tokens(annotation['start'], annotation['end'])
        try:
            entity = self._pipeline.entity_from_tokens_in_doc(
                span_tokens, doc)
        except ValueError as err:
            # NOTE(review): a skipped annotation leaves the entity lists
            # shorter than `anns`; the supervised training loop zips
            # annotations with `linked_ents` positionally, so later pairs
            # may shift by one - confirm this is handled by the caller.
            self._warn_on_error(
                err, doc.base.text,
                (annotation['cui'], annotation['value'],
                 annotation['start'], annotation['end']),
                (None, ann_doc['id'], ann_doc['name']))
            continue
        resolved.append(entity)
    # The same entities populate the NER list first and then, as a
    # separate list, the linked entities.
    for target in (doc.ner_ents, doc.linked_ents):
        target.clear()
        target.extend(resolved)
420426
def _warn_on_error(self, ve: BaseException, cur_text: str,
                   mut_context_start: tuple[str, str, int, int],
                   mut_context_end: tuple[MutableEntity | None, str, str]):
    """Report an annotation that could not be identified in its document.

    Builds a short excerpt of ``cur_text`` around the annotation span,
    with the span itself wrapped in ``<``/``>`` and ``[...]`` marking
    either side where the document text was cut off. The excerpt is put
    into a message that is then either raised or logged.

    Args:
        ve: The exception that triggered the report. It becomes the
            cause of the raised error, or the ``exc_info`` of the log
            record.
        cur_text: Full text of the document the annotation belongs to.
        mut_context_start: ``(cui, value, start, end)`` of the
            annotation; ``start``/``end`` are character offsets into
            ``cur_text``.
        mut_context_end: ``(entity or None, document id, document name)``.

    Raises:
        ValueError: If ``self.strict_train`` is truthy; chained from
            ``ve``. Otherwise a warning is logged and nothing is raised.
    """
    start, end = mut_context_start[2:]
    context_window = 20  # characters shown on each side of the span
    splitter_left, splitter_right = "<", ">"
    text_length = len(cur_text)
    context_start = max(start - context_window, 0)
    # `context_end` is an exclusive slice bound, so it is clamped to the
    # text length itself; clamping to `len - 1` would always drop the
    # final character of the document from the excerpt.
    context_end = min(end + context_window, text_length)
    context = (cur_text[context_start:start] +
               splitter_left +
               cur_text[start:end] +
               splitter_right +
               cur_text[end:context_end])
    if context_start > 0:
        context = "[...]" + context
    if context_end < text_length:
        context += "[...]"
    msg_template = (
        "Failed to identify '%s' (%s) ([%d:%d]) "
        "in '%s' %s within document %s | %s, "
        "skipping training for this example")
    msg_context = (
        *mut_context_start, context, *mut_context_end)
    if self.strict_train:
        raise ValueError(msg_template % msg_context) from ve
    else:
        # Lazy %-formatting: the logger interpolates only if the record
        # is actually emitted.
        logger.warning(msg_template, *msg_context, exc_info=ve)
454+ # 480+ project
421455 def _train_supervised_for_project2 (self ,
422456 docs : list [MedCATTrainerExportDocument ],
423457 current_document : int ,
@@ -433,7 +467,7 @@ def _train_supervised_for_project2(self,
433467 with temp_changed_config (self .config .components .linking ,
434468 'train' , False ):
435469 mut_doc = self .caller (doc ['text' ])
436- self ._prepare_doc_with_anns (mut_doc , doc ['annotations' ])
470+ self ._prepare_doc_with_anns (mut_doc , doc , doc ['annotations' ])
437471
438472 # Compatibility with old output where annotations are a list
439473 for ann , mut_entity in zip (doc ['annotations' ], mut_doc .linked_ents ):
@@ -461,31 +495,10 @@ def _train_supervised_for_project2(self,
461495 mut_entity = mut_entity , negative = deleted ,
462496 devalue_others = devalue_others )
463497 except (ValueError , KeyError ) as ve :
464- context_window = 20 # characters
465- splitter_left , splitter_right = "<" , ">"
466- cur_text = doc ['text' ]
467- context_start = max (start - context_window , 0 )
468- context_end = min (end + context_window , len (cur_text ) - 1 )
469- context = (cur_text [context_start : start ] +
470- splitter_left +
471- cur_text [start : end ] +
472- splitter_right +
473- cur_text [end : context_end ])
474- if context_start > 0 :
475- context = "[...]" + context
476- if context_end < len (cur_text ) - 1 :
477- context += "[...]"
478- msg_template = (
479- "Failed to identify '%s' (%s) ([%d:%d]) "
480- "in '%s' %s within document %s | %s, "
481- "skipping training for this example" )
482- msg_context = (
483- cui , ann ['value' ], ann ['start' ], ann ['end' ],
484- context , mut_entity , doc ['id' ], doc ['name' ])
485- if self .strict_train :
486- raise ValueError (msg_template % msg_context ) from ve
487- else :
488- logger .warning (msg_template , * msg_context , exc_info = ve )
498+ self ._warn_on_error (
499+ ve , doc ['text' ],
500+ (cui , ann ['value' ], ann ['start' ], ann ['end' ]),
501+ (mut_entity , doc ['id' ], doc ['name' ]))
489502 if train_from_false_positives :
490503 fps : list [MutableEntity ] = get_false_positives (doc , mut_doc )
491504
0 commit comments