bug(medcat): CU-869cunfx7 Fix supervised training order of operations issue (#408)

mart-r · github-actions[bot] · web-flow · commit ed2a619d1937 · 2026-04-09T13:27:21.000+01:00
* CU-869cunfx7: Add tests for exception catching

* CU-869cunfx7: Update test for better flow

* CU-869cunfx7: Remove unwarranted deprecation warnings from tokenizers

* CU-869cunfx7: Add deprecation warning to pipeline where it belongs

* CU-869cunfx7: Remove unused import

* CU-869cunfx7: Fix issue with exception raising introduced in release 2.7 / PR 374

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/medcat-v2/medcat/pipeline/pipeline.py b/medcat-v2/medcat/pipeline/pipeline.py
@@ -44,13 +44,6 @@ def create_entity(self, doc: MutableDocument,
             doc, token_start_index, token_end_index, label)
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
-        warnings.warn(
-            "The `medcat.pipeline.pipeline.entity_from_tokens` method is"
-            "depreacated and subject to removal in a future release. Please "
-            "use `medcat.pipeline.pipeline.entity_from_tokens_in_doc` instead.",
-            DeprecationWarning,
-            stacklevel=2
-        )
         return self.tokenizer.entity_from_tokens(tokens)
 
     def entity_from_tokens_in_doc(
@@ -352,6 +345,14 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         Returns:
             MutableEntity: The resulting entity.
         """
+        warnings.warn(
+            "The `medcat.pipeline.pipeline.Pipeline.entity_from_tokens` method is"
+            "depreacated is subject to removal in a future release. Please use "
+            "`medcat.pipeline.pipeline.Pipeline.entity_from_tokens_in_doc` "
+            "instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
         return self._tokenizer.entity_from_tokens(tokens)
 
     def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -2,7 +2,6 @@
 from typing import cast, Optional, Iterator, overload, Union, Any, Type
 from collections import defaultdict
 from bisect import bisect_left, bisect_right
-import warnings
 
 from medcat.tokenizing.tokens import (
     BaseToken, BaseEntity, BaseDocument,
@@ -343,14 +342,6 @@ def create_entity(self, doc: MutableDocument,
         # return Entity(span)
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
-        warnings.warn(
-            "The `medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens` method is"
-            "depreacated and subject to removal in a future release. Please use "
-            "`medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens_in_doc` "
-            "instead.",
-            DeprecationWarning,
-            stacklevel=2
-        )
         if not tokens:
             raise ValueError("Need at least one token for an entity")
         doc = cast(Token, tokens[0])._doc
diff --git a/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py b/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
@@ -3,7 +3,6 @@
 import os
 import shutil
 import logging
-import warnings
 
 import spacy
 from spacy.tokens import Span
@@ -78,14 +77,6 @@ def create_entity(self, doc: MutableDocument,
         return Entity(span)
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
-        warnings.warn(
-            "The `medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens` method is"
-            "depreacated and subject to removal in a future release. Please use "
-            "`medcat.tokenizing.tokenizers.Tokenizer.entity_from_tokens_in_doc` "
-            "instead.",
-            DeprecationWarning,
-            stacklevel=2
-        )
         if not tokens:
             raise ValueError("Need at least one token for an entity")
         spacy_tokens = cast(list[Token], tokens)
diff --git a/medcat-v2/medcat/tokenizing/tokenizers.py b/medcat-v2/medcat/tokenizing/tokenizers.py
@@ -34,7 +34,18 @@ def create_entity(self, doc: MutableDocument,
         pass
 
     def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
-        """Deprecated: use entity_from_tokens_in_doc instead."""
+        """Get an entity from the list of tokens.
+
+        This will create a new instance instead of looking for existing entity.
+        This method should be used only if/when there was no existing entity
+        within the specified document for the given span of tokens.
+
+        Args:
+            tokens (list[MutableToken]): List of tokens.
+
+        Returns:
+            MutableEntity: The resulting entity.
+        """
         pass
 
     def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
diff --git a/medcat-v2/medcat/trainer.py b/medcat-v2/medcat/trainer.py
@@ -405,19 +405,53 @@ def _train_supervised_for_project(self,
                     devalue_others)
 
     def _prepare_doc_with_anns(
-            self, doc: MutableDocument,
+            self, doc: MutableDocument, ann_doc: MedCATTrainerExportDocument,
             anns: list[MedCATTrainerExportAnnotation]) -> None:
         ents = []
         for ann in anns:
             tkns = doc.get_tokens(ann['start'], ann['end'])
-            ents.append(self._pipeline.entity_from_tokens_in_doc(tkns, doc))
+            try:
+                ents.append(self._pipeline.entity_from_tokens_in_doc(tkns, doc))
+            except ValueError as err:
+                self._warn_on_error(
+                    err, doc.base.text,
+                    (ann['cui'], ann['value'], ann['start'], ann['end']),
+                    (None, ann_doc['id'], ann_doc['name']))
         # set NER ents
         doc.ner_ents.clear()
         doc.ner_ents.extend(ents)
         # duplicate for linked as well, but in a a separate list
         doc.linked_ents.clear()
         doc.linked_ents.extend(ents)
 
+    def _warn_on_error(self, ve: BaseException, cur_text: str,
+                       mut_context_start: tuple[str, str, int, int],
+                       mut_context_end: tuple[MutableEntity | None, str, str]):
+        start, end = mut_context_start[2:]
+        context_window = 20  # characters
+        splitter_left, splitter_right = "<", ">"
+        context_start = max(start - context_window, 0)
+        context_end = min(end + context_window, len(cur_text) - 1)
+        context = (cur_text[context_start: start] +
+                    splitter_left +
+                    cur_text[start: end] +
+                    splitter_right +
+                    cur_text[end: context_end])
+        if context_start > 0:
+            context = "[...]" + context
+        if context_end < len(cur_text) - 1:
+            context += "[...]"
+        msg_template = (
+            "Failed to identify '%s' (%s) ([%d:%d]) "
+            "in '%s' %s within document %s | %s, "
+            "skipping training for this example")
+        msg_context = (
+            *mut_context_start, context, *mut_context_end)
+        if self.strict_train:
+            raise ValueError(msg_template % msg_context) from ve
+        else:
+            logger.warning(msg_template, *msg_context, exc_info=ve)
+
     def _train_supervised_for_project2(self,
                                        docs: list[MedCATTrainerExportDocument],
                                        current_document: int,
@@ -433,7 +467,7 @@ def _train_supervised_for_project2(self,
             with temp_changed_config(self.config.components.linking,
                                      'train', False):
                 mut_doc = self.caller(doc['text'])
-            self._prepare_doc_with_anns(mut_doc, doc['annotations'])
+            self._prepare_doc_with_anns(mut_doc, doc, doc['annotations'])
 
             # Compatibility with old output where annotations are a list
             for ann, mut_entity in zip(doc['annotations'], mut_doc.linked_ents):
@@ -461,31 +495,10 @@ def _train_supervised_for_project2(self,
                         mut_entity=mut_entity, negative=deleted,
                         devalue_others=devalue_others)
                 except (ValueError, KeyError) as ve:
-                    context_window = 20  # characters
-                    splitter_left, splitter_right = "<", ">"
-                    cur_text = doc['text']
-                    context_start = max(start - context_window, 0)
-                    context_end = min(end + context_window, len(cur_text) - 1)
-                    context = (cur_text[context_start: start] +
-                               splitter_left +
-                               cur_text[start: end] +
-                               splitter_right +
-                               cur_text[end: context_end])
-                    if context_start > 0:
-                        context = "[...]" + context
-                    if context_end < len(cur_text) - 1:
-                        context += "[...]"
-                    msg_template = (
-                        "Failed to identify '%s' (%s) ([%d:%d]) "
-                        "in '%s' %s within document %s | %s, "
-                        "skipping training for this example")
-                    msg_context = (
-                        cui, ann['value'], ann['start'], ann['end'],
-                        context, mut_entity, doc['id'], doc['name'])
-                    if self.strict_train:
-                        raise ValueError(msg_template % msg_context) from ve
-                    else:
-                        logger.warning(msg_template, *msg_context, exc_info=ve)
+                    self._warn_on_error(
+                        ve, doc['text'],
+                        (cui, ann['value'], ann['start'], ann['end']),
+                        (mut_entity, doc['id'], doc['name']))
             if train_from_false_positives:
                 fps: list[MutableEntity] = get_false_positives(doc, mut_doc)
 
diff --git a/medcat-v2/tests/test_trainer.py b/medcat-v2/tests/test_trainer.py
@@ -252,6 +252,30 @@ def test_training_happens_on_linked_ents_on_doc(self):
                 doc, ent = args.kwargs['mut_doc'], args.kwargs['mut_entity']
                 self.assertIn(ent, doc.linked_ents)
 
+    def test_empty_token_annotation_is_skipped_when_not_strict(self):
+        self.trainer.strict_train = False
+        with unittest.mock.patch.object(
+                self.trainer._pipeline, "entity_from_tokens_in_doc",
+                side_effect=ValueError("No tokens found in span")), \
+                unittest.mock.patch.object(
+                FakeMutDoc, "get_tokens", return_value=[]), \
+                unittest.mock.patch.object(self.trainer, "add_and_train_concept"):
+            try:
+                self.train(self.TRAIN_DATA)
+            except ValueError as err:
+                self.fail(f"Unexpected ValueError for empty-token annotation: {err}")
+
+    def test_empty_token_annotation_raises_when_strict(self):
+        self.trainer.strict_train = True
+        with unittest.mock.patch.object(
+                self.trainer._pipeline, "entity_from_tokens_in_doc",
+                side_effect=ValueError("No tokens found in span")), \
+                unittest.mock.patch.object(
+                FakeMutDoc, "get_tokens", return_value=[]), \
+                unittest.mock.patch.object(self.trainer, "add_and_train_concept"):
+            with self.assertRaises(ValueError):
+                self.train(self.TRAIN_DATA)
+
 
 class FromSratchBase(TrainedModelTests):
     RNG_SEED = 42