Skip to content

Commit a76f44c

Browse files
authored
perf(medcat-v2): optimize hot path allocations and lookups (#401)
* perf: share PerDocumentTokenCache across entities during training

  Previously a new PerDocumentTokenCache was created per entity inside the
  training loop, discarding cached token validity checks. For a document with
  N entities and M tokens this caused N×M validity checks instead of M. Now
  the cache is created once per document and shared.

* perf: use dict lookup for CUI index in TwoStepLinker disambiguation

  Replace O(n) list.index() call per CUI candidate with O(1) dict lookup.
  The cui_to_idx dict is built once before the loop.

* perf: use bisect for O(log n) token lookup in get_tokens

  Both regex and spacy Document.get_tokens() previously scanned all tokens
  linearly to find those within a character range. With bisect on the
  pre-built char_indices array, lookup is O(log n) instead of O(n). For a
  1000-token document with 50 entities this reduces comparisons from
  ~50,000 to ~500.

* perf: use mp.get_context instead of global set_start_method

  Replace mp.set_start_method("spawn", force=True) which mutates
  process-wide state on every batch run with mp.get_context("spawn") passed
  to ProcessPoolExecutor. This avoids silently overriding the start method
  for other libraries (e.g. PyTorch DataLoaders).
1 parent 8a630ba commit a76f44c

5 files changed

Lines changed: 34 additions & 21 deletions

File tree

medcat-v2/medcat/cat.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -482,14 +482,16 @@ def _multiprocess(
482482
saver: Optional[BatchAnnotationSaver],
483483
) -> Iterator[tuple[str, Union[dict, Entities, OnlyCUIEntities]]]:
484484
external_processes = n_process - 1
485+
mp_context = None
485486
if self.FORCE_SPAWN_MP:
486487
import multiprocessing as mp
487488
logger.info(
488-
"Forcing multiprocessing start method to 'spawn' "
489+
"Using 'spawn' multiprocessing context "
489490
"due to known compatibility issues with 'fork' and "
490491
"libraries using threads or native extensions.")
491-
mp.set_start_method("spawn", force=True)
492-
with ProcessPoolExecutor(max_workers=external_processes) as executor:
492+
mp_context = mp.get_context("spawn")
493+
with ProcessPoolExecutor(max_workers=external_processes,
494+
mp_context=mp_context) as executor:
493495
while True:
494496
try:
495497
yield from self._mp_one_batch_per_process(

medcat-v2/medcat/components/linking/context_based_linker.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,11 @@ def _process_entity_train(self, doc: MutableDocument,
110110
def _train_on_doc(self, doc: MutableDocument,
111111
ner_ents: list[MutableEntity]
112112
) -> Iterator[MutableEntity]:
113-
# Run training
113+
# Run training — share cache across all entities in the document
114+
per_doc_valid_token_cache = PerDocumentTokenCache()
114115
for entity in ner_ents:
115116
yield from self._process_entity_train(
116-
doc, entity, PerDocumentTokenCache())
117+
doc, entity, per_doc_valid_token_cache)
117118

118119
def _process_entity_nt_w_name(
119120
self, doc: MutableDocument,

medcat-v2/medcat/components/linking/two_step_context_based_linker.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,11 @@ def _do_training(self,
132132
per_doc_valid_token_cache=per_doc_valid_token_cache)
133133

134134
def _train_for_tuis(self, doc: MutableDocument) -> None:
135-
# Run training
135+
# Run training — share cache across all entities in the document
136+
per_doc_valid_token_cache = PerDocumentTokenCache()
136137
for entity in doc.ner_ents:
137138
self._process_entity_train_tuis(
138-
doc, entity, PerDocumentTokenCache())
139+
doc, entity, per_doc_valid_token_cache)
139140

140141
def _check_similarity(self, cui: str, context_similarity: float) -> bool:
141142
th_type = self.config.components.linking.similarity_threshold_type
@@ -284,10 +285,11 @@ def _preprocess_disamb(self, ent: MutableEntity, name: str,
284285
return
285286
per_cui_type_sims = pew[ent]
286287
cnf_2step = self.two_step_config
288+
cui_to_idx = {c: i for i, c in enumerate(cuis)}
287289
for cui, type_sim in per_cui_type_sims.items():
288-
if cui not in cuis:
290+
if cui not in cui_to_idx:
289291
continue
290-
cui_index = cuis.index(cui)
292+
cui_index = cui_to_idx[cui]
291293
cui_sim = similarities[cui_index]
292294
ts_coef = sigmoid(
293295
cnf_2step.alpha_sharpness * (

medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import re
22
from typing import cast, Optional, Iterator, overload, Union, Any, Type
33
from collections import defaultdict
4+
from bisect import bisect_left, bisect_right
45
import warnings
56

67
from medcat.tokenizing.tokens import (
@@ -224,6 +225,7 @@ def __init__(self, text: str, tokens: Optional[list[MutableToken]] = None
224225
) -> None:
225226
self.text = text
226227
self._tokens = tokens or []
228+
self._char_indices: list[int] = []
227229
self.ner_ents: list[MutableEntity] = []
228230
self.linked_ents: list[MutableEntity] = []
229231

@@ -256,12 +258,12 @@ def __len__(self) -> int:
256258

257259
def get_tokens(self, start_index: int, end_index: int
258260
) -> list[MutableToken]:
259-
tkns = []
260-
for tkn in self:
261-
if (tkn.base.char_index >= start_index and
262-
tkn.base.char_index <= end_index):
263-
tkns.append(tkn)
264-
return tkns
261+
if self._char_indices:
262+
lo = bisect_left(self._char_indices, start_index)
263+
hi = bisect_right(self._char_indices, end_index)
264+
return self._tokens[lo:hi]
265+
return [tkn for tkn in self
266+
if start_index <= tkn.base.char_index <= end_index]
265267

266268
def __iter__(self) -> Iterator[MutableToken]:
267269
yield from self._tokens
@@ -387,6 +389,7 @@ def __call__(self, text: str) -> MutableDocument:
387389
doc._tokens.append(Token(doc, token, token_w_ws,
388390
start_index, tkn_index,
389391
False, False))
392+
doc._char_indices.append(start_index)
390393
return doc
391394

392395
@classmethod

medcat-v2/medcat/tokenizing/spacy_impl/tokens.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import Iterator, Union, Optional, overload, cast, Any
2+
from bisect import bisect_left, bisect_right
23
import logging
34

45
from spacy.tokens import Token as SpacyToken
@@ -196,6 +197,7 @@ class Document:
196197

197198
def __init__(self, delegate: SpacyDoc) -> None:
198199
self._delegate = delegate
200+
self._char_indices: Optional[list[int]] = None
199201
self.ner_ents: list[MutableEntity] = []
200202
self.linked_ents: list[MutableEntity] = []
201203

@@ -225,14 +227,17 @@ def __getitem__(self, index: Union[int, slice]
225227
def __len__(self) -> int:
226228
return len(self._delegate)
227229

230+
def _ensure_char_indices(self) -> list[int]:
231+
if self._char_indices is None:
232+
self._char_indices = [tkn.idx for tkn in self._delegate]
233+
return self._char_indices
234+
228235
def get_tokens(self, start_index: int, end_index: int
229236
) -> list[MutableToken]:
230-
tkns = []
231-
for tkn in self:
232-
if (tkn.base.char_index >= start_index and
233-
tkn.base.char_index <= end_index):
234-
tkns.append(tkn)
235-
return tkns
237+
char_indices = self._ensure_char_indices()
238+
lo = bisect_left(char_indices, start_index)
239+
hi = bisect_right(char_indices, end_index)
240+
return [Token(self._delegate[i]) for i in range(lo, hi)]
236241

237242
def set_addon_data(self, path: str, val: Any) -> None:
238243
if not self._delegate.has_extension(path):

0 commit comments

Comments
 (0)