Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvement: new TNM regex #366

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 57 additions & 33 deletions edsnlp/pipes/ner/tnm/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,21 @@ class Metastasis(TnmEnum):


class TNM(pydantic.BaseModel):
prefix: Optional[Prefix] = None
tumour: Optional[Tumour] = None
tumour_specification: Optional[Specification] = None
tumour_prefix: Optional[str] = None
tumour: Optional[str] = None
tumour_specification: Optional[str] = None
tumour_suffix: Optional[str] = None
node: Optional[Node] = None
node_specification: Optional[Specification] = None
node_prefix: Optional[str] = None
node: Optional[str] = None
node_specification: Optional[str] = None
node_suffix: Optional[str] = None
metastasis: Optional[Metastasis] = None
resection_completeness: Optional[int] = None
metastasis_prefix: Optional[str] = None
metastasis: Optional[str] = None
metastasis_specification: Optional[str] = None
pleura: Optional[str] = None
resection: Optional[str] = None
resection_specification: Optional[str] = None
resection_loc: Optional[str] = None
version: Optional[str] = None
version_year: Optional[int] = None

Expand All @@ -112,32 +118,43 @@ def validate_year(cls, v):
def norm(self) -> str:
norm = []

if self.prefix is not None:
norm.append(str(self.prefix))
if self.tumour_prefix:
norm.append(f"{self.tumour_prefix or ''}")

if (
(self.tumour is not None)
| (self.tumour_specification is not None)
| (self.tumour_suffix is not None)
):
norm.append(f"T{str(self.tumour or '')}")
norm.append(f"{str(self.tumour_specification or '')}")
norm.append(f"{str(self.tumour_suffix or '')}")

if (
(self.node is not None)
| (self.node_specification is not None)
| (self.node_suffix is not None)
):
norm.append(f"N{str(self.node or '')}")
norm.append(f"{str(self.node_specification or '')}")
norm.append(f"{str(self.node_suffix or '')}")
if self.tumour:
norm.append(f"T{self.tumour}")
if self.tumour_specification:
norm.append(f"{self.tumour_specification or ''}")
if self.tumour_suffix:
norm.append(f"{self.tumour_suffix or ''}")

if self.node_prefix:
norm.append(f"{self.node_prefix or ''}")

if self.metastasis is not None:
if self.node:
norm.append(f"N{self.node}")
if self.node_specification:
norm.append(f"{self.node_specification or ''}")
if self.node_suffix:
norm.append(f"{self.node_suffix or ''}")

if self.metastasis_prefix:
norm.append(f"{self.metastasis_prefix or ''}")

if self.metastasis:
norm.append(f"M{self.metastasis}")
if self.metastasis_specification:
norm.append(f"{self.metastasis_specification or ''}")

if self.pleura:
norm.append(f"PL{self.pleura}")

if self.resection_completeness is not None:
norm.append(f"R{self.resection_completeness}")
if self.resection:
norm.append(f"R{self.resection}")
if self.resection_specification:
norm.append(f"{self.resection_specification or ''}")
if self.resection_loc:
norm.append(f"{self.resection_loc or ''}")

if self.version is not None and self.version_year is not None:
norm.append(f" ({self.version.upper()} {self.version_year})")
Expand Down Expand Up @@ -182,14 +199,21 @@ def dict(
set_keys = set(d.keys())
for k in set_keys.intersection(
{
"prefix",
"tumour_prefix",
"tumour",
"node",
"metastasis",
"tumour_specification",
"node_specification",
"tumour_suffix",
"node_prefix",
"node",
"node_specification",
"node_suffix",
"metastasis_prefix",
"metastasis",
"metastasis_specification",
"pleura",
"resection",
"resection_specification",
"resection_loc",
}
):
v = d[k]
Expand Down
12 changes: 7 additions & 5 deletions edsnlp/pipes/ner/tnm/patterns.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
prefix_pattern = r"(?P<prefix>[cpPyraum]p?)"
tumour_pattern = r"T\s?(?P<tumour>([0-4o]|is))?(?P<tumour_specification>[abcdx]|mi)?"
prefix_pattern = r"(?P<tumour_prefix>[cpPyraum]p?)"
tumour_pattern = (
r"T\s?(?P<tumour>([0-4o]|is|[Xx]))?(?P<tumour_specification>[abcdx]|mi)?"
)
tumour_pattern += r"(?:\((?P<tumour_suffix>[^()]{1,10})\))?"
node_pattern = r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}N\s?(?P<node>[0-3o]|x)"
node_pattern += (
r"(?P<node_specification>[abcdx]|mi)?(?:\((?P<node_suffix>[^()]{1,10})\))?)"
)

metastasis_pattern = (
r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P<metastasis>([01o]|x))x?)" # noqa: E501
r"(\s{,2}\/?\s{,2}([cpPyraum]p?)?\s{,2}M\s?(?P<metastasis>([01o]|x))x?)"
)
resection_completeness = r"(\s{,2}\/?\s{,2}R\s?(?P<resection_completeness>[012]))"
resection_pattern = r"(\s{,2}\/?\s{,2}R\s?(?P<resection>[012]))"

version_pattern = (
r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)"
Expand All @@ -23,6 +25,6 @@
tnm_pattern += prefix_pattern + r"\s{,2}?" + f"({tumour_pattern})"
tnm_pattern += r"(\s{,2}" + f"{node_pattern})?"
tnm_pattern += r"(\s{,2}" + f"{metastasis_pattern})?"
tnm_pattern += r"(\s{,2}" + f"{resection_completeness})?"
tnm_pattern += r"(\s{,2}" + f"{resection_pattern})?"
tnm_pattern += f"({spacer}{version_pattern})?"
tnm_pattern = r"(?:\b|^)" + tnm_pattern + r"(?:\b|$)"
92 changes: 92 additions & 0 deletions edsnlp/pipes/ner/tnm/patterns_new.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Regular-expression building blocks for TNM staging mentions.
#
# Each component below is a plain pattern string exposing named groups; they
# are concatenated at the bottom of the module into ``tnm_pattern_new``.
# Components after the tumour (T) part are all optional in the final pattern.

tumour_pattern = (
    r"(?P<tumour_prefix>[cpyramP]{1,2}\s?)?"  # Optional tumour prefix
    r"T\s?"  # 'T' followed by optional space
    r"(?P<tumour>([0-4]|is|[Xx]))"  # Tumour size (required if 'T' is present)
    r"(?P<tumour_specification>[abcdx]|mi)?"  # Optional tumour specification
    r"(?:\s?\((?P<tumour_suffix>[^()]{1,10})\))?"  # Optional tumour suffix
)

node_pattern = (
    r"(?P<node_prefix>[cpyraP]{1,2}\s?)?"  # Optional node prefix
    r"N\s?"  # 'N' followed by optional space
    r"(?P<node>[Xx01234\+])"  # Node size/status (required if 'N' is present)
    # NOTE: the sign classes are `[-+]`; a comma inside a character class is a
    # literal comma, not a separator, so `[-,+]` would also accept "i," / "mol,".
    r"(?P<node_specification>[abcdx]|mi|sn|i[-+]|mol[-+]|\(mi\)|\(sn\)|"
    r"\(i[-+]\)|\(mol[-+]\)|\(\d+\s*/\s*\d+\))?"  # Optional node specification
    r"(?:\s?\((?P<node_suffix>[^()]{1,10})\))?"  # Optional node suffix
)

metastasis_pattern = (
    r"(?P<metastasis_prefix>[cpyraP]{1,2}\s?)?"  # Optional metastasis prefix
    r"M\s?"  # 'M' followed by optional space
    r"(?P<metastasis>[Xx0123\+])"  # Metastasis status (required if 'M' is present)
    r"(?P<metastasis_specification>[abcd]|i\+|mol\+|cy\+|\(i\+\)|\(mol\+\)|"
    r"\(cy\+\)|PUL|OSS|HEP|BRA|LYM|OTH|MAR|PLE|PER|ADR|SKI)?"  # Optional specification
)

pleura_pattern = (
    r"PL\s?(?P<pleura>([0123]|x))?"  # Optional pleura status (for lung cancer)
)

resection_pattern = (
    r"R\s?"
    r"(?P<resection>[Xx012])?"  # Optional resection completeness
    r"(?P<resection_specification>(is|cy\+|\(is\)|\(cy\+\))?)?"  # Optional spec
    r"(?P<resection_loc>(\((?P<r_loc>[a-z]+)\)[,;\s]*)*)?"  # Optional localization
)

version_pattern = (
    r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)"  # TNM version
    r"\s+([éeE]ditions|[éeE]d\.?)?\s*"
    r"(?P<version_year>\d{4}|\d{2})\)?"  # Year of the version
)

TNM_space = r"(\s*[,\/]?\s*|\n)"  # Allow space, comma, or slash as delimiters

# Unnamed probes for "an N (resp. M) component starts here". They mirror the
# mandatory head of node_pattern / metastasis_pattern but declare no named
# groups: reusing the full component patterns inside the lookahead would define
# every group name twice, which the standard-library `re` module rejects.
_node_start = r"(?:[cpyraP]{1,2}\s?)?N\s?[Xx01234\+]"
_metastasis_start = r"(?:[cpyraP]{1,2}\s?)?M\s?[Xx0123\+]"

# We need to exclude patterns like 'T1,' or 'T2.' (a bare T value followed by
# punctuation) when they are not followed by a node OR a metastasis section.
# The alternation is explicit: appending "?" to a concatenated component only
# affects that component's last sub-group, it does not make the component
# optional.
exclude_pattern = (
    r"(?!T\s*[0-4]\s*[.,\/](?!\s*(?:"
    + _node_start
    + "|"
    + _metastasis_start
    + ")))"
)

# Full pattern: T is mandatory; N, M, PL, R and the version are optional and
# may be separated by the delimiters accepted by TNM_space.
tnm_pattern_new = (
    r"(?:\b|^)"
    + exclude_pattern
    + r"(?:"
    + r"(?P<T_component>"
    + tumour_pattern
    + ")"
    + TNM_space
    + "?"
    + r"(?P<N_component>"
    + node_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<M_component>"
    + metastasis_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<PL_component>"
    + pleura_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<R_component>"
    + resection_pattern
    + ")?"
    + TNM_space
    + "?"
    + r"(?P<V_component>"
    + version_pattern
    + ")?"
    + r")"
    + r"(?:\b|$|\n)"
)
4 changes: 2 additions & 2 deletions edsnlp/pipes/ner/tnm/tnm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from edsnlp.utils.typing import cast

from .model import TNM
from .patterns import tnm_pattern
from .patterns_new import tnm_pattern_new


class TNMMatcher(BaseNERComponent):
Expand Down Expand Up @@ -75,7 +75,7 @@ def __init__(
nlp: Optional[PipelineProtocol],
name: str = "tnm",
*,
pattern: Optional[Union[List[str], str]] = tnm_pattern,
pattern: Optional[Union[List[str], str]] = tnm_pattern_new,
attr: str = "TEXT",
label: str = "tnm",
span_setter: SpanSetterArg = {"ents": True, "tnm": True},
Expand Down
3 changes: 2 additions & 1 deletion tests/pipelines/ner/test_tnm.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from edsnlp.pipes.ner.tnm.patterns import tnm_pattern
from edsnlp.utils.examples import parse_example

examples = [
Expand All @@ -22,7 +23,7 @@


def test_scores(blank_nlp):
blank_nlp.add_pipe("eds.tnm")
blank_nlp.add_pipe("eds.tnm", config=dict(pattern=tnm_pattern))

for example in examples:
text, entities = parse_example(example=example)
Expand Down
Loading