From 12c95ab1e0f770bba71e1b3b3cd99fc32825c0c4 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 1 Aug 2025 10:58:09 +0200 Subject: [PATCH 01/18] Add cordis --- src/pyobo/sources/cordis.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/pyobo/sources/cordis.py diff --git a/src/pyobo/sources/cordis.py b/src/pyobo/sources/cordis.py new file mode 100644 index 00000000..f8631bab --- /dev/null +++ b/src/pyobo/sources/cordis.py @@ -0,0 +1,3 @@ +URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip" + +# see euscivoc, which is in skosxl format From 285eb252f071d4f6035bedcfdfd2ab3945390cfa Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Aug 2025 22:34:02 +0200 Subject: [PATCH 02/18] Update cordis.py --- src/pyobo/sources/cordis.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/pyobo/sources/cordis.py b/src/pyobo/sources/cordis.py index f8631bab..3e6b4381 100644 --- a/src/pyobo/sources/cordis.py +++ b/src/pyobo/sources/cordis.py @@ -1,3 +1,38 @@ +"""Converter for CORDIS Projects.""" + +from collections.abc import Iterable + +from pyobo import Obo, Reference, Term +from pyobo.utils.path import ensure_path +from pystow.utils import read_zipfile_csv + +__all__ = [ + "CordisProjectGetter", +] + URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip" +PREFIX = "cordis.project" # see euscivoc, which is in skosxl format + + +class CordisProjectGetter(Obo): + """An ontology representation of cordis projects.""" + + ontology = PREFIX + dynamic_version = True + + def iter_terms(self, force: bool = False) -> Iterable[Term]: + """Iterate over terms in the ontology.""" + return iter_terms() + + +def iter_terms() -> Iterable[Term]: + """Iterate over CPT terms.""" + path = ensure_path("cordis", url=URL) + df = read_zipfile_csv(path, "project.csv", sep='\t') + print(df.head()) + + +if __name__ == "__main__": + CordisProjectGetter.cli() From 24c856c20a0c77f9aa11506c5398c9756a99f007 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 12 Aug 2025 00:50:26 +0200 Subject: [PATCH 03/18] Update cordis.py --- src/pyobo/sources/cordis.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/pyobo/sources/cordis.py b/src/pyobo/sources/cordis.py index 3e6b4381..ce4fdd3f 100644 --- a/src/pyobo/sources/cordis.py +++ b/src/pyobo/sources/cordis.py @@ -2,9 +2,10 @@ from collections.abc import Iterable -from pyobo import Obo, Reference, Term +from pystow.utils import open_zip_reader + +from pyobo import Obo, Term from pyobo.utils.path import ensure_path -from pystow.utils import read_zipfile_csv __all__ = [ "CordisProjectGetter", @@ -13,6 +14,7 @@ URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip" PREFIX = "cordis.project" + # see euscivoc, which is in skosxl format @@ -30,8 +32,18 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: def iter_terms() -> Iterable[Term]: """Iterate over CPT terms.""" path = ensure_path("cordis", url=URL) - df = read_zipfile_csv(path, "project.csv", sep='\t') - print(df.head()) + # df = read_zipfile_csv(path, "project.csv", sep=';', engine="python") + i = 0 + + with open_zip_reader(path, "project.csv", delimiter=";") as reader: + header = next(reader) + for _row in reader: + i += 1 + if i > 10: + break + dict(zip(header, _row, strict=False)) + + yield from [] if __name__ == "__main__": From 28cbe7e4f49ef45be872e060cbe2bca6b8a10fc3 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sun, 17 May 2026 14:15:27 -0400 Subject: [PATCH 04/18] Update CORDIS --- src/pyobo/sources/__init__.py | 3 + src/pyobo/sources/cordis.py | 50 ---------------- src/pyobo/sources/cordis/__init__.py | 9 +++ src/pyobo/sources/cordis/cordis_basis.py | 39 +++++++++++++ src/pyobo/sources/cordis/cordis_project.py | 66 ++++++++++++++++++++++ src/pyobo/sources/cordis/utils.py | 14 +++++ 6 files changed, 131 insertions(+), 50 deletions(-) delete mode 100644 src/pyobo/sources/cordis.py create mode 100644 src/pyobo/sources/cordis/__init__.py create mode 100644 src/pyobo/sources/cordis/cordis_basis.py create mode 100644 src/pyobo/sources/cordis/cordis_project.py create mode 100644 src/pyobo/sources/cordis/utils.py diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py index e3f6875b..60aa105e 100644 --- a/src/pyobo/sources/__init__.py +++ b/src/pyobo/sources/__init__.py @@ -17,6 +17,7 @@ from .clinicaltrials import ClinicalTrialsGetter from .complexportal import ComplexPortalGetter from .conso import CONSOGetter +from .cordis import CordisBasisGetter, CordisProjectGetter from .cpt import CPTGetter from .credit import CreditGetter from .cvx import CVXGetter @@ -94,6 +95,8 @@ "ChEMBLTissueGetter", "ClinicalTrialsGetter", "ComplexPortalGetter", + "CordisBasisGetter", + "CordisProjectGetter", "CreditGetter", "DepMapGetter", "DictybaseGetter", diff --git a/src/pyobo/sources/cordis.py b/src/pyobo/sources/cordis.py deleted file mode 100644 index ce4fdd3f..00000000 --- a/src/pyobo/sources/cordis.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Converter for CORDIS Projects.""" - -from collections.abc import Iterable - -from pystow.utils import open_zip_reader - -from pyobo import Obo, Term -from pyobo.utils.path import ensure_path - -__all__ = [ - "CordisProjectGetter", -] - -URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip" -PREFIX = "cordis.project" - - -# see euscivoc, which is in skosxl format - - -class CordisProjectGetter(Obo): - """An ontology representation of cordis projects.""" - - ontology = PREFIX - dynamic_version = True - - def iter_terms(self, force: bool = False) -> Iterable[Term]: - """Iterate over terms in the ontology.""" - return iter_terms() - - -def iter_terms() -> Iterable[Term]: - """Iterate over CPT terms.""" - path = ensure_path("cordis", url=URL) - # df = read_zipfile_csv(path, "project.csv", sep=';', engine="python") - i = 0 - - with open_zip_reader(path, "project.csv", delimiter=";") as reader: - header = next(reader) - for _row in reader: - i += 1 - if i > 10: - break - dict(zip(header, _row, strict=False)) - - yield from [] - - -if __name__ == "__main__": - CordisProjectGetter.cli() diff --git a/src/pyobo/sources/cordis/__init__.py b/src/pyobo/sources/cordis/__init__.py new file mode 100644 index 00000000..82c34cb9 --- /dev/null +++ b/src/pyobo/sources/cordis/__init__.py @@ -0,0 +1,9 @@ +"""CORDIS sources.""" + +from .cordis_basis import CordisBasisGetter +from .cordis_project import CordisProjectGetter + +__all__ = [ + "CordisBasisGetter", + "CordisProjectGetter", +] diff --git a/src/pyobo/sources/cordis/cordis_basis.py b/src/pyobo/sources/cordis/cordis_basis.py new file mode 100644 index 00000000..570c4f0b --- /dev/null +++ b/src/pyobo/sources/cordis/cordis_basis.py @@ -0,0 +1,39 @@ +"""Converter for CORDIS Projects.""" + +from collections.abc import Iterable + +from pystow.utils import open_zip_reader + +from pyobo import Obo, Term +from pyobo.sources.cordis.utils import get_cordis_path + +__all__ = [ + "CordisBasisGetter", +] + +PREFIX = "cordis.basis" + + +class CordisBasisGetter(Obo): + """An ontology representation of cordis legal bases.""" + + ontology = PREFIX + dynamic_version = True + + def iter_terms(self, force: bool = False) -> Iterable[Term]: + """Iterate over terms in the ontology.""" + return iter_terms() + + +def iter_terms() -> Iterable[Term]: + """Iterate over CORDIS legal basis terms.""" + path = get_cordis_path() + with open_zip_reader(path, "legalBasis.csv", delimiter=";") as reader: + _header = next(reader) + unique = {row[1] for row in reader} + for identifier in sorted(unique): + yield Term.from_triple(PREFIX, identifier) + + +if __name__ == "__main__": + CordisBasisGetter.cli(["--obo"]) diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py new file mode 100644 index 00000000..19840c9f --- /dev/null +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -0,0 +1,66 @@ +"""Converter for CORDIS Projects.""" + +from collections.abc import Iterable + +from curies.vocabulary import acronym +from pystow.utils import open_zip_reader +from tqdm import tqdm + +from pyobo import Obo, Reference, Term, TypeDef, default_reference +from pyobo.sources.cordis.utils import get_cordis_path + +__all__ = [ + "CordisProjectGetter", +] + +PREFIX = "cordis.project" + + +# see euscivoc, which is in skosxl format + +HAS_LEGAL_BASIS = TypeDef(reference=default_reference(PREFIX, "hasLegalBasis")) + + +class CordisProjectGetter(Obo): + """An ontology representation of cordis projects.""" + + ontology = PREFIX + typedefs = [HAS_LEGAL_BASIS] + dynamic_version = True + + def iter_terms(self, force: bool = False) -> Iterable[Term]: + """Iterate over terms in the ontology.""" + return iter_terms() + + +def iter_terms() -> Iterable[Term]: + """Iterate over CPT terms.""" + path = get_cordis_path() + # TODO might need to add additional parts + with open_zip_reader(path, "project.csv", delimiter=";") as reader: + header = next(reader) + for row in reader: + row = dict(zip(header, row, strict=False)) + term = Term( + reference=Reference( + prefix="cordis.project", identifier=row["id"], name=row["title"] + ), + # definition=row['objective'], + ) + term.append_synonym(row["acronym"], type=acronym) + term.append_property( + HAS_LEGAL_BASIS, Reference(prefix="cordis.basis", identifier=row["legalBasis"]) + ) + + doi = row["grantDoi"] + try: + doi_reference = Reference(prefix="doi", identifier=doi) + except ValueError: + tqdm.write(f"[{term.curie}] problem with DOI: {doi}") + else: + term.append_exact_match(doi_reference) + yield term + + +if __name__ == "__main__": + CordisProjectGetter.cli(["--obo"]) diff --git a/src/pyobo/sources/cordis/utils.py b/src/pyobo/sources/cordis/utils.py new file mode 100644 index 00000000..455d647d --- /dev/null +++ b/src/pyobo/sources/cordis/utils.py @@ -0,0 +1,14 @@ +"""Utilities for CORDIS resources.""" + +from pathlib import Path + +from pyobo.utils.path import ensure_path + +__all__ = ["get_cordis_path"] + +URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip" + + +def get_cordis_path(version: str | None = None) -> Path: + """Get the CORDIS data dump.""" + return ensure_path("cordis", url=URL, version=version) From 4abaf24cd13e5180a266fb508644a2b7e4774d6a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 07:07:37 +0200 Subject: [PATCH 05/18] Update --- pyproject.toml | 2 +- src/pyobo/sources/__init__.py | 3 +- src/pyobo/sources/cordis/__init__.py | 2 + src/pyobo/sources/cordis/cordis_basis.py | 26 ++++++------ .../sources/cordis/cordis_organization.py | 40 +++++++++++++++++++ src/pyobo/sources/cordis/cordis_project.py | 16 +++----- src/pyobo/sources/cordis/cordis_topic.py | 1 + src/pyobo/sources/cordis/utils.py | 17 +++++++- 8 files changed, 78 insertions(+), 29 deletions(-) create mode 100644 src/pyobo/sources/cordis/cordis_organization.py create mode 100644 src/pyobo/sources/cordis/cordis_topic.py diff --git a/pyproject.toml b/pyproject.toml index 4c849cb6..0d1eea8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ dependencies = [ "humanize", "tabulate", "cachier", - "pystow>=0.8.11", + "pystow>=0.8.13", "bioversions>=0.10.11", "bioregistry>=0.13.48", "ssslm>=0.2.0", diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py index 47449453..370e9779 100644 --- a/src/pyobo/sources/__init__.py +++ b/src/pyobo/sources/__init__.py @@ -17,7 +17,7 @@ from .clinicaltrials import ClinicalTrialsGetter from .complexportal import ComplexPortalGetter from .conso import CONSOGetter -from .cordis import CordisBasisGetter, CordisProjectGetter +from .cordis import CordisBasisGetter, CordisOrganizationGetter, CordisProjectGetter from .cpt import CPTGetter from .credit import CreditGetter from .cvx import CVXGetter @@ -98,6 +98,7 @@ "ClinicalTrialsGetter", "ComplexPortalGetter", "CordisBasisGetter", + "CordisOrganizationGetter", "CordisProjectGetter", "CreditGetter", "DepMapGetter", diff --git a/src/pyobo/sources/cordis/__init__.py b/src/pyobo/sources/cordis/__init__.py index 82c34cb9..8bcc08a5 100644 --- a/src/pyobo/sources/cordis/__init__.py +++ b/src/pyobo/sources/cordis/__init__.py @@ -1,9 +1,11 @@ """CORDIS sources.""" from .cordis_basis import CordisBasisGetter +from .cordis_organization import CordisOrganizationGetter from .cordis_project import CordisProjectGetter __all__ = [ "CordisBasisGetter", + "CordisOrganizationGetter", "CordisProjectGetter", ] diff --git a/src/pyobo/sources/cordis/cordis_basis.py b/src/pyobo/sources/cordis/cordis_basis.py index 570c4f0b..673d8820 100644 --- a/src/pyobo/sources/cordis/cordis_basis.py +++ b/src/pyobo/sources/cordis/cordis_basis.py @@ -1,11 +1,9 @@ -"""Converter for CORDIS Projects.""" +"""Converter for CORDIS legal bases.""" from collections.abc import Iterable -from pystow.utils import open_zip_reader - -from pyobo import Obo, Term -from pyobo.sources.cordis.utils import get_cordis_path +from pyobo.sources.cordis.utils import open_cordis +from pyobo.struct import Obo, Term __all__ = [ "CordisBasisGetter", @@ -15,7 +13,7 @@ class CordisBasisGetter(Obo): - """An ontology representation of cordis legal bases.""" + """An ontology representation of CORDIS legal bases.""" ontology = PREFIX dynamic_version = True @@ -25,15 +23,15 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: return iter_terms() -def iter_terms() -> Iterable[Term]: +def iter_terms(version: str | None = None) -> Iterable[Term]: """Iterate over CORDIS legal basis terms.""" - path = get_cordis_path() - with open_zip_reader(path, "legalBasis.csv", delimiter=";") as reader: - _header = next(reader) - unique = {row[1] for row in reader} - for identifier in sorted(unique): - yield Term.from_triple(PREFIX, identifier) + with open_cordis("project.csv", version=version) as reader: + unique = {row["legalBasis"]: row["title"] for row in reader} + for identifier, name in sorted(unique): + yield Term.from_triple(PREFIX, identifier, name) + + # TODO implement some kind of hierarchy? if __name__ == "__main__": - CordisBasisGetter.cli(["--obo"]) + CordisBasisGetter.cli() diff --git a/src/pyobo/sources/cordis/cordis_organization.py b/src/pyobo/sources/cordis/cordis_organization.py new file mode 100644 index 00000000..06478a2c --- /dev/null +++ b/src/pyobo/sources/cordis/cordis_organization.py @@ -0,0 +1,40 @@ +"""Converter for CORDIS organizations.""" + +from collections.abc import Iterable + +from pyobo import Obo, Reference, Term +from pyobo.sources.cordis.utils import open_cordis + +__all__ = [ + "CordisOrganizationGetter", +] + +PREFIX = "cordis.organization" + + +class CordisOrganizationGetter(Obo): + """An ontology representation of CORDIS organizations.""" + + ontology = PREFIX + dynamic_version = True + + def iter_terms(self, force: bool = False) -> Iterable[Term]: + """Iterate over terms in the ontology.""" + return iter_terms() + + +def iter_terms(version: str | None = None) -> Iterable[Term]: + """Iterate over CPT terms.""" + # TODO might need to add additional parts + with open_cordis("organization.csv", version=version) as reader: + for row in reader: + term = Term( + reference=Reference( + prefix="cordis.project", identifier=row["id"], name=row["title"] + ), + ) + yield term + + +if __name__ == "__main__": + CordisOrganizationGetter.cli(["--obo"]) diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index 19840c9f..e3168f5d 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -1,13 +1,12 @@ -"""Converter for CORDIS Projects.""" +"""Converter for CORDIS projects.""" from collections.abc import Iterable from curies.vocabulary import acronym -from pystow.utils import open_zip_reader from tqdm import tqdm from pyobo import Obo, Reference, Term, TypeDef, default_reference -from pyobo.sources.cordis.utils import get_cordis_path +from pyobo.sources.cordis.utils import open_cordis __all__ = [ "CordisProjectGetter", @@ -15,7 +14,6 @@ PREFIX = "cordis.project" - # see euscivoc, which is in skosxl format HAS_LEGAL_BASIS = TypeDef(reference=default_reference(PREFIX, "hasLegalBasis")) @@ -33,14 +31,10 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: return iter_terms() -def iter_terms() -> Iterable[Term]: - """Iterate over CPT terms.""" - path = get_cordis_path() - # TODO might need to add additional parts - with open_zip_reader(path, "project.csv", delimiter=";") as reader: - header = next(reader) +def iter_terms(version: str | None = None) -> Iterable[Term]: + """Iterate over CORDIS project terms.""" + with open_cordis("project.csv", version=version) as reader: for row in reader: - row = dict(zip(header, row, strict=False)) term = Term( reference=Reference( prefix="cordis.project", identifier=row["id"], name=row["title"] diff --git a/src/pyobo/sources/cordis/cordis_topic.py b/src/pyobo/sources/cordis/cordis_topic.py new file mode 100644 index 00000000..6db2a99f --- /dev/null +++ b/src/pyobo/sources/cordis/cordis_topic.py @@ -0,0 +1 @@ +"""Converter for CORDIS topics.""" diff --git a/src/pyobo/sources/cordis/utils.py b/src/pyobo/sources/cordis/utils.py index 455d647d..3339c658 100644 --- a/src/pyobo/sources/cordis/utils.py +++ b/src/pyobo/sources/cordis/utils.py @@ -1,14 +1,27 @@ """Utilities for CORDIS resources.""" +import csv +from collections.abc import Generator from pathlib import Path +from pystow.utils import open_zip_dict_reader + from pyobo.utils.path import ensure_path -__all__ = ["get_cordis_path"] +__all__ = ["get_cordis_path", "open_cordis"] URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip" -def get_cordis_path(version: str | None = None) -> Path: +def get_cordis_path(*, version: str | None = None) -> Path: """Get the CORDIS data dump.""" return ensure_path("cordis", url=URL, version=version) + + +def open_cordis( + inner_path: str, *, version: str | None = None +) -> Generator[csv.DictReader[str], None, None]: + """Open a CORDIS CSV.""" + path = get_cordis_path(version=version) + with open_zip_dict_reader(path, inner_path, delimiter=";") as reader: + yield reader From 3940d985b943b66f934b304519ecbf52309a7764 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 07:14:52 +0200 Subject: [PATCH 06/18] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0d1eea8b..077ae207 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ dependencies = [ "cachier", "pystow>=0.8.13", "bioversions>=0.10.11", - "bioregistry>=0.13.48", + "bioregistry>=0.13.53", "ssslm>=0.2.0", "zenodo-client>=0.4.1", "class-resolver>=0.7.1", From 8b85c811abdc7635fb8c79bbd3cc328a8d314bd1 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 07:48:46 +0200 Subject: [PATCH 07/18] Update --- src/pyobo/sources/cordis/cordis_basis.py | 8 ++--- .../sources/cordis/cordis_organization.py | 32 +++++++++++++------ src/pyobo/sources/cordis/cordis_project.py | 29 +++++++++++++---- src/pyobo/sources/cordis/utils.py | 16 +++++++++- 4 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/pyobo/sources/cordis/cordis_basis.py b/src/pyobo/sources/cordis/cordis_basis.py index 673d8820..eabcd46f 100644 --- a/src/pyobo/sources/cordis/cordis_basis.py +++ b/src/pyobo/sources/cordis/cordis_basis.py @@ -2,20 +2,18 @@ from collections.abc import Iterable -from pyobo.sources.cordis.utils import open_cordis +from pyobo.sources.cordis.utils import BASIS_PREFIX, open_cordis from pyobo.struct import Obo, Term __all__ = [ "CordisBasisGetter", ] -PREFIX = "cordis.basis" - class CordisBasisGetter(Obo): """An ontology representation of CORDIS legal bases.""" - ontology = PREFIX + ontology = BASIS_PREFIX dynamic_version = True def iter_terms(self, force: bool = False) -> Iterable[Term]: @@ -28,7 +26,7 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: with open_cordis("project.csv", version=version) as reader: unique = {row["legalBasis"]: row["title"] for row in reader} for identifier, name in sorted(unique): - yield Term.from_triple(PREFIX, identifier, name) + yield Term.from_triple(BASIS_PREFIX, identifier, name) # TODO implement some kind of hierarchy? diff --git a/src/pyobo/sources/cordis/cordis_organization.py b/src/pyobo/sources/cordis/cordis_organization.py index 06478a2c..42afe2d3 100644 --- a/src/pyobo/sources/cordis/cordis_organization.py +++ b/src/pyobo/sources/cordis/cordis_organization.py @@ -2,20 +2,22 @@ from collections.abc import Iterable -from pyobo import Obo, Reference, Term -from pyobo.sources.cordis.utils import open_cordis +from curies import vocabulary as v + +from pyobo import Annotation, Obo, Reference, Term +from pyobo.sources.cordis.utils import ORGANIZATION_PREFIX, open_cordis +from pyobo.struct.typedef import has_homepage __all__ = [ "CordisOrganizationGetter", ] -PREFIX = "cordis.organization" - class CordisOrganizationGetter(Obo): """An ontology representation of CORDIS organizations.""" - ontology = PREFIX + ontology = ORGANIZATION_PREFIX + typedefs = [has_homepage] dynamic_version = True def iter_terms(self, force: bool = False) -> Iterable[Term]: @@ -24,15 +26,27 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: def iter_terms(version: str | None = None) -> Iterable[Term]: - """Iterate over CPT terms.""" - # TODO might need to add additional parts + """Iterate over CORDIS organization terms.""" with open_cordis("organization.csv", version=version) as reader: + seen = set() for row in reader: + identifier = row["organisationID"] + if identifier in seen: + continue + seen.add(identifier) term = Term( reference=Reference( - prefix="cordis.project", identifier=row["id"], name=row["title"] - ), + prefix=ORGANIZATION_PREFIX, identifier=identifier, name=row["name"] + ) ) + if short_name := row["shortName"]: + term.append_synonym(short_name, type=v.abbreviation) + if url := row["organizationURL"]: + term.append_property(Annotation.uri(has_homepage, url)) + if vat := row["vatNumber"]: + term.append_exact_match(Reference(prefix="vat", identifier=vat)) + term.append_exact_match(Reference(prefix="eu.rcn", identifier=row["rcn"])) + # TODO city, country, nutsCode yield term diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index e3168f5d..1c5c2d36 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -6,23 +6,27 @@ from tqdm import tqdm from pyobo import Obo, Reference, Term, TypeDef, default_reference -from pyobo.sources.cordis.utils import open_cordis +from pyobo.sources.cordis.utils import ( + BASIS_PREFIX, + ORGANIZATION_PREFIX, + PROJECT_PREFIX, + open_cordis, +) +from pyobo.struct.typedef import has_participant __all__ = [ "CordisProjectGetter", ] -PREFIX = "cordis.project" - # see euscivoc, which is in skosxl format -HAS_LEGAL_BASIS = TypeDef(reference=default_reference(PREFIX, "hasLegalBasis")) +HAS_LEGAL_BASIS = TypeDef(reference=default_reference(PROJECT_PREFIX, "hasLegalBasis")) class CordisProjectGetter(Obo): """An ontology representation of cordis projects.""" - ontology = PREFIX + ontology = PROJECT_PREFIX typedefs = [HAS_LEGAL_BASIS] dynamic_version = True @@ -33,6 +37,7 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: def iter_terms(version: str | None = None) -> Iterable[Term]: """Iterate over CORDIS project terms.""" + terms: dict[str, Term] = {} with open_cordis("project.csv", version=version) as reader: for row in reader: term = Term( @@ -43,7 +48,7 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: ) term.append_synonym(row["acronym"], type=acronym) term.append_property( - HAS_LEGAL_BASIS, Reference(prefix="cordis.basis", identifier=row["legalBasis"]) + HAS_LEGAL_BASIS, Reference(prefix=BASIS_PREFIX, identifier=row["legalBasis"]) ) doi = row["grantDoi"] @@ -53,7 +58,17 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: tqdm.write(f"[{term.curie}] problem with DOI: {doi}") else: term.append_exact_match(doi_reference) - yield term + terms[term.identifier] = term + + with open_cordis("organizations.csv", version=version) as reader: + for row in reader: + project_id = row["projectID"] + organization_id = row["organizationID"] + terms[project_id].append_relationship( + has_participant, + Reference(prefix=ORGANIZATION_PREFIX, identifier=organization_id), + # TODO can add all sorts of annotations from this file, like the cost, role, ordinal + ) if __name__ == "__main__": diff --git a/src/pyobo/sources/cordis/utils.py b/src/pyobo/sources/cordis/utils.py index 3339c658..7294527e 100644 --- a/src/pyobo/sources/cordis/utils.py +++ b/src/pyobo/sources/cordis/utils.py @@ -2,22 +2,36 @@ import csv from collections.abc import Generator +from contextlib import contextmanager from pathlib import Path from pystow.utils import open_zip_dict_reader from pyobo.utils.path import ensure_path -__all__ = ["get_cordis_path", "open_cordis"] +__all__ = [ + "BASIS_PREFIX", + "ORGANIZATION_PREFIX", + "PROJECT_PREFIX", + "URL", + "get_cordis_path", + "open_cordis", +] +#: A URL for the latest CORDIS data dump URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip" +PROJECT_PREFIX = "cordis.project" +ORGANIZATION_PREFIX = "cordis.organization" +BASIS_PREFIX = "cordis.basis" + def get_cordis_path(*, version: str | None = None) -> Path: """Get the CORDIS data dump.""" return ensure_path("cordis", url=URL, version=version) +@contextmanager def open_cordis( inner_path: str, *, version: str | None = None ) -> Generator[csv.DictReader[str], None, None]: From aa4a941ace8929103fc5e2ec310eccf96c619162 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 07:54:12 +0200 Subject: [PATCH 08/18] Fix up --- src/pyobo/sources/cordis/cordis_basis.py | 2 +- src/pyobo/sources/cordis/cordis_organization.py | 8 +++++--- src/pyobo/sources/cordis/cordis_project.py | 11 +++++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/pyobo/sources/cordis/cordis_basis.py b/src/pyobo/sources/cordis/cordis_basis.py index eabcd46f..c5c8fc8a 100644 --- a/src/pyobo/sources/cordis/cordis_basis.py +++ b/src/pyobo/sources/cordis/cordis_basis.py @@ -25,7 +25,7 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: """Iterate over CORDIS legal basis terms.""" with open_cordis("project.csv", version=version) as reader: unique = {row["legalBasis"]: row["title"] for row in reader} - for identifier, name in sorted(unique): + for identifier, name in sorted(unique.items()): yield Term.from_triple(BASIS_PREFIX, identifier, name) # TODO implement some kind of hierarchy? diff --git a/src/pyobo/sources/cordis/cordis_organization.py b/src/pyobo/sources/cordis/cordis_organization.py index 42afe2d3..9aed07c7 100644 --- a/src/pyobo/sources/cordis/cordis_organization.py +++ b/src/pyobo/sources/cordis/cordis_organization.py @@ -4,7 +4,7 @@ from curies import vocabulary as v -from pyobo import Annotation, Obo, Reference, Term +from pyobo import Obo, Reference, Term from pyobo.sources.cordis.utils import ORGANIZATION_PREFIX, open_cordis from pyobo.struct.typedef import has_homepage @@ -12,6 +12,8 @@ "CordisOrganizationGetter", ] +ABBREVIATION = Reference.from_reference(v.abbreviation) + class CordisOrganizationGetter(Obo): """An ontology representation of CORDIS organizations.""" @@ -40,9 +42,9 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: ) ) if short_name := row["shortName"]: - term.append_synonym(short_name, type=v.abbreviation) + term.append_synonym(short_name, type=ABBREVIATION) if url := row["organizationURL"]: - term.append_property(Annotation.uri(has_homepage, url)) + term.annotate_uri(has_homepage, url) if vat := row["vatNumber"]: term.append_exact_match(Reference(prefix="vat", identifier=vat)) term.append_exact_match(Reference(prefix="eu.rcn", identifier=row["rcn"])) diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index 1c5c2d36..e22bcbfc 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -2,7 +2,7 @@ from collections.abc import Iterable -from curies.vocabulary import acronym +from curies import vocabulary as v from tqdm import tqdm from pyobo import Obo, Reference, Term, TypeDef, default_reference @@ -21,6 +21,7 @@ # see euscivoc, which is in skosxl format HAS_LEGAL_BASIS = TypeDef(reference=default_reference(PROJECT_PREFIX, "hasLegalBasis")) +ACRONYM = Reference.from_reference(v.acronym) class CordisProjectGetter(Obo): @@ -35,7 +36,7 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: return iter_terms() -def iter_terms(version: str | None = None) -> Iterable[Term]: +def iter_terms(*, version: str | None = None) -> Iterable[Term]: """Iterate over CORDIS project terms.""" terms: dict[str, Term] = {} with open_cordis("project.csv", version=version) as reader: @@ -46,8 +47,8 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: ), # definition=row['objective'], ) - term.append_synonym(row["acronym"], type=acronym) - term.append_property( + term.append_synonym(row["acronym"], type=ACRONYM) + term.annotate_object( HAS_LEGAL_BASIS, Reference(prefix=BASIS_PREFIX, identifier=row["legalBasis"]) ) @@ -70,6 +71,8 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: # TODO can add all sorts of annotations from this file, like the cost, role, ordinal ) + yield from terms.values() + if __name__ == "__main__": CordisProjectGetter.cli(["--obo"]) From 2669049ecfe20b3953d7602dd166d47c9c19a8ea Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 07:55:10 +0200 Subject: [PATCH 09/18] Cleanup --- src/pyobo/sources/cordis/cordis_organization.py | 2 +- src/pyobo/sources/cordis/cordis_project.py | 2 +- src/pyobo/sources/cordis/cordis_topic.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) delete mode 100644 src/pyobo/sources/cordis/cordis_topic.py diff --git a/src/pyobo/sources/cordis/cordis_organization.py b/src/pyobo/sources/cordis/cordis_organization.py index 9aed07c7..41125a03 100644 --- a/src/pyobo/sources/cordis/cordis_organization.py +++ b/src/pyobo/sources/cordis/cordis_organization.py @@ -53,4 +53,4 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: if __name__ == "__main__": - CordisOrganizationGetter.cli(["--obo"]) + CordisOrganizationGetter.cli() diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index e22bcbfc..3e7391b2 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -75,4 +75,4 @@ def iter_terms(*, version: str | None = None) -> Iterable[Term]: if __name__ == "__main__": - CordisProjectGetter.cli(["--obo"]) + CordisProjectGetter.cli() diff --git a/src/pyobo/sources/cordis/cordis_topic.py b/src/pyobo/sources/cordis/cordis_topic.py deleted file mode 100644 index 6db2a99f..00000000 --- a/src/pyobo/sources/cordis/cordis_topic.py +++ /dev/null @@ -1 +0,0 @@ -"""Converter for CORDIS topics.""" From 6de93b21abae2cb843edeb585897e1c6fc4055de Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 11:19:03 +0200 Subject: [PATCH 10/18] Update cordis_project.py --- src/pyobo/sources/cordis/cordis_project.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index 3e7391b2..ad0d6030 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -61,11 +61,11 @@ def iter_terms(*, version: str | None = None) -> Iterable[Term]: term.append_exact_match(doi_reference) terms[term.identifier] = term - with open_cordis("organizations.csv", version=version) as reader: + with open_cordis("organization.csv", version=version) as reader: for row in reader: project_id = row["projectID"] - organization_id = row["organizationID"] - terms[project_id].append_relationship( + organization_id = row["organisationID"] + terms[project_id].annotate_object( has_participant, Reference(prefix=ORGANIZATION_PREFIX, identifier=organization_id), # TODO can add all sorts of annotations from this file, like the cost, role, ordinal From 5326fb9f9fb6301d71d2c8bc56855cd51b81278e Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 13:43:44 +0200 Subject: [PATCH 11/18] Update --- src/pyobo/sources/__init__.py | 8 +++++- src/pyobo/sources/cordis/__init__.py | 2 ++ src/pyobo/sources/cordis/cordis_topic.py | 33 ++++++++++++++++++++++++ src/pyobo/sources/cordis/utils.py | 11 +++++++- src/pyobo/struct/struct_utils.py | 2 +- 5 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 src/pyobo/sources/cordis/cordis_topic.py diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py index 370e9779..d61f344d 100644 --- a/src/pyobo/sources/__init__.py +++ b/src/pyobo/sources/__init__.py @@ -17,7 +17,12 @@ from .clinicaltrials import ClinicalTrialsGetter from .complexportal import ComplexPortalGetter from .conso import CONSOGetter -from .cordis import CordisBasisGetter, CordisOrganizationGetter, CordisProjectGetter +from .cordis import ( + CordisBasisGetter, + CordisOrganizationGetter, + CordisProjectGetter, + CordisTopicGetter, +) from .cpt import CPTGetter from .credit import CreditGetter from .cvx import CVXGetter @@ -100,6 +105,7 @@ "CordisBasisGetter", "CordisOrganizationGetter", "CordisProjectGetter", + "CordisTopicGetter", "CreditGetter", "DepMapGetter", "DictybaseGetter", diff --git a/src/pyobo/sources/cordis/__init__.py b/src/pyobo/sources/cordis/__init__.py index 8bcc08a5..95a442df 100644 --- a/src/pyobo/sources/cordis/__init__.py +++ b/src/pyobo/sources/cordis/__init__.py @@ -3,9 +3,11 @@ from .cordis_basis import CordisBasisGetter from .cordis_organization import CordisOrganizationGetter from .cordis_project import CordisProjectGetter +from .cordis_topic import CordisTopicGetter __all__ = [ "CordisBasisGetter", "CordisOrganizationGetter", "CordisProjectGetter", + "CordisTopicGetter", ] diff --git a/src/pyobo/sources/cordis/cordis_topic.py b/src/pyobo/sources/cordis/cordis_topic.py new file mode 100644 index 00000000..2864e956 --- /dev/null +++ b/src/pyobo/sources/cordis/cordis_topic.py @@ -0,0 +1,33 @@ +"""Converter for CORDIS topics.""" + +from collections.abc import Iterable + +from pyobo.sources.cordis.utils import TOPIC_PREFIX, clean_topic_id, open_cordis +from pyobo.struct import Obo, Term + +__all__ = [ + "CordisTopicGetter", +] + + +class CordisTopicGetter(Obo): + """An ontology representation of CORDIS topics.""" + + ontology = TOPIC_PREFIX + dynamic_version = True + + def iter_terms(self, force: bool = False) -> Iterable[Term]: + """Iterate over terms in the ontology.""" + return iter_terms() + + +def iter_terms(version: str | None = None) -> Iterable[Term]: + """Iterate over CORDIS topic terms.""" + with open_cordis("topics.csv", version=version) as reader: + unique = {row["topic"]: row["title"] for row in reader} + for identifier, name in sorted(unique.items()): + yield Term.from_triple(TOPIC_PREFIX, clean_topic_id(identifier), name) + + +if __name__ == "__main__": + CordisTopicGetter.cli(["--obo"]) diff --git a/src/pyobo/sources/cordis/utils.py b/src/pyobo/sources/cordis/utils.py index 7294527e..f1864f80 100644 --- a/src/pyobo/sources/cordis/utils.py +++ b/src/pyobo/sources/cordis/utils.py @@ -13,7 +13,9 @@ "BASIS_PREFIX", "ORGANIZATION_PREFIX", "PROJECT_PREFIX", + "TOPIC_PREFIX", "URL", + "clean_topic_id", "get_cordis_path", "open_cordis", ] @@ -24,6 +26,7 @@ PROJECT_PREFIX = "cordis.project" ORGANIZATION_PREFIX = "cordis.organization" BASIS_PREFIX = "cordis.basis" +TOPIC_PREFIX = "cordis.topic" def get_cordis_path(*, version: str | None = None) -> Path: @@ -37,5 +40,11 @@ def open_cordis( ) -> Generator[csv.DictReader[str], None, None]: """Open a CORDIS CSV.""" path = get_cordis_path(version=version) - with open_zip_dict_reader(path, inner_path, delimiter=";") as reader: + with open_zip_dict_reader(path, inner_path, delimiter=";", quoting=csv.QUOTE_MINIMAL) as reader: yield reader + + +def clean_topic_id(topic_id: str) -> str: + """Fix CORDIS topic IDs that might have spaces in them.""" + # identifier cleanup needed for `RISK FINANCE` and `SCIENCE WAF SOCIETY` + return topic_id.replace(" ", "%20") diff --git a/src/pyobo/struct/struct_utils.py b/src/pyobo/struct/struct_utils.py index b05463db..47ecb6be 100644 --- a/src/pyobo/struct/struct_utils.py +++ b/src/pyobo/struct/struct_utils.py @@ -976,7 +976,7 @@ def _iterate_obo_relations( end = reference_escape(value, ontology_prefix=ontology_prefix) name = value.name case _: - raise TypeError(f"got unexpected value: {values}") + raise TypeError(f"got unexpected type {type(values)} with value: {values}") end += _get_obo_trailing_modifiers( predicate, value, annotations, ontology_prefix=ontology_prefix ) From 0b1fae54f32ff34d7c55db15e7a1a925deae841e Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 14:08:29 +0200 Subject: [PATCH 12/18] Add test for quotes in string literals --- .../test_struct/test_obo/test_struct_term.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_struct/test_obo/test_struct_term.py b/tests/test_struct/test_obo/test_struct_term.py index b8ed9a58..90ce971a 100644 --- a/tests/test_struct/test_obo/test_struct_term.py +++ b/tests/test_struct/test_obo/test_struct_term.py @@ -711,6 +711,26 @@ def test_12_property_string_with_language(self) -> None: """, ) + def test_12_property_string_with_quote(self) -> None: + """Test emitting a string property literal with a quote in it.""" + term = Term(reference=LYSINE_DEHYDROGENASE_ACT) + term.annotate_string(RO_DUMMY, "\"value\" added") + self.assert_obo_stanza( + term, + obo="""\ + [Term] + id: GO:0050069 + name: lysine dehydrogenase activity + property_value: RO:1234567 "\\"value\\" added" xsd:string + """, + typedefs={RO_DUMMY.pair: RO_DUMMY}, + ofn="""\ + Declaration(Class(GO:0050069)) + AnnotationAssertion(rdfs:label GO:0050069 "lysine dehydrogenase activity") + AnnotationAssertion(RO:1234567 GO:0050069 "\\"value\\" added"@en) + """, + ) + def test_12_property_integer(self) -> None: """Test emitting property literals that were annotated as a boolean.""" term = Term(reference=LYSINE_DEHYDROGENASE_ACT) From 57eb34842bf4e876d8eb57ded90f559b1aaa122b Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 14:08:32 +0200 Subject: [PATCH 13/18] Update struct_utils.py --- src/pyobo/struct/struct_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pyobo/struct/struct_utils.py b/src/pyobo/struct/struct_utils.py index 47ecb6be..29507562 100644 --- a/src/pyobo/struct/struct_utils.py +++ b/src/pyobo/struct/struct_utils.py @@ -501,6 +501,16 @@ def annotate_datetime( """Append a datetime annotation.""" return self.annotate_literal(prop, OBOLiteral.datetime(value), annotations=annotations) + def annotate_date( + self, + prop: ReferenceHint, + value: datetime.datetime | datetime.date | str, + *, + annotations: Iterable[Annotation] | None = None, + ) -> Self: + """Append a date annotation.""" + return self.annotate_literal(prop, OBOLiteral.date(value), annotations=annotations) + def _iterate_obo_properties( self, *, From 1e03a97608d69432a51525e5ab20daa1bfafc9ea Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 14:08:38 +0200 Subject: [PATCH 14/18] Update reference.py --- src/pyobo/struct/reference.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/src/pyobo/struct/reference.py b/src/pyobo/struct/reference.py index 10802986..599384ad 100644 --- a/src/pyobo/struct/reference.py +++ b/src/pyobo/struct/reference.py @@ -6,6 +6,8 @@ import logging from collections import Counter from collections.abc import Iterable, Sequence +from datetime import date as date_cls +from datetime import datetime as datetime_cls from typing import Any, NamedTuple import bioregistry @@ -14,6 +16,7 @@ import pytz from bioregistry import NormalizedNamableReference as Reference from curies import ReferenceTuple +from curies import vocabulary as v from curies.preprocessing import BlocklistError from ..identifier_utils import ( @@ -304,9 +307,7 @@ def _parse_reference_or_uri_literal( return None -unspecified_matching = Reference( - prefix="semapv", identifier="UnspecifiedMatching", name="unspecified matching process" -) +unspecified_matching = Reference.from_reference(v.unspecified_matching_process) class OBOLiteral(NamedTuple): @@ -319,44 +320,53 @@ class OBOLiteral(NamedTuple): @classmethod def string(cls, value: str, *, language: str | None = None) -> OBOLiteral: """Get a string literal.""" - return cls(value, curies.Reference(prefix="xsd", identifier="string"), language) + return cls(value, v.xsd_string, language) @classmethod def boolean(cls, value: bool) -> OBOLiteral: """Get a boolean literal.""" - return cls(str(value).lower(), curies.Reference(prefix="xsd", identifier="boolean"), None) + return cls(str(value).lower(), v.xsd_boolean, None) @classmethod def decimal(cls, value: float) -> OBOLiteral: """Get a decimal literal.""" - return cls(str(value), curies.Reference(prefix="xsd", identifier="decimal"), None) + return cls(str(value), v.xsd_decimal, None) @classmethod def float(cls, value: float) -> OBOLiteral: """Get a float literal.""" - return cls(str(value), curies.Reference(prefix="xsd", identifier="float"), None) + return cls(str(value), v.xsd_float, None) @classmethod def integer(cls, value: int | str) -> OBOLiteral: """Get a integer literal.""" - return cls(str(int(value)), curies.Reference(prefix="xsd", identifier="integer"), None) + return cls(str(int(value)), v.xsd_integer, None) @classmethod def year(cls, value: int | str) -> OBOLiteral: """Get a year (gYear) literal.""" - return cls(str(int(value)), curies.Reference(prefix="xsd", identifier="gYear"), None) + return cls(str(int(value)), v.xsd_year, None) @classmethod def uri(cls, uri: str) -> OBOLiteral: """Get a string literal for a URI.""" - return cls(uri, curies.Reference(prefix="xsd", identifier="anyURI"), None) + return cls(uri, v.xsd_uri, None) + + @classmethod + def datetime(cls, dt: datetime_cls | str) -> OBOLiteral: + """Get a datetime literal.""" + if isinstance(dt, str): + dt = _parse_datetime(dt) + return cls(dt.isoformat(), v.xsd_datetime, None) @classmethod - def datetime(cls, dt: datetime.datetime | str) -> OBOLiteral: + def date(cls, dt: datetime_cls | date_cls | str) -> OBOLiteral: """Get a datetime literal.""" if isinstance(dt, str): dt = _parse_datetime(dt) - return cls(dt.isoformat(), curies.Reference(prefix="xsd", identifier="dateTime"), None) + if isinstance(dt, datetime.datetime): + dt = dt.date() + return cls(dt.isoformat(), v.xsd_date, None) def _parse_datetime(dd: str) -> datetime.datetime: From 920958d79120c1738a67b234da14752c5977b76e Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 14:08:39 +0200 Subject: [PATCH 15/18] Update cordis_project.py --- src/pyobo/sources/cordis/cordis_project.py | 97 +++++++++++++++++++++- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index ad0d6030..0694228c 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -1,8 +1,10 @@ """Converter for CORDIS projects.""" +from collections import Counter from collections.abc import Iterable from curies import vocabulary as v +from tabulate import tabulate from tqdm import tqdm from pyobo import Obo, Reference, Term, TypeDef, default_reference @@ -10,6 +12,8 @@ BASIS_PREFIX, ORGANIZATION_PREFIX, PROJECT_PREFIX, + TOPIC_PREFIX, + clean_topic_id, open_cordis, ) from pyobo.struct.typedef import has_participant @@ -20,7 +24,45 @@ # see euscivoc, which is in skosxl format -HAS_LEGAL_BASIS = TypeDef(reference=default_reference(PROJECT_PREFIX, "hasLegalBasis")) +PROJECT = Term.from_triple("foaf", "Project", "project") +STATUS = Term(reference=default_reference(PROJECT_PREFIX, "status")) +KEY_TO_STATUS = { + "CLOSED": Term(reference=default_reference(PROJECT_PREFIX, "closed")).append_parent(STATUS), + "SIGNED": Term(reference=default_reference(PROJECT_PREFIX, "signed")).append_parent(STATUS), + "TERMINATED": Term(reference=default_reference(PROJECT_PREFIX, "terminated")).append_parent( + STATUS + ), +} + +HAS_LEGAL_BASIS = TypeDef( + reference=default_reference(PROJECT_PREFIX, "hasLegalBasis"), domain=PROJECT.reference +) +HAS_TOPIC = TypeDef( + reference=default_reference(PROJECT_PREFIX, "hasTopic"), domain=PROJECT.reference +) +HAS_FUNDING_SCHEME = TypeDef( + reference=default_reference(PROJECT_PREFIX, "hasFundingScheme"), + domain=PROJECT.reference, + range=Reference.from_reference(v.xsd_string), +) +HAS_KEYWORD = TypeDef( # TODO replace with SDO + reference=default_reference(PROJECT_PREFIX, "hasKeyword"), + range=Reference.from_reference(v.xsd_string), + domain=PROJECT.reference, +) +HAS_STATUS = TypeDef( + reference=default_reference(PROJECT_PREFIX, "hasStatus"), domain=PROJECT.reference +) +HAS_START = TypeDef( + reference=default_reference(PROJECT_PREFIX, "hasStart"), + domain=PROJECT.reference, + range=Reference.from_reference(v.xsd_date), +) +HAS_END = TypeDef( + reference=default_reference(PROJECT_PREFIX, "hasEnd"), + domain=PROJECT.reference, + range=Reference.from_reference(v.xsd_date), +) ACRONYM = Reference.from_reference(v.acronym) @@ -28,8 +70,17 @@ class CordisProjectGetter(Obo): """An ontology representation of cordis projects.""" ontology = PROJECT_PREFIX - typedefs = [HAS_LEGAL_BASIS] + typedefs = [ + HAS_LEGAL_BASIS, + HAS_TOPIC, + HAS_FUNDING_SCHEME, + HAS_KEYWORD, + HAS_STATUS, + HAS_START, + HAS_END, + ] dynamic_version = True + root_terms = [STATUS.reference, PROJECT.reference] def iter_terms(self, force: bool = False) -> Iterable[Term]: """Iterate over terms in the ontology.""" @@ -39,6 +90,7 @@ def iter_terms(self, force: bool = False) -> Iterable[Term]: def iter_terms(*, version: str | None = None) -> Iterable[Term]: """Iterate over CORDIS project terms.""" terms: dict[str, Term] = {} + scheme_counter: Counter[str] = Counter() with open_cordis("project.csv", version=version) as reader: for row in reader: term = Term( @@ -46,7 +98,7 @@ def iter_terms(*, version: str | None = None) -> Iterable[Term]: prefix="cordis.project", identifier=row["id"], name=row["title"] ), # definition=row['objective'], - ) + ).append_parent(PROJECT) term.append_synonym(row["acronym"], type=ACRONYM) term.annotate_object( HAS_LEGAL_BASIS, Reference(prefix=BASIS_PREFIX, identifier=row["legalBasis"]) @@ -57,22 +109,59 @@ def iter_terms(*, version: str | None = None) -> Iterable[Term]: doi_reference = Reference(prefix="doi", identifier=doi) except ValueError: tqdm.write(f"[{term.curie}] problem with DOI: {doi}") + continue else: term.append_exact_match(doi_reference) + + try: + rcn_id = Reference(prefix="eu.rcn", identifier=row["rcn"]) + except ValueError: + pass # this is probably the same offset issue as above + # tqdm.write(f"[{term.curie}] problem with RCN: {doi}") + else: + term.append_exact_match(rcn_id) + + for topic in row["topics"].split(","): + term.annotate_object( + HAS_TOPIC, Reference(prefix=TOPIC_PREFIX, identifier=clean_topic_id(topic)) + ) + + for keyword in row["keywords"].split(","): + if keyword_stripped := keyword.strip().strip('"').strip(): + term.annotate_string(HAS_KEYWORD, keyword_stripped) + + if funding_scheme := row["fundingScheme"]: + scheme_counter[row["fundingScheme"]] += 1 + term.annotate_string(HAS_FUNDING_SCHEME, funding_scheme) + + if start_date := row["startDate"]: + term.annotate_date(HAS_START, start_date) + if end_date := row["endDate"]: + term.annotate_date(HAS_END, end_date) + + term.annotate_object(HAS_STATUS, KEY_TO_STATUS[row["status"]]) + terms[term.identifier] = term + tqdm.write(tabulate(scheme_counter.most_common())) + with open_cordis("organization.csv", version=version) as reader: for row in reader: project_id = row["projectID"] organization_id = row["organisationID"] + if project_id not in terms: + continue terms[project_id].annotate_object( has_participant, Reference(prefix=ORGANIZATION_PREFIX, identifier=organization_id), # TODO can add all sorts of annotations from this file, like the cost, role, ordinal ) + yield PROJECT + yield STATUS + yield from KEY_TO_STATUS.values() yield from terms.values() if __name__ == "__main__": - CordisProjectGetter.cli() + CordisProjectGetter.cli(["--obo"]) From f6b2f57b374c8875eb4a715474226e9604b625c7 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 14:08:45 +0200 Subject: [PATCH 16/18] Update test_struct_term.py --- tests/test_struct/test_obo/test_struct_term.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_struct/test_obo/test_struct_term.py b/tests/test_struct/test_obo/test_struct_term.py index 90ce971a..1e36d3b3 100644 --- a/tests/test_struct/test_obo/test_struct_term.py +++ b/tests/test_struct/test_obo/test_struct_term.py @@ -714,7 +714,7 @@ def test_12_property_string_with_language(self) -> None: def test_12_property_string_with_quote(self) -> None: """Test emitting a string property literal with a quote in it.""" term = Term(reference=LYSINE_DEHYDROGENASE_ACT) - term.annotate_string(RO_DUMMY, "\"value\" added") + term.annotate_string(RO_DUMMY, '"value" added') self.assert_obo_stanza( term, obo="""\ From d87910531352e65b1ee30d9394dee0015b814100 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 14:28:23 +0200 Subject: [PATCH 17/18] Up --- src/pyobo/sources/cordis/cordis_basis.py | 2 ++ src/pyobo/sources/cordis/cordis_organization.py | 2 ++ src/pyobo/sources/cordis/cordis_project.py | 2 ++ src/pyobo/sources/cordis/cordis_topic.py | 2 ++ src/pyobo/sources/cordis/utils.py | 2 ++ 5 files changed, 10 insertions(+) diff --git a/src/pyobo/sources/cordis/cordis_basis.py b/src/pyobo/sources/cordis/cordis_basis.py index c5c8fc8a..c722af52 100644 --- a/src/pyobo/sources/cordis/cordis_basis.py +++ b/src/pyobo/sources/cordis/cordis_basis.py @@ -1,5 +1,7 @@ """Converter for CORDIS legal bases.""" +from __future__ import annotations + from collections.abc import Iterable from pyobo.sources.cordis.utils import BASIS_PREFIX, open_cordis diff --git a/src/pyobo/sources/cordis/cordis_organization.py b/src/pyobo/sources/cordis/cordis_organization.py index 41125a03..d754f6f1 100644 --- a/src/pyobo/sources/cordis/cordis_organization.py +++ b/src/pyobo/sources/cordis/cordis_organization.py @@ -1,5 +1,7 @@ """Converter for CORDIS organizations.""" +from __future__ import annotations + from collections.abc import Iterable from curies import vocabulary as v diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index 0694228c..71186e8b 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -1,5 +1,7 @@ """Converter for CORDIS projects.""" +from __future__ import annotations + from collections import Counter from collections.abc import Iterable diff --git a/src/pyobo/sources/cordis/cordis_topic.py b/src/pyobo/sources/cordis/cordis_topic.py index 2864e956..12b57566 100644 --- a/src/pyobo/sources/cordis/cordis_topic.py +++ b/src/pyobo/sources/cordis/cordis_topic.py @@ -1,5 +1,7 @@ """Converter for CORDIS topics.""" +from __future__ import annotations + from collections.abc import Iterable from pyobo.sources.cordis.utils import TOPIC_PREFIX, clean_topic_id, open_cordis diff --git a/src/pyobo/sources/cordis/utils.py b/src/pyobo/sources/cordis/utils.py index f1864f80..d24ec9a8 100644 --- a/src/pyobo/sources/cordis/utils.py +++ b/src/pyobo/sources/cordis/utils.py @@ -1,5 +1,7 @@ """Utilities for CORDIS resources.""" +from __future__ import annotations + import csv from collections.abc import Generator from contextlib import contextmanager From f8ec630f93820139363663f50d101b9464754475 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 18 May 2026 15:00:44 +0200 Subject: [PATCH 18/18] Finish --- src/pyobo/sources/cordis/cordis_project.py | 11 ++++++----- src/pyobo/sources/cordis/cordis_topic.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py index 71186e8b..ef16ff5c 100644 --- a/src/pyobo/sources/cordis/cordis_project.py +++ b/src/pyobo/sources/cordis/cordis_project.py @@ -58,12 +58,12 @@ HAS_START = TypeDef( reference=default_reference(PROJECT_PREFIX, "hasStart"), domain=PROJECT.reference, - range=Reference.from_reference(v.xsd_date), + range=Reference.from_reference(v.xsd_datetime), ) HAS_END = TypeDef( reference=default_reference(PROJECT_PREFIX, "hasEnd"), domain=PROJECT.reference, - range=Reference.from_reference(v.xsd_date), + range=Reference.from_reference(v.xsd_datetime), ) ACRONYM = Reference.from_reference(v.acronym) @@ -136,10 +136,11 @@ def iter_terms(*, version: str | None = None) -> Iterable[Term]: scheme_counter[row["fundingScheme"]] += 1 term.annotate_string(HAS_FUNDING_SCHEME, funding_scheme) + # switch to date after https://github.com/protegeproject/protege/issues/1343 if start_date := row["startDate"]: - term.annotate_date(HAS_START, start_date) + term.annotate_datetime(HAS_START, start_date) if end_date := row["endDate"]: - term.annotate_date(HAS_END, end_date) + term.annotate_datetime(HAS_END, end_date) term.annotate_object(HAS_STATUS, KEY_TO_STATUS[row["status"]]) @@ -166,4 +167,4 @@ def iter_terms(*, version: str | None = None) -> Iterable[Term]: if __name__ == "__main__": - CordisProjectGetter.cli(["--obo"]) + CordisProjectGetter.cli() diff --git a/src/pyobo/sources/cordis/cordis_topic.py b/src/pyobo/sources/cordis/cordis_topic.py index 12b57566..5baf298c 100644 --- a/src/pyobo/sources/cordis/cordis_topic.py +++ b/src/pyobo/sources/cordis/cordis_topic.py @@ -32,4 +32,4 @@ def iter_terms(version: str | None = None) -> Iterable[Term]: if __name__ == "__main__": - CordisTopicGetter.cli(["--obo"]) + CordisTopicGetter.cli()