diff --git a/pyproject.toml b/pyproject.toml
index 4c849cb6..077ae207 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,9 +68,9 @@ dependencies = [
     "humanize",
     "tabulate",
     "cachier",
-    "pystow>=0.8.11",
+    "pystow>=0.8.13",
     "bioversions>=0.10.11",
-    "bioregistry>=0.13.48",
+    "bioregistry>=0.13.53",
     "ssslm>=0.2.0",
     "zenodo-client>=0.4.1",
     "class-resolver>=0.7.1",
diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py
index fbda7a24..d61f344d 100644
--- a/src/pyobo/sources/__init__.py
+++ b/src/pyobo/sources/__init__.py
@@ -17,6 +17,12 @@
 from .clinicaltrials import ClinicalTrialsGetter
 from .complexportal import ComplexPortalGetter
 from .conso import CONSOGetter
+from .cordis import (
+    CordisBasisGetter,
+    CordisOrganizationGetter,
+    CordisProjectGetter,
+    CordisTopicGetter,
+)
 from .cpt import CPTGetter
 from .credit import CreditGetter
 from .cvx import CVXGetter
@@ -96,6 +102,10 @@
     "ChEMBLTissueGetter",
     "ClinicalTrialsGetter",
     "ComplexPortalGetter",
+    "CordisBasisGetter",
+    "CordisOrganizationGetter",
+    "CordisProjectGetter",
+    "CordisTopicGetter",
     "CreditGetter",
     "DepMapGetter",
     "DictybaseGetter",
diff --git a/src/pyobo/sources/cordis/__init__.py b/src/pyobo/sources/cordis/__init__.py
new file mode 100644
index 00000000..95a442df
--- /dev/null
+++ b/src/pyobo/sources/cordis/__init__.py
@@ -0,0 +1,13 @@
+"""CORDIS sources."""
+
+from .cordis_basis import CordisBasisGetter
+from .cordis_organization import CordisOrganizationGetter
+from .cordis_project import CordisProjectGetter
+from .cordis_topic import CordisTopicGetter
+
+__all__ = [
+    "CordisBasisGetter",
+    "CordisOrganizationGetter",
+    "CordisProjectGetter",
+    "CordisTopicGetter",
+]
diff --git a/src/pyobo/sources/cordis/cordis_basis.py b/src/pyobo/sources/cordis/cordis_basis.py
new file mode 100644
index 00000000..c722af52
--- /dev/null
+++ b/src/pyobo/sources/cordis/cordis_basis.py
@@ -0,0 +1,47 @@
+"""Converter for CORDIS legal bases."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+from pyobo.sources.cordis.utils import BASIS_PREFIX, open_cordis
+from pyobo.struct import Obo, Term
+
+__all__ = [
+    "CordisBasisGetter",
+]
+
+
+class CordisBasisGetter(Obo):
+    """An ontology representation of CORDIS legal bases."""
+
+    ontology = BASIS_PREFIX
+    dynamic_version = True
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms()
+
+
+def iter_terms(version: str | None = None) -> Iterable[Term]:
+    """Iterate over CORDIS legal basis terms.
+
+    :param version: The version of the CORDIS data dump to read
+    :yields: An unlabeled term for each distinct, non-empty ``legalBasis``
+        value in ``project.csv``, sorted by identifier
+    """
+    with open_cordis("project.csv", version=version) as reader:
+        identifiers = {basis for row in reader if (basis := row["legalBasis"])}
+    # The ``title`` column of project.csv is the title of the *project* (the
+    # project converter uses it as the project label), so it can't be reused
+    # as the label of the legal basis.
+    # NOTE(review): if the dump has a dedicated legal basis table with labels,
+    # read the names from there instead -- confirm against the archive.
+    for identifier in sorted(identifiers):
+        yield Term.from_triple(BASIS_PREFIX, identifier, None)
+
+    # TODO implement some kind of hierarchy?
+
+
+if __name__ == "__main__":
+    CordisBasisGetter.cli()
diff --git a/src/pyobo/sources/cordis/cordis_organization.py b/src/pyobo/sources/cordis/cordis_organization.py
new file mode 100644
index 00000000..d754f6f1
--- /dev/null
+++ b/src/pyobo/sources/cordis/cordis_organization.py
@@ -0,0 +1,64 @@
+"""Converter for CORDIS organizations."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+from curies import vocabulary as v
+
+from pyobo import Obo, Reference, Term
+from pyobo.sources.cordis.utils import ORGANIZATION_PREFIX, open_cordis
+from pyobo.struct.typedef import has_homepage
+
+__all__ = [
+    "CordisOrganizationGetter",
+]
+
+ABBREVIATION = Reference.from_reference(v.abbreviation)
+
+
+class CordisOrganizationGetter(Obo):
+    """An ontology representation of CORDIS organizations."""
+
+    ontology = ORGANIZATION_PREFIX
+    typedefs = [has_homepage]
+    dynamic_version = True
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms()
+
+
+def iter_terms(version: str | None = None) -> Iterable[Term]:
+    """Iterate over CORDIS organization terms.
+
+    :param version: The version of the CORDIS data dump to read
+    :yields: A term for each distinct ``organisationID`` in
+        ``organization.csv``, built from the first row it appears in
+    """
+    with open_cordis("organization.csv", version=version) as reader:
+        # rows also carry a projectID, so the same organization can repeat
+        seen: set[str] = set()
+        for row in reader:
+            identifier = row["organisationID"]
+            if identifier in seen:
+                continue
+            seen.add(identifier)
+            term = Term(
+                reference=Reference(
+                    prefix=ORGANIZATION_PREFIX, identifier=identifier, name=row["name"]
+                )
+            )
+            if short_name := row["shortName"]:
+                term.append_synonym(short_name, type=ABBREVIATION)
+            if url := row["organizationURL"]:
+                term.annotate_uri(has_homepage, url)
+            if vat := row["vatNumber"]:
+                term.append_exact_match(Reference(prefix="vat", identifier=vat))
+            term.append_exact_match(Reference(prefix="eu.rcn", identifier=row["rcn"]))
+            # TODO city, country, nutsCode
+            yield term
+
+
+if __name__ == "__main__":
+    CordisOrganizationGetter.cli()
diff --git a/src/pyobo/sources/cordis/cordis_project.py b/src/pyobo/sources/cordis/cordis_project.py
new file mode 100644
index 00000000..ef16ff5c
--- /dev/null
+++ b/src/pyobo/sources/cordis/cordis_project.py
@@ -0,0 +1,181 @@
+"""Converter for CORDIS projects."""
+
+from __future__ import annotations
+
+from collections import Counter
+from collections.abc import Iterable
+
+from curies import vocabulary as v
+from tabulate import tabulate
+from tqdm import tqdm
+
+from pyobo import Obo, Reference, Term, TypeDef, default_reference
+from pyobo.sources.cordis.utils import (
+    BASIS_PREFIX,
+    ORGANIZATION_PREFIX,
+    PROJECT_PREFIX,
+    TOPIC_PREFIX,
+    clean_topic_id,
+    open_cordis,
+)
+from pyobo.struct.typedef import has_participant
+
+__all__ = [
+    "CordisProjectGetter",
+]
+
+# see euscivoc, which is in skosxl format
+
+PROJECT = Term.from_triple("foaf", "Project", "project")
+STATUS = Term(reference=default_reference(PROJECT_PREFIX, "status"))
+KEY_TO_STATUS = {
+    "CLOSED": Term(reference=default_reference(PROJECT_PREFIX, "closed")).append_parent(STATUS),
+    "SIGNED": Term(reference=default_reference(PROJECT_PREFIX, "signed")).append_parent(STATUS),
+    "TERMINATED": Term(reference=default_reference(PROJECT_PREFIX, "terminated")).append_parent(
+        STATUS
+    ),
+}
+
+HAS_LEGAL_BASIS = TypeDef(
+    reference=default_reference(PROJECT_PREFIX, "hasLegalBasis"), domain=PROJECT.reference
+)
+HAS_TOPIC = TypeDef(
+    reference=default_reference(PROJECT_PREFIX, "hasTopic"), domain=PROJECT.reference
+)
+HAS_FUNDING_SCHEME = TypeDef(
+    reference=default_reference(PROJECT_PREFIX, "hasFundingScheme"),
+    domain=PROJECT.reference,
+    range=Reference.from_reference(v.xsd_string),
+)
+HAS_KEYWORD = TypeDef(  # TODO replace with SDO
+    reference=default_reference(PROJECT_PREFIX, "hasKeyword"),
+    range=Reference.from_reference(v.xsd_string),
+    domain=PROJECT.reference,
+)
+HAS_STATUS = TypeDef(
+    reference=default_reference(PROJECT_PREFIX, "hasStatus"), domain=PROJECT.reference
+)
+HAS_START = TypeDef(
+    reference=default_reference(PROJECT_PREFIX, "hasStart"),
+    domain=PROJECT.reference,
+    range=Reference.from_reference(v.xsd_datetime),
+)
+HAS_END = TypeDef(
+    reference=default_reference(PROJECT_PREFIX, "hasEnd"),
+    domain=PROJECT.reference,
+    range=Reference.from_reference(v.xsd_datetime),
+)
+ACRONYM = Reference.from_reference(v.acronym)
+
+
+class CordisProjectGetter(Obo):
+    """An ontology representation of CORDIS projects."""
+
+    ontology = PROJECT_PREFIX
+    typedefs = [
+        HAS_LEGAL_BASIS,
+        HAS_TOPIC,
+        HAS_FUNDING_SCHEME,
+        HAS_KEYWORD,
+        HAS_STATUS,
+        HAS_START,
+        HAS_END,
+    ]
+    dynamic_version = True
+    root_terms = [STATUS.reference, PROJECT.reference]
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms()
+
+
+def iter_terms(*, version: str | None = None) -> Iterable[Term]:
+    """Iterate over CORDIS project terms.
+
+    :param version: The version of the CORDIS data dump to read
+    :yields: The project root term, the status terms, then a term for each
+        row of ``project.csv`` whose grant DOI could be parsed
+    """
+    terms: dict[str, Term] = {}
+    scheme_counter: Counter[str] = Counter()
+    with open_cordis("project.csv", version=version) as reader:
+        for row in reader:
+            term = Term(
+                reference=Reference(
+                    prefix=PROJECT_PREFIX, identifier=row["id"], name=row["title"]
+                ),
+                # definition=row['objective'],
+            ).append_parent(PROJECT)
+            # don't emit an empty synonym when no acronym is given
+            if acronym := row["acronym"]:
+                term.append_synonym(acronym, type=ACRONYM)
+            term.annotate_object(
+                HAS_LEGAL_BASIS, Reference(prefix=BASIS_PREFIX, identifier=row["legalBasis"])
+            )
+
+            doi = row["grantDoi"]
+            try:
+                doi_reference = Reference(prefix="doi", identifier=doi)
+            except ValueError:
+                tqdm.write(f"[{term.curie}] problem with DOI: {doi}")
+                continue
+            else:
+                term.append_exact_match(doi_reference)
+
+            try:
+                rcn_id = Reference(prefix="eu.rcn", identifier=row["rcn"])
+            except ValueError:
+                pass  # this is probably the same offset issue as above
+                # tqdm.write(f"[{term.curie}] problem with RCN: {doi}")
+            else:
+                term.append_exact_match(rcn_id)
+
+            for topic in row["topics"].split(","):
+                # an empty column or a stray comma yields blank entries,
+                # which can't be turned into a reference
+                if topic_id := topic.strip():
+                    term.annotate_object(
+                        HAS_TOPIC,
+                        Reference(prefix=TOPIC_PREFIX, identifier=clean_topic_id(topic_id)),
+                    )
+
+            for keyword in row["keywords"].split(","):
+                if keyword_stripped := keyword.strip().strip('"').strip():
+                    term.annotate_string(HAS_KEYWORD, keyword_stripped)
+
+            if funding_scheme := row["fundingScheme"]:
+                scheme_counter[funding_scheme] += 1
+                term.annotate_string(HAS_FUNDING_SCHEME, funding_scheme)
+
+            # switch to date after https://github.com/protegeproject/protege/issues/1343
+            if start_date := row["startDate"]:
+                term.annotate_datetime(HAS_START, start_date)
+            if end_date := row["endDate"]:
+                term.annotate_datetime(HAS_END, end_date)
+
+            term.annotate_object(HAS_STATUS, KEY_TO_STATUS[row["status"]])
+
+            terms[term.identifier] = term
+
+    tqdm.write(tabulate(scheme_counter.most_common()))
+
+    with open_cordis("organization.csv", version=version) as reader:
+        for row in reader:
+            project_id = row["projectID"]
+            organization_id = row["organisationID"]
+            if project_id not in terms:
+                continue
+            terms[project_id].annotate_object(
+                has_participant,
+                Reference(prefix=ORGANIZATION_PREFIX, identifier=organization_id),
+                # TODO can add all sorts of annotations from this file, like the cost, role, ordinal
+            )
+
+    yield PROJECT
+    yield STATUS
+    yield from KEY_TO_STATUS.values()
+    yield from terms.values()
+
+
+if __name__ == "__main__":
+    CordisProjectGetter.cli()
diff --git a/src/pyobo/sources/cordis/cordis_topic.py b/src/pyobo/sources/cordis/cordis_topic.py
new file mode 100644
index 00000000..5baf298c
--- /dev/null
+++ b/src/pyobo/sources/cordis/cordis_topic.py
@@ -0,0 +1,40 @@
+"""Converter for CORDIS topics."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+from pyobo.sources.cordis.utils import TOPIC_PREFIX, clean_topic_id, open_cordis
+from pyobo.struct import Obo, Term
+
+__all__ = [
+    "CordisTopicGetter",
+]
+
+
+class CordisTopicGetter(Obo):
+    """An ontology representation of CORDIS topics."""
+
+    ontology = TOPIC_PREFIX
+    dynamic_version = True
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms()
+
+
+def iter_terms(version: str | None = None) -> Iterable[Term]:
+    """Iterate over CORDIS topic terms.
+
+    :param version: The version of the CORDIS data dump to read
+    :yields: A term for each distinct ``topic`` in ``topics.csv``, sorted by
+        the raw identifier and labeled with the last title seen for it
+    """
+    with open_cordis("topics.csv", version=version) as reader:
+        unique = {row["topic"]: row["title"] for row in reader}
+    for identifier, name in sorted(unique.items()):
+        yield Term.from_triple(TOPIC_PREFIX, clean_topic_id(identifier), name)
+
+
+if __name__ == "__main__":
+    CordisTopicGetter.cli()
diff --git a/src/pyobo/sources/cordis/utils.py b/src/pyobo/sources/cordis/utils.py
new file mode 100644
index 00000000..d24ec9a8
--- /dev/null
+++ b/src/pyobo/sources/cordis/utils.py
@@ -0,0 +1,65 @@
+"""Utilities for CORDIS resources."""
+
+from __future__ import annotations
+
+import csv
+from collections.abc import Generator
+from contextlib import contextmanager
+from pathlib import Path
+
+from pystow.utils import open_zip_dict_reader
+
+from pyobo.utils.path import ensure_path
+
+__all__ = [
+    "BASIS_PREFIX",
+    "ORGANIZATION_PREFIX",
+    "PROJECT_PREFIX",
+    "TOPIC_PREFIX",
+    "URL",
+    "clean_topic_id",
+    "get_cordis_path",
+    "open_cordis",
+]
+
+#: A URL for the latest CORDIS data dump
+URL = "https://cordis.europa.eu/data/cordis-h2020projects-csv.zip"
+
+PROJECT_PREFIX = "cordis.project"
+ORGANIZATION_PREFIX = "cordis.organization"
+BASIS_PREFIX = "cordis.basis"
+TOPIC_PREFIX = "cordis.topic"
+
+
+def get_cordis_path(*, version: str | None = None) -> Path:
+    """Get the path to the CORDIS data dump archive.
+
+    :param version: The version of the data dump, passed to :func:`ensure_path`
+    :returns: The path returned by :func:`ensure_path` for :data:`URL`
+    """
+    return ensure_path("cordis", url=URL, version=version)
+
+
+@contextmanager
+def open_cordis(
+    inner_path: str, *, version: str | None = None
+) -> Generator[csv.DictReader[str], None, None]:
+    """Open a semicolon-delimited CSV from inside the CORDIS data dump.
+
+    :param inner_path: The name of the CSV file inside the archive
+    :param version: The version of the data dump to open
+    :yields: A dictionary reader over the rows of the CSV file
+    """
+    path = get_cordis_path(version=version)
+    with open_zip_dict_reader(path, inner_path, delimiter=";", quoting=csv.QUOTE_MINIMAL) as reader:
+        yield reader
+
+
+def clean_topic_id(topic_id: str) -> str:
+    """Fix CORDIS topic IDs that might have spaces in them.
+
+    :param topic_id: A topic identifier as it appears in the CORDIS data
+    :returns: The identifier with each space replaced by ``%20``
+    """
+    # identifier cleanup needed for `RISK FINANCE` and `SCIENCE WAF SOCIETY`
+    return topic_id.replace(" ", "%20")