Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@ dependencies = [
"humanize",
"tabulate",
"cachier",
"pystow>=0.8.11",
"pystow>=0.8.13",
"bioversions>=0.10.11",
"bioregistry>=0.13.48",
"bioregistry>=0.13.53",
"ssslm>=0.2.0",
"zenodo-client>=0.4.1",
"class-resolver>=0.7.1",
Expand Down
10 changes: 10 additions & 0 deletions src/pyobo/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
from .clinicaltrials import ClinicalTrialsGetter
from .complexportal import ComplexPortalGetter
from .conso import CONSOGetter
from .cordis import (
CordisBasisGetter,
CordisOrganizationGetter,
CordisProjectGetter,
CordisTopicGetter,
)
from .cpt import CPTGetter
from .credit import CreditGetter
from .cvx import CVXGetter
Expand Down Expand Up @@ -96,6 +102,10 @@
"ChEMBLTissueGetter",
"ClinicalTrialsGetter",
"ComplexPortalGetter",
"CordisBasisGetter",
"CordisOrganizationGetter",
"CordisProjectGetter",
"CordisTopicGetter",
"CreditGetter",
"DepMapGetter",
"DictybaseGetter",
Expand Down
13 changes: 13 additions & 0 deletions src/pyobo/sources/cordis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""CORDIS sources."""

from .cordis_basis import CordisBasisGetter
from .cordis_organization import CordisOrganizationGetter
from .cordis_project import CordisProjectGetter
from .cordis_topic import CordisTopicGetter

__all__ = [
"CordisBasisGetter",
"CordisOrganizationGetter",
"CordisProjectGetter",
"CordisTopicGetter",
]
37 changes: 37 additions & 0 deletions src/pyobo/sources/cordis/cordis_basis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Converter for CORDIS legal bases."""

from __future__ import annotations

from collections.abc import Iterable

from pyobo.sources.cordis.utils import BASIS_PREFIX, open_cordis
from pyobo.struct import Obo, Term

__all__ = [
"CordisBasisGetter",
]


class CordisBasisGetter(Obo):
    """Expose the legal bases referenced by CORDIS projects as an ontology."""

    # Every term produced by this getter lives under the legal basis prefix.
    ontology = BASIS_PREFIX
    # NOTE(review): flag consumed by the ``Obo`` base class; presumably means the
    # version is looked up at build time instead of being pinned here - confirm.
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Yield the legal basis terms.

        :param force: Part of the ``Obo`` interface; not used by this implementation.
        :yields: The terms produced by the module-level :func:`iter_terms`.
        """
        yield from iter_terms()


def iter_terms(version: str | None = None) -> Iterable[Term]:
    """Iterate over CORDIS legal basis terms.

    The legal bases are collected from the ``legalBasis`` column of
    ``project.csv``. The terms are intentionally left without a name: the
    ``title`` column of that file is the title of the *project* (it is what the
    project converter uses as the project's name), so using it here labelled
    each legal basis with the title of an arbitrary project that cites it.

    :param version: The CORDIS data version, passed through to :func:`open_cordis`.
    :yields: One unnamed term per distinct, non-empty legal basis, sorted by identifier.
    """
    with open_cordis("project.csv", version=version) as reader:
        # Rows without a legal basis are dropped so that no term with an empty
        # identifier is produced.
        identifiers = {row["legalBasis"] for row in reader if row["legalBasis"]}
    # NOTE(review): the CORDIS bundle presumably ships a dedicated legal basis table
    # with proper titles; if ``open_cordis`` can read it, names should come from there.
    for identifier in sorted(identifiers):
        yield Term.from_triple(BASIS_PREFIX, identifier, None)

# TODO implement some kind of hierarchy?


# Script entry point: delegate to the ``cli()`` callable exposed on the getter class.
if __name__ == "__main__":
    CordisBasisGetter.cli()
58 changes: 58 additions & 0 deletions src/pyobo/sources/cordis/cordis_organization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Converter for CORDIS organizations."""

from __future__ import annotations

from collections.abc import Iterable

from curies import vocabulary as v

from pyobo import Obo, Reference, Term
from pyobo.sources.cordis.utils import ORGANIZATION_PREFIX, open_cordis
from pyobo.struct.typedef import has_homepage

__all__ = [
"CordisOrganizationGetter",
]

ABBREVIATION = Reference.from_reference(v.abbreviation)


class CordisOrganizationGetter(Obo):
    """Expose the organizations participating in CORDIS projects as an ontology."""

    # Every term produced by this getter lives under the organization prefix.
    ontology = ORGANIZATION_PREFIX
    # NOTE(review): flag consumed by the ``Obo`` base class; presumably means the
    # version is looked up at build time instead of being pinned here - confirm.
    dynamic_version = True
    # The homepage property is the only one used when annotating organization terms.
    typedefs = [has_homepage]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Yield the organization terms.

        :param force: Part of the ``Obo`` interface; not used by this implementation.
        :yields: The terms produced by the module-level :func:`iter_terms`.
        """
        yield from iter_terms()


def iter_terms(version: str | None = None) -> Iterable[Term]:
    """Iterate over CORDIS organization terms.

    ``organization.csv`` is streamed row by row. An organization can appear on
    several rows; only the first row seen for an ``organisationID`` is turned
    into a term and later rows for it are ignored.

    Each term gets, when the corresponding column is non-empty:

    - ``shortName`` as an abbreviation synonym
    - ``organizationURL`` as a homepage annotation
    - ``vatNumber`` as an exact match in the ``vat`` prefix
    - ``rcn`` as an exact match in the ``eu.rcn`` prefix

    :param version: The CORDIS data version, passed through to :func:`open_cordis`.
    :yields: One term per distinct organization identifier, in file order.
    """
    with open_cordis("organization.csv", version=version) as reader:
        seen: set[str] = set()
        for row in reader:
            identifier = row["organisationID"]
            if identifier in seen:
                continue
            seen.add(identifier)
            term = Term(
                reference=Reference(
                    prefix=ORGANIZATION_PREFIX, identifier=identifier, name=row["name"]
                )
            )
            if short_name := row["shortName"]:
                term.append_synonym(short_name, type=ABBREVIATION)
            if url := row["organizationURL"]:
                term.annotate_uri(has_homepage, url)
            if vat := row["vatNumber"]:
                term.append_exact_match(Reference(prefix="vat", identifier=vat))
            # Guarded like the other optional columns so that a missing RCN does not
            # turn into a reference with an empty identifier.
            if rcn := row["rcn"]:
                term.append_exact_match(Reference(prefix="eu.rcn", identifier=rcn))
            # TODO city, country, nutsCode
            yield term


# Script entry point: delegate to the ``cli()`` callable exposed on the getter class.
if __name__ == "__main__":
    CordisOrganizationGetter.cli()
170 changes: 170 additions & 0 deletions src/pyobo/sources/cordis/cordis_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
"""Converter for CORDIS projects."""

from __future__ import annotations

from collections import Counter
from collections.abc import Iterable

from curies import vocabulary as v
from tabulate import tabulate
from tqdm import tqdm

from pyobo import Obo, Reference, Term, TypeDef, default_reference
from pyobo.sources.cordis.utils import (
BASIS_PREFIX,
ORGANIZATION_PREFIX,
PROJECT_PREFIX,
TOPIC_PREFIX,
clean_topic_id,
open_cordis,
)
from pyobo.struct.typedef import has_participant

__all__ = [
"CordisProjectGetter",
]

# see euscivoc, which is in skosxl format

# Parent class shared by all project terms, taken from the ``foaf`` prefix.
PROJECT = Term.from_triple("foaf", "Project", "project")
# Parent of the status terms, minted as a default reference in the project prefix.
STATUS = Term(reference=default_reference(PROJECT_PREFIX, "status"))


def _status_term(local_id: str) -> Term:
    """Create a status term in the project prefix and make it a child of ``STATUS``."""
    return Term(reference=default_reference(PROJECT_PREFIX, local_id)).append_parent(STATUS)


# Lookup from the value of the ``status`` column to the corresponding status term.
KEY_TO_STATUS = {
    "CLOSED": _status_term("closed"),
    "SIGNED": _status_term("signed"),
    "TERMINATED": _status_term("terminated"),
}

# Properties whose subject is a project; the ones with a ``range`` take literal values.
HAS_LEGAL_BASIS = TypeDef(
    domain=PROJECT.reference,
    reference=default_reference(PROJECT_PREFIX, "hasLegalBasis"),
)
HAS_TOPIC = TypeDef(
    domain=PROJECT.reference,
    reference=default_reference(PROJECT_PREFIX, "hasTopic"),
)
HAS_FUNDING_SCHEME = TypeDef(
    domain=PROJECT.reference,
    range=Reference.from_reference(v.xsd_string),
    reference=default_reference(PROJECT_PREFIX, "hasFundingScheme"),
)
# TODO replace with SDO
HAS_KEYWORD = TypeDef(
    domain=PROJECT.reference,
    range=Reference.from_reference(v.xsd_string),
    reference=default_reference(PROJECT_PREFIX, "hasKeyword"),
)
HAS_STATUS = TypeDef(
    domain=PROJECT.reference,
    reference=default_reference(PROJECT_PREFIX, "hasStatus"),
)
HAS_START = TypeDef(
    domain=PROJECT.reference,
    range=Reference.from_reference(v.xsd_datetime),
    reference=default_reference(PROJECT_PREFIX, "hasStart"),
)
HAS_END = TypeDef(
    domain=PROJECT.reference,
    range=Reference.from_reference(v.xsd_datetime),
    reference=default_reference(PROJECT_PREFIX, "hasEnd"),
)

# Synonym type attached to project acronyms.
ACRONYM = Reference.from_reference(v.acronym)


class CordisProjectGetter(Obo):
    """Expose CORDIS projects, with their status vocabulary, as an ontology."""

    # Every project term produced by this getter lives under the project prefix.
    ontology = PROJECT_PREFIX
    # NOTE(review): flag consumed by the ``Obo`` base class; presumably means the
    # version is looked up at build time instead of being pinned here - confirm.
    dynamic_version = True
    # The two parent terms that the module-level generator emits first.
    root_terms = [STATUS.reference, PROJECT.reference]
    # Project-specific properties declared at module level.
    typedefs = [
        HAS_LEGAL_BASIS,
        HAS_TOPIC,
        HAS_FUNDING_SCHEME,
        HAS_KEYWORD,
        HAS_STATUS,
        HAS_START,
        HAS_END,
    ]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Yield the project terms together with their parent and status terms.

        :param force: Part of the ``Obo`` interface; not used by this implementation.
        :yields: The terms produced by the module-level :func:`iter_terms`.
        """
        yield from iter_terms()


def iter_terms(*, version: str | None = None) -> Iterable[Term]:
    """Iterate over CORDIS project terms.

    The function makes two passes. It first builds one term per row of
    ``project.csv`` (acronym, legal basis, DOI, RCN, topics, keywords, funding
    scheme, start/end dates and status). It then reads ``organization.csv`` and
    attaches each listed organization to its project as a participant.

    A row whose ``grantDoi`` cannot be turned into a DOI reference is reported
    through ``tqdm.write`` and skipped entirely. A table of funding scheme
    counts is also written through ``tqdm.write`` after the first pass.

    :param version: The CORDIS data version, passed through to :func:`open_cordis`.
    :yields: ``PROJECT``, ``STATUS``, the status terms, then one term per project.
    :raises KeyError: If a row has a status that is not a key of ``KEY_TO_STATUS``.
    """
    terms: dict[str, Term] = {}
    scheme_counter: Counter[str] = Counter()
    with open_cordis("project.csv", version=version) as reader:
        for row in reader:
            term = Term(
                # Use the shared constant so project terms always carry the same
                # prefix as the status terms, typedefs, and the ontology itself.
                reference=Reference(
                    prefix=PROJECT_PREFIX, identifier=row["id"], name=row["title"]
                ),
                # definition=row['objective'],
            ).append_parent(PROJECT)

            # An empty acronym must not be recorded as a synonym.
            if acronym := row["acronym"]:
                term.append_synonym(acronym, type=ACRONYM)

            term.annotate_object(
                HAS_LEGAL_BASIS, Reference(prefix=BASIS_PREFIX, identifier=row["legalBasis"])
            )

            doi = row["grantDoi"]
            try:
                doi_reference = Reference(prefix="doi", identifier=doi)
            except ValueError:
                # NOTE(review): an unusable DOI is treated as a sign that the whole
                # row is unreliable (see the "offset issue" remark below), so the
                # project is dropped rather than emitted without a DOI - confirm.
                tqdm.write(f"[{term.curie}] problem with DOI: {doi}")
                continue
            else:
                term.append_exact_match(doi_reference)

            try:
                rcn_id = Reference(prefix="eu.rcn", identifier=row["rcn"])
            except ValueError:
                pass  # this is probably the same offset issue as above
                # tqdm.write(f"[{term.curie}] problem with RCN: {doi}")
            else:
                term.append_exact_match(rcn_id)

            for topic in row["topics"].split(","):
                # ``"".split(",")`` returns ``[""]``, so a project without topics
                # would otherwise get a topic reference built from an empty string.
                if not topic.strip():
                    continue
                term.annotate_object(
                    HAS_TOPIC, Reference(prefix=TOPIC_PREFIX, identifier=clean_topic_id(topic))
                )

            for keyword in row["keywords"].split(","):
                if keyword_stripped := keyword.strip().strip('"').strip():
                    term.annotate_string(HAS_KEYWORD, keyword_stripped)

            if funding_scheme := row["fundingScheme"]:
                scheme_counter[funding_scheme] += 1
                term.annotate_string(HAS_FUNDING_SCHEME, funding_scheme)

            # switch to date after https://github.com/protegeproject/protege/issues/1343
            if start_date := row["startDate"]:
                term.annotate_datetime(HAS_START, start_date)
            if end_date := row["endDate"]:
                term.annotate_datetime(HAS_END, end_date)

            term.annotate_object(HAS_STATUS, KEY_TO_STATUS[row["status"]])

            terms[term.identifier] = term

    tqdm.write(tabulate(scheme_counter.most_common()))

    with open_cordis("organization.csv", version=version) as reader:
        for row in reader:
            project_id = row["projectID"]
            organization_id = row["organisationID"]
            # Participations of projects that were skipped (or never listed) are ignored.
            if project_id not in terms:
                continue
            terms[project_id].annotate_object(
                has_participant,
                Reference(prefix=ORGANIZATION_PREFIX, identifier=organization_id),
                # TODO can add all sorts of annotations from this file, like the cost, role, ordinal
            )

    yield PROJECT
    yield STATUS
    yield from KEY_TO_STATUS.values()
    yield from terms.values()


# Script entry point: delegate to the ``cli()`` callable exposed on the getter class.
if __name__ == "__main__":
    CordisProjectGetter.cli()
35 changes: 35 additions & 0 deletions src/pyobo/sources/cordis/cordis_topic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Converter for CORDIS topics."""

from __future__ import annotations

from collections.abc import Iterable

from pyobo.sources.cordis.utils import TOPIC_PREFIX, clean_topic_id, open_cordis
from pyobo.struct import Obo, Term

__all__ = [
"CordisTopicGetter",
]


class CordisTopicGetter(Obo):
    """Expose the topics referenced by CORDIS projects as an ontology."""

    # Every term produced by this getter lives under the topic prefix.
    ontology = TOPIC_PREFIX
    # NOTE(review): flag consumed by the ``Obo`` base class; presumably means the
    # version is looked up at build time instead of being pinned here - confirm.
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Yield the topic terms.

        :param force: Part of the ``Obo`` interface; not used by this implementation.
        :yields: The terms produced by the module-level :func:`iter_terms`.
        """
        yield from iter_terms()


def iter_terms(version: str | None = None) -> Iterable[Term]:
    """Iterate over CORDIS topic terms.

    :param version: The CORDIS data version, passed through to :func:`open_cordis`.
    :yields: One term per distinct raw ``topic`` value in ``topics.csv``, ordered
        by that raw value; the term identifier is the cleaned topic identifier.
    """
    titles: dict[str, str] = {}
    with open_cordis("topics.csv", version=version) as reader:
        for row in reader:
            # When a topic is listed more than once, the last title read is kept.
            titles[row["topic"]] = row["title"]
    for raw_topic in sorted(titles):
        yield Term.from_triple(TOPIC_PREFIX, clean_topic_id(raw_topic), titles[raw_topic])


# Script entry point: delegate to the ``cli()`` callable exposed on the getter class.
if __name__ == "__main__":
    CordisTopicGetter.cli()
Loading
Loading