Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ dependencies = [
"cachier",
"pystow>=0.7.5",
"bioversions>=0.8.101",
"bioregistry>=0.12.30",
"bioregistry>=0.13.10",
"bioontologies>=0.7.2",
"ssslm>=0.0.13",
"zenodo-client>=0.3.6",
Expand All @@ -80,6 +80,7 @@ dependencies = [
"curies-processing>=0.1.2",
"python-dateutil",
"networkx>=3.4",
"jskos",
# Resource Downloaders
"drugbank_downloader",
"chembl_downloader",
Expand Down
29 changes: 24 additions & 5 deletions src/pyobo/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
import re
from collections.abc import Callable
from pathlib import Path
from typing import Literal, NamedTuple, TypeAlias
from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias

import pystow
from typing_extensions import NotRequired, TypedDict

if TYPE_CHECKING:
from bioregistry.schema import AnnotatedURL

__all__ = [
"DATABASE_DIRECTORY",
"DEFAULT_PREFIX_MAP",
Expand Down Expand Up @@ -225,7 +228,7 @@ class IterHelperHelperDict(SlimGetOntologyKwargs):


#: The ontology format
OntologyFormat: TypeAlias = Literal["obo", "owl", "json", "rdf"]
OntologyFormat: TypeAlias = Literal["obo", "owl", "json", "rdf", "skos", "jskos"]

#: from table 2 of the Functional OWL syntax definition
#: at https://www.w3.org/TR/owl2-syntax/#IRIs
Expand All @@ -244,6 +247,8 @@ class OntologyPathPack(NamedTuple):
format: OntologyFormat
#: The path to the ontology file
path: Path
#: The RDF format
rdf_format: str | None


def _get_obo_download(prefix: str) -> str | None:
Expand All @@ -264,17 +269,31 @@ def _get_json_download(prefix: str) -> str | None:
return bioregistry.get_json_download(prefix)


def _get_rdf_download(prefix: str) -> str | AnnotatedURL | None:
    """Get the RDF download registered for the prefix.

    :param prefix: The prefix to look up
    :returns: Whatever :func:`bioregistry.get_rdf_download` returns when called
        with ``get_format=True``: per the annotation, either a plain URL string,
        an annotated URL (presumably carrying the RDF serialization - confirm
        against the bioregistry documentation), or ``None``.
    """
    # imported lazily, following the other getters in this module
    import bioregistry

    return bioregistry.get_rdf_download(prefix, get_format=True)


def _get_skos_download(prefix: str) -> str | AnnotatedURL | None:
    """Look up the SKOS download registered for ``prefix``.

    The lookup is delegated to :func:`bioregistry.get_skos_download` with
    ``get_format=True``, so the result may be a plain URL string, an annotated
    URL, or ``None``.
    """
    from bioregistry import get_skos_download

    return get_skos_download(prefix, get_format=True)


def _get_jskos_download(prefix: str) -> str | None:
    """Get the JSKOS download URL registered for the prefix.

    :param prefix: The prefix to look up
    :returns: The result of :func:`bioregistry.get_jskos_download`, annotated
        here as a URL string or ``None``.
    """
    # imported lazily, following the other getters in this module
    import bioregistry

    # A JSKOS getter must not fall back to the RDF download: the result is
    # handed to a JSKOS reader downstream.
    return bioregistry.get_jskos_download(prefix)


#: Functions that get ontology files. Order matters in this list,
#: since order implicitly defines priority. A getter returns ``None``
#: when nothing is registered, and otherwise either a plain URL string
#: or an annotated URL.
ONTOLOGY_GETTERS: list[tuple[OntologyFormat, Callable[[str], str | AnnotatedURL | None]]] = [
    ("obo", _get_obo_download),
    ("owl", _get_owl_download),
    ("json", _get_json_download),
    ("rdf", _get_rdf_download),
    ("skos", _get_skos_download),
    ("jskos", _get_jskos_download),
]
65 changes: 58 additions & 7 deletions src/pyobo/getters.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import click
import pystow.utils
import requests.exceptions
from bioregistry.schema import AnnotatedURL, RDFFormat
from tabulate import tabulate
from tqdm.auto import tqdm
from typing_extensions import Unpack
Expand All @@ -32,6 +33,7 @@
ONTOLOGY_GETTERS,
GetOntologyKwargs,
IterHelperHelperDict,
OntologyFormat,
OntologyPathPack,
SlimGetOntologyKwargs,
)
Expand Down Expand Up @@ -177,18 +179,36 @@ def get_ontology(
path_pack = _ensure_ontology_path(prefix, force=force, version=version)
if path_pack is None:
raise NoBuildError(prefix)
ontology_format, path = path_pack
ontology_format, path, rdf_format = path_pack
if ontology_format == "obo":
pass # all gucci
elif ontology_format in {"owl", "rdf"}:
elif ontology_format == "owl":
path = _convert_to_obo(path)
elif ontology_format == "rdf":
from .struct.generic_rdf import read_generic_rdf

return read_generic_rdf(path=path, prefix=prefix, rdf_format=rdf_format)
elif ontology_format == "json":
from .struct.obograph import read_obograph

obo = read_obograph(prefix=prefix, path=path)
if cache:
obo.write_default(force=force_process)
return obo
elif ontology_format == "skos":
from .struct.skos import read_skos

obo = read_skos(prefix=prefix, path=path, rdf_format=rdf_format)
if cache:
obo.write_default(force=force)
return obo
elif ontology_format == "jskos":
from .struct.jskos_utils import read_jskos

obo = read_jskos(prefix=prefix, path=path)
if cache:
obo.write_default(force=force)
return obo
else:
raise UnhandledFormatError(f"[{prefix}] unhandled ontology file format: {path.suffix}")

Expand All @@ -206,21 +226,52 @@ def get_ontology(
return obo


# File suffixes keyed by ontology format. Only the formats listed here
# have an entry; other formats are intentionally absent.
ONTOLOGY_FORMAT_TO_SUFFIX: dict[OntologyFormat, str] = {
    "skos": ".ttl",
    "jskos": ".json",
}

# File suffixes keyed by RDF format name. ``_name_from_url`` looks a format up
# here when the name derived from a URL has no file extension.
XX_TO_SUFFIX: dict[str, str] = {"rdf/xml": ".xml", "xml": ".xml"}


def _name_from_url(
    url: str, ontology_format: OntologyFormat, *, rdf_format: str | None = None
) -> str:
    """Get a local file name for a download URL, ensuring it has a file extension.

    :param url: The URL of the ontology artifact
    :param ontology_format: The ontology format the URL was registered under.
        Used to pick a suffix when no RDF format is annotated.
    :param rdf_format: The RDF format annotated on the URL, if any
    :returns: The name given by :func:`pystow.utils.name_from_url`, with a
        suffix appended when that name contains no ``.``
    :raises ValueError: If the name has no extension and no suffix can be
        determined from either the RDF format or the ontology format
    """
    name = pystow.utils.name_from_url(url)
    if "." in name:
        return name

    if rdf_format is not None:
        suffix = XX_TO_SUFFIX.get(rdf_format)
        if suffix is None:
            # an explicit message instead of a bare KeyError on the lookup
            raise ValueError(f"no file suffix is known for RDF format {rdf_format!r} of {url}")
    else:
        # Without an RDF format (e.g., JSKOS getters only return plain URLs),
        # fall back to the suffix registered for the ontology format itself.
        suffix = ONTOLOGY_FORMAT_TO_SUFFIX.get(ontology_format)
        if suffix is None:
            raise ValueError(f"need to curate a RDF format for {url}")

    # TODO add unit test that checks all downloads with no extension have a (RDF) format
    return name + suffix


def _ensure_ontology_path(
prefix: str, *, force: bool, version: str | None
) -> OntologyPathPack | None:
rdf_format: RDFFormat | None
for ontology_format, getter in ONTOLOGY_GETTERS:
url = getter(prefix)
if url is None:
continue
match getter(prefix):
case None:
continue
case AnnotatedURL() as a:
url = a.url
rdf_format = a.rdf_format
case str() as url:
rdf_format = None
case _:
raise TypeError

name = _name_from_url(url, ontology_format, rdf_format=rdf_format)

try:
path = ensure_path(prefix, url=url, force=force, version=version)
path = ensure_path(prefix, url=url, force=force, version=version, name=name)
except (urllib.error.HTTPError, pystow.utils.DownloadError):
continue
except pystow.utils.UnexpectedDirectoryError:
continue # TODO report more info about the URL and the name it tried to make
else:
return OntologyPathPack(ontology_format, path)
return OntologyPathPack(ontology_format, path, rdf_format)
return None


Expand Down
70 changes: 70 additions & 0 deletions src/pyobo/oer_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Get all OER-related prefixes."""

import shutil

import bioregistry
import click
import pystow
from bioontologies.robot import ROBOTError
from more_click import verbose_option
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

import pyobo
from pyobo.getters import NoBuildError

# Prefixes that ``main`` leaves out of its run (presumably because they have
# already been checked - confirm with the author).
VALIDATED = {"ccso", "iana.mediatype"}
# Prefixes that ``main`` also leaves out; the trailing comments point at the
# upstream artifact for each (the name suggests they need a dedicated PyOBO
# source - confirm with the author).
NEEDS_PYOBO = {
    "loc.fdd",  # see http://www.loc.gov/preservation/digital/formats/fddXML.zip
    "oerschema",  # see https://github.com/open-curriculum/oerschema/blob/master/src/config/schema.yml
}


# TODO add all vocabularies from https://vocabs.openeduhub.de/


@click.command()
@click.option("-r", "--refresh", is_flag=True)
@verbose_option
def main(refresh: bool = False) -> None:
    """Get all OER-related prefixes."""
    # NOTE: the docstring above doubles as the click help text, so further
    # documentation lives in comments.
    #
    # With --refresh, the raw download directory of each prefix is deleted and
    # the command exits. Otherwise, each prefix's ontology is loaded and the
    # number of terms is reported.
    collection_id = "0000018"
    collection = bioregistry.get_collection(collection_id)
    if collection is None:
        raise ValueError(f"Bioregistry collection {collection_id} could not be found")

    prefixes = [p for p in collection.resources if p not in VALIDATED and p not in NEEDS_PYOBO]
    if refresh:
        for prefix in tqdm(prefixes):
            directory = pystow.join("pyobo", "raw", prefix)
            if directory.is_dir():
                shutil.rmtree(directory)
        return

    # the progress bar is disabled; per-prefix status is written via tqdm.write
    for prefix in tqdm(prefixes, disable=True):
        tqdm.write(
            click.style(f"[{prefix}] {bioregistry.get_name(prefix, strict=True)}", fg="green")
        )
        with logging_redirect_tqdm():
            try:
                ontology = pyobo.get_ontology(prefix, cache=False, force_process=True, force=False)
            except NotImplementedError as e:
                tqdm.write(click.style(f"[{prefix}] failed because not implemented: {e}", fg="red"))
                continue
            except NoBuildError:
                tqdm.write(click.style(f"[{prefix}] no build", fg="yellow"))
                continue
            except ROBOTError as e:
                tqdm.write(click.style(f"[{prefix}]\n{e}", fg="yellow"))
                continue
            except Exception as e:
                # report which prefix failed, then let the error propagate
                tqdm.write(click.style(f"[{prefix}] failed\n\t{e}\n\n", fg="red"))
                raise
        terms = list(ontology)
        if not terms:
            tqdm.write(click.style(f"[{prefix}] failed, got no terms\n", fg="red"))
        else:
            tqdm.write(f"[{prefix}] got {len(terms):,} terms\n")


if __name__ == "__main__":
main()
Loading