From d252ec14ce27bd46dab060e12a896c66683eb2ca Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 9 Oct 2025 15:35:03 +0200 Subject: [PATCH 01/24] Implement skos reader --- src/pyobo/struct/skosrdf.py | 104 ++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 src/pyobo/struct/skosrdf.py diff --git a/src/pyobo/struct/skosrdf.py b/src/pyobo/struct/skosrdf.py new file mode 100644 index 00000000..835b0df9 --- /dev/null +++ b/src/pyobo/struct/skosrdf.py @@ -0,0 +1,104 @@ +"""Read SKOS from RDF.""" + +import rdflib +from rdflib import Graph, RDF, SKOS, URIRef, Node, VANN, DCTERMS +from tqdm import tqdm +import curies +from bioregistry import NormalizedNamedReference, NormalizedNamableReference +from pyobo.struct import Term, Obo +from pyobo.identifier_utils import get_converter +from pyobo.struct import build_ontology + +__all__ = [ + "get_skos_ontology", +] + + +def get_skos_ontology(graph: rdflib.Graph, *, prefix: str | None = None, ) -> Obo: + converter = get_converter() + schemes = list(graph.subjects(RDF.type, SKOS.ConceptScheme)) + if len(schemes) != 1: + raise ValueError + scheme = schemes[0] + + print(f'found graph: {scheme}') + + def _get_scheme_object_literal(p: Node) -> str | None: + for o in graph.objects(scheme, p): + return str(o) + return None + + if prefix is None: + prefix = _get_scheme_object_literal(VANN.preferredNamespacePrefix) + + if prefix is None: + raise ValueError(f'no prefix given nor found using {VANN.preferredNamespacePrefix}') + + root_terms = [ + NormalizedNamableReference.from_reference(converter.parse_uri(subject, strict=True)) + for subject in graph.objects(scheme, SKOS.hasTopConcept) + ] + terms = [ + get_term(graph, concept, converter=converter) + for concept in tqdm(graph.subjects(RDF.type, SKOS.Concept)) + ] + + return build_ontology( + prefix=prefix, + terms=terms, + root_terms=root_terms, + idspaces={ + curie_prefix: str(uri_prefix) + for curie_prefix, uri_prefix in graph.namespaces() 
+ }, + name=_get_scheme_object_literal(DCTERMS.title), + description=_get_scheme_object_literal(DCTERMS.description), + ) + + +def _literal_objects(graph: Graph, subject: Node, predicate: Node) -> list[rdflib.Literal]: + return [o for o in graph.objects(subject, predicate) if + isinstance(o, rdflib.Literal) and o._language in DEFAULT_LANGUAGES] + + +DEFAULT_LANGUAGES = {"en", None} + + +def get_term(graph: rdflib.Graph, node: URIRef, converter: curies.Converter) -> Term: + """Get a term.""" + reference_tuple = converter.parse_uri(node, strict=True) + labels = _literal_objects(graph, node, SKOS.prefLabel) + definitions = _literal_objects(graph, node, SKOS.definition) + term = Term( + reference=NormalizedNamedReference(prefix=reference_tuple.prefix, identifier=reference_tuple.identifier, + name=labels[0] if labels else None), + definition=definitions[0] if definitions else None, + ) + for alt in _literal_objects(graph, node, SKOS.altLabel): + term.append_synonym(alt) + + for exact_match in graph.objects(node, SKOS.exactMatch): + term.append_exact_match(converter.parse_uri(exact_match, strict=True)) + + # TODO broad, narrow, related match. 
add to term functions too + return term + + +def _split_literals(literals: list[rdflib.Literal]) -> tuple[str, str]: + for literal in literals: + if literal._language == "en" or literal._language is None: + return literal, "en", {} + literal = literals[0] + return str(literal), literal._language, {} + + +def main(): + import pystow + url = "https://raw.githubusercontent.com/dini-ag-kim/hcrt/refs/heads/master/hcrt.ttl" + graph = pystow.ensure_rdf("dalia", url=url) + ontology = get_skos_ontology(graph) + ontology.write_obo("/Users/cthoyt/Desktop/hcrt.obo") + + +if __name__ == '__main__': + main() From d80987359690d5e5a5c31c2219d81019b31daed0 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 9 Oct 2025 16:08:28 +0200 Subject: [PATCH 02/24] Add finished prototype --- src/pyobo/constants.py | 3 +- src/pyobo/getters.py | 39 +++++++----- src/pyobo/struct/{skosrdf.py => skos.py} | 63 ++++++++++++------- src/pyobo/utils/misc.py | 1 + tests/test_struct/test_skos/__init__.py | 1 + tests/test_struct/test_skos/test.ttl | 28 +++++++++ .../test_struct/test_skos/test_skos_reader.py | 31 +++++++++ 7 files changed, 130 insertions(+), 36 deletions(-) rename src/pyobo/struct/{skosrdf.py => skos.py} (64%) create mode 100644 tests/test_struct/test_skos/__init__.py create mode 100644 tests/test_struct/test_skos/test.ttl create mode 100644 tests/test_struct/test_skos/test_skos_reader.py diff --git a/src/pyobo/constants.py b/src/pyobo/constants.py index 1234095b..c88314c5 100644 --- a/src/pyobo/constants.py +++ b/src/pyobo/constants.py @@ -226,7 +226,7 @@ class IterHelperHelperDict(SlimGetOntologyKwargs): #: The ontology format -OntologyFormat: TypeAlias = Literal["obo", "owl", "json", "rdf"] +OntologyFormat: TypeAlias = Literal["obo", "owl", "json", "rdf", "skos"] #: from table 2 of the Functional OWL syntax definition #: at https://www.w3.org/TR/owl2-syntax/#IRIs @@ -254,4 +254,5 @@ class OntologyPathPack(NamedTuple): ("owl", bioregistry.get_owl_download), ("json", 
bioregistry.get_json_download), ("rdf", bioregistry.get_rdf_download), + ("skos", bioregistry.get_skos_download), ] diff --git a/src/pyobo/getters.py b/src/pyobo/getters.py index 22712098..c89b5663 100644 --- a/src/pyobo/getters.py +++ b/src/pyobo/getters.py @@ -38,7 +38,7 @@ from .identifier_utils import ParseError, wrap_norm_prefix from .plugins import has_nomenclature_plugin, run_nomenclature_plugin from .struct import Obo -from .struct.obo import from_obo_path, from_obonet +from .struct.obo import from_obonet from .utils.io import safe_open_writer from .utils.misc import _get_version_from_artifact from .utils.path import ensure_path, prefix_directory_join @@ -189,22 +189,33 @@ def get_ontology( if cache: obo.write_default(force=force_process) return obo + elif ontology_format == "skos": + from .struct.skosrdf import read_skos + + obo = read_skos(prefix=prefix, path=path) + if cache: + obo.write_default(force=force) + return obo + elif ontology_format == "obo": + from .struct.obo import from_obo_path + + obo = from_obo_path( + path, + prefix=prefix, + strict=strict, + version=version, + upgrade=upgrade, + use_tqdm=use_tqdm, + _cache_path=obonet_json_gz_path, + ) + if cache: + obo.write_default(force=force_process) + return obo + elif ontology_format == "jskos": + raise NotImplementedError else: raise UnhandledFormatError(f"[{prefix}] unhandled ontology file format: {path.suffix}") - obo = from_obo_path( - path, - prefix=prefix, - strict=strict, - version=version, - upgrade=upgrade, - use_tqdm=use_tqdm, - _cache_path=obonet_json_gz_path, - ) - if cache: - obo.write_default(force=force_process) - return obo - def _ensure_ontology_path( prefix: str, *, force: bool, version: str | None diff --git a/src/pyobo/struct/skosrdf.py b/src/pyobo/struct/skos.py similarity index 64% rename from src/pyobo/struct/skosrdf.py rename to src/pyobo/struct/skos.py index 835b0df9..9258276b 100644 --- a/src/pyobo/struct/skosrdf.py +++ b/src/pyobo/struct/skos.py @@ -1,28 +1,45 @@ 
"""Read SKOS from RDF.""" +from pathlib import Path + +import curies import rdflib -from rdflib import Graph, RDF, SKOS, URIRef, Node, VANN, DCTERMS +from bioregistry import NormalizedNamableReference, NormalizedNamedReference +from rdflib import DCTERMS, RDF, SKOS, VANN, Graph, Node, URIRef from tqdm import tqdm -import curies -from bioregistry import NormalizedNamedReference, NormalizedNamableReference -from pyobo.struct import Term, Obo + from pyobo.identifier_utils import get_converter -from pyobo.struct import build_ontology +from pyobo.struct import Obo, Term, build_ontology __all__ = [ "get_skos_ontology", + "read_skos", ] -def get_skos_ontology(graph: rdflib.Graph, *, prefix: str | None = None, ) -> Obo: - converter = get_converter() +def read_skos( + path: str | Path, *, prefix: str | None = None, converter: curies.Converter | None = None +) -> Obo: + """Read a SKOS RDF file.""" + graph = rdflib.Graph() + graph.parse(path) + return get_skos_ontology(graph, prefix=prefix, converter=converter) + + +def get_skos_ontology( + graph: rdflib.Graph, + *, + prefix: str | None = None, + converter: curies.Converter | None = None, +) -> Obo: + """Extract an ontology from a SKOS RDF graph.""" + if converter is None: + converter = get_converter() schemes = list(graph.subjects(RDF.type, SKOS.ConceptScheme)) if len(schemes) != 1: raise ValueError scheme = schemes[0] - print(f'found graph: {scheme}') - def _get_scheme_object_literal(p: Node) -> str | None: for o in graph.objects(scheme, p): return str(o) @@ -32,7 +49,7 @@ def _get_scheme_object_literal(p: Node) -> str | None: prefix = _get_scheme_object_literal(VANN.preferredNamespacePrefix) if prefix is None: - raise ValueError(f'no prefix given nor found using {VANN.preferredNamespacePrefix}') + raise ValueError(f"no prefix given nor found using {VANN.preferredNamespacePrefix}") root_terms = [ NormalizedNamableReference.from_reference(converter.parse_uri(subject, strict=True)) @@ -47,18 +64,18 @@ def 
_get_scheme_object_literal(p: Node) -> str | None: prefix=prefix, terms=terms, root_terms=root_terms, - idspaces={ - curie_prefix: str(uri_prefix) - for curie_prefix, uri_prefix in graph.namespaces() - }, + idspaces={curie_prefix: str(uri_prefix) for curie_prefix, uri_prefix in graph.namespaces()}, name=_get_scheme_object_literal(DCTERMS.title), description=_get_scheme_object_literal(DCTERMS.description), ) def _literal_objects(graph: Graph, subject: Node, predicate: Node) -> list[rdflib.Literal]: - return [o for o in graph.objects(subject, predicate) if - isinstance(o, rdflib.Literal) and o._language in DEFAULT_LANGUAGES] + return [ + o + for o in graph.objects(subject, predicate) + if isinstance(o, rdflib.Literal) and o._language in DEFAULT_LANGUAGES + ] DEFAULT_LANGUAGES = {"en", None} @@ -70,8 +87,11 @@ def get_term(graph: rdflib.Graph, node: URIRef, converter: curies.Converter) -> labels = _literal_objects(graph, node, SKOS.prefLabel) definitions = _literal_objects(graph, node, SKOS.definition) term = Term( - reference=NormalizedNamedReference(prefix=reference_tuple.prefix, identifier=reference_tuple.identifier, - name=labels[0] if labels else None), + reference=NormalizedNamedReference( + prefix=reference_tuple.prefix, + identifier=reference_tuple.identifier, + name=labels[0] if labels else None, + ), definition=definitions[0] if definitions else None, ) for alt in _literal_objects(graph, node, SKOS.altLabel): @@ -92,13 +112,14 @@ def _split_literals(literals: list[rdflib.Literal]) -> tuple[str, str]: return str(literal), literal._language, {} -def main(): +def _demo(): import pystow + url = "https://raw.githubusercontent.com/dini-ag-kim/hcrt/refs/heads/master/hcrt.ttl" graph = pystow.ensure_rdf("dalia", url=url) ontology = get_skos_ontology(graph) ontology.write_obo("/Users/cthoyt/Desktop/hcrt.obo") -if __name__ == '__main__': - main() +if __name__ == "__main__": + _demo() diff --git a/src/pyobo/utils/misc.py b/src/pyobo/utils/misc.py index 
8f352c2c..92a9e919 100644 --- a/src/pyobo/utils/misc.py +++ b/src/pyobo/utils/misc.py @@ -132,6 +132,7 @@ def _get_obograph_json_version(prefix: str, url: str) -> str | None: "obo": _get_obo_version, "owl": _get_owl_version, "json": _get_obograph_json_version, + # TODO add version getters for SKOS, JSKOS } diff --git a/tests/test_struct/test_skos/__init__.py b/tests/test_struct/test_skos/__init__.py new file mode 100644 index 00000000..55cb8f26 --- /dev/null +++ b/tests/test_struct/test_skos/__init__.py @@ -0,0 +1 @@ +"""Test the SKOS reader.""" diff --git a/tests/test_struct/test_skos/test.ttl b/tests/test_struct/test_skos/test.ttl new file mode 100644 index 00000000..ef35503e --- /dev/null +++ b/tests/test_struct/test_skos/test.ttl @@ -0,0 +1,28 @@ +@base . +@prefix dct: . +@prefix skos: . +@prefix vann: . + + + a skos:ConceptScheme; + dct:title "Hochschulcampus Ressourcentypen"@de, "Higher Education Resource Types"@en, "Brontypen voor het hoger onderwijs"@nl, "Типи ресурсів вищої освіти"@uk, "Typy zdrojů vyššího vzdělání"@cs ; + dct:description "Eine Wertelliste für Typen von Lernressourcen (Learning Resource Type), entstanden im Kontext des Metadatenschemas \"LOM for Higher Education OER Repositories\" (https://w3id.org/dini-ag-kim/hs-oer-lom-profil/latest/)."@de ; + dct:publisher ; + dct:issued "2020-02-07" ; + vann:preferredNamespaceUri "https://w3id.org/kim/hcrt/" ; + vann:preferredNamespacePrefix "hcrt" ; + dct:license ; + skos:hasTopConcept , ,