diff --git a/.github/workflows/update-metadata.yml b/.github/workflows/update-metadata.yml index 75260a781..5e4950278 100644 --- a/.github/workflows/update-metadata.yml +++ b/.github/workflows/update-metadata.yml @@ -47,7 +47,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pyyaml + pip install -r dev-requirements.txt - name: Update version metadata (if triggered by tag) if: steps.extract_version.outputs.is_tag == 'true' @@ -57,12 +57,21 @@ jobs: python -m quadriga.metadata.update_version_from_tag - name: Update metadata files + env: + PYTHONHASHSEED: 0 run: python -m quadriga.metadata.run_all - - name: Check if files changed + - name: Stage metadata files + run: | + # Add all metadata files that exist (handles both new and modified files) + for file in metadata.yml CITATION.bib CITATION.cff .zenodo.json metadata.jsonld metadata.rdf; do + [ -f "$file" ] && git add "$file" + done + + - name: Check if files staged id: check_changes run: | - if git diff --quiet metadata.yml && git diff --quiet CITATION.bib && git diff --quiet CITATION.cff; then + if git diff --cached --quiet; then echo "changes_detected=false" >> $GITHUB_OUTPUT else echo "changes_detected=true" >> $GITHUB_OUTPUT @@ -73,19 +82,14 @@ jobs: run: | git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" - git add metadata.yml CITATION.bib CITATION.cff git commit -m "[Automated] Update metadata files" git push - name: Commit changes and move tag (tag-triggered) if: steps.check_changes.outputs.changes_detected == 'true' && steps.extract_version.outputs.is_tag == 'true' run: | - # Configure git git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" - - # Commit the metadata changes - git add metadata.yml CITATION.bib CITATION.cff git commit -m "[Automated] Update metadata for version ${{ 
steps.extract_version.outputs.version }}" # Delete the old tag (locally and remotely) diff --git a/.zenodo.json b/.zenodo.json index db40b9de5..1bac1e8a6 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -18,8 +18,8 @@ "orcid": "0000-0002-1421-4320" } ], - "description": "Die vorliegende Fallstudie bereitet – in Form eines “Jupyter Books” – den Prozess und die Ergebnisse eines Forschungsprojekts aus den Digital Humanities didaktisch auf.", - "publication_date": "2025-06-20", + "description": "

Die vorliegende Fallstudie bereitet – in Form eines “Jupyter Books” – den Prozess und die Ergebnisse eines Forschungsprojekts aus den Digital Humanities didaktisch auf.

\n

Das interaktive Lehrbuch kann als Web-Version verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch auf GitHub zur Verfügung.

\n

Die QUADRIGA-OER sind nach einem einheitlichen Template gestaltet, werden nach einem standardisierten Verfahren qualitätsgeprüft und mit Metadaten ausgezeichnet.

\n
QUADRIGA Datenkompetenzzentrum
\n

QUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um den Forschungsprozess möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes Datenkompetenzframework zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentative Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.

\n

Zu den Zielgruppen von QUADRIGA zählen insbesondere promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.

\n

QUADRIGA ist eines von 11 Datenkompetenzzentren in Deutschland und wird vom Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR) und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartnern zählen:\n

\n

\n\n

Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im Umsetzungskonzept erfahren.

\n\n

Weitere Informationen sowie Publikationen finden Sie auf der Webseite, in der Zenodo-Community und der GitHub-Organisation des Projekts.

\n", + "publication_date": "2026-02-19", "keywords": [ "Open Educational Resource" ], diff --git a/CITATION.bib b/CITATION.bib index 92edc4407..2ff4e1e72 100644 --- a/CITATION.bib +++ b/CITATION.bib @@ -1,7 +1,7 @@ -@book{Skorinkin_Quantitative_2025, +@book{Skorinkin_Quantitative_2026, title = {Quantitative Analyse der kommunikativen Barrierearmut des Berliner Senats (2011-2024). Eine Fallstudie}, author = {Skorinkin, Daniil and Sluyter-Gäthje, Henny and Trilcke, Peer}, - year = {2025}, + year = {2026}, version = {1.0.0-beta}, doi = {10.5281/zenodo.15682652}, url = {https://quadriga-dk.github.io/Text-Fallstudie-2/}, diff --git a/CITATION.cff b/CITATION.cff index cd1b68697..7648b0449 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -37,7 +37,7 @@ references: - name: The Jupyter Book Community website: https://github.com/jupyter-book/jupyter-book/graphs/contributors preferred-citation: - year: '2025' + year: '2026' authors: *id001 title: Quantitative Analyse der kommunikativen Barrierearmut des Berliner Senats (2011-2024). Eine Fallstudie diff --git a/dev-requirements.txt b/dev-requirements.txt index 4818cc541..9d69b3603 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1 +1,2 @@ -pyyaml \ No newline at end of file +pyyaml +rdflib diff --git a/metadata.jsonld b/metadata.jsonld new file mode 100644 index 000000000..50957398f --- /dev/null +++ b/metadata.jsonld @@ -0,0 +1,359 @@ +{ + "@context": { + "schema": "http://schema.org/", + "dc": "http://purl.org/dc/elements/1.1/", + "dcterms": "http://purl.org/dc/terms/", + "lrmi": "http://purl.org/dcx/lrmi-terms/", + "skos": "http://www.w3.org/2004/02/skos/core#", + "@vocab": "http://schema.org/" + }, + "@type": [ + "Book", + "LearningResource" + ], + "name": "Quantitative Analyse der kommunikativen Barrierearmut des Berliner Senats (2011-2024). 
Eine Fallstudie", + "description": "Die vorliegende Fallstudie bereitet – in Form eines “Jupyter Books” – den Prozess und die Ergebnisse eines Forschungsprojekts aus den Digital Humanities didaktisch auf.", + "identifier": { + "@type": "PropertyValue", + "propertyID": "DOI", + "value": "10.5281/zenodo.TODOTODO", + "url": "https://doi.org/10.5281/zenodo.TODOTODO" + }, + "version": "1.0.0-beta", + "schemaVersion": "1.0.0", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/", + "workExample": { + "@type": "SoftwareSourceCode", + "name": "Source Code Repository", + "codeRepository": "https://github.com/quadriga-dk/Text-Fallstudie-2" + }, + "datePublished": "2024-06-17", + "dateModified": "2026-02-19", + "author": [ + { + "@type": "Person", + "givenName": "Daniil", + "familyName": "Skorinkin", + "name": "Daniil Skorinkin", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-1845-9974", + "url": "https://orcid.org/0000-0002-1845-9974" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Henny", + "familyName": "Sluyter-Gäthje", + "name": "Henny Sluyter-Gäthje", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-2969-3237", + "url": "https://orcid.org/0000-0003-2969-3237" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Peer", + "familyName": "Trilcke", + "name": "Peer Trilcke", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-1421-4320", + "url": "https://orcid.org/0000-0002-1421-4320" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + } + ], + "contributor": [ + { + "@type": "Person", + "givenName": "Hannes", + "familyName": "Schnaitter", + "name": "Hannes Schnaitter", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", 
+ "value": "0000-0002-1602-6032", + "url": "https://orcid.org/0000-0002-1602-6032" + }, + "affiliation": { + "@type": "Organization", + "name": "Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft" + } + }, + { + "@type": "Person", + "givenName": "Evgenia", + "familyName": "Samoilova", + "name": "Evgenia Samoilova", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-3858-901X", + "url": "https://orcid.org/0000-0003-3858-901X" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Lamia", + "familyName": "Islam", + "name": "Lamia Islam", + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + } + ], + "inLanguage": "de", + "keywords": [ + "Open Educational Resource" + ], + "about": [ + { + "@type": "Thing", + "name": "Open Educational Resource" + }, + { + "@type": "Thing", + "name": "übergreifend" + }, + { + "@type": "Thing", + "name": "übergreifend" + } + ], + "audience": [ + { + "@type": "Audience", + "audienceType": "Forschende (PostDoc)" + }, + { + "@type": "Audience", + "audienceType": "Forschende (Projektleitung)" + }, + { + "@type": "Audience", + "audienceType": "Promovierende" + }, + { + "@type": "Audience", + "audienceType": "Hochschullehrende" + } + ], + "timeRequired": "PT10H", + "license": [ + { + "@type": "CreativeWork", + "name": "Source Code", + "license": "https://opensource.org/licenses/AGPL-3.0" + }, + { + "@type": "CreativeWork", + "name": "Content", + "license": "https://creativecommons.org/licenses/by-sa/4.0/" + } + ], + "hasPart": [ + { + "@type": "LearningResource", + "name": "Präambel", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/präambel/einführung.html", + "timeRequired": "TODO", + "teaches": "TODO", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "TODO", + "educationalFramework": "QUADRIGA 
Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Fragestellung und Operationalisierung. Einführung in die Fallstudie", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/research_question/research-question_intro.html", + "timeRequired": "TODO", + "teaches": "Sie kennen Grundfragen der geisteswissenschaftlichen Forschung mit Korpora nach dem qualitativen Methodenparadigma und wissen um das Konzept der “Operationalisierung”.", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Sie kennen Ansätze zur Operationalisierung von Forschungsfragen für quantitaive Methoden-Settings.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Korpora als epistemische Objekte", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/corpus_collection/corpus-collection_intro.html", + "timeRequired": "TODO", + "teaches": "Sie kennen unterschiedliche Ansätze des Korpusaufbaus und sind mit der Erstellung basaler Metadaten vertraut.", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Sie verfügen über Basiswissen über Korpora als geisteswissenschaftliche Forschungsobjekte und kennen Typen von Korpora.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie sind mit der Idee von Metadaten vertraut und kennen basale Metadatenschemata für Korpora und Korpus-Elemente.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht 
anwendbar | Data Flow: nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Texte als digitale Objekte. Einführung in HTML", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/html_intro/html-intro_intro.html", + "timeRequired": "TODO", + "teaches": "Abgrenzung von unterschiedlichen Formate zur Textspeicherung und benennen der Vor- und Nachteile. Struktur von HTML erklären und die Funktion von ausgewählte HTML-Tags benennen. HTML-Tags zur Extraktion von Plain Text aus HTML aufzählen.", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Sie kennen unterschiedliche Realisierungsweisen und Formate von Text im Digitalen.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie können die Semantik der textangebenden html-Tags beschreiben und Tags zur Textextraktion auswählen.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Scraping als Methode zum Korpusaufbau", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/scraping_intro/scraping_intro.html", + "timeRequired": "TODO", + "teaches": "Sie können HTTP-Abfragen konzeptionell erklären und unterschiedliche Methoden zur automatisierten Abfrage von Websites unterscheiden sowie Vor- und Nachteile benennen.", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Sie können die Komponenten aufzählen, die an einem HTTP-Request beteiligt sind, den Unterschied zwischen dem Aufbau eines HTTP-Requests und einer HTTP-Response erläutern sowie die Response codes 200, 404, 403 und 500 interpretieren.", + "educationalFramework": "QUADRIGA Competency 
Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie können verschiedene Methoden der Website-Abfrage aufzählen und Unterschiede identifizieren.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie können Vor- und Nachteile der Methoden erklären und ermitteln, in welchen Szenarien welche Methode geeignet ist.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Korpusaufbau", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/corpus_building/corpus-building_intro.html", + "timeRequired": "TODO", + "teaches": "Die Lernenden können den Quellcode einer Website untersuchen, geeignete HTML-Tags zur Textextraktion ermitteln und entscheiden, welche Scraping-Methode für die Extraktion verwendet werden muss.", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Lernenden können mit Hilfe eines Jupyter Notebooks Python-Code zur Extraktion des Website-Texts ausführen.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Korpusanalyse. 
Textkomplexität", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/corpus_analysis/corpus-analysis_intro.html", + "timeRequired": "TODO", + "teaches": "Sie können die auf einem Korpus ausgeführte Berechnung der Textkomplexität erklären und die Ergebnisse interpretieren.", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Sie können die Textmaße (Wortlänge, Satzlänge etc.), die zur Berechnung der Textkomplexität dienen, aufzählen.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie kennen verschiedene Methoden zur Berechnung der Textkomplexität und können die Vor- und Nachteile der Methoden aufzeigen.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie können das Konzept eines Liniendiagramms erklären.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie können die Konzeption der Analyse beschreiben und andere Möglichkeiten des Korpus-Splitting entwerfen.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + }, + { + "@type": "AlignmentObject", + "targetName": "Sie können das Konzept eines Balkendiagramms erklären und das erstellte Diagramm interpretieren sowie die Gründe für Ihre Interpretation nennen.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht 
anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Reflexion", + "description": "TODO", + "url": "https://quadriga-dk.github.io/Text-Fallstudie-2/reflection/reflection_reflection.html", + "timeRequired": "TODO", + "teaches": "TODO", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die methodischen Limitationen einer Digital Humanities-Fallstudie können benannt werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar" + } + ] + } + ], + "dcterms:tableOfContents": "- Präambel\n- Fragestellung und Operationalisierung. Einführung in die Fallstudie\n- Korpora als epistemische Objekte\n- Texte als digitale Objekte. Einführung in HTML\n- Scraping als Methode zum Korpusaufbau\n- Korpusaufbau\n- Korpusanalyse\n- Reflexion\n- Epilog", + "funding": "Die vorliegenden Open Educational Resources wurden durch das Datenkompetenzzentrum QUADRIGA erstellt.\n\nFörderkennzeichen: 16DKZ2034", + "learningResourceType": "Jupyter Book", + "lrmi:learningResourceType": "Jupyter Book", + "dcterms:type": "Jupyter Book", + "dc:type": "Jupyter Book" +} \ No newline at end of file diff --git a/metadata.rdf b/metadata.rdf new file mode 100644 index 000000000..1fb6d8907 --- /dev/null +++ b/metadata.rdf @@ -0,0 +1,356 @@ + + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + TODO + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie kennen Ansätze zur Operationalisierung von Forschungsfragen für quantitaive Methoden-Settings. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie verfügen über Basiswissen über Korpora als geisteswissenschaftliche Forschungsobjekte und kennen Typen von Korpora. 
+ + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie sind mit der Idee von Metadaten vertraut und kennen basale Metadatenschemata für Korpora und Korpus-Elemente. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie kennen unterschiedliche Realisierungsweisen und Formate von Text im Digitalen. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können die Semantik der textangebenden html-Tags beschreiben und Tags zur Textextraktion auswählen. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können die Komponenten aufzählen, die an einem HTTP-Request beteiligt sind, den Unterschied zwischen dem Aufbau eines HTTP-Requests und einer HTTP-Response erläutern sowie die Response codes 200, 404, 403 und 500 interpretieren. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können verschiedene Methoden der Website-Abfrage aufzählen und Unterschiede identifizieren. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können Vor- und Nachteile der Methoden erklären und ermitteln, in welchen Szenarien welche Methode geeignet ist. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Die Lernenden können mit Hilfe eines Jupyter Notebooks Python-Code zur Extraktion des Website-Texts ausführen. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können die Textmaße (Wortlänge, Satzlänge etc.), die zur Berechnung der Textkomplexität dienen, aufzählen. 
+ + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie kennen verschiedene Methoden zur Berechnung der Textkomplexität und können die Vor- und Nachteile der Methoden aufzeigen. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können das Konzept eines Liniendiagramms erklären. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können die Konzeption der Analyse beschreiben und andere Möglichkeiten des Korpus-Splitting entwerfen. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Sie können das Konzept eines Balkendiagramms erklären und das erstellte Diagramm interpretieren sowie die Gründe für Ihre Interpretation nennen. + + + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: nicht anwendbar | Data Flow: nicht anwendbar + Die methodischen Limitationen einer Digital Humanities-Fallstudie können benannt werden. + + + Jupyter Book + - Präambel +- Fragestellung und Operationalisierung. Einführung in die Fallstudie +- Korpora als epistemische Objekte +- Texte als digitale Objekte. 
Einführung in HTML +- Scraping als Methode zum Korpusaufbau +- Korpusaufbau +- Korpusanalyse +- Reflexion +- Epilog + Jupyter Book + Jupyter Book + + + Open Educational Resource + + + + + übergreifend + + + + + übergreifend + + + + + Forschende (PostDoc) + + + + + Forschende (Projektleitung) + + + + + Promovierende + + + + + Hochschullehrende + + + + + + Skorinkin + Daniil + + Daniil Skorinkin + + + + + + Sluyter-Gäthje + Henny + + Henny Sluyter-Gäthje + + + + + + Trilcke + Peer + + Peer Trilcke + + + + + + Schnaitter + Hannes + + Hannes Schnaitter + + + + + + Samoilova + Evgenia + + Evgenia Samoilova + + + + + + Islam + Lamia + Lamia Islam + + + 2026-02-19 + 2024-06-17 + Die vorliegende Fallstudie bereitet – in Form eines “Jupyter Books” – den Prozess und die Ergebnisse eines Forschungsprojekts aus den Digital Humanities didaktisch auf. + Die vorliegenden Open Educational Resources wurden durch das Datenkompetenzzentrum QUADRIGA erstellt. + +Förderkennzeichen: 16DKZ2034 + + + TODO + + Präambel + TODO + TODO + + + + + + TODO + + Fragestellung und Operationalisierung. Einführung in die Fallstudie + Sie kennen Grundfragen der geisteswissenschaftlichen Forschung mit Korpora nach dem qualitativen Methodenparadigma und wissen um das Konzept der “Operationalisierung”. + TODO + + + + + + TODO + + + Korpora als epistemische Objekte + Sie kennen unterschiedliche Ansätze des Korpusaufbaus und sind mit der Erstellung basaler Metadaten vertraut. + TODO + + + + + + TODO + + + Texte als digitale Objekte. Einführung in HTML + Abgrenzung von unterschiedlichen Formate zur Textspeicherung und benennen der Vor- und Nachteile. Struktur von HTML erklären und die Funktion von ausgewählte HTML-Tags benennen. HTML-Tags zur Extraktion von Plain Text aus HTML aufzählen. 
+ TODO + + + + + + TODO + + + + Scraping als Methode zum Korpusaufbau + Sie können HTTP-Abfragen konzeptionell erklären und unterschiedliche Methoden zur automatisierten Abfrage von Websites unterscheiden sowie Vor- und Nachteile benennen. + TODO + + + + + + TODO + + Korpusaufbau + Die Lernenden können den Quellcode einer Website untersuchen, geeignete HTML-Tags zur Textextraktion ermitteln und entscheiden, welche Scraping-Methode für die Extraktion verwendet werden muss. + TODO + + + + + + TODO + + + + + + Korpusanalyse. Textkomplexität + Sie können die auf einem Korpus ausgeführte Berechnung der Textkomplexität erklären und die Ergebnisse interpretieren. + TODO + + + + + + TODO + + Reflexion + TODO + TODO + + + + + + DOI + + 10.5281/zenodo.TODOTODO + + + de + Open Educational Resource + Jupyter Book + + + + Source Code + + + + + + Content + + + Quantitative Analyse der kommunikativen Barrierearmut des Berliner Senats (2011-2024). Eine Fallstudie + 1.0.0 + PT10H + + 1.0.0-beta + + + + Source Code Repository + + + + + + Universität Potsdam + + + Universität Potsdam + + + Universität Potsdam + + + Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft + + + Universität Potsdam + + + Universität Potsdam + + + ORCID + + 0000-0002-1845-9974 + + + ORCID + + 0000-0003-2969-3237 + + + ORCID + + 0000-0002-1421-4320 + + + ORCID + + 0000-0002-1602-6032 + + + ORCID + + 0000-0003-3858-901X + + diff --git a/metadata.yml b/metadata.yml index bc0c87738..2fd682663 100644 --- a/metadata.yml +++ b/metadata.yml @@ -1,9 +1,6 @@ # yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/v1.0.0/schema.json -schema-version: 1.0.0 -book-version: 1.0.0-beta title: Quantitative Analyse der kommunikativen Barrierearmut des Berliner Senats (2011-2024). 
Eine Fallstudie -identifier: https://doi.org/10.5281/zenodo.TODOTODO authors: - given-names: Daniil family-names: Skorinkin @@ -17,19 +14,11 @@ authors: family-names: Trilcke orcid: https://orcid.org/0000-0002-1421-4320 affiliation: Universität Potsdam -contributors: -- given-names: Hannes - family-names: Schnaitter - orcid: https://orcid.org/0000-0002-1602-6032 - affiliation: Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft -- given-names: Evgenia - family-names: Samoilova - orcid: https://orcid.org/0000-0003-3858-901X - affiliation: Universität Potsdam -- given-names: Lamia - family-names: Islam - affiliation: Universität Potsdam -language: deu +keywords: +- Open Educational Resource +description: Die vorliegende Fallstudie bereitet – in Form eines “Jupyter Books” – + den Prozess und die Ergebnisse eines Forschungsprojekts aus den Digital Humanities + didaktisch auf. table-of-contents: '- Präambel - Fragestellung und Operationalisierung. Einführung in die Fallstudie @@ -47,55 +36,73 @@ table-of-contents: '- Präambel - Reflexion - Epilog' -description: TODO discipline: - übergreifend research-object-type: - übergreifend +target-group: +- Forschende (PostDoc) +- Forschende (Projektleitung) +- Promovierende +- Hochschullehrende +time-required: PT10H +language: de +contributors: +- given-names: Hannes + family-names: Schnaitter + orcid: https://orcid.org/0000-0002-1602-6032 + affiliation: Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft +- given-names: Evgenia + family-names: Samoilova + orcid: https://orcid.org/0000-0003-3858-901X + affiliation: Universität Potsdam +- given-names: Lamia + family-names: Islam + affiliation: Universität Potsdam +identifier: https://doi.org/10.5281/zenodo.TODOTODO +git: https://github.com/quadriga-dk/Text-Fallstudie-2 +url: https://quadriga-dk.github.io/Text-Fallstudie-2/ chapters: - title: Präambel url: 
https://quadriga-dk.github.io/Text-Fallstudie-2/präambel/einführung.html description: TODO learning-goal: TODO - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: TODO competency: nicht anwendbar data-flow: nicht anwendbar - blooms-category: TODO + blooms-category: nicht anwendbar - title: Fragestellung und Operationalisierung. Einführung in die Fallstudie url: https://quadriga-dk.github.io/Text-Fallstudie-2/research_question/research-question_intro.html description: TODO learning-goal: Sie kennen Grundfragen der geisteswissenschaftlichen Forschung mit Korpora nach dem qualitativen Methodenparadigma und wissen um das Konzept der “Operationalisierung”. - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: Sie kennen Ansätze zur Operationalisierung von Forschungsfragen für quantitaive Methoden-Settings. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - title: Korpora als epistemische Objekte url: https://quadriga-dk.github.io/Text-Fallstudie-2/corpus_collection/corpus-collection_intro.html description: TODO learning-goal: Sie kennen unterschiedliche Ansätze des Korpusaufbaus und sind mit der Erstellung basaler Metadaten vertraut. - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: Sie verfügen über Basiswissen über Korpora als geisteswissenschaftliche Forschungsobjekte und kennen Typen von Korpora. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie sind mit der Idee von Metadaten vertraut und kennen basale Metadatenschemata für Korpora und Korpus-Elemente. 
- competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - title: Texte als digitale Objekte. Einführung in HTML url: https://quadriga-dk.github.io/Text-Fallstudie-2/html_intro/html-intro_intro.html description: TODO @@ -103,123 +110,114 @@ chapters: benennen der Vor- und Nachteile. Struktur von HTML erklären und die Funktion von ausgewählte HTML-Tags benennen. HTML-Tags zur Extraktion von Plain Text aus HTML aufzählen. - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: Sie kennen unterschiedliche Realisierungsweisen und Formate von Text im Digitalen. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie können die Semantik der textangebenden html-Tags beschreiben und Tags zur Textextraktion auswählen. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - title: Scraping als Methode zum Korpusaufbau url: https://quadriga-dk.github.io/Text-Fallstudie-2/scraping_intro/scraping_intro.html description: TODO learning-goal: Sie können HTTP-Abfragen konzeptionell erklären und unterschiedliche Methoden zur automatisierten Abfrage von Websites unterscheiden sowie Vor- und Nachteile benennen. - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: Sie können die Komponenten aufzählen, die an einem HTTP-Request beteiligt sind, den Unterschied zwischen dem Aufbau eines HTTP-Requests und einer HTTP-Response erläutern sowie die Response codes 200, 404, 403 und 500 interpretieren. 
- competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie können verschiedene Methoden der Website-Abfrage aufzählen und Unterschiede identifizieren. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie können Vor- und Nachteile der Methoden erklären und ermitteln, in welchen Szenarien welche Methode geeignet ist. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - title: Korpusaufbau url: https://quadriga-dk.github.io/Text-Fallstudie-2/corpus_building/corpus-building_intro.html description: TODO learning-goal: Die Lernenden können den Quellcode einer Website untersuchen, geeignete HTML-Tags zur Textextraktion ermitteln und entscheiden, welche Scraping-Methode für die Extraktion verwendet werden muss. - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: Die Lernenden können mit Hilfe eines Jupyter Notebooks Python-Code zur Extraktion des Website-Texts ausführen. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - title: Korpusanalyse. Textkomplexität url: https://quadriga-dk.github.io/Text-Fallstudie-2/corpus_analysis/corpus-analysis_intro.html description: TODO learning-goal: Sie können die auf einem Korpus ausgeführte Berechnung der Textkomplexität erklären und die Ergebnisse interpretieren. - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: Sie können die Textmaße (Wortlänge, Satzlänge etc.), die zur Berechnung der Textkomplexität dienen, aufzählen. 
- competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie kennen verschiedene Methoden zur Berechnung der Textkomplexität und können die Vor- und Nachteile der Methoden aufzeigen. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie können das Konzept eines Liniendiagramms erklären. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie können die Konzeption der Analyse beschreiben und andere Möglichkeiten des Korpus-Splitting entwerfen. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - learning-objective: Sie können das Konzept eines Balkendiagramms erklären und das erstellte Diagramm interpretieren sowie die Gründe für Ihre Interpretation nennen. - competency: TODO - data-flow: TODO - blooms-category: TODO + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar - title: Reflexion url: https://quadriga-dk.github.io/Text-Fallstudie-2/reflection/reflection_reflection.html description: TODO learning-goal: TODO - duration: TODO - educational-level: TODO + time-required: TODO learning-objectives: - learning-objective: Die methodischen Limitationen einer Digital Humanities-Fallstudie können benannt werden. 
- competency: TODO - data-flow: TODO - blooms-category: TODO -target-group: -- Forschende (PostDoc) -- Forschende (Projektleitung) -- Promovierende -- Hochschullehrende -duration: 10h -date-published: '2024-06-17' -date-modified: '2025-06-20' + competency: nicht anwendbar + data-flow: nicht anwendbar + blooms-category: nicht anwendbar +date-issued: '2024-06-17' +date-modified: '2026-02-19' +version: 1.0.0-beta context-of-creation: 'Die vorliegenden Open Educational Resources wurden durch das Datenkompetenzzentrum QUADRIGA erstellt. Förderkennzeichen: 16DKZ2034' -url: https://quadriga-dk.github.io/Text-Fallstudie-2/ -git: https://github.com/quadriga-dk/Text-Fallstudie-2 +quality-assurance: + description: https://doi.org/TODO + date: '2026-11-14' +learning-resource-type: Jupyter Book +schema-version: 1.0.0 license: code: https://opensource.org/licenses/AGPL-3.0 content: url: https://creativecommons.org/licenses/by-sa/4.0/ name: CC BY-SA 4.0 -keywords: -- Open Educational Resource diff --git a/quadriga/metadata/__init__.py b/quadriga/metadata/__init__.py index 8710fbacb..fc1ab8d71 100644 --- a/quadriga/metadata/__init__.py +++ b/quadriga/metadata/__init__.py @@ -7,15 +7,17 @@ __all__ = [ "create_bibtex", - "update_citation_cff", "extract_from_book_config", + "update_citation_cff", "update_version_from_tag", "utils", ] # Import the modules to make their functions available -from . import create_bibtex -from . import update_citation_cff -from . import extract_from_book_config -from . import update_version_from_tag -from . import utils +from . 
import ( + create_bibtex, + extract_from_book_config, + update_citation_cff, + update_version_from_tag, + utils, +) diff --git a/quadriga/metadata/create_bibtex.py b/quadriga/metadata/create_bibtex.py index 09a083d22..aa960ac98 100644 --- a/quadriga/metadata/create_bibtex.py +++ b/quadriga/metadata/create_bibtex.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import logging import sys -from pathlib import Path from .utils import ( extract_keywords, @@ -11,6 +12,7 @@ ) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) # Map CFF types to BibTeX entry types CFF_TO_BIBTEX_TYPES = { @@ -70,14 +72,15 @@ } -def create_bibtex_from_cff(): +def create_bibtex_from_cff() -> bool | None: """ - Creates a CITATION.bib file from CITATION.cff. + Create a CITATION.bib file from CITATION.cff. - It reads citation data, prioritizing the 'preferred-citation' block if available, + Reads citation data, prioritizing the 'preferred-citation' block if available, formats authors, generates a citation key, and constructs a BibTeX entry. - Returns: + Returns + ------- bool: True if successful, False otherwise. 
""" try: @@ -86,28 +89,31 @@ def create_bibtex_from_cff(): repo_root = get_file_path("") # Get repo root citation_cff_path = get_file_path("CITATION.cff", repo_root) citation_bib_path = get_file_path("CITATION.bib", repo_root) - except Exception as e: - logging.error(f"Failed to resolve file paths: {str(e)}") + except Exception: + logger.exception("Failed to resolve file paths") return False # Check if citation_cff_path exists - if not Path(citation_cff_path).exists(): - logging.error(f"CITATION.cff file not found at {citation_cff_path}") + if not citation_cff_path.exists(): + logger.error("CITATION.cff file not found at %s", citation_cff_path) return False # Read CITATION.cff using utility function citation_data = load_yaml_file(citation_cff_path) - if not citation_data: - logging.error(f"Could not load {citation_cff_path}. Exiting.") + if not citation_data or not isinstance(citation_data, dict): + logger.error("Could not load CITATION.cff or invalid format. Exiting.") return False # Extract data from preferred-citation or root if "preferred-citation" in citation_data: - logging.info("Using 'preferred-citation' section from CITATION.cff") + logger.info("Using 'preferred-citation' section from CITATION.cff") pref = citation_data.get("preferred-citation") + if not isinstance(pref, dict): + logger.error("preferred-citation is not a dictionary") + return False else: - logging.info("No 'preferred-citation' section found, using root data") + logger.info("No 'preferred-citation' section found, using root data") pref = citation_data # Validate required fields @@ -116,19 +122,19 @@ def create_bibtex_from_cff(): year = str(pref.get("year", "")) # Ensure year is a string for generate_citation_key if not authors: - logging.warning("No authors found in CITATION.cff") + logger.warning("No authors found in CITATION.cff") if title == "Untitled": - logging.warning("No title found in CITATION.cff, using 'Untitled'") + logger.warning("No title found in CITATION.cff, using 
'Untitled'") if not year: - logging.warning("No year found in CITATION.cff") + logger.warning("No year found in CITATION.cff") # Use utility function to format authors try: author_str = format_authors_for_bibtex(authors) - except Exception as e: - logging.error(f"Error formatting authors: {str(e)}") + except Exception: + logger.exception("Error formatting authors") author_str = "" # Choose entry type based on type field @@ -141,19 +147,19 @@ def create_bibtex_from_cff(): if entry_type == "thesis": # Check for thesis type information thesis_type = pref.get("thesis-type", "").lower() - if thesis_type == "master" or thesis_type == "masters" or thesis_type == "master's": + if thesis_type in {"master", "masters", "master's"}: entry_type = "mastersthesis" else: # Default to phdthesis if type is not specified or is something else entry_type = "phdthesis" - logging.info(f"Converting CFF type '{cff_type}' to BibTeX entry type: {entry_type}") + logger.info("Converting CFF type '%s' to BibTeX entry type: %s", cff_type, entry_type) # Use utility function to generate citation key try: citation_key = generate_citation_key(authors, title, year) - except Exception as e: - logging.error(f"Error generating citation key: {str(e)}") + except Exception: + logger.exception("Error generating citation key") citation_key = "Unknown_Citation_Key" # Compile BibTeX entry @@ -254,8 +260,8 @@ def create_bibtex_from_cff(): try: editor_str = format_authors_for_bibtex(pref["collection-editors"]) bibtex_lines.append(f" editor = {{{editor_str}}},") - except Exception as e: - logging.warning(f"Error formatting collection editors: {str(e)}") + except (KeyError, TypeError, AttributeError) as e: + logger.warning("Error formatting collection editors: %s", e) # Special handling for software, code, data entries if cff_type.lower().startswith("software") or cff_type.lower() in [ @@ -269,9 +275,7 @@ def create_bibtex_from_cff(): if "repository-code" in pref and "note" not in pref: bibtex_lines.append(f" note 
= {{Repository: {pref['repository-code']}}},") - # Add version info - if "version" in pref: - bibtex_lines.append(f" version = {{{pref['version']}}},") + # Note: version is already added in the common fields section above # Add software-specific details as howpublished if not present if ("howpublished" not in pref) and ("repository-code" in pref or "url" in pref): @@ -299,22 +303,22 @@ def create_bibtex_from_cff(): bibtex_lines.append(f" {field:<9} = {{{field_value}}},") # Handle list fields like languages - if "languages" in pref and pref["languages"]: + if pref.get("languages"): try: languages_str = ", ".join(pref["languages"]) bibtex_lines.append(f" language = {{{languages_str}}},") - except Exception as e: - logging.warning(f"Error processing languages field: {str(e)}") + except (TypeError, AttributeError) as e: + logger.warning("Error processing languages field: %s", e) # Handle keywords field - if "keywords" in pref and pref["keywords"]: + if pref.get("keywords"): try: keywords_list = extract_keywords(pref["keywords"]) if keywords_list: keywords_str = ", ".join(keywords_list) bibtex_lines.append(f" keywords = {{{keywords_str}}},") - except Exception as e: - logging.warning(f"Error processing keywords field: {str(e)}") + except (TypeError, AttributeError) as e: + logger.warning("Error processing keywords field: %s", e) # Close the entry bibtex_lines.append("}") @@ -322,16 +326,17 @@ def create_bibtex_from_cff(): # Write to CITATION.bib try: - with open(citation_bib_path, "w", encoding="utf-8") as f: + with citation_bib_path.open("w", encoding="utf-8") as f: f.write(bibtex) - logging.info(f"BibTeX citation successfully created at {citation_bib_path}") - return True - except IOError as e: - logging.error(f"Error writing to {citation_bib_path}: {e}") + except OSError: + logger.exception("Error writing to %s", citation_bib_path) return False + else: + logger.info("BibTeX citation successfully created at %s", citation_bib_path) + return True - except Exception as 
e: - logging.exception(f"Unexpected error in create_bibtex_from_cff: {str(e)}") + except Exception: + logger.exception("Unexpected error in create_bibtex_from_cff") return False diff --git a/quadriga/metadata/create_jsonld.py b/quadriga/metadata/create_jsonld.py new file mode 100644 index 000000000..e7b8017b5 --- /dev/null +++ b/quadriga/metadata/create_jsonld.py @@ -0,0 +1,595 @@ +""" +Creates a JSON-LD file from metadata.yml using QUADRIGA schema x-mappings. + +This script reads metadata from 'metadata.yml' and transforms it into JSON-LD +format using the x-mappings defined in the QUADRIGA schema. The output follows +Schema.org, Dublin Core, LRMI, and other standard vocabularies. + +The JSON-LD file provides machine-readable linked data that can be consumed by +search engines, digital repositories, and other semantic web applications. +""" + +from __future__ import annotations + +import json +import logging +import sys +from pathlib import Path +from typing import Any + +from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) + + +def build_jsonld_context() -> dict[str, str]: + """ + Build the JSON-LD @context with vocabulary namespaces. + + Returns + ------- + dict: Context dictionary with vocabulary prefixes + """ + return { + "schema": "http://schema.org/", + "dc": "http://purl.org/dc/elements/1.1/", + "dcterms": "http://purl.org/dc/terms/", + "lrmi": "http://purl.org/dcx/lrmi-terms/", + "skos": "http://www.w3.org/2004/02/skos/core#", + "@vocab": "http://schema.org/", + } + + +def clean_orcid(orcid_string: str) -> str | None: + """ + Extract ORCID identifier from an ORCID string or URL. 
+ + Args: + orcid_string (str): ORCID string which may include URL prefix + + Returns + ------- + str: Clean ORCID identifier (e.g., "0000-0002-1602-6032") + """ + if not orcid_string: + return None + + orcid = str(orcid_string) + prefixes = ["https://orcid.org/", "http://orcid.org/", "orcid:"] + for prefix in prefixes: + if orcid.startswith(prefix): + orcid = orcid[len(prefix) :] + break + + return orcid.strip() + + +def clean_doi(doi_string: str) -> str | None: + """ + Extract DOI identifier from a DOI string or URL. + + Args: + doi_string (str): DOI string which may include URL prefix + + Returns + ------- + str: Clean DOI identifier (e.g., "10.5281/zenodo.14970672") + """ + if not doi_string: + return None + + doi = str(doi_string) + prefixes = ["https://doi.org/", "http://doi.org/", "doi:"] + for prefix in prefixes: + if doi.startswith(prefix): + doi = doi[len(prefix) :] + break + + return doi.strip() + + +def transform_person(person_data: Any) -> dict[str, Any]: + """ + Transform author or contributor to Schema.org Person. 
+ + Uses x-mappings: + - author/contributor -> schema:Person + - given-names -> schema:givenName + - family-names -> schema:familyName + - orcid -> schema:identifier + - affiliation -> schema:affiliation + - credit -> not included (no standard schema.org mapping for CRediT roles) + + Args: + person_data (dict): Author or contributor dictionary + + Returns + ------- + dict: Schema.org Person object + """ + if not isinstance(person_data, dict): + logger.warning("Invalid person data: %s", person_data) + return {} + + person: dict[str, Any] = {"@type": "Person"} + + # given-names -> schema:givenName (exactMatch) + if "given-names" in person_data: + person["givenName"] = person_data["given-names"] + + # family-names -> schema:familyName (exactMatch) + if "family-names" in person_data: + person["familyName"] = person_data["family-names"] + + # Construct full name + if "given-names" in person_data or "family-names" in person_data: + given = person_data.get("given-names", "") + family = person_data.get("family-names", "") + person["name"] = f"{given} {family}".strip() + + # orcid -> schema:identifier (exactMatch) + if "orcid" in person_data: + clean_orcid_id = clean_orcid(person_data["orcid"]) + if clean_orcid_id: + person["identifier"] = { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": clean_orcid_id, + "url": f"https://orcid.org/{clean_orcid_id}", + } + + # affiliation -> schema:affiliation (mapped in both author and contributor) + if "affiliation" in person_data: + person["affiliation"] = { + "@type": "Organization", + "name": person_data["affiliation"], + } + + # Note: CRediT roles (credit field) are not included in JSON-LD + # because schema.org does not have a standard property for contributor roles + # on Person objects within author/contributor arrays + + return person + + +def transform_learning_objective(objective_data: Any) -> dict[str, Any]: + """ + Transform learning objective entry to AlignmentObject. 
+ + Uses x-mappings: + - learning-objective -> schema:teaches / lrmi:teaches (closeMatch) + - competency -> maps to modalia:Skill + - blooms-category -> part of educational alignment + - assessment -> lrmi:assesses / schema:assesses (closeMatch) + + Args: + objective_data (dict): Learning objective dictionary + + Returns + ------- + dict: Schema.org AlignmentObject + """ + if not isinstance(objective_data, dict): + return {} + + objective = { + "@type": "AlignmentObject", + } + + # learning-objective text + if "learning-objective" in objective_data: + objective["targetName"] = objective_data["learning-objective"] + + # Add competency framework information if available + if "competency" in objective_data: + objective["educationalFramework"] = "QUADRIGA Competency Framework" + objective["targetDescription"] = f"Competency: {objective_data['competency']}" + + # Add Bloom's taxonomy level if available + if "blooms-category" in objective_data: + if "targetDescription" in objective: + objective["targetDescription"] += f" | Bloom's: {objective_data['blooms-category']}" + else: + objective["targetDescription"] = f"Bloom's: {objective_data['blooms-category']}" + + # Add data flow if available + if "data-flow" in objective_data: + if "targetDescription" in objective: + objective["targetDescription"] += f" | Data Flow: {objective_data['data-flow']}" + else: + objective["targetDescription"] = f"Data Flow: {objective_data['data-flow']}" + + # assessment -> lrmi:assesses / schema:assesses (closeMatch) + if "assessment" in objective_data: + objective["lrmi:assesses"] = objective_data["assessment"] + + return objective + + +def transform_chapter(chapter_data: Any) -> dict[str, Any]: + """ + Transform chapter to Schema.org/LRMI LearningResource. 
+ + Uses x-mappings: + - chapter -> schema:LearningResource / lrmi:LearningResource (exactMatch) + - title -> schema:name (exactMatch) + - description -> schema:description (exactMatch) + - url -> schema:url (exactMatch) + - time-required -> schema:timeRequired (exactMatch) + - learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + - learning-objectives -> schema:teaches / lrmi:teaches (closeMatch) + + Args: + chapter_data (dict): Chapter dictionary + + Returns + ------- + dict: Schema.org LearningResource object + """ + if not isinstance(chapter_data, dict): + return {} + + chapter: dict[str, Any] = { + "@type": "LearningResource", + } + + # title -> schema:name (exactMatch) + if "title" in chapter_data: + chapter["name"] = chapter_data["title"] + + # description -> schema:description (exactMatch) + if "description" in chapter_data: + chapter["description"] = chapter_data["description"] + + # url -> schema:url (exactMatch) + if "url" in chapter_data: + chapter["url"] = chapter_data["url"] + + # time-required -> schema:timeRequired (exactMatch) + if "time-required" in chapter_data: + chapter["timeRequired"] = chapter_data["time-required"] + + # learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + if "learning-goal" in chapter_data: + chapter["teaches"] = chapter_data["learning-goal"] + + # learning-objectives -> schema:teaches / lrmi:teaches (closeMatch) + # Map to AlignmentObject for structured representation + if chapter_data.get("learning-objectives"): + objectives = [] + for obj in chapter_data["learning-objectives"]: + transformed = transform_learning_objective(obj) + if transformed and len(transformed) > 1: # More than just @type + objectives.append(transformed) + + if objectives: + chapter["educationalAlignment"] = objectives + + # language -> schema:inLanguage (exactMatch) + # Chapter-level language override (supports single language or array) + if "language" in chapter_data: + chapter["inLanguage"] = chapter_data["language"] + + return 
chapter + + +def transform_license(license_data: Any) -> dict | list | str | None: + """ + Transform license information to Schema.org license. + + The QUADRIGA schema has separate licenses for code and content. + Uses x-mappings: + - license -> schema:license (exactMatch) + + Args: + license_data: License dictionary or string + + Returns + ------- + dict or list: Schema.org license representation + """ + if not license_data: + return None + + # Handle string license (simple case) + if isinstance(license_data, str): + return license_data + + # Handle complex license structure (code vs content) + if isinstance(license_data, dict): + licenses = [] + + # Code license + if "code" in license_data: + code_license = { + "@type": "CreativeWork", + "name": "Source Code", + "license": license_data["code"], + } + licenses.append(code_license) + + # Content license + if "content" in license_data: + content_license_data = license_data["content"] + if isinstance(content_license_data, dict): + content_license = { + "@type": "CreativeWork", + "name": "Content", + } + if "url" in content_license_data: + content_license["license"] = content_license_data["url"] + # Note: licenseName is not a valid schema.org property + # The license URL should be sufficient for identification + licenses.append(content_license) + elif isinstance(content_license_data, str): + content_license = { + "@type": "CreativeWork", + "name": "Content", + "license": content_license_data, + } + licenses.append(content_license) + + return licenses if len(licenses) > 1 else licenses[0] if licenses else None + + return None + + +def create_jsonld() -> bool | None: + """ + Create a metadata.jsonld file from metadata.yml using QUADRIGA schema x-mappings. + + The function reads metadata from metadata.yml and transforms it into JSON-LD + format using the x-mappings defined in the QUADRIGA schema. 
The output uses + Schema.org as the primary vocabulary, with additional terms from Dublin Core, + LRMI (Learning Resource Metadata Initiative), and other standards. + + Returns + ------- + bool: True if successful, False otherwise. + """ + try: + # Define file paths + try: + repo_root = get_repo_root() + metadata_path = get_file_path("metadata.yml", repo_root) + jsonld_path = get_file_path("metadata.jsonld", repo_root) + except Exception: + logger.exception("Failed to resolve file paths") + return False + + # Check if metadata.yml exists + if not Path(metadata_path).exists(): + logger.error("metadata.yml file not found at %s", metadata_path) + return False + + # Load metadata.yml + metadata = load_yaml_file(metadata_path) + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") + return False + + # Build JSON-LD structure + jsonld: dict[str, Any] = { + "@context": build_jsonld_context(), + "@type": ["Book", "LearningResource"], + } + + # ===== BASIC METADATA ===== + + # title -> schema:name (exactMatch) + if "title" in metadata: + jsonld["name"] = metadata["title"] + logger.info("Added title: %s", metadata["title"]) + else: + logger.warning("No title found in metadata.yml") + + # description -> schema:description (exactMatch) + if "description" in metadata: + jsonld["description"] = metadata["description"] + logger.info("Added description") + + # identifier (DOI) -> schema:identifier (exactMatch) + if "identifier" in metadata: + clean_doi_id = clean_doi(metadata["identifier"]) + if clean_doi_id: + jsonld["identifier"] = { + "@type": "PropertyValue", + "propertyID": "DOI", + "value": clean_doi_id, + "url": metadata["identifier"], + } + logger.info("Added DOI identifier: %s", clean_doi_id) + + # version -> schema:version (exactMatch) + if "version" in metadata: + jsonld["version"] = str(metadata["version"]) + logger.info("Added version: %s", metadata["version"]) + + # schema-version -> 
schema:schemaVersion + if "schema-version" in metadata: + jsonld["schemaVersion"] = str(metadata["schema-version"]) + logger.info("Added schema version: %s", metadata["schema-version"]) + + # url -> schema:url (exactMatch) + if "url" in metadata: + jsonld["url"] = metadata["url"] + logger.info("Added URL: %s", metadata["url"]) + + # git -> schema:workExample as SoftwareSourceCode + # codeRepository is not valid for Book type, so we link to source code as a workExample + if "git" in metadata: + jsonld["workExample"] = { + "@type": "SoftwareSourceCode", + "name": "Source Code Repository", + "codeRepository": metadata["git"], + } + logger.info("Added code repository as workExample: %s", metadata["git"]) + + # ===== DATES ===== + + # date-issued -> schema:datePublished (exactMatch) + if "date-issued" in metadata: + # Handle both date objects and strings + date_value = metadata["date-issued"] + if hasattr(date_value, "isoformat"): + jsonld["datePublished"] = date_value.isoformat() + else: + jsonld["datePublished"] = str(date_value) + logger.info("Added datePublished: %s", jsonld["datePublished"]) + + # date-modified -> schema:dateModified (exactMatch) + if "date-modified" in metadata: + date_value = metadata["date-modified"] + if hasattr(date_value, "isoformat"): + jsonld["dateModified"] = date_value.isoformat() + else: + jsonld["dateModified"] = str(date_value) + logger.info("Added dateModified: %s", jsonld["dateModified"]) + + # ===== PEOPLE ===== + + # authors -> schema:author (exactMatch) + if metadata.get("authors"): + authors = [] + for author in metadata["authors"]: + person = transform_person(author) + if person and len(person) > 1: # More than just @type + authors.append(person) + if authors: + jsonld["author"] = authors + logger.info("Added %d authors", len(authors)) + else: + logger.warning("No authors found in metadata.yml") + + # contributors -> schema:contributor (exactMatch) + if metadata.get("contributors"): + contributors = [] + for contributor in 
metadata["contributors"]: + person = transform_person(contributor) + if person and len(person) > 1: # More than just @type + contributors.append(person) + if contributors: + jsonld["contributor"] = contributors + logger.info("Added %d contributors", len(contributors)) + + # ===== LANGUAGE & KEYWORDS ===== + + # language -> schema:inLanguage (exactMatch) + # Supports both single language (string) and multiple languages (array) + if "language" in metadata: + language_value = metadata["language"] + # If it's already a list, use it as-is + # If it's a single string, use it as-is (Schema.org supports both) + jsonld["inLanguage"] = language_value + if isinstance(language_value, list): + logger.info("Added languages: %s", ", ".join(language_value)) + else: + logger.info("Added language: %s", language_value) + + # keywords -> schema:keywords (exactMatch) and schema:about (closeMatch) + if metadata.get("keywords"): + keywords_list = extract_keywords(metadata["keywords"]) + if keywords_list: + jsonld["keywords"] = keywords_list + # Also add as 'about' for closeMatch mapping + jsonld["about"] = [{"@type": "Thing", "name": kw} for kw in keywords_list] + logger.info("Added %d keywords", len(keywords_list)) + + # ===== EDUCATIONAL METADATA ===== + + # discipline -> schema:about (closeMatch) and modalia:Discipline (exactMatch) + if metadata.get("discipline"): + if "about" not in jsonld: + jsonld["about"] = [] + for disc in metadata["discipline"]: + jsonld["about"].append({"@type": "Thing", "name": disc}) + logger.info("Added %d disciplines", len(metadata["discipline"])) + + # research-object-type -> schema:about (broadMatch) + if metadata.get("research-object-type"): + if "about" not in jsonld: + jsonld["about"] = [] + for obj_type in metadata["research-object-type"]: + jsonld["about"].append({"@type": "Thing", "name": obj_type}) + logger.info("Added %d research object types", len(metadata["research-object-type"])) + + # target-group -> schema:audience (closeMatch) and 
lrmi:educationalAudience (closeMatch) + if metadata.get("target-group"): + jsonld["audience"] = [ + {"@type": "Audience", "audienceType": group} for group in metadata["target-group"] + ] + logger.info("Added %d target groups", len(jsonld["audience"])) + + # time-required -> schema:timeRequired (exactMatch) + if "time-required" in metadata: + jsonld["timeRequired"] = metadata["time-required"] + logger.info("Added time required: %s", metadata["time-required"]) + + # ===== LICENSE ===== + + # license -> schema:license (exactMatch) + if "license" in metadata: + license_data = transform_license(metadata["license"]) + if license_data: + jsonld["license"] = license_data + logger.info("Added license information") + + # ===== CHAPTERS (hasPart) ===== + + # chapters -> schema:hasPart (closeMatch) + if metadata.get("chapters"): + parts = [] + for chapter in metadata["chapters"]: + chapter_obj = transform_chapter(chapter) + if chapter_obj and len(chapter_obj) > 1: # More than just @type + parts.append(chapter_obj) + if parts: + jsonld["hasPart"] = parts + logger.info("Added %d chapters", len(parts)) + + # table-of-contents -> dcterms:tableOfContents (exactMatch) + if "table-of-contents" in metadata: + jsonld["dcterms:tableOfContents"] = metadata["table-of-contents"] + logger.info("Added table of contents") + + # ===== ADDITIONAL METADATA ===== + + # context-of-creation -> modalia:Community (closeMatch) + if "context-of-creation" in metadata: + jsonld["funding"] = metadata["context-of-creation"] + logger.info("Added context of creation") + + # learning-resource-type -> schema:learningResourceType (closeMatch) + # -> lrmi:learningResourceType (closeMatch) + # -> dcterms:type (broadMatch) + # -> dc:type (broadMatch) + if "learning-resource-type" in metadata: + jsonld["learningResourceType"] = metadata["learning-resource-type"] + jsonld["lrmi:learningResourceType"] = metadata["learning-resource-type"] + jsonld["dcterms:type"] = metadata["learning-resource-type"] + 
jsonld["dc:type"] = metadata["learning-resource-type"] + logger.info("Added learning resource type: %s", metadata["learning-resource-type"]) + + # quality-assurance: not mapped to JSON-LD + # All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output + + # Write JSON-LD file + try: + with jsonld_path.open("w", encoding="utf-8") as f: + json.dump(jsonld, f, ensure_ascii=False, indent=2) + except OSError: + logger.exception("Error writing to %s", jsonld_path) + return False + else: + logger.info("JSON-LD metadata successfully created at %s", jsonld_path) + return True + + except Exception: + logger.exception("Unexpected error in create_jsonld") + return False + + +if __name__ == "__main__": + success = create_jsonld() + sys.exit(0 if success else 1) diff --git a/quadriga/metadata/create_rdfxml.py b/quadriga/metadata/create_rdfxml.py new file mode 100644 index 000000000..e07525676 --- /dev/null +++ b/quadriga/metadata/create_rdfxml.py @@ -0,0 +1,642 @@ +""" +Creates an RDF/XML file from metadata.yml using QUADRIGA schema x-mappings. + +This script reads metadata from 'metadata.yml' and transforms it into RDF/XML +format using the x-mappings defined in the QUADRIGA schema. The output follows +Schema.org, Dublin Core, LRMI, and other standard vocabularies. + +The RDF/XML file provides machine-readable linked data that can be consumed by +semantic web applications, triple stores, and other RDF-aware systems. 
+""" + +from __future__ import annotations + +import logging +import sys +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any + +from rdflib import RDF, Graph, Literal, Namespace, URIRef # type: ignore[import-not-found] +from rdflib.namespace import DCTERMS, SKOS, XSD # type: ignore[import-not-found] + +from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) + +# Define namespaces +SCHEMA = Namespace("http://schema.org/") +DC = Namespace("http://purl.org/dc/elements/1.1/") +LRMI = Namespace("http://purl.org/dcx/lrmi-terms/") + + +def _sort_xml_element(element: ET.Element) -> None: + """ + Recursively sort child elements for deterministic XML output. + + Sorts by tag name, then by attributes (as sorted key-value pairs), + then by text content. This ensures identical output regardless of + Python's hash randomization (PYTHONHASHSEED). + + Args: + element: XML element whose children will be sorted in-place + """ + children = list(element) + for child in children: + _sort_xml_element(child) + children.sort(key=lambda e: (e.tag, sorted(e.attrib.items()), e.text or "")) + element[:] = children + + +def clean_orcid(orcid_string: str) -> str | None: + """ + Extract ORCID identifier from an ORCID string or URL. + + Args: + orcid_string (str): ORCID string which may include URL prefix + + Returns + ------- + str: Clean ORCID identifier (e.g., "0000-0002-1602-6032") + """ + if not orcid_string: + return None + + orcid = str(orcid_string) + prefixes = ["https://orcid.org/", "http://orcid.org/", "orcid:"] + for prefix in prefixes: + if orcid.startswith(prefix): + orcid = orcid[len(prefix) :] + break + + return orcid.strip() + + +def clean_doi(doi_string: str) -> str | None: + """ + Extract DOI identifier from a DOI string or URL. 
+ + Args: + doi_string (str): DOI string which may include URL prefix + + Returns + ------- + str: Clean DOI identifier (e.g., "10.5281/zenodo.14970672") + """ + if not doi_string: + return None + + doi = str(doi_string) + prefixes = ["https://doi.org/", "http://doi.org/", "doi:"] + for prefix in prefixes: + if doi.startswith(prefix): + doi = doi[len(prefix) :] + break + + return doi.strip() + + +def add_person( + graph: Graph, person_data: Any, base_uri: str, person_type: str, index: int +) -> URIRef | None: + """ + Add a person (author or contributor) to the RDF graph. + + Uses x-mappings: + - author/contributor -> schema:Person + - given-names -> schema:givenName + - family-names -> schema:familyName + - orcid -> schema:identifier + - affiliation -> schema:affiliation + + Args: + graph: RDF graph to add triples to + person_data: Author or contributor dictionary + base_uri: Base URI for the resource + person_type: Type of person ('author' or 'contributor') + index: Index of the person in the list + + Returns + ------- + URIRef: URI of the person node, or None if invalid + """ + if not isinstance(person_data, dict): + logger.warning("Invalid person data: %s", person_data) + return None + + # Create person URI + person_uri = URIRef(f"{base_uri}#{person_type}_{index}") + graph.add((person_uri, RDF.type, SCHEMA.Person)) + + # given-names -> schema:givenName (exactMatch) + if "given-names" in person_data: + graph.add((person_uri, SCHEMA.givenName, Literal(person_data["given-names"]))) + + # family-names -> schema:familyName (exactMatch) + if "family-names" in person_data: + graph.add((person_uri, SCHEMA.familyName, Literal(person_data["family-names"]))) + + # Construct full name + if "given-names" in person_data or "family-names" in person_data: + given = person_data.get("given-names", "") + family = person_data.get("family-names", "") + full_name = f"{given} {family}".strip() + graph.add((person_uri, SCHEMA.name, Literal(full_name))) + + # orcid -> schema:identifier 
(exactMatch) + if "orcid" in person_data: + clean_orcid_id = clean_orcid(person_data["orcid"]) + if clean_orcid_id: + # Create PropertyValue node for ORCID + orcid_node = URIRef(f"{base_uri}#{person_type}_{index}_orcid") + graph.add((orcid_node, RDF.type, SCHEMA.PropertyValue)) + graph.add((orcid_node, SCHEMA.propertyID, Literal("ORCID"))) + graph.add((orcid_node, SCHEMA.value, Literal(clean_orcid_id))) + graph.add( + (orcid_node, SCHEMA.url, URIRef(f"https://orcid.org/{clean_orcid_id}")) + ) + graph.add((person_uri, SCHEMA.identifier, orcid_node)) + + # affiliation -> schema:affiliation (mapped in both author and contributor) + if "affiliation" in person_data: + # Create Organization node + org_node = URIRef(f"{base_uri}#{person_type}_{index}_org") + graph.add((org_node, RDF.type, SCHEMA.Organization)) + graph.add((org_node, SCHEMA.name, Literal(person_data["affiliation"]))) + graph.add((person_uri, SCHEMA.affiliation, org_node)) + + # Note: CRediT roles (credit field) are not included in RDF + # because schema.org does not have a standard property for contributor roles + + return person_uri + + +def add_learning_objective( + graph: Graph, objective_data: Any, base_uri: str, chapter_index: int, obj_index: int +) -> URIRef | None: + """ + Add a learning objective to the RDF graph as an AlignmentObject. 
+ + Uses x-mappings: + - learning-objective -> schema:teaches / lrmi:teaches (closeMatch) + - competency -> maps to modalia:Skill + - blooms-category -> part of educational alignment + - assessment -> lrmi:assesses / schema:assesses (closeMatch) + + Args: + graph: RDF graph to add triples to + objective_data: Learning objective dictionary + base_uri: Base URI for the resource + chapter_index: Index of the chapter + obj_index: Index of the objective + + Returns + ------- + URIRef: URI of the alignment object node, or None if invalid + """ + if not isinstance(objective_data, dict): + return None + + obj_uri = URIRef(f"{base_uri}#chapter_{chapter_index}_objective_{obj_index}") + graph.add((obj_uri, RDF.type, SCHEMA.AlignmentObject)) + + # learning-objective text + if "learning-objective" in objective_data: + graph.add((obj_uri, SCHEMA.targetName, Literal(objective_data["learning-objective"]))) + + # Add competency framework information + descriptions = [] + if "competency" in objective_data: + graph.add( + ( + obj_uri, + SCHEMA.educationalFramework, + Literal("QUADRIGA Competency Framework"), + ) + ) + descriptions.append(f"Competency: {objective_data['competency']}") + + # Add Bloom's taxonomy level + if "blooms-category" in objective_data: + descriptions.append(f"Bloom's: {objective_data['blooms-category']}") + + # Add data flow + if "data-flow" in objective_data: + descriptions.append(f"Data Flow: {objective_data['data-flow']}") + + # Combine descriptions + if descriptions: + graph.add((obj_uri, SCHEMA.targetDescription, Literal(" | ".join(descriptions)))) + + # assessment -> lrmi:assesses (closeMatch) + if "assessment" in objective_data: + graph.add((obj_uri, LRMI.assesses, Literal(objective_data["assessment"]))) + + return obj_uri + + +def add_chapter( + graph: Graph, chapter_data: Any, base_uri: str, chapter_index: int +) -> URIRef | None: + """ + Add a chapter to the RDF graph as a LearningResource. 
+ + Uses x-mappings: + - chapter -> schema:LearningResource / lrmi:LearningResource (exactMatch) + - title -> schema:name (exactMatch) + - description -> schema:description (exactMatch) + - url -> schema:url (exactMatch) + - time-required -> schema:timeRequired (exactMatch) + - learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + - learning-objectives -> schema:teaches / lrmi:teaches (closeMatch) + + Args: + graph: RDF graph to add triples to + chapter_data: Chapter dictionary + base_uri: Base URI for the resource + chapter_index: Index of the chapter + + Returns + ------- + URIRef: URI of the chapter node, or None if invalid + """ + if not isinstance(chapter_data, dict): + return None + + chapter_uri = URIRef(f"{base_uri}#chapter_{chapter_index}") + graph.add((chapter_uri, RDF.type, SCHEMA.LearningResource)) + + # title -> schema:name (exactMatch) + if "title" in chapter_data: + graph.add((chapter_uri, SCHEMA.name, Literal(chapter_data["title"]))) + + # description -> schema:description (exactMatch) + if "description" in chapter_data: + graph.add((chapter_uri, SCHEMA.description, Literal(chapter_data["description"]))) + + # url -> schema:url (exactMatch) + if "url" in chapter_data: + graph.add((chapter_uri, SCHEMA.url, URIRef(chapter_data["url"]))) + + # time-required -> schema:timeRequired (exactMatch) + if "time-required" in chapter_data: + graph.add((chapter_uri, SCHEMA.timeRequired, Literal(chapter_data["time-required"]))) + + # learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + if "learning-goal" in chapter_data: + graph.add((chapter_uri, SCHEMA.teaches, Literal(chapter_data["learning-goal"]))) + + # learning-objectives -> educationalAlignment with AlignmentObject + if chapter_data.get("learning-objectives"): + for obj_index, obj_data in enumerate(chapter_data["learning-objectives"]): + obj_uri = add_learning_objective( + graph, obj_data, base_uri, chapter_index, obj_index + ) + if obj_uri: + graph.add((chapter_uri, 
SCHEMA.educationalAlignment, obj_uri)) + + # language -> schema:inLanguage (exactMatch) + if "language" in chapter_data: + language_value = chapter_data["language"] + if isinstance(language_value, list): + for lang in language_value: + graph.add((chapter_uri, SCHEMA.inLanguage, Literal(lang))) + else: + graph.add((chapter_uri, SCHEMA.inLanguage, Literal(language_value))) + + return chapter_uri + + +def create_rdfxml() -> bool | None: + """ + Create a metadata.rdf file from metadata.yml using QUADRIGA schema x-mappings. + + The function reads metadata from metadata.yml and transforms it into RDF/XML + format using the x-mappings defined in the QUADRIGA schema. The output uses + Schema.org as the primary vocabulary, with additional terms from Dublin Core, + LRMI (Learning Resource Metadata Initiative), and other standards. + + Returns + ------- + bool: True if successful, False otherwise. + """ + try: + # Define file paths + try: + repo_root = get_repo_root() + metadata_path = get_file_path("metadata.yml", repo_root) + rdf_path = get_file_path("metadata.rdf", repo_root) + except Exception: + logger.exception("Failed to resolve file paths") + return False + + # Check if metadata.yml exists + if not Path(metadata_path).exists(): + logger.error("metadata.yml file not found at %s", metadata_path) + return False + + # Load metadata.yml + metadata = load_yaml_file(metadata_path) + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. 
Exiting.") + return False + + # Create RDF graph + graph = Graph() + graph.bind("schema", SCHEMA) + graph.bind("dc", DC) + graph.bind("dcterms", DCTERMS) + graph.bind("lrmi", LRMI) + graph.bind("skos", SKOS) + + # Create base URI for the resource + base_uri = metadata.get("url", metadata.get("identifier", "http://example.org/book")) + if isinstance(base_uri, str) and not base_uri.startswith("http"): + base_uri = f"http://example.org/{base_uri}" + + resource_uri = URIRef(base_uri) + + # Add types: Book and LearningResource + graph.add((resource_uri, RDF.type, SCHEMA.Book)) + graph.add((resource_uri, RDF.type, SCHEMA.LearningResource)) + + # ===== BASIC METADATA ===== + + # title -> schema:name (exactMatch) + if "title" in metadata: + graph.add((resource_uri, SCHEMA.name, Literal(metadata["title"]))) + logger.info("Added title: %s", metadata["title"]) + else: + logger.warning("No title found in metadata.yml") + + # description -> schema:description (exactMatch) + if "description" in metadata: + graph.add((resource_uri, SCHEMA.description, Literal(metadata["description"]))) + logger.info("Added description") + + # identifier (DOI) -> schema:identifier (exactMatch) + if "identifier" in metadata: + clean_doi_id = clean_doi(metadata["identifier"]) + if clean_doi_id: + # Create PropertyValue node for DOI + doi_node = URIRef(f"{base_uri}#doi") + graph.add((doi_node, RDF.type, SCHEMA.PropertyValue)) + graph.add((doi_node, SCHEMA.propertyID, Literal("DOI"))) + graph.add((doi_node, SCHEMA.value, Literal(clean_doi_id))) + graph.add((doi_node, SCHEMA.url, URIRef(metadata["identifier"]))) + graph.add((resource_uri, SCHEMA.identifier, doi_node)) + logger.info("Added DOI identifier: %s", clean_doi_id) + + # version -> schema:version (exactMatch) + if "version" in metadata: + graph.add((resource_uri, SCHEMA.version, Literal(str(metadata["version"])))) + logger.info("Added version: %s", metadata["version"]) + + # schema-version -> schema:schemaVersion + if "schema-version" in 
metadata: + graph.add( + (resource_uri, SCHEMA.schemaVersion, Literal(str(metadata["schema-version"]))) + ) + logger.info("Added schema version: %s", metadata["schema-version"]) + + # url -> schema:url (exactMatch) + if "url" in metadata: + graph.add((resource_uri, SCHEMA.url, URIRef(metadata["url"]))) + logger.info("Added URL: %s", metadata["url"]) + + # git -> schema:workExample as SoftwareSourceCode + if "git" in metadata: + repo_node = URIRef(f"{base_uri}#repository") + graph.add((repo_node, RDF.type, SCHEMA.SoftwareSourceCode)) + graph.add((repo_node, SCHEMA.name, Literal("Source Code Repository"))) + graph.add((repo_node, SCHEMA.codeRepository, URIRef(metadata["git"]))) + graph.add((resource_uri, SCHEMA.workExample, repo_node)) + logger.info("Added code repository as workExample: %s", metadata["git"]) + + # ===== DATES ===== + + # date-issued -> schema:datePublished (exactMatch) + if "date-issued" in metadata: + date_value = metadata["date-issued"] + if hasattr(date_value, "isoformat"): + date_str = date_value.isoformat() + else: + date_str = str(date_value) + graph.add((resource_uri, SCHEMA.datePublished, Literal(date_str, datatype=XSD.date))) + logger.info("Added datePublished: %s", date_str) + + # date-modified -> schema:dateModified (exactMatch) + if "date-modified" in metadata: + date_value = metadata["date-modified"] + if hasattr(date_value, "isoformat"): + date_str = date_value.isoformat() + else: + date_str = str(date_value) + graph.add((resource_uri, SCHEMA.dateModified, Literal(date_str, datatype=XSD.date))) + logger.info("Added dateModified: %s", date_str) + + # ===== PEOPLE ===== + + # authors -> schema:author (exactMatch) + if metadata.get("authors"): + for i, author in enumerate(metadata["authors"]): + person_uri = add_person(graph, author, base_uri, "author", i) + if person_uri: + graph.add((resource_uri, SCHEMA.author, person_uri)) + logger.info("Added %d authors", len(metadata["authors"])) + else: + logger.warning("No authors found in 
metadata.yml") + + # contributors -> schema:contributor (exactMatch) + if metadata.get("contributors"): + for i, contributor in enumerate(metadata["contributors"]): + person_uri = add_person(graph, contributor, base_uri, "contributor", i) + if person_uri: + graph.add((resource_uri, SCHEMA.contributor, person_uri)) + logger.info("Added %d contributors", len(metadata["contributors"])) + + # ===== LANGUAGE & KEYWORDS ===== + + # language -> schema:inLanguage (exactMatch) + if "language" in metadata: + language_value = metadata["language"] + if isinstance(language_value, list): + for lang in language_value: + graph.add((resource_uri, SCHEMA.inLanguage, Literal(lang))) + logger.info("Added languages: %s", ", ".join(language_value)) + else: + graph.add((resource_uri, SCHEMA.inLanguage, Literal(language_value))) + logger.info("Added language: %s", language_value) + + # keywords -> schema:keywords (exactMatch) and schema:about (closeMatch) + if metadata.get("keywords"): + keywords_list = extract_keywords(metadata["keywords"]) + if keywords_list: + for keyword in keywords_list: + graph.add((resource_uri, SCHEMA.keywords, Literal(keyword))) + # Also add as 'about' for closeMatch mapping + keyword_node = URIRef(f"{base_uri}#keyword_{keywords_list.index(keyword)}") + graph.add((keyword_node, RDF.type, SCHEMA.Thing)) + graph.add((keyword_node, SCHEMA.name, Literal(keyword))) + graph.add((resource_uri, SCHEMA.about, keyword_node)) + logger.info("Added %d keywords", len(keywords_list)) + + # ===== EDUCATIONAL METADATA ===== + + # discipline -> schema:about (closeMatch) + if metadata.get("discipline"): + for i, disc in enumerate(metadata["discipline"]): + disc_node = URIRef(f"{base_uri}#discipline_{i}") + graph.add((disc_node, RDF.type, SCHEMA.Thing)) + graph.add((disc_node, SCHEMA.name, Literal(disc))) + graph.add((resource_uri, SCHEMA.about, disc_node)) + logger.info("Added %d disciplines", len(metadata["discipline"])) + + # research-object-type -> schema:about (broadMatch) + if 
metadata.get("research-object-type"): + for i, obj_type in enumerate(metadata["research-object-type"]): + obj_node = URIRef(f"{base_uri}#research_object_{i}") + graph.add((obj_node, RDF.type, SCHEMA.Thing)) + graph.add((obj_node, SCHEMA.name, Literal(obj_type))) + graph.add((resource_uri, SCHEMA.about, obj_node)) + logger.info("Added %d research object types", len(metadata["research-object-type"])) + + # target-group -> schema:audience (closeMatch) + if metadata.get("target-group"): + for i, group in enumerate(metadata["target-group"]): + audience_node = URIRef(f"{base_uri}#audience_{i}") + graph.add((audience_node, RDF.type, SCHEMA.Audience)) + graph.add((audience_node, SCHEMA.audienceType, Literal(group))) + graph.add((resource_uri, SCHEMA.audience, audience_node)) + logger.info("Added %d target groups", len(metadata["target-group"])) + + # time-required -> schema:timeRequired (exactMatch) + if "time-required" in metadata: + graph.add((resource_uri, SCHEMA.timeRequired, Literal(metadata["time-required"]))) + logger.info("Added time required: %s", metadata["time-required"]) + + # ===== LICENSE ===== + + # license -> schema:license (exactMatch) + if "license" in metadata: + license_data = metadata["license"] + if isinstance(license_data, str): + graph.add((resource_uri, SCHEMA.license, URIRef(license_data))) + elif isinstance(license_data, dict): + # Code license + if "code" in license_data: + code_license_node = URIRef(f"{base_uri}#license_code") + graph.add((code_license_node, RDF.type, SCHEMA.CreativeWork)) + graph.add((code_license_node, SCHEMA.name, Literal("Source Code"))) + graph.add((code_license_node, SCHEMA.license, URIRef(license_data["code"]))) + graph.add((resource_uri, SCHEMA.license, code_license_node)) + + # Content license + if "content" in license_data: + content_license_data = license_data["content"] + content_license_node = URIRef(f"{base_uri}#license_content") + graph.add((content_license_node, RDF.type, SCHEMA.CreativeWork)) + 
graph.add((content_license_node, SCHEMA.name, Literal("Content"))) + if isinstance(content_license_data, dict): + if "url" in content_license_data: + graph.add( + ( + content_license_node, + SCHEMA.license, + URIRef(content_license_data["url"]), + ) + ) + elif isinstance(content_license_data, str): + graph.add( + (content_license_node, SCHEMA.license, URIRef(content_license_data)) + ) + graph.add((resource_uri, SCHEMA.license, content_license_node)) + logger.info("Added license information") + + # ===== CHAPTERS (hasPart) ===== + + # chapters -> schema:hasPart (closeMatch) + if metadata.get("chapters"): + for i, chapter in enumerate(metadata["chapters"]): + chapter_uri = add_chapter(graph, chapter, base_uri, i) + if chapter_uri: + graph.add((resource_uri, SCHEMA.hasPart, chapter_uri)) + logger.info("Added %d chapters", len(metadata["chapters"])) + + # table-of-contents -> dcterms:tableOfContents (exactMatch) + if "table-of-contents" in metadata: + graph.add( + (resource_uri, DCTERMS.tableOfContents, Literal(metadata["table-of-contents"])) + ) + logger.info("Added table of contents") + + # ===== ADDITIONAL METADATA ===== + + # context-of-creation -> schema:funding (adapted mapping) + if "context-of-creation" in metadata: + graph.add((resource_uri, SCHEMA.funding, Literal(metadata["context-of-creation"]))) + logger.info("Added context of creation") + + # learning-resource-type -> schema:learningResourceType (closeMatch) + # -> lrmi:learningResourceType (closeMatch) + # -> dcterms:type (broadMatch) + # -> dc:type (broadMatch) + if "learning-resource-type" in metadata: + lrt = Literal(metadata["learning-resource-type"]) + graph.add((resource_uri, SCHEMA.learningResourceType, lrt)) + graph.add((resource_uri, LRMI.learningResourceType, lrt)) + graph.add((resource_uri, DCTERMS.type, lrt)) + graph.add((resource_uri, DC.type, lrt)) + logger.info("Added learning resource type: %s", metadata["learning-resource-type"]) + + # quality-assurance: not mapped to RDF + # All schema 
x-mappings are relatedMatch only — too loose for RDF/JSON-LD output + + # Serialize to RDF/XML and post-process for deterministic output. + # rdflib's pretty-xml serializer uses Python dicts internally, so element + # and namespace ordering varies across process invocations due to hash + # randomization. We sort the XML elements after serialization to guarantee + # reproducible output regardless of PYTHONHASHSEED. + logger.info("Serializing %d triples to RDF/XML...", len(graph)) + + try: + xml_bytes = graph.serialize(format="pretty-xml", encoding="utf-8") + xml_str = xml_bytes.decode("utf-8") if isinstance(xml_bytes, bytes) else xml_bytes + + # Register namespace prefixes so ElementTree preserves them + for prefix, uri in [ + ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"), + ("schema", str(SCHEMA)), + ("dc", str(DC)), + ("dcterms", str(DCTERMS)), + ("lrmi", str(LRMI)), + ("skos", str(SKOS)), + ]: + ET.register_namespace(prefix, uri) + + # Parse, sort elements recursively, and re-serialize + root = ET.fromstring(xml_str) # noqa: S314 — parsing our own rdflib output + _sort_xml_element(root) + ET.indent(root, space=" ") + + sorted_xml = ET.tostring(root, encoding="unicode", xml_declaration=True) + + with rdf_path.open("w", encoding="utf-8") as f: + f.write(sorted_xml) + f.write("\n") + except OSError: + logger.exception("Error writing to %s", rdf_path) + return False + else: + logger.info("RDF/XML metadata successfully created at %s", rdf_path) + return True + + except Exception: + logger.exception("Unexpected error in create_rdfxml") + return False + + +if __name__ == "__main__": + success = create_rdfxml() + sys.exit(0 if success else 1) diff --git a/quadriga/metadata/create_zenodo_json.py b/quadriga/metadata/create_zenodo_json.py index 47cb2a896..5879b5e18 100644 --- a/quadriga/metadata/create_zenodo_json.py +++ b/quadriga/metadata/create_zenodo_json.py @@ -8,13 +8,17 @@ The upload_type is set to "lesson" as specified for QUADRIGA OERs. 
""" +from __future__ import annotations + import json import logging import sys -from pathlib import Path +from typing import Any from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file +logger = logging.getLogger(__name__) + def clean_doi(doi_string: str) -> str | None: """ @@ -78,20 +82,20 @@ def format_creators_for_zenodo(authors: list) -> list: list: List of creator dictionaries in Zenodo format """ if not authors: - logging.warning("No authors provided to format_creators_for_zenodo") + logger.warning("No authors provided to format_creators_for_zenodo") return [] creators = [] for i, author in enumerate(authors): if not isinstance(author, dict): - logging.warning(f"Author at index {i} is not a dictionary: {author}") + logger.warning("Author at index %d is not a dictionary: %s", i, author) continue family = author.get("family-names", "") given = author.get("given-names", "") if not family and not given: - logging.warning(f"Author at index {i} is missing both family-names and given-names") + logger.warning("Author at index %d is missing both family-names and given-names", i) continue creator = {"name": f"{family}, {given}" if family and given else (family or given)} @@ -111,7 +115,7 @@ def format_creators_for_zenodo(authors: list) -> list: return creators -def format_contributors_for_zenodo(contributors): +def format_contributors_for_zenodo(contributors: list | None) -> list: """ Format contributors list for Zenodo contributors field. @@ -167,11 +171,11 @@ def format_contributors_for_zenodo(contributors): return formatted_contributors -def create_zenodo_json(): +def create_zenodo_json() -> bool | None: """ - Creates a .zenodo.json file from CITATION.cff and metadata.yml. + Create a .zenodo.json file from CITATION.cff and metadata.yml. 
- The function reads the 'preferred-citation' section from CITATION.cff + Reads the 'preferred-citation' section from CITATION.cff and combines it with data from metadata.yml to create a Zenodo-compliant metadata file. The upload_type is always set to "lesson" for QUADRIGA OERs. @@ -186,47 +190,50 @@ def create_zenodo_json(): citation_cff_path = get_file_path("CITATION.cff", repo_root) metadata_path = get_file_path("metadata.yml", repo_root) zenodo_json_path = get_file_path(".zenodo.json", repo_root) - except Exception as e: - logging.exception(f"Failed to resolve file paths: {e!s}") + except Exception: + logger.exception("Failed to resolve file paths") return False # Check if required files exist - if not Path(citation_cff_path).exists(): - logging.error(f"CITATION.cff file not found at {citation_cff_path}") + if not citation_cff_path.exists(): + logger.error("CITATION.cff file not found at %s", citation_cff_path) return False - if not Path(metadata_path).exists(): - logging.error(f"metadata.yml file not found at {metadata_path}") + if not metadata_path.exists(): + logger.error("metadata.yml file not found at %s", metadata_path) return False # Load CITATION.cff citation_data = load_yaml_file(citation_cff_path) - if not citation_data: - logging.error("Could not load CITATION.cff. Exiting.") + if not citation_data or not isinstance(citation_data, dict): + logger.error("Could not load CITATION.cff or invalid format. Exiting.") return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - if not metadata: - logging.error("Could not load metadata.yml. Exiting.") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. 
Exiting.") return False # Extract data from preferred-citation or root if "preferred-citation" in citation_data: - logging.info("Using 'preferred-citation' section from CITATION.cff") + logger.info("Using 'preferred-citation' section from CITATION.cff") pref = citation_data.get("preferred-citation") + if not isinstance(pref, dict): + logger.error("preferred-citation is not a dictionary") + return False else: - logging.info("No 'preferred-citation' section found, using root data") + logger.info("No 'preferred-citation' section found, using root data") pref = citation_data - zenodo_metadata = {"upload_type": "lesson"} + zenodo_metadata: dict[str, Any] = {"upload_type": "lesson"} # title if "title" in pref: zenodo_metadata["title"] = pref["title"] - logging.info(f"Added title: {pref['title']}") + logger.info("Added title: %s", pref["title"]) else: - logging.error("No title found in CITATION.cff") + logger.error("No title found in CITATION.cff") return False # creators @@ -234,24 +241,42 @@ def create_zenodo_json(): creators = format_creators_for_zenodo(pref["authors"]) if creators: zenodo_metadata["creators"] = creators - logging.info(f"Added {len(creators)} creators") + logger.info("Added %d creators", len(creators)) else: - logging.error("Could not format any creators from authors") + logger.error("Could not format any creators from authors") return False else: - logging.error("No authors found in preferred-citation") + logger.error("No authors found in preferred-citation") return False # description - description = citation_data.get("abstract") - if not description: - description = metadata.get("description") - - if description: - zenodo_metadata["description"] = description - logging.info("Added description") - else: - logging.warning("No description/abstract found") + description = "

" + metadata.get("description") + "

" + + description_base = f""" +

Das interaktive Lehrbuch kann als Web-Version verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch auf GitHub zur Verfügung.

+

Die QUADRIGA-OER sind nach einem einheitlichen Template gestaltet, werden nach einem standardisierten Verfahren qualitätsgeprüft und mit Metadaten ausgezeichnet.

+
QUADRIGA Datenkompetenzzentrum
+

QUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um den Forschungsprozess möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes Datenkompetenzframework zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentativen Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.

+

Zu den Zielgruppen von QUADRIGA zählen insbesondere promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.

+

QUADRIGA ist eins von 11 Datenkompetenzzentren in Deutschland und wird vom Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR) und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartnern zählen: +

+

+ +

Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im Umsetzungskonzept erfahren.

+ +

Weitere Informationen sowie Publikationen finden Sie auf der Webseite, in der Zenodo-Community und der GitHub-Organisation des Projekts.

+""" + zenodo_metadata["description"] = description + description_base + logger.info("Added description") # publication date publication_date = None @@ -265,16 +290,16 @@ def create_zenodo_json(): else: # It's already a string publication_date = str(date_value) - logging.info(f"Added publication_date from metadata.yml: {publication_date}") + logger.info("Added publication_date from metadata.yml: %s", publication_date) elif "year" in pref: # Fall back to year from CITATION.cff year = str(pref["year"]) # Zenodo expects ISO 8601 date format (YYYY-MM-DD) # We use January 1st as default when only year is provided publication_date = f"{year}-01-01" - logging.info(f"Added publication_date from year (fallback): {publication_date}") + logger.info("Added publication_date from year (fallback): %s", publication_date) else: - logging.warning("No publication date or year found") + logger.warning("No publication date or year found") if publication_date: zenodo_metadata["publication_date"] = publication_date @@ -287,7 +312,7 @@ def create_zenodo_json(): keywords_list = extract_keywords(pref["keywords"]) if keywords_list: zenodo_metadata["keywords"] = keywords_list - logging.info(f"Added {len(keywords_list)} keywords") + logger.info("Added %d keywords", len(keywords_list)) # license license_id = None @@ -300,7 +325,7 @@ def create_zenodo_json(): # Clean up common variations license_clean = str(license_id).upper().replace("_", "-") zenodo_metadata["license"] = license_clean - logging.info(f"Added license: {license_clean}") + logger.info("Added license: %s", license_clean) # language if pref.get("languages"): @@ -308,14 +333,14 @@ def create_zenodo_json(): pref["languages"][0] if isinstance(pref["languages"], list) else pref["languages"] ) zenodo_metadata["language"] = lang - logging.info(f"Added language: {lang}") + logger.info("Added language: %s", lang) # contributors if metadata.get("contributors"): contributors = format_contributors_for_zenodo(metadata["contributors"]) if 
contributors: zenodo_metadata["contributors"] = contributors - logging.info(f"Added {len(contributors)} contributors") + logger.info("Added %d contributors", len(contributors)) # related_identifiers related_identifiers = [] @@ -324,38 +349,39 @@ def create_zenodo_json(): related_identifiers.append( {"identifier": repo_url, "relation": "isSupplementedBy", "scheme": "url"} ) - logging.info("Added repository URL as related identifier") + logger.info("Added repository URL as related identifier") url = pref.get("url") if url and url != repo_url: related_identifiers.append( {"identifier": url, "relation": "isAlternateIdentifier", "scheme": "url"} ) - logging.info("Added URL as related identifier") + logger.info("Added URL as related identifier") if related_identifiers: zenodo_metadata["related_identifiers"] = related_identifiers # community zenodo_metadata["communities"] = [{"identifier": "quadriga"}] - logging.info("Added QUADRIGA community") + logger.info("Added QUADRIGA community") # version if "version" in pref: zenodo_metadata["version"] = str(pref["version"]) - logging.info(f"Added version: {pref['version']}") + logger.info("Added version: %s", pref["version"]) # write .zenodo.json try: with zenodo_json_path.open("w", encoding="utf-8") as f: json.dump(zenodo_metadata, f, ensure_ascii=False, indent=2) - logging.info(f"Zenodo metadata successfully created at {zenodo_json_path}") - return True - except OSError as e: - logging.exception(f"Error writing to {zenodo_json_path}: {e}") + except OSError: + logger.exception("Error writing to %s", zenodo_json_path) return False + else: + logger.info("Zenodo metadata successfully created at %s", zenodo_json_path) + return True - except Exception as e: - logging.exception(f"Unexpected error in create_zenodo_json: {e!s}") + except Exception: + logger.exception("Unexpected error in create_zenodo_json") return False diff --git a/quadriga/metadata/extract_from_book_config.py b/quadriga/metadata/extract_from_book_config.py index 
79b87cbef..ea3946d51 100644 --- a/quadriga/metadata/extract_from_book_config.py +++ b/quadriga/metadata/extract_from_book_config.py @@ -1,12 +1,14 @@ """ -This script extracts the title from _config.yml and the first level of the TOC from _toc.yml. -It then uses this information to update metadata.yml. -The titles for the TOC chapters are extracted from the first heading of the corresponding files. +Extract the title from _config.yml and the first level of the TOC from _toc.yml. + +It then uses this information to update metadata.yml. The titles for the TOC chapters are extracted +from the first heading of the corresponding files. """ +from __future__ import annotations + import logging import sys -from datetime import datetime from pathlib import Path from .utils import ( @@ -19,21 +21,23 @@ # Configure logging logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def extract_and_update(): +def extract_and_update() -> bool | None: """ Extract information from _config.yml and _toc.yml files and update metadata.yml. - Returns: + Returns + ------- bool: True if successful, False otherwise. """ try: # Get the repository root directory try: repo_root = get_repo_root() - except Exception as e: - logging.error(f"Failed to get repository root: {e}") + except Exception: + logger.exception("Failed to get repository root") return False # Define file paths using the get_file_path utility function @@ -48,7 +52,7 @@ def extract_and_update(): (metadata_path, "metadata.yml"), ]: if not path.exists(): - logging.error(f"Required file {name} not found at {path}") + logger.error("Required file %s not found at %s", name, path) return False # Load the files @@ -56,27 +60,32 @@ def extract_and_update(): toc_data = load_yaml_file(toc_path) metadata_data = load_yaml_file(metadata_path) - if not all([config_data, toc_data, metadata_data]): - logging.error("One or more required files couldn't be loaded. 
Exiting.") + if not config_data or not isinstance(config_data, dict): + logger.error("Could not load _config.yml or invalid format. Exiting.") + return False + if not toc_data or not isinstance(toc_data, dict): + logger.error("Could not load _toc.yml or invalid format. Exiting.") + return False + if not metadata_data or not isinstance(metadata_data, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") return False # Extract information from _config.yml title = config_data.get("title", "") - author = config_data.get("author", "") if not title: - logging.warning("No title found in _config.yml") + logger.warning("No title found in _config.yml") # Extract chapters and their titles from _toc.yml toc_chapters = [] missing_files = [] if "chapters" not in toc_data: - logging.warning("No 'chapters' section found in _toc.yml") + logger.warning("No 'chapters' section found in _toc.yml") else: for chapter in toc_data["chapters"]: if "file" not in chapter: - logging.warning("Found chapter entry without 'file' attribute in _toc.yml") + logger.warning("Found chapter entry without 'file' attribute in _toc.yml") continue try: @@ -94,7 +103,7 @@ def extract_and_update(): # Check if file exists if not full_path.exists(): missing_files.append(str(full_path)) - logging.warning(f"Chapter file not found: {full_path}") + logger.warning("Chapter file not found: %s", full_path) # Use filename as fallback title toc_chapters.append(f"[Missing: {p.stem}]") continue @@ -104,19 +113,19 @@ def extract_and_update(): # Add to the list of chapters toc_chapters.append(chapter_title) - except Exception as e: - logging.error(f"Error processing chapter {chapter.get('file', 'unknown')}: {e}") + except Exception: + logger.exception("Error processing chapter %s", chapter.get("file", "unknown")) # Add a placeholder with the filename if possible try: toc_chapters.append(f"[Error: {p.stem}]") - except: + except Exception: toc_chapters.append("[Error: unknown chapter]") if 
missing_files: - logging.warning(f"Missing {len(missing_files)} chapter files") + logger.warning("Missing %d chapter files", len(missing_files)) if not toc_chapters: - logging.warning("No chapter titles were extracted") + logger.warning("No chapter titles were extracted") # Format the TOC as a string with proper indentation and single newline between items toc_formatted = "- " + "\n- ".join(toc_chapters) @@ -132,7 +141,7 @@ def extract_and_update(): if "table-of-contents" in metadata_data: metadata_data["table-of-contents"] = toc_formatted else: - logging.warning("No 'table-of-contents' field found in metadata.yml") + logger.warning("No 'table-of-contents' field found in metadata.yml") # Save the updated metadata if save_yaml_file( @@ -140,20 +149,19 @@ def extract_and_update(): metadata_data, schema_comment="# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/v1.0.0/schema.json", ): - logging.info("Metadata updated successfully!") + logger.info("Metadata updated successfully!") return True - else: - logging.error("Failed to save metadata.yml") - return False - except Exception as e: - logging.exception(f"Error updating metadata.yml: {e}") + logger.error("Failed to save metadata.yml") + return False + except Exception: + logger.exception("Error updating metadata.yml") return False else: - logging.error("Metadata file couldn't be loaded or is empty.") + logger.error("Metadata file couldn't be loaded or is empty.") return False - except Exception as e: - logging.exception(f"Unexpected error in extract_and_update: {e}") + except Exception: + logger.exception("Unexpected error in extract_and_update") return False diff --git a/quadriga/metadata/inject_all_metadata.py b/quadriga/metadata/inject_all_metadata.py new file mode 100644 index 000000000..bfbddcce7 --- /dev/null +++ b/quadriga/metadata/inject_all_metadata.py @@ -0,0 +1,889 @@ +""" +Injects all metadata (JSON-LD, OpenGraph, and RDF links) into generated HTML files. 
+ +This unified script combines JSON-LD structured data injection and OpenGraph +social media metadata into a single efficient pass through HTML files. + +It reads metadata.jsonld and injects: +- OpenGraph tags for social media previews +- JSON-LD ') + + # 3. RDF discovery links + if add_link_elements: + injection_parts.append( + ' ' + ) + injection_parts.append( + ' ' + ) + + # Join all parts with newlines + full_injection = "\n".join(injection_parts) + + # Find optimal injection point + # Priority: after viewport, after charset, before (fallback) + injection_point = None + + # Try to inject after viewport meta tag (best practice for OpenGraph) + viewport_match = re.search(r"(]*>\s*)", html_content, re.IGNORECASE) + if viewport_match: + injection_point = viewport_match.end() + else: + # Fallback: try after charset meta tag + charset_match = re.search(r"(]*>\s*)", html_content, re.IGNORECASE) + if charset_match: + injection_point = charset_match.end() + + # Final fallback: inject before + if injection_point is None: + injection_point = html_content.find("") + if injection_point == -1: + logger.warning("No tag found in %s, skipping", html_path.name) + return False + # For injection, add before the tag + html_content = html_content[:injection_point] + f"\n{full_injection}\n" + html_content[injection_point:] + else: + # For after viewport/charset injection, insert at found position + html_content = html_content[:injection_point] + f"\n{full_injection}\n\n" + html_content[injection_point:] + + # Write the modified HTML back + with html_path.open("w", encoding="utf-8") as f: + f.write(html_content) + + logger.info("Injected all metadata into %s", html_path.name) + return True + + except FileNotFoundError: + logger.exception("HTML file not found: %s", html_path) + return False + except Exception: + logger.exception("Error injecting metadata into %s", html_path) + return False + + +# ============================================================================ +# Main 
Injection Orchestration +# ============================================================================ + + +def inject_all_metadata( + build_dir: Path | None = None, + jsonld_path: Path | None = None, + config_path: Path | None = None, + toc_path: Path | None = None, +) -> bool: + """ + Inject all metadata (OpenGraph, JSON-LD, and RDF links) into Jupyter Book HTML files. + + This unified function combines OpenGraph social media metadata and JSON-LD structured + data injection into a single efficient operation. + + For the root page, it injects: + - OpenGraph: og:type="book" with book:author, book:release_date, book:tag + - JSON-LD: Full book metadata + + For chapter pages, it injects: + - OpenGraph: og:type="article" with article:author, article:published_time, article:modified_time + - JSON-LD: Chapter-specific LearningResource with isPartOf reference to book + + Args: + build_dir (Path, optional): Path to _build/html directory (default: ./_build/html) + jsonld_path (Path, optional): Path to metadata.jsonld (default: ./metadata.jsonld) + config_path (Path, optional): Path to _config.yml (default: ./_config.yml) + toc_path (Path, optional): Path to _toc.yml (default: ./_toc.yml) + + Returns + ------- + bool: True if successful, False otherwise + """ + try: + # Determine paths + if build_dir is None: + build_dir = Path.cwd() / "_build" / "html" + if jsonld_path is None: + jsonld_path = Path.cwd() / "metadata.jsonld" + if config_path is None: + config_path = Path.cwd() / "_config.yml" + if toc_path is None: + toc_path = Path.cwd() / "_toc.yml" + + # Check if build directory exists + if not build_dir.exists(): + logger.error("Build directory not found: %s", build_dir) + return False + + # Check if JSON-LD file exists + if not jsonld_path.exists(): + logger.error("JSON-LD file not found: %s", jsonld_path) + return False + + # Read and validate JSON-LD + try: + with jsonld_path.open(encoding="utf-8") as f: + jsonld_data = json.load(f) + logger.info("Loaded JSON-LD from 
%s", jsonld_path) + except json.JSONDecodeError: + logger.exception("Invalid JSON in %s", jsonld_path) + return False + + # Extract base URL from metadata + base_url = jsonld_data.get("url", "") + if not base_url: + logger.error("No URL found in metadata.jsonld") + return False + + # Extract book title for og:site_name + book_title = jsonld_data.get("name", "") + if not book_title: + logger.warning("No book title found in metadata.jsonld") + + # Get logo filename from config + logo_filename = get_logo_from_config(config_path) + logger.info("Using logo: %s", logo_filename) + + # Determine the actual root page from _toc.yml + root_html = get_root_page_from_toc(toc_path, build_dir) + + # Fall back to index.html if we couldn't determine the root + if root_html is None or not root_html.exists(): + root_html = build_dir / "index.html" + logger.info("Using index.html as root page") + + # ==================================================================== + # Process root page + # ==================================================================== + if root_html.exists(): + # Generate OpenGraph tags for root page (og:type="book") + og_tags = create_opengraph_meta_tags( + jsonld_data, + base_url, + book_title, + logo_filename, + is_chapter=False, + ) + + # Format JSON-LD content with proper indentation + jsonld_content = json.dumps(jsonld_data, ensure_ascii=False, indent=2) + jsonld_content = "\n".join(" " + line for line in jsonld_content.split("\n")) + + # Inject both OpenGraph and JSON-LD into root page + if not inject_all_metadata_into_html(root_html, og_tags, jsonld_content): + logger.error("Failed to inject metadata into %s", root_html.name) + return False + else: + logger.warning("Root HTML file not found at %s", root_html) + return False + + # ==================================================================== + # Process index.html redirect page (if different from root) + # ==================================================================== + # Jupyter Book may 
create index.html as a meta-refresh redirect + # Social media crawlers don't follow these redirects, so we need + # to inject OpenGraph metadata into index.html as well + index_html = build_dir / "index.html" + if index_html.exists() and index_html != root_html: + try: + with index_html.open(encoding="utf-8") as f: + index_content = f.read() + + # Check if this is a simple meta-refresh redirect page (with or without proper HTML structure) + if "meta http-equiv" in index_content.lower(): + logger.info("Found index.html redirect page, injecting OpenGraph metadata") + + # Generate OpenGraph tags (but skip JSON-LD for redirect page) + og_tags = create_opengraph_meta_tags( + jsonld_data, + base_url, + book_title, + logo_filename, + is_chapter=False, + ) + + # Check if the redirect page has proper HTML structure + if "]*>', index_content, re.IGNORECASE) + meta_refresh = meta_refresh_match.group(0) if meta_refresh_match else "" + + # Create proper HTML with OpenGraph metadata and meta refresh + new_index_content = f""" + + + + {meta_refresh} +{og_tags} + {escape_html(book_title)} + + +

Redirecting to {root_html.name}...

+ + +""" + # Write the new index.html + with index_html.open("w", encoding="utf-8") as f: + f.write(new_index_content) + logger.info("Successfully created index.html with OpenGraph metadata and redirect") + else: + # Has proper HTML structure, inject normally + if not inject_all_metadata_into_html(index_html, og_tags, "", add_link_elements=False): + logger.warning("Failed to inject OpenGraph into index.html redirect page") + else: + logger.info("Successfully injected OpenGraph metadata into index.html redirect page") + except Exception: + logger.exception("Error processing index.html redirect page") + + # ==================================================================== + # Process chapter pages + # ==================================================================== + chapters_injected = 0 + if jsonld_data.get("hasPart"): + logger.info("Processing %d chapters...", len(jsonld_data["hasPart"])) + + for chapter in jsonld_data["hasPart"]: + if not isinstance(chapter, dict): + continue + + # Get chapter URL + chapter_url = chapter.get("url") + if not chapter_url: + logger.warning("Chapter missing URL: %s", chapter.get("name", "Unknown")) + continue + + # Find the HTML file for this chapter + chapter_html_path = get_html_path_from_url(chapter_url, build_dir) + if not chapter_html_path: + logger.warning("Could not find HTML file for chapter: %s", chapter.get("name", "Unknown")) + continue + + # Create chapter metadata for OpenGraph (combining chapter + book data) + chapter_og_metadata = { + "name": chapter.get("name", ""), + "url": chapter_url, + "description": chapter.get("description", ""), + "author": jsonld_data.get("author", []), # Inherit from book + "datePublished": jsonld_data.get("datePublished"), # Inherit from book + "dateModified": jsonld_data.get("dateModified"), # Inherit from book + "inLanguage": jsonld_data.get("inLanguage"), # Inherit from book + } + + # Generate OpenGraph tags for chapter (og:type="article") + chapter_og_tags = 
create_opengraph_meta_tags( + chapter_og_metadata, + base_url, + book_title, + logo_filename, + is_chapter=True, + ) + + # Create chapter-specific JSON-LD with book-level metadata + chapter_jsonld = create_chapter_jsonld(chapter, jsonld_data) + + # Convert to formatted string + chapter_jsonld_str = json.dumps(chapter_jsonld, ensure_ascii=False, indent=2) + chapter_jsonld_str = "\n".join(" " + line for line in chapter_jsonld_str.split("\n")) + + # Inject both OpenGraph and JSON-LD into chapter HTML + if inject_all_metadata_into_html(chapter_html_path, chapter_og_tags, chapter_jsonld_str): + chapters_injected += 1 + else: + logger.warning("Failed to inject metadata into chapter: %s", chapter.get("name", "Unknown")) + + logger.info("Injected metadata into %d chapter pages", chapters_injected) + + logger.info("All metadata injection completed successfully") + return True + + except Exception: + logger.exception("Unexpected error in inject_all_metadata") + return False + + +# ============================================================================ +# CLI Entry Point +# ============================================================================ + + +def main() -> None: + """ + Run the unified metadata injection script. 
+ + Usage: + python -m quadriga.metadata.inject_all_metadata + """ + import argparse + + parser = argparse.ArgumentParser( + description="Inject all metadata (OpenGraph, JSON-LD, RDF links) into Jupyter Book HTML" + ) + parser.add_argument( + "--build-dir", + type=Path, + help="Path to _build/html directory (default: ./_build/html)", + ) + parser.add_argument( + "--jsonld-path", + type=Path, + help="Path to metadata.jsonld file (default: ./metadata.jsonld)", + ) + parser.add_argument( + "--config-path", + type=Path, + help="Path to _config.yml file (default: ./_config.yml)", + ) + parser.add_argument( + "--toc-path", + type=Path, + help="Path to _toc.yml file (default: ./_toc.yml)", + ) + + args = parser.parse_args() + + success = inject_all_metadata( + build_dir=args.build_dir, + jsonld_path=args.jsonld_path, + config_path=args.config_path, + toc_path=args.toc_path, + ) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/quadriga/metadata/run_all.py b/quadriga/metadata/run_all.py index b6c0b8e47..6407619e6 100644 --- a/quadriga/metadata/run_all.py +++ b/quadriga/metadata/run_all.py @@ -1,17 +1,30 @@ -""" -This script runs the various metadata update scripts in the correct order. 
-""" +"""Script to coordinate the different metadata transformation scripts for QUADRIGA Jupyter Books.""" + +from __future__ import annotations import logging +import os import sys -from .create_bibtex import create_bibtex_from_cff -from .extract_from_book_config import extract_and_update -from .update_citation_cff import update_citation -from .create_zenodo_json import create_zenodo_json +# Add current working directory to sys.path if not present +# This allows the script to run with python -m without package installation +cwd = os.getcwd() +if cwd not in sys.path: + sys.path.insert(0, cwd) + +from quadriga.metadata.create_bibtex import create_bibtex_from_cff +from quadriga.metadata.create_jsonld import create_jsonld +from quadriga.metadata.create_rdfxml import create_rdfxml +from quadriga.metadata.create_zenodo_json import create_zenodo_json +from quadriga.metadata.extract_from_book_config import extract_and_update +from quadriga.metadata.update_citation_cff import update_citation +from quadriga.metadata.validate_schema import validate_schema +logger = logging.getLogger(__name__) -def main(): + +def main() -> bool | None: + """Run the different metadata transformation scripts in order.""" try: # Configure logging with timestamp logging.basicConfig( @@ -20,56 +33,86 @@ def main(): datefmt="%Y-%m-%d %H:%M:%S", ) - logging.info("Running all metadata update scripts...") + logger.info("Running all metadata update scripts...") + + # Validate metadata.yml against QUADRIGA schema first + try: + logger.info("Validating metadata.yml against QUADRIGA schema...") + if not validate_schema(): + logger.error("Schema validation failed.") + return False + except Exception: + logger.exception("Unexpected error during schema validation") + return False # Execute extract_and_update with error handling try: - logging.info("Extracting metadata from _config.yml and _toc.yml...") + logger.info("Extracting metadata from _config.yml and _toc.yml...") if not extract_and_update(): - 
logging.error("Extract and update process failed.") + logger.error("Extract and update process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during extract_and_update: {str(e)}") + except Exception: + logger.exception("Unexpected error during extract_and_update") return False # Execute update_citation with error handling try: - logging.info("Updating CITATION.cff...") + logger.info("Updating CITATION.cff...") if not update_citation(): - logging.error("Update citation process failed.") + logger.error("Update citation process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during update_citation: {str(e)}") + except Exception: + logger.exception("Unexpected error during update_citation") return False # Execute create_bibtex_from_cff with error handling try: - logging.info("Creating CITATION.bib from CITATION.cff...") + logger.info("Creating CITATION.bib from CITATION.cff...") if not create_bibtex_from_cff(): - logging.error("Create BibTeX process failed.") + logger.error("Create BibTeX process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during create_bibtex_from_cff: {str(e)}") + except Exception: + logger.exception("Unexpected error during create_bibtex_from_cff") return False # Execute create_zenodo_json with error handling try: - logging.info("Creating .zenodo.json from CITATION.cff and metadata.yml...") + logger.info("Creating .zenodo.json from CITATION.cff and metadata.yml...") if not create_zenodo_json(): - logging.error("Create Zenodo JSON process failed.") + logger.error("Create Zenodo JSON process failed.") + return False + except Exception: + logger.exception("Unexpected error during create_zenodo_json") + return False + + # Execute create_jsonld with error handling + try: + logger.info("Creating metadata.jsonld from metadata.yml...") + if not create_jsonld(): + logger.error("Create JSON-LD process failed.") + return False + except 
Exception: + logger.exception("Unexpected error during create_jsonld") + return False + + # Execute create_rdfxml with error handling + try: + logger.info("Creating metadata.rdf from metadata.yml...") + if not create_rdfxml(): + logger.error("Create RDF/XML process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during create_zenodo_json: {str(e)}") + except Exception: + logger.exception("Unexpected error during create_rdfxml") return False - logging.info("All scripts executed successfully.") + logger.info("All scripts executed successfully.") return True except KeyboardInterrupt: - logging.warning("Process interrupted by user.") + logger.warning("Process interrupted by user.") return False - except Exception as e: - logging.exception(f"Unexpected error in main: {str(e)}") + except Exception: + logger.exception("Unexpected error in main") return False diff --git a/quadriga/metadata/update_citation_cff.py b/quadriga/metadata/update_citation_cff.py index 29671031c..61c7e815f 100644 --- a/quadriga/metadata/update_citation_cff.py +++ b/quadriga/metadata/update_citation_cff.py @@ -1,10 +1,13 @@ """ -Updates the CITATION.cff file with metadata from metadata.yml. +Update or create the CITATION.cff file with metadata from metadata.yml. This script reads metadata from 'metadata.yml' and updates the corresponding fields in 'CITATION.cff'. It handles fields like title, authors, URL, repository URL, and publication date. It also ensures that the 'preferred-citation' section, if present, is updated consistently. + +If CITATION.cff does not exist, a new one is created from metadata.yml with +the required CFF boilerplate fields. 
""" import logging @@ -14,20 +17,105 @@ from .utils import extract_keywords, get_file_path, load_yaml_file, save_yaml_file logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def update_citation(): +def _create_initial_citation_cff(metadata: dict) -> dict: """ - Updates the CITATION.cff file using data from the metadata.yml file. + Create an initial CITATION.cff data structure from metadata.yml. + + Builds a minimal valid CFF 1.2.0 file with the required fields + (cff-version, message, title, authors) plus optional fields + that can be derived from metadata.yml. - The function performs the following steps: - 1. Constructs absolute paths to 'metadata.yml' and 'CITATION.cff'. - 2. Loads data from both YAML files. - 3. Updates 'CITATION.cff' fields (title, authors, URL, repository-code, - and publication year in preferred-citation) based on 'metadata.yml'. - 4. For authors, it attempts to preserve existing author details in - 'CITATION.cff' if a matching author (by given and family names) is found. - 5. Saves the updated data back to 'CITATION.cff', including a schema comment. 
+ Args: + metadata: Parsed metadata.yml data + + Returns + ------- + dict: A valid CFF data structure + """ + citation_data: dict = { + "cff-version": "1.2.0", + "message": "If you use this work, please cite it using the metadata from this file.", + "type": "dataset", + } + + # Title (required by CFF) + citation_data["title"] = metadata.get("title", "Untitled") + + # Authors (required by CFF) + if metadata.get("authors"): + citation_authors = [] + for author in metadata["authors"]: + cff_author: dict = {} + if "given-names" in author: + cff_author["given-names"] = author["given-names"] + if "family-names" in author: + cff_author["family-names"] = author["family-names"] + if "orcid" in author: + cff_author["orcid"] = author["orcid"] + if "affiliation" in author: + cff_author["affiliation"] = author["affiliation"] + if cff_author: + citation_authors.append(cff_author) + citation_data["authors"] = citation_authors if citation_authors else [{"name": "Unknown"}] + else: + citation_data["authors"] = [{"name": "Unknown"}] + + # Optional fields from metadata + if "version" in metadata: + citation_data["version"] = metadata["version"] + + if "url" in metadata: + citation_data["url"] = metadata["url"] + + if "git" in metadata: + citation_data["repository-code"] = metadata["git"] + + if "identifier" in metadata: + doi_url = metadata["identifier"] + if "doi.org" in str(doi_url): + # Extract DOI value from URL + doi_value = str(doi_url).split("doi.org/")[-1] if "doi.org/" in str(doi_url) else None + if doi_value: + citation_data["identifiers"] = [ + {"type": "doi", "value": doi_value, "description": "Zenodo"} + ] + + if "description" in metadata: + citation_data["abstract"] = metadata["description"] + + if metadata.get("keywords"): + flattened = extract_keywords(metadata["keywords"]) + if flattened: + citation_data["keywords"] = flattened + + if "license" in metadata: + license_data = metadata["license"] + if isinstance(license_data, str): + citation_data["license"] = 
license_data + elif isinstance(license_data, dict) and "content" in license_data: + # Use content license as primary (most relevant for citation) + content_license = license_data["content"] + if isinstance(content_license, str): + citation_data["license"] = content_license + elif isinstance(content_license, list) and content_license: + citation_data["license"] = content_license[0] + + logger.info("Created initial CITATION.cff from metadata.yml") + return citation_data + + +def update_citation() -> bool: + """ + Update or create the CITATION.cff file using data from the metadata.yml file. + + If CITATION.cff exists, the function updates its fields based on metadata.yml, + preserving existing author details and preferred-citation sections. + + If CITATION.cff does not exist, a new one is created from metadata.yml with + the required CFF boilerplate. Returns ------- @@ -39,32 +127,31 @@ def update_citation(): repo_root = get_file_path("") # Get repo root by providing empty relative path metadata_path = get_file_path("metadata.yml", repo_root) citation_cff_path = get_file_path("CITATION.cff", repo_root) - except Exception as e: - logging.exception(f"Failed to resolve file paths: {e!s}") + except Exception: + logger.exception("Failed to resolve file paths") return False - # Check if files exist - for path, name in [ - (metadata_path, "metadata.yml"), - (citation_cff_path, "CITATION.cff"), - ]: - if not Path(path).exists(): - logging.error(f"Required file {name} not found at {path}") - return False + # metadata.yml must exist + if not Path(metadata_path).exists(): + logger.error("Required file metadata.yml not found at %s", metadata_path) + return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - # Load existing CITATION.cff - citation_data = load_yaml_file(citation_cff_path) - - if not metadata: - logging.error("Could not load metadata.yml. 
Exiting.") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") return False - if not citation_data: - logging.error("Could not load CITATION.cff. Exiting.") - return False + # Load existing CITATION.cff or create initial structure + if Path(citation_cff_path).exists(): + citation_data = load_yaml_file(citation_cff_path) + if not citation_data or not isinstance(citation_data, dict): + logger.error("Could not load CITATION.cff or invalid format. Exiting.") + return False + else: + logger.info("CITATION.cff not found — creating from metadata.yml") + citation_data = _create_initial_citation_cff(metadata) # Track if updates were made updates_made = False @@ -76,18 +163,18 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["title"] = metadata["title"] updates_made = True - logging.info(f"Updated title to: {metadata['title']}") + logger.info("Updated title to: %s", metadata["title"]) else: - logging.warning("No title found in metadata.yml") + logger.warning("No title found in metadata.yml") - if "book-version" in metadata: - citation_data["version"] = metadata["book-version"] + if "version" in metadata: + citation_data["version"] = metadata["version"] if "preferred-citation" in citation_data: - citation_data["preferred-citation"]["version"] = metadata["book-version"] + citation_data["preferred-citation"]["version"] = metadata["version"] updates_made = True - logging.info(f"Updated version to: {metadata['book-version']}") + logger.info("Updated version to: %s", metadata["version"]) else: - logging.warning("No book version found in metadata.yml, skipping version update") + logger.warning("No version found in metadata.yml, skipping version update") if metadata.get("authors"): try: @@ -102,13 +189,12 @@ def update_citation(): and "family-names" in cit_author and "given-names" in author and "family-names" in author + ) and ( + 
cit_author["given-names"] == author["given-names"] + and cit_author["family-names"] == author["family-names"] ): - if ( - cit_author["given-names"] == author["given-names"] - and cit_author["family-names"] == author["family-names"] - ): - new_author_entry = cit_author - break + new_author_entry = cit_author + break # Update author entry with metadata if "given-names" in author: @@ -129,13 +215,13 @@ def update_citation(): citation_data["preferred-citation"]["authors"] = citation_authors updates_made = True - logging.info(f"Updated {len(citation_authors)} authors") + logger.info("Updated %d authors", len(citation_authors)) else: - logging.warning("Failed to process authors from metadata.yml") - except Exception as e: - logging.exception(f"Error processing authors: {e!s}") + logger.warning("Failed to process authors from metadata.yml") + except Exception: + logger.exception("Error processing authors") else: - logging.warning("No authors found in metadata.yml") + logger.warning("No authors found in metadata.yml") # Update URL if present in metadata if "url" in metadata: @@ -143,7 +229,7 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["url"] = metadata["url"] updates_made = True - logging.info(f"Updated URL to: {metadata['url']}") + logger.info("Updated URL to: %s", metadata["url"]) # Update repository URL if present in metadata if "git" in metadata: @@ -151,27 +237,33 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["repository-code"] = metadata["git"] updates_made = True - logging.info(f"Updated repository-code to: {metadata['git']}") + logger.info("Updated repository-code to: %s", metadata["git"]) - # Update publication year based on date-modified or date-published + # Update publication year based on date-modified or date-issued # Prefer newer date-modified, if available year_source = None year_value = None + year_digits = 4 if "date-modified" in metadata: 
date_str = metadata["date-modified"] - if isinstance(date_str, str) and len(date_str) >= 4: + if isinstance(date_str, str) and len(date_str) >= year_digits: year_value = date_str[:4] year_source = "date-modified" - elif "date-published" in metadata: - date_str = metadata["date-published"] - if isinstance(date_str, str) and len(date_str) >= 4: + elif "date-issued" in metadata: + date_str = metadata["date-issued"] + if isinstance(date_str, str) and len(date_str) >= year_digits: year_value = date_str[:4] # Extract year from YYYY-MM-DD - year_source = "date-published" + year_source = "date-issued" if year_value and "preferred-citation" in citation_data: citation_data["preferred-citation"]["year"] = year_value updates_made = True - logging.info(f"Updated publication year to: {year_value} (from {year_source})") + logger.info("Updated publication year to: %s (from %s)", year_value, year_source) + + if "description" in metadata: + citation_data["abstract"] = metadata["description"] + updates_made = True + logger.info("Updated abstract from description") # Update keywords if present in metadata # Extract keywords to flatten any language-keyed formats @@ -182,27 +274,26 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["keywords"] = flattened_keywords updates_made = True - logging.info(f"Updated keywords with {len(flattened_keywords)} items") + logger.info("Updated keywords with %d items", len(flattened_keywords)) else: - logging.warning("Keywords found in metadata.yml but could not be extracted") + logger.warning("Keywords found in metadata.yml but could not be extracted") else: - logging.warning("No keywords found in metadata.yml") + logger.warning("No keywords found in metadata.yml") # No changes if not updates_made: - logging.warning("No updates were made to CITATION.cff") + logger.warning("No updates were made to CITATION.cff") return True # Not an error, just no changes needed # Save updated CITATION.cff - success = 
save_yaml_file( + return save_yaml_file( citation_cff_path, citation_data, schema_comment="# yaml-language-server: $schema=https://citation-file-format.github.io/1.2.0/schema.json", ) - return success - except Exception as e: - logging.exception(f"Unexpected error in update_citation: {e!s}") + except Exception: + logger.exception("Unexpected error in update_citation") return False diff --git a/quadriga/metadata/update_version_from_tag.py b/quadriga/metadata/update_version_from_tag.py index fd6cb5a77..57e1fffde 100644 --- a/quadriga/metadata/update_version_from_tag.py +++ b/quadriga/metadata/update_version_from_tag.py @@ -1,70 +1,70 @@ -""" -Updates book-version and date-modified in metadata.yml based on git tag. -""" +"""Update version and date-modified in metadata.yml based on git tag.""" import logging import os import sys -from datetime import datetime +from datetime import UTC, datetime from .utils import get_file_path, load_yaml_file, save_yaml_file logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def update_version_from_tag(): +def update_version_from_tag() -> bool: """ - Updates book-version and date-modified in metadata.yml from git tag. + Update book and date-modified in metadata.yml from git tag. Expects the version to be passed via environment variable TAG_VERSION. - Returns: + Returns + ------- bool: True if successful, False otherwise. 
""" try: # Get version from environment variable (set by GitHub Actions) version = os.environ.get("TAG_VERSION") if not version: - logging.info("No TAG_VERSION environment variable found - skipping version update") + logger.info("No TAG_VERSION environment variable found - skipping version update") return True - logging.info(f"Updating metadata for version: {version}") + logger.info("Updating metadata for version: %s", version) # Get file path try: repo_root = get_file_path("") metadata_path = get_file_path("metadata.yml", repo_root) - except Exception as e: - logging.error(f"Failed to resolve file paths: {str(e)}") + except Exception: + logger.exception("Failed to resolve file paths") return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - if not metadata: - logging.error("Could not load metadata.yml") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format") return False # Track if updates were made updates_made = False - # Update book-version - current_version = metadata.get("book-version") + # Update version + current_version = metadata.get("version") if current_version != version: - metadata["book-version"] = version + metadata["version"] = version updates_made = True - logging.info(f"Updated book-version from '{current_version}' to '{version}'") + logger.info("Updated version from '%s' to '%s'", current_version, version) else: - logging.info(f"book-version already matches tag version: {version}") + logger.info("version already matches tag version: %s", version) # Update date-modified - current_date = datetime.now().strftime("%Y-%m-%d") + current_date = datetime.now(UTC).strftime("%Y-%m-%d") old_date = metadata.get("date-modified") if old_date != current_date: metadata["date-modified"] = current_date updates_made = True - logging.info(f"Updated date-modified from '{old_date}' to '{current_date}'") + logger.info("Updated date-modified from '%s' to '%s'", old_date, current_date) else: 
- logging.info(f"date-modified already current: {current_date}") + logger.info("date-modified already current: %s", current_date) # Save if updates were made if updates_made: @@ -74,14 +74,13 @@ def update_version_from_tag(): schema_comment="# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/v1.0.0/schema.json", ) if success: - logging.info("Successfully updated metadata.yml") + logger.info("Successfully updated metadata.yml") return success - else: - logging.info("No updates needed") - return True + logger.info("No updates needed") + return True - except Exception as e: - logging.exception(f"Unexpected error in update_version_from_tag: {str(e)}") + except Exception: + logger.exception("Unexpected error in update_version_from_tag") return False diff --git a/quadriga/metadata/utils.py b/quadriga/metadata/utils.py index c11a3766d..eb1ebe15c 100644 --- a/quadriga/metadata/utils.py +++ b/quadriga/metadata/utils.py @@ -1,19 +1,20 @@ """ Common utility functions for metadata management in the Quadriga Book Template. + This module provides reused functionality across different metadata scripts. """ +from __future__ import annotations + import json import logging -import os import re -import sys -from datetime import datetime from pathlib import Path import yaml logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) # ---- File Path Handling ---- @@ -22,10 +23,12 @@ def get_repo_root() -> Path: """ Get the path to the repository root, assuming this module is in quadriga/metadata/. 
- Returns: + Returns + ------- Path: Absolute path to the repository root - Raises: + Raises + ------ FileNotFoundError: If the repository structure is not as expected """ try: @@ -38,14 +41,16 @@ def get_repo_root() -> Path: found_files = [f for f in required_files if (repo_root / f).exists()] if len(found_files) < 1: - raise FileNotFoundError( - f"Repository root at {repo_root} doesn't contain expected files (_config.yml or _toc.yml)" + msg = ( + f"Repository root at {repo_root} doesn't contain expected files " + "(_config.yml or _toc.yml)" ) - - return repo_root - except Exception as e: - logging.exception(f"Error resolving repository root: {e}") + raise FileNotFoundError(msg) + except Exception: + logger.exception("Error resolving repository root") raise + else: + return repo_root def get_file_path(relative_path: str | Path, repo_root: Path | None = None) -> Path: @@ -56,47 +61,53 @@ def get_file_path(relative_path: str | Path, repo_root: Path | None = None) -> P relative_path (str | Path): Relative path from the repository root repo_root (Path, optional): Repository root path. If None, it will be determined - Returns: + Returns + ------- Path: Absolute path to the file """ - try: - if repo_root is None: - repo_root = get_repo_root() - return repo_root / Path(relative_path) - except Exception as e: - logging.error(f"Error resolving file path for '{relative_path}': {e}") - # Return the relative path as a fallback - return Path(relative_path) + if repo_root is None: + repo_root = get_repo_root() + return repo_root / Path(relative_path) # ---- YAML Handling ---- -def load_yaml_file(file_path: str | Path): +def load_yaml_file(file_path: str | Path) -> dict | list | None: """ Load a YAML file and return its contents as a Python object. 
Args: file_path (str | Path): Path to the YAML file - Returns: + Returns + ------- dict/list: Contents of the YAML file, or None if an error occurs """ + # Convert to Path at the edge + path = Path(file_path) + try: - with open(file_path, "r", encoding="utf-8") as file: - return yaml.safe_load(file) + with path.open(encoding="utf-8") as file: + data = yaml.safe_load(file) + # yaml.safe_load returns Any; ensure it's dict or list + if isinstance(data, (dict, list)): + return data + return None except FileNotFoundError: - logging.error(f"File not found: {Path(file_path).name}") + logger.exception("File not found: %s", path.name) return None - except yaml.YAMLError as e: - logging.error(f"YAML parsing error in {Path(file_path).name}: {e}") + except yaml.YAMLError: + logger.exception("YAML parsing error in %s", path.name) return None - except Exception as e: - logging.error(f"Error loading {Path(file_path).name}: {e}") + except Exception: + logger.exception("Error loading %s", path.name) return None -def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = None): +def save_yaml_file( + file_path: str | Path, data: dict | list, schema_comment: str | None = None +) -> bool: """ Save Python object as YAML to the specified file. @@ -104,19 +115,23 @@ def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = Non file_path (str | Path): Path where the YAML file should be saved data (dict/list): Data to save schema_comment (str, optional): Schema comment to add at the start of the file - e.g. "# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/schema.json" + e.g. 
"# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/latest/schema.json" - Returns: + Returns + ------- bool: True if successful, False otherwise """ + # Convert to Path at the edge + path = Path(file_path) + try: # Create directories if they don't exist - directory = Path(file_path).parent + directory = path.parent if not directory.exists(): directory.mkdir(parents=True) - logging.info(f"Created directory: {directory}") + logger.info("Created directory: %s", directory) - with open(file_path, "w", encoding="utf-8") as file: + with path.open("w", encoding="utf-8") as file: yaml.dump( data, file, @@ -127,25 +142,25 @@ def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = Non if schema_comment: try: - with open(file_path, "r+", encoding="utf-8") as file: + with path.open("r+", encoding="utf-8") as file: content = file.read() file.seek(0, 0) file.write(f"{schema_comment}\n" + content) - except Exception as e: - logging.warning(f"Failed to add schema comment to {Path(file_path).name}: {e}") + except Exception: + logger.exception("Failed to add schema comment to %s", path.name) # Not a critical error, proceed - - logging.info(f"Successfully updated {Path(file_path).name}") - return True - except yaml.YAMLError as e: - logging.error(f"YAML encoding error for {Path(file_path).name}: {e}") + except yaml.YAMLError: + logger.exception("YAML encoding error for %s", path.name) return False - except PermissionError as e: - logging.error(f"Permission denied when saving {Path(file_path).name}: {e}") + except PermissionError: + logger.exception("Permission denied when saving %s", path.name) return False - except Exception as e: - logging.error(f"Error saving to {Path(file_path).name}: {e}") + except Exception: + logger.exception("Error saving to %s", path.name) return False + else: + logger.info("Successfully updated %s", path.name) + return True # ---- Markdown and Jupyter Content Handling ---- @@ -161,7 +176,8 @@ def 
remove_yaml_frontmatter(text: str) -> str: Args: text (str): Markdown content that may contain frontmatter - Returns: + Returns + ------- str: Content with frontmatter removed """ pattern = r"^\s*---\s*\n(.*?)\n\s*---\s*(\n|$)" @@ -175,14 +191,16 @@ def extract_first_heading(file_path: str | Path) -> str: Args: file_path (str | Path): Path to the file - Returns: + Returns + ------- str: The content of the first heading or filename if no heading found """ + # Convert to Path at the edge file_path_obj = Path(file_path) try: if file_path_obj.suffix == ".ipynb": try: - with open(file_path_obj, "r", encoding="utf-8") as file: + with file_path_obj.open(encoding="utf-8") as file: notebook = json.load(file) for cell in notebook.get("cells", []): @@ -191,30 +209,30 @@ def extract_first_heading(file_path: str | Path) -> str: heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) if heading_match: return heading_match.group(1).strip() - except json.JSONDecodeError as e: - logging.error(f"Invalid JSON in notebook {file_path_obj.name}: {e}") - except Exception as e: - logging.error(f"Error reading notebook {file_path_obj.name}: {e}") + except json.JSONDecodeError: + logger.exception("Invalid JSON in notebook %s", file_path_obj.name) + except Exception: + logger.exception("Error reading notebook %s", file_path_obj.name) elif file_path_obj.suffix == ".md": try: - with open(file_path_obj, "r", encoding="utf-8") as file: + with file_path_obj.open(encoding="utf-8") as file: content = file.read() content = remove_yaml_frontmatter(content) heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) if heading_match: return heading_match.group(1).strip() - except Exception as e: - logging.error(f"Error reading markdown {file_path_obj.name}: {e}") + except Exception: + logger.exception("Error reading markdown %s", file_path_obj.name) else: - logging.warning(f"Unsupported file type for heading extraction: {file_path_obj.name}") + logger.warning("Unsupported file type for 
heading extraction: %s", file_path_obj.name) return file_path_obj.stem except FileNotFoundError: - logging.error(f"File not found: {file_path_obj.name}") - except Exception as e: - logging.error(f"Error processing {file_path_obj.name}: {e}") + logger.exception("File not found: %s", file_path_obj.name) + except Exception: + logger.exception("Error processing %s", file_path_obj.name) return file_path_obj.stem @@ -222,43 +240,44 @@ def extract_first_heading(file_path: str | Path) -> str: # ---- Citation Handling ---- -def format_authors_for_bibtex(authors): +def format_authors_for_bibtex(authors: list) -> str: """ Format a list of authors in the proper BibTeX format. Args: authors (list): List of author dictionaries with 'given-names' and 'family-names' - Returns: + Returns + ------- str: Authors formatted for BibTeX """ try: if not authors: - logging.warning("No authors provided to format_authors_for_bibtex") + logger.warning("No authors provided to format_authors_for_bibtex") return "" formatted_authors = [] for i, author in enumerate(authors): if not isinstance(author, dict): - logging.warning(f"Author at index {i} is not a dictionary: {author}") + logger.warning("Author at index %s is not a dictionary: %s", i, author) continue family = author.get("family-names", "") given = author.get("given-names", "") if not family and not given: - logging.warning(f"Author at index {i} is missing both family-names and given-names") + logger.warning("Author at index %s is missing both family-names and given-names", i) continue formatted_authors.append(f"{family}, {given}") return " and ".join(formatted_authors) - except Exception as e: - logging.exception(f"Error formatting authors for BibTeX: {e}") + except Exception: + logger.exception("Error formatting authors for BibTeX") return "" -def generate_citation_key(authors, title, year): +def generate_citation_key(authors: list, title: str, year: str) -> str: """ Generate a citation key for BibTeX. 
@@ -267,7 +286,8 @@ def generate_citation_key(authors, title, year): title (str): Title of the work year (str): Year of publication - Returns: + Returns + ------- str: Citation key """ try: @@ -277,7 +297,7 @@ def generate_citation_key(authors, title, year): family_name = first_author.get("family-names", "Unknown") else: family_name = "Unknown" - logging.warning("No valid authors provided for citation key generation") + logger.warning("No valid authors provided for citation key generation") # Get the first word of the title or use 'Untitled' if title and isinstance(title, str): @@ -285,11 +305,11 @@ def generate_citation_key(authors, title, year): first_word = title_words[0] if title_words else "Untitled" else: first_word = "Untitled" - logging.warning("No valid title provided for citation key generation") + logger.warning("No valid title provided for citation key generation") # Use the year or empty string if not year: - logging.warning("No year provided for citation key generation") + logger.warning("No year provided for citation key generation") year = "" # Create a citation key with no invalid characters @@ -297,17 +317,17 @@ def generate_citation_key(authors, title, year): # Clean the key - remove special characters clean_key = re.sub(r"[^a-zA-Z0-9_]", "", raw_key) - - return clean_key or "Unknown_Citation" - except Exception as e: - logging.exception(f"Error generating citation key: {e}") + except Exception: + logger.exception("Error generating citation key") return "Unknown_Citation_Error" + else: + return clean_key or "Unknown_Citation" # ---- Keyword Handling ---- -def extract_keywords(keywords_data): +def extract_keywords(keywords_data: list | None) -> list: """ Extract keywords from various formats. 
@@ -319,15 +339,13 @@ def extract_keywords(keywords_data): Args: keywords_data: Keywords in various formats - Returns: + Returns + ------- list: List of keyword strings """ if not keywords_data: return [] - if not isinstance(keywords_data, list): - return [] - keywords = [] for item in keywords_data: if isinstance(item, str): @@ -336,10 +354,8 @@ def extract_keywords(keywords_data): elif isinstance(item, dict): # Dictionary format with language codes # Extract all values from the dictionary (should be only one per item) - for lang_code, keyword in item.items(): - if keyword: - keywords.append(str(keyword)) + keywords.extend(str(keyword) for keyword in item.values() if keyword) else: - logging.warning(f"Unexpected keyword format: {item}") + logger.warning("Unexpected keyword format: %s", item) return keywords diff --git a/quadriga/metadata/validate_schema.py b/quadriga/metadata/validate_schema.py new file mode 100644 index 000000000..71cd9a833 --- /dev/null +++ b/quadriga/metadata/validate_schema.py @@ -0,0 +1,119 @@ +"""Validate metadata.yml against the QUADRIGA JSON Schema. + +This module fetches the QUADRIGA schema (and referenced sub-schemas) from the +remote URL and validates a metadata dictionary against it. +""" + +from __future__ import annotations + +import json +import logging +import urllib.request + +from quadriga.metadata.utils import get_file_path, load_yaml_file + +logger = logging.getLogger(__name__) + +QUADRIGA_SCHEMA_URL = ( + "https://quadriga-dk.github.io/quadriga-schema/latest/schema.json" +) + + +def _fetch_json(url: str) -> dict: + """Fetch a JSON document from a URL. 
+ + Args: + url: URL to fetch + + Returns + ------- + dict: Parsed JSON content + + Raises + ------ + urllib.error.URLError: If the URL cannot be reached + json.JSONDecodeError: If the response is not valid JSON + """ + with urllib.request.urlopen(url, timeout=30) as resp: # noqa: S310 + return json.loads(resp.read()) + + +def _validate_metadata( + metadata: dict, schema_url: str = QUADRIGA_SCHEMA_URL +) -> tuple[bool, list[str]]: + """Validate a metadata dictionary against the QUADRIGA JSON Schema. + + Fetches the schema (and any ``$ref`` sub-schemas) fresh from the given URL. + + Args: + metadata: Metadata dictionary (e.g. parsed from metadata.yml) + schema_url: URL of the main QUADRIGA schema + + Returns + ------- + tuple[bool, list[str]]: ``(True, [])`` when valid, or + ``(False, [error_message, ...])`` when validation fails or the schema + cannot be fetched. + """ + try: + from jsonschema import Draft202012Validator + from referencing import Registry, Resource + from referencing.jsonschema import DRAFT202012 + except ImportError: + logger.warning( + "jsonschema package not installed – skipping schema validation. 
" + "Install it via: pip install jsonschema" + ) + return True, [] + + try: + logger.info("Fetching QUADRIGA schema from %s ...", schema_url) + main_schema = _fetch_json(schema_url) + except Exception: + logger.exception("Failed to fetch schema from %s", schema_url) + return False, [f"Could not fetch schema from {schema_url}"] + + def retrieve(uri: str) -> Resource: + data = _fetch_json(uri) + return Resource.from_contents(data, default_specification=DRAFT202012) + + try: + registry: Registry = Registry(retrieve=retrieve) + validator = Draft202012Validator(main_schema, registry=registry) + errors = list(validator.iter_errors(metadata)) + except Exception: + logger.exception("Error during schema validation") + return False, ["Unexpected error during schema validation"] + + if errors: + messages = [] + for err in errors: + path = err.json_path if err.json_path != "$" else "(root)" + messages.append(f"{path}: {err.message}") + return False, messages + + return True, [] + + +def validate_schema() -> bool: + """Load metadata.yml and validate it against the QUADRIGA schema. + + Returns + ------- + bool: True if validation passed, False otherwise. + """ + metadata_path = get_file_path("metadata.yml") + metadata = load_yaml_file(metadata_path) + if metadata is None: + logger.error("Could not load metadata.yml for validation.") + return False + + valid, errors = _validate_metadata(metadata) + if valid: + logger.info("Schema validation passed.") + return True + + logger.error("Schema validation failed with %d error(s):", len(errors)) + for i, error in enumerate(errors, 1): + logger.error(" %d. %s", i, error) + return False