diff --git a/.github/workflows/update-metadata.yml b/.github/workflows/update-metadata.yml
index c0d907d..5e49502 100644
--- a/.github/workflows/update-metadata.yml
+++ b/.github/workflows/update-metadata.yml
@@ -20,35 +20,109 @@ jobs:
         uses: actions/checkout@v4
         with:
           ref: ${{ github.ref }}
+          # Fetch the full history instead of a shallow clone
+          fetch-depth: 0
+
+      - name: Extract version from tag (if triggered by tag)
+        id: extract_version
+        run: |
+          if [[ "$GITHUB_REF" == refs/tags/v* ]]; then
+            TAG_NAME=${GITHUB_REF#refs/tags/}
+            # Remove 'v' prefix if present (v1.0.0 -> 1.0.0)
+            VERSION=${TAG_NAME#v}
+            echo "version=$VERSION" >> $GITHUB_OUTPUT
+            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
+            echo "is_tag=true" >> $GITHUB_OUTPUT
+            echo "Triggered by version tag: $TAG_NAME (version: $VERSION)"
+          else
+            echo "is_tag=false" >> $GITHUB_OUTPUT
+            echo "Triggered by regular push to: $GITHUB_REF"
+          fi
 
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
           python-version-file: '.python-version'
           cache: pip
-    
+
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install pyyaml
-    
+          pip install -r dev-requirements.txt
+
+      - name: Update version metadata (if triggered by tag)
+        if: steps.extract_version.outputs.is_tag == 'true'
+        env:
+          TAG_VERSION: ${{ steps.extract_version.outputs.version }}
+        run: |
+          python -m quadriga.metadata.update_version_from_tag
+
       - name: Update metadata files
+        env:
+          # Disables Python hash randomization (presumably so generated output is stable between runs)
+          PYTHONHASHSEED: '0'
         run: python -m quadriga.metadata.run_all
-    
-      - name: Check if files changed
+
+      - name: Stage metadata files
+        run: |
+          # Add all metadata files that exist (handles both new and modified files)
+          for file in metadata.yml CITATION.bib CITATION.cff .zenodo.json metadata.jsonld metadata.rdf; do
+            # An explicit `if` keeps the loop's exit status at 0 when a file is
+            # missing; `[ -f ... ] && git add` would leave status 1 and fail the
+            # step whenever the last listed file does not exist.
+            if [ -f "$file" ]; then
+              git add "$file"
+            fi
+          done
+
+      - name: Check if files staged
         id: check_changes
         run: |
-          if git diff --quiet metadata.yml && git diff --quiet CITATION.bib && git diff --quiet CITATION.cff; then
+          if git diff --cached --quiet; then
             echo "changes_detected=false" >> $GITHUB_OUTPUT
           else
             echo "changes_detected=true" >> $GITHUB_OUTPUT
           fi
-    
-      - name: Commit changes if necessary
-        if: steps.check_changes.outputs.changes_detected == 'true'
+
+      - name: Commit changes (regular push)
+        if: steps.check_changes.outputs.changes_detected == 'true' && steps.extract_version.outputs.is_tag == 'false'
         run: |
           git config --local user.email "github-actions[bot]@users.noreply.github.com"
           git config --local user.name "github-actions[bot]"
-          git add metadata.yml CITATION.bib CITATION.cff
           git commit -m "[Automated] Update metadata files"
-          git push
\ No newline at end of file
+          git push
+
+      - name: Commit changes and move tag (tag-triggered)
+        if: steps.check_changes.outputs.changes_detected == 'true' && steps.extract_version.outputs.is_tag == 'true'
+        env:
+          # Step outputs are passed through the environment instead of being expanded
+          # inside the script, so a tag name is never parsed as shell code.
+          TAG_NAME: ${{ steps.extract_version.outputs.tag_name }}
+          TAG_VERSION: ${{ steps.extract_version.outputs.version }}
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git commit -m "[Automated] Update metadata for version $TAG_VERSION"
+
+          # Push the metadata commit first: if this is rejected, the step aborts
+          # (default `bash -e`) while the original tag is still intact on the remote.
+          git push origin HEAD:main
+
+          # Delete the old tag (locally and remotely)
+          git tag -d "$TAG_NAME"
+          git push origin ":refs/tags/$TAG_NAME"
+
+          # Create new tag at the current commit (with updated metadata) and publish it
+          git tag "$TAG_NAME"
+          git push origin "refs/tags/$TAG_NAME"
+
+          echo "Tag $TAG_NAME moved to commit with updated metadata"
+
+      - name: No changes needed
+        if: steps.check_changes.outputs.changes_detected == 'false'
+        run: |
+          if [[ "${{ steps.extract_version.outputs.is_tag }}" == "true" ]]; then
+            echo "Metadata already matches the tag version - no changes needed"
+          else
+            echo "No metadata changes detected"
+          fi
diff --git a/.zenodo.json b/.zenodo.json index 51862d3..c6e9cdb 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -22,7 +22,7 @@ "affiliation": "Fraunhofer-Institut für Offene 
Kommunikationssysteme FOKUS" } ], - "description": "Diese Fallstudie geht auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen ein. Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität und Nachvollziehbarkeit von Analysen behandelt.", + "description": "

Diese Fallstudie bildet mit Hilfe eines JupyterBooks einen Forschungsverlauf in der Verwaltungswissenschaft nach. Dabei wird anhand einer modellhaften Forschungsfrage auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen eingegangen. Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität und Nachvollziehbarkeit von Analysen behandelt.

\n

Das interaktive Lehrbuch kann als Web-Version verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch auf GitHub zur Verfügung.

\n

Die QUADRIGA-OER sind nach einem einheitlichen Template gestaltet, werden nach einem standardisierten Verfahren qualitätsgeprüft und mit Metadaten ausgezeichnet.

\n
QUADRIGA Datenkompetenzzentrum
\n

QUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um den Forschungsprozess möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes Datenkompetenzframework zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentative Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.

\n

Zu den Zielgruppen von QUADRIGA zählen insbesondere promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.

\n

QUADRIGA ist eins von 11 Datenkompetenzzentren in Deutschland und wird vom Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR) und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartnern zählen:\n

\n

\n\n

Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im Umsetzungskonzept erfahren.

\n\n

Weitere Informationen sowie Publikationen finden Sie auf der Webseite, in der Zenodo-Community und der GitHub-Organisation des Projekts.

\n", "publication_date": "2025-03-24", "keywords": [ "Verwaltung", @@ -37,7 +37,7 @@ "Reproduzierbarkeit", "R (Programmiersprache)" ], - "license": "CC-BY-SA-4.0", + "license": "CC BY 4.0", "language": "deu", "contributors": [ { diff --git a/CITATION.bib b/CITATION.bib index b4558da..f908f0f 100644 --- a/CITATION.bib +++ b/CITATION.bib @@ -4,7 +4,6 @@ @misc{Plomin_Reproduzierbarkeit_2025 year = {2025}, version = {1.0.0-beta.2}, note = {Repository: https://github.com/quadriga-dk/Tabelle-Fallstudie-1}, - version = {1.0.0-beta.2}, howpublished = {Available from: https://github.com/quadriga-dk/Tabelle-Fallstudie-1}, doi = {10.5281/zenodo.14975202}, url = {https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/0_Intro.html}, diff --git a/CITATION.cff b/CITATION.cff index ec6dc12..5b15ac3 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,9 +3,11 @@ cff-version: 1.2.0 version: 1.0.0-beta.2 title: 'Reproduzierbarkeit von Datenanalysen: Ein Fallbeispiel aus dem Nationalen Bildungsbericht. QUADRIGA Open Educational Resources: Tabelle 1' -abstract: Diese Fallstudie geht auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen - ein. Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von - Datenqualität und Nachvollziehbarkeit von Analysen behandelt. +abstract: Diese Fallstudie bildet mit Hilfe eines JupyterBooks einen Forschungsverlauf + in der Verwaltungswissenschaft nach. Dabei wird anhand einer modellhaften Forschungsfrage + auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen eingegangen. Dazu + werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität + und Nachvollziehbarkeit von Analysen behandelt. type: software message: Please cite this software using the metadata from `preferred-citation` in `CITATION.cff`. 
diff --git a/dev-requirements.txt b/dev-requirements.txt index 4818cc5..2d753e8 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1 +1,3 @@ -pyyaml \ No newline at end of file +pyyaml +jsonschema +rdflib diff --git a/metadata.jsonld b/metadata.jsonld new file mode 100644 index 0000000..ed42d01 --- /dev/null +++ b/metadata.jsonld @@ -0,0 +1,523 @@ +{ + "@context": { + "schema": "http://schema.org/", + "dc": "http://purl.org/dc/elements/1.1/", + "dcterms": "http://purl.org/dc/terms/", + "lrmi": "http://purl.org/dcx/lrmi-terms/", + "skos": "http://www.w3.org/2004/02/skos/core#", + "@vocab": "http://schema.org/" + }, + "@type": [ + "Book", + "LearningResource" + ], + "name": "Reproduzierbarkeit von Datenanalysen: Ein Fallbeispiel aus dem Nationalen Bildungsbericht. QUADRIGA Open Educational Resources: Tabelle 1", + "description": "Diese Fallstudie bildet mit Hilfe eines JupyterBooks einen Forschungsverlauf in der Verwaltungswissenschaft nach. Dabei wird anhand einer modellhaften Forschungsfrage auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen eingegangen. 
Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität und Nachvollziehbarkeit von Analysen behandelt.", + "identifier": { + "@type": "PropertyValue", + "propertyID": "DOI", + "value": "10.5281/zenodo.14975202", + "url": "https://doi.org/10.5281/zenodo.14975202" + }, + "version": "1.0.0-beta.2", + "schemaVersion": "1.0.0", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/0_Intro.html", + "workExample": { + "@type": "SoftwareSourceCode", + "name": "Source Code Repository", + "codeRepository": "https://github.com/quadriga-dk/Tabelle-Fallstudie-1" + }, + "datePublished": "2024-06-13", + "dateModified": "2025-03-24", + "author": [ + { + "@type": "Person", + "givenName": "Jana", + "familyName": "Plomin", + "name": "Jana Plomin", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-0127-7493", + "url": "https://orcid.org/0000-0003-0127-7493" + }, + "affiliation": { + "@type": "Organization", + "name": "Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS" + } + }, + { + "@type": "Person", + "givenName": "Juliane", + "familyName": "Schmeling", + "name": "Juliane Schmeling", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0009-0005-9814-1139", + "url": "https://orcid.org/0009-0005-9814-1139" + }, + "affiliation": { + "@type": "Organization", + "name": "Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS" + } + }, + { + "@type": "Person", + "givenName": "Paul", + "familyName": "Walter", + "name": "Paul Walter", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-9236-3268", + "url": "https://orcid.org/0000-0002-9236-3268" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Anton", + "familyName": "Schulze", + "name": "Anton Schulze", + "affiliation": { + "@type": "Organization", + "name": "Fraunhofer-Institut 
für Offene Kommunikationssysteme FOKUS" + } + } + ], + "contributor": [ + { + "@type": "Person", + "givenName": "Hannes", + "familyName": "Schnaitter", + "name": "Hannes Schnaitter", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-1602-6032", + "url": "https://orcid.org/0000-0002-1602-6032" + }, + "affiliation": { + "@type": "Organization", + "name": "Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft" + } + }, + { + "@type": "Person", + "givenName": "Evgenia", + "familyName": "Samoilova", + "name": "Evgenia Samoilova", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-3858-901X", + "url": "https://orcid.org/0000-0003-3858-901X" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Carsten", + "familyName": "Schneemann", + "name": "Carsten Schneemann", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-2683-5853", + "url": "https://orcid.org/0000-0002-2683-5853" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Lamia", + "familyName": "Islam", + "name": "Lamia Islam", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0009-0001-1879-9880", + "url": "https://orcid.org/0009-0001-1879-9880" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Yue", + "familyName": "Zhang", + "name": "Yue Zhang", + "affiliation": { + "@type": "Organization", + "name": "Technische Universität Berlin" + } + }, + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Wiemer", + "name": "Philip Wiemer" + }, + { + "@type": "Person", + "givenName": "Jan", + "familyName": "Bernoth", + "name": "Jan Bernoth", + "identifier": { + "@type": 
"PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-4127-0053", + "url": "https://orcid.org/0000-0002-4127-0053" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Melanie", + "familyName": "Seltmann", + "name": "Melanie Seltmann", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-7588-4395", + "url": "https://orcid.org/0000-0002-7588-4395" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Henny", + "familyName": "Sluyther-Gäthhje", + "name": "Henny Sluyther-Gäthhje", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-2969-3237", + "url": "https://orcid.org/0000-0003-2969-3237" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Vivien", + "familyName": "Petras", + "name": "Vivien Petras", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-8113-1509", + "url": "https://orcid.org/0000-0002-8113-1509" + }, + "affiliation": { + "@type": "Organization", + "name": "Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft" + } + }, + { + "@type": "Person", + "givenName": "Heike", + "familyName": "Neuroth", + "name": "Heike Neuroth", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-3637-3154", + "url": "https://orcid.org/0000-0002-3637-3154" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + } + ], + "inLanguage": "de", + "keywords": [ + "Verwaltung", + "Verwaltungswissenschaft", + "Tabelle", + "Lerneinheit", + "Public Sector", + "Open Educational Resource", + "FAIR-Prinzipien", + "Datenqualität", + "Persistente Identifikatoren", + "Reproduzierbarkeit", + "R 
(Programmiersprache)" + ], + "about": [ + { + "@type": "Thing", + "name": "Verwaltung" + }, + { + "@type": "Thing", + "name": "Verwaltungswissenschaft" + }, + { + "@type": "Thing", + "name": "Tabelle" + }, + { + "@type": "Thing", + "name": "Lerneinheit" + }, + { + "@type": "Thing", + "name": "Public Sector" + }, + { + "@type": "Thing", + "name": "Open Educational Resource" + }, + { + "@type": "Thing", + "name": "FAIR-Prinzipien" + }, + { + "@type": "Thing", + "name": "Datenqualität" + }, + { + "@type": "Thing", + "name": "Persistente Identifikatoren" + }, + { + "@type": "Thing", + "name": "Reproduzierbarkeit" + }, + { + "@type": "Thing", + "name": "R (Programmiersprache)" + }, + { + "@type": "Thing", + "name": "Verwaltungswissenschaften" + }, + { + "@type": "Thing", + "name": "Informationswissenschaft" + }, + { + "@type": "Thing", + "name": "Tabelle" + } + ], + "audience": [ + { + "@type": "Audience", + "audienceType": "Forschende (PostDoc)" + }, + { + "@type": "Audience", + "audienceType": "Forschende (Projektleitung)" + }, + { + "@type": "Audience", + "audienceType": "Promovierende" + }, + { + "@type": "Audience", + "audienceType": "Hochschullehrende" + } + ], + "timeRequired": "PT3H15M", + "license": [ + { + "@type": "CreativeWork", + "name": "Source Code", + "license": "https://opensource.org/licenses/AGPL-3.0" + }, + { + "@type": "CreativeWork", + "name": "Content", + "license": "https://creativecommons.org/licenses/by-sa/4.0/" + } + ], + "hasPart": [ + { + "@type": "LearningResource", + "name": "Datenbasis", + "description": "Dieses Kapitel beschreibt die in dieser Fallstudie genutzten Daten.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/2_Datenbasis.html", + "timeRequired": "PT5M", + "teaches": "Forschungssfrage", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Forschungsfrage kann erläutert und in einen Kontext mit dem Bildungsbericht gesetzt werden.", + "educationalFramework": "QUADRIGA 
Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: 2 Verstehen | Data Flow: nicht anwendbar", + "lrmi:assesses": "nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Datennachnutzung", + "description": "Dieses Kapitel befasst sich mit der Datennachnutzung und den FAIR-Prinzipien.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/05_Datennachnutzung.html", + "timeRequired": "PT15M", + "teaches": "Grundsätze des Datenmanagements", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung der vier FAIR-Prinzipien kann anhand eines konkreten Beispiels erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: Orientierungswissen | Bloom's: 2 Verstehen | Data Flow: übergreifend", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die FAIRness eines gegebenen Datensatzes kann anhand einer Checkliste bewertet werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: Orientierungswissen | Bloom's: 3 Anwenden | Data Flow: übergreifend", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Qualitätsbewertung", + "description": "In diesem Kapitel stehen die Kriterien der Qualitätsbewertung im Mittelpunkt.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Qualit%C3%A4tsbewertung.html", + "timeRequired": "PT30M", + "teaches": "Sicherstellen der Qualität von Datensätzen", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung zentraler Qualitätskriterien für Datensätze kann für die Forschung erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.2 Qualitätssicherung | Bloom's: 2 Verstehen | Data Flow: 1 Planung", + 
"lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Vor- und Nachteile zwischen XLSX- und CSV-Dateiformaten können für verschiedene Anwendungsfälle aufgezeigt werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.2 Qualitätssicherung | Bloom's: 2 Verstehen | Data Flow: 1 Planung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Qualität eines gegebenen Datensatzes kann anhand einer Checkliste bewertet werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.2 Qualitätssicherung | Bloom's: 3 Anwenden | Data Flow: 1 Planung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Identifikatoren", + "description": "Dieses Kapitel behandelt Identifikatoren der eindeutigen und persistenten Auszeichnung von Daten sowie das Zitieren von Forschungsdaten.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Identifikatoren.html", + "timeRequired": "PT25M", + "teaches": "Datenzitierung und PID", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die allgemein anerkannten Methoden der Datenzitierung können beschrieben werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.3 Ethik und Recht | Bloom's: 2 Verstehen | Data Flow: 1 Planung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Rolle von Persistenten Identifikatoren (PIDs) in der Datenzitierung kann erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 5.2 Datenpublikation | Bloom's: 2 Verstehen | Data Flow: 5 Publikation und Nachnutzung", + "lrmi:assesses": "Multiple-Choice-Quiz, 
Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die spezifischen Einsatzbereiche verschiedener Arten von PIDs (z.B. DOI, ORCID) können erklärt werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 5.2 Datenpublikation | Bloom's: 2 Verstehen | Data Flow: 5 Publikation und Nachnutzung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Datenaufbereitung I - Organisation und Strukturierung", + "description": "Dieses Kapitel widmet sich der Datenmanipulation als ein entscheidender Bestandteil, um die Qualität und den Aufbau von Datensätzen zu evaluieren und zu verbessern.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Datenmanipulation1.html", + "timeRequired": "PT60M", + "teaches": "Datenaufbereitung und -strukturierung", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung von Datenaufbereitung kann erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 2.2 Validierung | Bloom's: 2 Verstehen | Data Flow: 2 Erhebung und Aufbereitung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Anwendung spezifischer Aufbereitungstechniken auf einen gegebenen Datensatz kann beschrieben werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 2.3 Aufbereitung | Bloom's: 2 Verstehen | Data Flow: 2 Erhebung und Aufbereitung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung einer klaren Datenstruktur für effektive Datenaufbereitung kann anhand eines Beispiels demonstriert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 2.3 Aufbereitung | Bloom's: 3 Anwenden | Data 
Flow: 2 Erhebung und Aufbereitung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Datenaufbereitung II - Analyse und Reproduzierbarkeit", + "description": "Dieses Kapitel widmet sich der Datenanalyse und -reproduzierbarkeit mit Hilfe der Programmiersprache R.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Datenmanipulation2.html", + "timeRequired": "PT60M", + "teaches": "Datenanalyse und -reproduzierbarkeit", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Wichtigkeit einer sorgfältigen Dokumentation bei der Durchführung einer deskriptiven Analyse kann beschrieben werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 4.1 Datenanalyse | Bloom's: 2 Verstehen | Data Flow: 4 Analyse", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Anhand eines konkreten Beispiels (aus der Fallstudie zur Reproduzierbarkeit) können mindestens drei für die Reproduzierbarkeit besonders relevante Aspekte erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 4.1 Datenanalyse | Bloom's: 2 Verstehen | Data Flow: 4 Analyse", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + } + ], + "dcterms:tableOfContents": "- Präambel\n- Datenbasis: Nationaler Bildungsbericht\n- Datennachnutzung\n- Qualitätsbewertung\n- Identifikatoren\n- Datenaufbereitung I - Organisation und Strukturierung\n- Datenaufbereitung II - Analyse und Reproduzierbarkeit\n- Zusammenfassung und Reflexion\n- Epilog", + "funding": "Die vorliegenden Open Educational Resources wurden durch das Datenkompetenzzentrum QUADRIGA erstellt.\n\nFörderkennzeichen: 16DKZ2034", + "learningResourceType": "Jupyter Book", + "lrmi:learningResourceType": "Jupyter Book", + "dcterms:type": "Jupyter Book", + "dc:type": "Jupyter 
Book" +} \ No newline at end of file diff --git a/metadata.rdf b/metadata.rdf new file mode 100644 index 0000000..3ee67da --- /dev/null +++ b/metadata.rdf @@ -0,0 +1,542 @@ + + + + nicht anwendbar + QUADRIGA Competency Framework + Competency: nicht anwendbar | Bloom's: 2 Verstehen | Data Flow: nicht anwendbar + Die Forschungsfrage kann erläutert und in einen Kontext mit dem Bildungsbericht gesetzt werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: Orientierungswissen | Bloom's: 2 Verstehen | Data Flow: übergreifend + Die Bedeutung der vier FAIR-Prinzipien kann anhand eines konkreten Beispiels erläutert werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: Orientierungswissen | Bloom's: 3 Anwenden | Data Flow: übergreifend + Die FAIRness eines gegebenen Datensatzes kann anhand einer Checkliste bewertet werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 1.2 Qualitätssicherung | Bloom's: 2 Verstehen | Data Flow: 1 Planung + Die Bedeutung zentraler Qualitätskriterien für Datensätze kann für die Forschung erläutert werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 1.2 Qualitätssicherung | Bloom's: 2 Verstehen | Data Flow: 1 Planung + Die Vor- und Nachteile zwischen XLSX- und CSV-Dateiformaten können für verschiedene Anwendungsfälle aufgezeigt werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 1.2 Qualitätssicherung | Bloom's: 3 Anwenden | Data Flow: 1 Planung + Die Qualität eines gegebenen Datensatzes kann anhand einer Checkliste bewertet werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 1.3 Ethik und Recht | Bloom's: 2 Verstehen | Data Flow: 1 Planung + Die allgemein anerkannten Methoden der Datenzitierung können beschrieben werden. 
+ + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 5.2 Datenpublikation | Bloom's: 2 Verstehen | Data Flow: 5 Publikation und Nachnutzung + Die Rolle von Persistenten Identifikatoren (PIDs) in der Datenzitierung kann erläutert werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 5.2 Datenpublikation | Bloom's: 2 Verstehen | Data Flow: 5 Publikation und Nachnutzung + Die spezifischen Einsatzbereiche verschiedener Arten von PIDs (z.B. DOI, ORCID) können erklärt werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 2.2 Validierung | Bloom's: 2 Verstehen | Data Flow: 2 Erhebung und Aufbereitung + Die Bedeutung von Datenaufbereitung kann erläutert werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 2.3 Aufbereitung | Bloom's: 2 Verstehen | Data Flow: 2 Erhebung und Aufbereitung + Die Anwendung spezifischer Aufbereitungstechniken auf einen gegebenen Datensatz kann beschrieben werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 2.3 Aufbereitung | Bloom's: 3 Anwenden | Data Flow: 2 Erhebung und Aufbereitung + Die Bedeutung einer klaren Datenstruktur für effektive Datenaufbereitung kann anhand eines Beispiels demonstriert werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 4.1 Datenanalyse | Bloom's: 2 Verstehen | Data Flow: 4 Analyse + Die Wichtigkeit einer sorgfältigen Dokumentation bei der Durchführung einer deskriptiven Analyse kann beschrieben werden. + + + Multiple-Choice-Quiz, Reflexionsfragen + QUADRIGA Competency Framework + Competency: 4.1 Datenanalyse | Bloom's: 2 Verstehen | Data Flow: 4 Analyse + Anhand eines konkreten Beispiels (aus der Fallstudie zur Reproduzierbarkeit) können mindestens drei für die Reproduzierbarkeit besonders relevante Aspekte erläutert werden. 
+ + + Jupyter Book + - Präambel +- Datenbasis: Nationaler Bildungsbericht +- Datennachnutzung +- Qualitätsbewertung +- Identifikatoren +- Datenaufbereitung I - Organisation und Strukturierung +- Datenaufbereitung II - Analyse und Reproduzierbarkeit +- Zusammenfassung und Reflexion +- Epilog + Jupyter Book + Jupyter Book + + + Verwaltung + + + + + Verwaltungswissenschaft + + + + + Tabelle + + + + + Lerneinheit + + + + + Public Sector + + + + + Open Educational Resource + + + + + FAIR-Prinzipien + + + + + Datenqualität + + + + + Persistente Identifikatoren + + + + + Reproduzierbarkeit + + + + + R (Programmiersprache) + + + + + Verwaltungswissenschaften + + + + + Informationswissenschaft + + + + + Tabelle + + + + + Forschende (PostDoc) + + + + + Forschende (Projektleitung) + + + + + Promovierende + + + + + Hochschullehrende + + + + + + Plomin + Jana + + Jana Plomin + + + + + + Schmeling + Juliane + + Juliane Schmeling + + + + + + Walter + Paul + + Paul Walter + + + + + + Schulze + Anton + Anton Schulze + + + + + + Schnaitter + Hannes + + Hannes Schnaitter + + + + + + Samoilova + Evgenia + + Evgenia Samoilova + + + + + + Schneemann + Carsten + + Carsten Schneemann + + + + + + Islam + Lamia + + Lamia Islam + + + + + + Zhang + Yue + Yue Zhang + + + + + Wiemer + Philip + Philip Wiemer + + + + + + Bernoth + Jan + + Jan Bernoth + + + + + + Seltmann + Melanie + + Melanie Seltmann + + + + + + Sluyther-Gäthhje + Henny + + Henny Sluyther-Gäthhje + + + + + + Petras + Vivien + + Vivien Petras + + + + + + Neuroth + Heike + + Heike Neuroth + + + 2025-03-24 + 2024-06-13 + Diese Fallstudie bildet mit Hilfe eines JupyterBooks einen Forschungsverlauf in der Verwaltungswissenschaft nach. Dabei wird anhand einer modellhaften Forschungsfrage auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen eingegangen. Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität und Nachvollziehbarkeit von Analysen behandelt. 
+ Die vorliegenden Open Educational Resources wurden durch das Datenkompetenzzentrum QUADRIGA erstellt. + +Förderkennzeichen: 16DKZ2034 + + + Dieses Kapitel beschreibt die in dieser Fallstudie genutzten Daten. + + Datenbasis + Forschungssfrage + PT5M + + + + + + Dieses Kapitel befasst sich mit der Datennachnutzung und den FAIR-Prinzipien. + + + Datennachnutzung + Grundsätze des Datenmanagements + PT15M + + + + + + In diesem Kapitel stehen die Kriterien der Qualitätsbewertung im Mittelpunkt. + + + + Qualitätsbewertung + Sicherstellen der Qualität von Datensätzen + PT30M + + + + + + Dieses Kapitel behandelt Identifikatoren der eindeutigen und persistenten Auszeichnung von Daten sowie das Zitieren von Forschungsdaten. + + + + Identifikatoren + Datenzitierung und PID + PT25M + + + + + + Dieses Kapitel widmet sich der Datenmanipulation als ein entscheidender Bestandteil, um die Qualität und den Aufbau von Datensätzen zu evaluieren und zu verbessern. + + + + Datenaufbereitung I - Organisation und Strukturierung + Datenaufbereitung und -strukturierung + PT60M + + + + + + Dieses Kapitel widmet sich der Datenanalyse und -reproduzierbarkeit mit Hilfe der Programmiersprache R. + + + Datenaufbereitung II - Analyse und Reproduzierbarkeit + Datenanalyse und -reproduzierbarkeit + PT60M + + + + + + DOI + + 10.5281/zenodo.14975202 + + + de + Datenqualität + FAIR-Prinzipien + Lerneinheit + Open Educational Resource + Persistente Identifikatoren + Public Sector + R (Programmiersprache) + Reproduzierbarkeit + Tabelle + Verwaltung + Verwaltungswissenschaft + Jupyter Book + + + + Source Code + + + + + + Content + + + Reproduzierbarkeit von Datenanalysen: Ein Fallbeispiel aus dem Nationalen Bildungsbericht. 
QUADRIGA Open Educational Resources: Tabelle 1 + 1.0.0 + PT3H15M + + 1.0.0-beta.2 + + + + Source Code Repository + + + + + + Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS + + + Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS + + + Fachhochschule Potsdam + + + Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS + + + Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft + + + Fachhochschule Potsdam + + + Universität Potsdam + + + Fachhochschule Potsdam + + + Universität Potsdam + + + Technische Universität Berlin + + + Universität Potsdam + + + Fachhochschule Potsdam + + + Universität Potsdam + + + Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft + + + ORCID + + 0000-0003-0127-7493 + + + ORCID + + 0009-0005-9814-1139 + + + ORCID + + 0000-0002-9236-3268 + + + ORCID + + 0000-0002-1602-6032 + + + ORCID + + 0000-0002-3637-3154 + + + ORCID + + 0000-0003-3858-901X + + + ORCID + + 0000-0002-2683-5853 + + + ORCID + + 0009-0001-1879-9880 + + + ORCID + + 0000-0002-4127-0053 + + + ORCID + + 0000-0002-7588-4395 + + + ORCID + + 0000-0003-2969-3237 + + + ORCID + + 0000-0002-8113-1509 + + diff --git a/metadata.yml b/metadata.yml index 87aa59b..1cd0ed6 100644 --- a/metadata.yml +++ b/metadata.yml @@ -155,8 +155,9 @@ contributors: identifier: https://doi.org/10.5281/zenodo.14975202 git: https://github.com/quadriga-dk/Tabelle-Fallstudie-1 url: https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/0_Intro.html -prerequisites: Ein Grundverständnis für statistische Software bzw. für das Programmieren - wird empfohlen. Ansonsten sind keine Vorkenntnisse nötig. +prerequisites: +- Ein Grundverständnis für statistische Software bzw. für das Programmieren wird empfohlen. + Ansonsten sind keine Vorkenntnisse nötig. 
used-tools: - name: R (Programmiersprache) url: https://cran.rstudio.com/ @@ -205,19 +206,19 @@ chapters: learning-objectives: - learning-objective: Die Bedeutung zentraler Qualitätskriterien für Datensätze kann für die Forschung erläutert werden. - competency: 2 Qualitätssicherung + competency: 1.2 Qualitätssicherung data-flow: 1 Planung blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen - learning-objective: Die Vor- und Nachteile zwischen XLSX- und CSV-Dateiformaten können für verschiedene Anwendungsfälle aufgezeigt werden. - competency: 2 Qualitätssicherung + competency: 1.2 Qualitätssicherung data-flow: 1 Planung blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen - learning-objective: Die Qualität eines gegebenen Datensatzes kann anhand einer Checkliste bewertet werden. - competency: 2 Qualitätssicherung + competency: 1.2 Qualitätssicherung data-flow: 1 Planung blooms-category: 3 Anwenden assessment: Multiple-Choice-Quiz, Reflexionsfragen @@ -233,19 +234,19 @@ chapters: learning-objectives: - learning-objective: Die allgemein anerkannten Methoden der Datenzitierung können beschrieben werden. - competency: 3 Ethik und Recht + competency: 1.3 Ethik und Recht data-flow: 1 Planung blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen - learning-objective: Die Rolle von Persistenten Identifikatoren (PIDs) in der Datenzitierung kann erläutert werden. - competency: 13 Datenpublikation + competency: 5.2 Datenpublikation data-flow: 5 Publikation und Nachnutzung blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen - learning-objective: Die spezifischen Einsatzbereiche verschiedener Arten von PIDs (z.B. DOI, ORCID) können erklärt werden. 
- competency: 13 Datenpublikation + competency: 5.2 Datenpublikation data-flow: 5 Publikation und Nachnutzung blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen @@ -261,19 +262,19 @@ chapters: learning-goal: Datenaufbereitung und -strukturierung learning-objectives: - learning-objective: Die Bedeutung von Datenaufbereitung kann erläutert werden. - competency: 5 Validierung + competency: 2.2 Validierung data-flow: 2 Erhebung und Aufbereitung blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen - learning-objective: Die Anwendung spezifischer Aufbereitungstechniken auf einen gegebenen Datensatz kann beschrieben werden. - competency: 6 Aufbereitung + competency: 2.3 Aufbereitung data-flow: 2 Erhebung und Aufbereitung blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen - learning-objective: Die Bedeutung einer klaren Datenstruktur für effektive Datenaufbereitung kann anhand eines Beispiels demonstriert werden. - competency: 6 Aufbereitung + competency: 2.3 Aufbereitung data-flow: 2 Erhebung und Aufbereitung blooms-category: 3 Anwenden assessment: Multiple-Choice-Quiz, Reflexionsfragen @@ -288,27 +289,30 @@ chapters: learning-objectives: - learning-objective: Die Wichtigkeit einer sorgfältigen Dokumentation bei der Durchführung einer deskriptiven Analyse kann beschrieben werden. - competency: 9 Datenanalyse + competency: 4.1 Datenanalyse data-flow: 4 Analyse blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen - learning-objective: Anhand eines konkreten Beispiels (aus der Fallstudie zur Reproduzierbarkeit) können mindestens drei für die Reproduzierbarkeit besonders relevante Aspekte erläutert werden. 
- competency: 9 Datenanalyse + competency: 4.1 Datenanalyse data-flow: 4 Analyse blooms-category: 2 Verstehen assessment: Multiple-Choice-Quiz, Reflexionsfragen supplemented-by: - https://www-genesis.destatis.de/datenbank/online/statistic/21341/table/21341-0001/search/s/cGVyc29uYWwlMjBob2Noc2NodWxl -date-issued: 2024-06-13 -date-modified: 2025-03-24 +date-issued: '2024-06-13' +date-modified: '2025-03-24' version: 1.0.0-beta.2 context-of-creation: 'Die vorliegenden Open Educational Resources wurden durch das - Datenkompetenzzentrum QUADRIGA erstellt. Förderkennzeichen: 16DKZ2034' + Datenkompetenzzentrum QUADRIGA erstellt. + + + Förderkennzeichen: 16DKZ2034' quality-assurance: -- description: DOI des Qualitätsprüfungstextes hinzufügen -- date: 2025-10-01 + description: https://doi.org/TODO + date: '2025-10-01' learning-resource-type: Jupyter Book schema-version: 1.0.0 license: diff --git a/quadriga/assessment.py b/quadriga/assessment.py index 32328c3..bfb29d1 100644 --- a/quadriga/assessment.py +++ b/quadriga/assessment.py @@ -1,4 +1,7 @@ from IPython.display import HTML +import json +import uuid + def create_answer_box(question_id, rows=4): """Create an answer box with a submit button.""" @@ -7,4 +10,126 @@ def create_answer_box(question_id, rows=4): - """) \ No newline at end of file + """) + + +class DragDropQuiz: + """ + A simple drag-and-drop quiz generator for Jupyter Books. + + Usage: + quiz = DragDropQuiz() + quiz.create_matching_quiz( + title="Your Quiz Title", + descriptions=["Description 1", "Description 2", "Description 3"], + options=["Option A", "Option B", "Option C"], + correct_mapping={"Description 1": "Option A", "Description 2": "Option B", "Description 3": "Option C"} + ) + """ + + def __init__(self): + self.quiz_counter = 0 + + def create_matching_quiz(self, title, descriptions, options, correct_mapping, show_feedback=True, feedback_messages=None): + """ + Create a drag-and-drop matching quiz. 
+ + Parameters: + - title (str): The quiz title/question + - descriptions (list): List of items to be matched (static labels) + - options (list): List of draggable options (draggable items) + - correct_mapping (dict): Dictionary mapping descriptions to correct options + - show_feedback (bool): Whether to show feedback after submission + - feedback_messages (dict): Custom feedback messages with keys 'correct', 'incorrect', 'partial' + """ + self.quiz_counter += 1 + quiz_id = f"drag_drop_quiz_{self.quiz_counter}_{uuid.uuid4().hex[:8]}" + + # Set default feedback messages if none provided + if feedback_messages is None: + feedback_messages = { + "correct": "Perfekt! Alle {total} Zuordnungen sind korrekt!", + "incorrect": "Leider sind keine Zuordnungen korrekt. Versuchen Sie es noch einmal!", + "partial": "Teilweise richtig: {correct} von {total} Zuordnungen sind korrekt." + } + + # Convert correct mapping to use indices for easier JavaScript handling + desc_to_idx = {desc: i for i, desc in enumerate(descriptions)} + opt_to_idx = {opt: i for i, opt in enumerate(options)} + + correct_pairs = [] + for desc, opt in correct_mapping.items(): + if desc in desc_to_idx and opt in opt_to_idx: + correct_pairs.append([desc_to_idx[desc], opt_to_idx[opt]]) + + html_content = self._generate_html( + quiz_id, title, descriptions, options, correct_pairs, show_feedback, feedback_messages + ) + + return HTML(html_content) + + def _generate_html(self, quiz_id, title, descriptions, options, correct_pairs, show_feedback, feedback_messages): + """Generate the complete HTML for the drag-and-drop quiz.""" + + # Generate static description labels with drop zones + description_zones = "" + for i, desc in enumerate(descriptions): + description_zones += f''' +
+
{desc}
+
+ Hier ablegen +
+
+ ''' + + # Generate draggable options + draggable_options = "" + for i, option in enumerate(options): + draggable_options += f''' +
+ {option} +
+ ''' + + return f''' +
+
{title}
+ +
+
+ {description_zones} +
+
+ +
+
Ziehen Sie diese zu den passenden Beschreibungen
+
+ {draggable_options} +
+
+ +
+ + +
+ +
+
+ + ''' + + \ No newline at end of file diff --git a/quadriga/metadata/__init__.py b/quadriga/metadata/__init__.py index 8710fba..fc1ab8d 100644 --- a/quadriga/metadata/__init__.py +++ b/quadriga/metadata/__init__.py @@ -7,15 +7,17 @@ __all__ = [ "create_bibtex", - "update_citation_cff", "extract_from_book_config", + "update_citation_cff", "update_version_from_tag", "utils", ] # Import the modules to make their functions available -from . import create_bibtex -from . import update_citation_cff -from . import extract_from_book_config -from . import update_version_from_tag -from . import utils +from . import ( + create_bibtex, + extract_from_book_config, + update_citation_cff, + update_version_from_tag, + utils, +) diff --git a/quadriga/metadata/create_bibtex.py b/quadriga/metadata/create_bibtex.py index 09a083d..aa960ac 100644 --- a/quadriga/metadata/create_bibtex.py +++ b/quadriga/metadata/create_bibtex.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import logging import sys -from pathlib import Path from .utils import ( extract_keywords, @@ -11,6 +12,7 @@ ) logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) # Map CFF types to BibTeX entry types CFF_TO_BIBTEX_TYPES = { @@ -70,14 +72,15 @@ } -def create_bibtex_from_cff(): +def create_bibtex_from_cff() -> bool | None: """ - Creates a CITATION.bib file from CITATION.cff. + Create a CITATION.bib file from CITATION.cff. - It reads citation data, prioritizing the 'preferred-citation' block if available, + Reads citation data, prioritizing the 'preferred-citation' block if available, formats authors, generates a citation key, and constructs a BibTeX entry. - Returns: + Returns + ------- bool: True if successful, False otherwise. 
""" try: @@ -86,28 +89,31 @@ def create_bibtex_from_cff(): repo_root = get_file_path("") # Get repo root citation_cff_path = get_file_path("CITATION.cff", repo_root) citation_bib_path = get_file_path("CITATION.bib", repo_root) - except Exception as e: - logging.error(f"Failed to resolve file paths: {str(e)}") + except Exception: + logger.exception("Failed to resolve file paths") return False # Check if citation_cff_path exists - if not Path(citation_cff_path).exists(): - logging.error(f"CITATION.cff file not found at {citation_cff_path}") + if not citation_cff_path.exists(): + logger.error("CITATION.cff file not found at %s", citation_cff_path) return False # Read CITATION.cff using utility function citation_data = load_yaml_file(citation_cff_path) - if not citation_data: - logging.error(f"Could not load {citation_cff_path}. Exiting.") + if not citation_data or not isinstance(citation_data, dict): + logger.error("Could not load CITATION.cff or invalid format. Exiting.") return False # Extract data from preferred-citation or root if "preferred-citation" in citation_data: - logging.info("Using 'preferred-citation' section from CITATION.cff") + logger.info("Using 'preferred-citation' section from CITATION.cff") pref = citation_data.get("preferred-citation") + if not isinstance(pref, dict): + logger.error("preferred-citation is not a dictionary") + return False else: - logging.info("No 'preferred-citation' section found, using root data") + logger.info("No 'preferred-citation' section found, using root data") pref = citation_data # Validate required fields @@ -116,19 +122,19 @@ def create_bibtex_from_cff(): year = str(pref.get("year", "")) # Ensure year is a string for generate_citation_key if not authors: - logging.warning("No authors found in CITATION.cff") + logger.warning("No authors found in CITATION.cff") if title == "Untitled": - logging.warning("No title found in CITATION.cff, using 'Untitled'") + logger.warning("No title found in CITATION.cff, using 
'Untitled'") if not year: - logging.warning("No year found in CITATION.cff") + logger.warning("No year found in CITATION.cff") # Use utility function to format authors try: author_str = format_authors_for_bibtex(authors) - except Exception as e: - logging.error(f"Error formatting authors: {str(e)}") + except Exception: + logger.exception("Error formatting authors") author_str = "" # Choose entry type based on type field @@ -141,19 +147,19 @@ def create_bibtex_from_cff(): if entry_type == "thesis": # Check for thesis type information thesis_type = pref.get("thesis-type", "").lower() - if thesis_type == "master" or thesis_type == "masters" or thesis_type == "master's": + if thesis_type in {"master", "masters", "master's"}: entry_type = "mastersthesis" else: # Default to phdthesis if type is not specified or is something else entry_type = "phdthesis" - logging.info(f"Converting CFF type '{cff_type}' to BibTeX entry type: {entry_type}") + logger.info("Converting CFF type '%s' to BibTeX entry type: %s", cff_type, entry_type) # Use utility function to generate citation key try: citation_key = generate_citation_key(authors, title, year) - except Exception as e: - logging.error(f"Error generating citation key: {str(e)}") + except Exception: + logger.exception("Error generating citation key") citation_key = "Unknown_Citation_Key" # Compile BibTeX entry @@ -254,8 +260,8 @@ def create_bibtex_from_cff(): try: editor_str = format_authors_for_bibtex(pref["collection-editors"]) bibtex_lines.append(f" editor = {{{editor_str}}},") - except Exception as e: - logging.warning(f"Error formatting collection editors: {str(e)}") + except (KeyError, TypeError, AttributeError) as e: + logger.warning("Error formatting collection editors: %s", e) # Special handling for software, code, data entries if cff_type.lower().startswith("software") or cff_type.lower() in [ @@ -269,9 +275,7 @@ def create_bibtex_from_cff(): if "repository-code" in pref and "note" not in pref: bibtex_lines.append(f" note 
= {{Repository: {pref['repository-code']}}},") - # Add version info - if "version" in pref: - bibtex_lines.append(f" version = {{{pref['version']}}},") + # Note: version is already added in the common fields section above # Add software-specific details as howpublished if not present if ("howpublished" not in pref) and ("repository-code" in pref or "url" in pref): @@ -299,22 +303,22 @@ def create_bibtex_from_cff(): bibtex_lines.append(f" {field:<9} = {{{field_value}}},") # Handle list fields like languages - if "languages" in pref and pref["languages"]: + if pref.get("languages"): try: languages_str = ", ".join(pref["languages"]) bibtex_lines.append(f" language = {{{languages_str}}},") - except Exception as e: - logging.warning(f"Error processing languages field: {str(e)}") + except (TypeError, AttributeError) as e: + logger.warning("Error processing languages field: %s", e) # Handle keywords field - if "keywords" in pref and pref["keywords"]: + if pref.get("keywords"): try: keywords_list = extract_keywords(pref["keywords"]) if keywords_list: keywords_str = ", ".join(keywords_list) bibtex_lines.append(f" keywords = {{{keywords_str}}},") - except Exception as e: - logging.warning(f"Error processing keywords field: {str(e)}") + except (TypeError, AttributeError) as e: + logger.warning("Error processing keywords field: %s", e) # Close the entry bibtex_lines.append("}") @@ -322,16 +326,17 @@ def create_bibtex_from_cff(): # Write to CITATION.bib try: - with open(citation_bib_path, "w", encoding="utf-8") as f: + with citation_bib_path.open("w", encoding="utf-8") as f: f.write(bibtex) - logging.info(f"BibTeX citation successfully created at {citation_bib_path}") - return True - except IOError as e: - logging.error(f"Error writing to {citation_bib_path}: {e}") + except OSError: + logger.exception("Error writing to %s", citation_bib_path) return False + else: + logger.info("BibTeX citation successfully created at %s", citation_bib_path) + return True - except Exception as 
e: - logging.exception(f"Unexpected error in create_bibtex_from_cff: {str(e)}") + except Exception: + logger.exception("Unexpected error in create_bibtex_from_cff") return False diff --git a/quadriga/metadata/create_jsonld.py b/quadriga/metadata/create_jsonld.py new file mode 100644 index 0000000..e7b8017 --- /dev/null +++ b/quadriga/metadata/create_jsonld.py @@ -0,0 +1,595 @@ +""" +Creates a JSON-LD file from metadata.yml using QUADRIGA schema x-mappings. + +This script reads metadata from 'metadata.yml' and transforms it into JSON-LD +format using the x-mappings defined in the QUADRIGA schema. The output follows +Schema.org, Dublin Core, LRMI, and other standard vocabularies. + +The JSON-LD file provides machine-readable linked data that can be consumed by +search engines, digital repositories, and other semantic web applications. +""" + +from __future__ import annotations + +import json +import logging +import sys +from pathlib import Path +from typing import Any + +from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) + + +def build_jsonld_context() -> dict[str, str]: + """ + Build the JSON-LD @context with vocabulary namespaces. + + Returns + ------- + dict: Context dictionary with vocabulary prefixes + """ + return { + "schema": "http://schema.org/", + "dc": "http://purl.org/dc/elements/1.1/", + "dcterms": "http://purl.org/dc/terms/", + "lrmi": "http://purl.org/dcx/lrmi-terms/", + "skos": "http://www.w3.org/2004/02/skos/core#", + "@vocab": "http://schema.org/", + } + + +def clean_orcid(orcid_string: str) -> str | None: + """ + Extract ORCID identifier from an ORCID string or URL. 
+ + Args: + orcid_string (str): ORCID string which may include URL prefix + + Returns + ------- + str: Clean ORCID identifier (e.g., "0000-0002-1602-6032") + """ + if not orcid_string: + return None + + orcid = str(orcid_string) + prefixes = ["https://orcid.org/", "http://orcid.org/", "orcid:"] + for prefix in prefixes: + if orcid.startswith(prefix): + orcid = orcid[len(prefix) :] + break + + return orcid.strip() + + +def clean_doi(doi_string: str) -> str | None: + """ + Extract DOI identifier from a DOI string or URL. + + Args: + doi_string (str): DOI string which may include URL prefix + + Returns + ------- + str: Clean DOI identifier (e.g., "10.5281/zenodo.14970672") + """ + if not doi_string: + return None + + doi = str(doi_string) + prefixes = ["https://doi.org/", "http://doi.org/", "doi:"] + for prefix in prefixes: + if doi.startswith(prefix): + doi = doi[len(prefix) :] + break + + return doi.strip() + + +def transform_person(person_data: Any) -> dict[str, Any]: + """ + Transform author or contributor to Schema.org Person. 
+ + Uses x-mappings: + - author/contributor -> schema:Person + - given-names -> schema:givenName + - family-names -> schema:familyName + - orcid -> schema:identifier + - affiliation -> schema:affiliation + - credit -> not included (no standard schema.org mapping for CRediT roles) + + Args: + person_data (dict): Author or contributor dictionary + + Returns + ------- + dict: Schema.org Person object + """ + if not isinstance(person_data, dict): + logger.warning("Invalid person data: %s", person_data) + return {} + + person: dict[str, Any] = {"@type": "Person"} + + # given-names -> schema:givenName (exactMatch) + if "given-names" in person_data: + person["givenName"] = person_data["given-names"] + + # family-names -> schema:familyName (exactMatch) + if "family-names" in person_data: + person["familyName"] = person_data["family-names"] + + # Construct full name + if "given-names" in person_data or "family-names" in person_data: + given = person_data.get("given-names", "") + family = person_data.get("family-names", "") + person["name"] = f"{given} {family}".strip() + + # orcid -> schema:identifier (exactMatch) + if "orcid" in person_data: + clean_orcid_id = clean_orcid(person_data["orcid"]) + if clean_orcid_id: + person["identifier"] = { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": clean_orcid_id, + "url": f"https://orcid.org/{clean_orcid_id}", + } + + # affiliation -> schema:affiliation (mapped in both author and contributor) + if "affiliation" in person_data: + person["affiliation"] = { + "@type": "Organization", + "name": person_data["affiliation"], + } + + # Note: CRediT roles (credit field) are not included in JSON-LD + # because schema.org does not have a standard property for contributor roles + # on Person objects within author/contributor arrays + + return person + + +def transform_learning_objective(objective_data: Any) -> dict[str, Any]: + """ + Transform learning objective entry to AlignmentObject. 
+ + Uses x-mappings: + - learning-objective -> schema:teaches / lrmi:teaches (closeMatch) + - competency -> maps to modalia:Skill + - blooms-category -> part of educational alignment + - assessment -> lrmi:assesses / schema:assesses (closeMatch) + + Args: + objective_data (dict): Learning objective dictionary + + Returns + ------- + dict: Schema.org AlignmentObject + """ + if not isinstance(objective_data, dict): + return {} + + objective = { + "@type": "AlignmentObject", + } + + # learning-objective text + if "learning-objective" in objective_data: + objective["targetName"] = objective_data["learning-objective"] + + # Add competency framework information if available + if "competency" in objective_data: + objective["educationalFramework"] = "QUADRIGA Competency Framework" + objective["targetDescription"] = f"Competency: {objective_data['competency']}" + + # Add Bloom's taxonomy level if available + if "blooms-category" in objective_data: + if "targetDescription" in objective: + objective["targetDescription"] += f" | Bloom's: {objective_data['blooms-category']}" + else: + objective["targetDescription"] = f"Bloom's: {objective_data['blooms-category']}" + + # Add data flow if available + if "data-flow" in objective_data: + if "targetDescription" in objective: + objective["targetDescription"] += f" | Data Flow: {objective_data['data-flow']}" + else: + objective["targetDescription"] = f"Data Flow: {objective_data['data-flow']}" + + # assessment -> lrmi:assesses / schema:assesses (closeMatch) + if "assessment" in objective_data: + objective["lrmi:assesses"] = objective_data["assessment"] + + return objective + + +def transform_chapter(chapter_data: Any) -> dict[str, Any]: + """ + Transform chapter to Schema.org/LRMI LearningResource. 
+ + Uses x-mappings: + - chapter -> schema:LearningResource / lrmi:LearningResource (exactMatch) + - title -> schema:name (exactMatch) + - description -> schema:description (exactMatch) + - url -> schema:url (exactMatch) + - time-required -> schema:timeRequired (exactMatch) + - learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + - learning-objectives -> schema:teaches / lrmi:teaches (closeMatch) + + Args: + chapter_data (dict): Chapter dictionary + + Returns + ------- + dict: Schema.org LearningResource object + """ + if not isinstance(chapter_data, dict): + return {} + + chapter: dict[str, Any] = { + "@type": "LearningResource", + } + + # title -> schema:name (exactMatch) + if "title" in chapter_data: + chapter["name"] = chapter_data["title"] + + # description -> schema:description (exactMatch) + if "description" in chapter_data: + chapter["description"] = chapter_data["description"] + + # url -> schema:url (exactMatch) + if "url" in chapter_data: + chapter["url"] = chapter_data["url"] + + # time-required -> schema:timeRequired (exactMatch) + if "time-required" in chapter_data: + chapter["timeRequired"] = chapter_data["time-required"] + + # learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + if "learning-goal" in chapter_data: + chapter["teaches"] = chapter_data["learning-goal"] + + # learning-objectives -> schema:teaches / lrmi:teaches (closeMatch) + # Map to AlignmentObject for structured representation + if chapter_data.get("learning-objectives"): + objectives = [] + for obj in chapter_data["learning-objectives"]: + transformed = transform_learning_objective(obj) + if transformed and len(transformed) > 1: # More than just @type + objectives.append(transformed) + + if objectives: + chapter["educationalAlignment"] = objectives + + # language -> schema:inLanguage (exactMatch) + # Chapter-level language override (supports single language or array) + if "language" in chapter_data: + chapter["inLanguage"] = chapter_data["language"] + + return 
chapter + + +def transform_license(license_data: Any) -> dict | list | str | None: + """ + Transform license information to Schema.org license. + + The QUADRIGA schema has separate licenses for code and content. + Uses x-mappings: + - license -> schema:license (exactMatch) + + Args: + license_data: License dictionary or string + + Returns + ------- + dict or list: Schema.org license representation + """ + if not license_data: + return None + + # Handle string license (simple case) + if isinstance(license_data, str): + return license_data + + # Handle complex license structure (code vs content) + if isinstance(license_data, dict): + licenses = [] + + # Code license + if "code" in license_data: + code_license = { + "@type": "CreativeWork", + "name": "Source Code", + "license": license_data["code"], + } + licenses.append(code_license) + + # Content license + if "content" in license_data: + content_license_data = license_data["content"] + if isinstance(content_license_data, dict): + content_license = { + "@type": "CreativeWork", + "name": "Content", + } + if "url" in content_license_data: + content_license["license"] = content_license_data["url"] + # Note: licenseName is not a valid schema.org property + # The license URL should be sufficient for identification + licenses.append(content_license) + elif isinstance(content_license_data, str): + content_license = { + "@type": "CreativeWork", + "name": "Content", + "license": content_license_data, + } + licenses.append(content_license) + + return licenses if len(licenses) > 1 else licenses[0] if licenses else None + + return None + + +def create_jsonld() -> bool | None: + """ + Create a metadata.jsonld file from metadata.yml using QUADRIGA schema x-mappings. + + The function reads metadata from metadata.yml and transforms it into JSON-LD + format using the x-mappings defined in the QUADRIGA schema. 
The output uses + Schema.org as the primary vocabulary, with additional terms from Dublin Core, + LRMI (Learning Resource Metadata Initiative), and other standards. + + Returns + ------- + bool: True if successful, False otherwise. + """ + try: + # Define file paths + try: + repo_root = get_repo_root() + metadata_path = get_file_path("metadata.yml", repo_root) + jsonld_path = get_file_path("metadata.jsonld", repo_root) + except Exception: + logger.exception("Failed to resolve file paths") + return False + + # Check if metadata.yml exists + if not Path(metadata_path).exists(): + logger.error("metadata.yml file not found at %s", metadata_path) + return False + + # Load metadata.yml + metadata = load_yaml_file(metadata_path) + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") + return False + + # Build JSON-LD structure + jsonld: dict[str, Any] = { + "@context": build_jsonld_context(), + "@type": ["Book", "LearningResource"], + } + + # ===== BASIC METADATA ===== + + # title -> schema:name (exactMatch) + if "title" in metadata: + jsonld["name"] = metadata["title"] + logger.info("Added title: %s", metadata["title"]) + else: + logger.warning("No title found in metadata.yml") + + # description -> schema:description (exactMatch) + if "description" in metadata: + jsonld["description"] = metadata["description"] + logger.info("Added description") + + # identifier (DOI) -> schema:identifier (exactMatch) + if "identifier" in metadata: + clean_doi_id = clean_doi(metadata["identifier"]) + if clean_doi_id: + jsonld["identifier"] = { + "@type": "PropertyValue", + "propertyID": "DOI", + "value": clean_doi_id, + "url": metadata["identifier"], + } + logger.info("Added DOI identifier: %s", clean_doi_id) + + # version -> schema:version (exactMatch) + if "version" in metadata: + jsonld["version"] = str(metadata["version"]) + logger.info("Added version: %s", metadata["version"]) + + # schema-version -> 
schema:schemaVersion + if "schema-version" in metadata: + jsonld["schemaVersion"] = str(metadata["schema-version"]) + logger.info("Added schema version: %s", metadata["schema-version"]) + + # url -> schema:url (exactMatch) + if "url" in metadata: + jsonld["url"] = metadata["url"] + logger.info("Added URL: %s", metadata["url"]) + + # git -> schema:workExample as SoftwareSourceCode + # codeRepository is not valid for Book type, so we link to source code as a workExample + if "git" in metadata: + jsonld["workExample"] = { + "@type": "SoftwareSourceCode", + "name": "Source Code Repository", + "codeRepository": metadata["git"], + } + logger.info("Added code repository as workExample: %s", metadata["git"]) + + # ===== DATES ===== + + # date-issued -> schema:datePublished (exactMatch) + if "date-issued" in metadata: + # Handle both date objects and strings + date_value = metadata["date-issued"] + if hasattr(date_value, "isoformat"): + jsonld["datePublished"] = date_value.isoformat() + else: + jsonld["datePublished"] = str(date_value) + logger.info("Added datePublished: %s", jsonld["datePublished"]) + + # date-modified -> schema:dateModified (exactMatch) + if "date-modified" in metadata: + date_value = metadata["date-modified"] + if hasattr(date_value, "isoformat"): + jsonld["dateModified"] = date_value.isoformat() + else: + jsonld["dateModified"] = str(date_value) + logger.info("Added dateModified: %s", jsonld["dateModified"]) + + # ===== PEOPLE ===== + + # authors -> schema:author (exactMatch) + if metadata.get("authors"): + authors = [] + for author in metadata["authors"]: + person = transform_person(author) + if person and len(person) > 1: # More than just @type + authors.append(person) + if authors: + jsonld["author"] = authors + logger.info("Added %d authors", len(authors)) + else: + logger.warning("No authors found in metadata.yml") + + # contributors -> schema:contributor (exactMatch) + if metadata.get("contributors"): + contributors = [] + for contributor in 
metadata["contributors"]: + person = transform_person(contributor) + if person and len(person) > 1: # More than just @type + contributors.append(person) + if contributors: + jsonld["contributor"] = contributors + logger.info("Added %d contributors", len(contributors)) + + # ===== LANGUAGE & KEYWORDS ===== + + # language -> schema:inLanguage (exactMatch) + # Supports both single language (string) and multiple languages (array) + if "language" in metadata: + language_value = metadata["language"] + # If it's already a list, use it as-is + # If it's a single string, use it as-is (Schema.org supports both) + jsonld["inLanguage"] = language_value + if isinstance(language_value, list): + logger.info("Added languages: %s", ", ".join(language_value)) + else: + logger.info("Added language: %s", language_value) + + # keywords -> schema:keywords (exactMatch) and schema:about (closeMatch) + if metadata.get("keywords"): + keywords_list = extract_keywords(metadata["keywords"]) + if keywords_list: + jsonld["keywords"] = keywords_list + # Also add as 'about' for closeMatch mapping + jsonld["about"] = [{"@type": "Thing", "name": kw} for kw in keywords_list] + logger.info("Added %d keywords", len(keywords_list)) + + # ===== EDUCATIONAL METADATA ===== + + # discipline -> schema:about (closeMatch) and modalia:Discipline (exactMatch) + if metadata.get("discipline"): + if "about" not in jsonld: + jsonld["about"] = [] + for disc in metadata["discipline"]: + jsonld["about"].append({"@type": "Thing", "name": disc}) + logger.info("Added %d disciplines", len(metadata["discipline"])) + + # research-object-type -> schema:about (broadMatch) + if metadata.get("research-object-type"): + if "about" not in jsonld: + jsonld["about"] = [] + for obj_type in metadata["research-object-type"]: + jsonld["about"].append({"@type": "Thing", "name": obj_type}) + logger.info("Added %d research object types", len(metadata["research-object-type"])) + + # target-group -> schema:audience (closeMatch) and 
lrmi:educationalAudience (closeMatch) + if metadata.get("target-group"): + jsonld["audience"] = [ + {"@type": "Audience", "audienceType": group} for group in metadata["target-group"] + ] + logger.info("Added %d target groups", len(jsonld["audience"])) + + # time-required -> schema:timeRequired (exactMatch) + if "time-required" in metadata: + jsonld["timeRequired"] = metadata["time-required"] + logger.info("Added time required: %s", metadata["time-required"]) + + # ===== LICENSE ===== + + # license -> schema:license (exactMatch) + if "license" in metadata: + license_data = transform_license(metadata["license"]) + if license_data: + jsonld["license"] = license_data + logger.info("Added license information") + + # ===== CHAPTERS (hasPart) ===== + + # chapters -> schema:hasPart (closeMatch) + if metadata.get("chapters"): + parts = [] + for chapter in metadata["chapters"]: + chapter_obj = transform_chapter(chapter) + if chapter_obj and len(chapter_obj) > 1: # More than just @type + parts.append(chapter_obj) + if parts: + jsonld["hasPart"] = parts + logger.info("Added %d chapters", len(parts)) + + # table-of-contents -> dcterms:tableOfContents (exactMatch) + if "table-of-contents" in metadata: + jsonld["dcterms:tableOfContents"] = metadata["table-of-contents"] + logger.info("Added table of contents") + + # ===== ADDITIONAL METADATA ===== + + # context-of-creation -> modalia:Community (closeMatch) + if "context-of-creation" in metadata: + jsonld["funding"] = metadata["context-of-creation"] + logger.info("Added context of creation") + + # learning-resource-type -> schema:learningResourceType (closeMatch) + # -> lrmi:learningResourceType (closeMatch) + # -> dcterms:type (broadMatch) + # -> dc:type (broadMatch) + if "learning-resource-type" in metadata: + jsonld["learningResourceType"] = metadata["learning-resource-type"] + jsonld["lrmi:learningResourceType"] = metadata["learning-resource-type"] + jsonld["dcterms:type"] = metadata["learning-resource-type"] + 
jsonld["dc:type"] = metadata["learning-resource-type"] + logger.info("Added learning resource type: %s", metadata["learning-resource-type"]) + + # quality-assurance: not mapped to JSON-LD + # All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output + + # Write JSON-LD file + try: + with jsonld_path.open("w", encoding="utf-8") as f: + json.dump(jsonld, f, ensure_ascii=False, indent=2) + except OSError: + logger.exception("Error writing to %s", jsonld_path) + return False + else: + logger.info("JSON-LD metadata successfully created at %s", jsonld_path) + return True + + except Exception: + logger.exception("Unexpected error in create_jsonld") + return False + + +if __name__ == "__main__": + success = create_jsonld() + sys.exit(0 if success else 1) diff --git a/quadriga/metadata/create_rdfxml.py b/quadriga/metadata/create_rdfxml.py new file mode 100644 index 0000000..e075256 --- /dev/null +++ b/quadriga/metadata/create_rdfxml.py @@ -0,0 +1,642 @@ +""" +Creates an RDF/XML file from metadata.yml using QUADRIGA schema x-mappings. + +This script reads metadata from 'metadata.yml' and transforms it into RDF/XML +format using the x-mappings defined in the QUADRIGA schema. The output follows +Schema.org, Dublin Core, LRMI, and other standard vocabularies. + +The RDF/XML file provides machine-readable linked data that can be consumed by +semantic web applications, triple stores, and other RDF-aware systems. 
def _element_sort_key(element: ET.Element) -> tuple[Any, ...]:
    """
    Build a total-order sort key for an XML element.

    The key is the element's tag, its attributes as sorted key/value pairs,
    its text, and finally the keys of its children. The child keys act as a
    tie-breaker: without them, siblings that share tag, attributes and text
    (for example several attribute-less wrapper elements around different
    nested nodes) compare equal and would simply keep their input order.

    Args:
        element: Element whose children have already been sorted

    Returns
    -------
    tuple: Comparable key describing the element and its whole subtree
    """
    return (
        element.tag,
        sorted(element.attrib.items()),
        element.text or "",
        [_element_sort_key(child) for child in element],
    )


def _sort_xml_element(element: ET.Element) -> None:
    """
    Recursively sort child elements for deterministic XML output.

    Children are sorted depth-first, so by the time a level is ordered every
    child's own subtree is already in canonical order and can safely be used
    as part of the sort key (see ``_element_sort_key``).

    Args:
        element: XML element whose children will be sorted in-place
    """
    children = list(element)
    for child in children:
        _sort_xml_element(child)
    children.sort(key=_element_sort_key)
    element[:] = children


def _strip_identifier_prefix(value: Any, prefixes: tuple[str, ...]) -> str | None:
    """
    Remove one known URL/scheme prefix from an identifier string.

    Surrounding whitespace is removed *before* the prefix comparison and the
    comparison is case-insensitive, so inputs such as
    ``" https://doi.org/10.1/x"`` or ``"DOI:10.1/x"`` are cleaned as well.

    Args:
        value: Raw identifier (any object; it is converted with ``str``)
        prefixes: Lower-case prefixes to strip; the first match wins

    Returns
    -------
    str | None: Identifier without prefix, or None if nothing usable remains
    """
    if not value:
        return None

    text = str(value).strip()
    lowered = text.lower()
    for prefix in prefixes:
        if lowered.startswith(prefix):
            text = text[len(prefix) :].strip()
            break

    # An identifier that is empty after cleaning is reported as "missing".
    return text or None


def clean_orcid(orcid_string: str) -> str | None:
    """
    Extract ORCID identifier from an ORCID string or URL.

    Args:
        orcid_string (str): ORCID string which may include URL prefix

    Returns
    -------
    str | None: Clean ORCID identifier (e.g., "0000-0002-1602-6032"),
        or None if the input is empty
    """
    return _strip_identifier_prefix(
        orcid_string,
        ("https://orcid.org/", "http://orcid.org/", "orcid:"),
    )


def clean_doi(doi_string: str) -> str | None:
    """
    Extract DOI identifier from a DOI string or URL.

    Args:
        doi_string (str): DOI string which may include URL prefix

    Returns
    -------
    str | None: Clean DOI identifier (e.g., "10.5281/zenodo.14970672"),
        or None if the input is empty
    """
    return _strip_identifier_prefix(
        doi_string,
        (
            "https://doi.org/",
            "http://doi.org/",
            # Legacy resolver host that still appears in older metadata.
            "https://dx.doi.org/",
            "http://dx.doi.org/",
            "doi:",
        ),
    )
def add_learning_objective(
    graph: Graph, objective_data: Any, base_uri: str, chapter_index: int, obj_index: int
) -> URIRef | None:
    """
    Describe one learning objective as a schema:AlignmentObject node.

    Triples written for the node (each only when the key is present):
    - learning-objective -> schema:targetName
    - competency -> schema:educationalFramework (fixed framework label)
    - competency / blooms-category / data-flow -> one combined
      schema:targetDescription, parts joined with " | "
    - assessment -> lrmi:assesses

    Args:
        graph: RDF graph that receives the triples
        objective_data: Learning objective dictionary
        base_uri: Base URI for the resource
        chapter_index: Index of the chapter the objective belongs to
        obj_index: Index of the objective within the chapter

    Returns
    -------
    URIRef: URI of the alignment object node, or None if
        ``objective_data`` is not a dictionary
    """
    if not isinstance(objective_data, dict):
        return None

    node = URIRef(f"{base_uri}#chapter_{chapter_index}_objective_{obj_index}")
    graph.add((node, RDF.type, SCHEMA.AlignmentObject))

    # The objective text itself becomes the alignment target's name.
    if "learning-objective" in objective_data:
        objective_text = Literal(objective_data["learning-objective"])
        graph.add((node, SCHEMA.targetName, objective_text))

    # A competency implies the QUADRIGA framework as educational framework.
    if "competency" in objective_data:
        framework = Literal("QUADRIGA Competency Framework")
        graph.add((node, SCHEMA.educationalFramework, framework))

    # Label/key pairs, in the order they appear in the combined description.
    labelled_fields = (
        ("Competency", "competency"),
        ("Bloom's", "blooms-category"),
        ("Data Flow", "data-flow"),
    )
    parts = [
        f"{label}: {objective_data[key]}"
        for label, key in labelled_fields
        if key in objective_data
    ]
    if parts:
        graph.add((node, SCHEMA.targetDescription, Literal(" | ".join(parts))))

    if "assessment" in objective_data:
        graph.add((node, LRMI.assesses, Literal(objective_data["assessment"])))

    return node
+ + Uses x-mappings: + - chapter -> schema:LearningResource / lrmi:LearningResource (exactMatch) + - title -> schema:name (exactMatch) + - description -> schema:description (exactMatch) + - url -> schema:url (exactMatch) + - time-required -> schema:timeRequired (exactMatch) + - learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + - learning-objectives -> schema:teaches / lrmi:teaches (closeMatch) + + Args: + graph: RDF graph to add triples to + chapter_data: Chapter dictionary + base_uri: Base URI for the resource + chapter_index: Index of the chapter + + Returns + ------- + URIRef: URI of the chapter node, or None if invalid + """ + if not isinstance(chapter_data, dict): + return None + + chapter_uri = URIRef(f"{base_uri}#chapter_{chapter_index}") + graph.add((chapter_uri, RDF.type, SCHEMA.LearningResource)) + + # title -> schema:name (exactMatch) + if "title" in chapter_data: + graph.add((chapter_uri, SCHEMA.name, Literal(chapter_data["title"]))) + + # description -> schema:description (exactMatch) + if "description" in chapter_data: + graph.add((chapter_uri, SCHEMA.description, Literal(chapter_data["description"]))) + + # url -> schema:url (exactMatch) + if "url" in chapter_data: + graph.add((chapter_uri, SCHEMA.url, URIRef(chapter_data["url"]))) + + # time-required -> schema:timeRequired (exactMatch) + if "time-required" in chapter_data: + graph.add((chapter_uri, SCHEMA.timeRequired, Literal(chapter_data["time-required"]))) + + # learning-goal -> schema:teaches / lrmi:teaches (closeMatch) + if "learning-goal" in chapter_data: + graph.add((chapter_uri, SCHEMA.teaches, Literal(chapter_data["learning-goal"]))) + + # learning-objectives -> educationalAlignment with AlignmentObject + if chapter_data.get("learning-objectives"): + for obj_index, obj_data in enumerate(chapter_data["learning-objectives"]): + obj_uri = add_learning_objective( + graph, obj_data, base_uri, chapter_index, obj_index + ) + if obj_uri: + graph.add((chapter_uri, 
def create_rdfxml() -> bool | None:
    """
    Create a metadata.rdf file from metadata.yml using QUADRIGA schema x-mappings.

    Reads metadata.yml from the repository root, builds an RDF graph that uses
    Schema.org as primary vocabulary (plus Dublin Core and LRMI terms), and
    writes the graph as RDF/XML to metadata.rdf. The serialized XML is
    post-processed (elements sorted, re-indented) so the file content does not
    depend on Python's hash randomization.

    All errors are logged; none are raised to the caller.

    Returns
    -------
    bool: True if successful, False otherwise.
    """
    try:
        # Define file paths
        try:
            repo_root = get_repo_root()
            metadata_path = get_file_path("metadata.yml", repo_root)
            rdf_path = get_file_path("metadata.rdf", repo_root)
        except Exception:
            logger.exception("Failed to resolve file paths")
            return False

        # Check if metadata.yml exists
        if not Path(metadata_path).exists():
            logger.error("metadata.yml file not found at %s", metadata_path)
            return False

        # Load metadata.yml
        metadata = load_yaml_file(metadata_path)
        if not metadata or not isinstance(metadata, dict):
            logger.error("Could not load metadata.yml or invalid format. Exiting.")
            return False

        # Create RDF graph
        graph = Graph()
        graph.bind("schema", SCHEMA)
        graph.bind("dc", DC)
        graph.bind("dcterms", DCTERMS)
        graph.bind("lrmi", LRMI)
        graph.bind("skos", SKOS)

        # Create base URI for the resource. Empty/null values fall through to
        # the next candidate instead of producing a bare "http://example.org/".
        base_uri = str(
            metadata.get("url") or metadata.get("identifier") or "http://example.org/book"
        )
        if not base_uri.startswith("http"):
            base_uri = f"http://example.org/{base_uri}"

        resource_uri = URIRef(base_uri)

        # Add types: Book and LearningResource
        graph.add((resource_uri, RDF.type, SCHEMA.Book))
        graph.add((resource_uri, RDF.type, SCHEMA.LearningResource))

        # ===== BASIC METADATA =====

        # title -> schema:name (exactMatch)
        if "title" in metadata:
            graph.add((resource_uri, SCHEMA.name, Literal(metadata["title"])))
            logger.info("Added title: %s", metadata["title"])
        else:
            logger.warning("No title found in metadata.yml")

        # description -> schema:description (exactMatch)
        if "description" in metadata:
            graph.add((resource_uri, SCHEMA.description, Literal(metadata["description"])))
            logger.info("Added description")

        # identifier (DOI) -> schema:identifier (exactMatch)
        if "identifier" in metadata:
            clean_doi_id = clean_doi(metadata["identifier"])
            if clean_doi_id:
                # Create PropertyValue node for DOI
                doi_node = URIRef(f"{base_uri}#doi")
                graph.add((doi_node, RDF.type, SCHEMA.PropertyValue))
                graph.add((doi_node, SCHEMA.propertyID, Literal("DOI")))
                graph.add((doi_node, SCHEMA.value, Literal(clean_doi_id)))
                # Build the resolver URL from the cleaned DOI: the raw
                # identifier may be a bare DOI or a "doi:" string, neither of
                # which is a usable URL.
                graph.add((doi_node, SCHEMA.url, URIRef(f"https://doi.org/{clean_doi_id}")))
                graph.add((resource_uri, SCHEMA.identifier, doi_node))
                logger.info("Added DOI identifier: %s", clean_doi_id)

        # version -> schema:version (exactMatch)
        if "version" in metadata:
            graph.add((resource_uri, SCHEMA.version, Literal(str(metadata["version"]))))
            logger.info("Added version: %s", metadata["version"])

        # schema-version -> schema:schemaVersion
        if "schema-version" in metadata:
            graph.add(
                (resource_uri, SCHEMA.schemaVersion, Literal(str(metadata["schema-version"])))
            )
            logger.info("Added schema version: %s", metadata["schema-version"])

        # url -> schema:url (exactMatch)
        if "url" in metadata:
            graph.add((resource_uri, SCHEMA.url, URIRef(metadata["url"])))
            logger.info("Added URL: %s", metadata["url"])

        # git -> schema:workExample as SoftwareSourceCode
        if "git" in metadata:
            repo_node = URIRef(f"{base_uri}#repository")
            graph.add((repo_node, RDF.type, SCHEMA.SoftwareSourceCode))
            graph.add((repo_node, SCHEMA.name, Literal("Source Code Repository")))
            graph.add((repo_node, SCHEMA.codeRepository, URIRef(metadata["git"])))
            graph.add((resource_uri, SCHEMA.workExample, repo_node))
            logger.info("Added code repository as workExample: %s", metadata["git"])

        # ===== DATES =====

        # date-issued -> schema:datePublished (exactMatch)
        if "date-issued" in metadata:
            # Handle both date objects and strings
            date_value = metadata["date-issued"]
            if hasattr(date_value, "isoformat"):
                date_str = date_value.isoformat()
            else:
                date_str = str(date_value)
            graph.add((resource_uri, SCHEMA.datePublished, Literal(date_str, datatype=XSD.date)))
            logger.info("Added datePublished: %s", date_str)

        # date-modified -> schema:dateModified (exactMatch)
        if "date-modified" in metadata:
            date_value = metadata["date-modified"]
            if hasattr(date_value, "isoformat"):
                date_str = date_value.isoformat()
            else:
                date_str = str(date_value)
            graph.add((resource_uri, SCHEMA.dateModified, Literal(date_str, datatype=XSD.date)))
            logger.info("Added dateModified: %s", date_str)

        # ===== PEOPLE =====

        # authors -> schema:author (exactMatch)
        if metadata.get("authors"):
            for i, author in enumerate(metadata["authors"]):
                person_uri = add_person(graph, author, base_uri, "author", i)
                if person_uri:
                    graph.add((resource_uri, SCHEMA.author, person_uri))
            logger.info("Added %d authors", len(metadata["authors"]))
        else:
            logger.warning("No authors found in metadata.yml")

        # contributors -> schema:contributor (exactMatch)
        if metadata.get("contributors"):
            for i, contributor in enumerate(metadata["contributors"]):
                person_uri = add_person(graph, contributor, base_uri, "contributor", i)
                if person_uri:
                    graph.add((resource_uri, SCHEMA.contributor, person_uri))
            logger.info("Added %d contributors", len(metadata["contributors"]))

        # ===== LANGUAGE & KEYWORDS =====

        # language -> schema:inLanguage (exactMatch)
        if "language" in metadata:
            language_value = metadata["language"]
            if isinstance(language_value, list):
                for lang in language_value:
                    graph.add((resource_uri, SCHEMA.inLanguage, Literal(lang)))
                # map(str, ...) keeps the log call from failing on non-string items
                logger.info("Added languages: %s", ", ".join(map(str, language_value)))
            else:
                graph.add((resource_uri, SCHEMA.inLanguage, Literal(language_value)))
                logger.info("Added language: %s", language_value)

        # keywords -> schema:keywords (exactMatch) and schema:about (closeMatch)
        if metadata.get("keywords"):
            keywords_list = extract_keywords(metadata["keywords"])
            if keywords_list:
                # enumerate() gives every keyword its own node index, even if
                # the same keyword text occurs twice.
                for kw_index, keyword in enumerate(keywords_list):
                    graph.add((resource_uri, SCHEMA.keywords, Literal(keyword)))
                    # Also add as 'about' for closeMatch mapping
                    keyword_node = URIRef(f"{base_uri}#keyword_{kw_index}")
                    graph.add((keyword_node, RDF.type, SCHEMA.Thing))
                    graph.add((keyword_node, SCHEMA.name, Literal(keyword)))
                    graph.add((resource_uri, SCHEMA.about, keyword_node))
                logger.info("Added %d keywords", len(keywords_list))

        # ===== EDUCATIONAL METADATA =====

        # discipline -> schema:about (closeMatch)
        if metadata.get("discipline"):
            for i, disc in enumerate(metadata["discipline"]):
                disc_node = URIRef(f"{base_uri}#discipline_{i}")
                graph.add((disc_node, RDF.type, SCHEMA.Thing))
                graph.add((disc_node, SCHEMA.name, Literal(disc)))
                graph.add((resource_uri, SCHEMA.about, disc_node))
            logger.info("Added %d disciplines", len(metadata["discipline"]))

        # research-object-type -> schema:about (broadMatch)
        if metadata.get("research-object-type"):
            for i, obj_type in enumerate(metadata["research-object-type"]):
                obj_node = URIRef(f"{base_uri}#research_object_{i}")
                graph.add((obj_node, RDF.type, SCHEMA.Thing))
                graph.add((obj_node, SCHEMA.name, Literal(obj_type)))
                graph.add((resource_uri, SCHEMA.about, obj_node))
            logger.info("Added %d research object types", len(metadata["research-object-type"]))

        # target-group -> schema:audience (closeMatch)
        if metadata.get("target-group"):
            for i, group in enumerate(metadata["target-group"]):
                audience_node = URIRef(f"{base_uri}#audience_{i}")
                graph.add((audience_node, RDF.type, SCHEMA.Audience))
                graph.add((audience_node, SCHEMA.audienceType, Literal(group)))
                graph.add((resource_uri, SCHEMA.audience, audience_node))
            logger.info("Added %d target groups", len(metadata["target-group"]))

        # time-required -> schema:timeRequired (exactMatch)
        if "time-required" in metadata:
            graph.add((resource_uri, SCHEMA.timeRequired, Literal(metadata["time-required"])))
            logger.info("Added time required: %s", metadata["time-required"])

        # ===== LICENSE =====

        # license -> schema:license (exactMatch)
        if "license" in metadata:
            license_data = metadata["license"]
            if isinstance(license_data, str):
                graph.add((resource_uri, SCHEMA.license, URIRef(license_data)))
            elif isinstance(license_data, dict):
                # Code license
                if "code" in license_data:
                    code_license_node = URIRef(f"{base_uri}#license_code")
                    graph.add((code_license_node, RDF.type, SCHEMA.CreativeWork))
                    graph.add((code_license_node, SCHEMA.name, Literal("Source Code")))
                    graph.add((code_license_node, SCHEMA.license, URIRef(license_data["code"])))
                    graph.add((resource_uri, SCHEMA.license, code_license_node))

                # Content license (either a plain URL string or a dict with "url")
                if "content" in license_data:
                    content_license_data = license_data["content"]
                    content_license_node = URIRef(f"{base_uri}#license_content")
                    graph.add((content_license_node, RDF.type, SCHEMA.CreativeWork))
                    graph.add((content_license_node, SCHEMA.name, Literal("Content")))
                    if isinstance(content_license_data, dict):
                        if "url" in content_license_data:
                            graph.add(
                                (
                                    content_license_node,
                                    SCHEMA.license,
                                    URIRef(content_license_data["url"]),
                                )
                            )
                    elif isinstance(content_license_data, str):
                        graph.add(
                            (content_license_node, SCHEMA.license, URIRef(content_license_data))
                        )
                    graph.add((resource_uri, SCHEMA.license, content_license_node))
            logger.info("Added license information")

        # ===== CHAPTERS (hasPart) =====

        # chapters -> schema:hasPart (closeMatch)
        if metadata.get("chapters"):
            for i, chapter in enumerate(metadata["chapters"]):
                chapter_uri = add_chapter(graph, chapter, base_uri, i)
                if chapter_uri:
                    graph.add((resource_uri, SCHEMA.hasPart, chapter_uri))
            logger.info("Added %d chapters", len(metadata["chapters"]))

        # table-of-contents -> dcterms:tableOfContents (exactMatch)
        if "table-of-contents" in metadata:
            graph.add(
                (resource_uri, DCTERMS.tableOfContents, Literal(metadata["table-of-contents"]))
            )
            logger.info("Added table of contents")

        # ===== ADDITIONAL METADATA =====

        # context-of-creation -> schema:funding (adapted mapping)
        if "context-of-creation" in metadata:
            graph.add((resource_uri, SCHEMA.funding, Literal(metadata["context-of-creation"])))
            logger.info("Added context of creation")

        # learning-resource-type -> schema:learningResourceType (closeMatch)
        #                        -> lrmi:learningResourceType (closeMatch)
        #                        -> dcterms:type (broadMatch)
        #                        -> dc:type (broadMatch)
        if "learning-resource-type" in metadata:
            lrt = Literal(metadata["learning-resource-type"])
            graph.add((resource_uri, SCHEMA.learningResourceType, lrt))
            graph.add((resource_uri, LRMI.learningResourceType, lrt))
            graph.add((resource_uri, DCTERMS.type, lrt))
            graph.add((resource_uri, DC.type, lrt))
            logger.info("Added learning resource type: %s", metadata["learning-resource-type"])

        # quality-assurance: not mapped to RDF
        # All schema x-mappings are relatedMatch only — too loose for RDF/JSON-LD output

        # Serialize to RDF/XML and post-process for deterministic output:
        # the XML elements are sorted after serialization so the result does
        # not depend on PYTHONHASHSEED.
        logger.info("Serializing %d triples to RDF/XML...", len(graph))

        try:
            xml_bytes = graph.serialize(format="pretty-xml", encoding="utf-8")
            xml_str = xml_bytes.decode("utf-8") if isinstance(xml_bytes, bytes) else xml_bytes

            # Register namespace prefixes so ElementTree preserves them
            for prefix, uri in [
                ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
                ("schema", str(SCHEMA)),
                ("dc", str(DC)),
                ("dcterms", str(DCTERMS)),
                ("lrmi", str(LRMI)),
                ("skos", str(SKOS)),
            ]:
                ET.register_namespace(prefix, uri)

            # Parse, sort elements recursively, and re-serialize
            root = ET.fromstring(xml_str)  # noqa: S314 — parsing our own rdflib output
            _sort_xml_element(root)
            ET.indent(root, space=" ")

            # Serialize straight to UTF-8 bytes. With encoding="unicode" the
            # encoding named in the XML declaration is chosen by the runtime
            # rather than by us, so it need not match the UTF-8 file written
            # below and can differ between environments.
            sorted_xml = ET.tostring(root, encoding="utf-8", xml_declaration=True)

            # Binary mode also avoids platform-specific newline translation.
            with rdf_path.open("wb") as f:
                f.write(sorted_xml)
                f.write(b"\n")
        except OSError:
            logger.exception("Error writing to %s", rdf_path)
            return False
        else:
            logger.info("RDF/XML metadata successfully created at %s", rdf_path)
            return True

    except Exception:
        logger.exception("Unexpected error in create_rdfxml")
        return False
""" +from __future__ import annotations + import json import logging import sys -from pathlib import Path +from typing import Any from .utils import extract_keywords, get_file_path, get_repo_root, load_yaml_file +logger = logging.getLogger(__name__) + def clean_doi(doi_string: str) -> str | None: """ @@ -78,20 +82,20 @@ def format_creators_for_zenodo(authors: list) -> list: list: List of creator dictionaries in Zenodo format """ if not authors: - logging.warning("No authors provided to format_creators_for_zenodo") + logger.warning("No authors provided to format_creators_for_zenodo") return [] creators = [] for i, author in enumerate(authors): if not isinstance(author, dict): - logging.warning(f"Author at index {i} is not a dictionary: {author}") + logger.warning("Author at index %d is not a dictionary: %s", i, author) continue family = author.get("family-names", "") given = author.get("given-names", "") if not family and not given: - logging.warning(f"Author at index {i} is missing both family-names and given-names") + logger.warning("Author at index %d is missing both family-names and given-names", i) continue creator = {"name": f"{family}, {given}" if family and given else (family or given)} @@ -111,7 +115,7 @@ def format_creators_for_zenodo(authors: list) -> list: return creators -def format_contributors_for_zenodo(contributors): +def format_contributors_for_zenodo(contributors: list | None) -> list: """ Format contributors list for Zenodo contributors field. @@ -167,11 +171,11 @@ def format_contributors_for_zenodo(contributors): return formatted_contributors -def create_zenodo_json(): +def create_zenodo_json() -> bool | None: """ - Creates a .zenodo.json file from CITATION.cff and metadata.yml. + Create a .zenodo.json file from CITATION.cff and metadata.yml. 
- The function reads the 'preferred-citation' section from CITATION.cff + Reads the 'preferred-citation' section from CITATION.cff and combines it with data from metadata.yml to create a Zenodo-compliant metadata file. The upload_type is always set to "lesson" for QUADRIGA OERs. @@ -186,47 +190,50 @@ def create_zenodo_json(): citation_cff_path = get_file_path("CITATION.cff", repo_root) metadata_path = get_file_path("metadata.yml", repo_root) zenodo_json_path = get_file_path(".zenodo.json", repo_root) - except Exception as e: - logging.exception(f"Failed to resolve file paths: {e!s}") + except Exception: + logger.exception("Failed to resolve file paths") return False # Check if required files exist - if not Path(citation_cff_path).exists(): - logging.error(f"CITATION.cff file not found at {citation_cff_path}") + if not citation_cff_path.exists(): + logger.error("CITATION.cff file not found at %s", citation_cff_path) return False - if not Path(metadata_path).exists(): - logging.error(f"metadata.yml file not found at {metadata_path}") + if not metadata_path.exists(): + logger.error("metadata.yml file not found at %s", metadata_path) return False # Load CITATION.cff citation_data = load_yaml_file(citation_cff_path) - if not citation_data: - logging.error("Could not load CITATION.cff. Exiting.") + if not citation_data or not isinstance(citation_data, dict): + logger.error("Could not load CITATION.cff or invalid format. Exiting.") return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - if not metadata: - logging.error("Could not load metadata.yml. Exiting.") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. 
Exiting.") return False # Extract data from preferred-citation or root if "preferred-citation" in citation_data: - logging.info("Using 'preferred-citation' section from CITATION.cff") + logger.info("Using 'preferred-citation' section from CITATION.cff") pref = citation_data.get("preferred-citation") + if not isinstance(pref, dict): + logger.error("preferred-citation is not a dictionary") + return False else: - logging.info("No 'preferred-citation' section found, using root data") + logger.info("No 'preferred-citation' section found, using root data") pref = citation_data - zenodo_metadata = {"upload_type": "lesson"} + zenodo_metadata: dict[str, Any] = {"upload_type": "lesson"} # title if "title" in pref: zenodo_metadata["title"] = pref["title"] - logging.info(f"Added title: {pref['title']}") + logger.info("Added title: %s", pref["title"]) else: - logging.error("No title found in CITATION.cff") + logger.error("No title found in CITATION.cff") return False # creators @@ -234,24 +241,42 @@ def create_zenodo_json(): creators = format_creators_for_zenodo(pref["authors"]) if creators: zenodo_metadata["creators"] = creators - logging.info(f"Added {len(creators)} creators") + logger.info("Added %d creators", len(creators)) else: - logging.error("Could not format any creators from authors") + logger.error("Could not format any creators from authors") return False else: - logging.error("No authors found in preferred-citation") + logger.error("No authors found in preferred-citation") return False # description - description = citation_data.get("abstract") - if not description: - description = metadata.get("description") - - if description: - zenodo_metadata["description"] = description - logging.info("Added description") - else: - logging.warning("No description/abstract found") + description = "

" + metadata.get("description") + "

" + + description_base = f""" +

Das interaktive Lehrbuch kann als Web-Version verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch auf GitHub zur Verfügung.

+

Die QUADRIGA-OER sind nach einem einheitlichen Template gestaltet, werden nach einem standardisierten Verfahren qualitätsgeprüft und mit Metadaten ausgezeichnet.

+
QUADRIGA Datenkompetenzzentrum
+

QUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um den Forschungsprozess möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes Datenkompetenzframework zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentativen Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.

+

Zu den Zielgruppen von QUADRIGA zählen insbesondere promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.

+

QUADRIGA ist eins von 11 Datenkompetenzzentren in Deutschland und wird vom Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR) und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartnern zählen: +

+

+ +

Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im Umsetzungskonzept erfahren.

+ +

Weitere Informationen sowie Publikationen finden Sie auf der Webseite, in der Zenodo-Community und der GitHub-Organisation des Projekts.

+""" + zenodo_metadata["description"] = description + description_base + logger.info("Added description") # publication date publication_date = None @@ -265,16 +290,16 @@ def create_zenodo_json(): else: # It's already a string publication_date = str(date_value) - logging.info(f"Added publication_date from metadata.yml: {publication_date}") + logger.info("Added publication_date from metadata.yml: %s", publication_date) elif "year" in pref: # Fall back to year from CITATION.cff year = str(pref["year"]) # Zenodo expects ISO 8601 date format (YYYY-MM-DD) # We use January 1st as default when only year is provided publication_date = f"{year}-01-01" - logging.info(f"Added publication_date from year (fallback): {publication_date}") + logger.info("Added publication_date from year (fallback): %s", publication_date) else: - logging.warning("No publication date or year found") + logger.warning("No publication date or year found") if publication_date: zenodo_metadata["publication_date"] = publication_date @@ -287,7 +312,7 @@ def create_zenodo_json(): keywords_list = extract_keywords(pref["keywords"]) if keywords_list: zenodo_metadata["keywords"] = keywords_list - logging.info(f"Added {len(keywords_list)} keywords") + logger.info("Added %d keywords", len(keywords_list)) # license license_id = None @@ -300,7 +325,7 @@ def create_zenodo_json(): # Clean up common variations license_clean = str(license_id).upper().replace("_", "-") zenodo_metadata["license"] = license_clean - logging.info(f"Added license: {license_clean}") + logger.info("Added license: %s", license_clean) # language if pref.get("languages"): @@ -308,14 +333,14 @@ def create_zenodo_json(): pref["languages"][0] if isinstance(pref["languages"], list) else pref["languages"] ) zenodo_metadata["language"] = lang - logging.info(f"Added language: {lang}") + logger.info("Added language: %s", lang) # contributors if metadata.get("contributors"): contributors = format_contributors_for_zenodo(metadata["contributors"]) if 
contributors: zenodo_metadata["contributors"] = contributors - logging.info(f"Added {len(contributors)} contributors") + logger.info("Added %d contributors", len(contributors)) # related_identifiers related_identifiers = [] @@ -324,38 +349,39 @@ def create_zenodo_json(): related_identifiers.append( {"identifier": repo_url, "relation": "isSupplementedBy", "scheme": "url"} ) - logging.info("Added repository URL as related identifier") + logger.info("Added repository URL as related identifier") url = pref.get("url") if url and url != repo_url: related_identifiers.append( {"identifier": url, "relation": "isAlternateIdentifier", "scheme": "url"} ) - logging.info("Added URL as related identifier") + logger.info("Added URL as related identifier") if related_identifiers: zenodo_metadata["related_identifiers"] = related_identifiers # community zenodo_metadata["communities"] = [{"identifier": "quadriga"}] - logging.info("Added QUADRIGA community") + logger.info("Added QUADRIGA community") # version if "version" in pref: zenodo_metadata["version"] = str(pref["version"]) - logging.info(f"Added version: {pref['version']}") + logger.info("Added version: %s", pref["version"]) # write .zenodo.json try: with zenodo_json_path.open("w", encoding="utf-8") as f: json.dump(zenodo_metadata, f, ensure_ascii=False, indent=2) - logging.info(f"Zenodo metadata successfully created at {zenodo_json_path}") - return True - except OSError as e: - logging.exception(f"Error writing to {zenodo_json_path}: {e}") + except OSError: + logger.exception("Error writing to %s", zenodo_json_path) return False + else: + logger.info("Zenodo metadata successfully created at %s", zenodo_json_path) + return True - except Exception as e: - logging.exception(f"Unexpected error in create_zenodo_json: {e!s}") + except Exception: + logger.exception("Unexpected error in create_zenodo_json") return False diff --git a/quadriga/metadata/extract_from_book_config.py b/quadriga/metadata/extract_from_book_config.py index 
79b87cb..ea3946d 100644 --- a/quadriga/metadata/extract_from_book_config.py +++ b/quadriga/metadata/extract_from_book_config.py @@ -1,12 +1,14 @@ """ -This script extracts the title from _config.yml and the first level of the TOC from _toc.yml. -It then uses this information to update metadata.yml. -The titles for the TOC chapters are extracted from the first heading of the corresponding files. +Extract the title from _config.yml and the first level of the TOC from _toc.yml. + +It then uses this information to update metadata.yml. The titles for the TOC chapters are extracted +from the first heading of the corresponding files. """ +from __future__ import annotations + import logging import sys -from datetime import datetime from pathlib import Path from .utils import ( @@ -19,21 +21,23 @@ # Configure logging logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def extract_and_update(): +def extract_and_update() -> bool | None: """ Extract information from _config.yml and _toc.yml files and update metadata.yml. - Returns: + Returns + ------- bool: True if successful, False otherwise. """ try: # Get the repository root directory try: repo_root = get_repo_root() - except Exception as e: - logging.error(f"Failed to get repository root: {e}") + except Exception: + logger.exception("Failed to get repository root") return False # Define file paths using the get_file_path utility function @@ -48,7 +52,7 @@ def extract_and_update(): (metadata_path, "metadata.yml"), ]: if not path.exists(): - logging.error(f"Required file {name} not found at {path}") + logger.error("Required file %s not found at %s", name, path) return False # Load the files @@ -56,27 +60,32 @@ def extract_and_update(): toc_data = load_yaml_file(toc_path) metadata_data = load_yaml_file(metadata_path) - if not all([config_data, toc_data, metadata_data]): - logging.error("One or more required files couldn't be loaded. 
Exiting.") + if not config_data or not isinstance(config_data, dict): + logger.error("Could not load _config.yml or invalid format. Exiting.") + return False + if not toc_data or not isinstance(toc_data, dict): + logger.error("Could not load _toc.yml or invalid format. Exiting.") + return False + if not metadata_data or not isinstance(metadata_data, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") return False # Extract information from _config.yml title = config_data.get("title", "") - author = config_data.get("author", "") if not title: - logging.warning("No title found in _config.yml") + logger.warning("No title found in _config.yml") # Extract chapters and their titles from _toc.yml toc_chapters = [] missing_files = [] if "chapters" not in toc_data: - logging.warning("No 'chapters' section found in _toc.yml") + logger.warning("No 'chapters' section found in _toc.yml") else: for chapter in toc_data["chapters"]: if "file" not in chapter: - logging.warning("Found chapter entry without 'file' attribute in _toc.yml") + logger.warning("Found chapter entry without 'file' attribute in _toc.yml") continue try: @@ -94,7 +103,7 @@ def extract_and_update(): # Check if file exists if not full_path.exists(): missing_files.append(str(full_path)) - logging.warning(f"Chapter file not found: {full_path}") + logger.warning("Chapter file not found: %s", full_path) # Use filename as fallback title toc_chapters.append(f"[Missing: {p.stem}]") continue @@ -104,19 +113,19 @@ def extract_and_update(): # Add to the list of chapters toc_chapters.append(chapter_title) - except Exception as e: - logging.error(f"Error processing chapter {chapter.get('file', 'unknown')}: {e}") + except Exception: + logger.exception("Error processing chapter %s", chapter.get("file", "unknown")) # Add a placeholder with the filename if possible try: toc_chapters.append(f"[Error: {p.stem}]") - except: + except Exception: toc_chapters.append("[Error: unknown chapter]") if 
missing_files: - logging.warning(f"Missing {len(missing_files)} chapter files") + logger.warning("Missing %d chapter files", len(missing_files)) if not toc_chapters: - logging.warning("No chapter titles were extracted") + logger.warning("No chapter titles were extracted") # Format the TOC as a string with proper indentation and single newline between items toc_formatted = "- " + "\n- ".join(toc_chapters) @@ -132,7 +141,7 @@ def extract_and_update(): if "table-of-contents" in metadata_data: metadata_data["table-of-contents"] = toc_formatted else: - logging.warning("No 'table-of-contents' field found in metadata.yml") + logger.warning("No 'table-of-contents' field found in metadata.yml") # Save the updated metadata if save_yaml_file( @@ -140,20 +149,19 @@ def extract_and_update(): metadata_data, schema_comment="# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/v1.0.0/schema.json", ): - logging.info("Metadata updated successfully!") + logger.info("Metadata updated successfully!") return True - else: - logging.error("Failed to save metadata.yml") - return False - except Exception as e: - logging.exception(f"Error updating metadata.yml: {e}") + logger.error("Failed to save metadata.yml") + return False + except Exception: + logger.exception("Error updating metadata.yml") return False else: - logging.error("Metadata file couldn't be loaded or is empty.") + logger.error("Metadata file couldn't be loaded or is empty.") return False - except Exception as e: - logging.exception(f"Unexpected error in extract_and_update: {e}") + except Exception: + logger.exception("Unexpected error in extract_and_update") return False diff --git a/quadriga/metadata/inject_all_metadata.py b/quadriga/metadata/inject_all_metadata.py new file mode 100644 index 0000000..bfbddcc --- /dev/null +++ b/quadriga/metadata/inject_all_metadata.py @@ -0,0 +1,889 @@ +""" +Injects all metadata (JSON-LD, OpenGraph, and RDF links) into generated HTML files. 
+ +This unified script combines JSON-LD structured data injection and OpenGraph +social media metadata into a single efficient pass through HTML files. + +It reads metadata.jsonld and injects: +- OpenGraph tags for social media previews +- JSON-LD ') + + # 3. RDF discovery links + if add_link_elements: + injection_parts.append( + ' ' + ) + injection_parts.append( + ' ' + ) + + # Join all parts with newlines + full_injection = "\n".join(injection_parts) + + # Find optimal injection point + # Priority: after viewport, after charset, before (fallback) + injection_point = None + + # Try to inject after viewport meta tag (best practice for OpenGraph) + viewport_match = re.search(r"(]*>\s*)", html_content, re.IGNORECASE) + if viewport_match: + injection_point = viewport_match.end() + else: + # Fallback: try after charset meta tag + charset_match = re.search(r"(]*>\s*)", html_content, re.IGNORECASE) + if charset_match: + injection_point = charset_match.end() + + # Final fallback: inject before + if injection_point is None: + injection_point = html_content.find("") + if injection_point == -1: + logger.warning("No tag found in %s, skipping", html_path.name) + return False + # For injection, add before the tag + html_content = html_content[:injection_point] + f"\n{full_injection}\n" + html_content[injection_point:] + else: + # For after viewport/charset injection, insert at found position + html_content = html_content[:injection_point] + f"\n{full_injection}\n\n" + html_content[injection_point:] + + # Write the modified HTML back + with html_path.open("w", encoding="utf-8") as f: + f.write(html_content) + + logger.info("Injected all metadata into %s", html_path.name) + return True + + except FileNotFoundError: + logger.exception("HTML file not found: %s", html_path) + return False + except Exception: + logger.exception("Error injecting metadata into %s", html_path) + return False + + +# ============================================================================ +# Main 
Injection Orchestration +# ============================================================================ + + +def inject_all_metadata( + build_dir: Path | None = None, + jsonld_path: Path | None = None, + config_path: Path | None = None, + toc_path: Path | None = None, +) -> bool: + """ + Inject all metadata (OpenGraph, JSON-LD, and RDF links) into Jupyter Book HTML files. + + This unified function combines OpenGraph social media metadata and JSON-LD structured + data injection into a single efficient operation. + + For the root page, it injects: + - OpenGraph: og:type="book" with book:author, book:release_date, book:tag + - JSON-LD: Full book metadata + + For chapter pages, it injects: + - OpenGraph: og:type="article" with article:author, article:published_time, article:modified_time + - JSON-LD: Chapter-specific LearningResource with isPartOf reference to book + + Args: + build_dir (Path, optional): Path to _build/html directory (default: ./_build/html) + jsonld_path (Path, optional): Path to metadata.jsonld (default: ./metadata.jsonld) + config_path (Path, optional): Path to _config.yml (default: ./_config.yml) + toc_path (Path, optional): Path to _toc.yml (default: ./_toc.yml) + + Returns + ------- + bool: True if successful, False otherwise + """ + try: + # Determine paths + if build_dir is None: + build_dir = Path.cwd() / "_build" / "html" + if jsonld_path is None: + jsonld_path = Path.cwd() / "metadata.jsonld" + if config_path is None: + config_path = Path.cwd() / "_config.yml" + if toc_path is None: + toc_path = Path.cwd() / "_toc.yml" + + # Check if build directory exists + if not build_dir.exists(): + logger.error("Build directory not found: %s", build_dir) + return False + + # Check if JSON-LD file exists + if not jsonld_path.exists(): + logger.error("JSON-LD file not found: %s", jsonld_path) + return False + + # Read and validate JSON-LD + try: + with jsonld_path.open(encoding="utf-8") as f: + jsonld_data = json.load(f) + logger.info("Loaded JSON-LD from 
%s", jsonld_path) + except json.JSONDecodeError: + logger.exception("Invalid JSON in %s", jsonld_path) + return False + + # Extract base URL from metadata + base_url = jsonld_data.get("url", "") + if not base_url: + logger.error("No URL found in metadata.jsonld") + return False + + # Extract book title for og:site_name + book_title = jsonld_data.get("name", "") + if not book_title: + logger.warning("No book title found in metadata.jsonld") + + # Get logo filename from config + logo_filename = get_logo_from_config(config_path) + logger.info("Using logo: %s", logo_filename) + + # Determine the actual root page from _toc.yml + root_html = get_root_page_from_toc(toc_path, build_dir) + + # Fall back to index.html if we couldn't determine the root + if root_html is None or not root_html.exists(): + root_html = build_dir / "index.html" + logger.info("Using index.html as root page") + + # ==================================================================== + # Process root page + # ==================================================================== + if root_html.exists(): + # Generate OpenGraph tags for root page (og:type="book") + og_tags = create_opengraph_meta_tags( + jsonld_data, + base_url, + book_title, + logo_filename, + is_chapter=False, + ) + + # Format JSON-LD content with proper indentation + jsonld_content = json.dumps(jsonld_data, ensure_ascii=False, indent=2) + jsonld_content = "\n".join(" " + line for line in jsonld_content.split("\n")) + + # Inject both OpenGraph and JSON-LD into root page + if not inject_all_metadata_into_html(root_html, og_tags, jsonld_content): + logger.error("Failed to inject metadata into %s", root_html.name) + return False + else: + logger.warning("Root HTML file not found at %s", root_html) + return False + + # ==================================================================== + # Process index.html redirect page (if different from root) + # ==================================================================== + # Jupyter Book may 
create index.html as a meta-refresh redirect + # Social media crawlers don't follow these redirects, so we need + # to inject OpenGraph metadata into index.html as well + index_html = build_dir / "index.html" + if index_html.exists() and index_html != root_html: + try: + with index_html.open(encoding="utf-8") as f: + index_content = f.read() + + # Check if this is a simple meta-refresh redirect page (with or without proper HTML structure) + if "meta http-equiv" in index_content.lower(): + logger.info("Found index.html redirect page, injecting OpenGraph metadata") + + # Generate OpenGraph tags (but skip JSON-LD for redirect page) + og_tags = create_opengraph_meta_tags( + jsonld_data, + base_url, + book_title, + logo_filename, + is_chapter=False, + ) + + # Check if the redirect page has proper HTML structure + if "]*>', index_content, re.IGNORECASE) + meta_refresh = meta_refresh_match.group(0) if meta_refresh_match else "" + + # Create proper HTML with OpenGraph metadata and meta refresh + new_index_content = f""" + + + + {meta_refresh} +{og_tags} + {escape_html(book_title)} + + +

Redirecting to {root_html.name}...

+ + +""" + # Write the new index.html + with index_html.open("w", encoding="utf-8") as f: + f.write(new_index_content) + logger.info("Successfully created index.html with OpenGraph metadata and redirect") + else: + # Has proper HTML structure, inject normally + if not inject_all_metadata_into_html(index_html, og_tags, "", add_link_elements=False): + logger.warning("Failed to inject OpenGraph into index.html redirect page") + else: + logger.info("Successfully injected OpenGraph metadata into index.html redirect page") + except Exception: + logger.exception("Error processing index.html redirect page") + + # ==================================================================== + # Process chapter pages + # ==================================================================== + chapters_injected = 0 + if jsonld_data.get("hasPart"): + logger.info("Processing %d chapters...", len(jsonld_data["hasPart"])) + + for chapter in jsonld_data["hasPart"]: + if not isinstance(chapter, dict): + continue + + # Get chapter URL + chapter_url = chapter.get("url") + if not chapter_url: + logger.warning("Chapter missing URL: %s", chapter.get("name", "Unknown")) + continue + + # Find the HTML file for this chapter + chapter_html_path = get_html_path_from_url(chapter_url, build_dir) + if not chapter_html_path: + logger.warning("Could not find HTML file for chapter: %s", chapter.get("name", "Unknown")) + continue + + # Create chapter metadata for OpenGraph (combining chapter + book data) + chapter_og_metadata = { + "name": chapter.get("name", ""), + "url": chapter_url, + "description": chapter.get("description", ""), + "author": jsonld_data.get("author", []), # Inherit from book + "datePublished": jsonld_data.get("datePublished"), # Inherit from book + "dateModified": jsonld_data.get("dateModified"), # Inherit from book + "inLanguage": jsonld_data.get("inLanguage"), # Inherit from book + } + + # Generate OpenGraph tags for chapter (og:type="article") + chapter_og_tags = 
create_opengraph_meta_tags( + chapter_og_metadata, + base_url, + book_title, + logo_filename, + is_chapter=True, + ) + + # Create chapter-specific JSON-LD with book-level metadata + chapter_jsonld = create_chapter_jsonld(chapter, jsonld_data) + + # Convert to formatted string + chapter_jsonld_str = json.dumps(chapter_jsonld, ensure_ascii=False, indent=2) + chapter_jsonld_str = "\n".join(" " + line for line in chapter_jsonld_str.split("\n")) + + # Inject both OpenGraph and JSON-LD into chapter HTML + if inject_all_metadata_into_html(chapter_html_path, chapter_og_tags, chapter_jsonld_str): + chapters_injected += 1 + else: + logger.warning("Failed to inject metadata into chapter: %s", chapter.get("name", "Unknown")) + + logger.info("Injected metadata into %d chapter pages", chapters_injected) + + logger.info("All metadata injection completed successfully") + return True + + except Exception: + logger.exception("Unexpected error in inject_all_metadata") + return False + + +# ============================================================================ +# CLI Entry Point +# ============================================================================ + + +def main() -> None: + """ + Run the unified metadata injection script. 
+ + Usage: + python -m quadriga.metadata.inject_all_metadata + """ + import argparse + + parser = argparse.ArgumentParser( + description="Inject all metadata (OpenGraph, JSON-LD, RDF links) into Jupyter Book HTML" + ) + parser.add_argument( + "--build-dir", + type=Path, + help="Path to _build/html directory (default: ./_build/html)", + ) + parser.add_argument( + "--jsonld-path", + type=Path, + help="Path to metadata.jsonld file (default: ./metadata.jsonld)", + ) + parser.add_argument( + "--config-path", + type=Path, + help="Path to _config.yml file (default: ./_config.yml)", + ) + parser.add_argument( + "--toc-path", + type=Path, + help="Path to _toc.yml file (default: ./_toc.yml)", + ) + + args = parser.parse_args() + + success = inject_all_metadata( + build_dir=args.build_dir, + jsonld_path=args.jsonld_path, + config_path=args.config_path, + toc_path=args.toc_path, + ) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/quadriga/metadata/run_all.py b/quadriga/metadata/run_all.py index b6c0b8e..6407619 100644 --- a/quadriga/metadata/run_all.py +++ b/quadriga/metadata/run_all.py @@ -1,17 +1,30 @@ -""" -This script runs the various metadata update scripts in the correct order. 
-""" +"""Script to coordinate the different metadata transformation scripts for QUADRIGA Jupyter Books.""" + +from __future__ import annotations import logging +import os import sys -from .create_bibtex import create_bibtex_from_cff -from .extract_from_book_config import extract_and_update -from .update_citation_cff import update_citation -from .create_zenodo_json import create_zenodo_json +# Add current working directory to sys.path if not present +# This allows the script to run with python -m without package installation +cwd = os.getcwd() +if cwd not in sys.path: + sys.path.insert(0, cwd) + +from quadriga.metadata.create_bibtex import create_bibtex_from_cff +from quadriga.metadata.create_jsonld import create_jsonld +from quadriga.metadata.create_rdfxml import create_rdfxml +from quadriga.metadata.create_zenodo_json import create_zenodo_json +from quadriga.metadata.extract_from_book_config import extract_and_update +from quadriga.metadata.update_citation_cff import update_citation +from quadriga.metadata.validate_schema import validate_schema +logger = logging.getLogger(__name__) -def main(): + +def main() -> bool | None: + """Run the different metadata transformation scripts in order.""" try: # Configure logging with timestamp logging.basicConfig( @@ -20,56 +33,86 @@ def main(): datefmt="%Y-%m-%d %H:%M:%S", ) - logging.info("Running all metadata update scripts...") + logger.info("Running all metadata update scripts...") + + # Validate metadata.yml against QUADRIGA schema first + try: + logger.info("Validating metadata.yml against QUADRIGA schema...") + if not validate_schema(): + logger.error("Schema validation failed.") + return False + except Exception: + logger.exception("Unexpected error during schema validation") + return False # Execute extract_and_update with error handling try: - logging.info("Extracting metadata from _config.yml and _toc.yml...") + logger.info("Extracting metadata from _config.yml and _toc.yml...") if not extract_and_update(): - 
logging.error("Extract and update process failed.") + logger.error("Extract and update process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during extract_and_update: {str(e)}") + except Exception: + logger.exception("Unexpected error during extract_and_update") return False # Execute update_citation with error handling try: - logging.info("Updating CITATION.cff...") + logger.info("Updating CITATION.cff...") if not update_citation(): - logging.error("Update citation process failed.") + logger.error("Update citation process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during update_citation: {str(e)}") + except Exception: + logger.exception("Unexpected error during update_citation") return False # Execute create_bibtex_from_cff with error handling try: - logging.info("Creating CITATION.bib from CITATION.cff...") + logger.info("Creating CITATION.bib from CITATION.cff...") if not create_bibtex_from_cff(): - logging.error("Create BibTeX process failed.") + logger.error("Create BibTeX process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during create_bibtex_from_cff: {str(e)}") + except Exception: + logger.exception("Unexpected error during create_bibtex_from_cff") return False # Execute create_zenodo_json with error handling try: - logging.info("Creating .zenodo.json from CITATION.cff and metadata.yml...") + logger.info("Creating .zenodo.json from CITATION.cff and metadata.yml...") if not create_zenodo_json(): - logging.error("Create Zenodo JSON process failed.") + logger.error("Create Zenodo JSON process failed.") + return False + except Exception: + logger.exception("Unexpected error during create_zenodo_json") + return False + + # Execute create_jsonld with error handling + try: + logger.info("Creating metadata.jsonld from metadata.yml...") + if not create_jsonld(): + logger.error("Create JSON-LD process failed.") + return False + except 
Exception: + logger.exception("Unexpected error during create_jsonld") + return False + + # Execute create_rdfxml with error handling + try: + logger.info("Creating metadata.rdf from metadata.yml...") + if not create_rdfxml(): + logger.error("Create RDF/XML process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during create_zenodo_json: {str(e)}") + except Exception: + logger.exception("Unexpected error during create_rdfxml") return False - logging.info("All scripts executed successfully.") + logger.info("All scripts executed successfully.") return True except KeyboardInterrupt: - logging.warning("Process interrupted by user.") + logger.warning("Process interrupted by user.") return False - except Exception as e: - logging.exception(f"Unexpected error in main: {str(e)}") + except Exception: + logger.exception("Unexpected error in main") return False diff --git a/quadriga/metadata/update_citation_cff.py b/quadriga/metadata/update_citation_cff.py index e149b66..61c7e81 100644 --- a/quadriga/metadata/update_citation_cff.py +++ b/quadriga/metadata/update_citation_cff.py @@ -1,10 +1,13 @@ """ -Updates the CITATION.cff file with metadata from metadata.yml. +Update or create the CITATION.cff file with metadata from metadata.yml. This script reads metadata from 'metadata.yml' and updates the corresponding fields in 'CITATION.cff'. It handles fields like title, authors, URL, repository URL, and publication date. It also ensures that the 'preferred-citation' section, if present, is updated consistently. + +If CITATION.cff does not exist, a new one is created from metadata.yml with +the required CFF boilerplate fields. 
""" import logging @@ -14,20 +17,105 @@ from .utils import extract_keywords, get_file_path, load_yaml_file, save_yaml_file logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def update_citation(): +def _create_initial_citation_cff(metadata: dict) -> dict: """ - Updates the CITATION.cff file using data from the metadata.yml file. + Create an initial CITATION.cff data structure from metadata.yml. + + Builds a minimal valid CFF 1.2.0 file with the required fields + (cff-version, message, title, authors) plus optional fields + that can be derived from metadata.yml. - The function performs the following steps: - 1. Constructs absolute paths to 'metadata.yml' and 'CITATION.cff'. - 2. Loads data from both YAML files. - 3. Updates 'CITATION.cff' fields (title, authors, URL, repository-code, - and publication year in preferred-citation) based on 'metadata.yml'. - 4. For authors, it attempts to preserve existing author details in - 'CITATION.cff' if a matching author (by given and family names) is found. - 5. Saves the updated data back to 'CITATION.cff', including a schema comment. 
+ Args: + metadata: Parsed metadata.yml data + + Returns + ------- + dict: A valid CFF data structure + """ + citation_data: dict = { + "cff-version": "1.2.0", + "message": "If you use this work, please cite it using the metadata from this file.", + "type": "dataset", + } + + # Title (required by CFF) + citation_data["title"] = metadata.get("title", "Untitled") + + # Authors (required by CFF) + if metadata.get("authors"): + citation_authors = [] + for author in metadata["authors"]: + cff_author: dict = {} + if "given-names" in author: + cff_author["given-names"] = author["given-names"] + if "family-names" in author: + cff_author["family-names"] = author["family-names"] + if "orcid" in author: + cff_author["orcid"] = author["orcid"] + if "affiliation" in author: + cff_author["affiliation"] = author["affiliation"] + if cff_author: + citation_authors.append(cff_author) + citation_data["authors"] = citation_authors if citation_authors else [{"name": "Unknown"}] + else: + citation_data["authors"] = [{"name": "Unknown"}] + + # Optional fields from metadata + if "version" in metadata: + citation_data["version"] = metadata["version"] + + if "url" in metadata: + citation_data["url"] = metadata["url"] + + if "git" in metadata: + citation_data["repository-code"] = metadata["git"] + + if "identifier" in metadata: + doi_url = metadata["identifier"] + if "doi.org" in str(doi_url): + # Extract DOI value from URL + doi_value = str(doi_url).split("doi.org/")[-1] if "doi.org/" in str(doi_url) else None + if doi_value: + citation_data["identifiers"] = [ + {"type": "doi", "value": doi_value, "description": "Zenodo"} + ] + + if "description" in metadata: + citation_data["abstract"] = metadata["description"] + + if metadata.get("keywords"): + flattened = extract_keywords(metadata["keywords"]) + if flattened: + citation_data["keywords"] = flattened + + if "license" in metadata: + license_data = metadata["license"] + if isinstance(license_data, str): + citation_data["license"] = 
license_data + elif isinstance(license_data, dict) and "content" in license_data: + # Use content license as primary (most relevant for citation) + content_license = license_data["content"] + if isinstance(content_license, str): + citation_data["license"] = content_license + elif isinstance(content_license, list) and content_license: + citation_data["license"] = content_license[0] + + logger.info("Created initial CITATION.cff from metadata.yml") + return citation_data + + +def update_citation() -> bool: + """ + Update or create the CITATION.cff file using data from the metadata.yml file. + + If CITATION.cff exists, the function updates its fields based on metadata.yml, + preserving existing author details and preferred-citation sections. + + If CITATION.cff does not exist, a new one is created from metadata.yml with + the required CFF boilerplate. Returns ------- @@ -39,32 +127,31 @@ def update_citation(): repo_root = get_file_path("") # Get repo root by providing empty relative path metadata_path = get_file_path("metadata.yml", repo_root) citation_cff_path = get_file_path("CITATION.cff", repo_root) - except Exception as e: - logging.exception(f"Failed to resolve file paths: {e!s}") + except Exception: + logger.exception("Failed to resolve file paths") return False - # Check if files exist - for path, name in [ - (metadata_path, "metadata.yml"), - (citation_cff_path, "CITATION.cff"), - ]: - if not Path(path).exists(): - logging.error(f"Required file {name} not found at {path}") - return False + # metadata.yml must exist + if not Path(metadata_path).exists(): + logger.error("Required file metadata.yml not found at %s", metadata_path) + return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - # Load existing CITATION.cff - citation_data = load_yaml_file(citation_cff_path) - - if not metadata: - logging.error("Could not load metadata.yml. 
Exiting.") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") return False - if not citation_data: - logging.error("Could not load CITATION.cff. Exiting.") - return False + # Load existing CITATION.cff or create initial structure + if Path(citation_cff_path).exists(): + citation_data = load_yaml_file(citation_cff_path) + if not citation_data or not isinstance(citation_data, dict): + logger.error("Could not load CITATION.cff or invalid format. Exiting.") + return False + else: + logger.info("CITATION.cff not found — creating from metadata.yml") + citation_data = _create_initial_citation_cff(metadata) # Track if updates were made updates_made = False @@ -76,18 +163,18 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["title"] = metadata["title"] updates_made = True - logging.info(f"Updated title to: {metadata['title']}") + logger.info("Updated title to: %s", metadata["title"]) else: - logging.warning("No title found in metadata.yml") + logger.warning("No title found in metadata.yml") if "version" in metadata: citation_data["version"] = metadata["version"] if "preferred-citation" in citation_data: citation_data["preferred-citation"]["version"] = metadata["version"] updates_made = True - logging.info(f"Updated version to: {metadata['version']}") + logger.info("Updated version to: %s", metadata["version"]) else: - logging.warning("No version found in metadata.yml, skipping version update") + logger.warning("No version found in metadata.yml, skipping version update") if metadata.get("authors"): try: @@ -102,13 +189,12 @@ def update_citation(): and "family-names" in cit_author and "given-names" in author and "family-names" in author + ) and ( + cit_author["given-names"] == author["given-names"] + and cit_author["family-names"] == author["family-names"] ): - if ( - cit_author["given-names"] == author["given-names"] - and 
cit_author["family-names"] == author["family-names"] - ): - new_author_entry = cit_author - break + new_author_entry = cit_author + break # Update author entry with metadata if "given-names" in author: @@ -129,13 +215,13 @@ def update_citation(): citation_data["preferred-citation"]["authors"] = citation_authors updates_made = True - logging.info(f"Updated {len(citation_authors)} authors") + logger.info("Updated %d authors", len(citation_authors)) else: - logging.warning("Failed to process authors from metadata.yml") - except Exception as e: - logging.exception(f"Error processing authors: {e!s}") + logger.warning("Failed to process authors from metadata.yml") + except Exception: + logger.exception("Error processing authors") else: - logging.warning("No authors found in metadata.yml") + logger.warning("No authors found in metadata.yml") # Update URL if present in metadata if "url" in metadata: @@ -143,7 +229,7 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["url"] = metadata["url"] updates_made = True - logging.info(f"Updated URL to: {metadata['url']}") + logger.info("Updated URL to: %s", metadata["url"]) # Update repository URL if present in metadata if "git" in metadata: @@ -151,27 +237,33 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["repository-code"] = metadata["git"] updates_made = True - logging.info(f"Updated repository-code to: {metadata['git']}") + logger.info("Updated repository-code to: %s", metadata["git"]) - # Update publication year based on date-modified or date-published + # Update publication year based on date-modified or date-issued # Prefer newer date-modified, if available year_source = None year_value = None + year_digits = 4 if "date-modified" in metadata: date_str = metadata["date-modified"] - if isinstance(date_str, str) and len(date_str) >= 4: + if isinstance(date_str, str) and len(date_str) >= year_digits: year_value = 
date_str[:4] year_source = "date-modified" - elif "date-published" in metadata: - date_str = metadata["date-published"] - if isinstance(date_str, str) and len(date_str) >= 4: + elif "date-issued" in metadata: + date_str = metadata["date-issued"] + if isinstance(date_str, str) and len(date_str) >= year_digits: year_value = date_str[:4] # Extract year from YYYY-MM-DD - year_source = "date-published" + year_source = "date-issued" if year_value and "preferred-citation" in citation_data: citation_data["preferred-citation"]["year"] = year_value updates_made = True - logging.info(f"Updated publication year to: {year_value} (from {year_source})") + logger.info("Updated publication year to: %s (from %s)", year_value, year_source) + + if "description" in metadata: + citation_data["abstract"] = metadata["description"] + updates_made = True + logger.info("Updated abstract from description") # Update keywords if present in metadata # Extract keywords to flatten any language-keyed formats @@ -182,27 +274,26 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["keywords"] = flattened_keywords updates_made = True - logging.info(f"Updated keywords with {len(flattened_keywords)} items") + logger.info("Updated keywords with %d items", len(flattened_keywords)) else: - logging.warning("Keywords found in metadata.yml but could not be extracted") + logger.warning("Keywords found in metadata.yml but could not be extracted") else: - logging.warning("No keywords found in metadata.yml") + logger.warning("No keywords found in metadata.yml") # No changes if not updates_made: - logging.warning("No updates were made to CITATION.cff") + logger.warning("No updates were made to CITATION.cff") return True # Not an error, just no changes needed # Save updated CITATION.cff - success = save_yaml_file( + return save_yaml_file( citation_cff_path, citation_data, schema_comment="# yaml-language-server: 
$schema=https://citation-file-format.github.io/1.2.0/schema.json", ) - return success - except Exception as e: - logging.exception(f"Unexpected error in update_citation: {e!s}") + except Exception: + logger.exception("Unexpected error in update_citation") return False diff --git a/quadriga/metadata/update_version_from_tag.py b/quadriga/metadata/update_version_from_tag.py index c150054..57e1fff 100644 --- a/quadriga/metadata/update_version_from_tag.py +++ b/quadriga/metadata/update_version_from_tag.py @@ -1,47 +1,47 @@ -""" -Updates book-version and date-modified in metadata.yml based on git tag. -""" +"""Update version and date-modified in metadata.yml based on git tag.""" import logging import os import sys -from datetime import datetime +from datetime import UTC, datetime from .utils import get_file_path, load_yaml_file, save_yaml_file logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def update_version_from_tag(): +def update_version_from_tag() -> bool: """ - Updates book-version and date-modified in metadata.yml from git tag. + Update book and date-modified in metadata.yml from git tag. Expects the version to be passed via environment variable TAG_VERSION. - Returns: + Returns + ------- bool: True if successful, False otherwise. 
""" try: # Get version from environment variable (set by GitHub Actions) version = os.environ.get("TAG_VERSION") if not version: - logging.info("No TAG_VERSION environment variable found - skipping version update") + logger.info("No TAG_VERSION environment variable found - skipping version update") return True - logging.info(f"Updating metadata for version: {version}") + logger.info("Updating metadata for version: %s", version) # Get file path try: repo_root = get_file_path("") metadata_path = get_file_path("metadata.yml", repo_root) - except Exception as e: - logging.error(f"Failed to resolve file paths: {str(e)}") + except Exception: + logger.exception("Failed to resolve file paths") return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - if not metadata: - logging.error("Could not load metadata.yml") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format") return False # Track if updates were made @@ -52,19 +52,19 @@ def update_version_from_tag(): if current_version != version: metadata["version"] = version updates_made = True - logging.info(f"Updated version from '{current_version}' to '{version}'") + logger.info("Updated version from '%s' to '%s'", current_version, version) else: - logging.info(f"version already matches tag version: {version}") + logger.info("version already matches tag version: %s", version) # Update date-modified - current_date = datetime.now().strftime("%Y-%m-%d") + current_date = datetime.now(UTC).strftime("%Y-%m-%d") old_date = metadata.get("date-modified") if old_date != current_date: metadata["date-modified"] = current_date updates_made = True - logging.info(f"Updated date-modified from '{old_date}' to '{current_date}'") + logger.info("Updated date-modified from '%s' to '%s'", old_date, current_date) else: - logging.info(f"date-modified already current: {current_date}") + logger.info("date-modified already current: %s", current_date) # Save if updates were 
made if updates_made: @@ -74,14 +74,13 @@ def update_version_from_tag(): schema_comment="# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/v1.0.0/schema.json", ) if success: - logging.info("Successfully updated metadata.yml") + logger.info("Successfully updated metadata.yml") return success - else: - logging.info("No updates needed") - return True + logger.info("No updates needed") + return True - except Exception as e: - logging.exception(f"Unexpected error in update_version_from_tag: {str(e)}") + except Exception: + logger.exception("Unexpected error in update_version_from_tag") return False diff --git a/quadriga/metadata/utils.py b/quadriga/metadata/utils.py index c11a376..eb1ebe1 100644 --- a/quadriga/metadata/utils.py +++ b/quadriga/metadata/utils.py @@ -1,19 +1,20 @@ """ Common utility functions for metadata management in the Quadriga Book Template. + This module provides reused functionality across different metadata scripts. """ +from __future__ import annotations + import json import logging -import os import re -import sys -from datetime import datetime from pathlib import Path import yaml logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) # ---- File Path Handling ---- @@ -22,10 +23,12 @@ def get_repo_root() -> Path: """ Get the path to the repository root, assuming this module is in quadriga/metadata/. 
- Returns: + Returns + ------- Path: Absolute path to the repository root - Raises: + Raises + ------ FileNotFoundError: If the repository structure is not as expected """ try: @@ -38,14 +41,16 @@ def get_repo_root() -> Path: found_files = [f for f in required_files if (repo_root / f).exists()] if len(found_files) < 1: - raise FileNotFoundError( - f"Repository root at {repo_root} doesn't contain expected files (_config.yml or _toc.yml)" + msg = ( + f"Repository root at {repo_root} doesn't contain expected files " + "(_config.yml or _toc.yml)" ) - - return repo_root - except Exception as e: - logging.exception(f"Error resolving repository root: {e}") + raise FileNotFoundError(msg) + except Exception: + logger.exception("Error resolving repository root") raise + else: + return repo_root def get_file_path(relative_path: str | Path, repo_root: Path | None = None) -> Path: @@ -56,47 +61,53 @@ def get_file_path(relative_path: str | Path, repo_root: Path | None = None) -> P relative_path (str | Path): Relative path from the repository root repo_root (Path, optional): Repository root path. If None, it will be determined - Returns: + Returns + ------- Path: Absolute path to the file """ - try: - if repo_root is None: - repo_root = get_repo_root() - return repo_root / Path(relative_path) - except Exception as e: - logging.error(f"Error resolving file path for '{relative_path}': {e}") - # Return the relative path as a fallback - return Path(relative_path) + if repo_root is None: + repo_root = get_repo_root() + return repo_root / Path(relative_path) # ---- YAML Handling ---- -def load_yaml_file(file_path: str | Path): +def load_yaml_file(file_path: str | Path) -> dict | list | None: """ Load a YAML file and return its contents as a Python object. 
Args: file_path (str | Path): Path to the YAML file - Returns: + Returns + ------- dict/list: Contents of the YAML file, or None if an error occurs """ + # Convert to Path at the edge + path = Path(file_path) + try: - with open(file_path, "r", encoding="utf-8") as file: - return yaml.safe_load(file) + with path.open(encoding="utf-8") as file: + data = yaml.safe_load(file) + # yaml.safe_load returns Any; ensure it's dict or list + if isinstance(data, (dict, list)): + return data + return None except FileNotFoundError: - logging.error(f"File not found: {Path(file_path).name}") + logger.exception("File not found: %s", path.name) return None - except yaml.YAMLError as e: - logging.error(f"YAML parsing error in {Path(file_path).name}: {e}") + except yaml.YAMLError: + logger.exception("YAML parsing error in %s", path.name) return None - except Exception as e: - logging.error(f"Error loading {Path(file_path).name}: {e}") + except Exception: + logger.exception("Error loading %s", path.name) return None -def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = None): +def save_yaml_file( + file_path: str | Path, data: dict | list, schema_comment: str | None = None +) -> bool: """ Save Python object as YAML to the specified file. @@ -104,19 +115,23 @@ def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = Non file_path (str | Path): Path where the YAML file should be saved data (dict/list): Data to save schema_comment (str, optional): Schema comment to add at the start of the file - e.g. "# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/schema.json" + e.g. 
"# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/latest/schema.json" - Returns: + Returns + ------- bool: True if successful, False otherwise """ + # Convert to Path at the edge + path = Path(file_path) + try: # Create directories if they don't exist - directory = Path(file_path).parent + directory = path.parent if not directory.exists(): directory.mkdir(parents=True) - logging.info(f"Created directory: {directory}") + logger.info("Created directory: %s", directory) - with open(file_path, "w", encoding="utf-8") as file: + with path.open("w", encoding="utf-8") as file: yaml.dump( data, file, @@ -127,25 +142,25 @@ def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = Non if schema_comment: try: - with open(file_path, "r+", encoding="utf-8") as file: + with path.open("r+", encoding="utf-8") as file: content = file.read() file.seek(0, 0) file.write(f"{schema_comment}\n" + content) - except Exception as e: - logging.warning(f"Failed to add schema comment to {Path(file_path).name}: {e}") + except Exception: + logger.exception("Failed to add schema comment to %s", path.name) # Not a critical error, proceed - - logging.info(f"Successfully updated {Path(file_path).name}") - return True - except yaml.YAMLError as e: - logging.error(f"YAML encoding error for {Path(file_path).name}: {e}") + except yaml.YAMLError: + logger.exception("YAML encoding error for %s", path.name) return False - except PermissionError as e: - logging.error(f"Permission denied when saving {Path(file_path).name}: {e}") + except PermissionError: + logger.exception("Permission denied when saving %s", path.name) return False - except Exception as e: - logging.error(f"Error saving to {Path(file_path).name}: {e}") + except Exception: + logger.exception("Error saving to %s", path.name) return False + else: + logger.info("Successfully updated %s", path.name) + return True # ---- Markdown and Jupyter Content Handling ---- @@ -161,7 +176,8 @@ def 
remove_yaml_frontmatter(text: str) -> str: Args: text (str): Markdown content that may contain frontmatter - Returns: + Returns + ------- str: Content with frontmatter removed """ pattern = r"^\s*---\s*\n(.*?)\n\s*---\s*(\n|$)" @@ -175,14 +191,16 @@ def extract_first_heading(file_path: str | Path) -> str: Args: file_path (str | Path): Path to the file - Returns: + Returns + ------- str: The content of the first heading or filename if no heading found """ + # Convert to Path at the edge file_path_obj = Path(file_path) try: if file_path_obj.suffix == ".ipynb": try: - with open(file_path_obj, "r", encoding="utf-8") as file: + with file_path_obj.open(encoding="utf-8") as file: notebook = json.load(file) for cell in notebook.get("cells", []): @@ -191,30 +209,30 @@ def extract_first_heading(file_path: str | Path) -> str: heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) if heading_match: return heading_match.group(1).strip() - except json.JSONDecodeError as e: - logging.error(f"Invalid JSON in notebook {file_path_obj.name}: {e}") - except Exception as e: - logging.error(f"Error reading notebook {file_path_obj.name}: {e}") + except json.JSONDecodeError: + logger.exception("Invalid JSON in notebook %s", file_path_obj.name) + except Exception: + logger.exception("Error reading notebook %s", file_path_obj.name) elif file_path_obj.suffix == ".md": try: - with open(file_path_obj, "r", encoding="utf-8") as file: + with file_path_obj.open(encoding="utf-8") as file: content = file.read() content = remove_yaml_frontmatter(content) heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) if heading_match: return heading_match.group(1).strip() - except Exception as e: - logging.error(f"Error reading markdown {file_path_obj.name}: {e}") + except Exception: + logger.exception("Error reading markdown %s", file_path_obj.name) else: - logging.warning(f"Unsupported file type for heading extraction: {file_path_obj.name}") + logger.warning("Unsupported file type for 
heading extraction: %s", file_path_obj.name) return file_path_obj.stem except FileNotFoundError: - logging.error(f"File not found: {file_path_obj.name}") - except Exception as e: - logging.error(f"Error processing {file_path_obj.name}: {e}") + logger.exception("File not found: %s", file_path_obj.name) + except Exception: + logger.exception("Error processing %s", file_path_obj.name) return file_path_obj.stem @@ -222,43 +240,44 @@ def extract_first_heading(file_path: str | Path) -> str: # ---- Citation Handling ---- -def format_authors_for_bibtex(authors): +def format_authors_for_bibtex(authors: list) -> str: """ Format a list of authors in the proper BibTeX format. Args: authors (list): List of author dictionaries with 'given-names' and 'family-names' - Returns: + Returns + ------- str: Authors formatted for BibTeX """ try: if not authors: - logging.warning("No authors provided to format_authors_for_bibtex") + logger.warning("No authors provided to format_authors_for_bibtex") return "" formatted_authors = [] for i, author in enumerate(authors): if not isinstance(author, dict): - logging.warning(f"Author at index {i} is not a dictionary: {author}") + logger.warning("Author at index %s is not a dictionary: %s", i, author) continue family = author.get("family-names", "") given = author.get("given-names", "") if not family and not given: - logging.warning(f"Author at index {i} is missing both family-names and given-names") + logger.warning("Author at index %s is missing both family-names and given-names", i) continue formatted_authors.append(f"{family}, {given}") return " and ".join(formatted_authors) - except Exception as e: - logging.exception(f"Error formatting authors for BibTeX: {e}") + except Exception: + logger.exception("Error formatting authors for BibTeX") return "" -def generate_citation_key(authors, title, year): +def generate_citation_key(authors: list, title: str, year: str) -> str: """ Generate a citation key for BibTeX. 
@@ -267,7 +286,8 @@ def generate_citation_key(authors, title, year): title (str): Title of the work year (str): Year of publication - Returns: + Returns + ------- str: Citation key """ try: @@ -277,7 +297,7 @@ def generate_citation_key(authors, title, year): family_name = first_author.get("family-names", "Unknown") else: family_name = "Unknown" - logging.warning("No valid authors provided for citation key generation") + logger.warning("No valid authors provided for citation key generation") # Get the first word of the title or use 'Untitled' if title and isinstance(title, str): @@ -285,11 +305,11 @@ def generate_citation_key(authors, title, year): first_word = title_words[0] if title_words else "Untitled" else: first_word = "Untitled" - logging.warning("No valid title provided for citation key generation") + logger.warning("No valid title provided for citation key generation") # Use the year or empty string if not year: - logging.warning("No year provided for citation key generation") + logger.warning("No year provided for citation key generation") year = "" # Create a citation key with no invalid characters @@ -297,17 +317,17 @@ def generate_citation_key(authors, title, year): # Clean the key - remove special characters clean_key = re.sub(r"[^a-zA-Z0-9_]", "", raw_key) - - return clean_key or "Unknown_Citation" - except Exception as e: - logging.exception(f"Error generating citation key: {e}") + except Exception: + logger.exception("Error generating citation key") return "Unknown_Citation_Error" + else: + return clean_key or "Unknown_Citation" # ---- Keyword Handling ---- -def extract_keywords(keywords_data): +def extract_keywords(keywords_data: list | None) -> list: """ Extract keywords from various formats. 
@@ -319,15 +339,13 @@ def extract_keywords(keywords_data): Args: keywords_data: Keywords in various formats - Returns: + Returns + ------- list: List of keyword strings """ if not keywords_data: return [] - if not isinstance(keywords_data, list): - return [] - keywords = [] for item in keywords_data: if isinstance(item, str): @@ -336,10 +354,8 @@ def extract_keywords(keywords_data): elif isinstance(item, dict): # Dictionary format with language codes # Extract all values from the dictionary (should be only one per item) - for lang_code, keyword in item.items(): - if keyword: - keywords.append(str(keyword)) + keywords.extend(str(keyword) for keyword in item.values() if keyword) else: - logging.warning(f"Unexpected keyword format: {item}") + logger.warning("Unexpected keyword format: %s", item) return keywords diff --git a/quadriga/metadata/validate_schema.py b/quadriga/metadata/validate_schema.py new file mode 100644 index 0000000..71cd9a8 --- /dev/null +++ b/quadriga/metadata/validate_schema.py @@ -0,0 +1,119 @@ +"""Validate metadata.yml against the QUADRIGA JSON Schema. + +This module fetches the QUADRIGA schema (and referenced sub-schemas) from the +remote URL and validates a metadata dictionary against it. +""" + +from __future__ import annotations + +import json +import logging +import urllib.request + +from quadriga.metadata.utils import get_file_path, load_yaml_file + +logger = logging.getLogger(__name__) + +QUADRIGA_SCHEMA_URL = ( + "https://quadriga-dk.github.io/quadriga-schema/latest/schema.json" +) + + +def _fetch_json(url: str) -> dict: + """Fetch a JSON document from a URL. 
+ + Args: + url: URL to fetch + + Returns + ------- + dict: Parsed JSON content + + Raises + ------ + urllib.error.URLError: If the URL cannot be reached + json.JSONDecodeError: If the response is not valid JSON + """ + with urllib.request.urlopen(url, timeout=30) as resp: # noqa: S310 + return json.loads(resp.read()) + + +def _validate_metadata( + metadata: dict, schema_url: str = QUADRIGA_SCHEMA_URL +) -> tuple[bool, list[str]]: + """Validate a metadata dictionary against the QUADRIGA JSON Schema. + + Fetches the schema (and any ``$ref`` sub-schemas) fresh from the given URL. + + Args: + metadata: Metadata dictionary (e.g. parsed from metadata.yml) + schema_url: URL of the main QUADRIGA schema + + Returns + ------- + tuple[bool, list[str]]: ``(True, [])`` when valid, or + ``(False, [error_message, ...])`` when validation fails or the schema + cannot be fetched. + """ + try: + from jsonschema import Draft202012Validator + from referencing import Registry, Resource + from referencing.jsonschema import DRAFT202012 + except ImportError: + logger.warning( + "jsonschema package not installed – skipping schema validation. 
" + "Install it via: pip install jsonschema" + ) + return True, [] + + try: + logger.info("Fetching QUADRIGA schema from %s ...", schema_url) + main_schema = _fetch_json(schema_url) + except Exception: + logger.exception("Failed to fetch schema from %s", schema_url) + return False, [f"Could not fetch schema from {schema_url}"] + + def retrieve(uri: str) -> Resource: + data = _fetch_json(uri) + return Resource.from_contents(data, default_specification=DRAFT202012) + + try: + registry: Registry = Registry(retrieve=retrieve) + validator = Draft202012Validator(main_schema, registry=registry) + errors = list(validator.iter_errors(metadata)) + except Exception: + logger.exception("Error during schema validation") + return False, ["Unexpected error during schema validation"] + + if errors: + messages = [] + for err in errors: + path = err.json_path if err.json_path != "$" else "(root)" + messages.append(f"{path}: {err.message}") + return False, messages + + return True, [] + + +def validate_schema() -> bool: + """Load metadata.yml and validate it against the QUADRIGA schema. + + Returns + ------- + bool: True if validation passed, False otherwise. + """ + metadata_path = get_file_path("metadata.yml") + metadata = load_yaml_file(metadata_path) + if metadata is None: + logger.error("Could not load metadata.yml for validation.") + return False + + valid, errors = _validate_metadata(metadata) + if valid: + logger.info("Schema validation passed.") + return True + + logger.error("Schema validation failed with %d error(s):", len(errors)) + for i, error in enumerate(errors, 1): + logger.error(" %d. %s", i, error) + return False