diff --git a/.github/workflows/update-metadata.yml b/.github/workflows/update-metadata.yml index c0d907d..5e49502 100644 --- a/.github/workflows/update-metadata.yml +++ b/.github/workflows/update-metadata.yml @@ -20,35 +20,96 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ github.ref }} + fetch-depth: 0 + + - name: Extract version from tag (if triggered by tag) + id: extract_version + run: | + if [[ "$GITHUB_REF" == refs/tags/v* ]]; then + TAG_NAME=${GITHUB_REF#refs/tags/} + # Remove 'v' prefix if present (v1.0.0 -> 1.0.0) + VERSION=${TAG_NAME#v} + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT + echo "is_tag=true" >> $GITHUB_OUTPUT + echo "Triggered by version tag: $TAG_NAME (version: $VERSION)" + else + echo "is_tag=false" >> $GITHUB_OUTPUT + echo "Triggered by regular push to: $GITHUB_REF" + fi - name: Set up Python uses: actions/setup-python@v5 with: python-version-file: '.python-version' cache: pip - + - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pyyaml - + pip install -r dev-requirements.txt + + - name: Update version metadata (if triggered by tag) + if: steps.extract_version.outputs.is_tag == 'true' + env: + TAG_VERSION: ${{ steps.extract_version.outputs.version }} + run: | + python -m quadriga.metadata.update_version_from_tag + - name: Update metadata files + env: + PYTHONHASHSEED: 0 run: python -m quadriga.metadata.run_all - - - name: Check if files changed + + - name: Stage metadata files + run: | + # Add all metadata files that exist (handles both new and modified files) + for file in metadata.yml CITATION.bib CITATION.cff .zenodo.json metadata.jsonld metadata.rdf; do + [ -f "$file" ] && git add "$file" + done + + - name: Check if files staged id: check_changes run: | - if git diff --quiet metadata.yml && git diff --quiet CITATION.bib && git diff --quiet CITATION.cff; then + if git diff --cached --quiet; then echo "changes_detected=false" >> $GITHUB_OUTPUT else echo 
"changes_detected=true" >> $GITHUB_OUTPUT fi - - - name: Commit changes if necessary - if: steps.check_changes.outputs.changes_detected == 'true' + + - name: Commit changes (regular push) + if: steps.check_changes.outputs.changes_detected == 'true' && steps.extract_version.outputs.is_tag == 'false' run: | git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" - git add metadata.yml CITATION.bib CITATION.cff git commit -m "[Automated] Update metadata files" - git push \ No newline at end of file + git push + + - name: Commit changes and move tag (tag-triggered) + if: steps.check_changes.outputs.changes_detected == 'true' && steps.extract_version.outputs.is_tag == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git commit -m "[Automated] Update metadata for version ${{ steps.extract_version.outputs.version }}" + + # Delete the old tag (locally and remotely) + git tag -d ${{ steps.extract_version.outputs.tag_name }} + git push origin :refs/tags/${{ steps.extract_version.outputs.tag_name }} + + # Create new tag at the current commit (with updated metadata) + git tag ${{ steps.extract_version.outputs.tag_name }} + + # Push the changes and the new tag + git push origin HEAD:main + git push origin ${{ steps.extract_version.outputs.tag_name }} + + echo "Tag ${{ steps.extract_version.outputs.tag_name }} moved to commit with updated metadata" + + - name: No changes needed + if: steps.check_changes.outputs.changes_detected == 'false' + run: | + if [[ "${{ steps.extract_version.outputs.is_tag }}" == "true" ]]; then + echo "Metadata already matches the tag version - no changes needed" + else + echo "No metadata changes detected" + fi diff --git a/.zenodo.json b/.zenodo.json index 51862d3..c6e9cdb 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -22,7 +22,7 @@ "affiliation": "Fraunhofer-Institut für Offene 
Kommunikationssysteme FOKUS" } ], - "description": "Diese Fallstudie geht auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen ein. Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität und Nachvollziehbarkeit von Analysen behandelt.", + "description": "
Diese Fallstudie bildet mit Hilfe eines JupyterBooks einen Forschungsverlauf in der Verwaltungswissenschaft nach. Dabei wird anhand einer modellhaften Forschungsfrage auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen eingegangen. Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität und Nachvollziehbarkeit von Analysen behandelt.
\nDas interaktive Lehrbuch kann als Web-Version verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch auf GitHub zur Verfügung.
\nDie QUADRIGA-OER sind nach einem einheitlichen Template gestaltet, werden nach einem standardisierten Verfahren qualitätsgeprüft und mit Metadaten ausgezeichnet.
\nQUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um den Forschungsprozess möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes Datenkompetenzframework zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentative Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.
\nZu den Zielgruppen von QUADRIGA zählen insbesondere promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.
\nQUADRIGA ist eins von 11 Datenkompetenzzentren in Deutschland und wird vom Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR) und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartnern zählen:\n
Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im Umsetzungskonzept erfahren.
\n\nWeitere Informationen sowie Publikationen finden Sie auf der Webseite, in der Zenodo-Community und der GitHub-Organisation des Projekts.
\n", "publication_date": "2025-03-24", "keywords": [ "Verwaltung", @@ -37,7 +37,7 @@ "Reproduzierbarkeit", "R (Programmiersprache)" ], - "license": "CC-BY-SA-4.0", + "license": "CC BY 4.0", "language": "deu", "contributors": [ { diff --git a/CITATION.bib b/CITATION.bib index b4558da..f908f0f 100644 --- a/CITATION.bib +++ b/CITATION.bib @@ -4,7 +4,6 @@ @misc{Plomin_Reproduzierbarkeit_2025 year = {2025}, version = {1.0.0-beta.2}, note = {Repository: https://github.com/quadriga-dk/Tabelle-Fallstudie-1}, - version = {1.0.0-beta.2}, howpublished = {Available from: https://github.com/quadriga-dk/Tabelle-Fallstudie-1}, doi = {10.5281/zenodo.14975202}, url = {https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/0_Intro.html}, diff --git a/CITATION.cff b/CITATION.cff index ec6dc12..5b15ac3 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,9 +3,11 @@ cff-version: 1.2.0 version: 1.0.0-beta.2 title: 'Reproduzierbarkeit von Datenanalysen: Ein Fallbeispiel aus dem Nationalen Bildungsbericht. QUADRIGA Open Educational Resources: Tabelle 1' -abstract: Diese Fallstudie geht auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen - ein. Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von - Datenqualität und Nachvollziehbarkeit von Analysen behandelt. +abstract: Diese Fallstudie bildet mit Hilfe eines JupyterBooks einen Forschungsverlauf + in der Verwaltungswissenschaft nach. Dabei wird anhand einer modellhaften Forschungsfrage + auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen eingegangen. Dazu + werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität + und Nachvollziehbarkeit von Analysen behandelt. type: software message: Please cite this software using the metadata from `preferred-citation` in `CITATION.cff`. 
diff --git a/dev-requirements.txt b/dev-requirements.txt index 4818cc5..2d753e8 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1 +1,3 @@ -pyyaml \ No newline at end of file +pyyaml +jsonschema +rdflib diff --git a/metadata.jsonld b/metadata.jsonld new file mode 100644 index 0000000..ed42d01 --- /dev/null +++ b/metadata.jsonld @@ -0,0 +1,523 @@ +{ + "@context": { + "schema": "http://schema.org/", + "dc": "http://purl.org/dc/elements/1.1/", + "dcterms": "http://purl.org/dc/terms/", + "lrmi": "http://purl.org/dcx/lrmi-terms/", + "skos": "http://www.w3.org/2004/02/skos/core#", + "@vocab": "http://schema.org/" + }, + "@type": [ + "Book", + "LearningResource" + ], + "name": "Reproduzierbarkeit von Datenanalysen: Ein Fallbeispiel aus dem Nationalen Bildungsbericht. QUADRIGA Open Educational Resources: Tabelle 1", + "description": "Diese Fallstudie bildet mit Hilfe eines JupyterBooks einen Forschungsverlauf in der Verwaltungswissenschaft nach. Dabei wird anhand einer modellhaften Forschungsfrage auf die Reproduzierbarkeit von Analysen und Forschungsergebnissen eingegangen. 
Dazu werden in einzelnen Kapiteln die Themen Datennachnutzung, Bewertung von Datenqualität und Nachvollziehbarkeit von Analysen behandelt.", + "identifier": { + "@type": "PropertyValue", + "propertyID": "DOI", + "value": "10.5281/zenodo.14975202", + "url": "https://doi.org/10.5281/zenodo.14975202" + }, + "version": "1.0.0-beta.2", + "schemaVersion": "1.0.0", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/0_Intro.html", + "workExample": { + "@type": "SoftwareSourceCode", + "name": "Source Code Repository", + "codeRepository": "https://github.com/quadriga-dk/Tabelle-Fallstudie-1" + }, + "datePublished": "2024-06-13", + "dateModified": "2025-03-24", + "author": [ + { + "@type": "Person", + "givenName": "Jana", + "familyName": "Plomin", + "name": "Jana Plomin", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-0127-7493", + "url": "https://orcid.org/0000-0003-0127-7493" + }, + "affiliation": { + "@type": "Organization", + "name": "Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS" + } + }, + { + "@type": "Person", + "givenName": "Juliane", + "familyName": "Schmeling", + "name": "Juliane Schmeling", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0009-0005-9814-1139", + "url": "https://orcid.org/0009-0005-9814-1139" + }, + "affiliation": { + "@type": "Organization", + "name": "Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS" + } + }, + { + "@type": "Person", + "givenName": "Paul", + "familyName": "Walter", + "name": "Paul Walter", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-9236-3268", + "url": "https://orcid.org/0000-0002-9236-3268" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Anton", + "familyName": "Schulze", + "name": "Anton Schulze", + "affiliation": { + "@type": "Organization", + "name": "Fraunhofer-Institut 
für Offene Kommunikationssysteme FOKUS" + } + } + ], + "contributor": [ + { + "@type": "Person", + "givenName": "Hannes", + "familyName": "Schnaitter", + "name": "Hannes Schnaitter", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-1602-6032", + "url": "https://orcid.org/0000-0002-1602-6032" + }, + "affiliation": { + "@type": "Organization", + "name": "Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft" + } + }, + { + "@type": "Person", + "givenName": "Evgenia", + "familyName": "Samoilova", + "name": "Evgenia Samoilova", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-3858-901X", + "url": "https://orcid.org/0000-0003-3858-901X" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Carsten", + "familyName": "Schneemann", + "name": "Carsten Schneemann", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-2683-5853", + "url": "https://orcid.org/0000-0002-2683-5853" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Lamia", + "familyName": "Islam", + "name": "Lamia Islam", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0009-0001-1879-9880", + "url": "https://orcid.org/0009-0001-1879-9880" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Yue", + "familyName": "Zhang", + "name": "Yue Zhang", + "affiliation": { + "@type": "Organization", + "name": "Technische Universität Berlin" + } + }, + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Wiemer", + "name": "Philip Wiemer" + }, + { + "@type": "Person", + "givenName": "Jan", + "familyName": "Bernoth", + "name": "Jan Bernoth", + "identifier": { + "@type": 
"PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-4127-0053", + "url": "https://orcid.org/0000-0002-4127-0053" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Melanie", + "familyName": "Seltmann", + "name": "Melanie Seltmann", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-7588-4395", + "url": "https://orcid.org/0000-0002-7588-4395" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Henny", + "familyName": "Sluyther-Gäthhje", + "name": "Henny Sluyther-Gäthhje", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0003-2969-3237", + "url": "https://orcid.org/0000-0003-2969-3237" + }, + "affiliation": { + "@type": "Organization", + "name": "Universität Potsdam" + } + }, + { + "@type": "Person", + "givenName": "Vivien", + "familyName": "Petras", + "name": "Vivien Petras", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-8113-1509", + "url": "https://orcid.org/0000-0002-8113-1509" + }, + "affiliation": { + "@type": "Organization", + "name": "Humboldt-Universität zu Berlin, Institut für Bibliotheks- und Informationswissenschaft" + } + }, + { + "@type": "Person", + "givenName": "Heike", + "familyName": "Neuroth", + "name": "Heike Neuroth", + "identifier": { + "@type": "PropertyValue", + "propertyID": "ORCID", + "value": "0000-0002-3637-3154", + "url": "https://orcid.org/0000-0002-3637-3154" + }, + "affiliation": { + "@type": "Organization", + "name": "Fachhochschule Potsdam" + } + } + ], + "inLanguage": "de", + "keywords": [ + "Verwaltung", + "Verwaltungswissenschaft", + "Tabelle", + "Lerneinheit", + "Public Sector", + "Open Educational Resource", + "FAIR-Prinzipien", + "Datenqualität", + "Persistente Identifikatoren", + "Reproduzierbarkeit", + "R 
(Programmiersprache)" + ], + "about": [ + { + "@type": "Thing", + "name": "Verwaltung" + }, + { + "@type": "Thing", + "name": "Verwaltungswissenschaft" + }, + { + "@type": "Thing", + "name": "Tabelle" + }, + { + "@type": "Thing", + "name": "Lerneinheit" + }, + { + "@type": "Thing", + "name": "Public Sector" + }, + { + "@type": "Thing", + "name": "Open Educational Resource" + }, + { + "@type": "Thing", + "name": "FAIR-Prinzipien" + }, + { + "@type": "Thing", + "name": "Datenqualität" + }, + { + "@type": "Thing", + "name": "Persistente Identifikatoren" + }, + { + "@type": "Thing", + "name": "Reproduzierbarkeit" + }, + { + "@type": "Thing", + "name": "R (Programmiersprache)" + }, + { + "@type": "Thing", + "name": "Verwaltungswissenschaften" + }, + { + "@type": "Thing", + "name": "Informationswissenschaft" + }, + { + "@type": "Thing", + "name": "Tabelle" + } + ], + "audience": [ + { + "@type": "Audience", + "audienceType": "Forschende (PostDoc)" + }, + { + "@type": "Audience", + "audienceType": "Forschende (Projektleitung)" + }, + { + "@type": "Audience", + "audienceType": "Promovierende" + }, + { + "@type": "Audience", + "audienceType": "Hochschullehrende" + } + ], + "timeRequired": "PT3H15M", + "license": [ + { + "@type": "CreativeWork", + "name": "Source Code", + "license": "https://opensource.org/licenses/AGPL-3.0" + }, + { + "@type": "CreativeWork", + "name": "Content", + "license": "https://creativecommons.org/licenses/by-sa/4.0/" + } + ], + "hasPart": [ + { + "@type": "LearningResource", + "name": "Datenbasis", + "description": "Dieses Kapitel beschreibt die in dieser Fallstudie genutzten Daten.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/2_Datenbasis.html", + "timeRequired": "PT5M", + "teaches": "Forschungsfrage", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Forschungsfrage kann erläutert und in einen Kontext mit dem Bildungsbericht gesetzt werden.", + "educationalFramework": "QUADRIGA 
Competency Framework", + "targetDescription": "Competency: nicht anwendbar | Bloom's: 2 Verstehen | Data Flow: nicht anwendbar", + "lrmi:assesses": "nicht anwendbar" + } + ] + }, + { + "@type": "LearningResource", + "name": "Datennachnutzung", + "description": "Dieses Kapitel befasst sich mit der Datennachnutzung und den FAIR-Prinzipien.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/05_Datennachnutzung.html", + "timeRequired": "PT15M", + "teaches": "Grundsätze des Datenmanagements", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung der vier FAIR-Prinzipien kann anhand eines konkreten Beispiels erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: Orientierungswissen | Bloom's: 2 Verstehen | Data Flow: übergreifend", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die FAIRness eines gegebenen Datensatzes kann anhand einer Checkliste bewertet werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: Orientierungswissen | Bloom's: 3 Anwenden | Data Flow: übergreifend", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Qualitätsbewertung", + "description": "In diesem Kapitel stehen die Kriterien der Qualitätsbewertung im Mittelpunkt.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Qualit%C3%A4tsbewertung.html", + "timeRequired": "PT30M", + "teaches": "Sicherstellen der Qualität von Datensätzen", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung zentraler Qualitätskriterien für Datensätze kann für die Forschung erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.2 Qualitätssicherung | Bloom's: 2 Verstehen | Data Flow: 1 Planung", + 
"lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Vor- und Nachteile zwischen XLSX- und CSV-Dateiformaten können für verschiedene Anwendungsfälle aufgezeigt werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.2 Qualitätssicherung | Bloom's: 2 Verstehen | Data Flow: 1 Planung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Qualität eines gegebenen Datensatzes kann anhand einer Checkliste bewertet werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.2 Qualitätssicherung | Bloom's: 3 Anwenden | Data Flow: 1 Planung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Identifikatoren", + "description": "Dieses Kapitel behandelt Identifikatoren der eindeutigen und persistenten Auszeichnung von Daten sowie das Zitieren von Forschungsdaten.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Identifikatoren.html", + "timeRequired": "PT25M", + "teaches": "Datenzitierung und PID", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die allgemein anerkannten Methoden der Datenzitierung können beschrieben werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 1.3 Ethik und Recht | Bloom's: 2 Verstehen | Data Flow: 1 Planung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Rolle von Persistenten Identifikatoren (PIDs) in der Datenzitierung kann erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 5.2 Datenpublikation | Bloom's: 2 Verstehen | Data Flow: 5 Publikation und Nachnutzung", + "lrmi:assesses": "Multiple-Choice-Quiz, 
Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die spezifischen Einsatzbereiche verschiedener Arten von PIDs (z.B. DOI, ORCID) können erklärt werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 5.2 Datenpublikation | Bloom's: 2 Verstehen | Data Flow: 5 Publikation und Nachnutzung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Datenaufbereitung I - Organisation und Strukturierung", + "description": "Dieses Kapitel widmet sich der Datenmanipulation als ein entscheidender Bestandteil, um die Qualität und den Aufbau von Datensätzen zu evaluieren und zu verbessern.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Datenmanipulation1.html", + "timeRequired": "PT60M", + "teaches": "Datenaufbereitung und -strukturierung", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung von Datenaufbereitung kann erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 2.2 Validierung | Bloom's: 2 Verstehen | Data Flow: 2 Erhebung und Aufbereitung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Anwendung spezifischer Aufbereitungstechniken auf einen gegebenen Datensatz kann beschrieben werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 2.3 Aufbereitung | Bloom's: 2 Verstehen | Data Flow: 2 Erhebung und Aufbereitung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Die Bedeutung einer klaren Datenstruktur für effektive Datenaufbereitung kann anhand eines Beispiels demonstriert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 2.3 Aufbereitung | Bloom's: 3 Anwenden | Data 
Flow: 2 Erhebung und Aufbereitung", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + }, + { + "@type": "LearningResource", + "name": "Datenaufbereitung II - Analyse und Reproduzierbarkeit", + "description": "Dieses Kapitel widmet sich der Datenanalyse und -reproduzierbarkeit mit Hilfe der Programmiersprache R.", + "url": "https://quadriga-dk.github.io/Tabelle-Fallstudie-1/Markdown/Datenmanipulation2.html", + "timeRequired": "PT60M", + "teaches": "Datenanalyse und -reproduzierbarkeit", + "educationalAlignment": [ + { + "@type": "AlignmentObject", + "targetName": "Die Wichtigkeit einer sorgfältigen Dokumentation bei der Durchführung einer deskriptiven Analyse kann beschrieben werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 4.1 Datenanalyse | Bloom's: 2 Verstehen | Data Flow: 4 Analyse", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + }, + { + "@type": "AlignmentObject", + "targetName": "Anhand eines konkreten Beispiels (aus der Fallstudie zur Reproduzierbarkeit) können mindestens drei für die Reproduzierbarkeit besonders relevante Aspekte erläutert werden.", + "educationalFramework": "QUADRIGA Competency Framework", + "targetDescription": "Competency: 4.1 Datenanalyse | Bloom's: 2 Verstehen | Data Flow: 4 Analyse", + "lrmi:assesses": "Multiple-Choice-Quiz, Reflexionsfragen" + } + ] + } + ], + "dcterms:tableOfContents": "- Präambel\n- Datenbasis: Nationaler Bildungsbericht\n- Datennachnutzung\n- Qualitätsbewertung\n- Identifikatoren\n- Datenaufbereitung I - Organisation und Strukturierung\n- Datenaufbereitung II - Analyse und Reproduzierbarkeit\n- Zusammenfassung und Reflexion\n- Epilog", + "funding": "Die vorliegenden Open Educational Resources wurden durch das Datenkompetenzzentrum QUADRIGA erstellt.\n\nFörderkennzeichen: 16DKZ2034", + "learningResourceType": "Jupyter Book", + "lrmi:learningResourceType": "Jupyter Book", + "dcterms:type": "Jupyter Book", + "dc:type": "Jupyter 
Book" +} \ No newline at end of file diff --git a/metadata.rdf b/metadata.rdf new file mode 100644 index 0000000..3ee67da --- /dev/null +++ b/metadata.rdf @@ -0,0 +1,542 @@ + +" + metadata.get("description") + "
" + + description_base = f""" +Das interaktive Lehrbuch kann als Web-Version verwendet, zur individuellen Anpassung heruntergeladen werden und steht darüber hinaus auch auf GitHub zur Verfügung.
+Die QUADRIGA-OER sind nach einem einheitlichen Template gestaltet, werden nach einem standardisierten Verfahren qualitätsgeprüft und mit Metadaten ausgezeichnet.
+QUADRIGA ist das Datenkompetenzzentrum der Wissenschaftsregion Berlin-Brandenburg. Für die beiden Anwendungsdomänen Digital Humanities und Verwaltungswissenschaft entstehen unter der Einbindung der Expertise der beiden Disziplinen Informatik und Informationswissenschaft Selbstlernangebote, die als OER in Form von Jupyter Books zur freien Nachnutzung zur Verfügung gestellt werden. Um den Forschungsprozess möglichst realistisch abzubilden, basieren die OER auf Fallstudien, denen wiederum ein eigens für das Projekt entwickeltes Datenkompetenzframework zugrunde liegt. Die Fallstudien nehmen drei für die Anwendungsdomänen repräsentativen Datentypen in den Blick: Bewegtes Bild, Tabelle und Text.
+Zu den Zielgruppen von QUADRIGA zählen insbesondere promovierende und promovierte Wissenschaftler*innen der genannten Disziplinen, die den Umgang mit digitalen Daten, Methoden und Werkzeugen erlernen und weiterentwickeln wollen.
+QUADRIGA ist eins von 11 Datenkompetenzzentren in Deutschland und wird vom Bundesministerium für Forschung, Technologie und Raumfahrt (BMFTR) und von der Europäischen Union im Rahmen von “NextGenerationEU” finanziert. Zu den Verbundpartnern zählen: +
Mehr zum Aufbau und zur Umsetzung des Projekts können Sie im Umsetzungskonzept erfahren.
+ +Weitere Informationen sowie Publikationen finden Sie auf der Webseite, in der Zenodo-Community und der GitHub-Organisation des Projekts.
+""" + zenodo_metadata["description"] = description + description_base + logger.info("Added description") # publication date publication_date = None @@ -265,16 +290,16 @@ def create_zenodo_json(): else: # It's already a string publication_date = str(date_value) - logging.info(f"Added publication_date from metadata.yml: {publication_date}") + logger.info("Added publication_date from metadata.yml: %s", publication_date) elif "year" in pref: # Fall back to year from CITATION.cff year = str(pref["year"]) # Zenodo expects ISO 8601 date format (YYYY-MM-DD) # We use January 1st as default when only year is provided publication_date = f"{year}-01-01" - logging.info(f"Added publication_date from year (fallback): {publication_date}") + logger.info("Added publication_date from year (fallback): %s", publication_date) else: - logging.warning("No publication date or year found") + logger.warning("No publication date or year found") if publication_date: zenodo_metadata["publication_date"] = publication_date @@ -287,7 +312,7 @@ def create_zenodo_json(): keywords_list = extract_keywords(pref["keywords"]) if keywords_list: zenodo_metadata["keywords"] = keywords_list - logging.info(f"Added {len(keywords_list)} keywords") + logger.info("Added %d keywords", len(keywords_list)) # license license_id = None @@ -300,7 +325,7 @@ def create_zenodo_json(): # Clean up common variations license_clean = str(license_id).upper().replace("_", "-") zenodo_metadata["license"] = license_clean - logging.info(f"Added license: {license_clean}") + logger.info("Added license: %s", license_clean) # language if pref.get("languages"): @@ -308,14 +333,14 @@ def create_zenodo_json(): pref["languages"][0] if isinstance(pref["languages"], list) else pref["languages"] ) zenodo_metadata["language"] = lang - logging.info(f"Added language: {lang}") + logger.info("Added language: %s", lang) # contributors if metadata.get("contributors"): contributors = format_contributors_for_zenodo(metadata["contributors"]) if 
contributors: zenodo_metadata["contributors"] = contributors - logging.info(f"Added {len(contributors)} contributors") + logger.info("Added %d contributors", len(contributors)) # related_identifiers related_identifiers = [] @@ -324,38 +349,39 @@ def create_zenodo_json(): related_identifiers.append( {"identifier": repo_url, "relation": "isSupplementedBy", "scheme": "url"} ) - logging.info("Added repository URL as related identifier") + logger.info("Added repository URL as related identifier") url = pref.get("url") if url and url != repo_url: related_identifiers.append( {"identifier": url, "relation": "isAlternateIdentifier", "scheme": "url"} ) - logging.info("Added URL as related identifier") + logger.info("Added URL as related identifier") if related_identifiers: zenodo_metadata["related_identifiers"] = related_identifiers # community zenodo_metadata["communities"] = [{"identifier": "quadriga"}] - logging.info("Added QUADRIGA community") + logger.info("Added QUADRIGA community") # version if "version" in pref: zenodo_metadata["version"] = str(pref["version"]) - logging.info(f"Added version: {pref['version']}") + logger.info("Added version: %s", pref["version"]) # write .zenodo.json try: with zenodo_json_path.open("w", encoding="utf-8") as f: json.dump(zenodo_metadata, f, ensure_ascii=False, indent=2) - logging.info(f"Zenodo metadata successfully created at {zenodo_json_path}") - return True - except OSError as e: - logging.exception(f"Error writing to {zenodo_json_path}: {e}") + except OSError: + logger.exception("Error writing to %s", zenodo_json_path) return False + else: + logger.info("Zenodo metadata successfully created at %s", zenodo_json_path) + return True - except Exception as e: - logging.exception(f"Unexpected error in create_zenodo_json: {e!s}") + except Exception: + logger.exception("Unexpected error in create_zenodo_json") return False diff --git a/quadriga/metadata/extract_from_book_config.py b/quadriga/metadata/extract_from_book_config.py index 
79b87cb..ea3946d 100644 --- a/quadriga/metadata/extract_from_book_config.py +++ b/quadriga/metadata/extract_from_book_config.py @@ -1,12 +1,14 @@ """ -This script extracts the title from _config.yml and the first level of the TOC from _toc.yml. -It then uses this information to update metadata.yml. -The titles for the TOC chapters are extracted from the first heading of the corresponding files. +Extract the title from _config.yml and the first level of the TOC from _toc.yml. + +It then uses this information to update metadata.yml. The titles for the TOC chapters are extracted +from the first heading of the corresponding files. """ +from __future__ import annotations + import logging import sys -from datetime import datetime from pathlib import Path from .utils import ( @@ -19,21 +21,23 @@ # Configure logging logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def extract_and_update(): +def extract_and_update() -> bool | None: """ Extract information from _config.yml and _toc.yml files and update metadata.yml. - Returns: + Returns + ------- bool: True if successful, False otherwise. """ try: # Get the repository root directory try: repo_root = get_repo_root() - except Exception as e: - logging.error(f"Failed to get repository root: {e}") + except Exception: + logger.exception("Failed to get repository root") return False # Define file paths using the get_file_path utility function @@ -48,7 +52,7 @@ def extract_and_update(): (metadata_path, "metadata.yml"), ]: if not path.exists(): - logging.error(f"Required file {name} not found at {path}") + logger.error("Required file %s not found at %s", name, path) return False # Load the files @@ -56,27 +60,32 @@ def extract_and_update(): toc_data = load_yaml_file(toc_path) metadata_data = load_yaml_file(metadata_path) - if not all([config_data, toc_data, metadata_data]): - logging.error("One or more required files couldn't be loaded. 
Exiting.") + if not config_data or not isinstance(config_data, dict): + logger.error("Could not load _config.yml or invalid format. Exiting.") + return False + if not toc_data or not isinstance(toc_data, dict): + logger.error("Could not load _toc.yml or invalid format. Exiting.") + return False + if not metadata_data or not isinstance(metadata_data, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") return False # Extract information from _config.yml title = config_data.get("title", "") - author = config_data.get("author", "") if not title: - logging.warning("No title found in _config.yml") + logger.warning("No title found in _config.yml") # Extract chapters and their titles from _toc.yml toc_chapters = [] missing_files = [] if "chapters" not in toc_data: - logging.warning("No 'chapters' section found in _toc.yml") + logger.warning("No 'chapters' section found in _toc.yml") else: for chapter in toc_data["chapters"]: if "file" not in chapter: - logging.warning("Found chapter entry without 'file' attribute in _toc.yml") + logger.warning("Found chapter entry without 'file' attribute in _toc.yml") continue try: @@ -94,7 +103,7 @@ def extract_and_update(): # Check if file exists if not full_path.exists(): missing_files.append(str(full_path)) - logging.warning(f"Chapter file not found: {full_path}") + logger.warning("Chapter file not found: %s", full_path) # Use filename as fallback title toc_chapters.append(f"[Missing: {p.stem}]") continue @@ -104,19 +113,19 @@ def extract_and_update(): # Add to the list of chapters toc_chapters.append(chapter_title) - except Exception as e: - logging.error(f"Error processing chapter {chapter.get('file', 'unknown')}: {e}") + except Exception: + logger.exception("Error processing chapter %s", chapter.get("file", "unknown")) # Add a placeholder with the filename if possible try: toc_chapters.append(f"[Error: {p.stem}]") - except: + except Exception: toc_chapters.append("[Error: unknown chapter]") if 
missing_files: - logging.warning(f"Missing {len(missing_files)} chapter files") + logger.warning("Missing %d chapter files", len(missing_files)) if not toc_chapters: - logging.warning("No chapter titles were extracted") + logger.warning("No chapter titles were extracted") # Format the TOC as a string with proper indentation and single newline between items toc_formatted = "- " + "\n- ".join(toc_chapters) @@ -132,7 +141,7 @@ def extract_and_update(): if "table-of-contents" in metadata_data: metadata_data["table-of-contents"] = toc_formatted else: - logging.warning("No 'table-of-contents' field found in metadata.yml") + logger.warning("No 'table-of-contents' field found in metadata.yml") # Save the updated metadata if save_yaml_file( @@ -140,20 +149,19 @@ def extract_and_update(): metadata_data, schema_comment="# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/v1.0.0/schema.json", ): - logging.info("Metadata updated successfully!") + logger.info("Metadata updated successfully!") return True - else: - logging.error("Failed to save metadata.yml") - return False - except Exception as e: - logging.exception(f"Error updating metadata.yml: {e}") + logger.error("Failed to save metadata.yml") + return False + except Exception: + logger.exception("Error updating metadata.yml") return False else: - logging.error("Metadata file couldn't be loaded or is empty.") + logger.error("Metadata file couldn't be loaded or is empty.") return False - except Exception as e: - logging.exception(f"Unexpected error in extract_and_update: {e}") + except Exception: + logger.exception("Unexpected error in extract_and_update") return False diff --git a/quadriga/metadata/inject_all_metadata.py b/quadriga/metadata/inject_all_metadata.py new file mode 100644 index 0000000..bfbddcc --- /dev/null +++ b/quadriga/metadata/inject_all_metadata.py @@ -0,0 +1,889 @@ +""" +Injects all metadata (JSON-LD, OpenGraph, and RDF links) into generated HTML files. 
+ +This unified script combines JSON-LD structured data injection and OpenGraph +social media metadata into a single efficient pass through HTML files. + +It reads metadata.jsonld and injects: +- OpenGraph tags for social media previews +- JSON-LD ') + + # 3. RDF discovery links + if add_link_elements: + injection_parts.append( + ' ' + ) + injection_parts.append( + ' ' + ) + + # Join all parts with newlines + full_injection = "\n".join(injection_parts) + + # Find optimal injection point + # Priority: after viewport, after charset, before (fallback) + injection_point = None + + # Try to inject after viewport meta tag (best practice for OpenGraph) + viewport_match = re.search(r"(]*>\s*)", html_content, re.IGNORECASE) + if viewport_match: + injection_point = viewport_match.end() + else: + # Fallback: try after charset meta tag + charset_match = re.search(r"(]*>\s*)", html_content, re.IGNORECASE) + if charset_match: + injection_point = charset_match.end() + + # Final fallback: inject before + if injection_point is None: + injection_point = html_content.find("") + if injection_point == -1: + logger.warning("No tag found in %s, skipping", html_path.name) + return False + # For injection, add before the tag + html_content = html_content[:injection_point] + f"\n{full_injection}\n" + html_content[injection_point:] + else: + # For after viewport/charset injection, insert at found position + html_content = html_content[:injection_point] + f"\n{full_injection}\n\n" + html_content[injection_point:] + + # Write the modified HTML back + with html_path.open("w", encoding="utf-8") as f: + f.write(html_content) + + logger.info("Injected all metadata into %s", html_path.name) + return True + + except FileNotFoundError: + logger.exception("HTML file not found: %s", html_path) + return False + except Exception: + logger.exception("Error injecting metadata into %s", html_path) + return False + + +# ============================================================================ +# Main 
Injection Orchestration +# ============================================================================ + + +def inject_all_metadata( + build_dir: Path | None = None, + jsonld_path: Path | None = None, + config_path: Path | None = None, + toc_path: Path | None = None, +) -> bool: + """ + Inject all metadata (OpenGraph, JSON-LD, and RDF links) into Jupyter Book HTML files. + + This unified function combines OpenGraph social media metadata and JSON-LD structured + data injection into a single efficient operation. + + For the root page, it injects: + - OpenGraph: og:type="book" with book:author, book:release_date, book:tag + - JSON-LD: Full book metadata + + For chapter pages, it injects: + - OpenGraph: og:type="article" with article:author, article:published_time, article:modified_time + - JSON-LD: Chapter-specific LearningResource with isPartOf reference to book + + Args: + build_dir (Path, optional): Path to _build/html directory (default: ./_build/html) + jsonld_path (Path, optional): Path to metadata.jsonld (default: ./metadata.jsonld) + config_path (Path, optional): Path to _config.yml (default: ./_config.yml) + toc_path (Path, optional): Path to _toc.yml (default: ./_toc.yml) + + Returns + ------- + bool: True if successful, False otherwise + """ + try: + # Determine paths + if build_dir is None: + build_dir = Path.cwd() / "_build" / "html" + if jsonld_path is None: + jsonld_path = Path.cwd() / "metadata.jsonld" + if config_path is None: + config_path = Path.cwd() / "_config.yml" + if toc_path is None: + toc_path = Path.cwd() / "_toc.yml" + + # Check if build directory exists + if not build_dir.exists(): + logger.error("Build directory not found: %s", build_dir) + return False + + # Check if JSON-LD file exists + if not jsonld_path.exists(): + logger.error("JSON-LD file not found: %s", jsonld_path) + return False + + # Read and validate JSON-LD + try: + with jsonld_path.open(encoding="utf-8") as f: + jsonld_data = json.load(f) + logger.info("Loaded JSON-LD from 
%s", jsonld_path) + except json.JSONDecodeError: + logger.exception("Invalid JSON in %s", jsonld_path) + return False + + # Extract base URL from metadata + base_url = jsonld_data.get("url", "") + if not base_url: + logger.error("No URL found in metadata.jsonld") + return False + + # Extract book title for og:site_name + book_title = jsonld_data.get("name", "") + if not book_title: + logger.warning("No book title found in metadata.jsonld") + + # Get logo filename from config + logo_filename = get_logo_from_config(config_path) + logger.info("Using logo: %s", logo_filename) + + # Determine the actual root page from _toc.yml + root_html = get_root_page_from_toc(toc_path, build_dir) + + # Fall back to index.html if we couldn't determine the root + if root_html is None or not root_html.exists(): + root_html = build_dir / "index.html" + logger.info("Using index.html as root page") + + # ==================================================================== + # Process root page + # ==================================================================== + if root_html.exists(): + # Generate OpenGraph tags for root page (og:type="book") + og_tags = create_opengraph_meta_tags( + jsonld_data, + base_url, + book_title, + logo_filename, + is_chapter=False, + ) + + # Format JSON-LD content with proper indentation + jsonld_content = json.dumps(jsonld_data, ensure_ascii=False, indent=2) + jsonld_content = "\n".join(" " + line for line in jsonld_content.split("\n")) + + # Inject both OpenGraph and JSON-LD into root page + if not inject_all_metadata_into_html(root_html, og_tags, jsonld_content): + logger.error("Failed to inject metadata into %s", root_html.name) + return False + else: + logger.warning("Root HTML file not found at %s", root_html) + return False + + # ==================================================================== + # Process index.html redirect page (if different from root) + # ==================================================================== + # Jupyter Book may 
create index.html as a meta-refresh redirect + # Social media crawlers don't follow these redirects, so we need + # to inject OpenGraph metadata into index.html as well + index_html = build_dir / "index.html" + if index_html.exists() and index_html != root_html: + try: + with index_html.open(encoding="utf-8") as f: + index_content = f.read() + + # Check if this is a simple meta-refresh redirect page (with or without proper HTML structure) + if "meta http-equiv" in index_content.lower(): + logger.info("Found index.html redirect page, injecting OpenGraph metadata") + + # Generate OpenGraph tags (but skip JSON-LD for redirect page) + og_tags = create_opengraph_meta_tags( + jsonld_data, + base_url, + book_title, + logo_filename, + is_chapter=False, + ) + + # Check if the redirect page has proper HTML structure + if "]*>', index_content, re.IGNORECASE) + meta_refresh = meta_refresh_match.group(0) if meta_refresh_match else "" + + # Create proper HTML with OpenGraph metadata and meta refresh + new_index_content = f""" + + + + {meta_refresh} +{og_tags} +Redirecting to {root_html.name}...
+ + +""" + # Write the new index.html + with index_html.open("w", encoding="utf-8") as f: + f.write(new_index_content) + logger.info("Successfully created index.html with OpenGraph metadata and redirect") + else: + # Has proper HTML structure, inject normally + if not inject_all_metadata_into_html(index_html, og_tags, "", add_link_elements=False): + logger.warning("Failed to inject OpenGraph into index.html redirect page") + else: + logger.info("Successfully injected OpenGraph metadata into index.html redirect page") + except Exception: + logger.exception("Error processing index.html redirect page") + + # ==================================================================== + # Process chapter pages + # ==================================================================== + chapters_injected = 0 + if jsonld_data.get("hasPart"): + logger.info("Processing %d chapters...", len(jsonld_data["hasPart"])) + + for chapter in jsonld_data["hasPart"]: + if not isinstance(chapter, dict): + continue + + # Get chapter URL + chapter_url = chapter.get("url") + if not chapter_url: + logger.warning("Chapter missing URL: %s", chapter.get("name", "Unknown")) + continue + + # Find the HTML file for this chapter + chapter_html_path = get_html_path_from_url(chapter_url, build_dir) + if not chapter_html_path: + logger.warning("Could not find HTML file for chapter: %s", chapter.get("name", "Unknown")) + continue + + # Create chapter metadata for OpenGraph (combining chapter + book data) + chapter_og_metadata = { + "name": chapter.get("name", ""), + "url": chapter_url, + "description": chapter.get("description", ""), + "author": jsonld_data.get("author", []), # Inherit from book + "datePublished": jsonld_data.get("datePublished"), # Inherit from book + "dateModified": jsonld_data.get("dateModified"), # Inherit from book + "inLanguage": jsonld_data.get("inLanguage"), # Inherit from book + } + + # Generate OpenGraph tags for chapter (og:type="article") + chapter_og_tags = 
create_opengraph_meta_tags( + chapter_og_metadata, + base_url, + book_title, + logo_filename, + is_chapter=True, + ) + + # Create chapter-specific JSON-LD with book-level metadata + chapter_jsonld = create_chapter_jsonld(chapter, jsonld_data) + + # Convert to formatted string + chapter_jsonld_str = json.dumps(chapter_jsonld, ensure_ascii=False, indent=2) + chapter_jsonld_str = "\n".join(" " + line for line in chapter_jsonld_str.split("\n")) + + # Inject both OpenGraph and JSON-LD into chapter HTML + if inject_all_metadata_into_html(chapter_html_path, chapter_og_tags, chapter_jsonld_str): + chapters_injected += 1 + else: + logger.warning("Failed to inject metadata into chapter: %s", chapter.get("name", "Unknown")) + + logger.info("Injected metadata into %d chapter pages", chapters_injected) + + logger.info("All metadata injection completed successfully") + return True + + except Exception: + logger.exception("Unexpected error in inject_all_metadata") + return False + + +# ============================================================================ +# CLI Entry Point +# ============================================================================ + + +def main() -> None: + """ + Run the unified metadata injection script. 
+ + Usage: + python -m quadriga.metadata.inject_all_metadata + """ + import argparse + + parser = argparse.ArgumentParser( + description="Inject all metadata (OpenGraph, JSON-LD, RDF links) into Jupyter Book HTML" + ) + parser.add_argument( + "--build-dir", + type=Path, + help="Path to _build/html directory (default: ./_build/html)", + ) + parser.add_argument( + "--jsonld-path", + type=Path, + help="Path to metadata.jsonld file (default: ./metadata.jsonld)", + ) + parser.add_argument( + "--config-path", + type=Path, + help="Path to _config.yml file (default: ./_config.yml)", + ) + parser.add_argument( + "--toc-path", + type=Path, + help="Path to _toc.yml file (default: ./_toc.yml)", + ) + + args = parser.parse_args() + + success = inject_all_metadata( + build_dir=args.build_dir, + jsonld_path=args.jsonld_path, + config_path=args.config_path, + toc_path=args.toc_path, + ) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/quadriga/metadata/run_all.py b/quadriga/metadata/run_all.py index b6c0b8e..6407619 100644 --- a/quadriga/metadata/run_all.py +++ b/quadriga/metadata/run_all.py @@ -1,17 +1,30 @@ -""" -This script runs the various metadata update scripts in the correct order. 
-""" +"""Script to coordinate the different metadata transformation scripts for QUADRIGA Jupyter Books.""" + +from __future__ import annotations import logging +import os import sys -from .create_bibtex import create_bibtex_from_cff -from .extract_from_book_config import extract_and_update -from .update_citation_cff import update_citation -from .create_zenodo_json import create_zenodo_json +# Add current working directory to sys.path if not present +# This allows the script to run with python -m without package installation +cwd = os.getcwd() +if cwd not in sys.path: + sys.path.insert(0, cwd) + +from quadriga.metadata.create_bibtex import create_bibtex_from_cff +from quadriga.metadata.create_jsonld import create_jsonld +from quadriga.metadata.create_rdfxml import create_rdfxml +from quadriga.metadata.create_zenodo_json import create_zenodo_json +from quadriga.metadata.extract_from_book_config import extract_and_update +from quadriga.metadata.update_citation_cff import update_citation +from quadriga.metadata.validate_schema import validate_schema +logger = logging.getLogger(__name__) -def main(): + +def main() -> bool | None: + """Run the different metadata transformation scripts in order.""" try: # Configure logging with timestamp logging.basicConfig( @@ -20,56 +33,86 @@ def main(): datefmt="%Y-%m-%d %H:%M:%S", ) - logging.info("Running all metadata update scripts...") + logger.info("Running all metadata update scripts...") + + # Validate metadata.yml against QUADRIGA schema first + try: + logger.info("Validating metadata.yml against QUADRIGA schema...") + if not validate_schema(): + logger.error("Schema validation failed.") + return False + except Exception: + logger.exception("Unexpected error during schema validation") + return False # Execute extract_and_update with error handling try: - logging.info("Extracting metadata from _config.yml and _toc.yml...") + logger.info("Extracting metadata from _config.yml and _toc.yml...") if not extract_and_update(): - 
logging.error("Extract and update process failed.") + logger.error("Extract and update process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during extract_and_update: {str(e)}") + except Exception: + logger.exception("Unexpected error during extract_and_update") return False # Execute update_citation with error handling try: - logging.info("Updating CITATION.cff...") + logger.info("Updating CITATION.cff...") if not update_citation(): - logging.error("Update citation process failed.") + logger.error("Update citation process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during update_citation: {str(e)}") + except Exception: + logger.exception("Unexpected error during update_citation") return False # Execute create_bibtex_from_cff with error handling try: - logging.info("Creating CITATION.bib from CITATION.cff...") + logger.info("Creating CITATION.bib from CITATION.cff...") if not create_bibtex_from_cff(): - logging.error("Create BibTeX process failed.") + logger.error("Create BibTeX process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during create_bibtex_from_cff: {str(e)}") + except Exception: + logger.exception("Unexpected error during create_bibtex_from_cff") return False # Execute create_zenodo_json with error handling try: - logging.info("Creating .zenodo.json from CITATION.cff and metadata.yml...") + logger.info("Creating .zenodo.json from CITATION.cff and metadata.yml...") if not create_zenodo_json(): - logging.error("Create Zenodo JSON process failed.") + logger.error("Create Zenodo JSON process failed.") + return False + except Exception: + logger.exception("Unexpected error during create_zenodo_json") + return False + + # Execute create_jsonld with error handling + try: + logger.info("Creating metadata.jsonld from metadata.yml...") + if not create_jsonld(): + logger.error("Create JSON-LD process failed.") + return False + except 
Exception: + logger.exception("Unexpected error during create_jsonld") + return False + + # Execute create_rdfxml with error handling + try: + logger.info("Creating metadata.rdf from metadata.yml...") + if not create_rdfxml(): + logger.error("Create RDF/XML process failed.") return False - except Exception as e: - logging.exception(f"Unexpected error during create_zenodo_json: {str(e)}") + except Exception: + logger.exception("Unexpected error during create_rdfxml") return False - logging.info("All scripts executed successfully.") + logger.info("All scripts executed successfully.") return True except KeyboardInterrupt: - logging.warning("Process interrupted by user.") + logger.warning("Process interrupted by user.") return False - except Exception as e: - logging.exception(f"Unexpected error in main: {str(e)}") + except Exception: + logger.exception("Unexpected error in main") return False diff --git a/quadriga/metadata/update_citation_cff.py b/quadriga/metadata/update_citation_cff.py index e149b66..61c7e81 100644 --- a/quadriga/metadata/update_citation_cff.py +++ b/quadriga/metadata/update_citation_cff.py @@ -1,10 +1,13 @@ """ -Updates the CITATION.cff file with metadata from metadata.yml. +Update or create the CITATION.cff file with metadata from metadata.yml. This script reads metadata from 'metadata.yml' and updates the corresponding fields in 'CITATION.cff'. It handles fields like title, authors, URL, repository URL, and publication date. It also ensures that the 'preferred-citation' section, if present, is updated consistently. + +If CITATION.cff does not exist, a new one is created from metadata.yml with +the required CFF boilerplate fields. 
""" import logging @@ -14,20 +17,105 @@ from .utils import extract_keywords, get_file_path, load_yaml_file, save_yaml_file logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def update_citation(): +def _create_initial_citation_cff(metadata: dict) -> dict: """ - Updates the CITATION.cff file using data from the metadata.yml file. + Create an initial CITATION.cff data structure from metadata.yml. + + Builds a minimal valid CFF 1.2.0 file with the required fields + (cff-version, message, title, authors) plus optional fields + that can be derived from metadata.yml. - The function performs the following steps: - 1. Constructs absolute paths to 'metadata.yml' and 'CITATION.cff'. - 2. Loads data from both YAML files. - 3. Updates 'CITATION.cff' fields (title, authors, URL, repository-code, - and publication year in preferred-citation) based on 'metadata.yml'. - 4. For authors, it attempts to preserve existing author details in - 'CITATION.cff' if a matching author (by given and family names) is found. - 5. Saves the updated data back to 'CITATION.cff', including a schema comment. 
+ Args: + metadata: Parsed metadata.yml data + + Returns + ------- + dict: A valid CFF data structure + """ + citation_data: dict = { + "cff-version": "1.2.0", + "message": "If you use this work, please cite it using the metadata from this file.", + "type": "dataset", + } + + # Title (required by CFF) + citation_data["title"] = metadata.get("title", "Untitled") + + # Authors (required by CFF) + if metadata.get("authors"): + citation_authors = [] + for author in metadata["authors"]: + cff_author: dict = {} + if "given-names" in author: + cff_author["given-names"] = author["given-names"] + if "family-names" in author: + cff_author["family-names"] = author["family-names"] + if "orcid" in author: + cff_author["orcid"] = author["orcid"] + if "affiliation" in author: + cff_author["affiliation"] = author["affiliation"] + if cff_author: + citation_authors.append(cff_author) + citation_data["authors"] = citation_authors if citation_authors else [{"name": "Unknown"}] + else: + citation_data["authors"] = [{"name": "Unknown"}] + + # Optional fields from metadata + if "version" in metadata: + citation_data["version"] = metadata["version"] + + if "url" in metadata: + citation_data["url"] = metadata["url"] + + if "git" in metadata: + citation_data["repository-code"] = metadata["git"] + + if "identifier" in metadata: + doi_url = metadata["identifier"] + if "doi.org" in str(doi_url): + # Extract DOI value from URL + doi_value = str(doi_url).split("doi.org/")[-1] if "doi.org/" in str(doi_url) else None + if doi_value: + citation_data["identifiers"] = [ + {"type": "doi", "value": doi_value, "description": "Zenodo"} + ] + + if "description" in metadata: + citation_data["abstract"] = metadata["description"] + + if metadata.get("keywords"): + flattened = extract_keywords(metadata["keywords"]) + if flattened: + citation_data["keywords"] = flattened + + if "license" in metadata: + license_data = metadata["license"] + if isinstance(license_data, str): + citation_data["license"] = 
license_data + elif isinstance(license_data, dict) and "content" in license_data: + # Use content license as primary (most relevant for citation) + content_license = license_data["content"] + if isinstance(content_license, str): + citation_data["license"] = content_license + elif isinstance(content_license, list) and content_license: + citation_data["license"] = content_license[0] + + logger.info("Created initial CITATION.cff from metadata.yml") + return citation_data + + +def update_citation() -> bool: + """ + Update or create the CITATION.cff file using data from the metadata.yml file. + + If CITATION.cff exists, the function updates its fields based on metadata.yml, + preserving existing author details and preferred-citation sections. + + If CITATION.cff does not exist, a new one is created from metadata.yml with + the required CFF boilerplate. Returns ------- @@ -39,32 +127,31 @@ def update_citation(): repo_root = get_file_path("") # Get repo root by providing empty relative path metadata_path = get_file_path("metadata.yml", repo_root) citation_cff_path = get_file_path("CITATION.cff", repo_root) - except Exception as e: - logging.exception(f"Failed to resolve file paths: {e!s}") + except Exception: + logger.exception("Failed to resolve file paths") return False - # Check if files exist - for path, name in [ - (metadata_path, "metadata.yml"), - (citation_cff_path, "CITATION.cff"), - ]: - if not Path(path).exists(): - logging.error(f"Required file {name} not found at {path}") - return False + # metadata.yml must exist + if not Path(metadata_path).exists(): + logger.error("Required file metadata.yml not found at %s", metadata_path) + return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - # Load existing CITATION.cff - citation_data = load_yaml_file(citation_cff_path) - - if not metadata: - logging.error("Could not load metadata.yml. 
Exiting.") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format. Exiting.") return False - if not citation_data: - logging.error("Could not load CITATION.cff. Exiting.") - return False + # Load existing CITATION.cff or create initial structure + if Path(citation_cff_path).exists(): + citation_data = load_yaml_file(citation_cff_path) + if not citation_data or not isinstance(citation_data, dict): + logger.error("Could not load CITATION.cff or invalid format. Exiting.") + return False + else: + logger.info("CITATION.cff not found — creating from metadata.yml") + citation_data = _create_initial_citation_cff(metadata) # Track if updates were made updates_made = False @@ -76,18 +163,18 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["title"] = metadata["title"] updates_made = True - logging.info(f"Updated title to: {metadata['title']}") + logger.info("Updated title to: %s", metadata["title"]) else: - logging.warning("No title found in metadata.yml") + logger.warning("No title found in metadata.yml") if "version" in metadata: citation_data["version"] = metadata["version"] if "preferred-citation" in citation_data: citation_data["preferred-citation"]["version"] = metadata["version"] updates_made = True - logging.info(f"Updated version to: {metadata['version']}") + logger.info("Updated version to: %s", metadata["version"]) else: - logging.warning("No version found in metadata.yml, skipping version update") + logger.warning("No version found in metadata.yml, skipping version update") if metadata.get("authors"): try: @@ -102,13 +189,12 @@ def update_citation(): and "family-names" in cit_author and "given-names" in author and "family-names" in author + ) and ( + cit_author["given-names"] == author["given-names"] + and cit_author["family-names"] == author["family-names"] ): - if ( - cit_author["given-names"] == author["given-names"] - and 
cit_author["family-names"] == author["family-names"] - ): - new_author_entry = cit_author - break + new_author_entry = cit_author + break # Update author entry with metadata if "given-names" in author: @@ -129,13 +215,13 @@ def update_citation(): citation_data["preferred-citation"]["authors"] = citation_authors updates_made = True - logging.info(f"Updated {len(citation_authors)} authors") + logger.info("Updated %d authors", len(citation_authors)) else: - logging.warning("Failed to process authors from metadata.yml") - except Exception as e: - logging.exception(f"Error processing authors: {e!s}") + logger.warning("Failed to process authors from metadata.yml") + except Exception: + logger.exception("Error processing authors") else: - logging.warning("No authors found in metadata.yml") + logger.warning("No authors found in metadata.yml") # Update URL if present in metadata if "url" in metadata: @@ -143,7 +229,7 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["url"] = metadata["url"] updates_made = True - logging.info(f"Updated URL to: {metadata['url']}") + logger.info("Updated URL to: %s", metadata["url"]) # Update repository URL if present in metadata if "git" in metadata: @@ -151,27 +237,33 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["repository-code"] = metadata["git"] updates_made = True - logging.info(f"Updated repository-code to: {metadata['git']}") + logger.info("Updated repository-code to: %s", metadata["git"]) - # Update publication year based on date-modified or date-published + # Update publication year based on date-modified or date-issued # Prefer newer date-modified, if available year_source = None year_value = None + year_digits = 4 if "date-modified" in metadata: date_str = metadata["date-modified"] - if isinstance(date_str, str) and len(date_str) >= 4: + if isinstance(date_str, str) and len(date_str) >= year_digits: year_value = 
date_str[:4] year_source = "date-modified" - elif "date-published" in metadata: - date_str = metadata["date-published"] - if isinstance(date_str, str) and len(date_str) >= 4: + elif "date-issued" in metadata: + date_str = metadata["date-issued"] + if isinstance(date_str, str) and len(date_str) >= year_digits: year_value = date_str[:4] # Extract year from YYYY-MM-DD - year_source = "date-published" + year_source = "date-issued" if year_value and "preferred-citation" in citation_data: citation_data["preferred-citation"]["year"] = year_value updates_made = True - logging.info(f"Updated publication year to: {year_value} (from {year_source})") + logger.info("Updated publication year to: %s (from %s)", year_value, year_source) + + if "description" in metadata: + citation_data["abstract"] = metadata["description"] + updates_made = True + logger.info("Updated abstract from description") # Update keywords if present in metadata # Extract keywords to flatten any language-keyed formats @@ -182,27 +274,26 @@ def update_citation(): if "preferred-citation" in citation_data: citation_data["preferred-citation"]["keywords"] = flattened_keywords updates_made = True - logging.info(f"Updated keywords with {len(flattened_keywords)} items") + logger.info("Updated keywords with %d items", len(flattened_keywords)) else: - logging.warning("Keywords found in metadata.yml but could not be extracted") + logger.warning("Keywords found in metadata.yml but could not be extracted") else: - logging.warning("No keywords found in metadata.yml") + logger.warning("No keywords found in metadata.yml") # No changes if not updates_made: - logging.warning("No updates were made to CITATION.cff") + logger.warning("No updates were made to CITATION.cff") return True # Not an error, just no changes needed # Save updated CITATION.cff - success = save_yaml_file( + return save_yaml_file( citation_cff_path, citation_data, schema_comment="# yaml-language-server: 
$schema=https://citation-file-format.github.io/1.2.0/schema.json", ) - return success - except Exception as e: - logging.exception(f"Unexpected error in update_citation: {e!s}") + except Exception: + logger.exception("Unexpected error in update_citation") return False diff --git a/quadriga/metadata/update_version_from_tag.py b/quadriga/metadata/update_version_from_tag.py index c150054..57e1fff 100644 --- a/quadriga/metadata/update_version_from_tag.py +++ b/quadriga/metadata/update_version_from_tag.py @@ -1,47 +1,47 @@ -""" -Updates book-version and date-modified in metadata.yml based on git tag. -""" +"""Update version and date-modified in metadata.yml based on git tag.""" import logging import os import sys -from datetime import datetime +from datetime import UTC, datetime from .utils import get_file_path, load_yaml_file, save_yaml_file logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) -def update_version_from_tag(): +def update_version_from_tag() -> bool: """ - Updates book-version and date-modified in metadata.yml from git tag. + Update book and date-modified in metadata.yml from git tag. Expects the version to be passed via environment variable TAG_VERSION. - Returns: + Returns + ------- bool: True if successful, False otherwise. 
""" try: # Get version from environment variable (set by GitHub Actions) version = os.environ.get("TAG_VERSION") if not version: - logging.info("No TAG_VERSION environment variable found - skipping version update") + logger.info("No TAG_VERSION environment variable found - skipping version update") return True - logging.info(f"Updating metadata for version: {version}") + logger.info("Updating metadata for version: %s", version) # Get file path try: repo_root = get_file_path("") metadata_path = get_file_path("metadata.yml", repo_root) - except Exception as e: - logging.error(f"Failed to resolve file paths: {str(e)}") + except Exception: + logger.exception("Failed to resolve file paths") return False # Load metadata.yml metadata = load_yaml_file(metadata_path) - if not metadata: - logging.error("Could not load metadata.yml") + if not metadata or not isinstance(metadata, dict): + logger.error("Could not load metadata.yml or invalid format") return False # Track if updates were made @@ -52,19 +52,19 @@ def update_version_from_tag(): if current_version != version: metadata["version"] = version updates_made = True - logging.info(f"Updated version from '{current_version}' to '{version}'") + logger.info("Updated version from '%s' to '%s'", current_version, version) else: - logging.info(f"version already matches tag version: {version}") + logger.info("version already matches tag version: %s", version) # Update date-modified - current_date = datetime.now().strftime("%Y-%m-%d") + current_date = datetime.now(UTC).strftime("%Y-%m-%d") old_date = metadata.get("date-modified") if old_date != current_date: metadata["date-modified"] = current_date updates_made = True - logging.info(f"Updated date-modified from '{old_date}' to '{current_date}'") + logger.info("Updated date-modified from '%s' to '%s'", old_date, current_date) else: - logging.info(f"date-modified already current: {current_date}") + logger.info("date-modified already current: %s", current_date) # Save if updates were 
made if updates_made: @@ -74,14 +74,13 @@ def update_version_from_tag(): schema_comment="# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/v1.0.0/schema.json", ) if success: - logging.info("Successfully updated metadata.yml") + logger.info("Successfully updated metadata.yml") return success - else: - logging.info("No updates needed") - return True + logger.info("No updates needed") + return True - except Exception as e: - logging.exception(f"Unexpected error in update_version_from_tag: {str(e)}") + except Exception: + logger.exception("Unexpected error in update_version_from_tag") return False diff --git a/quadriga/metadata/utils.py b/quadriga/metadata/utils.py index c11a376..eb1ebe1 100644 --- a/quadriga/metadata/utils.py +++ b/quadriga/metadata/utils.py @@ -1,19 +1,20 @@ """ Common utility functions for metadata management in the Quadriga Book Template. + This module provides reused functionality across different metadata scripts. """ +from __future__ import annotations + import json import logging -import os import re -import sys -from datetime import datetime from pathlib import Path import yaml logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) # ---- File Path Handling ---- @@ -22,10 +23,12 @@ def get_repo_root() -> Path: """ Get the path to the repository root, assuming this module is in quadriga/metadata/. 
- Returns: + Returns + ------- Path: Absolute path to the repository root - Raises: + Raises + ------ FileNotFoundError: If the repository structure is not as expected """ try: @@ -38,14 +41,16 @@ def get_repo_root() -> Path: found_files = [f for f in required_files if (repo_root / f).exists()] if len(found_files) < 1: - raise FileNotFoundError( - f"Repository root at {repo_root} doesn't contain expected files (_config.yml or _toc.yml)" + msg = ( + f"Repository root at {repo_root} doesn't contain expected files " + "(_config.yml or _toc.yml)" ) - - return repo_root - except Exception as e: - logging.exception(f"Error resolving repository root: {e}") + raise FileNotFoundError(msg) + except Exception: + logger.exception("Error resolving repository root") raise + else: + return repo_root def get_file_path(relative_path: str | Path, repo_root: Path | None = None) -> Path: @@ -56,47 +61,53 @@ def get_file_path(relative_path: str | Path, repo_root: Path | None = None) -> P relative_path (str | Path): Relative path from the repository root repo_root (Path, optional): Repository root path. If None, it will be determined - Returns: + Returns + ------- Path: Absolute path to the file """ - try: - if repo_root is None: - repo_root = get_repo_root() - return repo_root / Path(relative_path) - except Exception as e: - logging.error(f"Error resolving file path for '{relative_path}': {e}") - # Return the relative path as a fallback - return Path(relative_path) + if repo_root is None: + repo_root = get_repo_root() + return repo_root / Path(relative_path) # ---- YAML Handling ---- -def load_yaml_file(file_path: str | Path): +def load_yaml_file(file_path: str | Path) -> dict | list | None: """ Load a YAML file and return its contents as a Python object. 
Args: file_path (str | Path): Path to the YAML file - Returns: + Returns + ------- dict/list: Contents of the YAML file, or None if an error occurs """ + # Convert to Path at the edge + path = Path(file_path) + try: - with open(file_path, "r", encoding="utf-8") as file: - return yaml.safe_load(file) + with path.open(encoding="utf-8") as file: + data = yaml.safe_load(file) + # yaml.safe_load returns Any; ensure it's dict or list + if isinstance(data, (dict, list)): + return data + return None except FileNotFoundError: - logging.error(f"File not found: {Path(file_path).name}") + logger.exception("File not found: %s", path.name) return None - except yaml.YAMLError as e: - logging.error(f"YAML parsing error in {Path(file_path).name}: {e}") + except yaml.YAMLError: + logger.exception("YAML parsing error in %s", path.name) return None - except Exception as e: - logging.error(f"Error loading {Path(file_path).name}: {e}") + except Exception: + logger.exception("Error loading %s", path.name) return None -def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = None): +def save_yaml_file( + file_path: str | Path, data: dict | list, schema_comment: str | None = None +) -> bool: """ Save Python object as YAML to the specified file. @@ -104,19 +115,23 @@ def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = Non file_path (str | Path): Path where the YAML file should be saved data (dict/list): Data to save schema_comment (str, optional): Schema comment to add at the start of the file - e.g. "# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/schema.json" + e.g. 
"# yaml-language-server: $schema=https://quadriga-dk.github.io/quadriga-schema/latest/schema.json" - Returns: + Returns + ------- bool: True if successful, False otherwise """ + # Convert to Path at the edge + path = Path(file_path) + try: # Create directories if they don't exist - directory = Path(file_path).parent + directory = path.parent if not directory.exists(): directory.mkdir(parents=True) - logging.info(f"Created directory: {directory}") + logger.info("Created directory: %s", directory) - with open(file_path, "w", encoding="utf-8") as file: + with path.open("w", encoding="utf-8") as file: yaml.dump( data, file, @@ -127,25 +142,25 @@ def save_yaml_file(file_path: str | Path, data, schema_comment: str | None = Non if schema_comment: try: - with open(file_path, "r+", encoding="utf-8") as file: + with path.open("r+", encoding="utf-8") as file: content = file.read() file.seek(0, 0) file.write(f"{schema_comment}\n" + content) - except Exception as e: - logging.warning(f"Failed to add schema comment to {Path(file_path).name}: {e}") + except Exception: + logger.exception("Failed to add schema comment to %s", path.name) # Not a critical error, proceed - - logging.info(f"Successfully updated {Path(file_path).name}") - return True - except yaml.YAMLError as e: - logging.error(f"YAML encoding error for {Path(file_path).name}: {e}") + except yaml.YAMLError: + logger.exception("YAML encoding error for %s", path.name) return False - except PermissionError as e: - logging.error(f"Permission denied when saving {Path(file_path).name}: {e}") + except PermissionError: + logger.exception("Permission denied when saving %s", path.name) return False - except Exception as e: - logging.error(f"Error saving to {Path(file_path).name}: {e}") + except Exception: + logger.exception("Error saving to %s", path.name) return False + else: + logger.info("Successfully updated %s", path.name) + return True # ---- Markdown and Jupyter Content Handling ---- @@ -161,7 +176,8 @@ def 
remove_yaml_frontmatter(text: str) -> str: Args: text (str): Markdown content that may contain frontmatter - Returns: + Returns + ------- str: Content with frontmatter removed """ pattern = r"^\s*---\s*\n(.*?)\n\s*---\s*(\n|$)" @@ -175,14 +191,16 @@ def extract_first_heading(file_path: str | Path) -> str: Args: file_path (str | Path): Path to the file - Returns: + Returns + ------- str: The content of the first heading or filename if no heading found """ + # Convert to Path at the edge file_path_obj = Path(file_path) try: if file_path_obj.suffix == ".ipynb": try: - with open(file_path_obj, "r", encoding="utf-8") as file: + with file_path_obj.open(encoding="utf-8") as file: notebook = json.load(file) for cell in notebook.get("cells", []): @@ -191,30 +209,30 @@ def extract_first_heading(file_path: str | Path) -> str: heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) if heading_match: return heading_match.group(1).strip() - except json.JSONDecodeError as e: - logging.error(f"Invalid JSON in notebook {file_path_obj.name}: {e}") - except Exception as e: - logging.error(f"Error reading notebook {file_path_obj.name}: {e}") + except json.JSONDecodeError: + logger.exception("Invalid JSON in notebook %s", file_path_obj.name) + except Exception: + logger.exception("Error reading notebook %s", file_path_obj.name) elif file_path_obj.suffix == ".md": try: - with open(file_path_obj, "r", encoding="utf-8") as file: + with file_path_obj.open(encoding="utf-8") as file: content = file.read() content = remove_yaml_frontmatter(content) heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) if heading_match: return heading_match.group(1).strip() - except Exception as e: - logging.error(f"Error reading markdown {file_path_obj.name}: {e}") + except Exception: + logger.exception("Error reading markdown %s", file_path_obj.name) else: - logging.warning(f"Unsupported file type for heading extraction: {file_path_obj.name}") + logger.warning("Unsupported file type for 
heading extraction: %s", file_path_obj.name) return file_path_obj.stem except FileNotFoundError: - logging.error(f"File not found: {file_path_obj.name}") - except Exception as e: - logging.error(f"Error processing {file_path_obj.name}: {e}") + logger.exception("File not found: %s", file_path_obj.name) + except Exception: + logger.exception("Error processing %s", file_path_obj.name) return file_path_obj.stem @@ -222,43 +240,44 @@ def extract_first_heading(file_path: str | Path) -> str: # ---- Citation Handling ---- -def format_authors_for_bibtex(authors): +def format_authors_for_bibtex(authors: list) -> str: """ Format a list of authors in the proper BibTeX format. Args: authors (list): List of author dictionaries with 'given-names' and 'family-names' - Returns: + Returns + ------- str: Authors formatted for BibTeX """ try: if not authors: - logging.warning("No authors provided to format_authors_for_bibtex") + logger.warning("No authors provided to format_authors_for_bibtex") return "" formatted_authors = [] for i, author in enumerate(authors): if not isinstance(author, dict): - logging.warning(f"Author at index {i} is not a dictionary: {author}") + logger.warning("Author at index %s is not a dictionary: %s", i, author) continue family = author.get("family-names", "") given = author.get("given-names", "") if not family and not given: - logging.warning(f"Author at index {i} is missing both family-names and given-names") + logger.warning("Author at index %s is missing both family-names and given-names", i) continue formatted_authors.append(f"{family}, {given}") return " and ".join(formatted_authors) - except Exception as e: - logging.exception(f"Error formatting authors for BibTeX: {e}") + except Exception: + logger.exception("Error formatting authors for BibTeX") return "" -def generate_citation_key(authors, title, year): +def generate_citation_key(authors: list, title: str, year: str) -> str: """ Generate a citation key for BibTeX. 
@@ -267,7 +286,8 @@ def generate_citation_key(authors, title, year): title (str): Title of the work year (str): Year of publication - Returns: + Returns + ------- str: Citation key """ try: @@ -277,7 +297,7 @@ def generate_citation_key(authors, title, year): family_name = first_author.get("family-names", "Unknown") else: family_name = "Unknown" - logging.warning("No valid authors provided for citation key generation") + logger.warning("No valid authors provided for citation key generation") # Get the first word of the title or use 'Untitled' if title and isinstance(title, str): @@ -285,11 +305,11 @@ def generate_citation_key(authors, title, year): first_word = title_words[0] if title_words else "Untitled" else: first_word = "Untitled" - logging.warning("No valid title provided for citation key generation") + logger.warning("No valid title provided for citation key generation") # Use the year or empty string if not year: - logging.warning("No year provided for citation key generation") + logger.warning("No year provided for citation key generation") year = "" # Create a citation key with no invalid characters @@ -297,17 +317,17 @@ def generate_citation_key(authors, title, year): # Clean the key - remove special characters clean_key = re.sub(r"[^a-zA-Z0-9_]", "", raw_key) - - return clean_key or "Unknown_Citation" - except Exception as e: - logging.exception(f"Error generating citation key: {e}") + except Exception: + logger.exception("Error generating citation key") return "Unknown_Citation_Error" + else: + return clean_key or "Unknown_Citation" # ---- Keyword Handling ---- -def extract_keywords(keywords_data): +def extract_keywords(keywords_data: list | None) -> list: """ Extract keywords from various formats. 
@@ -319,15 +339,13 @@ def extract_keywords(keywords_data): Args: keywords_data: Keywords in various formats - Returns: + Returns + ------- list: List of keyword strings """ if not keywords_data: return [] - if not isinstance(keywords_data, list): - return [] - keywords = [] for item in keywords_data: if isinstance(item, str): @@ -336,10 +354,8 @@ def extract_keywords(keywords_data): elif isinstance(item, dict): # Dictionary format with language codes # Extract all values from the dictionary (should be only one per item) - for lang_code, keyword in item.items(): - if keyword: - keywords.append(str(keyword)) + keywords.extend(str(keyword) for keyword in item.values() if keyword) else: - logging.warning(f"Unexpected keyword format: {item}") + logger.warning("Unexpected keyword format: %s", item) return keywords diff --git a/quadriga/metadata/validate_schema.py b/quadriga/metadata/validate_schema.py new file mode 100644 index 0000000..71cd9a8 --- /dev/null +++ b/quadriga/metadata/validate_schema.py @@ -0,0 +1,119 @@ +"""Validate metadata.yml against the QUADRIGA JSON Schema. + +This module fetches the QUADRIGA schema (and referenced sub-schemas) from the +remote URL and validates a metadata dictionary against it. +""" + +from __future__ import annotations + +import json +import logging +import urllib.request + +from quadriga.metadata.utils import get_file_path, load_yaml_file + +logger = logging.getLogger(__name__) + +QUADRIGA_SCHEMA_URL = ( + "https://quadriga-dk.github.io/quadriga-schema/latest/schema.json" +) + + +def _fetch_json(url: str) -> dict: + """Fetch a JSON document from a URL. 
+ + Args: + url: URL to fetch + + Returns + ------- + dict: Parsed JSON content + + Raises + ------ + urllib.error.URLError: If the URL cannot be reached + json.JSONDecodeError: If the response is not valid JSON + """ + with urllib.request.urlopen(url, timeout=30) as resp: # noqa: S310 + return json.loads(resp.read()) + + +def _validate_metadata( + metadata: dict, schema_url: str = QUADRIGA_SCHEMA_URL +) -> tuple[bool, list[str]]: + """Validate a metadata dictionary against the QUADRIGA JSON Schema. + + Fetches the schema (and any ``$ref`` sub-schemas) fresh from the given URL. + + Args: + metadata: Metadata dictionary (e.g. parsed from metadata.yml) + schema_url: URL of the main QUADRIGA schema + + Returns + ------- + tuple[bool, list[str]]: ``(True, [])`` when valid, or + ``(False, [error_message, ...])`` when validation fails or the schema + cannot be fetched. + """ + try: + from jsonschema import Draft202012Validator + from referencing import Registry, Resource + from referencing.jsonschema import DRAFT202012 + except ImportError: + logger.warning( + "jsonschema package not installed – skipping schema validation. 
" + "Install it via: pip install jsonschema" + ) + return True, [] + + try: + logger.info("Fetching QUADRIGA schema from %s ...", schema_url) + main_schema = _fetch_json(schema_url) + except Exception: + logger.exception("Failed to fetch schema from %s", schema_url) + return False, [f"Could not fetch schema from {schema_url}"] + + def retrieve(uri: str) -> Resource: + data = _fetch_json(uri) + return Resource.from_contents(data, default_specification=DRAFT202012) + + try: + registry: Registry = Registry(retrieve=retrieve) + validator = Draft202012Validator(main_schema, registry=registry) + errors = list(validator.iter_errors(metadata)) + except Exception: + logger.exception("Error during schema validation") + return False, ["Unexpected error during schema validation"] + + if errors: + messages = [] + for err in errors: + path = err.json_path if err.json_path != "$" else "(root)" + messages.append(f"{path}: {err.message}") + return False, messages + + return True, [] + + +def validate_schema() -> bool: + """Load metadata.yml and validate it against the QUADRIGA schema. + + Returns + ------- + bool: True if validation passed, False otherwise. + """ + metadata_path = get_file_path("metadata.yml") + metadata = load_yaml_file(metadata_path) + if metadata is None: + logger.error("Could not load metadata.yml for validation.") + return False + + valid, errors = _validate_metadata(metadata) + if valid: + logger.info("Schema validation passed.") + return True + + logger.error("Schema validation failed with %d error(s):", len(errors)) + for i, error in enumerate(errors, 1): + logger.error(" %d. %s", i, error) + return False