diff --git a/invenio_records_resources/services/files/components/__init__.py b/invenio_records_resources/services/files/components/__init__.py index 2c08f31f..c317090b 100644 --- a/invenio_records_resources/services/files/components/__init__.py +++ b/invenio_records_resources/services/files/components/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2021 CERN. +# Copyright (C) 2024 TU Wien. # # Invenio-Records-Resources is free software; you can redistribute it and/or # modify it under the terms of the MIT License; see LICENSE file for more @@ -10,6 +11,7 @@ from .base import FileServiceComponent from .content import FileContentComponent +from .filetype import FileTypeDetectionComponent from .metadata import FileMetadataComponent from .processor import FileProcessorComponent @@ -18,4 +20,5 @@ "FileMetadataComponent", "FileProcessorComponent", "FileServiceComponent", + "FileTypeDetectionComponent", ) diff --git a/invenio_records_resources/services/files/components/filetype.py b/invenio_records_resources/services/files/components/filetype.py new file mode 100644 index 00000000..c63cfba3 --- /dev/null +++ b/invenio_records_resources/services/files/components/filetype.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 TU Wien. +# +# Invenio-Records-Resources is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. 
class FileTypeDetectionComponent(FileServiceComponent):
    """File service component that triggers file format detection."""

    def commit_file(self, identity, id, file_key, record):
        """Register a ``detect_file_type`` task for the committed file.

        The task is wrapped in a ``TaskOp`` and registered on the component's
        unit of work. It receives the ID of the record's bucket (converted to
        a string) and the key of the file.

        :param identity: The identity performing the commit (not used here).
        :param id: The record identifier (not used here).
        :param file_key: The key of the file that has been committed.
        :param record: The record whose bucket holds the file.
        """
        bucket_id = str(record.bucket.id)
        detection_op = TaskOp(detect_file_type, bucket_id, file_key)
        self.uow.register(detection_op)
def _parse_siegfried_result(result, filename):
    """Extract the MIME type and PRONOM ID for ``filename`` from sf's output.

    :param result: The decoded JSON document printed by ``sf -json``.
    :param filename: The file name that was handed to ``sf``; entries that
        refer to any other file are ignored.
    :returns: A ``(mimetype, pronom_id)`` tuple. Each item is ``None`` if no
        usable value was reported.
    """
    mimetype, pronom_id = None, None

    for file_info in result.get("files", []):
        # only consider results for the file in question
        if file_info.get("filename") != filename:
            continue

        # entries that report errors are not trusted at all
        if file_info.get("errors"):
            continue

        for match in file_info.get("matches") or []:
            # NOTE(review): the ID is stored as reported; if siegfried uses a
            #               placeholder ID for unidentified files, it probably
            #               should be filtered out here -- TODO confirm
            if match.get("ns") == "pronom" and match.get("id") is not None:
                pronom_id = match["id"]

            # NOTE: there may be results other than for the "pronom" ns
            #       which may actually deliver better matches
            #       e.g. for the `sway-vulkan` script, the sf website
            #       (https://www.itforarchivists.com/siegfried)
            #       reports "plain text file" and no mimetype for PRONOM
            #       but "shell script" (and a mimetype) for the
            #       "freedesktop.org" ns
            if match.get("mime"):
                mimetype = match["mime"]

    return mimetype, pronom_id


# TODO update siegfried signatures (`sf -update`) regularly
@shared_task(ignore_result=True)
def detect_file_type(bucket_id, file_key):
    """Detect the format of the file using siegfried.

    The detected MIME type is stored on the object version, and the detected
    PRONOM ID is stored as the ``pronom_id`` tag of the object version.
    Detection is best-effort: if it fails, the failure is logged and the
    object version is left untouched.

    :param bucket_id: The ID of the bucket holding the file.
    :param file_key: The key of the file inside the bucket.
    """
    # TODO maybe we should go through the Records-Resources files API instead?
    ov = ObjectVersion.get(bucket_id, file_key)

    # the object may be gone by the time the task runs (e.g. it was deleted
    # again), in which case there is nothing to inspect
    if ov is None or ov.file is None:
        return

    # TODO the original filename is lost (renamed to 'data'), but sf uses the filename
    #      for parts of its algorithm; possible solutions:
    #      * create a temporary alias (link?) to the file and pass that to sf
    #      * pipe the file's contents into sf via stdin and use the `-name` arg

    # TODO question: could we utilize siegfried's server mode?

    mimetype, pronom_id = None, None
    try:
        sf_bin = "sf"
        # TODO this may only be possible for 'local' storage?
        sf_output = sp.check_output([sf_bin, "-json", ov.file.uri], text=True)
        mimetype, pronom_id = _parse_siegfried_result(
            json.loads(sf_output), ov.file.uri
        )

    except Exception:
        # deliberately broad: a failed detection must never break the task,
        # but it has to show up in the application log rather than on stdout
        current_app.logger.exception(
            "File type detection failed for file '%s' in bucket '%s'",
            file_key,
            bucket_id,
        )

    if mimetype is not None:
        ov.mimetype = mimetype
    if pronom_id is not None:
        ObjectVersionTag.create_or_update(ov, "pronom_id", pronom_id)

    db.session.commit()