Skip to content

Commit

Permalink
files: add file format detection on file commit
Browse files Browse the repository at this point in the history
* this functionality uses a tool called siegfried
  https://github.com/richardlehane/siegfried
  • Loading branch information
max-moser committed Jan 11, 2024
1 parent 47f1cf9 commit f3badce
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021 CERN.
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand All @@ -10,6 +11,7 @@

from .base import FileServiceComponent
from .content import FileContentComponent
from .filetype import FileTypeDetectionComponent
from .metadata import FileMetadataComponent
from .processor import FileProcessorComponent

Expand All @@ -18,4 +20,5 @@
"FileMetadataComponent",
"FileProcessorComponent",
"FileServiceComponent",
"FileTypeDetectionComponent",
)
21 changes: 21 additions & 0 deletions invenio_records_resources/services/files/components/filetype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""Service component for detecting file types."""

from ...uow import TaskOp
from ..tasks import detect_file_type
from .base import FileServiceComponent


class FileTypeDetectionComponent(FileServiceComponent):
    """File service component that schedules file format detection."""

    def commit_file(self, identity, id, file_key, record):
        """Register a ``detect_file_type`` task for the committed file.

        The task is wrapped in a ``TaskOp`` and handed to the unit of work;
        it receives the ID of the record's bucket (as a string) and the
        key of the file.
        """
        bucket_id = str(record.bucket.id)
        detection_op = TaskOp(detect_file_type, bucket_id, file_key)
        self.uow.register(detection_op)
3 changes: 3 additions & 0 deletions invenio_records_resources/services/files/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# Copyright (C) 2020-2022 CERN.
# Copyright (C) 2020 Northwestern University.
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand All @@ -15,6 +16,7 @@
FileContentComponent,
FileMetadataComponent,
FileProcessorComponent,
FileTypeDetectionComponent,
)
from .links import FileLink
from .processors import ImageMetadataExtractor
Expand Down Expand Up @@ -54,6 +56,7 @@ class FileServiceConfig(ServiceConfig):
FileMetadataComponent,
FileContentComponent,
FileProcessorComponent,
FileTypeDetectionComponent,
]

file_processors = [
Expand Down
60 changes: 60 additions & 0 deletions invenio_records_resources/services/files/tasks.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""Files tasks."""

import json
import subprocess as sp

import requests
from celery import shared_task
from flask import current_app
from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_files_rest.models import ObjectVersion, ObjectVersionTag

from ...proxies import current_service_registry
from ...services.errors import FileKeyNotFoundError
Expand Down Expand Up @@ -39,3 +45,57 @@ def fetch_file(service_id, record_id, file_key):

except FileKeyNotFoundError as e:
current_app.logger.error(e)


# TODO update siegfried signatures (`sf -update`) regularly
@shared_task(ignore_result=True)
def detect_file_type(bucket_id, file_key):
    """Detect the format of the file using siegfried.

    Runs the ``sf`` binary on the URI of the file and, on a best-effort
    basis, stores the reported mimetype on the object version and the
    reported PRONOM identifier as its ``PUID`` tag. Errors raised while
    running ``sf`` or while interpreting its output are logged and do not
    abort the task.

    :param bucket_id: ID of the bucket that holds the file.
    :param file_key: Key of the file inside the bucket.
    """
    # TODO maybe we should go through the Records-Resources files API instead?
    ov = ObjectVersion.get(bucket_id, file_key)
    # the lookup may not yield an object version at all, and an object
    # version may have no file attached; there is nothing to inspect then
    if ov is None or ov.file is None:
        return

    # TODO the original filename is lost (renamed to 'data'), but sf uses the filename
    #      for parts of its algorithm; possible solutions:
    #      * create a temporary alias (link?) to the file and pass that to sf
    #      * pipe the file's contents into sf via stdin and use the `-name` arg

    # TODO question: could we utilize siegfried's server mode?

    mimetype, pronom_id = None, None
    try:
        sf_bin = "sf"
        # TODO this may only be possible for 'local' storage?
        sf_output = sp.check_output([sf_bin, "-json", ov.file.uri], text=True)
        mimetype, pronom_id = _extract_sf_matches(
            json.loads(sf_output), ov.file.uri
        )

    except Exception:
        # detection is best-effort: a missing `sf` binary, a failing run or
        # unexpected output must not fail the task, but should be visible in
        # the application log rather than on stdout
        current_app.logger.exception(
            "File type detection failed for file '%s' in bucket '%s'.",
            file_key,
            bucket_id,
        )

    if mimetype is not None:
        ov.mimetype = mimetype
    if pronom_id is not None:
        ObjectVersionTag.create_or_update(ov, "PUID", pronom_id)

    db.session.commit()


def _extract_sf_matches(result, file_uri):
    """Return ``(mimetype, pronom_id)`` from siegfried's parsed JSON output.

    Only entries whose ``filename`` equals ``file_uri`` and that report no
    errors are considered. If several matches provide a value, the last one
    wins. Either element of the returned tuple is ``None`` if no value was
    reported.
    """
    mimetype, pronom_id = None, None

    for file_info in result.get("files", []):
        # only consider results for the file in question
        if file_info.get("filename") != file_uri:
            continue

        if file_info.get("errors"):
            continue

        for match in file_info.get("matches") or []:
            if match.get("ns") == "pronom":
                pronom_id = match.get("id") or pronom_id

            # NOTE: there may be results other than for the "pronom" ns
            #       which may actually deliver better matches
            #       e.g. for the `sway-vulkan` script, the sf website
            #       (https://www.itforarchivists.com/siegfried)
            #       reports "plain text file" and no mimetype for PRONOM
            #       but "shell script" (and a mimetype) for the
            #       "freedesktop.org" ns
            mimetype = match.get("mime") or mimetype

    return mimetype, pronom_id

0 comments on commit f3badce

Please sign in to comment.