Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File upload: Add task for file format detection on file commit #553

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021 CERN.
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand All @@ -10,6 +11,7 @@

from .base import FileServiceComponent
from .content import FileContentComponent
from .filetype import FileTypeDetectionComponent
from .metadata import FileMetadataComponent
from .processor import FileProcessorComponent

Expand All @@ -18,4 +20,5 @@
"FileMetadataComponent",
"FileProcessorComponent",
"FileServiceComponent",
"FileTypeDetectionComponent",
)
21 changes: 21 additions & 0 deletions invenio_records_resources/services/files/components/filetype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""Service component for detecting file types."""

from ...uow import TaskOp
from ..tasks import detect_file_type
from .base import FileServiceComponent


class FileTypeDetectionComponent(FileServiceComponent):
    """File service component that schedules file format detection."""

    def commit_file(self, identity, id, file_key, record):
        """Register the format detection task once the file is committed.

        The task is registered on the unit of work as a ``TaskOp`` and
        receives the record's bucket ID (as a string) and the file key.
        """
        bucket_id = str(record.bucket.id)
        detection_op = TaskOp(detect_file_type, bucket_id, file_key)
        self.uow.register(detection_op)
3 changes: 3 additions & 0 deletions invenio_records_resources/services/files/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# Copyright (C) 2020-2022 CERN.
# Copyright (C) 2020 Northwestern University.
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand All @@ -15,6 +16,7 @@
FileContentComponent,
FileMetadataComponent,
FileProcessorComponent,
FileTypeDetectionComponent,
)
from .links import FileLink
from .processors import ImageMetadataExtractor
Expand Down Expand Up @@ -54,6 +56,7 @@ class FileServiceConfig(ServiceConfig):
FileMetadataComponent,
FileContentComponent,
FileProcessorComponent,
FileTypeDetectionComponent,
]

file_processors = [
Expand Down
60 changes: 60 additions & 0 deletions invenio_records_resources/services/files/tasks.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
# Copyright (C) 2024 TU Wien.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""Files tasks."""

import json
import subprocess as sp

import requests
from celery import shared_task
from flask import current_app
from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_files_rest.models import ObjectVersion, ObjectVersionTag

from ...proxies import current_service_registry
from ...services.errors import FileKeyNotFoundError
Expand Down Expand Up @@ -39,3 +45,57 @@ def fetch_file(service_id, record_id, file_key):

except FileKeyNotFoundError as e:
current_app.logger.error(e)


# TODO update siegfried signatures (`sf -update`) regularly
@shared_task(ignore_result=True)
def detect_file_type(bucket_id, file_key):
    """Detect the format of the file using siegfried.

    Runs the ``sf`` binary on the file's URI and parses its JSON report.
    If a MIME type is reported it is set on the object version, and if a
    PRONOM identifier is reported it is stored as the ``PUID`` tag.
    Detection is best effort: any failure is logged and the object version
    is left untouched.

    :param bucket_id: ID of the bucket holding the file.
    :param file_key: Key of the file inside the bucket.
    """
    # TODO maybe we should go through the Records-Resources files API instead?
    ov = ObjectVersion.get(bucket_id, file_key)
    # The object version may not exist (anymore), or may have no file attached.
    if ov is None or ov.file is None:
        return

    # TODO the original filename is lost (renamed to 'data'), but sf uses the
    #      filename for parts of its algorithm; possible solutions:
    #      * create a temporary alias (link?) to the file and pass that to sf
    #      * pipe the file's contents into sf via stdin and use the `-name` arg

    # TODO question: could we utilize siegfried's server mode?

    mimetype, pronom_id = None, None
    try:
        # NOTE(review): if siegfried was installed with `go install`, the
        #               $GOPATH needs to be added to the $PATH for this lookup
        #               to find the binary.
        sf_bin = "sf"

        # TODO this may only be possible for 'local' storage?
        file_uri = ov.file.uri
        sf_output = sp.check_output([sf_bin, "-json", file_uri], text=True)
        mimetype, pronom_id = _parse_siegfried_result(
            json.loads(sf_output), file_uri
        )

    except Exception:
        # Best effort: log through the application logger (with traceback)
        # instead of printing, and carry on without any detection result.
        current_app.logger.exception(
            "File type detection failed for '%s' in bucket '%s'",
            file_key,
            bucket_id,
        )

    if mimetype is not None:
        ov.mimetype = mimetype
    if pronom_id is not None:
        # NOTE(review): is this an appropriate place to store the PRONOM
        #               identifier, or should it be stored somewhere else,
        #               e.g. the file "metadata" which is used to report
        #               dimensions of image files and such?
        ObjectVersionTag.create_or_update(ov, "PUID", pronom_id)

    db.session.commit()


def _parse_siegfried_result(result, file_uri):
    """Extract ``(mimetype, pronom_id)`` for ``file_uri`` from sf's JSON report.

    :param result: The decoded JSON document produced by ``sf -json``.
    :param file_uri: The file name that was passed to ``sf``.
    :returns: A ``(mimetype, pronom_id)`` tuple; either value is ``None`` if
        the report does not provide it.
    """
    mimetype, pronom_id = None, None

    for file_info in result.get("files", []):
        # only consider results for the file in question
        if file_info.get("filename") != file_uri:
            continue

        # entries that reported errors carry no trustworthy matches
        if file_info.get("errors"):
            continue

        for match in file_info.get("matches") or []:
            puid = match.get("id")
            # siegfried reports the placeholder id "UNKNOWN" when nothing
            # matched; that is not a PRONOM identifier and must not be stored
            if match.get("ns") == "pronom" and puid and puid != "UNKNOWN":
                pronom_id = puid

            # NOTE: there may be results other than for the "pronom" ns
            #       which may actually deliver better matches
            #       e.g. for the `sway-vulkan` script, the sf website
            #       (https://www.itforarchivists.com/siegfried)
            #       reports "plain text file" and no mimetype for PRONOM
            #       but "shell script" (and a mimetype) for the
            #       "freedesktop.org" ns
            if match.get("mime"):
                mimetype = match["mime"]

    return mimetype, pronom_id
Loading