Skip to content

Commit

Permalink
[app][feat] validate if configured close #19
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed Jun 10, 2024
1 parent b689a1d commit 136ba39
Show file tree
Hide file tree
Showing 8 changed files with 1,472 additions and 32 deletions.
4 changes: 2 additions & 2 deletions cli_oai_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def _clear_sub_dirs(root_dir: str):
process_resource_monitor.check_vmem()
process_resource_monitor.monit_disk_space(PROCESS.load)
if CFG.getboolean('mets', 'prevalidate', fallback=True):
PROCESS.validate_mets()
PROCESS.validate_metadata()
PROCESS.inspect_metadata()
PROCESS.clear_existing_entries()
PROCESS.language_modelconfig()
Expand All @@ -372,7 +372,7 @@ def _clear_sub_dirs(root_dir: str):
PROCESS.create_text_bundle_data()
PROCESS.postprocess_mets()
if CFG.getboolean('mets', 'postvalidate', fallback=True):
PROCESS.validate_mets()
PROCESS.validate_metadata()
if not MUST_KEEP_RESOURCES:
PROCESS.delete_before_export(LOCAL_DELETE_BEFORE_EXPORT)
PROCESS.export_data()
Expand Down
4 changes: 2 additions & 2 deletions cli_oai_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def wrap_save_record_state(status: str, urn, **kwargs):
process_resource_monitor.check_vmem()
process_resource_monitor.monit_disk_space(PROCESS.load)
if CFG.getboolean('mets','prevalidate', fallback=True):
PROCESS.validate_mets()
PROCESS.validate_metadata()
PROCESS.inspect_metadata()
PROCESS.clear_existing_entries()
PROCESS.language_modelconfig()
Expand All @@ -214,7 +214,7 @@ def wrap_save_record_state(status: str, urn, **kwargs):
PROCESS.create_text_bundle_data()
PROCESS.postprocess_mets()
if CFG.getboolean('mets','postvalidate', fallback=True):
PROCESS.validate_mets()
PROCESS.validate_metadata()
if not MUST_KEEP_RESOURCES:
PROCESS.delete_before_export(LOCAL_DELETE_BEVOR_EXPORT)
PROCESS.export_data()
Expand Down
23 changes: 13 additions & 10 deletions lib/ocrd3_odem/ocrd3_odem.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,9 @@
CATALOG_ULB,
ODEMMetadataInspecteur,
ODEMMetadataMetsException,
# extract_mets_data,
integrate_ocr_file,
postprocess_mets,
validate_mets,
validate,
)
from .processing_ocrd import (
run_ocr_page,
Expand Down Expand Up @@ -480,14 +479,18 @@ def postprocess_mets(self):

postprocess_mets(self.mets_file, self.cfg.get('ocr', 'ocrd_baseimage'))

def validate_mets(self):
"""Forward METS-schema validation

Calls the imported validate_mets function with self.mets_file.
An InvalidXMLException whose first argument contains 'SCHEMASV'
is re-raised as ODEMException (chained to the original error);
any other InvalidXMLException is raised again unchanged.
"""
try:
validate_mets(self.mets_file)
except dfv.InvalidXMLException as err:
# only failures whose first argument mentions 'SCHEMASV' are translated
if len(err.args) > 0 and ('SCHEMASV' in str(err.args[0])):
raise ODEMException(str(err.args[0])) from err
raise err
def validate_metadata(self):
    """Run the (optional) metadata validation for the current record

    Checks the METS/MODS XML schema and, if option 'ddb_validation'
    in section 'mets' is present and switched on, also the current
    DDB schematron rules. Returns whatever validate(...) yields.
    """
    # DDB validation is opt-in: a missing option means disabled
    ddb_enabled = (self.cfg.has_option('mets', 'ddb_validation')
                   and self.cfg.getboolean('mets', 'ddb_validation', fallback=False))
    # use the PICA type stored with the record if present, else default 'Aa'
    pica_type = self.record.info['pica'] if 'pica' in self.record.info else 'Aa'
    return validate(self.mets_file, validate_ddb=ddb_enabled, digi_type=pica_type)

def export_data(self):
"""re-do metadata and transform into output format"""
Expand Down
34 changes: 16 additions & 18 deletions lib/ocrd3_odem/processing_mets.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
"""Encapsulate Implementations concerning METS/MODS handling"""

import os

from typing import (
List,
)
import typing

import lxml.etree as ET
import digiflow as df
Expand All @@ -13,6 +10,7 @@
from .odem_commons import (
FILEGROUP_IMG,
FILEGROUP_OCR,
ODEMException,
)

TYPE_PRINTS_PICA = ['a', 'f', 'F', 'Z', 'B']
Expand All @@ -29,16 +27,6 @@
IMAGE_GROUP_DEFAULT = 'DEFAULT'


# def extract_mets_data(the_self, the_data):
# """
# Migration Post-receive OAI METS/MODS callback
# """

# xml_root = ET.fromstring(the_data)
# mets_tree = df.post_oai_extract_metsdata(xml_root)
# df.write_xml_file(mets_tree, the_self.path_mets)


class ODEMMetadataMetsException(Exception):
"""Mark state when inconsistencies exist
between linkings of physical and logical
Expand Down Expand Up @@ -261,7 +249,7 @@ def clear_filegroups(xml_file, removals):
proc.write()


def integrate_ocr_file(xml_tree, ocr_files: List) -> int:
def integrate_ocr_file(xml_tree, ocr_files: typing.List) -> int:
"""Enrich given OCR-Files into XML tree
Returns number of linked files
Expand Down Expand Up @@ -319,7 +307,7 @@ def _link_fulltext(file_ident, xml_tree):
return 0


def is_in(tokens: List[str], label):
def is_in(tokens: typing.List[str], label):
    """Tell whether at least one of the tokens occurs inside label"""

    for token in tokens:
        if token in label:
            return True
    return False
Expand Down Expand Up @@ -370,8 +358,18 @@ def _clear_provenance_links(mproc):
parent.remove(old_dv)


def validate_mets(mets_file:str):
def validate(mets_file: str, validate_ddb=False, digi_type='Aa'):
    """Validate a METS/MODS file against the XML schema and,
    optionally, against the DDB validation rules

    Args:
        mets_file: path of the METS file to parse and validate
        validate_ddb: if True, additionally run df.ddb_validation
        digi_type: type label passed on to the DDB validation

    Returns:
        True if no validation step raised

    Raises:
        ODEMException: for schema failures whose first argument
            contains 'SCHEMASV' and for any DigiflowDDBException
        dfv.InvalidXMLException: every other XML validation failure
            is passed through unchanged
    """

    xml_root = ET.parse(mets_file).getroot()
    try:
        dfv.validate_xml(xml_root)
        if validate_ddb:
            df.ddb_validation(path_mets=mets_file, digi_type=digi_type)
    except dfv.InvalidXMLException as err:
        if err.args and 'SCHEMASV' in str(err.args[0]):
            raise ODEMException(str(err.args[0])) from err
        # bare raise keeps the original traceback untouched
        raise
    except df.DigiflowDDBException as ddb_err:
        # args may be empty; an IndexError here would hide the real failure
        reason = ddb_err.args[0] if ddb_err.args else repr(ddb_err)
        raise ODEMException(reason) from ddb_err
    return True
Loading

0 comments on commit 136ba39

Please sign in to comment.