diff --git a/cli_dir_local.py b/cli_dir_local.py index 1dcc183..ca08c87 100644 --- a/cli_dir_local.py +++ b/cli_dir_local.py @@ -123,7 +123,7 @@ PROCESS: ODEMProcess = ODEMProcess.create(proc_type, None, req_dst_dir, EXECUTORS) PROCESS.local_mode = True - PROCESS.cfg = CFG + PROCESS.odem_configuration = CFG PROCESS.the_logger = LOGGER local_images = PROCESS.get_local_image_paths(image_local_dir=ROOT_PATH) PROCESS._statistics_ocr[STATS_KEY_N_PAGES] = len(local_images) diff --git a/cli_mets_local.py b/cli_mets_local.py new file mode 100644 index 0000000..c06d32b --- /dev/null +++ b/cli_mets_local.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- +"""MAIN CLI plain OCR with optional export""" + +import argparse +import os +import sys + +from pathlib import Path + +import digiflow as df + +import lib.ocrd3_odem as o3o + +from lib.resources_monitoring import ProcessResourceMonitor, ProcessResourceMonitorConfig + +DEFAULT_EXECUTORS = 2 + + +######## +# MAIN # +######## +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description="generate ocr-data for OAI-Record") + PARSER.add_argument( + "mets_file", + help="path to digital object's METS/MODS file") + PARSER.add_argument( + "-c", + "--config", + required=False, + default="resources/odem.ini", + help="path to configuration file") + PARSER.add_argument( + "-e", + "--executors", + required=False, + help="Number of OCR-D Executors in parallel mode") + PARSER.add_argument( + "-s", + "--sequential-mode", + required=False, + default=False, + action="store_true", + help="Disable parallel mode, just run sequential") + PARSER.add_argument( + "-k", + "--keep-resources", + required=False, + default=False, + action='store_true', + help="keep stored images after processing") + PARSER.add_argument( + "-l", + "--lock-mode", + required=False, + default=False, + action='store_true', + help="lock each run to avoid parallel starts") + ARGS = PARSER.parse_args() + + # check some pre-conditions + # inspect configuration settings + CONF_FILE = os.path.abspath(ARGS.config) + if not os.path.exists(CONF_FILE): + print(f"[ERROR] no config at '{CONF_FILE}'! Halt execution!") + sys.exit(1) + + # pick common args + SEQUENTIAL = ARGS.sequential_mode + MUST_KEEP_RESOURCES = ARGS.keep_resources + MUST_LOCK = ARGS.lock_mode + EXECUTOR_ARGS = ARGS.executors + + CFG = o3o.get_configparser() + configurations_read = CFG.read(CONF_FILE) + if not configurations_read: + print(f"unable to read config from '{CONF_FILE}! exit!") + sys.exit(1) + + CREATE_PDF: bool = CFG.getboolean('derivans', 'derivans_enabled', fallback=True) + ENRICH_METS_FULLTEXT: bool = CFG.getboolean('export', 'enrich_mets_fulltext', fallback=True) + + # set work_dirs and logger + LOCAL_DELETE_BEVOR_EXPORT = [] + if CFG.has_option('export', 'delete_before_export'): + LOCAL_DELETE_BEVOR_EXPORT = CFG.getlist('export', 'delete_before_export') + LOCAL_LOG_DIR = CFG.get('global', 'local_log_dir') + if not os.path.exists(LOCAL_LOG_DIR) or not os.access( + LOCAL_LOG_DIR, os.W_OK): + raise RuntimeError(f"cant store log files at invalid {LOCAL_LOG_DIR}") + LOG_FILE_NAME = None + if CFG.has_option('global', 'logfile_name'): + LOG_FILE_NAME = CFG.get('global', 'logfile_name') + LOGGER = o3o.get_logger(LOCAL_LOG_DIR, LOG_FILE_NAME) + + mets_file: Path = Path(ARGS.mets_file).absolute() + if not mets_file.is_file(): + print(f"unable to read file '{mets_file}! 
exit!") + sys.exit(1) + LOGGER.info("use '%s'", mets_file) + mets_file_dir = mets_file.parent + + # if valid n_executors via cli, use it's value + if EXECUTOR_ARGS and int(EXECUTOR_ARGS) > 0: + CFG.set('ocr', 'n_executors', str(EXECUTOR_ARGS)) + EXECUTORS = CFG.getint('ocr', 'n_executors', fallback=DEFAULT_EXECUTORS) + if SEQUENTIAL: + EXECUTORS = 1 + LOGGER.debug("local work_root: '%s', executors:%s, keep_res:%s, lock:%s", + mets_file_dir, EXECUTORS, MUST_KEEP_RESOURCES, MUST_LOCK) + + try: + local_ident = mets_file.stem + proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None) + if proc_type is None: + LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. defaults to 'OCRD_PAGE_PARALLEL'") + record = df.OAIRecord(local_ident) + odem_process: o3o.ODEMProcess = o3o.ODEMProcess(record, mets_file_dir) + odem_process.the_logger = LOGGER + odem_process.the_logger.info("[%s] odem from %s, %d executors", local_ident, mets_file, EXECUTORS) + odem_process.odem_configuration = CFG + process_resource_monitor: ProcessResourceMonitor = ProcessResourceMonitor( + ProcessResourceMonitorConfig( + enable_resource_monitoring=CFG.getboolean('resource-monitoring', 'enable', fallback=False), + polling_interval=CFG.getfloat('resource-monitoring', 'polling_interval', fallback=1), + path_disk_usage=CFG.get('resource-monitoring', 'path_disk_usage', fallback='/home/ocr'), + factor_free_disk_space_needed=CFG.getfloat( + 'resource-monitoring', + 'factor_free_disk_space_needed', + fallback=3.0 + ), + max_vmem_percentage=CFG.getfloat('resource-monitoring', 'max_vmem_percentage', fallback=None), + max_vmem_bytes=CFG.getint('resource-monitoring', 'max_vmem_bytes', fallback=None), + ), + LOGGER.error, + None, + odem_process.process_identifier, + record.identifier + ) + process_resource_monitor.check_vmem() + # process_resource_monitor.monit_disk_space(odem_process.load) + odem_process.inspect_metadata() + if CFG.getboolean('mets','prevalidate', fallback=True): + odem_process.validate_metadata() + odem_process.clear_existing_entries() + odem_process.language_modelconfig() + odem_process.set_local_images() + + # NEW NEW NEW + odem_pipeline = o3o.ODEMOCRPipeline.create(proc_type, odem_process) + odem_runner = o3o.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline) + OUTCOMES = process_resource_monitor.monit_vmem(odem_runner.run) + if OUTCOMES is None or len(OUTCOMES) == 0: + raise o3o.ODEMException(f"process run error: {record.identifier}") + + odem_process.calculate_statistics_ocr(OUTCOMES) + odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics) + odem_process.link_ocr() + if CREATE_PDF: + odem_process.create_pdf() + odem_process.postprocess_ocr() + if CREATE_PDF: + odem_process.create_text_bundle_data() + odem_process.postprocess_mets() + if CFG.getboolean('mets','postvalidate', fallback=True): + odem_process.validate_metadata() + if odem_process.odem_configuration.has_option('export', 'local_export_dir'): + odem_process.the_logger.info("[%s] start to export data", + odem_process.process_identifier) + if not MUST_KEEP_RESOURCES: + odem_process.delete_before_export(LOCAL_DELETE_BEVOR_EXPORT) + odem_process.export_data() + _mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}' + odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier, + odem_process.duration, _mode, odem_process.statistics) + # finale + LOGGER.info("[%s] odem done in '%s' (%d executors)", + odem_process.process_identifier, odem_process.duration, 
EXECUTORS) + except o3o.ODEMNoTypeForOCRException as type_unknown: + # we don't ocr this one + LOGGER.warning("[%s] odem skips '%s'", + odem_process.process_identifier, type_unknown.args[0]) + except o3o.ODEMNoImagesForOCRException as not_ocrable: + LOGGER.warning("[%s] odem no ocrables '%s'", + odem_process.process_identifier, not_ocrable.args) + except o3o.ODEMException as _odem_exc: + _err_args = {'ODEMException': _odem_exc.args[0]} + LOGGER.error("[%s] odem fails with: '%s'", odem_process.process_identifier, _err_args) + except RuntimeError as exc: + LOGGER.error("odem fails for '%s' after %s with: '%s'", + record, odem_process.duration, str(exc)) + sys.exit(1) diff --git a/cli_oai_client.py b/cli_oai_client.py index 1430461..e882ad0 100644 --- a/cli_oai_client.py +++ b/cli_oai_client.py @@ -320,7 +320,7 @@ def oai_arg_parser(value): local_ident, CLIENT.host, EXECUTORS ) - PROCESS.cfg = CFG + PROCESS.odem_configuration = CFG try: if os.path.exists(req_dst_dir): diff --git a/cli_oai_local.py b/cli_oai_local.py index 45c1ab6..41dc790 100644 --- a/cli_oai_local.py +++ b/cli_oai_local.py @@ -1,18 +1,12 @@ # -*- coding: utf-8 -*- """MAIN CLI OAI LOCAL ODEM""" +import ast import argparse import os import shutil import sys -from ast import ( - literal_eval, -) -from digiflow import ( - OAIRecordHandler, - OAIRecord, - LocalStore -) + import digiflow as df import lib.ocrd3_odem as o3o @@ -39,10 +33,10 @@ def trnfrm(row): oai_id = row[RECORD_IDENTIFIER] try: - _info = literal_eval(row[RECORD_INFO]) + _info = ast.literal_eval(row[RECORD_INFO]) except: _info = row[RECORD_INFO] - _record = OAIRecord(oai_id,) + _record = df.OAIRecord(oai_id,) _record.info = _info return _record @@ -143,7 +137,7 @@ def trnfrm(row): DATA_FIELDS = CFG.getlist('global', 'data_fields') LOGGER.info("data fields: '%s'", DATA_FIELDS) LOGGER.info("use records from '%s'", OAI_RECORD_FILE) - handler = OAIRecordHandler( + handler = df.OAIRecordHandler( OAI_RECORD_FILE, data_fields=DATA_FIELDS, transform_func=trnfrm) record: df.OAIRecord = handler.next_record(state=MARK_OCR_OPEN) if not record: @@ -165,16 +159,15 @@ def wrap_save_record_state(status: str, urn, **kwargs): proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None) if proc_type is None: LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. 
defaults to 'OCRD_PAGE_PARALLEL'") - PROCESS: ODEMProcess = ODEMProcess.create(proc_type, record, req_dst_dir, EXECUTORS) - - PROCESS.the_logger = LOGGER - PROCESS.the_logger.info("[%s] odem from %s, %d executors", local_ident, OAI_RECORD_FILE, EXECUTORS) - PROCESS.cfg = CFG + odem_process: ODEMProcess = ODEMProcess(record, req_dst_dir) + odem_process.the_logger = LOGGER + odem_process.the_logger.info("[%s] odem from %s, %d executors", local_ident, OAI_RECORD_FILE, EXECUTORS) + odem_process.odem_configuration = CFG LOCAL_STORE_ROOT = CFG.get('global', 'local_store_root', fallback=None) if LOCAL_STORE_ROOT is not None: STORE_DIR = os.path.join(LOCAL_STORE_ROOT, local_ident) - STORE = LocalStore(STORE_DIR, req_dst_dir) - PROCESS.store = STORE + STORE = df.LocalStore(STORE_DIR, req_dst_dir) + odem_process.store = STORE process_resource_monitor: ProcessResourceMonitor = ProcessResourceMonitor( ProcessResourceMonitorConfig( enable_resource_monitoring=CFG.getboolean('resource-monitoring', 'enable', fallback=False), @@ -191,67 +184,74 @@ def wrap_save_record_state(status: str, urn, **kwargs): LOGGER.error, wrap_save_record_state, None, - PROCESS.process_identifier, + odem_process.process_identifier, record.identifier ) process_resource_monitor.check_vmem() - process_resource_monitor.monit_disk_space(PROCESS.load) - PROCESS.inspect_metadata() + process_resource_monitor.monit_disk_space(odem_process.load) + odem_process.inspect_metadata() if CFG.getboolean('mets','prevalidate', fallback=True): - PROCESS.validate_metadata() - PROCESS.clear_existing_entries() - PROCESS.language_modelconfig() - PROCESS.set_local_images() - OUTCOMES = process_resource_monitor.monit_vmem(PROCESS.run) - PROCESS.calculate_statistics_ocr(OUTCOMES) - PROCESS.the_logger.info("[%s] %s", local_ident, PROCESS.statistics) - PROCESS.link_ocr() + odem_process.validate_metadata() + odem_process.clear_existing_entries() + odem_process.language_modelconfig() + odem_process.set_local_images() + + # NEW NEW NEW + odem_pipeline = o3o.ODEMOCRPipeline.create(proc_type, odem_process) + odem_runner = o3o.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline) + OUTCOMES = process_resource_monitor.monit_vmem(odem_runner.run) + if OUTCOMES is None or len(OUTCOMES) == 0: + raise ODEMException(f"process run error: {record.identifier}") + + odem_process.calculate_statistics_ocr(OUTCOMES) + odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics) + odem_process.link_ocr() if CREATE_PDF: - PROCESS.create_pdf() - PROCESS.postprocess_ocr() + odem_process.create_pdf() + odem_process.postprocess_ocr() if CREATE_PDF: - PROCESS.create_text_bundle_data() - PROCESS.postprocess_mets() + odem_process.create_text_bundle_data() + odem_process.postprocess_mets() if CFG.getboolean('mets','postvalidate', fallback=True): - PROCESS.validate_metadata() + odem_process.validate_metadata() if not MUST_KEEP_RESOURCES: - PROCESS.delete_before_export(LOCAL_DELETE_BEVOR_EXPORT) - PROCESS.export_data() - _kwargs = PROCESS.statistics - if PROCESS.record.info != 'n.a.': + odem_process.delete_before_export(LOCAL_DELETE_BEVOR_EXPORT) + odem_process.export_data() + _kwargs = odem_process.statistics + if odem_process.record.info != 'n.a.': try: - if isinstance(PROCESS.record.info, str): - _info = dict(literal_eval(PROCESS.record.info)) - PROCESS.record.info.update(_kwargs) - _info = f"{PROCESS.record.info}" + if isinstance(odem_process.record.info, str): + _info = dict(ast.literal_eval(odem_process.record.info)) + 
odem_process.record.info.update(_kwargs) + _info = f"{odem_process.record.info}" except: - PROCESS.the_logger.error("Can't parse '%s', store info literally", - PROCESS.record.info) + odem_process.the_logger.error("Can't parse '%s', store info literally", + odem_process.record.info) _info = f"{_kwargs}" else: _info = f"{_kwargs}" handler.save_record_state(record.identifier, MARK_OCR_DONE, INFO=_info) _mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}' - PROCESS.the_logger.info("[%s] duration: %s/%s (%s)", PROCESS.process_identifier, - PROCESS.duration, _mode, PROCESS.statistics) + odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier, + odem_process.duration, _mode, odem_process.statistics) # finale LOGGER.info("[%s] odem done in '%s' (%d executors)", - PROCESS.process_identifier, PROCESS.duration, EXECUTORS) + odem_process.process_identifier, odem_process.duration, EXECUTORS) except o3o.ODEMNoTypeForOCRException as type_unknown: # we don't ocr this one LOGGER.warning("[%s] odem skips '%s'", - PROCESS.process_identifier, type_unknown.args[0]) + odem_process.process_identifier, type_unknown.args[0]) handler.save_record_state(record.identifier, o3o.MARK_OCR_SKIP) except o3o.ODEMNoImagesForOCRException as not_ocrable: LOGGER.warning("[%s] odem no ocrables '%s'", - PROCESS.process_identifier, not_ocrable.args) + odem_process.process_identifier, not_ocrable.args) handler.save_record_state(record.identifier, o3o.MARK_OCR_SKIP) except ODEMException as _odem_exc: _err_args = {'ODEMException': _odem_exc.args[0]} - LOGGER.error("[%s] odem fails with: '%s'", PROCESS.process_identifier, _err_args) + LOGGER.error("[%s] odem fails with: '%s'", odem_process.process_identifier, _err_args) handler.save_record_state(record.identifier, MARK_OCR_FAIL, INFO=f'{_err_args}') except RuntimeError as exc: LOGGER.error("odem fails for '%s' after %s with: '%s'", - record, PROCESS.duration, str(exc)) + record, odem_process.duration, str(exc)) handler.save_record_state(record.identifier, MARK_OCR_FAIL, INFO=f'{str(exc) : exc.args[0]}') sys.exit(1) diff --git a/lib/ocrd3_odem/__init__.py b/lib/ocrd3_odem/__init__.py index 261d345..bde6607 100644 --- a/lib/ocrd3_odem/__init__.py +++ b/lib/ocrd3_odem/__init__.py @@ -5,6 +5,8 @@ from .ocrd3_odem import ( OdemWorkflowProcessType, ODEMProcess, + ODEMOCRPipeline, + ODEMPipelineRunner, OCRDPageParallel, ODEMTesseract, ) diff --git a/lib/ocrd3_odem/ocrd3_odem.py b/lib/ocrd3_odem/ocrd3_odem.py index 523cdfa..52fbf26 100644 --- a/lib/ocrd3_odem/ocrd3_odem.py +++ b/lib/ocrd3_odem/ocrd3_odem.py @@ -6,6 +6,7 @@ import concurrent.futures import configparser import datetime +import typing import logging import os import shutil @@ -13,21 +14,15 @@ import subprocess import tempfile import time +import typing + from enum import Enum -from pathlib import ( - Path -) -from typing import ( - Dict, - List, - Optional, -) -import lxml.etree as ET +from pathlib import Path + import numpy as np import digiflow as df import digiflow.digiflow_export as dfx import digiflow.digiflow_metadata as dfm -import digiflow.validate as dfv from .odem_commons import ( CFG_SEC_OCR, @@ -36,8 +31,6 @@ KEY_LANGUAGES, STATS_KEY_LANGS, STATS_KEY_MODELS, - STATS_KEY_N_PAGES, - STATS_KEY_N_OCRABLE, STATS_KEY_N_OCR, STATS_KEY_MB, STATS_KEY_MPS, @@ -46,9 +39,8 @@ ODEMException, ) from .processing_mets import ( - CATALOG_ULB, ODEMMetadataInspecteur, - ODEMMetadataMetsException, + extract_text_content, integrate_ocr_file, postprocess_mets, validate, @@ -57,7 +49,7 @@ 
run_ocr_page, ) from .processing_ocr_pipeline import ( - analyze, + STEP_MOVE_PATH_TARGET, run_pipeline, ) from .processing_ocr_results import ( @@ -107,20 +99,6 @@ class ODEMProcess: for the underlying OCR-Engine Tesseract-OCR. """ - @staticmethod - def create( - workflow_type: OdemWorkflowProcessType | str, - record: df.OAIRecord, - work_dir, - executors=2, - log_dir=None, - logger=None - ) -> ODEMProcess: - if (workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT - or workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT.value): - return ODEMTesseract(record, work_dir, executors, log_dir, logger) - return OCRDPageParallel(record, work_dir, executors, log_dir, logger) - def __init__(self, record: df.OAIRecord, work_dir, executors=2, log_dir=None, logger=None): """Create new ODEM Process. Args: @@ -132,11 +110,10 @@ def __init__(self, record: df.OAIRecord, work_dir, executors=2, log_dir=None, lo Defaults to None. """ - self.identifiers: Optional[Dict] self.record = record - self.n_executors = executors self.work_dir_main = work_dir self.digi_type = None + self.mods_identifier = None self.local_mode = record is None self.process_identifier = None if self.local_mode: @@ -145,12 +122,12 @@ def __init__(self, record: df.OAIRecord, work_dir, executors=2, log_dir=None, lo self.process_identifier = record.local_identifier self.export_dir = None self.the_logger: logging.Logger = None - self.cfg: configparser.ConfigParser = None + self.odem_configuration: configparser.ConfigParser = None self.store: df.LocalStore = None - self.images_4_ocr: List = [] # List[str] | List[Tuple[str, str]] + self.images_4_ocr: typing.List = [] # List[str] | List[Tuple[str, str]] self.ocr_files = [] self.ocr_function = None - self.ocr_input: List[List] = [] + self.ocr_input: typing.List = [] self._statistics_ocr = {'execs': executors} self._process_start = time.time() if logger is not None: @@ -190,7 +167,7 @@ def load(self): req_dst = os.path.join(req_dst_dir, local_identifier + '.xml') self.the_logger.debug("[%s] download %s to %s", self.process_identifier, request_identifier, req_dst) - base_url = self.cfg.get('global', 'base_url') + base_url = self.odem_configuration.get('global', 'base_url') try: loader = df.OAILoader(req_dst_dir, base_url=base_url, post_oai=dfm.extract_mets) loader.store = self.store @@ -226,14 +203,16 @@ def inspect_metadata(self): insp = ODEMMetadataInspecteur(self.mets_file, self.record.identifier, - cfg=self.cfg) + cfg=self.odem_configuration) try: - insp.inspect() + the_report = insp.metadata_report() + self.digi_type = the_report.type self.images_4_ocr = insp.image_pairs except RuntimeError as mde: raise ODEMException(f"{mde.args[0]}") from mde - self.identifiers = insp.identifiers - self._statistics_ocr[CATALOG_ULB] = insp.record_identifier + self.mods_identifier = insp.mods_record_identifier + for t, ident in insp.identifiers.items(): + self._statistics_ocr[t] = ident self._statistics_ocr['type'] = insp.type self._statistics_ocr[STATS_KEY_LANGS] = insp.languages self._statistics_ocr['n_images_pages'] = insp.n_images_pages @@ -247,8 +226,8 @@ def inspect_metadata(self): def clear_existing_entries(self): """Clear METS/MODS of configured file groups""" - if self.cfg: - _blacklisted = self.cfg.getlist('mets', 'blacklist_file_groups') + if self.odem_configuration: + _blacklisted = self.odem_configuration.getlist('mets', 'blacklist_file_groups') _ident = self.process_identifier self.the_logger.info("[%s] remove %s", _ident, _blacklisted) _proc = df.MetsProcessor(self.mets_file) @@ -266,7 
+245,7 @@ def language_modelconfig(self, languages=None) -> str: """ _models = [] - model_mappings: dict = self.cfg.getdict( # pylint: disable=no-member + model_mappings: dict = self.odem_configuration.getdict( # pylint: disable=no-member 'ocr', 'model_mapping') self.the_logger.info("[%s] inspect languages '%s'", self.process_identifier, languages) @@ -281,7 +260,7 @@ def language_modelconfig(self, languages=None) -> str: _models.append(model) else: raise ODEMException(f"'{model}' model config not found !") - _model_conf = '+'.join(_models) if self.cfg.getboolean('ocr', "model_combinable", fallback=True) else _models[0] + _model_conf = '+'.join(_models) if self.odem_configuration.getboolean('ocr', "model_combinable", fallback=True) else _models[0] self._statistics_ocr[STATS_KEY_MODELS] = _model_conf self.the_logger.info("[%s] map languages '%s' => '%s'", self.process_identifier, languages, _model_conf) @@ -303,8 +282,8 @@ def map_language_to_modelconfig(self, image_path) -> str: _file_lang_suffixes = DEFAULT_LANG # inspect language arg - if self.cfg.has_option(CFG_SEC_OCR, KEY_LANGUAGES): - _file_lang_suffixes = self.cfg.get(CFG_SEC_OCR, KEY_LANGUAGES).split('+') + if self.odem_configuration.has_option(CFG_SEC_OCR, KEY_LANGUAGES): + _file_lang_suffixes = self.odem_configuration.get(CFG_SEC_OCR, KEY_LANGUAGES).split('+') return self.language_modelconfig(_file_lang_suffixes) # inspect final '_' segment of local file names if self.local_mode: @@ -324,14 +303,14 @@ def map_language_to_modelconfig(self, image_path) -> str: def _is_model_available(self, model) -> bool: """Determine whether model is available""" - resource_dir_mappings = self.cfg.getdict(CFG_SEC_OCR, CFG_KEY_RES_VOL, fallback={}) + resource_dir_mappings = self.odem_configuration.getdict(CFG_SEC_OCR, CFG_KEY_RES_VOL, fallback={}) for host_dir, _ in resource_dir_mappings.items(): training_file = host_dir + '/' + model if os.path.exists(training_file): return True return False - def get_local_image_paths(self, image_local_dir=None) -> List[str]: + def get_local_image_paths(self, image_local_dir=None) -> typing.List[str]: """Build dataset from two different scenarios (-therefore setting images is divided from filtering): @@ -348,7 +327,7 @@ def get_local_image_paths(self, image_local_dir=None) -> List[str]: image_dir = image_local_dir # gather local images, propably recursive - images: List[str] = sorted([ + images: typing.List[str] = sorted([ os.path.join(curr, the_file) for curr, _, the_files in os.walk(image_dir) for the_file in the_files @@ -377,13 +356,7 @@ def set_local_images(self): _images_of_interest.append((_the_file, _urn)) self.images_4_ocr = _images_of_interest - def run(self) -> List: - """Execute OCR workflow - Subject to actual ODEM flavor - """ - return [(0, 0, 0, 0)] - - def calculate_statistics_ocr(self, outcomes: List): + def calculate_statistics_ocr(self, outcomes: typing.List): """Calculate and aggregate runtime stats""" n_ocr = sum([e[0] for e in outcomes if e[0] == 1]) _total_mps = [round(e[2], 1) for e in outcomes if e[0] == 1] @@ -411,7 +384,7 @@ def postprocess_ocr(self): # inspect each single created ocr file # drop unwanted elements # clear punctual regions - strip_tags = self.cfg.getlist('ocr', 'strip_tags') + strip_tags = self.odem_configuration.getlist('ocr', 'strip_tags') for _ocr_file in self.ocr_files: postprocess_ocrd_file(_ocr_file, strip_tags) @@ -420,40 +393,32 @@ def create_text_bundle_data(self): read ocr-file sequential according to their number label and extract every row into additional 
text file""" - _ocrs = sorted(self.ocr_files) - _txts = [] - for _o in _ocrs: - with open(_o, mode='r', encoding='UTF-8') as _ocr_file: - _alto_root = ET.parse(_ocr_file) - _lines = _alto_root.findall('.//alto:TextLine', df.XMLNS) - for _l in _lines: - _l_strs = [s.attrib['CONTENT'] for s in _l.findall('.//alto:String', df.XMLNS)] - _txts.append(' '.join(_l_strs)) - txt_content = '\n'.join(_txts) - _out_path = os.path.join(self.work_dir_main, f'{self.statistics[CATALOG_ULB]}.pdf.txt') + txt_lines = extract_text_content(self.ocr_files) + txt_content = '\n'.join(txt_lines) + _out_path = os.path.join(self.work_dir_main, f'{self.mods_identifier}.pdf.txt') with open(_out_path, mode='w', encoding='UTF-8') as _writer: _writer.write(txt_content) self.the_logger.info("[%s] harvested %d lines from %d ocr files to %s", - self.process_identifier, len(_txts), len(_ocrs), _out_path) - self._statistics_ocr['n_text_lines'] = len(_txts) + self.process_identifier, len(txt_lines), len(self.ocr_files), _out_path) + self._statistics_ocr['n_text_lines'] = len(txt_lines) def create_pdf(self): """Forward PDF-creation to Derivans""" - _cfg_path_dir_bin = self.cfg.get('derivans', 'derivans_dir_bin', fallback=None) + _cfg_path_dir_bin = self.odem_configuration.get('derivans', 'derivans_dir_bin', fallback=None) path_bin = None if _cfg_path_dir_bin is not None: path_bin = os.path.join(PROJECT_ROOT, _cfg_path_dir_bin) - _cfg_path_dir_project = self.cfg.get('derivans', 'derivans_dir_project', fallback=None) + _cfg_path_dir_project = self.odem_configuration.get('derivans', 'derivans_dir_project', fallback=None) path_prj = None if _cfg_path_dir_project is not None: path_prj = os.path.join(PROJECT_ROOT, _cfg_path_dir_project) path_cfg = os.path.join( PROJECT_ROOT, - self.cfg.get('derivans', 'derivans_config') + self.odem_configuration.get('derivans', 'derivans_config') ) - derivans_image = self.cfg.get('derivans', 'derivans_image', fallback=None) - path_logging = self.cfg.get('derivans', 'derivans_logdir', fallback=None) + derivans_image = self.odem_configuration.get('derivans', 'derivans_image', fallback=None) + path_logging = self.odem_configuration.get('derivans', 'derivans_logdir', fallback=None) derivans: df.BaseDerivansManager = df.BaseDerivansManager.create( self.mets_file, container_image_name=derivans_image, @@ -490,7 +455,7 @@ def delete_before_export(self, folders): def postprocess_mets(self): """wrap work related to processing METS/MODS""" - postprocess_mets(self.mets_file, self.cfg.get('ocr', 'ocrd_baseimage')) + postprocess_mets(self.mets_file, self.odem_configuration.get('ocr', 'ocrd_baseimage')) def validate_metadata(self): """Forward (optional) validation concerning @@ -499,34 +464,34 @@ def validate_metadata(self): """ check_ddb = False ignore_ddb = [] - if self.cfg.has_option('mets', 'ddb_validation'): - check_ddb = self.cfg.getboolean('mets', 'ddb_validation', fallback=False) - if self.cfg.has_option('mets', 'ddb_validation_ignore'): - raw_ignore_str = self.cfg.get('mets', 'ddb_validation_ignore') + if self.odem_configuration.has_option('mets', 'ddb_validation'): + check_ddb = self.odem_configuration.getboolean('mets', 'ddb_validation', fallback=False) + if self.odem_configuration.has_option('mets', 'ddb_validation_ignore'): + raw_ignore_str = self.odem_configuration.get('mets', 'ddb_validation_ignore') ignore_ddb = [i.strip() for i in raw_ignore_str.split(',')] - dtype = 'Aa' - if 'pica' in self.record.info: - dtype = self.record.info['pica'] + # dtype = 'Aa' + # if 'pica' in self.record.info: + # 
dtype = self.record.info['pica'] return validate(self.mets_file, validate_ddb=check_ddb, - digi_type=dtype, ddb_ignores=ignore_ddb) + digi_type=self.digi_type, ddb_ignores=ignore_ddb) def export_data(self): """re-do metadata and transform into output format""" - export_format: str = self.cfg.get('export', 'export_format', fallback=ExportFormat.SAF) - export_mets: bool = self.cfg.getboolean('export', 'export_mets', fallback=True) + export_format: str = self.odem_configuration.get('export', 'export_format', fallback=ExportFormat.SAF) + export_mets: bool = self.odem_configuration.getboolean('export', 'export_mets', fallback=True) - exp_dst = self.cfg.get('export', 'local_export_dir') - exp_tmp = self.cfg.get('export', 'local_export_tmp') - exp_col = self.cfg.get('export', 'export_collection') - exp_map = self.cfg.getdict('export', 'export_mappings') + exp_dst = self.odem_configuration.get('export', 'local_export_dir') + exp_tmp = self.odem_configuration.get('export', 'local_export_tmp') + exp_col = self.odem_configuration.get('export', 'export_collection') + exp_map = self.odem_configuration.getdict('export', 'export_mappings') # overwrite default mapping *.xml => 'mets.xml' # since we will have currently many more XML-files # created due OCR and do more specific mapping, though exp_map = {k: v for k, v in exp_map.items() if v != 'mets.xml'} if export_mets: exp_map[os.path.basename(self.mets_file)] = 'mets.xml' - saf_name = self.identifiers.get(CATALOG_ULB) + saf_name = self.mods_identifier if export_format == ExportFormat.SAF: export_result = df.export_data_from( self.mets_file, @@ -550,10 +515,8 @@ def export_data(self): tmp_zip_path, size = self._compress(os.path.dirname(work_dir), saf_name) path_export_processing = dfx._move_to_tmp_file(tmp_zip_path, exp_dst) export_result = path_export_processing, size - else: raise ODEMException(f'Unsupported export format: {export_format}') - self.the_logger.info("[%s] exported data: %s", self.process_identifier, export_result) if export_result: @@ -589,7 +552,6 @@ def statistics(self): def _compress(self, work_dir, archive_name): zip_file_path = os.path.join(os.path.dirname(work_dir), archive_name) + '.zip' - previous_dir = os.getcwd() os.chdir(os.path.join(work_dir, archive_name)) cmd = f'zip -q -r {zip_file_path} ./*' @@ -600,71 +562,102 @@ def _compress(self, work_dir, archive_name): return zip_file_path, f"{zip_size}MiB" -class OCRDPageParallel(ODEMProcess): - """Use page parallel workflow""" - - def run(self): - """Wrap specific OCR execution with - respect to number of executors""" +class ODEMPipelineRunner: + """Wrap actual ODEM process execution""" - if not self.cfg.has_option('ocr', 'ocrd_process_list'): - raise ODEMException("No option 'ocrd_process_list' in section: 'ocr'") + def __init__(self, identifier, n_executors, + internal_logger, odem_ocr_pipeline) -> None: + self.process_identifier = identifier + self.n_executors = n_executors + self.logger:logging.Logger = internal_logger + self.odem_ocr_pipeline: ODEMOCRPipeline = odem_ocr_pipeline - _outcomes = [(0, 0, 0, 0)] + def run(self): + input_data = self.odem_ocr_pipeline.get_input() + the_outcomes = [(0, 0, 0, 0)] if self.n_executors > 1: - _outcomes = self.run_parallel() + the_outcomes = self.run_parallel(input_data) else: - _outcomes = self.run_sequential() - if _outcomes: - self._statistics_ocr['outcomes'] = _outcomes - self.to_alto() - return _outcomes + the_outcomes = self.run_sequential(input_data) + return the_outcomes - def run_parallel(self): - """Run workflow parallel given 
poolsize""" + def run_parallel(self, input_data): + """Run workflow parallel with given executors""" - self.the_logger.info("[%s] %d images run_parallel by %d executors", - self.process_identifier, len(self.images_4_ocr), self.n_executors) + n_inputs = len(input_data) + self.logger.info("[%s] %d inputs run_parallel by %d executors", + self.process_identifier, n_inputs, self.n_executors) try: with concurrent.futures.ThreadPoolExecutor( max_workers=self.n_executors, - thread_name_prefix='odem' + thread_name_prefix='odem.ocrd' ) as executor: - outcomes = list(executor.map(self.ocrd_page, self.images_4_ocr)) - return outcomes + return list(executor.map(self.odem_ocr_pipeline.process, input_data)) except (OSError, AttributeError) as err: - self.the_logger.error(err) - raise RuntimeError(f"OCR-D parallel: {err.args[0]}") from err + self.logger.error(err) + raise ODEMException(f"ODEM parallel: {err.args[0]}") from err - def run_sequential(self): + def run_sequential(self, input_data): """run complete workflow plain sequential For debugging or small machines """ - _len_img = len(self.images_4_ocr) - _estm_min = _len_img * DEFAULT_RUNTIME_PAGE - self.the_logger.info("[%s] %d images run_sequential, estm. %dmin", - self.process_identifier, _len_img, _estm_min) + len_img = len(input_data) + estm_min = len_img * DEFAULT_RUNTIME_PAGE + self.logger.info("[%s] %d inputs run_sequential, estm. %dmin", + self.process_identifier, len_img, estm_min) try: - outcomes = [self.ocrd_page(_img) - for _img in self.images_4_ocr] + outcomes = [self.odem_ocr_pipeline.process(the_input) + for the_input in input_data] return outcomes except (OSError, AttributeError) as err: - self.the_logger.error(err) - raise RuntimeError(f"OCR-D sequential: {err.args[0]}") from err + self.logger.error(err) + raise ODEMException(f"ODEM sequential: {err.args[0]}") from err + + +class ODEMOCRPipeline: + """Base Interface""" + + @staticmethod + def create( + workflow_type: OdemWorkflowProcessType | str, + odem: ODEMProcess, + ) -> ODEMOCRPipeline: + if (workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT + or workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT.value): + return ODEMTesseract(odem) + return OCRDPageParallel(odem) + + def get_input(self) -> typing.List: + pass + + def process(self): + pass - def ocrd_page(self, image_4_ocr): + +class OCRDPageParallel(ODEMOCRPipeline): + """Use page parallel workflow""" + + def __init__(self, odem_process: ODEMProcess): + self.odem = odem_process + self.cfg = odem_process.odem_configuration + self.logger = odem_process.the_logger + + def get_input(self): + return self.odem.images_4_ocr + + def process(self, input_data): """Create OCR Data""" ocr_log_conf = os.path.join( PROJECT_ROOT, self.cfg.get('ocr', 'ocrd_logging')) # Preprare workspace with makefile - (image_path, ident) = image_4_ocr - os.chdir(self.work_dir_main) + (image_path, ident) = input_data + os.chdir(self.odem.work_dir_main) file_name = os.path.basename(image_path) file_id = file_name.split('.')[0] - page_workdir = os.path.join(self.work_dir_main, file_id) + page_workdir = os.path.join(self.odem.work_dir_main, file_id) if os.path.exists(page_workdir): shutil.rmtree(page_workdir, ignore_errors=True) os.mkdir(page_workdir) @@ -677,8 +670,8 @@ def ocrd_page(self, image_4_ocr): # init ocr-d workspace ocrd_workspace_setup(page_workdir, processed_image_path) - # # find out the needed model config for tesseract - model_config = self.map_language_to_modelconfig(image_path) + # find model config for tesseract + model_config = 
self.odem.map_language_to_modelconfig(image_path) stored = 0 mps = 0 @@ -692,14 +685,14 @@ def ocrd_page(self, image_4_ocr): (mps, dpi) = get_imageinfo(image_path) # how to identify data set? - if self.record: - _ident = self.process_identifier + if self.odem.record: + _ident = self.odem.process_identifier else: - _ident = os.path.basename(self.work_dir_main) + _ident = os.path.basename(self.odem.work_dir_main) # OCR Generation profiling = ('n.a.', 0) - container_name: str = f'{self.process_identifier}_{os.path.basename(page_workdir)}' + container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}' container_memory_limit: str = self.cfg.get('ocr', 'docker_container_memory_limit', fallback=None) container_user = self.cfg.get('ocr', 'docker_container_user', fallback=os.getuid()) container_timeout: int = self.cfg.getint( @@ -709,10 +702,10 @@ def ocrd_page(self, image_4_ocr): ) base_image = self.cfg.get('ocr', 'ocrd_baseimage') ocrd_process_list = self.cfg.getlist('ocr', 'ocrd_process_list') - tesseract_model_rtl: List[str] = self.cfg.getlist('ocr', 'tesseract_model_rtl', fallback=DEFAULT_RTL_MODELS) - ocrd_resources_volumes: Dict[str, str] = self.cfg.getdict('ocr', CFG_KEY_RES_VOL, fallback={}) + tesseract_model_rtl: typing.List[str] = self.cfg.getlist('ocr', 'tesseract_model_rtl', fallback=DEFAULT_RTL_MODELS) + ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict('ocr', CFG_KEY_RES_VOL, fallback={}) - if self.local_mode: + if self.odem.local_mode: container_name = os.path.basename(page_workdir) try: profiling = run_ocr_page( @@ -729,21 +722,21 @@ def ocrd_page(self, image_4_ocr): ) # will be unset in case of magic mocking for test if profiling: - self.the_logger.info("[%s] '%s' in %s (%.1fMP, %dDPI, %.1fMB)", + self.logger.info("[%s] '%s' in %s (%.1fMP, %dDPI, %.1fMB)", _ident, profiling[1], profiling[0], mps, dpi, filesize_mb) - self.the_logger.info("[%s] run ocr creation in '%s'", + self.logger.info("[%s] run ocr creation in '%s'", _ident, page_workdir) stored = self._store_fulltext(page_workdir, image_path) if stored: self._preserve_log(page_workdir, ident) except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc: - self.the_logger.error("[%s] image '%s' failed due to subprocess timeout: %s", + self.logger.error("[%s] image '%s' failed due to subprocess timeout: %s", _ident, base_image, exc) except Exception as plain_exc: - self.the_logger.error("[%s] generic exc '%s' for image '%s'", + self.logger.error("[%s] generic exc '%s' for image '%s'", _ident, plain_exc, base_image) - os.chdir(self.work_dir_main) + os.chdir(self.odem.work_dir_main) if self.cfg.getboolean('ocr', 'keep_temp_orcd_data', fallback=False) is False: shutil.rmtree(page_workdir, ignore_errors=True) return stored, 1, mps, filesize_mb @@ -754,7 +747,7 @@ def _preserve_log(self, work_subdir, image_ident): identifier (local section of system OAI handle)""" _root_log = self.cfg.get('global', 'local_log_dir') - _local_ident = self.process_identifier.replace('/', '_') + _local_ident = self.odem.process_identifier.replace('/', '_') _local_ocr_log = os.path.join(_root_log, _local_ident) if not os.path.exists(_local_ocr_log): os.makedirs(_local_ocr_log, exist_ok=True) @@ -762,13 +755,13 @@ def _preserve_log(self, work_subdir, image_ident): _org_log = os.path.join(work_subdir, 'ocrd.log') if os.path.exists(_org_log): _ts = time.strftime(ODEM_PAGE_TIME_FORMAT, time.localtime()) - _log_label = f'ocrd_odem_{self.process_identifier}_{image_ident}_{_ts}.log' + _log_label = 
f'ocrd_odem_{self.odem.process_identifier}_{image_ident}_{_ts}.log' _rebranded = os.path.join(work_subdir, _log_label) os.rename(_org_log, _rebranded) shutil.copy(_rebranded, _local_ocr_log) else: - self.the_logger.warning("[%s] No ocrd.log in %s", - self.process_identifier, work_subdir) + self.logger.warning("[%s] No ocrd.log in %s", + self.odem.process_identifier, work_subdir) def _store_fulltext(self, image_subdir, original_image_path) -> int: """Move OCR Result from Workspace Subdir to export folder if exists""" @@ -778,23 +771,23 @@ def _store_fulltext(self, image_subdir, original_image_path) -> int: old_id = os.path.basename(image_subdir) ocr_result_dir = os.path.join(image_subdir, 'PAGE') if not os.path.isdir(ocr_result_dir): - self.the_logger.info("[%s] no ocr results for '%s'", - self.process_identifier, ocr_result_dir) + self.logger.info("[%s] no ocr results for '%s'", + self.odem.process_identifier, ocr_result_dir) return 0 ocrs = [os.path.join(ocr_result_dir, ocr) for ocr in os.listdir(ocr_result_dir) if str(ocr).endswith('.xml')] - self.the_logger.debug("[%s] %s ocr files", - self.process_identifier, ocrs) + self.logger.debug("[%s] %s ocr files", + self.odem.process_identifier, ocrs) if ocrs and len(ocrs) == 1: # propably need to rename # since file now is like 'PAGE_01.xml' renamed = os.path.join(ocr_result_dir, old_id + '.xml') os.rename(ocrs[0], renamed) # regular case: OAI Workflow - if not self.local_mode: + if not self.odem.local_mode: # export to 'PAGE' dir - wd_fulltext = os.path.join(self.work_dir_main, 'PAGE') + wd_fulltext = os.path.join(self.odem.work_dir_main, 'PAGE') if not os.path.exists(wd_fulltext): os.mkdir(wd_fulltext) @@ -815,133 +808,60 @@ def to_alto(self) -> int: if len(_cnv) == 0 and n_candidates > 0: raise ODEMException(f"No OCR result for {n_candidates} candidates created!") self.ocr_files = _cnv - self.the_logger.info("[%s] converted '%d' files page-to-alto", - self.process_identifier, len(_cnv)) + self.logger.info("[%s] converted '%d' files page-to-alto", + self.odem.process_identifier, len(_cnv)) -class ODEMTesseract(ODEMProcess): +class ODEMTesseract(ODEMOCRPipeline): """Tesseract Runner""" - def run(self): - """Wrap specific OCR execution with - respect to number of executors""" - - _cfg = self.read_pipeline_config() - self._prepare_workdir_tmp() - _n_total = len(self.images_4_ocr) - self.ocr_input_paths = [(img, i, _n_total, self.the_logger, _cfg) - for i, img in enumerate(self.images_4_ocr, start=1)] - _outcomes = [(0, 0, 0, 0)] - if self.n_executors > 1: - _outcomes = self.run_parallel() - else: - _outcomes = self.run_sequential() - if _outcomes: - self._statistics_ocr['outcomes'] = _outcomes - return _outcomes - - def run_parallel(self): - """Run workflow parallel given poolsize""" - self.the_logger.info("[%s] %d images run_parallel by %d executors", - self.process_identifier, len(self.ocr_input_paths), self.n_executors) - try: - with concurrent.futures.ThreadPoolExecutor( - max_workers=self.n_executors, - thread_name_prefix='odem' - ) as executor: - outcomes = list(executor.map(run_pipeline, self.ocr_input_paths)) - return outcomes - except (OSError, AttributeError) as err: - self.the_logger.error(err) - raise RuntimeError(f"OCR-D parallel: {err.args[0]}") from err - - def run_sequential(self): - """run complete workflow plain sequential - For debugging or small machines""" - _len_img = len(self.ocr_input_paths) - _estm_min = _len_img * DEFAULT_RUNTIME_PAGE - self.the_logger.info("[%s] %d images run_sequential, estm. 
%dmin", - self.process_identifier, _len_img, _estm_min) - try: - outcomes = [run_pipeline(_img) - for _img in self.ocr_input_paths] - return outcomes - except (OSError, AttributeError) as err: - self.the_logger.error(err) - raise RuntimeError(f"OCR-D sequential: {err.args[0]}") from err - - def read_pipeline_config(self, path_cfg=None) -> configparser: - """Read and process additional pipeline configuration""" - - _path_cfg = path_cfg - if path_cfg is None: - if self.cfg.has_option('ocr', 'ocr_pipeline_config'): - _path_cfg = os.path.abspath(self.cfg.get('ocr', 'ocr_pipeline_config')) - if not os.path.isfile(_path_cfg): - raise ODEMException(f"Invalid ocr-pipeline conf {_path_cfg}") - _cfg = configparser.ConfigParser() - _cfg.read(_path_cfg) - self.pipeline_config = _cfg - return _cfg - - def _prepare_workdir_tmp(self): - workdir_tmp = self.cfg.get('ocr', 'ocr_pipeline_workdir_tmp') - self.the_logger.warning("no workdir set, use '%s'", workdir_tmp) - if not os.path.isdir(workdir_tmp): - if os.access(workdir_tmp, os.W_OK): - os.makedirs(workdir_tmp) - else: - self.the_logger.warning("tmp workdir '%s' not writable, use /tmp", - workdir_tmp) - workdir_tmp = '/tmp/ocr-pipeline-workdir' - if os.path.exists(workdir_tmp): - self._clean_workdir(workdir_tmp) - os.makedirs(workdir_tmp, exist_ok=True) - else: - self._clean_workdir(workdir_tmp) - return workdir_tmp - - def _clean_workdir(self, the_dir): - self.the_logger.info("clean existing workdir '%s'", the_dir) - for file_ in os.listdir(the_dir): - fpath = os.path.join(the_dir, file_) - if os.path.isfile(fpath): - os.unlink(fpath) - - def store_estimations(self, estms): - """Postprocessing of OCR-Quality Estimation Data""" - - valids = [r for r in estms if r[1] != -1] - invalids = [r for r in estms if r[1] == -1] - sorteds = sorted(valids, key=lambda r: r[1]) - aggregations = analyze(sorteds) - end_time = time.strftime('%Y-%m-%d_%H-%M', time.localtime()) - if not os.path.isdir(self.work_dir_main): - self.the_logger.warning('unable to choose store for estm data: %s', - str(self.work_dir_main)) - return - - file_name = os.path.basename(self.work_dir_main) - file_path = os.path.join( - self.work_dir_main, f"{file_name}_{end_time}.wtr") - self.the_logger.info("store mean '%.3f' in '%s'", - aggregations[0], file_path) - if aggregations: - (mean, bins) = aggregations - b_1 = len(bins[0]) - b_2 = len(bins[1]) - b_3 = len(bins[2]) - b_4 = len(bins[3]) - b_5 = len(bins[4]) - n_v = len(valids) - n_i = len(invalids) - self.the_logger.info("WTE (Mean): '%.1f' (1: %d/%d, ... 
5: %d/%d)", - mean, b_1, n_v, b_5, n_v) - with open(file_path, 'w', encoding="UTF-8") as outfile: - outfile.write( - f"{mean},{b_1},{b_2},{b_3},{b_4},{b_5},{len(estms)},{n_i}\n") - for s in sorteds: - outfile.write( - f"{s[0]},{s[1]:.3f},{s[2]},{s[3]},{s[4]},{s[5]},{s[6]},{s[7]}\n") - outfile.write("\n") - return file_path + def __init__(self, odem_process: ODEMProcess): + self.odem = odem_process + self.odem_configuration = odem_process.odem_configuration + self.logger = odem_process.the_logger + self.pipeline_configuration = None + + def get_input(self): + images_4_ocr = self.odem.images_4_ocr + n_total = len(images_4_ocr) + pipeline_cfg = self.read_pipeline_config() + input_data = [(img, i, n_total, self.logger, pipeline_cfg) + for i, img in enumerate(self.odem.images_4_ocr, start=1)] + return input_data + + def process(self, input_data): + + image_path = input_data[0][0] + pipeline_result = run_pipeline(input_data) + stored = pipeline_result is not None + mps = 0 + filesize_mb = 0 + filestat = os.stat(image_path) + if filestat: + filesize_mb = filestat.st_size / 1048576 + (mps, _) = get_imageinfo(image_path) + return stored, 1, mps, filesize_mb + + def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser: + """Read pipeline configuration and replace + model_configs with known language data""" + + if self.pipeline_configuration is None: + if path_config is None: + if self.odem_configuration.has_option('ocr', 'ocr_pipeline_config'): + path_config = os.path.abspath(self.odem_configuration.get('ocr', 'ocr_pipeline_config')) + if not os.path.isfile(path_config): + raise ODEMException(f"no ocr-pipeline conf {path_config} !") + pipe_cfg = configparser.ConfigParser() + pipe_cfg.read(path_config) + self.logger.info(f"use config '{path_config}'") + for sect in pipe_cfg.sections(): + if pipe_cfg.has_option(sect, 'model_configs'): + known_langs = self.odem._statistics_ocr.get(STATS_KEY_LANGS) + model_files = self.odem.language_modelconfig(known_langs) + models = model_files.replace('.traineddata','') + pipe_cfg.set(sect, 'model_configs', models) + if pipe_cfg.has_option(sect, STEP_MOVE_PATH_TARGET): + pipe_cfg.set(sect, STEP_MOVE_PATH_TARGET, f'{self.odem.work_dir_main}/FULLTEXT') + self.pipeline_configuration = pipe_cfg + return self.pipeline_configuration diff --git a/lib/ocrd3_odem/processing_mets.py b/lib/ocrd3_odem/processing_mets.py index 2fb43a2..c6ab965 100644 --- a/lib/ocrd3_odem/processing_mets.py +++ b/lib/ocrd3_odem/processing_mets.py @@ -15,10 +15,11 @@ TYPE_PRINTS_PICA = ['a', 'f', 'F', 'Z', 'B'] TYPE_PRINTS_LOGICAL = ['monograph', 'volume', 'issue', 'additional'] -CATALOG_ULB = 'gvk-ppn' -CATALOG_OTH = 'gbv-ppn' -CATALOG_SWB = 'swb-ppn' # SLUB OAI related -CATALOGUE_IDENTIFIERS = [CATALOG_ULB, CATALOG_OTH, CATALOG_SWB] +CATALOG_ULB = 'gvk-ppn' +CATALOG_ULB2 = 'kxp-ppn' # ULB ZD related +CATALOG_OTH = 'gbv-ppn' +CATALOG_SWB = 'swb-ppn' # SLUB OAI related +CATALOGUE_IDENTIFIERS = [CATALOG_ULB, CATALOG_ULB2, CATALOG_OTH, CATALOG_SWB] RECORD_IDENTIFIER = 'recordIdentifier' Q_XLINK_HREF = '{http://www.w3.org/1999/xlink}href' METS_AGENT_ODEM = 'DFG-OCRD3-ODEM' @@ -72,7 +73,7 @@ def _get_report(self): raise ODEMMetadataMetsException(_err) from _err return self._report - def inspect(self): + def metadata_report(self) -> df.MetsReaderReport: """Gather knowledge about digital object's. 
First, try to determin what kind of retro-digit we are handling by inspecting it's final PICA mark @@ -93,6 +94,7 @@ def inspect(self): self.inspect_metadata_images() if not any(ident in CATALOGUE_IDENTIFIERS for ident in report.identifiers): raise ODEMMetadataMetsException(f"No {CATALOGUE_IDENTIFIERS} in {self.process_identifier}") + return report @property def identifiers(self): @@ -100,13 +102,19 @@ def identifiers(self): return self._get_report().identifiers @property - def record_identifier(self): - """Get main MODS recordIdentifier if present""" - _idents = self._get_report().identifiers - if CATALOG_ULB in _idents: - return _idents[CATALOG_ULB] - elif CATALOG_OTH in _idents: - return _idents[CATALOG_OTH] + def mods_record_identifier(self): + """Get main MODS recordIdentifier if present + guess if more than 1 ppn-like entry exist + """ + idents = dict(self._get_report().identifiers) + if 'urn' in idents: + del idents['urn'] + if len(idents) == 1: + return list(idents.values())[0] + if CATALOG_ULB in idents: + return idents[CATALOG_ULB] + elif CATALOG_OTH in idents: + return idents[CATALOG_OTH] else: _proc_in = self.process_identifier if ':' in _proc_in: @@ -274,9 +282,10 @@ def integrate_ocr_file(xml_tree, ocr_files: typing.List) -> int: # Assignment takes place via the name of the corresponding # image (= name ALTO file) _mproc = df.MetsProcessor(_ocr_file) - src_info = _mproc.tree.xpath('//alto:sourceImageInformation/alto:fileName', namespaces=df.XMLNS)[0] + ns_map = _sanitize_namespaces(_mproc.tree) + src_info = _mproc.tree.xpath('//alto:sourceImageInformation/alto:fileName', namespaces=ns_map)[0] src_info.text = f'{_file_name}.jpg' - first_page_el = _mproc.tree.xpath('//alto:Page', namespaces=df.XMLNS)[0] + first_page_el = _mproc.tree.xpath('//alto:Page', namespaces=ns_map)[0] first_page_el.attrib['ID'] = f'p{_file_name}' _mproc.write() _n_linked_ocr += _link_fulltext(new_id, xml_tree) @@ -284,6 +293,14 @@ def integrate_ocr_file(xml_tree, ocr_files: typing.List) -> int: return _n_linked_ocr +def _sanitize_namespaces(tree): + ns_map = tree.nsmap + if None in ns_map and '/alto/' in ns_map[None]: + mapping = ns_map[None] + ns_map = {'alto': mapping} + return ns_map + + def _link_fulltext(file_ident, xml_tree): file_name = file_ident.split('_')[-1] xp_files = f'.//mets:fileGrp[@USE="{FILEGROUP_IMG}"]/mets:file' @@ -372,3 +389,19 @@ def validate(mets_file:str, ddb_ignores, except df.DigiflowDDBException as ddb_err: raise ODEMException(ddb_err.args[0]) from ddb_err return True + + +def extract_text_content(ocr_files: typing.List) -> str: + """Extract textual content from ALTO files' String element + """ + sorted_files = sorted(ocr_files) + txt_contents = [] + for ocr_file in sorted_files: + with open(ocr_file, mode='r', encoding='UTF-8') as _ocr_file: + ocr_root = ET.parse(_ocr_file).getroot() + ns_map = _sanitize_namespaces(ocr_root) + all_lines = ocr_root.findall('.//alto:TextLine', ns_map) + for single_line in all_lines: + line_strs = [s.attrib['CONTENT'] for s in single_line.findall('.//alto:String', ns_map)] + txt_contents.append(' '.join(line_strs)) + return txt_contents diff --git a/lib/ocrd3_odem/processing_ocr_pipeline.py b/lib/ocrd3_odem/processing_ocr_pipeline.py index 91dd1f7..14d648d 100644 --- a/lib/ocrd3_odem/processing_ocr_pipeline.py +++ b/lib/ocrd3_odem/processing_ocr_pipeline.py @@ -1,40 +1,26 @@ """Processing OCR-Pipeline""" +import abc +import collections +import configparser +import logging import os import re import shutil import subprocess import sys import time 
+import typing -from abc import ( - ABC, abstractmethod -) -from collections import ( - OrderedDict -) -from configparser import ( - ConfigParser, -) -from typing import ( - Dict, - List, - Tuple -) +from pathlib import Path -import lxml.etree as ET import requests -from digiflow import ( - write_xml_file, -) - -from .odem_commons import ( - ODEMException, -) -from .ocr_model import ( - TextLine, - get_lines, -) + +import digiflow as df +import lxml.etree as ET + +from .odem_commons import ODEMException +from .ocr_model import TextLine, get_lines NAMESPACES = {'alto': 'http://www.loc.gov/standards/alto/ns-v3#'} @@ -43,6 +29,8 @@ DEFAULT_LANGTOOL_LANG = 'de-DE' DEFAULT_LANGTOOL_RULE = 'GERMAN_SPELLER_RULE' +STEP_MOVE_PATH_TARGET = 'path_target' + # python process-wrapper os.environ['OMP_THREAD_LIMIT'] = '1' @@ -53,26 +41,24 @@ class StepException(Exception): """Mark Step Execution Exception""" -class StepI(ABC): +class StepI(abc.ABC): """step that handles input data""" - @abstractmethod + @abc.abstractmethod def execute(self): """Step Action to execute""" @property - def path_in(self): + def path_in(self) -> Path: """Input data path""" return self._path_in @path_in.setter def path_in(self, path_in): - if not os.path.exists(path_in): - raise RuntimeError('path {} invalid'.format(path_in)) - if not isinstance(path_in, str): - path_in = str(path_in) + path_in = Path(path_in).absolute() + if not path_in.exists(): + raise StepException(f"Path '{path_in}' invalid!") self._path_in = path_in - (self._path_in_dir, self._filename) = split_path(self._path_in) class StepIO(StepI): @@ -81,18 +67,18 @@ class StepIO(StepI): def __init__(self): super().__init__() self._filename = None - self._path_in_dir = None - self._path_next = None - self._path_next_dir = None + self._path_next: Path = None @property - def path_next(self): + def path_next(self) -> Path: """calculate path_out for result data""" + if self._path_next is None: + self._path_next = Path(self._path_in) return self._path_next @path_next.setter def path_next(self, path_next): - self._path_next = path_next + self._path_next = Path(path_next).absolute() class StepIOExtern(StepIO): @@ -102,8 +88,11 @@ def __init__(self, params): super().__init__() self._cmd = None self._bin = None + self._env = None + if not isinstance(params, dict): + raise StepException(f"Invalid params '{params}'!") try: - self._params = OrderedDict(params) + self._params = collections.OrderedDict(params) if 'type' in self._params: del self._params['type'] except ValueError as exc: @@ -111,7 +100,14 @@ def __init__(self, params): raise StepException(msg) from exc def execute(self): - subprocess.run(self.cmd, shell=True, check=True) + try: + completed_process = subprocess.run(self.cmd, + shell=True, + capture_output=True, + check=True, env=self._env) + return completed_process + except subprocess.SubprocessError as sub_exc: + raise StepException(sub_exc) from sub_exc @property def cmd(self): @@ -126,14 +122,18 @@ def cmd(self, cmd): class StepTesseract(StepIOExtern): """Central Call to Tessract OCR""" - def __init__(self, params: Dict): + def __init__(self, params: typing.Dict): super().__init__(params) self._bin = 'tesseract' + self._tessdata = None if 'tesseract_bin' in self._params: self._bin = self._params['tesseract_bin'] del self._params['tesseract_bin'] if 'path_out_dir' in self._params: self._path_out_dir = self._params['path_out_dir'] + if 'tessdata_prefix' in self._params: + self._tessdata = self._params['tessdata_prefix'] + del self._params['tessdata_prefix'] # common 
process params # where to store alto data, dpi and language @@ -155,8 +155,9 @@ def __init__(self, params: Dict): if 'output_configs' in self._params: del self._params['output_configs'] # otherwise output - outputs = [k for k, v in self._params.items() if v is None and k in [ - 'alto', 'txt', 'pdf']] + outputs = [k for k, v in self._params.items() + if v is None and k in ['alto', 'txt', 'pdf'] + ] if len(outputs) > 0: for output in outputs: del self._params[output] @@ -166,17 +167,9 @@ def __init__(self, params: Dict): @property def path_next(self): - _filename = self._filename - if not _filename.endswith('.xml'): - _filename += '.xml' - - # calculate abs path - self._path_next = None - if self._path_next_dir: - self._path_next = os.path.join(self._path_next_dir, _filename) - else: - self._path_next = os.path.join(self._path_in_dir, _filename) - return self._path_next + if not self._path_in.suffix == '.xml': + return self._path_in.with_suffix('.xml') + return self._path_in @property def cmd(self): @@ -184,6 +177,8 @@ def cmd(self): Update Command with specific in/output paths """ out_file = os.path.splitext(self.path_next)[0] + if self._tessdata is not None: + self._env = {"TESSDATA_PREFIX" : self._tessdata} self._cmd = f"{self._bin} {self.path_in} {out_file} {dict2line(self._params, ' ')}" return self._cmd @@ -204,7 +199,7 @@ def parse_dict(the_dict): class StepPostReplaceChars(StepIO): """Postprocess: Replace suspicious character sequences""" - def __init__(self, params: Dict): + def __init__(self, params: typing.Dict): super().__init__() dict_chars = params.get('dict_chars', '{}') self.dict_chars = parse_dict(dict_chars) @@ -267,7 +262,7 @@ def statistics(self): class StepPostReplaceCharsRegex(StepPostReplaceChars): """Postprocess: Replace via regular expressions""" - def __init__(self, params: Dict): + def __init__(self, params: typing.Dict): super().__init__({}) self.pattern = params['pattern'] self.old = params['old'] @@ -276,7 +271,6 @@ def __init__(self, params: Dict): def _replace(self, lines): for line in lines: - # for string_element in self.regex_replacements: matcher = re.search(self.pattern, line) if matcher: match = matcher.group(1) @@ -287,31 +281,25 @@ def _replace(self, lines): class StepPostMoveAlto(StepIO): - """Postprocess: move Alto file to original scandata folder""" + """Postprocess: move output to desired directory""" - def __init__(self, params: Dict): + def __init__(self, params: typing.Dict): super().__init__() - if 'path_target' in params: - self._path_out = params['path_target'] + if STEP_MOVE_PATH_TARGET in params: + self._path_out = Path(params[STEP_MOVE_PATH_TARGET]) def execute(self): - shutil.copyfile(self._path_in, self._path_out) - - @property - def path_next(self): - (folder, _) = split_path(self._path_out) - return os.path.join(folder, self._filename + '.xml') - - @path_next.setter - def path_next(self, path_target): - (folder, _) = split_path(path_target) - self._path_out = os.path.join(folder, self._filename + '.xml') + if not self._path_out.exists(): + self._path_out.mkdir(parents=True) + path_target = self._path_out / self._path_in.name + os.rename(self._path_in, path_target) + self._path_next = path_target class StepPostRemoveFile(StepI): """Cleanup and remove temporal TIF-Files before they flood the Discs""" - def __init__(self, params: Dict): + def __init__(self, params: typing.Dict): super().__init__() self._file_removed = False self._suffix = params.get('file_suffix', 'tif') @@ -331,7 +319,7 @@ def is_removed(self): class 
StepEstimateOCR(StepI): """Estimate OCR-Quality of current run by using Web-Service language-tool""" - def __init__(self, params: Dict): + def __init__(self, params: typing.Dict): super().__init__() self.service_url = params.get('service_url', DEFAULT_LANGTOOL_URL) self.lang = params.get('language', DEFAULT_LANGTOOL_LANG) @@ -417,7 +405,7 @@ def statistics(self): self.n_lines_out) -def textlines2data(lines: List[TextLine], minlen: int = 2) -> Tuple: +def textlines2data(lines: typing.List[TextLine], minlen: int = 2) -> typing.Tuple: """Transform text lines after preprocessing into data set""" non_empty_lines = [l.get_textline_content() @@ -528,8 +516,7 @@ def execute(self): # remove empty sections drop_empty_contents(xml_root) - - write_xml_file(xml_root, self.path_in) + df.write_xml_file(xml_root, self.path_in) @staticmethod def _append_source_infos(descr_tree, file_name, namespace): @@ -587,73 +574,69 @@ def profile(func): def run_pipeline(*args): """Wrap run ocr-pipeline""" - _start_path = args[0][0] - if isinstance(_start_path, Tuple): - _start_path = _start_path[0] - _number = args[0][1] - _total = args[0][2] - _logger = args[0][3] - _step_config: ConfigParser = args[0][4] - batch_label = f"{_number:04d}/{_total:04d}" - next_in = _start_path - file_name = os.path.basename(_start_path) + start_path = args[0][0] + if isinstance(start_path, typing.Tuple): + start_path = start_path[0] + n_curr = args[0][1] + n_total = args[0][2] + the_logger: logging.Logger = args[0][3] + step_config: configparser.ConfigParser = args[0][4] + batch_label = f"{n_curr:04d}/{n_total:04d}" + next_in = start_path + file_name = os.path.basename(start_path) outcome = (file_name, MARK_MISSING_ESTM) try: - the_steps = init_steps(_step_config) - _logger.info("[%s] [%s] start pipeline with %d steps", + the_steps = init_steps(step_config) + the_logger.info("[%s] [%s] start pipeline with %d steps", file_name, batch_label, len(the_steps)) - - # for step in STEPS: for step in the_steps: step.path_in = next_in if isinstance(step, StepIOExtern): - _logger.debug("[%s] call '%s'", file_name, step.cmd) - # the actual execution + the_logger.debug("[%s] call '%s' (env: '%s')", + file_name, step.cmd, step._env) profile_result = profile(step.execute) - # log current step if hasattr(step, 'statistics') and len(step.statistics) > 0: if profile_result and isinstance(step, StepEstimateOCR): outcome = (file_name,) + step.statistics - _logger.info("[%s] %s, statistics: %s", + the_logger.info("[%s] %s, statistics: %s", file_name, profile_result, str(step.statistics)) else: - _logger.debug("[%s] %s", file_name, profile_result) - # prepare next step + the_logger.debug("[%s] %s", file_name, profile_result) if hasattr(step, 'path_next') and step.path_next is not None: - _logger.debug("[%s] step.path_next: %s", + the_logger.debug("[%s] step.path_next: %s", file_name, step.path_next) next_in = step.path_next - _logger.info("[%s] [%s] done pipeline with %d steps", + the_logger.info("[%s] [%s] done pipeline with %d steps", file_name, batch_label, len(the_steps)) return outcome # if a single step-based images crashes, we will go on anyway - except StepException as exc: - _logger.error( - "[%s] %s: %s", - _start_path, - step, - exc.args[0]) + except (StepException) as exc: + the_logger.error( + "[%s] %s: %s", start_path, step, exc.args) raise ODEMException(exc) from exc # OSError means something really severe, like # non-existing resources/connections that will harm # all images in pipeline, therefore signal halt except OSError as os_exc: - 
_logger.critical("[%s] %s: %s", _start_path, step, str(os_exc)) + the_logger.critical("[%s] %s: %s", start_path, step, os_exc.args) + sys.exit(1) + except Exception as generic_exc: + the_logger.critical("[%s] %s: %s", start_path, step, generic_exc.args) sys.exit(1) -def init_steps(steps_config: ConfigParser) -> List[StepI]: +def init_steps(steps_config: configparser.ConfigParser) -> typing.List[StepI]: """ Create all configured steps (each time again) labeled like 'step_01', step_02' and so forth to ensure their sequence """ - steps: List[StepI] = [] + steps: typing.List[StepI] = [] step_configs = [ s for s in steps_config.sections() if s.startswith('step_')] sorted_steps = sorted(step_configs, key=lambda s: int(s.split('_')[1])) @@ -669,14 +652,6 @@ def init_steps(steps_config: ConfigParser) -> List[StepI]: return steps -def split_path(path_in): - """create tuple with dirname and filename (minus ext)""" - path_in_folder = os.path.dirname(path_in) - file_name_in = path_in.split(os.sep)[-1] - filename = file_name_in.split('.')[0] - return (path_in_folder, filename) - - def dict2line(the_dict, the_glue): """create string from dictionary""" def impl(key, val, glue): diff --git a/resources/odem.pipeline.ini b/resources/odem.ocr-pipeline.ini similarity index 98% rename from resources/odem.pipeline.ini rename to resources/odem.ocr-pipeline.ini index 74ec38b..0f766e5 100644 --- a/resources/odem.pipeline.ini +++ b/resources/odem.ocr-pipeline.ini @@ -26,7 +26,7 @@ max_vmem_percentage = 75 # use OCRD parallel workflow or ocr-pipeline workflow with tesseract # possible are: OCRD_PAGE_PARALLEL or ODEM_TESSERACT workflow_type = ODEM_TESSERACT -ocr_pipeline_config = /tesseract_pipeline_config.ini +ocr_pipeline_config = odem.ocr-pipeline.steps.ini # how many OCR-D containers to start in parallel mode n_executors = 12 diff --git a/resources/odem.ocr-pipeline.steps.ini b/resources/odem.ocr-pipeline.steps.ini new file mode 100644 index 0000000..e88977d --- /dev/null +++ b/resources/odem.ocr-pipeline.steps.ini @@ -0,0 +1,30 @@ +[pipeline] +logdir = /opt/ocr-pipeline/logdir +workdir = /opt/ocr-pipeline/workdir +file_ext = tif,jpg,png,jpeg +logger_name = ocr_pipeline + +# write marker into scandata dir +mark_open = +mark_done = ocr_pipeline_done +mark_fail = ocr_pipeline_fail +mark_lock = ocr_pipeline_busy + +# tesseract specific configs like TESSDATA_PREFIX +[step_01] +type = StepTesseract +tesseract_bin = tesseract +tessdata_prefix = /data/ocr/tesseract4/tessdata +model_configs = frk+deu +output_configs = alto + +# replace 'J's with regex +[step_02] +type = StepPostReplaceCharsRegex +pattern = r'(J[cdhmn])' +old = 'J' +new = 'I' + +[step_03] +type = StepPostMoveAlto +path_target = FULLTEXT \ No newline at end of file diff --git a/resources/tesseract_pipeline_config.ini b/resources/tesseract_pipeline_config.ini deleted file mode 100644 index 12e0641..0000000 --- a/resources/tesseract_pipeline_config.ini +++ /dev/null @@ -1,27 +0,0 @@ -[pipeline] -file_ext = tif,jpg,png,jpeg -executors = 8 -logger_name = ocr_pipeline - -# tesseract specific config -[step_01] -type = StepTesseract -tesseract_bin = tesseract -output_configs = alto - -# additional config for replacement -[step_02] -type = StepPostReplaceChars -dict_chars = {'ic)': 'ich', 's<': 'sc', '<': 'c'} - -;# additional config for post-dict-lookup -;[step_03] -;type = StepEstimateOCR -;active = True -;service_url = http://localhost:8010/v2/check -;language = de-DE -;enabled_rules = GERMAN_SPELLER_RULE - -# optional: post-processing ALTO-XML -[step_04] -type 
= StepPostprocessALTO diff --git a/tests/conftest.py b/tests/conftest.py index 7c8a1d7..642bee7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -79,8 +79,8 @@ def _module_fixture_123456789_27949(tmp_path_factory): _model_dir = prepare_tessdata_dir(path_workdir) record = OAIRecord('oai:dev.opendata.uni-halle.de:123456789/27949') _oproc = ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log') - _oproc.cfg = fixture_configuration() - _oproc.cfg.set('ocr', CFG_KEY_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') + _oproc.odem_configuration = fixture_configuration() + _oproc.odem_configuration.set('ocr', CFG_KEY_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') _oproc.ocr_files = [os.path.join(trgt_alto, a) for a in os.listdir(trgt_alto)] _oproc.mets_file = str(trgt_mets) diff --git a/tests/resources/1516514412012_175762.xml b/tests/resources/1516514412012_175762.xml new file mode 100644 index 0000000..cb4428b --- /dev/null +++ b/tests/resources/1516514412012_175762.xml @@ -0,0 +1,223 @@ + + + + + DigitalDerivans V1.8.5 + PDF FileGroup for PDF_16691561019210131 created at 2024-01-25T09:38:87 + + 6999 + + + + + + text + + 1921-01-31 + + + Universitäts- und Landesbibliothek Sachsen-Anhalt + + Halle (Saale) + + 2024 + + urn:nbn:de:gbv:3:1-171133730-16691561019210131-18 + + 16691561019210131 + 29-10-94 + 04-09-22 + + + Klassenkampf + + + + 1774602105 + + 3098062-8 + + 347407-0 + + 166915610 + + + + Klassenkampf + + + + ger + Latf + + + AZ + 1516514412012/175735 + !2000,2000 + + + Universitäts- und Landesbibliothek Sachsen-Anhalt + Fi 17 R + + 090 + + + 1. Jahrgang + + + Nr. 25 + + + Public Domain Mark 1.0 + + + + + + + + + text + + Beilage zum Klassenkampf. + + + + + + + + + + text + + Leben Wissen Kunst + + + + + + + + + + + Universitäts- und Landesbibliothek Sachsen-Anhalt + https://opendata2.uni-halle.de/image/mets_viewerLogo.gif + http://www.bibliothek.uni-halle.de + mailto:auskunft@bibliothek.uni-halle.de + https://creativecommons.org/publicdomain/mark/1.0/ + + + + + + + + + https://opac.lbs-halle.gbv.de/DB=1/XMLPRS=N/PPN?PPN=166915610 + https://opendata2.uni-halle.de//handle/1516514412012/175762 + https://opendata2.uni-halle.de//json/iiif/1516514412012/175762/c8a25f28-35a6-45e9-81ad-197955e0ed59/manifest + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/resources/1516514412012_175762_00000003.xml b/tests/resources/1516514412012_175762_00000003.xml new file mode 100644 index 0000000..3f60ccf --- /dev/null +++ b/tests/resources/1516514412012_175762_00000003.xml @@ -0,0 +1,8498 @@ + + + + pixel + + 00000003.jpg + + + + + tesseract 5.3.4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
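Both the production pipeline steps file (resources/odem.ocr-pipeline.steps.ini, added above) and the test fixture diffed below (tests/resources/ocr_config_full.ini) use the same layout: one 'step_NN' section per pipeline stage, a 'type' option naming the step class, and all remaining options passed through as params. init_steps() resolves those sections in numeric order; the following is only a rough sketch of that resolution, assuming the step classes can be looked up by name in the pipeline module (the factory body is only partially visible in this diff):

import configparser

import lib.ocrd3_odem.processing_ocr_pipeline as o3o_pop


def build_steps(path_steps_ini: str):
    """Sketch: turn 'step_NN' sections into step objects, ordered by NN."""
    cfg = configparser.ConfigParser()
    cfg.read(path_steps_ini)
    sections = [s for s in cfg.sections() if s.startswith('step_')]
    steps = []
    for section in sorted(sections, key=lambda s: int(s.split('_')[1])):
        params = dict(cfg[section])                  # e.g. {'type': 'StepTesseract', 'model_configs': ...}
        step_cls = getattr(o3o_pop, params['type'])  # assumption: class resolvable by its name
        steps.append(step_cls(params))               # step classes drop or ignore the 'type' entry
    return steps

# usage roughly mirrors the tests: steps[0] is the Tesseract call,
# later steps post-process the ALTO output and move it to its target directory
# steps = build_steps('resources/odem.ocr-pipeline.steps.ini')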
diff --git a/tests/resources/ocr_config_full.ini b/tests/resources/ocr_config_full.ini index 66dca08..10261f0 100644 --- a/tests/resources/ocr_config_full.ini +++ b/tests/resources/ocr_config_full.ini @@ -12,7 +12,7 @@ mark_prev = ocr_busy [step_01] type = StepTesseract tesseract_bin = tesseract -model_configs = frk+deu +model_configs = frk+lat output_configs = alto # additional config for replacement diff --git a/tests/test_ocrd3_odem.py b/tests/test_ocrd3_odem.py index 899a646..e57ecc6 100644 --- a/tests/test_ocrd3_odem.py +++ b/tests/test_ocrd3_odem.py @@ -49,9 +49,9 @@ def test_mapping_from_imagefilename(img_path, lang_str, tmp_path): log_dir = tmp_path / 'log' log_dir.mkdir() odem_processor = ODEMProcess(None, work_dir=str(work_2)) - odem_processor.cfg = fixture_configuration() + odem_processor.odem_configuration = fixture_configuration() _tess_dir = prepare_tessdata_dir(tmp_path) - odem_processor.cfg.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, + odem_processor.odem_configuration.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, f'{_tess_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') odem_processor.the_logger = get_logger(str(log_dir)) odem_processor.local_mode = True @@ -79,14 +79,14 @@ def test_exchange_language(img_path, langs, models, tmp_path): log_dir = tmp_path / 'log' log_dir.mkdir() odem_processor = ODEMProcess(None, work_dir=str(work_2)) - odem_processor.cfg = fixture_configuration() + odem_processor.odem_configuration = fixture_configuration() _tess_dir =
prepare_tessdata_dir(tmp_path) - odem_processor.cfg.set( + odem_processor.odem_configuration.set( CFG_SEC_OCR, CFG_KEY_RES_VOL, f"{_tess_dir}:/dummy" ) - odem_processor.cfg.set(CFG_SEC_OCR, KEY_LANGUAGES, langs) + odem_processor.odem_configuration.set(CFG_SEC_OCR, KEY_LANGUAGES, langs) odem_processor.the_logger = get_logger(str(log_dir)) odem_processor.local_mode = True @@ -111,16 +111,16 @@ def test_enforce_language_and_model_mapping(tmp_path): log_dir = tmp_path / 'log' log_dir.mkdir() odem_processor = ODEMProcess(None, work_dir=str(work_2)) - odem_processor.cfg = fixture_configuration() + odem_processor.odem_configuration = fixture_configuration() _tess_dir = prepare_tessdata_dir(tmp_path) _kraken_dir = prepare_kraken_dir(tmp_path) - odem_processor.cfg.set( + odem_processor.odem_configuration.set( CFG_SEC_OCR, CFG_KEY_RES_VOL, f'{_tess_dir}:/dummy,{_kraken_dir}:/dummy' ) - odem_processor.cfg.set(CFG_SEC_OCR, KEY_LANGUAGES, 'ara+fas') - odem_processor.cfg.set( + odem_processor.odem_configuration.set(CFG_SEC_OCR, KEY_LANGUAGES, 'ara+fas') + odem_processor.odem_configuration.set( CFG_SEC_OCR, KEY_MODEL_MAP, 'fas: fas.traineddata, ara:arabic_best.mlmodel' @@ -129,14 +129,14 @@ def test_enforce_language_and_model_mapping(tmp_path): odem_processor.local_mode = True # act 1st - odem_processor.cfg.set(CFG_SEC_OCR, CFG_KEY_MODEL_COMBINABLE, 'False') + odem_processor.odem_configuration.set(CFG_SEC_OCR, CFG_KEY_MODEL_COMBINABLE, 'False') assert odem_processor.map_language_to_modelconfig('/data/img/0001.tif') == 'arabic_best.mlmodel' # act 2nd - odem_processor.cfg.set(CFG_SEC_OCR, CFG_KEY_MODEL_COMBINABLE, 'True') + odem_processor.odem_configuration.set(CFG_SEC_OCR, CFG_KEY_MODEL_COMBINABLE, 'True') assert odem_processor.map_language_to_modelconfig('/data/img/0002.tif') == 'arabic_best.mlmodel+fas.traineddata' # act 3rd call. 
still only fas:fas - odem_processor.cfg.set(CFG_SEC_OCR, CFG_KEY_MODEL_COMBINABLE, 'False') - odem_processor.cfg.set(CFG_SEC_OCR, KEY_LANGUAGES, 'fas') + odem_processor.odem_configuration.set(CFG_SEC_OCR, CFG_KEY_MODEL_COMBINABLE, 'False') + odem_processor.odem_configuration.set(CFG_SEC_OCR, KEY_LANGUAGES, 'fas') assert odem_processor.map_language_to_modelconfig('/data/img/0003.tif') == 'fas.traineddata' @@ -159,9 +159,9 @@ def _side_effect(*args, **kwargs): _log_dir.mkdir() _record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') odem = ODEMProcess(_record, _workdir) - odem.cfg = fixture_configuration() + odem.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(_workdir) - odem.cfg.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') + odem.odem_configuration.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') odem.the_logger = get_logger(str(_log_dir)) # mock loading of OAI Record @@ -205,9 +205,9 @@ def _fixture_odem_setup(tmp_path): odem_processor = ODEMProcess(None, work_dir=str(work_2)) cfg = get_configparser() cfg.read(os.path.join(PROJECT_ROOT_DIR, 'resources', 'odem.ocrd.tesseract.ini')) - odem_processor.cfg = cfg + odem_processor.odem_configuration = cfg _model_dir = prepare_tessdata_dir(work_dir) - odem_processor.cfg.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, + odem_processor.odem_configuration.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') odem_processor.local_mode = True odem_processor.the_logger = get_logger(log_dir) @@ -318,7 +318,7 @@ def test_images_4_ocr_properly_filtered(tmp_path): odem_processor = ODEMProcess(_record, work_dir=_work_dir) cfg = get_configparser() cfg.read(os.path.join(PROJECT_ROOT_DIR, 'resources', 'odem.ocrd.tesseract.ini')) - odem_processor.cfg = cfg + odem_processor.odem_configuration = cfg _log_dir = tmp_path / 'log' _log_dir.mkdir() odem_processor.the_logger = get_logger(str(_log_dir)) @@ -344,7 +344,7 @@ def test_no_catch_when_load_exc(mock_load, tmp_path): odem_processor = ODEMProcess(_record, work_dir=_work_dir) cfg = get_configparser() cfg.read(os.path.join(PROJECT_ROOT_DIR, 'resources', 'odem.ocrd.tesseract.ini')) - odem_processor.cfg = cfg + odem_processor.odem_configuration = cfg _log_dir = tmp_path / 'log' _log_dir.mkdir() odem_processor.the_logger = get_logger(str(_log_dir)) @@ -372,9 +372,9 @@ def test_record_with_unknown_language(tmp_path): (path_workdir / 'log').mkdir() record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/72977') oproc = ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log') - oproc.cfg = fixture_configuration() + oproc.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(tmp_path) - oproc.cfg.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, + oproc.odem_configuration.set(CFG_SEC_OCR, CFG_KEY_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') oproc.mets_file = str(trgt_mets) oproc.inspect_metadata() @@ -413,13 +413,13 @@ def test_export_flat_zip(tmp_path): (path_workdir / 'log').mkdir() record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') oproc = ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log') - oproc.cfg = fixture_configuration() + oproc.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(tmp_path) - oproc.cfg.set('export', 'export_format', ExportFormat.FLAT_ZIP) - oproc.cfg.set('export', 
'local_export_tmp', str(path_tmp_export_dir)) - oproc.cfg.set('export', 'local_export_dir', str(path_export_dir)) - oproc.cfg.set( + oproc.odem_configuration.set('export', 'export_format', ExportFormat.FLAT_ZIP) + oproc.odem_configuration.set('export', 'local_export_tmp', str(path_tmp_export_dir)) + oproc.odem_configuration.set('export', 'local_export_dir', str(path_export_dir)) + oproc.odem_configuration.set( CFG_SEC_OCR, CFG_KEY_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize' diff --git a/tests/test_odem_processing_mets.py b/tests/test_odem_processing_mets.py index 07bfdb4..9fe262b 100644 --- a/tests/test_odem_processing_mets.py +++ b/tests/test_odem_processing_mets.py @@ -10,13 +10,7 @@ import digiflow as df import lib.ocrd3_odem as o3o - -from lib.ocrd3_odem import ( - ODEMMetadataMetsException, - ODEMNoImagesForOCRException, - ODEMMetadataInspecteur, - postprocess_mets, -) +import lib.ocrd3_odem.processing_mets as o3o_pm from .conftest import ( TEST_RES, @@ -24,27 +18,27 @@ ) -@pytest.fixture(name="inspecteur_44043", scope='module') -def _fixture_1981185920_44043(): +@pytest.fixture(name="inspecteur_44046", scope='module') +def _fixture_1981185920_44046(): """Initial ODEM fixture before doing any OCR""" # arrange _ident = '1981185920_44046' file = TEST_RES / '1981185920_44046.xml' - inspc = ODEMMetadataInspecteur(file, + inspc = o3o.ODEMMetadataInspecteur(file, process_identifier=_ident, cfg=fixture_configuration()) yield inspc -def test_odem_process_internal_identifier(inspecteur_44043: ODEMMetadataInspecteur): +def test_odem_process_internal_identifier(inspecteur_44046: o3o.ODEMMetadataInspecteur): """Ensure proper internal identifier calculated for say, logging""" - assert inspecteur_44043.process_identifier == '1981185920_44046' + assert inspecteur_44046.process_identifier == '1981185920_44046' -def test_odem_process_catalog_identifier(inspecteur_44043: ODEMMetadataInspecteur): +def test_odem_process_catalog_identifier(inspecteur_44046: o3o.ODEMMetadataInspecteur): """Ensure proper external identifier present which will be used finally to name the export SAF """ @@ -53,7 +47,7 @@ def test_odem_process_catalog_identifier(inspecteur_44043: ODEMMetadataInspecteu # init_odem.inspect_metadata() # assert - assert inspecteur_44043.record_identifier == '265982944' + assert inspecteur_44046.mods_record_identifier == '265982944' @pytest.fixture(name='post_mets', scope='module') @@ -65,7 +59,7 @@ def _fixture_postprocessing_mets(tmp_path_factory): shutil.copyfile(orig_file, trgt_mets) _cfg = fixture_configuration() _cnt_base_image = _cfg.get('ocr', 'ocrd_baseimage') - postprocess_mets(trgt_mets, _cnt_base_image) + o3o.postprocess_mets(trgt_mets, _cnt_base_image) _root = ET.parse(trgt_mets).getroot() yield _root @@ -117,11 +111,11 @@ def test_opendata_record_no_images_for_ocr(): orig_file = TEST_RES / '1981185920_74357.xml' _oai_urn = 'oai:opendata.uni-halle.de:1981185920/74357' cfg = fixture_configuration() - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act - with pytest.raises(ODEMNoImagesForOCRException) as odem_exc: - inspc.inspect() + with pytest.raises(o3o.ODEMNoImagesForOCRException) as odem_exc: + inspc.metadata_report() # assert _alert = "oai:opendata.uni-halle.de:1981185920/74357 contains no images for OCR (total: 15)!" 
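The hunks below keep exercising the renamed inspector API: metadata_report() replaces inspect(), and mods_record_identifier replaces record_identifier. Condensed into a plain usage sketch (identifiers and paths taken from the fixtures in this test module, not new behaviour):

# usage sketch only, mirroring the fixtures in tests/test_odem_processing_mets.py
import lib.ocrd3_odem as o3o

from .conftest import TEST_RES, fixture_configuration

inspecteur = o3o.ODEMMetadataInspecteur(TEST_RES / '1981185920_44046.xml',
                                        process_identifier='1981185920_44046',
                                        cfg=fixture_configuration())
# raises ODEMMetadataMetsException / ODEMNoImagesForOCRException / ODEMNoTypeForOCRException on bad records
report = inspecteur.metadata_report()
assert report is not None
assert inspecteur.mods_record_identifier == '265982944'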
@@ -136,11 +130,11 @@ def test_opendata_record_no_printwork(): _oai_urn = 'oai:opendata.uni-halle.de:1981185920/79080' orig_file = TEST_RES / '1981185920_79080.xml' cfg = fixture_configuration() - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act with pytest.raises(o3o.ODEMNoTypeForOCRException) as odem_exc: - inspc.inspect() + inspc.metadata_report() # assert assert f"{_oai_urn} no PICA type for OCR: Ac" == odem_exc.value.args[0] @@ -156,10 +150,10 @@ def test_opendata_record_no_granular_urn_present(): _oai_urn = 'oai:opendata.uni-halle.de:1981185920/88132' orig_file = TEST_RES / '1981185920_88132.xml' cfg = fixture_configuration() - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act - inspc.inspect() + inspc.metadata_report() # assert for img_entry in inspc.image_pairs: @@ -176,11 +170,11 @@ def test_opendata_record_type_error(): _oai_urn = 'oai:opendata.uni-halle.de:1981185920/105290' orig_file = TEST_RES / '1981185920_105290.xml' cfg = fixture_configuration() - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act - with pytest.raises(ODEMMetadataMetsException) as odem_exc: - inspc.inspect() + with pytest.raises(o3o.ODEMMetadataMetsException) as odem_exc: + inspc.metadata_report() # assert assert "2x: Page PHYS_0112 not linked,Page PHYS_0113 not linked" == odem_exc.value.args[0] @@ -195,14 +189,14 @@ def test_mets_mods_sbb_vol01_with_ulb_defaults(): orig_file = TEST_RES / 'sbb-PPN891267093.xml' assert os.path.isfile(orig_file) cfg = fixture_configuration() - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act - inspc.inspect() + inspc.metadata_report() # assert assert inspc.process_identifier == _oai_urn - assert inspc.record_identifier == 'PPN891267093' + assert inspc.mods_record_identifier == 'PPN891267093' def test_mets_filter_logical_structs_by_type(): @@ -218,14 +212,14 @@ def test_mets_filter_logical_structs_by_type(): orig_file = TEST_RES / '1981185920_33908.xml' assert os.path.isfile(orig_file) cfg = fixture_configuration() - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act - inspc.inspect() + inspc.metadata_report() # assert assert inspc.process_identifier == _oai_urn - assert inspc.record_identifier == '058134433' + assert inspc.mods_record_identifier == '058134433' _image_page_pairs = inspc.image_pairs assert not any('PHYS_0001' in p[1] for p in _image_page_pairs) assert not any('PHYS_0002' in p[1] for p in _image_page_pairs) @@ -245,10 +239,10 @@ def test_mets_mods_sbb_vol01_filtering(): orig_file = TEST_RES / 'sbb-PPN891267093.xml' assert os.path.isfile(orig_file) cfg = fixture_configuration() - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act - inspc.inspect() + inspc.metadata_report() # assert _image_page_pairs = inspc.image_pairs @@ -266,10 +260,10 @@ def test_mets_mods_sbb_vol01_filtering_custom(): assert os.path.isfile(orig_file) cfg = fixture_configuration() cfg.set('mets', 'blacklist_logical_containers', 'cover_front,cover_back,binding') - inspc = ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) + inspc = o3o.ODEMMetadataInspecteur(orig_file, _oai_urn, cfg) # act - inspc.inspect() + inspc.metadata_report() # assert 
_image_page_pairs = inspc.image_pairs @@ -288,7 +282,7 @@ def test_validate_mets_105054_schema_fails(tmp_path): _orig_mets = TEST_RES / '1981185920_105054.xml' shutil.copyfile(_orig_mets, _work_dir / '1981185920_105054.xml') odem_processor = o3o.ODEMProcess(_record, work_dir=_work_dir) - odem_processor.cfg = fixture_configuration() + odem_processor.odem_configuration = fixture_configuration() with pytest.raises(o3o.ODEMException) as exec: odem_processor.validate_metadata() @@ -305,7 +299,7 @@ def test_validate_mets_37167_schema_fails(tmp_path): original_mets = TEST_RES / '1981185920_37167_01.xml' shutil.copyfile(original_mets, work_dir / '1981185920_37167.xml') odem_processor = o3o.ODEMProcess(rec, work_dir=work_dir) - odem_processor.cfg = fixture_configuration() + odem_processor.odem_configuration = fixture_configuration() with pytest.raises(o3o.ODEMException) as exec: odem_processor.validate_metadata() @@ -329,8 +323,8 @@ def test_validate_mets_37167_ddb_fails(tmp_path): original_mets = TEST_RES / '1981185920_37167_02.xml' shutil.copyfile(original_mets, work_dir / '1981185920_37167.xml') odem_processor = o3o.ODEMProcess(rec, work_dir=work_dir) - odem_processor.cfg = fixture_configuration() - odem_processor.cfg.set('mets', 'ddb_validation', 'True') + odem_processor.odem_configuration = fixture_configuration() + odem_processor.odem_configuration.set('mets', 'ddb_validation', 'True') with pytest.raises(o3o.ODEMException) as exec: odem_processor.validate_metadata() @@ -354,7 +348,64 @@ def test_validate_mets_37167_finally_succeeds(tmp_path): original_mets = TEST_RES / '1981185920_37167_03.xml' shutil.copyfile(original_mets, work_dir / '1981185920_37167.xml') odem_processor = o3o.ODEMProcess(rec, work_dir=work_dir) - odem_processor.cfg = fixture_configuration() - odem_processor.cfg.set('mets', 'ddb_validation', 'True') + odem_processor.odem_configuration = fixture_configuration() + odem_processor.odem_configuration.set('mets', 'ddb_validation', 'True') assert odem_processor.validate_metadata() + + +def test_integrate_alto_from_ocr_pipeline(tmp_path): + """Ensure we can handle ALTO output straight from Tesseract + OCR-Pipeline workflows + """ + + # arrange + mets_file = TEST_RES / '1981185920_42296.xml' + fulltext_dir = TEST_RES / '1981185920_42296_FULLTEXT' + assert mets_file.exists() + assert fulltext_dir.exists() + tmp_mets = shutil.copy(mets_file, tmp_path) + + mets_tree = ET.parse(tmp_mets) + ocr_files = [os.path.join(fulltext_dir, f) for f in os.listdir(fulltext_dir)] + assert len(ocr_files) == 4 + + # actsert + assert 4 == o3o_pm.integrate_ocr_file(mets_tree, ocr_files) + + +def test_extract_text_content_from_alto_file(): + """Ensure we can read ALTO output and get its contents + """ + + # arrange + fulltext_dir = TEST_RES / '1981185920_42296_FULLTEXT' + ocr_files = [os.path.join(fulltext_dir, f) for f in os.listdir(fulltext_dir)] + assert len(ocr_files) == 4 + + # act + text = o3o_pm.extract_text_content(ocr_files) + + # assert + assert text is not None + assert len(text) == 126 + + +def test_extract_identifiers(): + """What can we expect for identification + when feeding newspapers? 
Expect the + custom kvx-ppn value + 16691561019210131 + """ + + # arrange + mets_file = TEST_RES / '1516514412012_175762.xml' + inspecteur = o3o_pm.ODEMMetadataInspecteur(mets_file, + '1516514412012_175762', + fixture_configuration()) + # act + report = inspecteur.metadata_report() + + # assert + assert report is not None + assert inspecteur.mods_record_identifier == '16691561019210131' diff --git a/tests/test_odem_processing_ocrd.py b/tests/test_odem_processing_ocrd.py index 7c9db31..3dafb0a 100644 --- a/tests/test_odem_processing_ocrd.py +++ b/tests/test_odem_processing_ocrd.py @@ -42,18 +42,3 @@ def test_odem_recognition_level_custom(model_conf, rec_level): _custom_rtl = ['gt4ara', 'ulb-fas'] assert get_recognition_level(model_conf, _custom_rtl) == rec_level - - -def test_no_cfg_ocrd_process_list(tmp_path): - """ - if no ocrd_process_list is configured, process can not be executed (OCRDPageParallel) - """ - _record = OAIRecord('oai:opendata.uni-halle.de:1981185920/105054') - _work_dir = tmp_path / '1981185920_105054' - odem_processor = OCRDPageParallel(_record, work_dir=_work_dir) - odem_processor.cfg = fixture_configuration() - del odem_processor.cfg['ocr']['ocrd_process_list'] - with pytest.raises(ODEMException) as exc: - odem_processor.run() - - assert exc.value.args[0] == "No option 'ocrd_process_list' in section: 'ocr'" diff --git a/tests/test_odem_processing_tesseract_pipeline.py b/tests/test_odem_processing_tesseract_pipeline.py index c74e045..a99f90e 100644 --- a/tests/test_odem_processing_tesseract_pipeline.py +++ b/tests/test_odem_processing_tesseract_pipeline.py @@ -4,41 +4,45 @@ import os import shutil -from pathlib import ( - Path, -) +from pathlib import Path import pytest -from digiflow import ( - OAIRecord, -) - -from lib.ocrd3_odem.odem_commons import ( - get_logger, -) -from lib.ocrd3_odem.ocrd3_odem import ( - ODEMTesseract, -) -from lib.ocrd3_odem.processing_ocr_pipeline import ( - StepPostReplaceChars, - StepPostReplaceCharsRegex, - StepTesseract, - profile, - init_steps, -) - -from .conftest import ( - TEST_RES, - PROD_RES, -) +import digiflow as df + +import lib.ocrd3_odem as o3o +import lib.ocrd3_odem.processing_ocr_pipeline as o3o_pop +from lib.ocrd3_odem.odem_commons import get_logger +from lib.ocrd3_odem.ocrd3_odem import ODEMTesseract + +from .conftest import TEST_RES, PROD_RES RES_0001_TIF = "0001.tif" RES_0002_PNG = "0002.png" RES_0003_JPG = "0003.jpg" RES_00041_XML = str(TEST_RES / '0041.xml') -RES_CFG = str(PROD_RES / 'tesseract_pipeline_config.ini') +PATH_ODEM_CFG = PROD_RES / 'odem.ocr-pipeline.ini' +ODEM_CFG = o3o.get_configparser() +ODEM_CFG.read(PATH_ODEM_CFG) +OCR_PIPELINE_CFG = PROD_RES / 'odem.ocr-pipeline.steps.ini' + + +def test_ocr_pipeline_profile(): + """check profiling""" + + # arrange + # pylint: disable=missing-class-docstring,too-few-public-methods + class InnerClass: + + # pylint: disable=missing-function-docstring,no-self-use + def func(self): + return [i * i for i in range(1, 200000)] + + # act + inner = InnerClass() + result = o3o_pop.profile(inner.func) + assert "test_ocr_pipeline_profile run" in result @pytest.fixture(name="a_workspace") @@ -63,43 +67,25 @@ def fixure_a_workspace(tmp_path): @pytest.fixture(name="my_pipeline") def _fixture_default_pipeline(a_workspace: Path): - _record = OAIRecord('oai:urn:mwe') - _odem = ODEMTesseract(_record, a_workspace) - _odem.read_pipeline_config(RES_CFG) - _logger = get_logger(a_workspace / 'log') - _odem.the_logger = _logger - return _odem + _record = df.OAIRecord('oai:urn:mwe') + 
odem_process = o3o.ODEMProcess(_record, a_workspace) + odem_process.odem_configuration = ODEM_CFG + odem_process._statistics_ocr['languages'] = ['ger'] + odem_process.the_logger = get_logger(a_workspace / 'log') + odem_tess = ODEMTesseract(odem_process) + return odem_tess def test_ocr_pipeline_default_config(my_pipeline: ODEMTesseract): """check default config options""" - assert my_pipeline - _cfg = my_pipeline.pipeline_config - assert _cfg.get('pipeline', 'executors') == '8' + _cfg = my_pipeline.read_pipeline_config(OCR_PIPELINE_CFG) + assert 'pipeline' in _cfg.sections() assert _cfg.get('pipeline', 'logger_name') == 'ocr_pipeline' assert _cfg.get('pipeline', 'file_ext') == 'tif,jpg,png,jpeg' - # assert _cfg.get('step_03', 'language') == 'de-DE' - # assert _cfg.get('step_03', 'enabled_rules') == 'GERMAN_SPELLER_RULE' - - -def test_ocr_pipeline_profile(): - """check profiling""" - - # arrange - # pylint: disable=missing-class-docstring,too-few-public-methods - class InnerClass: - - # pylint: disable=missing-function-docstring,no-self-use - def func(self): - return [i * i for i in range(1, 2000000)] - - # act - inner = InnerClass() - result = profile(inner.func) - assert "test_ocr_pipeline_profile run" in result +@pytest.mark.skip('kept only for documentation') def test_ocr_pipeline_estimations(my_pipeline: ODEMTesseract): """check estimation data persisted""" @@ -128,20 +114,24 @@ def _fixture_custom_config_pipeline(a_workspace): conf_dir.mkdir() conf_file = TEST_RES / 'ocr_config_full.ini' assert os.path.isfile(conf_file) - _odem = ODEMTesseract(OAIRecord('oai:urn_custom'), a_workspace) - _odem.read_pipeline_config(conf_file) - return _odem + odem_process = o3o.ODEMProcess(df.OAIRecord('oai:urn_custom'), a_workspace) + odem_process.odem_configuration = ODEM_CFG + odem_process._statistics_ocr['languages'] = ['ger', 'lat'] + odem_process.the_logger = get_logger(a_workspace / 'log') + odem_tess = o3o.ODEMTesseract(odem_process) + odem_tess.read_pipeline_config(conf_file) + return odem_tess def test_pipeline_step_tesseract(custom_pipe: ODEMTesseract, a_workspace): """Check proper tesseract cmd from full configuration""" - steps = init_steps(custom_pipe.pipeline_config) + steps = o3o_pop.init_steps(custom_pipe.pipeline_configuration) steps[0].path_in = a_workspace / 'scandata' / RES_0001_TIF # assert assert len(steps) == 5 - assert isinstance(steps[0], StepTesseract) + assert isinstance(steps[0], o3o_pop.StepTesseract) the_cmd = steps[0].cmd the_cmd_tokens = the_cmd.split() assert len(the_cmd_tokens) == 6 @@ -149,29 +139,29 @@ def test_pipeline_step_tesseract(custom_pipe: ODEMTesseract, a_workspace): assert the_cmd_tokens[1].endswith('scandata/0001.tif') assert the_cmd_tokens[2].endswith('scandata/0001') assert the_cmd_tokens[3] == '-l' - assert the_cmd_tokens[4] == 'frk+deu' + assert the_cmd_tokens[4] == 'gt4hist_5000k+lat_ocr' assert the_cmd_tokens[5] == 'alto' -def test_pipeline_step_replace(custom_pipe): +def test_pipeline_step_replace(custom_pipe: ODEMTesseract): """Check proper steps from full configuration""" # act - steps = init_steps(custom_pipe.pipeline_config) + steps = o3o_pop.init_steps(custom_pipe.pipeline_configuration) # assert assert len(steps) == 5 - assert isinstance(steps[1], StepPostReplaceChars) + assert isinstance(steps[1], o3o_pop.StepPostReplaceChars) assert isinstance(steps[1].dict_chars, dict) -def test_pipeline_step_replace_regex(custom_pipe): +def test_pipeline_step_replace_regex(custom_pipe: ODEMTesseract): """Check proper steps from full configuration""" # act - 
steps = init_steps(custom_pipe.pipeline_config)
+    steps = o3o_pop.init_steps(custom_pipe.pipeline_configuration)
 
     # assert
     assert len(steps) == 5
-    assert isinstance(steps[2], StepPostReplaceCharsRegex)
+    assert isinstance(steps[2], o3o_pop.StepPostReplaceCharsRegex)
     assert steps[2].pattern == 'r\'([aeioubcglnt]3[:-]*")\''
diff --git a/tests/test_odem_processing_tesseract_steps.py b/tests/test_odem_processing_tesseract_steps.py
index a04cbcf..0ece4ba 100644
--- a/tests/test_odem_processing_tesseract_steps.py
+++ b/tests/test_odem_processing_tesseract_steps.py
@@ -5,42 +5,24 @@
 import os
 import shutil
 
-from unittest import (
-    mock
-)
+from pathlib import Path
+from unittest import mock
 
 import lxml.etree as ET
 
 import requests
-
 import pytest
 
-from lib.ocrd3_odem.processing_ocr_pipeline import (
-    NAMESPACES,
-    StepIO,
-    StepTesseract,
-    StepPostMoveAlto,
-    StepPostReplaceChars,
-    StepPostReplaceCharsRegex,
-    StepPostRemoveFile,
-    StepException,
-    StepEstimateOCR,
-    StepPostprocessALTO,
-    analyze,
-    get_lines,
-    textlines2data,
-)
-
-from .conftest import (
-    TEST_RES,
-)
+import lib.ocrd3_odem.processing_ocr_pipeline as o3o_pop
+
+from .conftest import TEST_RES
 
 
 def test_stepio_not_initable():
     """StepIO cant be instantiated"""
 
     with pytest.raises(TypeError) as exec_info:
-        StepIO()  # pylint: disable=abstract-class-instantiated
+        o3o_pop.StepIO()  # pylint: disable=abstract-class-instantiated
     assert "Can't instantiate" in str(exec_info.value)
 
 
@@ -58,18 +40,18 @@ def fixture_path_existing(tmp_path):
     path1.write_bytes(bytearray([120, 3, 255, 0, 100]))
     path2 = max_dir / TIF_002
     path2.write_bytes(bytearray([120, 3, 255, 0, 100]))
-    return str(max_dir)
+    return max_dir
 
 
-def test_step_tesseract_list_langs(max_dir):
+def test_step_tesseract_list_langs(max_dir: Path):
     """Tesseract list-langs"""
 
     # arrange
     args = {'--list-langs': None}
 
     # act
-    step = StepTesseract(args)
-    step.path_in = os.path.join(max_dir, TIF_001)
+    step = o3o_pop.StepTesseract(args)
+    step.path_in = max_dir / TIF_001
 
     # assert
     assert ' --list-langs' in step.cmd
@@ -82,11 +64,11 @@ def test_step_tesseract_path_out_folder(max_dir):
     args = {'-l': 'deu', 'alto': None}
 
     # act
-    step = StepTesseract(args)
+    step = o3o_pop.StepTesseract(args)
     step.path_in = os.path.join(max_dir, TIF_001)
 
     # assert
-    assert '001.xml' in step.path_next
+    assert step.path_next.name == '001.xml'
 
 
 def test_step_tesseract_change_input(max_dir):
@@ -96,7 +78,7 @@ def test_step_tesseract_change_input(max_dir):
     args = {'-l': 'deu', 'alto': None}
 
     # act
-    step = StepTesseract(args)
+    step = o3o_pop.StepTesseract(args)
     step.path_in = os.path.join(max_dir, TIF_001)
 
     # assert
@@ -120,7 +102,7 @@ def test_step_tesseract_change_input_with_dir(max_dir):
     args = {'-l': 'deu', 'alto': None}
 
     # act
-    step = StepTesseract(args)
+    step = o3o_pop.StepTesseract(args)
     step.path_in = os.path.join(max_dir, TIF_001)
 
     # assert
@@ -137,16 +119,15 @@ def test_step_tesseract_change_input_with_dir(max_dir):
 
 
 def test_step_tesseract_invalid_params(max_dir):
-    """Tesseract path to write result"""
+    """Check nature of params"""
 
     # act
-    with pytest.raises(StepException) as excinfo:
-        StepTesseract(max_dir)
+    with pytest.raises(o3o_pop.StepException) as excinfo:
+        o3o_pop.StepTesseract(max_dir)
 
     # assert
     actual_exc_text = str(excinfo.value)
-    assert 'Invalid Dictionary for arguments provided' in actual_exc_text
-    assert '"need more than 1 value to unpack" !' in actual_exc_text
+    assert 'Invalid params' in actual_exc_text
 
 
 def test_step_tesseract_full_args(max_dir):
@@ -159,7 +140,7 @@ def test_step_tesseract_full_args(max_dir):
     args = {'--dpi': 470, '-l': 'ulbfrk', 'alto': None}
 
     # act
-    step = StepTesseract(args)
+    step = o3o_pop.StepTesseract(args)
     step.path_in = os.path.join(max_dir, TIF_001)
 
     # assert
@@ -167,7 +148,7 @@ def test_step_tesseract_full_args(max_dir):
     output_xml = os.path.splitext(os.path.join(max_dir, TIF_001))[0]
     cmd = f'tesseract {input_tif} {output_xml} --dpi 470 -l ulbfrk alto'
     assert cmd == step.cmd
-    assert step.path_next.endswith('001.xml')
+    assert step.path_next.name == '001.xml'
 
 
 def test_step_tesseract_different_configurations(max_dir):
@@ -177,7 +158,7 @@ def test_step_tesseract_different_configurations(max_dir):
     args = {'-l': 'frk_ulbzd1', 'alto': None, 'txt': None}
 
     # act
-    step = StepTesseract(args)
+    step = o3o_pop.StepTesseract(args)
     step.path_in = os.path.join(max_dir, TIF_001)
 
     # assert
@@ -187,24 +168,22 @@ def test_step_tesseract_different_configurations(max_dir):
     assert tesseract_cmd == step.cmd
 
 
-def test_step_copy_alto_back(max_dir):
+def test_step_copy_alto_back(max_dir: Path):
     """
     Move ALTO file back to where we started
     Preserve filename, only switch directory
     """
 
     # arrange
-    path_target = '/tmp/500_gray00001_st.tif'
+    path_target = max_dir.parent / 'FULLTEXT'
+    step = o3o_pop.StepPostMoveAlto({'path_target': path_target})
 
     # act
-    step = StepPostMoveAlto({})
-    step.path_in = os.path.join(max_dir, TIF_001)
-    step.path_next = path_target
+    step.path_in = max_dir / TIF_001
     step.execute()
 
     # assert
-    assert os.path.join(max_dir, TIF_001) == step.path_in
-    assert step.path_next == '/tmp/001.xml'
+    assert step.path_next == path_target / TIF_001
     assert os.path.exists(step.path_next)
 
 
@@ -215,7 +194,7 @@ def test_step_replace():
     src = str(TEST_RES / '500_gray00003.xml')
     dict_chars = {'ſ': 's', 'ic)': 'ich'}
     params = {'dict_chars': dict_chars, 'must_backup': True}
-    step = StepPostReplaceChars(params)
+    step = o3o_pop.StepPostReplaceChars(params)
     step.path_in = src
 
     lines = ['']
@@ -244,7 +223,7 @@ def fixture_empty_ocr(tmpdir):
 
 def test_step_replace_with_empty_alto(empty_ocr):
     """Determine behavior for invalid input data"""
-    step = StepPostReplaceChars({'dict_chars': {'ſ': 's'}})
+    step = o3o_pop.StepPostReplaceChars({'dict_chars': {'ſ': 's'}})
     step.path_in = empty_ocr
 
     # act
@@ -274,7 +253,7 @@ def test_replaced_file_written(tmp_500_gray):
 
     # arrange
     params = _provide_replace_params()
-    step = StepPostReplaceChars(params)
+    step = o3o_pop.StepPostReplaceChars(params)
 
     # act
     step.path_in = tmp_500_gray
@@ -298,7 +277,7 @@ def test_replaced_file_statistics(tmp_500_gray):
     """test statistics available"""
 
     # arrange
-    step = StepPostReplaceChars(_provide_replace_params())
+    step = o3o_pop.StepPostReplaceChars(_provide_replace_params())
     step.path_in = tmp_500_gray
 
     # act
@@ -316,7 +295,7 @@ def test_regex_replacements(tmp_500_gray):
 
     # arrange
     params = {'pattern': r'([aeioubcglnt]3[:-]*")', 'old': '3', 'new': 's'}
-    step = StepPostReplaceCharsRegex(params)
+    step = o3o_pop.StepPostReplaceCharsRegex(params)
 
     # act
     step.path_in = tmp_500_gray
@@ -347,18 +326,21 @@ def test_remove_failed():
     """Test remove failed since file is missing"""
 
     # arrange
-    step = StepPostRemoveFile({'file_suffix': 'tif'})
+    step = o3o_pop.StepPostRemoveFile({'file_suffix': 'tif'})
 
     # act
-    with pytest.raises(RuntimeError):
+    with pytest.raises(o3o_pop.StepException) as step_err:
         step.path_in = 'qwerrwe.tif'
 
+    # assert
+    assert "qwerrwe.tif' invalid!" in step_err.value.args[0]
+
 
 def test_remove_succeeded(max_dir):
     """Test remove success"""
 
     # arrange
-    step = StepPostRemoveFile({'file_suffix': 'tif'})
+    step = o3o_pop.StepPostRemoveFile({'file_suffix': 'tif'})
 
     # act
     step.path_in = os.path.join(max_dir, TIF_001)
@@ -386,7 +368,7 @@ def test_stepestimateocr_analyze():
     ]
 
     # act
-    actual = analyze(results)
+    actual = o3o_pop.analyze(results)
 
     # assert
     assert actual[0] == 42.723
@@ -412,7 +394,7 @@ def test_estimate_handle_large_wtr():
     ]
 
     # act
-    actual = analyze(results)
+    actual = o3o_pop.analyze(results)
 
     # assert
     assert actual[0] == 49.677
@@ -430,7 +412,7 @@ def test_step_estimateocr_empty_alto(empty_ocr):
     Modified: in this (rare) case, just do nothing, do *not* raise any Exception
     """
-    step = StepEstimateOCR({})
+    step = o3o_pop.StepEstimateOCR({})
     step.path_in = empty_ocr
 
     # act
@@ -446,7 +428,7 @@ def test_service_down(mock_requests):
 
     # arrange
     params = {'service_url': 'http://localhost:8010/v2/check'}
-    step = StepEstimateOCR(params)
+    step = o3o_pop.StepEstimateOCR(params)
     mock_requests.side_effect = requests.ConnectionError
 
     # assert
@@ -461,8 +443,8 @@ def test_step_estimateocr_textline_conversions():
 
     # pylint: disable=protected-access
     xml_data = ET.parse(test_data)
-    lines = get_lines(xml_data)
-    (_, n_lines, _, _, n_lines_out) = textlines2data(lines)
+    lines = o3o_pop.get_lines(xml_data)
+    (_, n_lines, _, _, n_lines_out) = o3o_pop.textlines2data(lines)
 
     assert n_lines == 360
     assert n_lines_out == 346
@@ -490,7 +472,7 @@ def test_step_estimateocr_lines_and_tokens_err_ratio(mock_requests):
         'language': 'de-DE',
         'enabled_rules': 'GERMAN_SPELLER_RULE'
     }
-    step = StepEstimateOCR(params)
+    step = o3o_pop.StepEstimateOCR(params)
     step.path_in = test_data
 
     # act
@@ -514,7 +496,7 @@ def test_step_estimateocr_lines_and_tokens_hit_ratio(mock_requests):
         'language': 'de-DE',
         'enabled_rules': 'GERMAN_SPELLER_RULE'
    }
-    step = StepEstimateOCR(params)
+    step = o3o_pop.StepEstimateOCR(params)
     step.path_in = test_data
 
     # act
@@ -546,7 +528,7 @@ def test_stepestimate_invalid_data(mock_request):
         'language': 'de-DE',
         'enabled_rules': 'GERMAN_SPELLER_RULE'
     }
-    step = StepEstimateOCR(params)
+    step = o3o_pop.StepEstimateOCR(params)
     step.path_in = data_path
 
     # act
@@ -561,13 +543,13 @@ def test_stepestimate_invalid_data(mock_request):
 def _fixture_altov4(tmp_path):
     test_data = os.path.join(TEST_RES / '16331011.xml')
     prev_root = ET.parse(test_data).getroot()
-    prev_strings = prev_root.findall('.//alto:String', NAMESPACES)
+    prev_strings = prev_root.findall('.//alto:String', o3o_pop.NAMESPACES)
     assert len(prev_strings) == 275
     dst_path = tmp_path / "16331011.xml"
     shutil.copy(test_data, dst_path)
     # act within a fixture
-    step = StepPostprocessALTO()
+    step = o3o_pop.StepPostprocessALTO()
     step.path_in = str(dst_path)
     step.execute()
 
@@ -577,7 +559,7 @@ def _fixture_altov4(tmp_path):
 
 def test_clear_empty_content(altov4_xml):
     """Ensure no more empty Strings exist"""
-    all_strings = altov4_xml.findall('.//alto:String', NAMESPACES)
+    all_strings = altov4_xml.findall('.//alto:String', o3o_pop.NAMESPACES)
     # assert about 20 Strings (from 275, cf. fixture)
     # have been dropped due emptyness
     assert len(all_strings) == 254
@@ -586,13 +568,13 @@
 def test_process_alto_file_identifier_set(altov4_xml):
     """Ensure expected fileIdentifier present
     """
-    assert altov4_xml.find('.//alto:fileIdentifier', NAMESPACES).text == '16331011'
+    assert altov4_xml.find('.//alto:fileIdentifier', o3o_pop.NAMESPACES).text == '16331011'
 
 
 def test_process_alto_filename_set(altov4_xml):
     """Ensure expected fileName present
     """
-    assert altov4_xml.find('.//alto:fileName', NAMESPACES).text == '16331011.xml'
+    assert altov4_xml.find('.//alto:fileName', o3o_pop.NAMESPACES).text == '16331011.xml'
 
 
 def test_clear_empty_lines_with_spatiums(tmp_path):
@@ -600,12 +582,12 @@ def test_clear_empty_lines_with_spatiums(tmp_path):
 
     test_data = os.path.join(TEST_RES / '16331001.xml')
     prev_root = ET.parse(test_data).getroot()
-    prev_strings = prev_root.findall('.//alto:String', NAMESPACES)
+    prev_strings = prev_root.findall('.//alto:String', o3o_pop.NAMESPACES)
     # original ALTO output
     assert len(prev_strings) == 1854
     dst_path = tmp_path / "16331001.xml"
     shutil.copy(test_data, dst_path)
-    step = StepPostprocessALTO()
+    step = o3o_pop.StepPostprocessALTO()
     step.path_in = dst_path
 
     # act
@@ -613,17 +595,17 @@ def test_clear_empty_lines_with_spatiums(tmp_path):
 
     # assert
     xml_root = ET.parse(dst_path).getroot()
-    all_strings = xml_root.findall('.//alto:String', NAMESPACES)
+    all_strings = xml_root.findall('.//alto:String', o3o_pop.NAMESPACES)
     # line with 2 empty strings and SP in between
     line_with_sps = xml_root.findall(
-        './/alto:TextLine[@ID="line_2"]', NAMESPACES)
+        './/alto:TextLine[@ID="line_2"]', o3o_pop.NAMESPACES)
     assert not line_with_sps
     # assert many Strings have been dropped due emptyness
     assert len(all_strings) == 1673
     assert xml_root.find(
         './/alto:fileIdentifier',
-        NAMESPACES).text == '16331001'
-    assert xml_root.find('.//alto:fileName', NAMESPACES).text == '16331001.xml'
+        o3o_pop.NAMESPACES).text == '16331001'
+    assert xml_root.find('.//alto:fileName', o3o_pop.NAMESPACES).text == '16331001.xml'
 
 
 @pytest.fixture(name="pipeline_odem_xml")
@@ -633,7 +615,7 @@ def _fixture_pipeline_odem_xml(tmp_path):
     shutil.copy(test_data, dst_path)
 
     # act within a fixture
-    step = StepPostprocessALTO({'page_prefix': ''})
+    step = o3o_pop.StepPostprocessALTO({'page_prefix': ''})
     step.path_in = dst_path
     step.execute()
 
@@ -643,7 +625,7 @@ def _fixture_pipeline_odem_xml(tmp_path):
 
 def test_process_odem_result_identifier_set(pipeline_odem_xml):
     """Ensure expected fileIdentifier present
     """
-    file_ident = pipeline_odem_xml.find('.//alto:fileIdentifier', NAMESPACES)
+    file_ident = pipeline_odem_xml.find('.//alto:fileIdentifier', o3o_pop.NAMESPACES)
     assert file_ident is not None
     assert file_ident.text == 'urn+nbn+de+gbv+3+1-121915-p0159-6_ger'
 
 
@@ -651,7 +633,7 @@ def test_process_odem_result_identifier_set(pipeline_odem_xml):
 
 def test_process_odem_filename_set(pipeline_odem_xml):
     """Ensure expected fileName present
     """
-    txt_filename = pipeline_odem_xml.find('.//alto:fileName', NAMESPACES)
+    txt_filename = pipeline_odem_xml.find('.//alto:fileName', o3o_pop.NAMESPACES)
     assert txt_filename is not None
     assert txt_filename.text == 'urn+nbn+de+gbv+3+1-121915-p0159-6_ger.xml'
 
 
@@ -659,5 +641,34 @@
 def test_process_odem_page_id(pipeline_odem_xml):
     """Ensure expected fileName present
     """
-    page_id = pipeline_odem_xml.find('.//alto:Page', NAMESPACES).attrib['ID']
+    page_id = pipeline_odem_xml.find('.//alto:Page', o3o_pop.NAMESPACES).attrib['ID']
     assert page_id == 'urn+nbn+de+gbv+3+1-121915-p0159-6_ger'
+
+
+def test_step_replace_regex(tmp_path):
+    """Ensure 'J's have reduced"""
+
+    # arrange
+    alto_in = TEST_RES / '1516514412012_175762_00000003.xml'
+    tmp_file = shutil.copyfile(alto_in, tmp_path / alto_in.name)
+    assert tmp_file.exists()
+    with open(tmp_file, encoding='utf-8') as reader:
+        text_in = reader.readlines()
+    J_in = sum((1 for l in text_in if 'J' in l))
+    assert J_in == 185
+    params = {
+        'pattern': r'(J[cdhmn]\w*)', 'old': 'J', 'new': 'I'
+    }
+
+    step = o3o_pop.StepPostReplaceCharsRegex(params)
+    step.path_in = tmp_file
+
+    # act
+    step.execute()
+
+    # assert
+    assert len(step._replacements) == 9
+    with open(step.path_next, encoding='utf-8') as reader:
+        text_out = reader.readlines()
+    J_out = sum((1 for l in text_out if 'J' in l))
+    assert J_out == 172