diff --git a/cli_dir_local.py b/cli_dir_local.py index bed4ce3..b953ea5 100644 --- a/cli_dir_local.py +++ b/cli_dir_local.py @@ -95,7 +95,7 @@ proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None) if proc_type is None: LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'") - PROCESS: odem.ODEMProcess = odem.ODEMProcess.create(proc_type, None, req_dst_dir, EXECUTORS) + PROCESS: odem.ODEMProcessImpl = odem.ODEMProcessImpl.create(proc_type, None, req_dst_dir, EXECUTORS) PROCESS.local_mode = True PROCESS.odem_configuration = CFG PROCESS.the_logger = LOGGER diff --git a/cli_mets_local.py b/cli_mets_local.py index 1a66d30..f882eb2 100644 --- a/cli_mets_local.py +++ b/cli_mets_local.py @@ -116,7 +116,7 @@ if proc_type is None: LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'") record = df.OAIRecord(local_ident) - odem_process: odem.ODEMProcess = odem.ODEMProcess(record, mets_file_dir) + odem_process: odem.ODEMProcessImpl = odem.ODEMProcessImpl(record, mets_file_dir) odem_process.the_logger = LOGGER odem_process.the_logger.info("[%s] odem from %s, %d executors", local_ident, mets_file, EXECUTORS) odem_process.odem_configuration = CFG diff --git a/cli_oai_client.py b/cli_oai_client.py index e95e1a3..87ef3ba 100644 --- a/cli_oai_client.py +++ b/cli_oai_client.py @@ -260,7 +260,7 @@ def oai_arg_parser(value): rec_ident = record.identifier local_ident = record.local_identifier req_dst_dir = os.path.join(LOCAL_WORK_ROOT, local_ident) - odem_process: odem.ODEMProcess = odem.ODEMProcess(record, req_dst_dir) + odem_process: odem.ODEMProcessImpl = odem.ODEMProcessImpl(record, req_dst_dir) odem_process.the_logger = LOGGER odem_process.the_logger.debug( "request %s from %s (%s part slots)", diff --git a/cli_oai_local.py b/cli_oai_local.py index 76fbf31..26c3dc3 100644 --- a/cli_oai_local.py +++ b/cli_oai_local.py @@ -21,7 +21,7 @@ MARK_OCR_DONE, MARK_OCR_OPEN, 
# Estimated OCR-D runtime for a regular page (A4, 1MB).
# The sequential runner multiplies it by the number of inputs
# and reports the product as estimated minutes.
DEFAULT_RUNTIME_PAGE = 1.0

# Timestamp format used to label preserved per-page OCR-D logs.
# Minutes are '%M' - the former '%m' repeated the month number
# in the minute position, so labels could not tell apart two
# logs written within the same hour at the same second.
ODEM_PAGE_TIME_FORMAT = '%Y-%m-%d_%H-%M-%S'
# Fallback for the 'docker_container_timeout' option, i.e. how long
# a single page may be processed (presumably seconds - TODO confirm
# against odem_ocrd.run_ocr_page)
DEFAULT_DOCKER_CONTAINER_TIMEOUT = 600

# Name of the directory holding OCR-D results, both inside each
# page workspace and below the main work directory
LOCAL_OCRD_RESULT_DIR = 'PAGE'


class ODEMWorkflowRunner:
    """Wrap actual ODEM process execution

    Fetches the inputs from the given workflow, runs the workflow
    once per input (thread-parallel if more than one executor is
    requested, otherwise sequential) and finally triggers the
    workflow's output post-processing.
    """

    def __init__(self, identifier, n_executors,
                 internal_logger, odem_workflow) -> None:
        self.process_identifier = identifier
        self.n_executors = n_executors
        self.logger: logging.Logger = internal_logger
        self.odem_workflow: ODEMWorkflow = odem_workflow

    def run(self):
        """Run workflow for all inputs, then foster its outputs

        Returns the list of outcomes, one entry per input, as
        returned by the workflow's run-method.
        """

        input_data = self.odem_workflow.get_inputs()
        if self.n_executors > 1:
            the_outcomes = self.run_parallel(input_data)
        else:
            the_outcomes = self.run_sequential(input_data)
        self.odem_workflow.foster_outputs()
        return the_outcomes

    def run_parallel(self, input_data):
        """Run workflow parallel with given executors

        Outcomes are returned in order of the inputs.
        Raises ODEMException if any run fails with
        OSError or AttributeError.
        """

        n_inputs = len(input_data)
        self.logger.info("[%s] %d inputs run_parallel by %d executors",
                         self.process_identifier, n_inputs, self.n_executors)
        try:
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.n_executors,
                thread_name_prefix='odem.ocrd'
            ) as executor:
                return list(executor.map(self.odem_workflow.run, input_data))
        except (OSError, AttributeError) as err:
            self.logger.error(err)
            # str(err) instead of err.args[0]: the latter fails for
            # exceptions without arguments and yields only the bare
            # errno for regular OSErrors
            raise odem_c.ODEMException(f"ODEM parallel: {err}") from err

    def run_sequential(self, input_data):
        """run complete workflow plain sequential
        For debugging or small machines

        Raises ODEMException if any run fails with
        OSError or AttributeError.
        """

        len_img = len(input_data)
        estm_min = len_img * DEFAULT_RUNTIME_PAGE
        self.logger.info("[%s] %d inputs run_sequential, estm. %dmin",
                         self.process_identifier, len_img, estm_min)
        try:
            return [self.odem_workflow.run(the_input)
                    for the_input in input_data]
        except (OSError, AttributeError) as err:
            self.logger.error(err)
            raise odem_c.ODEMException(f"ODEM sequential: {err}") from err


class ODEMWorkflow:
    """Base Interface"""

    # NOTE(review): annotations refer to 'odem_c.ODEMProcess' while
    # ocrd3_odem.py derives from 'odem_c.OdemProcess' - confirm which
    # name odem_commons actually provides
    @staticmethod
    def create(
        workflow_type: "odem_c.OdemWorkflowProcessType | str",
        odem: "odem_c.ODEMProcess",
    ) -> "ODEMWorkflow":
        """Create workflow for given type

        Yields ODEMTesseract if the type is ODEM_TESSERACT (enum
        member or its plain value), OCRDPageParallel for any
        other value including None.
        """

        tesseract_type = odem_c.OdemWorkflowProcessType.ODEM_TESSERACT
        if workflow_type in (tesseract_type, tesseract_type.value):
            return ODEMTesseract(odem)
        return OCRDPageParallel(odem)

    def get_inputs(self) -> typing.List:
        """Collect all input data files for processing"""

    def run(self, input_data=None):
        """Run actual implemented Workflow for a single input
        as handed over by the workflow runner
        """

    def foster_outputs(self):
        """Work to do after pipeline has been run successfully
        like additional format transformations or sanitizings
        """


class OCRDPageParallel(ODEMWorkflow):
    """Use page parallel workflow"""

    def __init__(self, odem_process: "odem_c.ODEMProcess"):
        self.odem = odem_process
        self.cfg = odem_process.odem_configuration
        self.logger = odem_process.the_logger

    def get_inputs(self):
        """Inputs are the images selected for OCR by the process"""

        return self.odem.images_4_ocr

    def run(self, input_data):
        """Create OCR Data for a single page

        input_data is a pair of image path and image identifier.
        Returns tuple (stored, 1, mps, filesize_mb) with stored being
        the result of _store_fulltext, or 0 if OCR failed. mps is the
        first value returned by get_imageinfo.
        Failures of the OCR run itself are logged, not raised.
        """

        ocr_log_conf = os.path.join(
            odem_c.PROJECT_ROOT, self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_logging'))

        # Prepare workspace with makefile
        (image_path, ident) = input_data
        # NOTE(review): os.chdir affects the whole process, but this
        # method is run by several threads at once - confirm that the
        # called helpers do not rely on the current directory
        os.chdir(self.odem.work_dir_main)
        file_name = os.path.basename(image_path)
        file_id = file_name.split('.')[0]
        page_workdir = os.path.join(self.odem.work_dir_main, file_id)
        if os.path.exists(page_workdir):
            shutil.rmtree(page_workdir, ignore_errors=True)
        os.mkdir(page_workdir)
        shutil.copy(ocr_log_conf, page_workdir)
        os.chdir(page_workdir)

        # move and convert image data at once
        processed_image_path = odem_img.sanitize_image(image_path, page_workdir)

        # init ocr-d workspace
        odem_ocrd.ocrd_workspace_setup(page_workdir, processed_image_path)

        # find model config for tesseract
        model_config = self.odem.map_language_to_modelconfig(image_path)

        stored = 0
        # use original image rather than
        # transformed one since PNG is
        # usually 2-5 times larger than JPG
        filesize_mb = os.stat(image_path).st_size / 1048576
        (mps, dpi) = odem_img.get_imageinfo(image_path)

        # how to identify data set?
        if self.odem.record:
            _ident = self.odem.process_identifier
        else:
            _ident = os.path.basename(self.odem.work_dir_main)

        # OCR Generation
        container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}'
        container_memory_limit: str = self.cfg.get(
            odem_c.CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None)
        container_user = self.cfg.get(
            odem_c.CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid())
        container_timeout: int = self.cfg.getint(
            odem_c.CFG_SEC_OCR,
            'docker_container_timeout',
            fallback=DEFAULT_DOCKER_CONTAINER_TIMEOUT
        )
        base_image = self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_baseimage')
        ocrd_process_list = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'ocrd_process_list')
        tesseract_model_rtl: typing.List[str] = self.cfg.getlist(
            odem_c.CFG_SEC_OCR, 'tesseract_model_rtl',
            fallback=odem_c.DEFAULT_RTL_MODELS)
        ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(
            odem_c.CFG_SEC_OCR, odem_c.CFG_SEC_OCR_OPT_RES_VOL, fallback={})

        if self.odem.local_mode:
            container_name = os.path.basename(page_workdir)
        try:
            profiling = odem_ocrd.run_ocr_page(
                page_workdir,
                base_image,
                container_memory_limit,
                container_timeout,
                container_name,
                container_user,
                ocrd_process_list,
                model_config,
                ocrd_resources_volumes,
                tesseract_model_rtl,
            )
            # will be unset in case of magic mocking for test
            if profiling:
                self.logger.info("[%s] '%s' in %s (%.1fMP, %dDPI, %.1fMB)",
                                 _ident, profiling[1], profiling[0], mps, dpi, filesize_mb)
            self.logger.info("[%s] run ocr creation in '%s'",
                             _ident, page_workdir)
            stored = self._store_fulltext(page_workdir, image_path)
            if stored:
                self._preserve_log(page_workdir, ident)
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
            self.logger.error("[%s] image '%s' failed due to subprocess timeout: %s",
                              _ident, base_image, exc)
        except Exception as plain_exc:
            # deliberately broad: a single broken page
            # must not stop the remaining pages
            self.logger.error("[%s] generic exc '%s' for image '%s'",
                              _ident, plain_exc, base_image)

        os.chdir(self.odem.work_dir_main)
        if not self.cfg.getboolean(odem_c.CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False):
            shutil.rmtree(page_workdir, ignore_errors=True)
        return stored, 1, mps, filesize_mb

    def _preserve_log(self, work_subdir, image_ident):
        """preserve ocrd.log for later analysis as
        sub directory identified by adopted local
        identifier (local section of system OAI handle)"""

        _root_log = self.cfg.get('global', 'local_log_dir')
        # identifier is used for directory *and* file name,
        # therefore it must not contain path separators
        _local_ident = self.odem.process_identifier.replace('/', '_')
        _local_ocr_log = os.path.join(_root_log, _local_ident)
        os.makedirs(_local_ocr_log, exist_ok=True)

        _org_log = os.path.join(work_subdir, 'ocrd.log')
        if os.path.exists(_org_log):
            _ts = time.strftime(ODEM_PAGE_TIME_FORMAT, time.localtime())
            _log_label = f'ocrd_odem_{_local_ident}_{image_ident}_{_ts}.log'
            _rebranded = os.path.join(work_subdir, _log_label)
            os.rename(_org_log, _rebranded)
            shutil.copy(_rebranded, _local_ocr_log)
        else:
            self.logger.warning("[%s] No ocrd.log in %s",
                                self.odem.process_identifier, work_subdir)

    def _store_fulltext(self, image_subdir, original_image_path) -> int:
        """Move OCR Result from Workspace Subdir to export folder if exists

        Returns 1 if exactly one result file was found and copied,
        0 if the result directory is missing or does not contain
        exactly one XML file.
        """

        # inspect possible ocr result dirs from within
        # the OCR-D subordinate workspaces for each image
        old_id = os.path.basename(image_subdir)
        ocr_result_dir = os.path.join(image_subdir, LOCAL_OCRD_RESULT_DIR)
        if not os.path.isdir(ocr_result_dir):
            self.logger.info("[%s] no ocr results for '%s'",
                             self.odem.process_identifier, ocr_result_dir)
            return 0
        ocrs = [os.path.join(ocr_result_dir, ocr)
                for ocr in os.listdir(ocr_result_dir)
                if str(ocr).endswith('.xml')]
        self.logger.debug("[%s] %s ocr files",
                          self.odem.process_identifier, ocrs)
        if len(ocrs) != 1:
            # nothing unambiguous to store - say so
            # rather than fail on a missing result path
            self.logger.warning("[%s] expected exactly one ocr file in '%s', got %d",
                                self.odem.process_identifier, ocr_result_dir, len(ocrs))
            return 0

        # probably need to rename
        # since file now is like 'PAGE_01.xml'
        renamed = os.path.join(ocr_result_dir, old_id + '.xml')
        os.rename(ocrs[0], renamed)

        if not self.odem.local_mode:
            # regular case: OAI Workflow, export to 'PAGE' dir
            wd_fulltext = os.path.join(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR)
            # pages are stored by several threads at
            # once, so tolerate an existing directory
            os.makedirs(wd_fulltext, exist_ok=True)
        else:
            # special case: local runnings for straight evaluations
            wd_fulltext = os.path.dirname(original_image_path)

        # final storage
        target_path = os.path.join(wd_fulltext, old_id + '.xml')
        shutil.copy(renamed, target_path)
        return 1

    def foster_outputs(self):
        """In this case:
        * move files from dir PAGE to FULLTEXT
        * convert OCR format PAGE => ALTO
        * some additional tag stripping

        Raises ODEMException if there are images
        for OCR but not a single OCR result.
        """

        n_candidates = len(self.odem.images_4_ocr)
        ocrd_data_files = odem_c.list_files(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR)
        if len(ocrd_data_files) == 0 and n_candidates > 0:
            raise odem_c.ODEMException(f"No OCR result for {n_candidates} candidates created!")
        final_fulltext_dir = os.path.join(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT)
        os.makedirs(final_fulltext_dir, exist_ok=True)
        self.ocr_files = odem_fmt.convert_to_output_format(ocrd_data_files, final_fulltext_dir)
        self.logger.info("[%s] converted '%d' files page-to-alto",
                         self.odem.process_identifier, len(self.ocr_files))
        strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags')
        for _ocr_file in self.ocr_files:
            odem_fmt.postprocess_ocr_file(_ocr_file, strip_tags)


class ODEMTesseract(ODEMWorkflow):
    """Tesseract Runner"""

    def __init__(self, odem_process: "odem_c.ODEMProcess"):
        self.odem = odem_process
        self.odem_configuration = odem_process.odem_configuration
        self.logger = odem_process.the_logger
        self.pipeline_configuration = None

    def get_inputs(self):
        """Create one input tuple per image for OCR: the image entry,
        its 1-based position, the total number of images, the logger
        and the pipeline configuration
        """

        images_4_ocr = self.odem.images_4_ocr
        n_total = len(images_4_ocr)
        pipeline_cfg = self.read_pipeline_config()
        return [(img, i, n_total, self.logger, pipeline_cfg)
                for i, img in enumerate(images_4_ocr, start=1)]

    def run(self, input_data):
        """Run OCR pipeline for a single input from get_inputs

        Returns tuple (stored, 1, mps, filesize_mb) with stored
        telling whether the pipeline yielded any result.
        """

        # first element is the image entry - assumed to be a pair
        # starting with the image path, like OCRDPageParallel.run
        # unpacks it; verify against images_4_ocr
        image_path = input_data[0][0]
        pipeline_result = odem_tess.run_pipeline(input_data)
        stored = pipeline_result is not None
        filesize_mb = os.stat(image_path).st_size / 1048576
        (mps, _) = odem_img.get_imageinfo(image_path)
        return stored, 1, mps, filesize_mb

    def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser:
        """Read pipeline configuration and replace
        model_configs with known language data

        The configuration is read only once and re-used afterwards.
        If no path is given, option 'ocr_pipeline_config' is used.
        Raises ODEMException if there is no such file.
        """

        if self.pipeline_configuration is not None:
            return self.pipeline_configuration
        if (path_config is None
                and self.odem_configuration.has_option(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config')):
            path_config = os.path.abspath(
                self.odem_configuration.get(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config'))
        # path may still be unset if nothing is configured,
        # which os.path.isfile refuses with a TypeError
        if path_config is None or not os.path.isfile(path_config):
            raise odem_c.ODEMException(f"no ocr-pipeline conf {path_config} !")
        pipe_cfg = configparser.ConfigParser()
        pipe_cfg.read(path_config)
        self.logger.info("use config '%s'", path_config)
        for sect in pipe_cfg.sections():
            if pipe_cfg.has_option(sect, 'model_configs'):
                known_langs = self.odem._statistics_ocr.get(odem_c.STATS_KEY_LANGS)
                model_files = self.odem.language_modelconfig(known_langs)
                models = model_files.replace('.traineddata', '')
                pipe_cfg.set(sect, 'model_configs', models)
            if pipe_cfg.has_option(sect, odem_tess.STEP_MOVE_PATH_TARGET):
                pipe_cfg.set(sect, odem_tess.STEP_MOVE_PATH_TARGET,
                             f'{self.odem.work_dir_main}/FULLTEXT')
        self.pipeline_configuration = pipe_cfg
        return self.pipeline_configuration

    def foster_outputs(self):
        """Strip configured tags from all files of the FULLTEXT dir"""

        self.ocr_files = odem_c.list_files(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT)
        # this class keeps the configuration as 'odem_configuration',
        # there is no attribute 'cfg' like in OCRDPageParallel
        strip_tags = self.odem_configuration.getlist(odem_c.CFG_SEC_OCR, 'strip_tags')
        for _ocr_file in self.ocr_files:
            odem_fmt.postprocess_ocr_file(_ocr_file, strip_tags)
os.environ['OMP_THREAD_LIMIT'] = '1' # default language fallback # (only when processing local images) DEFAULT_LANG = 'ger' -# estimated ocr-d runtime -# for a regular page (A4, 1MB) -DEFAULT_RUNTIME_PAGE = 1.0 -# process duration format -ODEM_PAGE_TIME_FORMAT = '%Y-%m-%d_%H-%m-%S' -# how long to process single page? -DEFAULT_DOCKER_CONTAINER_TIMEOUT = 600 +# # estimated ocr-d runtime +# # for a regular page (A4, 1MB) +# _DEFAULT_RUNTIME_PAGE = 1.0 +# # process duration format +# _ODEM_PAGE_TIME_FORMAT = '%Y-%m-%d_%H-%m-%S' +# # how long to process single page? +# _DEFAULT_DOCKER_CONTAINER_TIMEOUT = 600 -LOCAL_OCRD_RESULT_DIR = 'PAGE' +# _LOCAL_OCRD_RESULT_DIR = 'PAGE' -class OdemWorkflowProcessType(str, Enum): - OCRD_PAGE_PARALLEL = "OCRD_PAGE_PARALLEL" - ODEM_TESSERACT = "ODEM_TESSERACT" +# class OdemWorkflowProcessType(str, Enum): +# OCRD_PAGE_PARALLEL = "OCRD_PAGE_PARALLEL" +# ODEM_TESSERACT = "ODEM_TESSERACT" -class ODEMProcess: +class ODEMProcessImpl(odem_c.OdemProcess): """Create OCR for OAI Records. Runs both wiht OAIRecord or local path as input. @@ -142,10 +143,6 @@ def _init_logger(self, log_dir): self.the_logger = logging.getLogger('odem') def load(self): - """Load Data via OAI-PMH-API very LAZY - i.e. if not metadata file exists already in - configured workspace directory""" - request_identifier = self.record.identifier local_identifier = self.record.local_identifier req_dst_dir = os.path.join( @@ -179,18 +176,6 @@ def clear_resources(self, remove_all=False): shutil.rmtree(self.work_dir_main) def inspect_metadata(self): - """Inspected record data and try to - make sense (or go nuts if invalid) - - Invalid means: - * no print work type (i.e. 
C-stage, newspaper year) - * no language - * missing links between physical and logical structs - (otherwise viewer navigation and PDF outline - will be corrupt at this segment) - * no page images for OCR - """ - insp = ODEMMetadataInspecteur(self.mets_file, self.record.identifier, cfg=self.odem_configuration) @@ -321,7 +306,7 @@ def get_local_image_paths(self, image_local_dir=None) -> typing.List[str]: os.path.join(curr, the_file) for curr, _, the_files in os.walk(image_dir) for the_file in the_files - if has_image_ext(the_file) + if odem_image.has_image_ext(the_file) ]) # this shouldn't happen @@ -492,7 +477,7 @@ def export_data(self): export_mappings = df.map_contents(source_path_dir, work_dir, exp_map) for mapping in export_mappings: mapping.copy() - tmp_zip_path, size = ODEMProcess.compress_flat(os.path.dirname(work_dir), saf_name) + tmp_zip_path, size = ODEMProcessImpl.compress_flat(os.path.dirname(work_dir), saf_name) path_export_processing = dfx._move_to_tmp_file(tmp_zip_path, exp_dst) export_result = path_export_processing, size else: @@ -542,328 +527,328 @@ def statistics(self): return self._statistics_ocr -class ODEMWorkflowRunner: - """Wrap actual ODEM process execution""" - - def __init__(self, identifier, n_executors, - internal_logger, odem_workflow) -> None: - self.process_identifier = identifier - self.n_executors = n_executors - self.logger:logging.Logger = internal_logger - self.odem_workflow: ODEMWorkflow = odem_workflow - - def run(self): - input_data = self.odem_workflow.get_inputs() - the_outcomes = [(0, 0, 0, 0)] - if self.n_executors > 1: - the_outcomes = self.run_parallel(input_data) - else: - the_outcomes = self.run_sequential(input_data) - self.odem_workflow.foster_outputs() - return the_outcomes - - def run_parallel(self, input_data): - """Run workflow parallel with given executors""" - - n_inputs = len(input_data) - self.logger.info("[%s] %d inputs run_parallel by %d executors", - self.process_identifier, n_inputs, self.n_executors) - 
try: - with concurrent.futures.ThreadPoolExecutor( - max_workers=self.n_executors, - thread_name_prefix='odem.ocrd' - ) as executor: - return list(executor.map(self.odem_workflow.run, input_data)) - except (OSError, AttributeError) as err: - self.logger.error(err) - raise odem_c.ODEMException(f"ODEM parallel: {err.args[0]}") from err - - def run_sequential(self, input_data): - """run complete workflow plain sequential - For debugging or small machines - """ - - len_img = len(input_data) - estm_min = len_img * DEFAULT_RUNTIME_PAGE - self.logger.info("[%s] %d inputs run_sequential, estm. %dmin", - self.process_identifier, len_img, estm_min) - try: - outcomes = [self.odem_workflow.run(the_input) - for the_input in input_data] - return outcomes - except (OSError, AttributeError) as err: - self.logger.error(err) - raise odem_c.ODEMException(f"ODEM sequential: {err.args[0]}") from err - - -class ODEMWorkflow: - """Base Interface""" - - @staticmethod - def create( - workflow_type: OdemWorkflowProcessType | str, - odem: ODEMProcess, - ) -> ODEMWorkflow: - if (workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT - or workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT.value): - return ODEMTesseract(odem) - return OCRDPageParallel(odem) - - def get_inputs(self) -> typing.List: - """Collect all input data files for processing""" - - def run(self): - """Run actual implemented Workflow""" - - def foster_outputs(self): - """Work to do after pipeline has been run successfully - like additional format transformations or sanitizings - """ - - -class OCRDPageParallel(ODEMWorkflow): - """Use page parallel workflow""" - - def __init__(self, odem_process: ODEMProcess): - self.odem = odem_process - self.cfg = odem_process.odem_configuration - self.logger = odem_process.the_logger - - def get_inputs(self): - return self.odem.images_4_ocr - - def run(self, input_data): - """Create OCR Data""" - - ocr_log_conf = os.path.join( - odem_c.PROJECT_ROOT, self.cfg.get(odem_c.CFG_SEC_OCR, 
'ocrd_logging')) - - # Preprare workspace with makefile - (image_path, ident) = input_data - os.chdir(self.odem.work_dir_main) - file_name = os.path.basename(image_path) - file_id = file_name.split('.')[0] - page_workdir = os.path.join(self.odem.work_dir_main, file_id) - if os.path.exists(page_workdir): - shutil.rmtree(page_workdir, ignore_errors=True) - os.mkdir(page_workdir) - shutil.copy(ocr_log_conf, page_workdir) - os.chdir(page_workdir) - - # move and convert image data at once - processed_image_path = sanitize_image(image_path, page_workdir) - - # init ocr-d workspace - ocrd_workspace_setup(page_workdir, processed_image_path) - - # find model config for tesseract - model_config = self.odem.map_language_to_modelconfig(image_path) - - stored = 0 - mps = 0 - filesize_mb = 0 - # use original image rather than - # transformed one since PNG is - # usually 2-5 times larger than JPG - filestat = os.stat(image_path) - if filestat: - filesize_mb = filestat.st_size / 1048576 - (mps, dpi) = get_imageinfo(image_path) - - # how to identify data set? 
- if self.odem.record: - _ident = self.odem.process_identifier - else: - _ident = os.path.basename(self.odem.work_dir_main) - # OCR Generation - profiling = ('n.a.', 0) - - container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}' - container_memory_limit: str = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None) - container_user = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid()) - container_timeout: int = self.cfg.getint( - odem_c.CFG_SEC_OCR, - 'docker_container_timeout', - fallback=DEFAULT_DOCKER_CONTAINER_TIMEOUT - ) - base_image = self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_baseimage') - ocrd_process_list = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'ocrd_process_list') - tesseract_model_rtl: typing.List[str] = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'tesseract_model_rtl', fallback=odem_c.DEFAULT_RTL_MODELS) - ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(odem_c.CFG_SEC_OCR, odem_c.CFG_SEC_OCR_OPT_RES_VOL, fallback={}) - - if self.odem.local_mode: - container_name = os.path.basename(page_workdir) - try: - profiling = run_ocr_page( - page_workdir, - base_image, - container_memory_limit, - container_timeout, - container_name, - container_user, - ocrd_process_list, - model_config, - ocrd_resources_volumes, - tesseract_model_rtl, - ) - # will be unset in case of magic mocking for test - if profiling: - self.logger.info("[%s] '%s' in %s (%.1fMP, %dDPI, %.1fMB)", - _ident, profiling[1], profiling[0], mps, dpi, filesize_mb) - self.logger.info("[%s] run ocr creation in '%s'", - _ident, page_workdir) - stored = self._store_fulltext(page_workdir, image_path) - if stored: - self._preserve_log(page_workdir, ident) - except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc: - self.logger.error("[%s] image '%s' failed due to subprocess timeout: %s", - _ident, base_image, exc) - except Exception as plain_exc: - self.logger.error("[%s] generic exc '%s' for image 
'%s'", - _ident, plain_exc, base_image) - - os.chdir(self.odem.work_dir_main) - if self.cfg.getboolean(odem_c.CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False: - shutil.rmtree(page_workdir, ignore_errors=True) - return stored, 1, mps, filesize_mb - - def _preserve_log(self, work_subdir, image_ident): - """preserve ocrd.log for later analyzis as - sub directory identified by adopted local - identifier (local section of system OAI handle)""" - - _root_log = self.cfg.get('global', 'local_log_dir') - _local_ident = self.odem.process_identifier.replace('/', '_') - _local_ocr_log = os.path.join(_root_log, _local_ident) - if not os.path.exists(_local_ocr_log): - os.makedirs(_local_ocr_log, exist_ok=True) - - _org_log = os.path.join(work_subdir, 'ocrd.log') - if os.path.exists(_org_log): - _ts = time.strftime(ODEM_PAGE_TIME_FORMAT, time.localtime()) - _log_label = f'ocrd_odem_{self.odem.process_identifier}_{image_ident}_{_ts}.log' - _rebranded = os.path.join(work_subdir, _log_label) - os.rename(_org_log, _rebranded) - shutil.copy(_rebranded, _local_ocr_log) - else: - self.logger.warning("[%s] No ocrd.log in %s", - self.odem.process_identifier, work_subdir) - - def _store_fulltext(self, image_subdir, original_image_path) -> int: - """Move OCR Result from Workspace Subdir to export folder if exists""" - - # inspect possible ocr result dirs from within - # the OCR-D subordinate workspaces for each image - old_id = os.path.basename(image_subdir) - ocr_result_dir = os.path.join(image_subdir, LOCAL_OCRD_RESULT_DIR) - if not os.path.isdir(ocr_result_dir): - self.logger.info("[%s] no ocr results for '%s'", - self.odem.process_identifier, ocr_result_dir) - return 0 - ocrs = [os.path.join(ocr_result_dir, ocr) - for ocr in os.listdir(ocr_result_dir) - if str(ocr).endswith('.xml')] - self.logger.debug("[%s] %s ocr files", - self.odem.process_identifier, ocrs) - if ocrs and len(ocrs) == 1: - # propably need to rename - # since file now is like 'PAGE_01.xml' - renamed = 
os.path.join(ocr_result_dir, old_id + '.xml') - os.rename(ocrs[0], renamed) - # regular case: OAI Workflow - if not self.odem.local_mode: - # export to 'PAGE' dir - wd_fulltext = os.path.join(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR) - if not os.path.exists(wd_fulltext): - os.mkdir(wd_fulltext) - - # special case: local runnings for straight evaluations - else: - wd_fulltext = os.path.dirname(original_image_path) - - # final storage - target_path = os.path.join(wd_fulltext, old_id + '.xml') - shutil.copy(renamed, target_path) - return 1 - - def foster_outputs(self): - """In this case: - * move files from dir PAGE to FULLTEXT - * convert OCR format PAGE => ALTO - * some additional tag stripping - """ - - n_candidates = len(self.odem.images_4_ocr) - ocrd_data_files = odem_c.list_files(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR) - if len(ocrd_data_files) == 0 and n_candidates > 0: - raise odem_c.ODEMException(f"No OCR result for {n_candidates} candidates created!") - final_fulltext_dir = os.path.join(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT) - if not os.path.isdir(final_fulltext_dir): - os.makedirs(final_fulltext_dir, exist_ok=True) - self.ocr_files = convert_to_output_format(ocrd_data_files, final_fulltext_dir) - self.logger.info("[%s] converted '%d' files page-to-alto", - self.odem.process_identifier, len(self.ocr_files)) - strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') - for _ocr_file in self.ocr_files: - postprocess_ocr_file(_ocr_file, strip_tags) - - -class ODEMTesseract(ODEMWorkflow): - """Tesseract Runner""" - - def __init__(self, odem_process: ODEMProcess): - self.odem = odem_process - self.odem_configuration = odem_process.odem_configuration - self.logger = odem_process.the_logger - self.pipeline_configuration = None - - def get_inputs(self): - images_4_ocr = self.odem.images_4_ocr - n_total = len(images_4_ocr) - pipeline_cfg = self.read_pipeline_config() - input_data = [(img, i, n_total, self.logger, pipeline_cfg) - for 
i, img in enumerate(self.odem.images_4_ocr, start=1)] - return input_data - - def run(self, input_data): - - image_path = input_data[0][0] - pipeline_result = run_pipeline(input_data) - stored = pipeline_result is not None - mps = 0 - filesize_mb = 0 - filestat = os.stat(image_path) - if filestat: - filesize_mb = filestat.st_size / 1048576 - (mps, _) = get_imageinfo(image_path) - return stored, 1, mps, filesize_mb +# class ODEMWorkflowRunner: +# """Wrap actual ODEM process execution""" + +# def __init__(self, identifier, n_executors, +# internal_logger, odem_workflow) -> None: +# self.process_identifier = identifier +# self.n_executors = n_executors +# self.logger:logging.Logger = internal_logger +# self.odem_workflow: ODEMWorkflow = odem_workflow + +# def run(self): +# input_data = self.odem_workflow.get_inputs() +# the_outcomes = [(0, 0, 0, 0)] +# if self.n_executors > 1: +# the_outcomes = self.run_parallel(input_data) +# else: +# the_outcomes = self.run_sequential(input_data) +# self.odem_workflow.foster_outputs() +# return the_outcomes + +# def run_parallel(self, input_data): +# """Run workflow parallel with given executors""" + +# n_inputs = len(input_data) +# self.logger.info("[%s] %d inputs run_parallel by %d executors", +# self.process_identifier, n_inputs, self.n_executors) +# try: +# with concurrent.futures.ThreadPoolExecutor( +# max_workers=self.n_executors, +# thread_name_prefix='odem.ocrd' +# ) as executor: +# return list(executor.map(self.odem_workflow.run, input_data)) +# except (OSError, AttributeError) as err: +# self.logger.error(err) +# raise odem_c.ODEMException(f"ODEM parallel: {err.args[0]}") from err + +# def run_sequential(self, input_data): +# """run complete workflow plain sequential +# For debugging or small machines +# """ + +# len_img = len(input_data) +# estm_min = len_img * DEFAULT_RUNTIME_PAGE +# self.logger.info("[%s] %d inputs run_sequential, estm. 
%dmin", +# self.process_identifier, len_img, estm_min) +# try: +# outcomes = [self.odem_workflow.run(the_input) +# for the_input in input_data] +# return outcomes +# except (OSError, AttributeError) as err: +# self.logger.error(err) +# raise odem_c.ODEMException(f"ODEM sequential: {err.args[0]}") from err + + +# class ODEMWorkflow: +# """Base Interface""" + +# @staticmethod +# def create( +# workflow_type: OdemWorkflowProcessType | str, +# odem: ODEMProcess, +# ) -> ODEMWorkflow: +# if (workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT +# or workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT.value): +# return ODEMTesseract(odem) +# return OCRDPageParallel(odem) + +# def get_inputs(self) -> typing.List: +# """Collect all input data files for processing""" + +# def run(self): +# """Run actual implemented Workflow""" + +# def foster_outputs(self): +# """Work to do after pipeline has been run successfully +# like additional format transformations or sanitizings +# """ + + +# class OCRDPageParallel(ODEMWorkflow): +# """Use page parallel workflow""" + +# def __init__(self, odem_process: ODEMProcess): +# self.odem = odem_process +# self.cfg = odem_process.odem_configuration +# self.logger = odem_process.the_logger + +# def get_inputs(self): +# return self.odem.images_4_ocr + +# def run(self, input_data): +# """Create OCR Data""" + +# ocr_log_conf = os.path.join( +# odem_c.PROJECT_ROOT, self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_logging')) + +# # Preprare workspace with makefile +# (image_path, ident) = input_data +# os.chdir(self.odem.work_dir_main) +# file_name = os.path.basename(image_path) +# file_id = file_name.split('.')[0] +# page_workdir = os.path.join(self.odem.work_dir_main, file_id) +# if os.path.exists(page_workdir): +# shutil.rmtree(page_workdir, ignore_errors=True) +# os.mkdir(page_workdir) +# shutil.copy(ocr_log_conf, page_workdir) +# os.chdir(page_workdir) + +# # move and convert image data at once +# processed_image_path = sanitize_image(image_path, 
page_workdir) + +# # init ocr-d workspace +# ocrd_workspace_setup(page_workdir, processed_image_path) + +# # find model config for tesseract +# model_config = self.odem.map_language_to_modelconfig(image_path) + +# stored = 0 +# mps = 0 +# filesize_mb = 0 +# # use original image rather than +# # transformed one since PNG is +# # usually 2-5 times larger than JPG +# filestat = os.stat(image_path) +# if filestat: +# filesize_mb = filestat.st_size / 1048576 +# (mps, dpi) = get_imageinfo(image_path) + +# # how to identify data set? +# if self.odem.record: +# _ident = self.odem.process_identifier +# else: +# _ident = os.path.basename(self.odem.work_dir_main) +# # OCR Generation +# profiling = ('n.a.', 0) + +# container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}' +# container_memory_limit: str = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None) +# container_user = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid()) +# container_timeout: int = self.cfg.getint( +# odem_c.CFG_SEC_OCR, +# 'docker_container_timeout', +# fallback=DEFAULT_DOCKER_CONTAINER_TIMEOUT +# ) +# base_image = self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_baseimage') +# ocrd_process_list = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'ocrd_process_list') +# tesseract_model_rtl: typing.List[str] = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'tesseract_model_rtl', fallback=odem_c.DEFAULT_RTL_MODELS) +# ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(odem_c.CFG_SEC_OCR, odem_c.CFG_SEC_OCR_OPT_RES_VOL, fallback={}) + +# if self.odem.local_mode: +# container_name = os.path.basename(page_workdir) +# try: +# profiling = run_ocr_page( +# page_workdir, +# base_image, +# container_memory_limit, +# container_timeout, +# container_name, +# container_user, +# ocrd_process_list, +# model_config, +# ocrd_resources_volumes, +# tesseract_model_rtl, +# ) +# # will be unset in case of magic mocking for test +# if profiling: +# 
self.logger.info("[%s] '%s' in %s (%.1fMP, %dDPI, %.1fMB)", +# _ident, profiling[1], profiling[0], mps, dpi, filesize_mb) +# self.logger.info("[%s] run ocr creation in '%s'", +# _ident, page_workdir) +# stored = self._store_fulltext(page_workdir, image_path) +# if stored: +# self._preserve_log(page_workdir, ident) +# except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc: +# self.logger.error("[%s] image '%s' failed due to subprocess timeout: %s", +# _ident, base_image, exc) +# except Exception as plain_exc: +# self.logger.error("[%s] generic exc '%s' for image '%s'", +# _ident, plain_exc, base_image) + +# os.chdir(self.odem.work_dir_main) +# if self.cfg.getboolean(odem_c.CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False: +# shutil.rmtree(page_workdir, ignore_errors=True) +# return stored, 1, mps, filesize_mb + +# def _preserve_log(self, work_subdir, image_ident): +# """preserve ocrd.log for later analyzis as +# sub directory identified by adopted local +# identifier (local section of system OAI handle)""" + +# _root_log = self.cfg.get('global', 'local_log_dir') +# _local_ident = self.odem.process_identifier.replace('/', '_') +# _local_ocr_log = os.path.join(_root_log, _local_ident) +# if not os.path.exists(_local_ocr_log): +# os.makedirs(_local_ocr_log, exist_ok=True) + +# _org_log = os.path.join(work_subdir, 'ocrd.log') +# if os.path.exists(_org_log): +# _ts = time.strftime(ODEM_PAGE_TIME_FORMAT, time.localtime()) +# _log_label = f'ocrd_odem_{self.odem.process_identifier}_{image_ident}_{_ts}.log' +# _rebranded = os.path.join(work_subdir, _log_label) +# os.rename(_org_log, _rebranded) +# shutil.copy(_rebranded, _local_ocr_log) +# else: +# self.logger.warning("[%s] No ocrd.log in %s", +# self.odem.process_identifier, work_subdir) + +# def _store_fulltext(self, image_subdir, original_image_path) -> int: +# """Move OCR Result from Workspace Subdir to export folder if exists""" + +# # inspect possible ocr result dirs from within +# # the 
OCR-D subordinate workspaces for each image +# old_id = os.path.basename(image_subdir) +# ocr_result_dir = os.path.join(image_subdir, LOCAL_OCRD_RESULT_DIR) +# if not os.path.isdir(ocr_result_dir): +# self.logger.info("[%s] no ocr results for '%s'", +# self.odem.process_identifier, ocr_result_dir) +# return 0 +# ocrs = [os.path.join(ocr_result_dir, ocr) +# for ocr in os.listdir(ocr_result_dir) +# if str(ocr).endswith('.xml')] +# self.logger.debug("[%s] %s ocr files", +# self.odem.process_identifier, ocrs) +# if ocrs and len(ocrs) == 1: +# # propably need to rename +# # since file now is like 'PAGE_01.xml' +# renamed = os.path.join(ocr_result_dir, old_id + '.xml') +# os.rename(ocrs[0], renamed) +# # regular case: OAI Workflow +# if not self.odem.local_mode: +# # export to 'PAGE' dir +# wd_fulltext = os.path.join(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR) +# if not os.path.exists(wd_fulltext): +# os.mkdir(wd_fulltext) + +# # special case: local runnings for straight evaluations +# else: +# wd_fulltext = os.path.dirname(original_image_path) + +# # final storage +# target_path = os.path.join(wd_fulltext, old_id + '.xml') +# shutil.copy(renamed, target_path) +# return 1 + +# def foster_outputs(self): +# """In this case: +# * move files from dir PAGE to FULLTEXT +# * convert OCR format PAGE => ALTO +# * some additional tag stripping +# """ + +# n_candidates = len(self.odem.images_4_ocr) +# ocrd_data_files = odem_c.list_files(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR) +# if len(ocrd_data_files) == 0 and n_candidates > 0: +# raise odem_c.ODEMException(f"No OCR result for {n_candidates} candidates created!") +# final_fulltext_dir = os.path.join(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT) +# if not os.path.isdir(final_fulltext_dir): +# os.makedirs(final_fulltext_dir, exist_ok=True) +# self.ocr_files = convert_to_output_format(ocrd_data_files, final_fulltext_dir) +# self.logger.info("[%s] converted '%d' files page-to-alto", +# self.odem.process_identifier, 
len(self.ocr_files)) +# strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') +# for _ocr_file in self.ocr_files: +# postprocess_ocr_file(_ocr_file, strip_tags) + + +# class ODEMTesseract(ODEMWorkflow): +# """Tesseract Runner""" + +# def __init__(self, odem_process: ODEMProcess): +# self.odem = odem_process +# self.odem_configuration = odem_process.odem_configuration +# self.logger = odem_process.the_logger +# self.pipeline_configuration = None + +# def get_inputs(self): +# images_4_ocr = self.odem.images_4_ocr +# n_total = len(images_4_ocr) +# pipeline_cfg = self.read_pipeline_config() +# input_data = [(img, i, n_total, self.logger, pipeline_cfg) +# for i, img in enumerate(self.odem.images_4_ocr, start=1)] +# return input_data + +# def run(self, input_data): + +# image_path = input_data[0][0] +# pipeline_result = run_pipeline(input_data) +# stored = pipeline_result is not None +# mps = 0 +# filesize_mb = 0 +# filestat = os.stat(image_path) +# if filestat: +# filesize_mb = filestat.st_size / 1048576 +# (mps, _) = get_imageinfo(image_path) +# return stored, 1, mps, filesize_mb - def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser: - """Read pipeline configuration and replace - model_configs with known language data""" +# def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser: +# """Read pipeline configuration and replace +# model_configs with known language data""" - if self.pipeline_configuration is None: - if path_config is None: - if self.odem_configuration.has_option(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config'): - path_config = os.path.abspath(self.odem_configuration.get(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config')) - if not os.path.isfile(path_config): - raise odem_c.ODEMException(f"no ocr-pipeline conf {path_config} !") - pipe_cfg = configparser.ConfigParser() - pipe_cfg.read(path_config) - self.logger.info(f"use config '{path_config}'") - for sect in pipe_cfg.sections(): - if pipe_cfg.has_option(sect, 
'model_configs'): - known_langs = self.odem._statistics_ocr.get(odem_c.STATS_KEY_LANGS) - model_files = self.odem.language_modelconfig(known_langs) - models = model_files.replace('.traineddata','') - pipe_cfg.set(sect, 'model_configs', models) - if pipe_cfg.has_option(sect, STEP_MOVE_PATH_TARGET): - pipe_cfg.set(sect, STEP_MOVE_PATH_TARGET, f'{self.odem.work_dir_main}/FULLTEXT') - self.pipeline_configuration = pipe_cfg - return self.pipeline_configuration - - def foster_outputs(self): - self.ocr_files = odem_c.list_files(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT) - strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') - for _ocr_file in self.ocr_files: - postprocess_ocr_file(_ocr_file, strip_tags) +# if self.pipeline_configuration is None: +# if path_config is None: +# if self.odem_configuration.has_option(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config'): +# path_config = os.path.abspath(self.odem_configuration.get(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config')) +# if not os.path.isfile(path_config): +# raise odem_c.ODEMException(f"no ocr-pipeline conf {path_config} !") +# pipe_cfg = configparser.ConfigParser() +# pipe_cfg.read(path_config) +# self.logger.info(f"use config '{path_config}'") +# for sect in pipe_cfg.sections(): +# if pipe_cfg.has_option(sect, 'model_configs'): +# known_langs = self.odem._statistics_ocr.get(odem_c.STATS_KEY_LANGS) +# model_files = self.odem.language_modelconfig(known_langs) +# models = model_files.replace('.traineddata','') +# pipe_cfg.set(sect, 'model_configs', models) +# if pipe_cfg.has_option(sect, STEP_MOVE_PATH_TARGET): +# pipe_cfg.set(sect, STEP_MOVE_PATH_TARGET, f'{self.odem.work_dir_main}/FULLTEXT') +# self.pipeline_configuration = pipe_cfg +# return self.pipeline_configuration + +# def foster_outputs(self): +# self.ocr_files = odem_c.list_files(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT) +# strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') +# for _ocr_file in self.ocr_files: +# 
postprocess_ocr_file(_ocr_file, strip_tags) diff --git a/lib/odem/odem_commons.py b/lib/odem/odem_commons.py index 3426a8c..a0d1eeb 100644 --- a/lib/odem/odem_commons.py +++ b/lib/odem/odem_commons.py @@ -108,6 +108,34 @@ class OAIRecordExhaustedException(Exception): """Mark that given file contains no open records""" +class OdemWorkflowProcessType(str, Enum): + OCRD_PAGE_PARALLEL = "OCRD_PAGE_PARALLEL" + ODEM_TESSERACT = "ODEM_TESSERACT" + + +class OdemProcess: + """Basic Interface for ODEM""" + + def load(self): + """Load Data via OAI-PMH-API very LAZY + i.e. if not metadata file exists already in + configured workspace directory""" + + def inspect_metadata(self): + """Inspected record data and try to make sense (or go nuts if invalid) + Invalid means: + * no print work type (i.e. C-stage, newspaper year) + * no language + * missing links between physical and logical structs + (otherwise viewer navigation and PDF outline + will be corrupt at this segment) + * no page images for OCR + """ + + def export_data(self): + """re-do metadata and transform into output format""" + + def get_configparser(): """init plain configparser""" diff --git a/tests/conftest.py b/tests/conftest.py index 838511c..8ae721f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -72,7 +72,7 @@ def _module_fixture_123456789_27949(tmp_path_factory): (path_workdir / 'log').mkdir() _model_dir = prepare_tessdata_dir(path_workdir) record = df.OAIRecord('oai:dev.opendata.uni-halle.de:123456789/27949') - _oproc = odem.ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log') + _oproc = odem.ODEMProcessImpl(record, work_dir=path_workdir, log_dir=path_workdir / 'log') _oproc.odem_configuration = fixture_configuration() _oproc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') _oproc.ocr_files = [os.path.join(trgt_alto, a) diff --git a/tests/test_ocrd3_odem.py b/tests/test_ocrd3_odem.py 
index 58c4f45..289e519 100644 --- a/tests/test_ocrd3_odem.py +++ b/tests/test_ocrd3_odem.py @@ -39,7 +39,7 @@ def test_mapping_from_imagefilename(img_path, lang_str, tmp_path): work_2.mkdir() log_dir = tmp_path / 'log' log_dir.mkdir() - odem_processor = odem.ODEMProcess(None, work_dir=str(work_2)) + odem_processor = odem.ODEMProcessImpl(None, work_dir=str(work_2)) odem_processor.odem_configuration = fixture_configuration() _tess_dir = prepare_tessdata_dir(tmp_path) odem_processor.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, @@ -69,7 +69,7 @@ def test_exchange_language(img_path, langs, models, tmp_path): work_2.mkdir() log_dir = tmp_path / 'log' log_dir.mkdir() - odem_processor = odem.ODEMProcess(None, work_dir=str(work_2)) + odem_processor = odem.ODEMProcessImpl(None, work_dir=str(work_2)) odem_processor.odem_configuration = fixture_configuration() _tess_dir = prepare_tessdata_dir(tmp_path) odem_processor.odem_configuration.set( @@ -101,7 +101,7 @@ def test_enforce_language_and_model_mapping(tmp_path): work_2.mkdir() log_dir = tmp_path / 'log' log_dir.mkdir() - odem_processor = odem.ODEMProcess(None, work_dir=str(work_2)) + odem_processor = odem.ODEMProcessImpl(None, work_dir=str(work_2)) odem_processor.odem_configuration = fixture_configuration() _tess_dir = prepare_tessdata_dir(tmp_path) _kraken_dir = prepare_kraken_dir(tmp_path) @@ -149,7 +149,7 @@ def _side_effect(*args, **kwargs): _log_dir = _root_workdir / 'log' _log_dir.mkdir() _record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') - odem_proc = odem.ODEMProcess(_record, _workdir) + odem_proc = odem.ODEMProcessImpl(_record, _workdir) odem_proc.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(_workdir) odem_proc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') @@ -179,7 +179,7 @@ def test_odem_process_identifier_local_workdir(tmp_path): 
_workdir.mkdir(parents=True, exist_ok=True) # act - odem_proc = odem.ODEMProcess(None, _workdir) + odem_proc = odem.ODEMProcessImpl(None, _workdir) # assert assert odem_proc.process_identifier == 'foo_bar' @@ -193,7 +193,7 @@ def _fixture_odem_setup(tmp_path): work_2.mkdir() log_dir = tmp_path / 'log' log_dir.mkdir() - odem_processor = odem.ODEMProcess(None, work_dir=str(work_2)) + odem_processor = odem.ODEMProcessImpl(None, work_dir=str(work_2)) cfg = odem.get_configparser() cfg.read(os.path.join(PROJECT_ROOT_DIR, 'resources', 'odem.ocrd.tesseract.ini')) odem_processor.odem_configuration = cfg @@ -205,7 +205,7 @@ def _fixture_odem_setup(tmp_path): return odem_processor -def test_lang_mapping_missing_conf_error(odem_processor: odem.ODEMProcess): +def test_lang_mapping_missing_conf_error(odem_processor: odem.ODEMProcessImpl): """Ensure unknown language mapping caught properly""" # arrange @@ -219,7 +219,7 @@ def test_lang_mapping_missing_conf_error(odem_processor: odem.ODEMProcess): assert "'gop' mapping not found (languages: ['gop'])!" in err.value.args[0] -def test_lang_mapping_missing_lang_error(odem_processor: odem.ODEMProcess): +def test_lang_mapping_missing_lang_error(odem_processor: odem.ODEMProcessImpl): """Ensure cannot map dummy language 'yyy.traineddata'""" # arrange @@ -233,7 +233,7 @@ def test_lang_mapping_missing_lang_error(odem_processor: odem.ODEMProcess): assert "'yyy.traineddata' model config not found !" 
in err.value.args[0] -def test_module_fixture_one_integrated_ocr_in_mets(fixture_27949: odem.ODEMProcess): +def test_module_fixture_one_integrated_ocr_in_mets(fixture_27949: odem.ODEMProcessImpl): """Ensure, that generated final OCR files * are properly linked into original METS * contain required link data to images @@ -253,7 +253,7 @@ def test_module_fixture_one_integrated_ocr_in_mets(fixture_27949: odem.ODEMProce assert len(_phys_links[6].getchildren()) == 1 -def test_module_fixture_one_images_4_ocr_by_metadata(fixture_27949: odem.ODEMProcess): +def test_module_fixture_one_images_4_ocr_by_metadata(fixture_27949: odem.ODEMProcessImpl): """Ensure setting and filtering of images behavior. Record oai:dev.opendata.uni-halle.de:123456789/27949 @@ -264,7 +264,7 @@ def test_module_fixture_one_images_4_ocr_by_metadata(fixture_27949: odem.ODEMPro assert len(fixture_27949.images_4_ocr) == 4 -def test_fixture_one_postprocess_ocr_create_text_bundle(fixture_27949: odem.ODEMProcess): +def test_fixture_one_postprocess_ocr_create_text_bundle(fixture_27949: odem.ODEMProcessImpl): """Ensure text bundle data created and present with expected number of text rows Please note: @@ -310,7 +310,7 @@ def test_images_4_ocr_properly_filtered(tmp_path): _writer.write(b'0x00') _orig_mets = TEST_RES / '1981185920_44046.xml' shutil.copyfile(_orig_mets, _work_dir / '1981185920_44046.xml') - odem_processor = odem.ODEMProcess(_record, work_dir=_work_dir) + odem_processor = odem.ODEMProcessImpl(_record, work_dir=_work_dir) cfg = odem.get_configparser() cfg.read(os.path.join(PROJECT_ROOT_DIR, 'resources', 'odem.ocrd.tesseract.ini')) odem_processor.odem_configuration = cfg @@ -336,7 +336,7 @@ def test_no_catch_when_load_exc(mock_load, tmp_path): _record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') _work_dir = tmp_path / '1981185920_44046' _work_dir.mkdir() - odem_processor = odem.ODEMProcess(_record, work_dir=_work_dir) + odem_processor = odem.ODEMProcessImpl(_record, 
work_dir=_work_dir) cfg = odem.get_configparser() cfg.read(os.path.join(PROJECT_ROOT_DIR, 'resources', 'odem.ocrd.tesseract.ini')) odem_processor.odem_configuration = cfg @@ -366,7 +366,7 @@ def test_record_with_unknown_language(tmp_path): shutil.copyfile(orig_file, trgt_mets) (path_workdir / 'log').mkdir() record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/72977') - oproc = odem.ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log') + oproc = odem.ODEMProcessImpl(record, work_dir=path_workdir, log_dir=path_workdir / 'log') oproc.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(tmp_path) oproc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, @@ -407,7 +407,7 @@ def test_export_flat_zip(tmp_path): (path_workdir / 'log').mkdir() record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') - oproc = odem.ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log') + oproc = odem.ODEMProcessImpl(record, work_dir=path_workdir, log_dir=path_workdir / 'log') oproc.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(tmp_path) diff --git a/tests/test_odem_ocr_pipeline.py b/tests/test_odem_ocr_pipeline.py index e1d6bd2..96d33d3 100644 --- a/tests/test_odem_ocr_pipeline.py +++ b/tests/test_odem_ocr_pipeline.py @@ -82,7 +82,7 @@ def fixure_a_workspace(tmp_path): @pytest.fixture(name="my_pipeline") def _fixture_default_pipeline(a_workspace: Path): _record = df.OAIRecord('oai:urn:mwe') - odem_process = odem.ODEMProcess(_record, a_workspace) + odem_process = odem.ODEMProcessImpl(_record, a_workspace) odem_process.odem_configuration = ODEM_CFG odem_process._statistics_ocr['languages'] = ['ger'] odem_process.the_logger = odem.get_logger(a_workspace / 'log') @@ -119,7 +119,7 @@ def _fixture_custom_config_pipeline(a_workspace): conf_dir.mkdir() conf_file = TEST_RES / 'ocr_config_full.ini' assert os.path.isfile(conf_file) - odem_process = 
odem.ODEMProcess(df.OAIRecord('oai:urn_custom'), a_workspace) + odem_process = odem.ODEMProcessImpl(df.OAIRecord('oai:urn_custom'), a_workspace) odem_process.odem_configuration = ODEM_CFG odem_process._statistics_ocr['languages'] = ['ger', 'lat'] odem_process.the_logger = odem.get_logger(a_workspace / 'log') diff --git a/tests/test_odem_processing_mets.py b/tests/test_odem_processing_mets.py index 6397674..9382eef 100644 --- a/tests/test_odem_processing_mets.py +++ b/tests/test_odem_processing_mets.py @@ -279,7 +279,7 @@ def test_validate_mets_105054_schema_fails(tmp_path): _work_dir.mkdir() _orig_mets = TEST_RES / '1981185920_105054.xml' shutil.copyfile(_orig_mets, _work_dir / '1981185920_105054.xml') - odem_processor = odem.ODEMProcess(_record, work_dir=_work_dir) + odem_processor = odem.ODEMProcessImpl(_record, work_dir=_work_dir) odem_processor.odem_configuration = fixture_configuration() with pytest.raises(odem.ODEMException) as exec: odem_processor.validate_metadata() @@ -296,7 +296,7 @@ def test_validate_mets_37167_schema_fails(tmp_path): work_dir.mkdir() original_mets = TEST_RES / '1981185920_37167_01.xml' shutil.copyfile(original_mets, work_dir / '1981185920_37167.xml') - odem_processor = odem.ODEMProcess(rec, work_dir=work_dir) + odem_processor = odem.ODEMProcessImpl(rec, work_dir=work_dir) odem_processor.odem_configuration = fixture_configuration() with pytest.raises(odem.ODEMException) as exec: odem_processor.validate_metadata() @@ -320,7 +320,7 @@ def test_validate_mets_37167_ddb_fails(tmp_path): work_dir.mkdir() original_mets = TEST_RES / '1981185920_37167_02.xml' shutil.copyfile(original_mets, work_dir / '1981185920_37167.xml') - odem_processor = odem.ODEMProcess(rec, work_dir=work_dir) + odem_processor = odem.ODEMProcessImpl(rec, work_dir=work_dir) odem_processor.odem_configuration = fixture_configuration() odem_processor.odem_configuration.set('mets', 'ddb_validation', 'True') with pytest.raises(odem.ODEMException) as exec: @@ -345,7 +345,7 @@ def 
test_validate_mets_37167_finally_succeeds(tmp_path): work_dir.mkdir() original_mets = TEST_RES / '1981185920_37167_03.xml' shutil.copyfile(original_mets, work_dir / '1981185920_37167.xml') - odem_processor = odem.ODEMProcess(rec, work_dir=work_dir) + odem_processor = odem.ODEMProcessImpl(rec, work_dir=work_dir) odem_processor.odem_configuration = fixture_configuration() odem_processor.odem_configuration.set('mets', 'ddb_validation', 'True') diff --git a/tests/test_odem_processing_ocr_files.py b/tests/test_odem_processing_ocr_files.py index 7120e5d..2ef5acd 100644 --- a/tests/test_odem_processing_ocr_files.py +++ b/tests/test_odem_processing_ocr_files.py @@ -10,7 +10,7 @@ from .conftest import fixture_configuration -def test_module_fixture_one_integrated_ocr_files_fit_identifier(fixture_27949: odem.ODEMProcess): +def test_module_fixture_one_integrated_ocr_files_fit_identifier(fixture_27949: odem.ODEMProcessImpl): """Ensure ocr-file elements fit syntactically * proper fileName * proper PageId set @@ -30,7 +30,7 @@ def test_module_fixture_one_integrated_ocr_files_fit_identifier(fixture_27949: o assert not os.path.exists(tmp_path / 'FULLTEXT' / '00000007.xml') -def test_fixture_one_postprocess_ocr_files(fixture_27949: odem.ODEMProcess): +def test_fixture_one_postprocess_ocr_files(fixture_27949: odem.ODEMProcessImpl): """Ensure expected replacements done *even* when diacritics occour more several times in single word"""