diff --git a/cli_dir_local.py b/cli_dir_local.py index b953ea5..511d001 100644 --- a/cli_dir_local.py +++ b/cli_dir_local.py @@ -100,19 +100,20 @@ PROCESS.odem_configuration = CFG PROCESS.the_logger = LOGGER local_images = PROCESS.get_local_image_paths(image_local_dir=ROOT_PATH) - PROCESS._statistics_ocr[odem.STATS_KEY_N_PAGES] = len(local_images) - PROCESS._statistics_ocr[odem.STATS_KEY_N_OCRABLE] = 0 - PROCESS._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS - PROCESS.images_4_ocr = local_images + PROCESS.process_statistics[odem.STATS_KEY_N_PAGES] = len(local_images) + PROCESS.process_statistics[odem.STATS_KEY_N_OCRABLE] = 0 + PROCESS.process_statistics[odem.STATS_KEY_N_EXECS] = EXECUTORS + PROCESS.ocr_candidates = local_images # Type and Value change!!! # ODEMProcess.single_ocr() needs Tuple[str,str], in non-local # this is assigned to "PROCESS.images_4_ocr" in ODEMProcess.filter_images() # thats why we have to manually fit that requirement - PROCESS.images_4_ocr = list(zip(PROCESS.images_4_ocr, [pathlib.Path(i).stem for i in PROCESS.images_4_ocr])) + PROCESS.ocr_candidates = list(zip(PROCESS.ocr_candidates, + [pathlib.Path(i).stem for i in PROCESS.ocr_candidates])) PROCESS.run() PROCESS.the_logger.info("[%s] duration: %s (%s)", req_idn, - PROCESS.duration, PROCESS.statistics) + PROCESS.statistics['timedelta'], PROCESS.statistics) except Exception as exc: LOGGER.error("odem fails for '%s' after %s with: '%s'", - req_idn, PROCESS.duration, str(exc)) + req_idn, PROCESS.statistics['timedelta'], str(exc)) sys.exit(0) diff --git a/cli_mets_local.py b/cli_mets_local.py index f882eb2..7437613 100644 --- a/cli_mets_local.py +++ b/cli_mets_local.py @@ -143,7 +143,7 @@ if ocr_results is None or len(ocr_results) == 0: raise odem.ODEMException(f"OCR Process Runner error for {record.identifier}") odem_process.calculate_statistics_ocr(ocr_results) - odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS + odem_process.process_statistics[odem.STATS_KEY_N_EXECS] = 
EXECUTORS odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics) odem_process.link_ocr_files() odem_process.postprocess_ocr() @@ -165,9 +165,9 @@ odem_process.export_data() _mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}' odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier, - odem_process.duration, _mode, odem_process.statistics) + odem_process.statistics['timedelta'], _mode, odem_process.statistics) LOGGER.info("[%s] odem done in '%s' (%d executors)", - odem_process.process_identifier, odem_process.duration, EXECUTORS) + odem_process.process_identifier, odem_process.statistics['timedelta'], EXECUTORS) except odem.ODEMNoTypeForOCRException as type_unknown: LOGGER.warning("[%s] odem skips '%s'", odem_process.process_identifier, type_unknown.args[0]) @@ -179,5 +179,5 @@ LOGGER.error("[%s] odem fails with: '%s'", odem_process.process_identifier, _err_args) except RuntimeError as exc: LOGGER.error("odem fails for '%s' after %s with: '%s'", - record, odem_process.duration, str(exc)) + record, odem_process.statistics['timedelta'], str(exc)) sys.exit(1) diff --git a/cli_oai_client.py b/cli_oai_client.py index 13b756e..b382a01 100644 --- a/cli_oai_client.py +++ b/cli_oai_client.py @@ -12,7 +12,9 @@ import typing import requests + import digiflow as df +import digiflow.record as df_r import lib.odem as odem import lib.odem.monitoring as odem_rm @@ -32,7 +34,7 @@ def trnfrm(row): """callback function""" oai_id = row['IDENTIFIER'] - oai_record = df.OAIRecord(oai_id) + oai_record = df_r.Record(oai_id) return oai_record @@ -92,13 +94,13 @@ def _request_record(self): sys.exit(1) return response.json() - def get_record(self) -> df.OAIRecord: + def get_record(self) -> df_r.Record: """Return requested data as temporary OAI Record but store internally as plain dictionary""" self.record_data = self._request_record() - _oai_record = df.OAIRecord(self.record_data[odem.RECORD_IDENTIFIER]) + _oai_record = 
df_r.Record(self.record_data[odem.RECORD_IDENTIFIER]) return _oai_record def update(self, status, oai_urn, **kwargs): @@ -305,7 +307,7 @@ def oai_arg_parser(value): if ocr_results is None or len(ocr_results) == 0: raise odem.ODEMException(f"process run error: {record.identifier}") odem_process.calculate_statistics_ocr(ocr_results) - odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS + odem_process.process_statistics[odem.STATS_KEY_N_EXECS] = EXECUTORS _stats_ocr = odem_process.statistics odem_process.the_logger.info("[%s] %s", local_ident, _stats_ocr) wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True) @@ -331,16 +333,16 @@ def oai_arg_parser(value): # finale odem_process.clear_resources(remove_all=True) LOGGER.info("[%s] odem done in '%s' (%d executors)", - odem_process.process_identifier, odem_process.duration, EXECUTORS) + odem_process.process_identifier, odem_process.statistics['timedelta'], EXECUTORS) except odem.ODEMNoTypeForOCRException as type_unknown: - LOGGER.warning("[%s] odem skips '%s'", - odem_process.process_identifier, type_unknown.args) + LOGGER.warning("[%s] odem skips '%s'", + odem_process.process_identifier, type_unknown.args) err_dict = {'NoTypeForOCR': type_unknown.args[0]} CLIENT.update(status=odem.MARK_OCR_SKIP, oai_urn=rec_ident, **err_dict) odem_process.clear_resources(remove_all=True) except odem.ODEMNoImagesForOCRException as not_ocrable: - LOGGER.warning("[%s] odem no ocrables '%s'", - odem_process.process_identifier, not_ocrable.args) + LOGGER.warning("[%s] odem no ocrables '%s'", + odem_process.process_identifier, not_ocrable.args) err_dict = {'NoImagesForOCR': not_ocrable.args[0]} CLIENT.update(status=odem.MARK_OCR_SKIP, oai_urn=rec_ident, **err_dict) odem_process.clear_resources(remove_all=True) diff --git a/cli_oai_local.py b/cli_oai_local.py index 26c3dc3..8f1f89d 100644 --- a/cli_oai_local.py +++ b/cli_oai_local.py @@ -7,7 +7,7 @@ import shutil import sys -import digiflow 
as df +import digiflow.record as df_r import lib.odem as odem import lib.odem.monitoring as odem_rm @@ -24,7 +24,7 @@ ODEMProcessImpl, ODEMException, get_configparser, - get_logger, + get_logger, ) DEFAULT_EXECUTORS = 2 @@ -34,9 +34,9 @@ def trnfrm(row): oai_id = row[RECORD_IDENTIFIER] try: _info = ast.literal_eval(row[RECORD_INFO]) - except: + except (ValueError, SyntaxError): _info = row[RECORD_INFO] - _record = df.OAIRecord(oai_id,) + _record = df_r.Record(oai_id,) _record.info = _info return _record @@ -134,9 +134,9 @@ def trnfrm(row): DATA_FIELDS = CFG.getlist('global', 'data_fields') LOGGER.info("data fields: '%s'", DATA_FIELDS) LOGGER.info("use records from '%s'", OAI_RECORD_FILE) - handler = df.OAIRecordHandler( + handler = df_r.RecordHandler( OAI_RECORD_FILE, data_fields=DATA_FIELDS, transform_func=trnfrm) - record: df.OAIRecord = handler.next_record(state=MARK_OCR_OPEN) + record: df_r.Record = handler.next_record(state=MARK_OCR_OPEN) if not record: LOGGER.info("no open records in '%s', work done", OAI_RECORD_FILE) sys.exit(1) @@ -163,7 +163,7 @@ def wrap_save_record_state(status: str, urn, **kwargs): LOCAL_STORE_ROOT = CFG.get('global', 'local_store_root', fallback=None) if LOCAL_STORE_ROOT is not None: STORE_DIR = os.path.join(LOCAL_STORE_ROOT, local_ident) - STORE = df.LocalStore(STORE_DIR, req_dst_dir) + STORE = df_r.LocalStore(STORE_DIR, req_dst_dir) odem_process.store = STORE process_resource_monitor: odem_rm.ProcessResourceMonitor = odem_rm.ProcessResourceMonitor( odem_rm.from_configuration(CFG), @@ -189,7 +189,7 @@ def wrap_save_record_state(status: str, urn, **kwargs): if ocr_results is None or len(ocr_results) == 0: raise ODEMException(f"process run error: {record.identifier}") odem_process.calculate_statistics_ocr(ocr_results) - odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS + odem_process.process_statistics[odem.STATS_KEY_N_EXECS] = EXECUTORS odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics) # 
odem_process.link_ocr_files() # odem_process.postprocess_ocr() @@ -222,10 +222,10 @@ def wrap_save_record_state(status: str, urn, **kwargs): handler.save_record_state(record.identifier, MARK_OCR_DONE, INFO=_info) _mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}' odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier, - odem_process.duration, _mode, odem_process.statistics) + odem_process.statistics['timedelta'], _mode, odem_process.statistics) # finale LOGGER.info("[%s] odem done in '%s' (%d executors)", - odem_process.process_identifier, odem_process.duration, EXECUTORS) + odem_process.process_identifier, odem_process.statistics['timedelta'], EXECUTORS) except odem.ODEMNoTypeForOCRException as type_unknown: # we don't ocr this one LOGGER.warning("[%s] odem skips '%s'", @@ -241,6 +241,6 @@ def wrap_save_record_state(status: str, urn, **kwargs): handler.save_record_state(record.identifier, MARK_OCR_FAIL, INFO=f'{_err_args}') except RuntimeError as exc: LOGGER.error("odem fails for '%s' after %s with: '%s'", - record, odem_process.duration, str(exc)) + record, odem_process.statistics['timedelta'], str(exc)) handler.save_record_state(record.identifier, MARK_OCR_FAIL, INFO=f'{str(exc) : exc.args[0]}') sys.exit(1) diff --git a/lib/odem/ocr/ocr_workflow.py b/lib/odem/ocr/ocr_workflow.py index b236153..14d05b4 100644 --- a/lib/odem/ocr/ocr_workflow.py +++ b/lib/odem/ocr/ocr_workflow.py @@ -11,6 +11,8 @@ import time import typing +from pathlib import Path + import lib.odem.odem_commons as odem_c import lib.odem.ocr.ocrd as odem_ocrd import lib.odem.ocr.ocr_pipeline as odem_tess @@ -31,14 +33,15 @@ class ODEMWorkflowRunner: """Wrap actual ODEM process execution""" - def __init__(self, identifier, n_executors, + def __init__(self, identifier, n_executors, internal_logger, odem_workflow) -> None: self.process_identifier = identifier self.n_executors = n_executors - self.logger:logging.Logger = internal_logger + self.logger: 
logging.Logger = internal_logger self.odem_workflow: ODEMWorkflow = odem_workflow def run(self): + """Actual run wrapper""" input_data = self.odem_workflow.get_inputs() the_outcomes = [(0, 0, 0, 0)] if self.n_executors > 1: @@ -53,7 +56,7 @@ def run_parallel(self, input_data): n_inputs = len(input_data) self.logger.info("[%s] %d inputs run_parallel by %d executors", - self.process_identifier, n_inputs, self.n_executors) + self.process_identifier, n_inputs, self.n_executors) try: with concurrent.futures.ThreadPoolExecutor( max_workers=self.n_executors, @@ -72,7 +75,7 @@ def run_sequential(self, input_data): len_img = len(input_data) estm_min = len_img * DEFAULT_RUNTIME_PAGE self.logger.info("[%s] %d inputs run_sequential, estm. %dmin", - self.process_identifier, len_img, estm_min) + self.process_identifier, len_img, estm_min) try: outcomes = [self.odem_workflow.run(the_input) for the_input in input_data] @@ -90,15 +93,21 @@ def create( workflow_type: odem_c.OdemWorkflowProcessType | str, odem: odem_c.ODEMProcess, ) -> ODEMWorkflow: - if (workflow_type == odem_c.OdemWorkflowProcessType.ODEM_TESSERACT - or workflow_type == odem_c.OdemWorkflowProcessType.ODEM_TESSERACT.value): + """Create actual instance""" + if workflow_type == odem_c.OdemWorkflowProcessType.ODEM_TESSERACT: return ODEMTesseract(odem) return OCRDPageParallel(odem) + def __init__(self, odem_process: odem_c.ODEMProcess): + self.odem_process = odem_process + self.config = odem_process.odem_configuration + self.logger = odem_process.the_logger + self.ocr_files = [] + def get_inputs(self) -> typing.List: """Collect all input data files for processing""" - def run(self): + def run(self, _: typing.List): """Run actual implemented Workflow""" def foster_outputs(self): @@ -110,26 +119,21 @@ def foster_outputs(self): class OCRDPageParallel(ODEMWorkflow): """Use page parallel workflow""" - def __init__(self, odem_process: odem_c.ODEMProcess): - self.odem = odem_process - self.cfg = odem_process.odem_configuration 
- self.logger = odem_process.the_logger - def get_inputs(self): - return self.odem.images_4_ocr + return self.odem_process.ocr_candidates def run(self, input_data): """Create OCR Data""" ocr_log_conf = os.path.join( - odem_c.PROJECT_ROOT, self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_logging')) + odem_c.PROJECT_ROOT, self.config.get(odem_c.CFG_SEC_OCR, 'ocrd_logging')) # Preprare workspace with makefile (image_path, ident) = input_data - os.chdir(self.odem.work_dir_main) + os.chdir(self.odem_process.work_dir_root) file_name = os.path.basename(image_path) file_id = file_name.split('.')[0] - page_workdir = os.path.join(self.odem.work_dir_main, file_id) + page_workdir = os.path.join(self.odem_process.work_dir_root, file_id) if os.path.exists(page_workdir): shutil.rmtree(page_workdir, ignore_errors=True) os.mkdir(page_workdir) @@ -143,7 +147,7 @@ def run(self, input_data): odem_ocrd.ocrd_workspace_setup(page_workdir, processed_image_path) # find model config for tesseract - model_config = self.odem.map_language_to_modelconfig(image_path) + model_config = self.odem_process.map_language_to_modelconfig(image_path) stored = 0 mps = 0 @@ -157,27 +161,27 @@ def run(self, input_data): (mps, dpi) = odem_img.get_imageinfo(image_path) # how to identify data set? 
- if self.odem.record: - _ident = self.odem.process_identifier + if self.odem_process.record: + _ident = self.odem_process.process_identifier else: - _ident = os.path.basename(self.odem.work_dir_main) + _ident = os.path.basename(self.odem_process.work_dir_root) # OCR Generation profiling = ('n.a.', 0) - container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}' - container_memory_limit: str = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None) - container_user = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid()) - container_timeout: int = self.cfg.getint( + container_name: str = f'{self.odem_process.process_identifier}_{os.path.basename(page_workdir)}' + container_memory_limit: str = self.config.get(odem_c.CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None) + container_user = self.config.get(odem_c.CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid()) + container_timeout: int = self.config.getint( odem_c.CFG_SEC_OCR, 'docker_container_timeout', fallback=DEFAULT_DOCKER_CONTAINER_TIMEOUT ) - base_image = self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_baseimage') - ocrd_process_list = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'ocrd_process_list') - tesseract_model_rtl: typing.List[str] = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'tesseract_model_rtl', fallback=odem_c.DEFAULT_RTL_MODELS) - ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(odem_c.CFG_SEC_OCR, odem_c.CFG_SEC_OCR_OPT_RES_VOL, fallback={}) + base_image = self.config.get(odem_c.CFG_SEC_OCR, 'ocrd_baseimage') + ocrd_process_list = self.config.getlist(odem_c.CFG_SEC_OCR, 'ocrd_process_list') + tesseract_model_rtl: typing.List[str] = self.config.getlist(odem_c.CFG_SEC_OCR, 'tesseract_model_rtl', fallback=odem_c.DEFAULT_RTL_MODELS) + ocrd_resources_volumes: typing.Dict[str, str] = self.config.getdict(odem_c.CFG_SEC_OCR, odem_c.CFG_SEC_OCR_OPT_RES_VOL, fallback={}) - if self.odem.local_mode: + if 
self.odem_process.local_mode: container_name = os.path.basename(page_workdir) try: profiling = odem_ocrd.run_ocr_page( @@ -195,21 +199,21 @@ def run(self, input_data): # will be unset in case of magic mocking for test if profiling: self.logger.info("[%s] '%s' in %s (%.1fMP, %dDPI, %.1fMB)", - _ident, profiling[1], profiling[0], mps, dpi, filesize_mb) + _ident, profiling[1], profiling[0], mps, dpi, filesize_mb) self.logger.info("[%s] run ocr creation in '%s'", - _ident, page_workdir) + _ident, page_workdir) stored = self._store_fulltext(page_workdir, image_path) if stored: self._preserve_log(page_workdir, ident) except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc: self.logger.error("[%s] image '%s' failed due to subprocess timeout: %s", - _ident, base_image, exc) + _ident, base_image, exc) except Exception as plain_exc: self.logger.error("[%s] generic exc '%s' for image '%s'", - _ident, plain_exc, base_image) + _ident, plain_exc, base_image) - os.chdir(self.odem.work_dir_main) - if self.cfg.getboolean(odem_c.CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False: + os.chdir(self.odem_process.work_dir_root) + if self.config.getboolean(odem_c.CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False: shutil.rmtree(page_workdir, ignore_errors=True) return stored, 1, mps, filesize_mb @@ -218,8 +222,8 @@ def _preserve_log(self, work_subdir, image_ident): sub directory identified by adopted local identifier (local section of system OAI handle)""" - _root_log = self.cfg.get('global', 'local_log_dir') - _local_ident = self.odem.process_identifier.replace('/', '_') + _root_log = self.config.get('global', 'local_log_dir') + _local_ident = self.odem_process.process_identifier.replace('/', '_') _local_ocr_log = os.path.join(_root_log, _local_ident) if not os.path.exists(_local_ocr_log): os.makedirs(_local_ocr_log, exist_ok=True) @@ -233,7 +237,7 @@ def _preserve_log(self, work_subdir, image_ident): shutil.copy(_rebranded, _local_ocr_log) else: 
self.logger.warning("[%s] No ocrd.log in %s", - self.odem.process_identifier, work_subdir) + self.odem_process.process_identifier, work_subdir) def _store_fulltext(self, image_subdir, original_image_path) -> int: """Move OCR Result from Workspace Subdir to export folder if exists""" @@ -244,22 +248,22 @@ def _store_fulltext(self, image_subdir, original_image_path) -> int: ocr_result_dir = os.path.join(image_subdir, LOCAL_OCRD_RESULT_DIR) if not os.path.isdir(ocr_result_dir): self.logger.info("[%s] no ocr results for '%s'", - self.odem.process_identifier, ocr_result_dir) + self.odem_process.process_identifier, ocr_result_dir) return 0 ocrs = [os.path.join(ocr_result_dir, ocr) for ocr in os.listdir(ocr_result_dir) if str(ocr).endswith('.xml')] self.logger.debug("[%s] %s ocr files", - self.odem.process_identifier, ocrs) + self.odem_process.process_identifier, ocrs) if ocrs and len(ocrs) == 1: # propably need to rename # since file now is like 'PAGE_01.xml' renamed = os.path.join(ocr_result_dir, old_id + '.xml') os.rename(ocrs[0], renamed) # regular case: OAI Workflow - if not self.odem.local_mode: + if not self.odem_process.local_mode: # export to 'PAGE' dir - wd_fulltext = os.path.join(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR) + wd_fulltext = os.path.join(self.odem_process.work_dir_root, LOCAL_OCRD_RESULT_DIR) if not os.path.exists(wd_fulltext): os.mkdir(wd_fulltext) @@ -279,17 +283,18 @@ def foster_outputs(self): * some additional tag stripping """ - n_candidates = len(self.odem.images_4_ocr) - ocrd_data_files = odem_c.list_files(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR) + n_candidates = len(self.odem_process.ocr_candidates) + list_from_dir = Path(self.odem_process.work_dir_root) / LOCAL_OCRD_RESULT_DIR + ocrd_data_files = odem_c.list_files(list_from_dir) if len(ocrd_data_files) == 0 and n_candidates > 0: raise odem_c.ODEMException(f"No OCR result for {n_candidates} candidates created!") - final_fulltext_dir = os.path.join(self.odem.work_dir_main, 
odem_c.FILEGROUP_FULLTEXT) + final_fulltext_dir = os.path.join(self.odem_process.work_dir_root, odem_c.FILEGROUP_FULLTEXT) if not os.path.isdir(final_fulltext_dir): os.makedirs(final_fulltext_dir, exist_ok=True) self.ocr_files = odem_fmt.convert_to_output_format(ocrd_data_files, final_fulltext_dir) self.logger.info("[%s] converted '%d' files page-to-alto", - self.odem.process_identifier, len(self.ocr_files)) - strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') + self.odem_process.process_identifier, len(self.ocr_files)) + strip_tags = self.config.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') for _ocr_file in self.ocr_files: odem_fmt.postprocess_ocr_file(_ocr_file, strip_tags) @@ -298,17 +303,15 @@ class ODEMTesseract(ODEMWorkflow): """Tesseract Runner""" def __init__(self, odem_process: odem_c.ODEMProcess): - self.odem = odem_process - self.odem_configuration = odem_process.odem_configuration - self.logger = odem_process.the_logger + super().__init__(odem_process) self.pipeline_configuration = None def get_inputs(self): - images_4_ocr = self.odem.images_4_ocr + images_4_ocr = self.odem_process.ocr_candidates n_total = len(images_4_ocr) pipeline_cfg = self.read_pipeline_config() input_data = [(img, i, n_total, self.logger, pipeline_cfg) - for i, img in enumerate(self.odem.images_4_ocr, start=1)] + for i, img in enumerate(self.odem_process.ocr_candidates, start=1)] return input_data def run(self, input_data): @@ -323,15 +326,15 @@ def run(self, input_data): filesize_mb = filestat.st_size / 1048576 (mps, _) = odem_img.get_imageinfo(image_path) return stored, 1, mps, filesize_mb - + def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser: """Read pipeline configuration and replace model_configs with known language data""" - + if self.pipeline_configuration is None: if path_config is None: - if self.odem_configuration.has_option(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config'): - path_config = 
os.path.abspath(self.odem_configuration.get(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config')) + if self.config.has_option(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config'): + path_config = os.path.abspath(self.config.get(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config')) if not os.path.isfile(path_config): raise odem_c.ODEMException(f"no ocr-pipeline conf {path_config} !") pipe_cfg = configparser.ConfigParser() @@ -339,17 +342,19 @@ def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser: self.logger.info(f"use config '{path_config}'") for sect in pipe_cfg.sections(): if pipe_cfg.has_option(sect, 'model_configs'): - known_langs = self.odem._statistics_ocr.get(odem_c.STATS_KEY_LANGS) - model_files = self.odem.language_modelconfig(known_langs) - models = model_files.replace('.traineddata','') + known_langs = self.odem_process.process_statistics.get(odem_c.STATS_KEY_LANGS) + model_files = self.odem_process.language_modelconfig(known_langs) + models = model_files.replace('.traineddata', '') pipe_cfg.set(sect, 'model_configs', models) if pipe_cfg.has_option(sect, odem_tess.STEP_MOVE_PATH_TARGET): - pipe_cfg.set(sect, odem_tess.STEP_MOVE_PATH_TARGET, f'{self.odem.work_dir_main}/FULLTEXT') + move_target = f'{self.odem_process.work_dir_root}/FULLTEXT' + pipe_cfg.set(sect, odem_tess.STEP_MOVE_PATH_TARGET, move_target) self.pipeline_configuration = pipe_cfg return self.pipeline_configuration def foster_outputs(self): - self.ocr_files = odem_c.list_files(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT) - strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') + list_from_dir = Path(self.odem_process.work_dir_root) / odem_c.FILEGROUP_FULLTEXT + self.ocr_files = odem_c.list_files(list_from_dir) + strip_tags = self.config.getlist(odem_c.CFG_SEC_OCR, 'strip_tags') for _ocr_file in self.ocr_files: odem_fmt.postprocess_ocr_file(_ocr_file, strip_tags) diff --git a/lib/odem/ocrd3_odem.py b/lib/odem/ocrd3_odem.py index b947d14..96ccc35 100644 --- a/lib/odem/ocrd3_odem.py 
+++ b/lib/odem/ocrd3_odem.py @@ -3,10 +3,7 @@ from __future__ import annotations -import configparser import datetime -import typing -import logging import os import shutil import socket @@ -21,6 +18,7 @@ import digiflow as df import digiflow.digiflow_export as dfx import digiflow.digiflow_metadata as dfm +import digiflow.record as df_r import lib.odem.odem_commons as odem_c import lib.odem.processing.image as odem_image @@ -40,7 +38,7 @@ DEFAULT_LANG = 'ger' -class ODEMProcessImpl(odem_c.OdemProcess): +class ODEMProcessImpl(odem_c.ODEMProcess): """Create OCR for OAI Records. Runs both wiht OAIRecord or local path as input. @@ -53,7 +51,8 @@ class ODEMProcessImpl(odem_c.OdemProcess): for the underlying OCR-Engine Tesseract-OCR. """ - def __init__(self, record: df.OAIRecord, work_dir, executors=2, log_dir=None, logger=None): + def __init__(self, record: df_r.Record, work_dir, + log_dir=None, logger=None, configuration=None): """Create new ODEM Process. Args: record (OAIRecord): OAI Record dataset @@ -64,57 +63,31 @@ def __init__(self, record: df.OAIRecord, work_dir, executors=2, log_dir=None, lo Defaults to None. 
""" - self.record = record - self.work_dir_main = work_dir + super().__init__(configuration, work_dir_root=work_dir, + the_logger=logger, log_dir=log_dir, record=record) self.digi_type = None self.mods_identifier = None self.local_mode = record is None - self.process_identifier = None if self.local_mode: self.process_identifier = os.path.basename(work_dir) if record is not None and record.local_identifier is not None: self.process_identifier = record.local_identifier self.export_dir = None - self.the_logger: logging.Logger = None - self.odem_configuration: configparser.ConfigParser = None self.store: df.LocalStore = None - self.images_4_ocr: typing.List = [] # List[str] | List[Tuple[str, str]] self.ocr_files = [] - self.ocr_function = None - self.ocr_input: typing.List = [] - self._statistics_ocr = {'execs': executors} self._process_start = time.time() - if logger is not None: - self.the_logger = logger - elif log_dir is not None and os.path.exists(log_dir): - self._init_logger(log_dir) - self.mets_file = os.path.join( - work_dir, os.path.basename(work_dir) + '.xml') - - def _init_logger(self, log_dir): - today = time.strftime('%Y-%m-%d', time.localtime()) - if not log_dir: - log_parent = os.path.dirname(os.path.dirname(self.work_dir_main)) - if not os.access(log_parent, os.W_OK): - raise RuntimeError(f"cant store log files at invalid {log_dir}") - log_dir = os.path.join(log_parent, 'log') - os.makedirs(log_dir, exist_ok=True) - logfile_name = os.path.join( - log_dir, f"odem_{today}.log") - conf_logname = {'logname': logfile_name} - conf_file_location = os.path.join( - odem_c.PROJECT_ROOT, 'resources', 'odem_logging.ini') - logging.config.fileConfig(conf_file_location, defaults=conf_logname) - self.the_logger = logging.getLogger('odem') + # self.mets_file = os.path.join( + # work_dir, os.path.basename(work_dir) + '.xml') def load(self): request_identifier = self.record.identifier local_identifier = self.record.local_identifier req_dst_dir = os.path.join( - 
os.path.dirname(self.work_dir_main), local_identifier) + os.path.dirname(self.work_dir_root), local_identifier) if not os.path.exists(req_dst_dir): os.makedirs(req_dst_dir, exist_ok=True) - req_dst = os.path.join(req_dst_dir, local_identifier + '.xml') + # req_dst = os.path.join(req_dst_dir, local_identifier + '.xml') + req_dst = self.mets_file_path self.the_logger.debug("[%s] download %s to %s", self.process_identifier, request_identifier, req_dst) base_url = self.odem_configuration.get('global', 'base_url') @@ -122,7 +95,7 @@ def load(self): loader = df.OAILoader(req_dst_dir, base_url=base_url, post_oai=dfm.extract_mets) loader.store = self.store loader.load(request_identifier, local_dst=req_dst) - except df.OAILoadClientError as load_err: + except df.ClientError as load_err: raise odem_c.ODEMException(load_err.args[0]) from load_err except RuntimeError as _err: raise odem_c.ODEMException(_err.args[0]) from _err @@ -137,31 +110,31 @@ def clear_resources(self, remove_all=False): sweeper.sweep() if remove_all: shutil.rmtree(self.store.dir_store_root) - if os.path.exists(self.work_dir_main): - shutil.rmtree(self.work_dir_main) + if os.path.exists(self.work_dir_root): + shutil.rmtree(self.work_dir_root) def inspect_metadata(self): - insp = ODEMMetadataInspecteur(self.mets_file, + insp = ODEMMetadataInspecteur(self.mets_file_path, self.record.identifier, cfg=self.odem_configuration) try: the_report = insp.metadata_report() self.digi_type = the_report.type - self.images_4_ocr = insp.image_pairs + self.ocr_candidates = insp.image_pairs except RuntimeError as mde: raise odem_c.ODEMException(f"{mde.args[0]}") from mde self.mods_identifier = insp.mods_record_identifier for t, ident in insp.identifiers.items(): - self._statistics_ocr[t] = ident - self._statistics_ocr['type'] = insp.type - self._statistics_ocr[odem_c.STATS_KEY_LANGS] = insp.languages - self._statistics_ocr['n_images_pages'] = insp.n_images_pages - self._statistics_ocr['n_images_ocrable'] = 
insp.n_images_ocrable + self.process_statistics[t] = ident + self.process_statistics['type'] = insp.type + self.process_statistics[odem_c.STATS_KEY_LANGS] = insp.languages + self.process_statistics['n_images_pages'] = insp.n_images_pages + self.process_statistics['n_images_ocrable'] = insp.n_images_ocrable _ratio = insp.n_images_ocrable / insp.n_images_pages * 100 self.the_logger.info("[%s] %04d (%.2f%%) images used for OCR (total: %04d)", self.process_identifier, insp.n_images_ocrable, _ratio, insp.n_images_pages) - self._statistics_ocr['host'] = socket.gethostname() + self.process_statistics['host'] = socket.gethostname() def clear_existing_entries(self): """Clear METS/MODS of configured file groups""" @@ -170,7 +143,7 @@ def clear_existing_entries(self): _blacklisted = self.odem_configuration.getlist('mets', 'blacklist_file_groups') _ident = self.process_identifier self.the_logger.info("[%s] remove %s", _ident, _blacklisted) - _proc = df.MetsProcessor(self.mets_file) + _proc = df.MetsProcessor(self.mets_file_path) _proc.clear_filegroups(_blacklisted) _proc.write() @@ -190,7 +163,7 @@ def language_modelconfig(self, languages=None) -> str: self.the_logger.info("[%s] inspect languages '%s'", self.process_identifier, languages) if languages is None: - languages = self._statistics_ocr.get(odem_c.STATS_KEY_LANGS) + languages = self.process_statistics.get(odem_c.STATS_KEY_LANGS) for lang in languages: model_entry = model_mappings.get(lang) if not model_entry: @@ -201,7 +174,7 @@ def language_modelconfig(self, languages=None) -> str: else: raise odem_c.ODEMException(f"'{model}' model config not found !") _model_conf = '+'.join(_models) if self.odem_configuration.getboolean(odem_c.CFG_SEC_OCR, "model_combinable", fallback=True) else _models[0] - self._statistics_ocr[odem_c.STATS_KEY_MODELS] = _model_conf + self.process_statistics[odem_c.STATS_KEY_MODELS] = _model_conf self.the_logger.info("[%s] map languages '%s' => '%s'", self.process_identifier, languages, _model_conf) 
return _model_conf @@ -260,7 +233,7 @@ def get_local_image_paths(self, image_local_dir=None) -> typing.List[str]: i.e., pre-existing evaluation image data """ - image_dir = os.path.join(self.work_dir_main, 'MAX') + image_dir = os.path.join(self.work_dir_root, 'MAX') if image_local_dir: if not os.path.isdir(image_local_dir): raise RuntimeError(f"invalid path: {image_local_dir}!") @@ -288,13 +261,13 @@ def set_local_images(self): images and original page urn """ _images_of_interest = [] - _local_max_dir = os.path.join(self.work_dir_main, 'MAX') - for _img, _urn in self.images_4_ocr: + _local_max_dir = os.path.join(self.work_dir_root, 'MAX') + for _img, _urn in self.ocr_candidates: _the_file = os.path.join(_local_max_dir, _img) if not os.path.exists(_the_file): raise odem_c.ODEMException(f"[{self.process_identifier}] missing {_the_file}!") _images_of_interest.append((_the_file, _urn)) - self.images_4_ocr = _images_of_interest + self.ocr_candidates = _images_of_interest def calculate_statistics_ocr(self, outcomes: typing.List): """Calculate and aggregate runtime stats""" @@ -303,17 +276,18 @@ def calculate_statistics_ocr(self, outcomes: typing.List): _mod_val_counts = np.unique(_total_mps, return_counts=True) mps = list(zip(*_mod_val_counts)) total_mb = sum([e[3] for e in outcomes if e[0] == 1]) - self._statistics_ocr[odem_c.STATS_KEY_N_OCR] = n_ocr - self._statistics_ocr[odem_c.STATS_KEY_MB] = round(total_mb, 2) - self._statistics_ocr[odem_c.STATS_KEY_MPS] = mps + self.process_statistics[odem_c.STATS_KEY_N_OCR] = n_ocr + self.process_statistics[odem_c.STATS_KEY_MB] = round(total_mb, 2) + self.process_statistics[odem_c.STATS_KEY_MPS] = mps def link_ocr_files(self) -> int: """Prepare and link OCR-data""" - self.ocr_files = odem_c.list_files(self.work_dir_main, odem_c.FILEGROUP_FULLTEXT) + list_from_dir = Path(self.work_dir_root) / odem_c.FILEGROUP_FULLTEXT + self.ocr_files = odem_c.list_files(list_from_dir) if not self.ocr_files: return 0 - proc = 
df.MetsProcessor(self.mets_file) + proc = df.MetsProcessor(self.mets_file_path) _n_linked_ocr = integrate_ocr_file(proc.tree, self.ocr_files) proc.write() return _n_linked_ocr @@ -325,12 +299,12 @@ def create_text_bundle_data(self): txt_lines = extract_text_content(self.ocr_files) txt_content = '\n'.join(txt_lines) - _out_path = os.path.join(self.work_dir_main, f'{self.mods_identifier}.pdf.txt') + _out_path = os.path.join(self.work_dir_root, f'{self.mods_identifier}.pdf.txt') with open(_out_path, mode='w', encoding='UTF-8') as _writer: _writer.write(txt_content) self.the_logger.info("[%s] harvested %d lines from %d ocr files to %s", self.process_identifier, len(txt_lines), len(self.ocr_files), _out_path) - self._statistics_ocr['n_text_lines'] = len(txt_lines) + self.process_statistics['n_text_lines'] = len(txt_lines) def create_pdf(self): """Forward PDF-creation to Derivans""" @@ -350,7 +324,7 @@ def create_pdf(self): derivans_image = self.odem_configuration.get('derivans', 'derivans_image', fallback=None) path_logging = self.odem_configuration.get('derivans', 'derivans_logdir', fallback=None) derivans: df.BaseDerivansManager = df.BaseDerivansManager.create( - self.mets_file, + self.mets_file_path, container_image_name=derivans_image, path_binary=path_bin, path_configuration=path_cfg, @@ -372,7 +346,7 @@ def create_pdf(self): def delete_before_export(self, folders): """delete folders given by list""" - work = self.work_dir_main + work = self.work_dir_root self.the_logger.info( "[%s] delete folders: %s", self.process_identifier, folders) for folder in folders: @@ -385,7 +359,7 @@ def delete_before_export(self, folders): def postprocess_mets(self): """wrap work related to processing METS/MODS""" - postprocess_mets(self.mets_file, self.odem_configuration) + postprocess_mets(self.mets_file_path, self.odem_configuration) def validate_metadata(self): """Forward (optional) validation concerning @@ -402,7 +376,7 @@ def validate_metadata(self): # dtype = 'Aa' # if 'pica' in 
self.record.info: # dtype = self.record.info['pica'] - return validate(self.mets_file, validate_ddb=check_ddb, + return validate(self.mets_file_path, validate_ddb=check_ddb, digi_type=self.digi_type, ddb_ignores=ignore_ddb) def export_data(self): @@ -420,11 +394,11 @@ def export_data(self): # created due OCR and do more specific mapping, though exp_map = {k: v for k, v in exp_map.items() if v != 'mets.xml'} if export_mets: - exp_map[os.path.basename(self.mets_file)] = 'mets.xml' + exp_map[os.path.basename(self.mets_file_path)] = 'mets.xml' saf_name = self.mods_identifier if export_format == odem_c.ExportFormat.SAF: export_result = df.export_data_from( - self.mets_file, + self.mets_file_path, exp_col, saf_final_name=saf_name, export_dst=exp_dst, @@ -433,7 +407,7 @@ def export_data(self): ) elif export_format == odem_c.ExportFormat.FLAT_ZIP: prefix = 'opendata-working-' - source_path_dir = os.path.dirname(self.mets_file) + source_path_dir = os.path.dirname(self.mets_file_path) tmp_dir = tempfile.gettempdir() if exp_tmp: tmp_dir = exp_tmp @@ -443,7 +417,7 @@ def export_data(self): for mapping in export_mappings: mapping.copy() tmp_zip_path, size = ODEMProcessImpl.compress_flat(os.path.dirname(work_dir), saf_name) - path_export_processing = dfx._move_to_tmp_file(tmp_zip_path, exp_dst) + path_export_processing = dfx.move_to_tmp_file(tmp_zip_path, exp_dst) export_result = path_export_processing, size else: raise odem_c.ODEMException(f'Unsupported export format: {export_format}') @@ -464,6 +438,7 @@ def export_data(self): @classmethod def compress_flat(cls, work_dir, archive_name): + """Create flat ZIP file (instead of SAF with items)""" zip_file_path = os.path.join(os.path.dirname(work_dir), archive_name) + '.zip' previous_dir = os.getcwd() os.chdir(os.path.join(work_dir, archive_name)) @@ -475,12 +450,16 @@ def compress_flat(cls, work_dir, archive_name): return zip_file_path, f"{zip_size}MiB" @property - def duration(self): - """Get current duration of ODEMProcess. 
- Most likely at the final end to get an idea - how much the whole process takes.""" + def mets_file_path(self) -> Path: + """Get actual METS/MODS file from work_dir""" + mets_file = f"{os.path.basename(self.work_dir_root)}.xml" + return Path(self.work_dir_root) / mets_file - return datetime.timedelta(seconds=round(time.time() - self._process_start)) + @mets_file_path.setter + def mets_file_path(self, mets_path): + """Set enclosed METS/MODS data for testing purposes""" + mets_dir = os.path.dirname(mets_path) + self.work_dir_root = mets_dir @property def statistics(self): @@ -488,5 +467,6 @@ def statistics(self): with execution duration updated each call by requesting it's string representation""" - self._statistics_ocr['timedelta'] = f'{self.duration}' - return self._statistics_ocr + current_duration = datetime.timedelta(seconds=round(time.time() - self._process_start)) + self.process_statistics['timedelta'] = f'{current_duration}' + return self.process_statistics diff --git a/lib/odem/odem_commons.py b/lib/odem/odem_commons.py index a0d1eeb..e7f1370 100644 --- a/lib/odem/odem_commons.py +++ b/lib/odem/odem_commons.py @@ -2,6 +2,7 @@ import configparser import logging +import logging.config import os import socket import time @@ -11,7 +12,7 @@ from pathlib import Path import ocrd_utils -import digiflow as df +import digiflow.record as df_r # # ODEM States @@ -30,6 +31,7 @@ class ExportFormat(str, Enum): + """Set of accepted export formats""" SAF = 'SAF' FLAT_ZIP = 'FLAT_ZIP' @@ -109,13 +111,47 @@ class OAIRecordExhaustedException(Exception): class OdemWorkflowProcessType(str, Enum): + """Accepted values for process types""" OCRD_PAGE_PARALLEL = "OCRD_PAGE_PARALLEL" ODEM_TESSERACT = "ODEM_TESSERACT" -class OdemProcess: +class ODEMProcess: """Basic Interface for ODEM""" + def __init__(self, + configuration: configparser.ConfigParser, + work_dir_root: Path, + the_logger: logging.Logger, + log_dir=None, + record: df_r.Record = None): + self.odem_configuration =
configuration + self.work_dir_root = work_dir_root + self.record = record + self.process_identifier = None + self.the_logger = the_logger + self.process_statistics = {} + self.ocr_candidates = [] + if the_logger is not None: + self.the_logger = the_logger + if log_dir is not None and os.path.exists(log_dir): + self._init_logger(log_dir) + + def _init_logger(self, log_dir): + today = time.strftime('%Y-%m-%d', time.localtime()) + if not log_dir: + log_parent = os.path.dirname(os.path.dirname(self.work_dir_root)) + if not os.access(log_parent, os.W_OK): + raise RuntimeError(f"cant store log files at invalid {log_dir}") + log_dir = os.path.join(log_parent, 'log') + os.makedirs(log_dir, exist_ok=True) + logfile_name = os.path.join( + log_dir, f"odem_{today}.log") + conf_logname = {'logname': logfile_name} + conf_file_location = os.path.join(PROJECT_ROOT, 'resources', 'odem_logging.ini') + logging.config.fileConfig(conf_file_location, defaults=conf_logname) + self.the_logger = logging.getLogger('odem') + def load(self): """Load Data via OAI-PMH-API very LAZY i.e. if not metadata file exists already in @@ -131,7 +167,7 @@ def inspect_metadata(self): will be corrupt at this segment) * no page images for OCR """ - + def export_data(self): """re-do metadata and transform into output format""" @@ -165,7 +201,7 @@ def get_logger(log_dir, log_infix=None, path_log_config=None) -> logging.Logger: in log_dir. 
Log output from "page-to-alto" set to disable WARNING: "PAGE-XML has Border but no PrintSpace - Margins will be empty" - + please note: call of OCR-D initLogging() required, otherwise something like this pops up: @@ -215,7 +251,7 @@ def merge_args(the_configuration: configparser.ConfigParser, the_args) -> typing return _repls -def to_dict(record: df.OAIRecord) -> typing.Dict: +def to_dict(record: df_r.Record) -> typing.Dict: """Serialize OAIRecord into dictionary as input for JSON format""" @@ -227,18 +263,19 @@ def to_dict(record: df.OAIRecord) -> typing.Dict: RECORD_TIME: record.state_datetime, } -def from_dict(data) -> df.OAIRecord: + +def from_dict(data) -> df_r.Record: """deserialize into OAIRecord""" - _record = df.OAIRecord(data[RECORD_IDENTIFIER]) + _record = df_r.Record(data[RECORD_IDENTIFIER]) _record.info = data[RECORD_INFO] return _record -def list_files(dir_root, sub_dir, format='.xml') -> typing.List: - actual_dir = os.path.join(dir_root, sub_dir) +def list_files(the_directory, file_ext='.xml') -> typing.List: + """List all files in the_directory with given suffix""" return [ - os.path.join(actual_dir, dir_file) - for dir_file in os.listdir(actual_dir) - if Path(dir_file).suffix == format + os.path.join(the_directory, dir_file) + for dir_file in os.listdir(the_directory) + if Path(dir_file).suffix == file_ext ] diff --git a/tests/conftest.py b/tests/conftest.py index 8ae721f..9023e34 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,9 +9,9 @@ import pytest -import digiflow as df +import digiflow.record as df_r -import lib.odem as odem +from lib import odem PROJECT_ROOT_DIR = pathlib.Path(__file__).resolve().parents[1] @@ -28,12 +28,14 @@ def fixture_configuration(): config = odem.get_configparser() config.read(os.path.join(PROJECT_ROOT_DIR, 'resources', 'odem.ocrd.tesseract.ini')) config.set('global', 'data_fields', 'IDENTIFIER, SETSPEC, CREATED, INFO, STATE, STATE_TIME') - config.set(odem.CFG_SEC_METS, 'blacklist_file_groups', 'DEFAULT, 
THUMB, THUMBS, MIN, FULLTEXT, DOWNLOAD') + config.set(odem.CFG_SEC_METS, 'blacklist_file_groups', + 'DEFAULT, THUMB, THUMBS, MIN, FULLTEXT, DOWNLOAD') config.set(odem.CFG_SEC_METS, 'blacklist_logical_containers', 'cover_front,cover_back') config.set(odem.CFG_SEC_METS, 'blacklist_physical_container_labels', 'Auftragszettel,Colorchecker,Leerseite,Rückdeckel,Deckblatt,Vorderdeckel,Illustration') config.set(odem.CFG_SEC_METS, 'agents', 'DFG-OCRD3-ODEM_ocrd/all:2022-08-15') - config.set(odem.CFG_SEC_OCR, 'strip_tags', 'alto:Shape,alto:Processing,alto:Illustration,alto:GraphicalElement') + config.set(odem.CFG_SEC_OCR, 'strip_tags', + 'alto:Shape,alto:Processing,alto:Illustration,alto:GraphicalElement') config.set(odem.CFG_SEC_OCR, 'ocrd_baseimage', 'ocrd/all:2022-08-15') return config @@ -62,24 +64,28 @@ def prepare_kraken_dir(tmp_path: Path) -> str: @pytest.fixture(name="fixture_27949", scope='module') def _module_fixture_123456789_27949(tmp_path_factory): - path_workdir = tmp_path_factory.mktemp('workdir') - orig_file = TEST_RES / '123456789_27949.xml' - trgt_mets = path_workdir / 'test.xml' + identifier = '123456789_27949' + work_dir_root = tmp_path_factory.mktemp('work_dir') + (work_dir_root / 'log').mkdir() + path_work_dir = work_dir_root / identifier + path_work_dir.mkdir() + orig_file = TEST_RES / f'{identifier}.xml' + trgt_mets = path_work_dir / f'{identifier}.xml' orig_alto = TEST_RES / '123456789_27949_FULLTEXT' - trgt_alto = path_workdir / 'FULLTEXT' + trgt_alto = path_work_dir / 'FULLTEXT' shutil.copyfile(orig_file, trgt_mets) shutil.copytree(orig_alto, trgt_alto) - (path_workdir / 'log').mkdir() - _model_dir = prepare_tessdata_dir(path_workdir) - record = df.OAIRecord('oai:dev.opendata.uni-halle.de:123456789/27949') - _oproc = odem.ODEMProcessImpl(record, work_dir=path_workdir, log_dir=path_workdir / 'log') - _oproc.odem_configuration = fixture_configuration() - _oproc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, 
f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') - _oproc.ocr_files = [os.path.join(trgt_alto, a) + _model_dir = prepare_tessdata_dir(path_work_dir) + record = df_r.Record('oai:dev.opendata.uni-halle.de:123456789/27949') + odem_proc = odem.ODEMProcessImpl(record, work_dir=path_work_dir, log_dir=work_dir_root / 'log') + odem_proc.odem_configuration = fixture_configuration() + odem_proc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, + f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') + odem_proc.ocr_files = [os.path.join(trgt_alto, a) for a in os.listdir(trgt_alto)] - _oproc.mets_file = str(trgt_mets) - _oproc.inspect_metadata() - _oproc.clear_existing_entries() - n_integrated = _oproc.link_ocr_files() + odem_proc.mets_file_path = str(trgt_mets) + odem_proc.inspect_metadata() + odem_proc.clear_existing_entries() + n_integrated = odem_proc.link_ocr_files() assert n_integrated == 4 - yield _oproc + yield odem_proc diff --git a/tests/test_ocrd3_odem.py b/tests/test_ocrd3_odem.py index 289e519..8ec0afe 100644 --- a/tests/test_ocrd3_odem.py +++ b/tests/test_ocrd3_odem.py @@ -6,11 +6,15 @@ import unittest import unittest.mock +from pathlib import Path + +import digiflow as df +import digiflow.record as df_r import lxml.etree as ET + import pytest -import digiflow as df -import lib.odem as odem +from lib import odem from .conftest import ( PROJECT_ROOT_DIR, @@ -22,11 +26,14 @@ @pytest.mark.parametrize("img_path,lang_str", [ ('resources/urn+nbn+de+gbv+3+1-116899-p0062-3_ger.jpg', 'gt4hist_5000k.traineddata'), - ('resources/urn+nbn+de+gbv+3+1-116299-p0107-6_lat+ger.jpg', 'lat_ocr.traineddata+gt4hist_5000k.traineddata'), - ('resources/urn+nbn+de+gbv+3+1-118702-p0055-9_gre+lat.jpg', 'grc.traineddata+lat_ocr.traineddata'), + ('resources/urn+nbn+de+gbv+3+1-116299-p0107-6_lat+ger.jpg', + 'lat_ocr.traineddata+gt4hist_5000k.traineddata'), + ('resources/urn+nbn+de+gbv+3+1-118702-p0055-9_gre+lat.jpg', + 
'grc.traineddata+lat_ocr.traineddata'), ('resources/urn+nbn+de+gbv+3+1-116899-p0062-3_ger.jpg', 'gt4hist_5000k.traineddata'), ('resources/urn+nbn+de+gbv+3+1-116299-p0107-6_lat.jpg', 'lat_ocr.traineddata'), - ('resources/urn+nbn+de+gbv+3+1-118702-p0055-9_ger+lat.jpg', 'gt4hist_5000k.traineddata+lat_ocr.traineddata') + ('resources/urn+nbn+de+gbv+3+1-118702-p0055-9_ger+lat.jpg', + 'gt4hist_5000k.traineddata+lat_ocr.traineddata') ]) def test_mapping_from_imagefilename(img_path, lang_str, tmp_path): """Ensure ODEM Object picks @@ -43,7 +50,7 @@ def test_mapping_from_imagefilename(img_path, lang_str, tmp_path): odem_processor.odem_configuration = fixture_configuration() _tess_dir = prepare_tessdata_dir(tmp_path) odem_processor.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, - f'{_tess_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') + f'{_tess_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') odem_processor.the_logger = odem.get_logger(str(log_dir)) odem_processor.local_mode = True @@ -148,7 +155,7 @@ def _side_effect(*args, **kwargs): _workdir.mkdir() _log_dir = _root_workdir / 'log' _log_dir.mkdir() - _record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') + _record = df_r.Record('oai:opendata.uni-halle.de:1981185920/44046') odem_proc = odem.ODEMProcessImpl(_record, _workdir) odem_proc.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(_workdir) @@ -167,7 +174,7 @@ def _side_effect(*args, **kwargs): # assert assert request_mock.call_count == 1 - assert os.path.exists(odem_proc.mets_file) + assert os.path.exists(odem_proc.mets_file_path) def test_odem_process_identifier_local_workdir(tmp_path): @@ -199,7 +206,7 @@ def _fixture_odem_setup(tmp_path): odem_processor.odem_configuration = cfg _model_dir = prepare_tessdata_dir(work_dir) odem_processor.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, - 
f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') + f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') odem_processor.local_mode = True odem_processor.the_logger = odem.get_logger(log_dir) return odem_processor @@ -242,7 +249,7 @@ def test_module_fixture_one_integrated_ocr_in_mets(fixture_27949: odem.ODEMProce # arrange assert len(fixture_27949.ocr_files) == 4 - _root = ET.parse(fixture_27949.mets_file).getroot() + _root = ET.parse(fixture_27949.mets_file_path).getroot() _phys_links = _root.xpath('//mets:div[@TYPE="physSequence"]/mets:div', namespaces=df.XMLNS) # at most 2: one MAX-Image plus according optional FULLTEXT assert len(_phys_links[1].getchildren()) == 1 @@ -261,7 +268,7 @@ def test_module_fixture_one_images_4_ocr_by_metadata(fixture_27949: odem.ODEMPro their physical presens and according METS metadata """ - assert len(fixture_27949.images_4_ocr) == 4 + assert len(fixture_27949.ocr_candidates) == 4 def test_fixture_one_postprocess_ocr_create_text_bundle(fixture_27949: odem.ODEMProcessImpl): @@ -275,14 +282,14 @@ def test_fixture_one_postprocess_ocr_create_text_bundle(fixture_27949: odem.ODEM """ # arrange - tmp_path = fixture_27949.work_dir_main + tmp_path = fixture_27949.work_dir_root # act fixture_27949.link_ocr_files() fixture_27949.create_text_bundle_data() # assert - _txt_bundle_file = tmp_path / '198114125.pdf.txt' + _txt_bundle_file = Path(tmp_path) / '198114125.pdf.txt' assert os.path.exists(_txt_bundle_file) assert 111 == fixture_27949.statistics['n_text_lines'] with open(_txt_bundle_file, encoding='utf-8') as bundle_handle: @@ -299,7 +306,7 @@ def test_images_4_ocr_properly_filtered(tmp_path): """ - _record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') + _record = df_r.Record('oai:opendata.uni-halle.de:1981185920/44046') _work_dir = tmp_path / '1981185920_44046' _work_dir.mkdir() _max_dir = _work_dir / 'MAX' @@ -323,17 +330,17 @@ def test_images_4_ocr_properly_filtered(tmp_path): 
odem_processor.set_local_images() # assert - assert len(odem_processor.images_4_ocr) == 4 - assert odem_processor.images_4_ocr[0][0].endswith('1981185920_44046/MAX/00000001.jpg') + assert len(odem_processor.ocr_candidates) == 4 + assert odem_processor.ocr_candidates[0][0].endswith('1981185920_44046/MAX/00000001.jpg') -@unittest.mock.patch('digiflow.OAILoader.load', side_effect=df.OAILoadException("url '{}' returned '{}'")) +@unittest.mock.patch('digiflow.OAILoader.load', side_effect=df.LoadException("url '{}' returned '{}'")) def test_no_catch_when_load_exc(mock_load, tmp_path): """Ensure df.OAILoadException is raised for internal server errors (#9992) """ # arrange - _record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') + _record = df_r.Record('oai:opendata.uni-halle.de:1981185920/44046') _work_dir = tmp_path / '1981185920_44046' _work_dir.mkdir() odem_processor = odem.ODEMProcessImpl(_record, work_dir=_work_dir) @@ -345,7 +352,7 @@ def test_no_catch_when_load_exc(mock_load, tmp_path): odem_processor.the_logger = odem.get_logger(str(_log_dir)) # act - with pytest.raises(df.OAILoadException) as err: + with pytest.raises(df.LoadException) as err: odem_processor.load() # assert @@ -359,19 +366,20 @@ def test_record_with_unknown_language(tmp_path): not unknown (gmh == German, Middle High 1050-1500) """ - path_workdir = tmp_path / 'workdir' + identifier = '1981185920_72977' + path_workdir = tmp_path / identifier path_workdir.mkdir() - orig_file = TEST_RES / '1981185920_72977.xml' - trgt_mets = path_workdir / 'test.xml' + orig_file = TEST_RES / f'{identifier}.xml' + trgt_mets = path_workdir / f'{identifier}.xml' shutil.copyfile(orig_file, trgt_mets) (path_workdir / 'log').mkdir() - record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/72977') + record = df_r.Record('oai:opendata.uni-halle.de:1981185920/72977') oproc = odem.ODEMProcessImpl(record, work_dir=path_workdir, log_dir=path_workdir / 'log') oproc.odem_configuration = fixture_configuration() 
_model_dir = prepare_tessdata_dir(tmp_path) oproc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, - f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') - oproc.mets_file = str(trgt_mets) + f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize') + oproc.mets_file_path = str(trgt_mets) oproc.inspect_metadata() _langs = oproc.statistics.get(odem.STATS_KEY_LANGS) @@ -390,7 +398,8 @@ def test_export_flat_zip(tmp_path): semantics VLS systems """ - path_workdir = tmp_path / 'workdir' + identifier = '1981185920_44046' + path_workdir = tmp_path / identifier path_workdir.mkdir() path_tmp_export_dir = tmp_path / 'tmp_export' path_tmp_export_dir.mkdir() @@ -398,7 +407,7 @@ def test_export_flat_zip(tmp_path): path_export_dir.mkdir() orig_file = TEST_RES / '1981185920_44046.xml' - trgt_mets = path_workdir / 'test.xml' + trgt_mets = path_workdir / f'{identifier}.xml' shutil.copyfile(orig_file, trgt_mets) orig_files = TEST_RES / 'vd18-1180329' / 'FULLTEXT' @@ -406,7 +415,7 @@ def test_export_flat_zip(tmp_path): shutil.copytree(orig_files, trgt_files) (path_workdir / 'log').mkdir() - record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/44046') + record = df_r.Record('oai:opendata.uni-halle.de:1981185920/44046') oproc = odem.ODEMProcessImpl(record, work_dir=path_workdir, log_dir=path_workdir / 'log') oproc.odem_configuration = fixture_configuration() _model_dir = prepare_tessdata_dir(tmp_path) @@ -420,7 +429,7 @@ def test_export_flat_zip(tmp_path): f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize' ) - oproc.mets_file = str(trgt_mets) + oproc.mets_file_path = str(trgt_mets) oproc.inspect_metadata() # _langs = oproc.statistics.get(odem.STATS_KEY_LANGS) diff --git a/tests/test_odem_ocr_pipeline.py b/tests/test_odem_ocr_pipeline.py index 96d33d3..b6ae05d 100644 --- a/tests/test_odem_ocr_pipeline.py +++ b/tests/test_odem_ocr_pipeline.py @@ -13,9 +13,9 @@ import pytest import lxml.etree as ET 
-import digiflow as df +import digiflow.record as df_r -import lib.odem as odem +from lib import odem import lib.odem.ocr.ocr_pipeline as o3o_pop from .conftest import TEST_RES, PROD_RES @@ -81,10 +81,10 @@ def fixure_a_workspace(tmp_path): @pytest.fixture(name="my_pipeline") def _fixture_default_pipeline(a_workspace: Path): - _record = df.OAIRecord('oai:urn:mwe') + _record = df_r.Record('oai:urn:mwe') odem_process = odem.ODEMProcessImpl(_record, a_workspace) odem_process.odem_configuration = ODEM_CFG - odem_process._statistics_ocr['languages'] = ['ger'] + odem_process.process_statistics['languages'] = ['ger'] odem_process.the_logger = odem.get_logger(a_workspace / 'log') odem_tess = odem.ODEMTesseract(odem_process) return odem_tess @@ -119,9 +119,9 @@ def _fixture_custom_config_pipeline(a_workspace): conf_dir.mkdir() conf_file = TEST_RES / 'ocr_config_full.ini' assert os.path.isfile(conf_file) - odem_process = odem.ODEMProcessImpl(df.OAIRecord('oai:urn_custom'), a_workspace) + odem_process = odem.ODEMProcessImpl(df_r.Record('oai:urn_custom'), a_workspace) odem_process.odem_configuration = ODEM_CFG - odem_process._statistics_ocr['languages'] = ['ger', 'lat'] + odem_process.process_statistics['languages'] = ['ger', 'lat'] odem_process.the_logger = odem.get_logger(a_workspace / 'log') odem_tess = odem.ODEMTesseract(odem_process) odem_tess.read_pipeline_config(conf_file) @@ -222,7 +222,7 @@ def test_step_tesseract_path_out_folder(max_dir): step.path_in = os.path.join(max_dir, TIF_001) # assert - assert step.path_next.name == '001.xml' + assert step.path_next.name == '001.xml' def test_step_tesseract_change_input(max_dir): @@ -816,10 +816,10 @@ def test_step_replace_regex_literal(tmp_path): step = o3o_pop.StepPostReplaceCharsRegex(params) step.path_in = tmp_file - + # act step.execute() - + # assert assert hasattr(step, 'statistics') assert len(step.statistics) == 9 @@ -847,10 +847,10 @@ def test_step_replace_regex_from_configuration(tmp_path): params = {k: 
cfg_parser['step_02'][k] for k in step_keys} step = o3o_pop.StepPostReplaceCharsRegex(params) step.path_in = tmp_file - + # act step.execute() - + # assert assert hasattr(step, 'statistics') assert len(step.statistics) == 9 diff --git a/tests/test_odem_processing_mets.py b/tests/test_odem_processing_mets.py index 9382eef..96c2007 100644 --- a/tests/test_odem_processing_mets.py +++ b/tests/test_odem_processing_mets.py @@ -8,6 +8,7 @@ import lxml.etree as ET import digiflow as df +import digiflow.record as df_r import lib.odem as odem import lib.odem.processing.mets as o3o_pm @@ -26,8 +27,8 @@ def _fixture_1981185920_44046(): _ident = '1981185920_44046' file = TEST_RES / '1981185920_44046.xml' inspc = odem.ODEMMetadataInspecteur(file, - process_identifier=_ident, - cfg=fixture_configuration()) + process_identifier=_ident, + cfg=fixture_configuration()) yield inspc @@ -135,7 +136,7 @@ def test_opendata_record_no_printwork(): inspc.metadata_report() # assert - assert f"{_oai_urn} no PICA type for OCR: Ac" == odem_exc.value.args[0] + assert f"{_oai_urn} no PICA type for OCR: Ac" == odem_exc.value.args[0] def test_opendata_record_no_granular_urn_present(): @@ -175,7 +176,7 @@ def test_opendata_record_type_error(): inspc.metadata_report() # assert - assert "2x: Page PHYS_0112 not linked,Page PHYS_0113 not linked" == odem_exc.value.args[0] + assert "2x: Page PHYS_0112 not linked,Page PHYS_0113 not linked" == odem_exc.value.args[0] def test_mets_mods_sbb_vol01_with_ulb_defaults(): @@ -274,34 +275,34 @@ def test_validate_mets_105054_schema_fails(tmp_path): If Schema validation is required, then throw according exception in this case: alert invalid order data format """ - _record = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/105054') + _record = df_r.Record('oai:opendata.uni-halle.de:1981185920/105054') _work_dir = tmp_path / '1981185920_105054' _work_dir.mkdir() _orig_mets = TEST_RES / '1981185920_105054.xml' shutil.copyfile(_orig_mets, _work_dir / 
'1981185920_105054.xml') odem_processor = odem.ODEMProcessImpl(_record, work_dir=_work_dir) odem_processor.odem_configuration = fixture_configuration() - with pytest.raises(odem.ODEMException) as exec: + with pytest.raises(odem.ODEMException) as odem_exec: odem_processor.validate_metadata() - assert "'order': '1.1979' is not a valid value of the atomic type 'xs:integer'" in exec.value.args[0] - - + assert "'order': '1.1979' is not a valid value of the atomic type 'xs:integer'" in odem_exec.value.args[0] + + def test_validate_mets_37167_schema_fails(tmp_path): """ if is invalid mets file, throw according exception """ - rec = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/37167') + rec = df_r.Record('oai:opendata.uni-halle.de:1981185920/37167') work_dir = tmp_path / '1981185920_37167' work_dir.mkdir() original_mets = TEST_RES / '1981185920_37167_01.xml' shutil.copyfile(original_mets, work_dir / '1981185920_37167.xml') odem_processor = odem.ODEMProcessImpl(rec, work_dir=work_dir) odem_processor.odem_configuration = fixture_configuration() - with pytest.raises(odem.ODEMException) as exec: + with pytest.raises(odem.ODEMException) as odem_exc: odem_processor.validate_metadata() - assert "recordIdentifier': This element is not expected" in exec.value.args[0] + assert "recordIdentifier': This element is not expected" in odem_exc.value.args[0] def test_validate_mets_37167_ddb_fails(tmp_path): @@ -312,10 +313,10 @@ def test_validate_mets_37167_ddb_fails(tmp_path): * extra mets:dmdSec not linked to LOGICAL MAP with only shelfLocator and also missing titleInfo (these are all related to each other) - + => this we had already at Rahbar """ - rec = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/37167') + rec = df_r.Record('oai:opendata.uni-halle.de:1981185920/37167') work_dir = tmp_path / '1981185920_37167' work_dir.mkdir() original_mets = TEST_RES / '1981185920_37167_02.xml' @@ -323,10 +324,10 @@ def test_validate_mets_37167_ddb_fails(tmp_path): odem_processor = 
odem.ODEMProcessImpl(rec, work_dir=work_dir) odem_processor.odem_configuration = fixture_configuration() odem_processor.odem_configuration.set('mets', 'ddb_validation', 'True') - with pytest.raises(odem.ODEMException) as exec: + with pytest.raises(odem.ODEMException) as odem_exec: odem_processor.validate_metadata() - ddb_complains = exec.value.args[0] + ddb_complains = odem_exec.value.args[0] assert len(ddb_complains) == 4 assert '[titleInfo_02] dmd_id:DMDPHYS_0000 test:Pon Ya 4371' in ddb_complains[0] assert '[relatedItem_04] dmd_id:DMDLOG_0000' in ddb_complains[1] @@ -339,8 +340,8 @@ def test_validate_mets_37167_finally_succeeds(tmp_path): This time METS/MODS and also DDB-validation are both pleased, therefore a plain 'True' shall be returned """ - - rec = df.OAIRecord('oai:opendata.uni-halle.de:1981185920/37167') + + rec = df_r.Record('oai:opendata.uni-halle.de:1981185920/37167') work_dir = tmp_path / '1981185920_37167' work_dir.mkdir() original_mets = TEST_RES / '1981185920_37167_03.xml' @@ -348,7 +349,7 @@ def test_validate_mets_37167_finally_succeeds(tmp_path): odem_processor = odem.ODEMProcessImpl(rec, work_dir=work_dir) odem_processor.odem_configuration = fixture_configuration() odem_processor.odem_configuration.set('mets', 'ddb_validation', 'True') - + assert odem_processor.validate_metadata() @@ -358,7 +359,7 @@ def test_integrate_alto_from_ocr_pipeline(tmp_path): """ # arrange - mets_file = TEST_RES / '1981185920_42296.xml' + mets_file = TEST_RES / '1981185920_42296.xml' fulltext_dir = TEST_RES / '1981185920_42296_FULLTEXT' assert mets_file.exists() assert fulltext_dir.exists() @@ -370,7 +371,7 @@ def test_integrate_alto_from_ocr_pipeline(tmp_path): # actsert assert 4 == o3o_pm.integrate_ocr_file(mets_tree, ocr_files) - + def test_extract_text_content_from_alto_file(): """Ensure we can read ALTO output and get its contents @@ -384,7 +385,7 @@ def test_extract_text_content_from_alto_file(): # act text = o3o_pm.extract_text_content(ocr_files) - # assert + 
# assert assert text is not None assert len(text) == 126 @@ -398,7 +399,7 @@ def test_extract_identifiers(): # arrange mets_file = TEST_RES / '1516514412012_175762.xml' - inspecteur = o3o_pm.ODEMMetadataInspecteur(mets_file, + inspecteur = o3o_pm.ODEMMetadataInspecteur(mets_file, '1516514412012_175762', fixture_configuration()) # act diff --git a/tests/test_odem_processing_ocr_files.py b/tests/test_odem_processing_ocr_files.py index 2ef5acd..f7efbe5 100644 --- a/tests/test_odem_processing_ocr_files.py +++ b/tests/test_odem_processing_ocr_files.py @@ -2,6 +2,8 @@ import os +from pathlib import Path + import lxml.etree as ET import digiflow as df @@ -17,7 +19,7 @@ def test_module_fixture_one_integrated_ocr_files_fit_identifier(fixture_27949: o """ # arrange - tmp_path = fixture_27949.work_dir_main + tmp_path = Path(fixture_27949.work_dir_root) # assert assert not os.path.exists(tmp_path / 'FULLTEXT' / '00000002.xml') @@ -35,7 +37,7 @@ def test_fixture_one_postprocess_ocr_files(fixture_27949: odem.ODEMProcessImpl): diacritics occour more several times in single word""" # arrange - tmp_path = fixture_27949.work_dir_main + tmp_path = Path(fixture_27949.work_dir_root) path_file = tmp_path / 'FULLTEXT' / '00000003.xml' strip_tags = fixture_configuration().getlist(odem.CFG_SEC_OCR, 'strip_tags') # pylint: disable=no-member