diff --git a/cli_mets_local.py b/cli_mets_local.py
index 0439271..1a66d30 100644
--- a/cli_mets_local.py
+++ b/cli_mets_local.py
@@ -137,8 +137,8 @@
         odem_process.set_local_images()

         # NEW NEW NEW
-        odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
-        odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
+        odem_pipeline = odem.ODEMWorkflow.create(proc_type, odem_process)
+        odem_runner = odem.ODEMWorkflowRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
         ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
         if ocr_results is None or len(ocr_results) == 0:
             raise odem.ODEMException(f"OCR Process Runner error for {record.identifier}")
diff --git a/cli_oai_client.py b/cli_oai_client.py
index c823dfc..e95e1a3 100644
--- a/cli_oai_client.py
+++ b/cli_oai_client.py
@@ -299,8 +299,8 @@ def oai_arg_parser(value):

         # NEW NEW NEW
         proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
-        odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
-        odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
+        odem_pipeline = odem.ODEMWorkflow.create(proc_type, odem_process)
+        odem_runner = odem.ODEMWorkflowRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
         ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
         if ocr_results is None or len(ocr_results) == 0:
             raise odem.ODEMException(f"process run error: {record.identifier}")
diff --git a/cli_oai_local.py b/cli_oai_local.py
index 7bbb9c8..76fbf31 100644
--- a/cli_oai_local.py
+++ b/cli_oai_local.py
@@ -183,16 +183,16 @@ def wrap_save_record_state(status: str, urn, **kwargs):
         odem_process.set_local_images()

         # NEW NEW NEW
-        odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
-        odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
+        odem_pipeline = odem.ODEMWorkflow.create(proc_type, odem_process)
+        odem_runner = odem.ODEMWorkflowRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
         ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
         if ocr_results is None or len(ocr_results) == 0:
             raise ODEMException(f"process run error: {record.identifier}")
         odem_process.calculate_statistics_ocr(ocr_results)
         odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS
         odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
-        odem_process.link_ocr_files()
-        odem_process.postprocess_ocr()
+        # odem_process.link_ocr_files()
+        # odem_process.postprocess_ocr()
         wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True)
         if wf_enrich_ocr:
             odem_process.link_ocr_files()
@@ -214,7 +214,7 @@ def wrap_save_record_state(status: str, urn, **kwargs):
             odem_process.record.info.update(_kwargs)
             _info = f"{odem_process.record.info}"
         except:
-            odem_process.the_logger.error("Can't parse '%s', store info literally",
+            odem_process.the_logger.warning("Can't parse '%s', store info literally",
                                           odem_process.record.info)
             _info = f"{_kwargs}"
         else:
diff --git a/lib/odem/__init__.py b/lib/odem/__init__.py
index de83c9b..b1e9072 100644
--- a/lib/odem/__init__.py
+++ b/lib/odem/__init__.py
@@ -5,8 +5,8 @@
 from .ocrd3_odem import (
     OdemWorkflowProcessType,
     ODEMProcess,
-    ODEMOCRPipeline,
-    ODEMPipelineRunner,
+    ODEMWorkflow,
+    ODEMWorkflowRunner,
     OCRDPageParallel,
     ODEMTesseract,
 )
diff --git a/lib/odem/ocrd3_odem.py b/lib/odem/ocrd3_odem.py
index 44b0266..aff7cf6 100644
--- a/lib/odem/ocrd3_odem.py
+++ b/lib/odem/ocrd3_odem.py
@@ -24,21 +24,8 @@
 import digiflow.digiflow_export as dfx
 import digiflow.digiflow_metadata as dfm

-from .odem_commons import (
-    CFG_SEC_OCR,
-    CFG_SEC_OCR_OPT_RES_VOL,
-    DEFAULT_RTL_MODELS,
-    FILEGROUP_OCR,
-    KEY_LANGUAGES,
-    STATS_KEY_LANGS,
-    STATS_KEY_MODELS,
-    STATS_KEY_N_OCR,
-    STATS_KEY_MB,
-    STATS_KEY_MPS,
-    PROJECT_ROOT,
-    ExportFormat,
-    ODEMException,
-)
+import lib.odem.odem_commons as odem_c
+
 from .processing.mets import (
     ODEMMetadataInspecteur,
     extract_text_content,
@@ -55,7 +42,6 @@
 )
 from .processing.ocr_files import (
     convert_to_output_format,
-    list_files,
     postprocess_ocr_file,
 )
 from .processing.image import (
@@ -80,6 +66,8 @@
 # how long to process single page?
 DEFAULT_DOCKER_CONTAINER_TIMEOUT = 600

+LOCAL_OCRD_RESULT_DIR = 'PAGE'
+

 class OdemWorkflowProcessType(str, Enum):
     OCRD_PAGE_PARALLEL = "OCRD_PAGE_PARALLEL"
@@ -149,7 +137,7 @@ def _init_logger(self, log_dir):
             log_dir, f"odem_{today}.log")
         conf_logname = {'logname': logfile_name}
         conf_file_location = os.path.join(
-            PROJECT_ROOT, 'resources', 'odem_logging.ini')
+            odem_c.PROJECT_ROOT, 'resources', 'odem_logging.ini')
         logging.config.fileConfig(conf_file_location, defaults=conf_logname)
         self.the_logger = logging.getLogger('odem')
@@ -173,9 +161,9 @@ def load(self):
             loader.store = self.store
             loader.load(request_identifier, local_dst=req_dst)
         except df.OAILoadClientError as load_err:
-            raise ODEMException(load_err.args[0]) from load_err
+            raise odem_c.ODEMException(load_err.args[0]) from load_err
         except RuntimeError as _err:
-            raise ODEMException(_err.args[0]) from _err
+            raise odem_c.ODEMException(_err.args[0]) from _err

     def clear_resources(self, remove_all=False):
         """Remove OAI-Resources from store or even
@@ -211,12 +199,12 @@ def inspect_metadata(self):
             self.digi_type = the_report.type
             self.images_4_ocr = insp.image_pairs
         except RuntimeError as mde:
-            raise ODEMException(f"{mde.args[0]}") from mde
+            raise odem_c.ODEMException(f"{mde.args[0]}") from mde
         self.mods_identifier = insp.mods_record_identifier
         for t, ident in insp.identifiers.items():
             self._statistics_ocr[t] = ident
         self._statistics_ocr['type'] = insp.type
-        self._statistics_ocr[STATS_KEY_LANGS] = insp.languages
+        self._statistics_ocr[odem_c.STATS_KEY_LANGS] = insp.languages
         self._statistics_ocr['n_images_pages'] = insp.n_images_pages
         self._statistics_ocr['n_images_ocrable'] = insp.n_images_ocrable
         _ratio = insp.n_images_ocrable / insp.n_images_pages * 100
@@ -248,22 +236,22 @@ def language_modelconfig(self, languages=None) -> str:
         _models = []
         model_mappings: dict = self.odem_configuration.getdict(  # pylint: disable=no-member
-            CFG_SEC_OCR, 'model_mapping')
+            odem_c.CFG_SEC_OCR, 'model_mapping')
         self.the_logger.info("[%s] inspect languages '%s'",
                              self.process_identifier, languages)
         if languages is None:
-            languages = self._statistics_ocr.get(STATS_KEY_LANGS)
+            languages = self._statistics_ocr.get(odem_c.STATS_KEY_LANGS)
         for lang in languages:
             model_entry = model_mappings.get(lang)
             if not model_entry:
-                raise ODEMException(f"'{lang}' mapping not found (languages: {languages})!")
+                raise odem_c.ODEMException(f"'{lang}' mapping not found (languages: {languages})!")
             for model in model_entry.split('+'):
                 if self._is_model_available(model):
                     _models.append(model)
                 else:
-                    raise ODEMException(f"'{model}' model config not found !")
-        _model_conf = '+'.join(_models) if self.odem_configuration.getboolean(CFG_SEC_OCR, "model_combinable", fallback=True) else _models[0]
-        self._statistics_ocr[STATS_KEY_MODELS] = _model_conf
+                    raise odem_c.ODEMException(f"'{model}' model config not found !")
+        _model_conf = '+'.join(_models) if self.odem_configuration.getboolean(odem_c.CFG_SEC_OCR, "model_combinable", fallback=True) else _models[0]
+        self._statistics_ocr[odem_c.STATS_KEY_MODELS] = _model_conf
         self.the_logger.info("[%s] map languages '%s' => '%s'",
                              self.process_identifier, languages, _model_conf)
         return _model_conf
@@ -284,17 +272,17 @@ def map_language_to_modelconfig(self, image_path) -> str:
         _file_lang_suffixes = DEFAULT_LANG
         # inspect language arg
-        if self.odem_configuration.has_option(CFG_SEC_OCR, KEY_LANGUAGES):
-            _file_lang_suffixes = self.odem_configuration.get(CFG_SEC_OCR, KEY_LANGUAGES).split('+')
+        if self.odem_configuration.has_option(odem_c.CFG_SEC_OCR, odem_c.KEY_LANGUAGES):
+            _file_lang_suffixes = self.odem_configuration.get(odem_c.CFG_SEC_OCR, odem_c.KEY_LANGUAGES).split('+')
             return self.language_modelconfig(_file_lang_suffixes)
         # inspect final '_' segment of local file names
         if self.local_mode:
             try:
                 _image_name = Path(image_path).stem
                 if '_' not in _image_name:
-                    raise ODEMException(f"Miss language mark for '{_image_name}'!")
+                    raise odem_c.ODEMException(f"Miss language mark for '{_image_name}'!")
                 _file_lang_suffixes = _image_name.split('_')[-1].split('+')
-            except ODEMException as oxc:
+            except odem_c.ODEMException as oxc:
                 self.the_logger.warning("[%s] language mapping err '%s' for '%s', fallback to %s",
                                         self.process_identifier, oxc.args[0], image_path,
                                         DEFAULT_LANG)
@@ -305,7 +293,7 @@ def map_language_to_modelconfig(self, image_path) -> str:

     def _is_model_available(self, model) -> bool:
         """Determine whether model is available"""
-        resource_dir_mappings = self.odem_configuration.getdict(CFG_SEC_OCR, CFG_SEC_OCR_OPT_RES_VOL, fallback={})
+        resource_dir_mappings = self.odem_configuration.getdict(odem_c.CFG_SEC_OCR, odem_c.CFG_SEC_OCR_OPT_RES_VOL, fallback={})
         for host_dir, _ in resource_dir_mappings.items():
             training_file = host_dir + '/' + model
             if os.path.exists(training_file):
@@ -338,7 +326,7 @@ def get_local_image_paths(self, image_local_dir=None) -> typing.List[str]:

         # this shouldn't happen
         if len(images) < 1:
-            raise ODEMException(f"{self.record.identifier} contains no images!")
+            raise odem_c.ODEMException(f"{self.record.identifier} contains no images!")
         self.the_logger.info("[%s] %d images total",
                              self.process_identifier, len(images))
@@ -354,7 +342,7 @@ def set_local_images(self):
         for _img, _urn in self.images_4_ocr:
             _the_file = os.path.join(_local_max_dir, _img)
             if not os.path.exists(_the_file):
-                raise ODEMException(f"[{self.process_identifier}] missing {_the_file}!")
+                raise odem_c.ODEMException(f"[{self.process_identifier}] missing {_the_file}!")
             _images_of_interest.append((_the_file, _urn))
         self.images_4_ocr = _images_of_interest
@@ -365,14 +353,14 @@ def calculate_statistics_ocr(self, outcomes: typing.List):
         _mod_val_counts = np.unique(_total_mps, return_counts=True)
         mps = list(zip(*_mod_val_counts))
         total_mb = sum([e[3] for e in outcomes if e[0] == 1])
-        self._statistics_ocr[STATS_KEY_N_OCR] = n_ocr
-        self._statistics_ocr[STATS_KEY_MB] = round(total_mb, 2)
-        self._statistics_ocr[STATS_KEY_MPS] = mps
+        self._statistics_ocr[odem_c.STATS_KEY_N_OCR] = n_ocr
+        self._statistics_ocr[odem_c.STATS_KEY_MB] = round(total_mb, 2)
+        self._statistics_ocr[odem_c.STATS_KEY_MPS] = mps

     def link_ocr_files(self) -> int:
         """Prepare and link OCR-data"""
-        self.ocr_files = list_files(self.work_dir_main, FILEGROUP_OCR)
+        self.ocr_files = odem_c.list_files(self.work_dir_main, odem_c.FILEGROUP_FULLTEXT)
         if not self.ocr_files:
             return 0
         proc = df.MetsProcessor(self.mets_file)
@@ -380,16 +368,6 @@ def link_ocr_files(self) -> int:
         proc.write()
         return _n_linked_ocr

-    def postprocess_ocr(self):
-        """Apply additional postprocessing to OCR data"""
-
-        # inspect each single created ocr file
-        # drop unwanted elements
-        # clear punctual regions
-        strip_tags = self.odem_configuration.getlist(CFG_SEC_OCR, 'strip_tags')
-        for _ocr_file in self.ocr_files:
-            postprocess_ocr_file(_ocr_file, strip_tags)
-
     def create_text_bundle_data(self):
         """create additional dspace bundle for indexing ocr text
         read ocr-file sequential according to their number label
@@ -410,13 +388,13 @@ def create_pdf(self):
         _cfg_path_dir_bin = self.odem_configuration.get('derivans', 'derivans_dir_bin', fallback=None)
         path_bin = None
         if _cfg_path_dir_bin is not None:
-            path_bin = os.path.join(PROJECT_ROOT, _cfg_path_dir_bin)
+            path_bin = os.path.join(odem_c.PROJECT_ROOT, _cfg_path_dir_bin)
         _cfg_path_dir_project = self.odem_configuration.get('derivans', 'derivans_dir_project', fallback=None)
         path_prj = None
         if _cfg_path_dir_project is not None:
-            path_prj = os.path.join(PROJECT_ROOT, _cfg_path_dir_project)
+            path_prj = os.path.join(odem_c.PROJECT_ROOT, _cfg_path_dir_project)
         path_cfg = os.path.join(
-            PROJECT_ROOT,
+            odem_c.PROJECT_ROOT,
             self.odem_configuration.get('derivans', 'derivans_config')
         )
         derivans_image = self.odem_configuration.get('derivans', 'derivans_image', fallback=None)
@@ -439,7 +417,7 @@ def create_pdf(self):
             _err_msg = _sub_err.stdout.decode().split(os.linesep)[0].replace("'", "\"")
             _args = [_err_msg]
             _args.extend(_sub_err.args)
-            raise ODEMException(_args) from _sub_err
+            raise odem_c.ODEMException(_args) from _sub_err

     def delete_before_export(self, folders):
         """delete folders given by list"""
@@ -480,7 +458,7 @@ def validate_metadata(self):

     def export_data(self):
         """re-do metadata and transform into output format"""
-        export_format: str = self.odem_configuration.get('export', 'export_format', fallback=ExportFormat.SAF)
+        export_format: str = self.odem_configuration.get('export', 'export_format', fallback=odem_c.ExportFormat.SAF)
         export_mets: bool = self.odem_configuration.getboolean('export', 'export_mets', fallback=True)
         exp_dst = self.odem_configuration.get('export', 'local_export_dir')
@@ -494,7 +472,7 @@ def export_data(self):
         if export_mets:
             exp_map[os.path.basename(self.mets_file)] = 'mets.xml'
         saf_name = self.mods_identifier
-        if export_format == ExportFormat.SAF:
+        if export_format == odem_c.ExportFormat.SAF:
             export_result = df.export_data_from(
                 self.mets_file,
                 exp_col,
@@ -503,7 +481,7 @@ def export_data(self):
                 export_map=exp_map,
                 tmp_saf_dir=exp_tmp,
             )
-        elif export_format == ExportFormat.FLAT_ZIP:
+        elif export_format == odem_c.ExportFormat.FLAT_ZIP:
             prefix = 'opendata-working-'
             source_path_dir = os.path.dirname(self.mets_file)
             tmp_dir = tempfile.gettempdir()
@@ -514,11 +492,11 @@ def export_data(self):
             export_mappings = df.map_contents(source_path_dir, work_dir, exp_map)
             for mapping in export_mappings:
                 mapping.copy()
-            tmp_zip_path, size = self._compress(os.path.dirname(work_dir), saf_name)
+            tmp_zip_path, size = ODEMProcess.compress_flat(os.path.dirname(work_dir), saf_name)
             path_export_processing = dfx._move_to_tmp_file(tmp_zip_path, exp_dst)
             export_result = path_export_processing, size
         else:
-            raise ODEMException(f'Unsupported export format: {export_format}')
+            raise odem_c.ODEMException(f'Unsupported export format: {export_format}')
         self.the_logger.info("[%s] exported data: %s",
                              self.process_identifier, export_result)
         if export_result:
@@ -532,9 +510,20 @@ def export_data(self):
                                  self.process_identifier, pth, final_path)
             shutil.move(pth, final_path)
             return final_path, size
-        return None

+    @classmethod
+    def compress_flat(cls, work_dir, archive_name):
+        zip_file_path = os.path.join(os.path.dirname(work_dir), archive_name) + '.zip'
+        previous_dir = os.getcwd()
+        os.chdir(os.path.join(work_dir, archive_name))
+        cmd = f'zip -q -r {zip_file_path} ./*'
+        subprocess.run(cmd, shell=True, check=True)
+        os.chmod(zip_file_path, 0o666)
+        zip_size = int(os.path.getsize(zip_file_path) / 1024 / 1024)
+        os.chdir(previous_dir)
+        return zip_file_path, f"{zip_size}MiB"
+
     @property
     def duration(self):
         """Get current duration of ODEMProcess.
@@ -552,35 +541,25 @@ def statistics(self):
         self._statistics_ocr['timedelta'] = f'{self.duration}'
         return self._statistics_ocr

-    def _compress(self, work_dir, archive_name):
-        zip_file_path = os.path.join(os.path.dirname(work_dir), archive_name) + '.zip'
-        previous_dir = os.getcwd()
-        os.chdir(os.path.join(work_dir, archive_name))
-        cmd = f'zip -q -r {zip_file_path} ./*'
-        subprocess.run(cmd, shell=True, check=True)
-        os.chmod(zip_file_path, 0o666)
-        zip_size = int(os.path.getsize(zip_file_path) / 1024 / 1024)
-        os.chdir(previous_dir)
-        return zip_file_path, f"{zip_size}MiB"
-
-class ODEMPipelineRunner:
+class ODEMWorkflowRunner:
     """Wrap actual ODEM process execution"""

     def __init__(self, identifier, n_executors,
-                 internal_logger, odem_ocr_pipeline) -> None:
+                 internal_logger, odem_workflow) -> None:
         self.process_identifier = identifier
         self.n_executors = n_executors
         self.logger:logging.Logger = internal_logger
-        self.odem_ocr_pipeline: ODEMOCRPipeline = odem_ocr_pipeline
+        self.odem_workflow: ODEMWorkflow = odem_workflow

     def run(self):
-        input_data = self.odem_ocr_pipeline.get_input()
+        input_data = self.odem_workflow.get_inputs()
         the_outcomes = [(0, 0, 0, 0)]
         if self.n_executors > 1:
             the_outcomes = self.run_parallel(input_data)
         else:
             the_outcomes = self.run_sequential(input_data)
+        self.odem_workflow.foster_outputs()
         return the_outcomes

     def run_parallel(self, input_data):
@@ -594,10 +573,10 @@ def run_parallel(self, input_data):
                     max_workers=self.n_executors,
                     thread_name_prefix='odem.ocrd'
             ) as executor:
-                return list(executor.map(self.odem_ocr_pipeline.process, input_data))
+                return list(executor.map(self.odem_workflow.run, input_data))
         except (OSError, AttributeError) as err:
             self.logger.error(err)
-            raise ODEMException(f"ODEM parallel: {err.args[0]}") from err
+            raise odem_c.ODEMException(f"ODEM parallel: {err.args[0]}") from err

     def run_sequential(self, input_data):
         """run complete workflow plain sequential
@@ -609,35 +588,40 @@
         self.logger.info("[%s] %d inputs run_sequential, estm. %dmin",
                          self.process_identifier, len_img, estm_min)
         try:
-            outcomes = [self.odem_ocr_pipeline.process(the_input)
+            outcomes = [self.odem_workflow.run(the_input)
                         for the_input in input_data]
             return outcomes
         except (OSError, AttributeError) as err:
             self.logger.error(err)
-            raise ODEMException(f"ODEM sequential: {err.args[0]}") from err
+            raise odem_c.ODEMException(f"ODEM sequential: {err.args[0]}") from err


-class ODEMOCRPipeline:
+class ODEMWorkflow:
     """Base Interface"""

     @staticmethod
     def create(
             workflow_type: OdemWorkflowProcessType | str,
             odem: ODEMProcess,
-    ) -> ODEMOCRPipeline:
+    ) -> ODEMWorkflow:
         if (workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT
                 or workflow_type == OdemWorkflowProcessType.ODEM_TESSERACT.value):
             return ODEMTesseract(odem)
         return OCRDPageParallel(odem)

-    def get_input(self) -> typing.List:
-        pass
+    def get_inputs(self) -> typing.List:
+        """Collect all input data files for processing"""

-    def process(self):
-        pass
+    def run(self):
+        """Run actual implemented Workflow"""
+
+    def foster_outputs(self):
+        """Work to do after pipeline has been run successfully,
+        like additional format transformations or sanitizing
+        """


-class OCRDPageParallel(ODEMOCRPipeline):
+class OCRDPageParallel(ODEMWorkflow):
     """Use page parallel workflow"""

     def __init__(self, odem_process: ODEMProcess):
@@ -645,14 +629,14 @@ def __init__(self, odem_process: ODEMProcess):
         self.cfg = odem_process.odem_configuration
         self.logger = odem_process.the_logger

-    def get_input(self):
+    def get_inputs(self):
         return self.odem.images_4_ocr

-    def process(self, input_data):
+    def run(self, input_data):
         """Create OCR Data"""

         ocr_log_conf = os.path.join(
-            PROJECT_ROOT, self.cfg.get(CFG_SEC_OCR, 'ocrd_logging'))
+            odem_c.PROJECT_ROOT, self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_logging'))

         # Preprare workspace with makefile
         (image_path, ident) = input_data
@@ -695,17 +679,17 @@ def process(self, input_data):
             profiling = ('n.a.', 0)

         container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}'
-        container_memory_limit: str = self.cfg.get(CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None)
-        container_user = self.cfg.get(CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid())
+        container_memory_limit: str = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None)
+        container_user = self.cfg.get(odem_c.CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid())
         container_timeout: int = self.cfg.getint(
-            CFG_SEC_OCR,
+            odem_c.CFG_SEC_OCR,
             'docker_container_timeout',
             fallback=DEFAULT_DOCKER_CONTAINER_TIMEOUT
         )
-        base_image = self.cfg.get(CFG_SEC_OCR, 'ocrd_baseimage')
-        ocrd_process_list = self.cfg.getlist(CFG_SEC_OCR, 'ocrd_process_list')
-        tesseract_model_rtl: typing.List[str] = self.cfg.getlist(CFG_SEC_OCR, 'tesseract_model_rtl', fallback=DEFAULT_RTL_MODELS)
-        ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(CFG_SEC_OCR, CFG_SEC_OCR_OPT_RES_VOL, fallback={})
+        base_image = self.cfg.get(odem_c.CFG_SEC_OCR, 'ocrd_baseimage')
+        ocrd_process_list = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'ocrd_process_list')
+        tesseract_model_rtl: typing.List[str] = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'tesseract_model_rtl', fallback=odem_c.DEFAULT_RTL_MODELS)
+        ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(odem_c.CFG_SEC_OCR, odem_c.CFG_SEC_OCR_OPT_RES_VOL, fallback={})

         if self.odem.local_mode:
             container_name = os.path.basename(page_workdir)
@@ -739,7 +723,7 @@ def process(self, input_data):
                               _ident, plain_exc, base_image)

         os.chdir(self.odem.work_dir_main)
-        if self.cfg.getboolean(CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False:
+        if self.cfg.getboolean(odem_c.CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False:
             shutil.rmtree(page_workdir, ignore_errors=True)
         return stored, 1, mps, filesize_mb
@@ -771,7 +755,7 @@ def _store_fulltext(self, image_subdir, original_image_path) -> int:
         # inspect possible ocr result dirs from within
         # the OCR-D subordinate workspaces for each image
         old_id = os.path.basename(image_subdir)
-        ocr_result_dir = os.path.join(image_subdir, 'PAGE')
+        ocr_result_dir = os.path.join(image_subdir, LOCAL_OCRD_RESULT_DIR)
         if not os.path.isdir(ocr_result_dir):
             self.logger.info("[%s] no ocr results for '%s'",
                              self.odem.process_identifier, ocr_result_dir)
@@ -789,7 +773,7 @@ def _store_fulltext(self, image_subdir, original_image_path) -> int:
         # regular case: OAI Workflow
         if not self.odem.local_mode:
             # export to 'PAGE' dir
-            wd_fulltext = os.path.join(self.odem.work_dir_main, 'PAGE')
+            wd_fulltext = os.path.join(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR)
             if not os.path.exists(wd_fulltext):
                 os.mkdir(wd_fulltext)
@@ -802,19 +786,29 @@ def _store_fulltext(self, image_subdir, original_image_path) -> int:
             shutil.copy(renamed, target_path)
         return 1

-    def to_alto(self) -> int:
-        """Forward OCR format conversion"""
+    def foster_outputs(self):
+        """In this case:
+        * move files from dir PAGE to FULLTEXT
+        * convert OCR format PAGE => ALTO
+        * some additional tag stripping
+        """

-        _cnv = convert_to_output_format(self.work_dir_main)
-        n_candidates = len(self.images_4_ocr)
-        if len(_cnv) == 0 and n_candidates > 0:
-            raise ODEMException(f"No OCR result for {n_candidates} candidates created!")
-        self.ocr_files = _cnv
+        n_candidates = len(self.odem.images_4_ocr)
+        ocrd_data_files = odem_c.list_files(self.odem.work_dir_main, LOCAL_OCRD_RESULT_DIR)
+        if len(ocrd_data_files) == 0 and n_candidates > 0:
+            raise odem_c.ODEMException(f"No OCR result for {n_candidates} candidates created!")
+        final_fulltext_dir = os.path.join(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT)
+        if not os.path.isdir(final_fulltext_dir):
+            os.makedirs(final_fulltext_dir, exist_ok=True)
+        self.ocr_files = convert_to_output_format(ocrd_data_files, final_fulltext_dir)
         self.logger.info("[%s] converted '%d' files page-to-alto",
-                         self.odem.process_identifier, len(_cnv))
+                         self.odem.process_identifier, len(self.ocr_files))
+        strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags')
+        for _ocr_file in self.ocr_files:
+            postprocess_ocr_file(_ocr_file, strip_tags)


-class ODEMTesseract(ODEMOCRPipeline):
+class ODEMTesseract(ODEMWorkflow):
     """Tesseract Runner"""

     def __init__(self, odem_process: ODEMProcess):
@@ -823,7 +817,7 @@ def __init__(self, odem_process: ODEMProcess):
         self.logger = odem_process.the_logger
         self.pipeline_configuration = None

-    def get_input(self):
+    def get_inputs(self):
         images_4_ocr = self.odem.images_4_ocr
         n_total = len(images_4_ocr)
         pipeline_cfg = self.read_pipeline_config()
@@ -831,7 +825,7 @@
                       for i, img in enumerate(self.odem.images_4_ocr, start=1)]
         return input_data

-    def process(self, input_data):
+    def run(self, input_data):

         image_path = input_data[0][0]
         pipeline_result = run_pipeline(input_data)
@@ -850,16 +844,16 @@ def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser:
         if self.pipeline_configuration is None:
             if path_config is None:
-                if self.odem_configuration.has_option(CFG_SEC_OCR, 'ocr_pipeline_config'):
-                    path_config = os.path.abspath(self.odem_configuration.get(CFG_SEC_OCR, 'ocr_pipeline_config'))
+                if self.odem_configuration.has_option(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config'):
+                    path_config = os.path.abspath(self.odem_configuration.get(odem_c.CFG_SEC_OCR, 'ocr_pipeline_config'))
             if not os.path.isfile(path_config):
-                raise ODEMException(f"no ocr-pipeline conf {path_config} !")
+                raise odem_c.ODEMException(f"no ocr-pipeline conf {path_config} !")
             pipe_cfg = configparser.ConfigParser()
             pipe_cfg.read(path_config)
             self.logger.info(f"use config '{path_config}'")
             for sect in pipe_cfg.sections():
                 if pipe_cfg.has_option(sect, 'model_configs'):
-                    known_langs = self.odem._statistics_ocr.get(STATS_KEY_LANGS)
+                    known_langs = self.odem._statistics_ocr.get(odem_c.STATS_KEY_LANGS)
                     model_files = self.odem.language_modelconfig(known_langs)
                     models = model_files.replace('.traineddata','')
                     pipe_cfg.set(sect, 'model_configs', models)
@@ -867,3 +861,9 @@ def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser:
                     pipe_cfg.set(sect, STEP_MOVE_PATH_TARGET, f'{self.odem.work_dir_main}/FULLTEXT')
         self.pipeline_configuration = pipe_cfg
         return self.pipeline_configuration
+
+    def foster_outputs(self):
+        self.ocr_files = odem_c.list_files(self.odem.work_dir_main, odem_c.FILEGROUP_FULLTEXT)
+        strip_tags = self.cfg.getlist(odem_c.CFG_SEC_OCR, 'strip_tags')
+        for _ocr_file in self.ocr_files:
+            postprocess_ocr_file(_ocr_file, strip_tags)
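Taken together, the changes to lib/odem/ocrd3_odem.py split the former ODEMOCRPipeline into an ODEMWorkflow implementation (OCRDPageParallel or ODEMTesseract) plus an ODEMWorkflowRunner that drives it and now also triggers foster_outputs() once all pages have been processed. A minimal sketch of the resulting call order, assuming the same surroundings as in the CLI scripts above (proc_type, odem_process, local_ident, EXECUTORS, LOGGER) and an import alias that is not spelled out in the diff:

# Sketch only: class and method names follow the diff, everything around them is assumed.
import lib.odem as odem   # assumed alias; the CLI scripts simply refer to it as "odem"

workflow = odem.ODEMWorkflow.create(proc_type, odem_process)   # OCRDPageParallel or ODEMTesseract
runner = odem.ODEMWorkflowRunner(local_ident, EXECUTORS, LOGGER, workflow)

# ODEMWorkflowRunner.run() now roughly does:
#   inputs = workflow.get_inputs()                  # one entry per image
#   outcomes = run_parallel(inputs) if EXECUTORS > 1 else run_sequential(inputs)
#   workflow.foster_outputs()                       # e.g. PAGE => ALTO conversion and tag stripping
ocr_results = runner.run()
odem_process.calculate_statistics_ocr(ocr_results)
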
""" - initLogging() + ocrd_utils.initLogging() logging.getLogger('page-to-alto').setLevel('CRITICAL') _today = time.strftime('%Y-%m-%d', time.localtime()) _host = socket.gethostname() @@ -176,7 +163,7 @@ def get_logger(log_dir, log_infix=None, path_log_config=None) -> logging.Logger: return logging.getLogger('odem') -def merge_args(the_configuration: ConfigParser, the_args) -> List: +def merge_args(the_configuration: configparser.ConfigParser, the_args) -> typing.List: """Merge additionally provided arguements into existing configurations, overwrite these and communication the replaced options @@ -200,7 +187,7 @@ def merge_args(the_configuration: ConfigParser, the_args) -> List: return _repls -def to_dict(record: OAIRecord) -> Dict: +def to_dict(record: df.OAIRecord) -> typing.Dict: """Serialize OAIRecord into dictionary as input for JSON format""" @@ -212,9 +199,18 @@ def to_dict(record: OAIRecord) -> Dict: RECORD_TIME: record.state_datetime, } -def from_dict(data) -> OAIRecord: +def from_dict(data) -> df.OAIRecord: """deserialize into OAIRecord""" - _record = OAIRecord(data[RECORD_IDENTIFIER]) + _record = df.OAIRecord(data[RECORD_IDENTIFIER]) _record.info = data[RECORD_INFO] return _record + + +def list_files(dir_root, sub_dir, format='.xml') -> typing.List: + actual_dir = os.path.join(dir_root, sub_dir) + return [ + os.path.join(actual_dir, dir_file) + for dir_file in os.listdir(actual_dir) + if Path(dir_file).suffix == format + ] diff --git a/lib/odem/processing/mets.py b/lib/odem/processing/mets.py index 6d7b971..c4128d8 100644 --- a/lib/odem/processing/mets.py +++ b/lib/odem/processing/mets.py @@ -264,10 +264,10 @@ def integrate_ocr_file(xml_tree, ocr_files: typing.List) -> int: tag_file = f'{{{df.XMLNS["mets"]}}}file' tag_flocat = f'{{{df.XMLNS["mets"]}}}FLocat' - file_grp_fulltext = ET.Element(tag_file_group, USE=odem_c.FILEGROUP_OCR) + file_grp_fulltext = ET.Element(tag_file_group, USE=odem_c.FILEGROUP_FULLTEXT) for _ocr_file in ocr_files: _file_name = os.path.basename(_ocr_file).split('.')[0] - new_id = odem_c.FILEGROUP_OCR + '_' + _file_name + new_id = odem_c.FILEGROUP_FULLTEXT + '_' + _file_name file_ocr = ET.Element( tag_file, MIMETYPE="application/alto+xml", ID=new_id) flocat_href = ET.Element(tag_flocat, LOCTYPE="URL") diff --git a/lib/odem/processing/ocr_files.py b/lib/odem/processing/ocr_files.py index 20ec2aa..5665bfa 100644 --- a/lib/odem/processing/ocr_files.py +++ b/lib/odem/processing/ocr_files.py @@ -41,7 +41,7 @@ 'alto:Illustration', 'alto:GraphicalElement'] -LOCAL_DIR_RESULT = 'PAGE' +# LOCAL_OCRD_RESULT_DIR = 'PAGE' class ODEMMetadataOcrException(Exception): @@ -92,33 +92,20 @@ def postprocess_ocr_file(ocr_file, strip_tags): mproc.write() -def list_files(dir_root, sub_dir) -> typing.List: - _curr_dir = os.path.join(dir_root, sub_dir) - return [ - os.path.join(_curr_dir, _file) - for _file in os.listdir(_curr_dir) - if str(_file).endswith('.xml') - ] - - -def convert_to_output_format(work_dir_root): +def convert_to_output_format(ocrd_results: typing.List, dst_dir): """Convert created OCR-Files to required presentation format (i.e. 
ALTO) """ - _converted = [] - _fulltext_dir = os.path.join(work_dir_root, odem_c.FILEGROUP_OCR) - if not os.path.isdir(_fulltext_dir): - os.makedirs(_fulltext_dir, exist_ok=True) - _results = list_files(work_dir_root, LOCAL_DIR_RESULT) - for _file in _results: + converted_files = [] + for _file in ocrd_results: the_id = os.path.basename(_file) - output_file = os.path.join(_fulltext_dir, the_id) - converter = opta_c.OcrdPageAltoConverter(page_filename=_file).convert() + output_file = os.path.join(dst_dir, the_id) + converted = opta_c.OcrdPageAltoConverter(page_filename=_file).convert() with open(output_file, 'w', encoding='utf-8') as output: - output.write(str(converter)) - _converted.append(output_file) - return _converted + output.write(str(converted)) + converted_files.append(output_file) + return converted_files def _is_completely_punctuated(a_string): diff --git a/lib/odem/processing/processing_mets.py b/lib/odem/processing/processing_mets.py index 1691e84..c18c38f 100644 --- a/lib/odem/processing/processing_mets.py +++ b/lib/odem/processing/processing_mets.py @@ -262,10 +262,10 @@ def integrate_ocr_file(xml_tree, ocr_files: typing.List) -> int: tag_file = f'{{{df.XMLNS["mets"]}}}file' tag_flocat = f'{{{df.XMLNS["mets"]}}}FLocat' - file_grp_fulltext = ET.Element(tag_file_group, USE=odem.FILEGROUP_OCR) + file_grp_fulltext = ET.Element(tag_file_group, USE=odem.FILEGROUP_FULLTEXT) for _ocr_file in ocr_files: _file_name = os.path.basename(_ocr_file).split('.')[0] - new_id = odem.FILEGROUP_OCR + '_' + _file_name + new_id = odem.FILEGROUP_FULLTEXT + '_' + _file_name file_ocr = ET.Element( tag_file, MIMETYPE="application/alto+xml", ID=new_id) flocat_href = ET.Element(tag_flocat, LOCTYPE="URL") diff --git a/tests/test_ocrd3_odem.py b/tests/test_ocrd3_odem.py index 89f716e..58c4f45 100644 --- a/tests/test_ocrd3_odem.py +++ b/tests/test_ocrd3_odem.py @@ -4,6 +4,7 @@ import os import shutil import unittest +import unittest.mock import lxml.etree as ET import pytest @@ -266,6 +267,11 @@ def test_module_fixture_one_images_4_ocr_by_metadata(fixture_27949: odem.ODEMPro def test_fixture_one_postprocess_ocr_create_text_bundle(fixture_27949: odem.ODEMProcess): """Ensure text bundle data created and present with expected number of text rows + Please note: + according to workflow modifications the ocr-output + is no longer postprocessed, and lots of to short + non-alphabetical lines will remain + therefore line number increased from 77 => 111 """ # arrange @@ -273,15 +279,14 @@ def test_fixture_one_postprocess_ocr_create_text_bundle(fixture_27949: odem.ODEM # act fixture_27949.link_ocr_files() - fixture_27949.postprocess_ocr() fixture_27949.create_text_bundle_data() # assert _txt_bundle_file = tmp_path / '198114125.pdf.txt' assert os.path.exists(_txt_bundle_file) - assert 77 == fixture_27949.statistics['n_text_lines'] + assert 111 == fixture_27949.statistics['n_text_lines'] with open(_txt_bundle_file, encoding='utf-8') as bundle_handle: - assert 77 == len(bundle_handle.readlines()) + assert 111 == len(bundle_handle.readlines()) def test_images_4_ocr_properly_filtered(tmp_path): @@ -417,7 +422,7 @@ def test_export_flat_zip(tmp_path): oproc.mets_file = str(trgt_mets) oproc.inspect_metadata() - _langs = oproc.statistics.get(odem.STATS_KEY_LANGS) + # _langs = oproc.statistics.get(odem.STATS_KEY_LANGS) # act zipfilepath, _ = oproc.export_data() diff --git a/tests/test_odem_processing_ocr_files.py b/tests/test_odem_processing_ocr_files.py index d34e8b2..7120e5d 100644 --- 
a/tests/test_odem_processing_ocr_files.py +++ b/tests/test_odem_processing_ocr_files.py @@ -2,8 +2,6 @@ import os -import pytest - import lxml.etree as ET import digiflow as df @@ -32,22 +30,6 @@ def test_module_fixture_one_integrated_ocr_files_fit_identifier(fixture_27949: o assert not os.path.exists(tmp_path / 'FULLTEXT' / '00000007.xml') -def test_fixture_one_postprocessed_ocr_files_elements(fixture_27949: odem.ODEMProcess): - """Ensure ocr-file unwanted elements dropped as expected - """ - - # arrange - tmp_path = fixture_27949.work_dir_main - - # act - # fixture_27949.link_ocr() - fixture_27949.postprocess_ocr() - - # assert - ocr_file_03 = ET.parse(str(tmp_path / 'FULLTEXT' / '00000003.xml')).getroot() - assert not ocr_file_03.xpath('//alto:Shape', namespaces=df.XMLNS) - - def test_fixture_one_postprocess_ocr_files(fixture_27949: odem.ODEMProcess): """Ensure expected replacements done *even* when diacritics occour more several times in single word"""
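
With the per-module helpers removed, both workflow implementations now rely during foster_outputs() on the single odem_commons.list_files() added above. A small usage illustration — the work directory below is invented, while the directory names and the default '.xml' suffix filter come from the diff:

# Illustration only; the path is hypothetical.
import lib.odem.odem_commons as odem_c

work_dir = '/data/odem/198114125'
# OCRDPageParallel first gathers the intermediate OCR-D results from the 'PAGE' dir ...
page_files = odem_c.list_files(work_dir, 'PAGE')
# ... converts them to ALTO, then both workflows pick up the final files from 'FULLTEXT':
alto_files = odem_c.list_files(work_dir, odem_c.FILEGROUP_FULLTEXT)
# only entries whose suffix matches the 'format' argument (default '.xml') are returned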