[app][rfct] increase usage of consts

ulb-sachsen-anhalt · Jun 14, 2024 · a138e25 · a138e25
1 parent 255804f
commit a138e25
Show file tree

Hide file tree

Showing 9 changed files with 51 additions and 47 deletions.
diff --git a/cli_dir_local.py b/cli_dir_local.py
@@ -92,9 +92,9 @@
             shutil.rmtree(req_dst_dir)
         os.makedirs(req_dst_dir, exist_ok=True)
 
-        proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
+        proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
         if proc_type is None:
-            LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. defaults to 'OCRD_PAGE_PARALLEL'")
+            LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'")
         PROCESS: odem.ODEMProcess = odem.ODEMProcess.create(proc_type, None, req_dst_dir, EXECUTORS)
         PROCESS.local_mode = True
         PROCESS.odem_configuration = CFG

diff --git a/cli_mets_local.py b/cli_mets_local.py
@@ -78,8 +78,8 @@
         print(f"unable to read config from '{CONF_FILE}! exit!")
         sys.exit(1)
 
-    CREATE_PDF: bool = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
-    ENRICH_METS_FULLTEXT: bool = CFG.getboolean('export', 'enrich_mets_fulltext', fallback=True)
+    CREATE_PDF = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
+
 
     # set work_dirs and logger
     DELETE_BEVOR_EXPORT = []
@@ -103,18 +103,18 @@
 
     # if valid n_executors via cli, use its value
     if EXECUTOR_ARGS and int(EXECUTOR_ARGS) > 0:
-        CFG.set('ocr', 'n_executors', str(EXECUTOR_ARGS))
-    EXECUTORS = CFG.getint('ocr', 'n_executors', fallback=DEFAULT_EXECUTORS)
+        CFG.set(odem.CFG_SEC_OCR, 'n_executors', str(EXECUTOR_ARGS))
+    EXECUTORS = CFG.getint(odem.CFG_SEC_OCR, 'n_executors', fallback=DEFAULT_EXECUTORS)
     if SEQUENTIAL:
         EXECUTORS = 1
     LOGGER.debug("local work_root: '%s', executors:%s, keep_res:%s, lock:%s",
                  mets_file_dir, EXECUTORS, MUST_KEEP_RESOURCES, MUST_LOCK)
 
     try:
         local_ident = mets_file.stem
-        proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
+        proc_type: str = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
         if proc_type is None:
-            LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. defaults to 'OCRD_PAGE_PARALLEL'")
+            LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'")
         record = df.OAIRecord(local_ident)
         odem_process: odem.ODEMProcess = odem.ODEMProcess(record, mets_file_dir)
         odem_process.the_logger = LOGGER
@@ -130,7 +130,7 @@
         process_resource_monitor.check_vmem()
         # process_resource_monitor.monit_disk_space(odem_process.load)
         odem_process.inspect_metadata()
-        if CFG.getboolean('mets','prevalidate', fallback=True):
+        if CFG.getboolean('mets', 'prevalidate', fallback=True):
             odem_process.validate_metadata()
         odem_process.clear_existing_entries()
         odem_process.language_modelconfig()
@@ -147,30 +147,33 @@
         odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
         odem_process.link_ocr_files()
         odem_process.postprocess_ocr()
+        wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True)
+        if wf_enrich_ocr:
+            odem_process.link_ocr_files()
         if CREATE_PDF:
             odem_process.create_pdf()
         if CREATE_PDF:
             odem_process.create_text_bundle_data()
         odem_process.postprocess_mets()
-        if CFG.getboolean('mets','postvalidate', fallback=True):
+        if CFG.getboolean('mets', 'postvalidate', fallback=True):
             odem_process.validate_metadata()
         if odem_process.odem_configuration.has_option('export', 'local_export_dir'):
-            odem_process.the_logger.info("[%s] start to export data", 
+            odem_process.the_logger.info("[%s] start to export data",
                                          odem_process.process_identifier)
             if not MUST_KEEP_RESOURCES and len(DELETE_BEVOR_EXPORT) > 0:
                 odem_process.delete_before_export(DELETE_BEVOR_EXPORT)
             odem_process.export_data()
         _mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}'
         odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier,
-                                odem_process.duration, _mode, odem_process.statistics)
+                                     odem_process.duration, _mode, odem_process.statistics)
         LOGGER.info("[%s] odem done in '%s' (%d executors)",
                     odem_process.process_identifier, odem_process.duration, EXECUTORS)
     except odem.ODEMNoTypeForOCRException as type_unknown:
-        LOGGER.warning("[%s] odem skips '%s'", 
+        LOGGER.warning("[%s] odem skips '%s'",
                        odem_process.process_identifier, type_unknown.args[0])
     except odem.ODEMNoImagesForOCRException as not_ocrable:
-        LOGGER.warning("[%s] odem no ocrables '%s'", 
-                       odem_process.process_identifier,  not_ocrable.args)
+        LOGGER.warning("[%s] odem no ocrables '%s'",
+                       odem_process.process_identifier, not_ocrable.args)
     except odem.ODEMException as _odem_exc:
         _err_args = {'ODEMException': _odem_exc.args[0]}
         LOGGER.error("[%s] odem fails with: '%s'", odem_process.process_identifier, _err_args)

diff --git a/cli_oai_client.py b/cli_oai_client.py
@@ -197,9 +197,6 @@ def oai_arg_parser(value):
         print(f"[ERROR] unable to read config from '{CONF_FILE}! exit!")
         sys.exit(1)
 
-    CREATE_PDF: bool = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
-    ENRICH_METS_FULLTEXT: bool = CFG.getboolean('export', 'enrich_mets_fulltext', fallback=True)
-
     # set work_dirs and logger
     LOCAL_WORK_ROOT = CFG.get('global', 'local_work_root')
     LOCAL_DELETE_BEFORE_EXPORT = []
@@ -233,8 +230,8 @@ def oai_arg_parser(value):
     # parallel OCR-D instances shall be used
     EXECUTOR_ARGS = ARGS.executors
     if EXECUTOR_ARGS and int(EXECUTOR_ARGS) > 0:
-        CFG.set('ocr', 'n_executors', str(EXECUTOR_ARGS))
-    EXECUTORS = CFG.getint('ocr', 'n_executors', fallback=odem.DEFAULT_EXECUTORS)
+        CFG.set(odem.CFG_SEC_OCR, 'n_executors', str(EXECUTOR_ARGS))
+    EXECUTORS = CFG.getint(odem.CFG_SEC_OCR, 'n_executors', fallback=odem.DEFAULT_EXECUTORS)
     LOGGER.debug("local work_root: '%s', executors:%s, keep_res:%s, lock:%s",
                  LOCAL_WORK_ROOT, EXECUTORS, MUST_KEEP_RESOURCES, MUST_LOCK)
     DATA_FIELDS = CFG.getlist('global', 'data_fields')
@@ -266,7 +263,7 @@ def oai_arg_parser(value):
     odem_process: odem.ODEMProcess = odem.ODEMProcess(record, req_dst_dir)
     odem_process.the_logger = LOGGER
     odem_process.the_logger.debug(
-        "request %s from %s, %s part slots)",
+        "request %s from %s (%s part slots)",
         local_ident,
         CLIENT.host, EXECUTORS
     )
@@ -301,7 +298,7 @@ def oai_arg_parser(value):
         odem_process.set_local_images()
 
         # NEW NEW NEW
-        proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
+        proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
         odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
         odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
         ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
@@ -312,11 +309,12 @@ def oai_arg_parser(value):
         _stats_ocr = odem_process.statistics
         odem_process.the_logger.info("[%s] %s", local_ident, _stats_ocr)
         odem_process.postprocess_ocr()
-        if ENRICH_METS_FULLTEXT:
+        wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True)
+        if wf_enrich_ocr:
             odem_process.link_ocr_files()
-        if CREATE_PDF:
+        wf_create_pdf = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
+        if wf_create_pdf:
             odem_process.create_pdf()
-        if CREATE_PDF:
             odem_process.create_text_bundle_data()
         odem_process.postprocess_mets()
         if CFG.getboolean('mets', 'postvalidate', fallback=True):

diff --git a/cli_oai_local.py b/cli_oai_local.py
@@ -104,7 +104,7 @@ def trnfrm(row):
         sys.exit(1)
 
     CREATE_PDF: bool = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
-    ENRICH_METS_FULLTEXT: bool = CFG.getboolean('export', 'enrich_mets_fulltext', fallback=True)
+    ENRICH_METS_FULLTEXT: bool = CFG.getboolean(odem.CFG_SEC_METS, 'enrich_mets_fulltext', fallback=True)
 
     # set work_dirs and logger
     LOCAL_WORK_ROOT = CFG.get('global', 'local_work_root')
@@ -126,8 +126,8 @@ def trnfrm(row):
 
     # if valid n_executors via cli, use its value
     if EXECUTOR_ARGS and int(EXECUTOR_ARGS) > 0:
-        CFG.set('ocr', 'n_executors', str(EXECUTOR_ARGS))
-    EXECUTORS = CFG.getint('ocr', 'n_executors', fallback=DEFAULT_EXECUTORS)
+        CFG.set(odem.CFG_SEC_OCR, 'n_executors', str(EXECUTOR_ARGS))
+    EXECUTORS = CFG.getint(odem.CFG_SEC_OCR, 'n_executors', fallback=DEFAULT_EXECUTORS)
     if SEQUENTIAL:
         EXECUTORS = 1
     LOGGER.debug("local work_root: '%s', executors:%s, keep_res:%s, lock:%s",
@@ -156,9 +156,9 @@ def wrap_save_record_state(status: str, urn, **kwargs):
         if os.path.exists(req_dst_dir):
             shutil.rmtree(req_dst_dir)
 
-        proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
+        proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
         if proc_type is None:
-            LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. defaults to 'OCRD_PAGE_PARALLEL'")
+            LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'")
         odem_process: ODEMProcess = ODEMProcess(record, req_dst_dir)
         odem_process.the_logger = LOGGER
         odem_process.the_logger.info("[%s] odem from %s, %d executors", local_ident, OAI_RECORD_FILE, EXECUTORS)
@@ -196,6 +196,8 @@ def wrap_save_record_state(status: str, urn, **kwargs):
         odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
         odem_process.link_ocr_files()
         odem_process.postprocess_ocr()
+        if ENRICH_METS_FULLTEXT:
+            odem_process.link_ocr_files()
         if CREATE_PDF:
             odem_process.create_pdf()
         if CREATE_PDF:

diff --git a/lib/odem/ocrd3_odem.py b/lib/odem/ocrd3_odem.py
@@ -248,7 +248,7 @@ def language_modelconfig(self, languages=None) -> str:
 
         _models = []
         model_mappings: dict = self.odem_configuration.getdict(  # pylint: disable=no-member
-            'ocr', 'model_mapping')
+            CFG_SEC_OCR, 'model_mapping')
         self.the_logger.info("[%s] inspect languages '%s'",
                              self.process_identifier, languages)
         if languages is None:
@@ -262,7 +262,7 @@ def language_modelconfig(self, languages=None) -> str:
                     _models.append(model)
                 else:
                     raise ODEMException(f"'{model}' model config not found !")
-        _model_conf = '+'.join(_models) if self.odem_configuration.getboolean('ocr', "model_combinable", fallback=True) else _models[0]
+        _model_conf = '+'.join(_models) if self.odem_configuration.getboolean(CFG_SEC_OCR, "model_combinable", fallback=True) else _models[0]
         self._statistics_ocr[STATS_KEY_MODELS] = _model_conf
         self.the_logger.info("[%s] map languages '%s' => '%s'",
                              self.process_identifier, languages, _model_conf)
@@ -386,7 +386,7 @@ def postprocess_ocr(self):
         # inspect each single created ocr file
         # drop unwanted elements
         # clear punctual regions
-        strip_tags = self.odem_configuration.getlist('ocr', 'strip_tags')
+        strip_tags = self.odem_configuration.getlist(CFG_SEC_OCR, 'strip_tags')
         for _ocr_file in self.ocr_files:
             postprocess_ocr_file(_ocr_file, strip_tags)
 
@@ -652,7 +652,7 @@ def process(self, input_data):
         """Create OCR Data"""
 
         ocr_log_conf = os.path.join(
-            PROJECT_ROOT, self.cfg.get('ocr', 'ocrd_logging'))
+            PROJECT_ROOT, self.cfg.get(CFG_SEC_OCR, 'ocrd_logging'))
 
         # Prepare workspace with makefile
         (image_path, ident) = input_data
@@ -695,17 +695,17 @@ def process(self, input_data):
         profiling = ('n.a.', 0)
 
         container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}'
-        container_memory_limit: str = self.cfg.get('ocr', 'docker_container_memory_limit', fallback=None)
-        container_user = self.cfg.get('ocr', 'docker_container_user', fallback=os.getuid())
+        container_memory_limit: str = self.cfg.get(CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None)
+        container_user = self.cfg.get(CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid())
         container_timeout: int = self.cfg.getint(
-            'ocr',
+            CFG_SEC_OCR,
             'docker_container_timeout',
             fallback=DEFAULT_DOCKER_CONTAINER_TIMEOUT
         )
-        base_image = self.cfg.get('ocr', 'ocrd_baseimage')
-        ocrd_process_list = self.cfg.getlist('ocr', 'ocrd_process_list')
-        tesseract_model_rtl: typing.List[str] = self.cfg.getlist('ocr', 'tesseract_model_rtl', fallback=DEFAULT_RTL_MODELS)
-        ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict('ocr', CFG_SEC_OCR_OPT_RES_VOL, fallback={})
+        base_image = self.cfg.get(CFG_SEC_OCR, 'ocrd_baseimage')
+        ocrd_process_list = self.cfg.getlist(CFG_SEC_OCR, 'ocrd_process_list')
+        tesseract_model_rtl: typing.List[str] = self.cfg.getlist(CFG_SEC_OCR, 'tesseract_model_rtl', fallback=DEFAULT_RTL_MODELS)
+        ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(CFG_SEC_OCR, CFG_SEC_OCR_OPT_RES_VOL, fallback={})
 
         if self.odem.local_mode:
             container_name = os.path.basename(page_workdir)
@@ -739,7 +739,7 @@ def process(self, input_data):
                                   _ident, plain_exc, base_image)
 
         os.chdir(self.odem.work_dir_main)
-        if self.cfg.getboolean('ocr', 'keep_temp_orcd_data', fallback=False) is False:
+        if self.cfg.getboolean(CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False:
             shutil.rmtree(page_workdir, ignore_errors=True)
         return stored, 1, mps, filesize_mb
 
@@ -850,8 +850,8 @@ def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser:
 
         if self.pipeline_configuration is None:
             if path_config is None:
-                if self.odem_configuration.has_option('ocr', 'ocr_pipeline_config'):
-                    path_config = os.path.abspath(self.odem_configuration.get('ocr', 'ocr_pipeline_config'))
+                if self.odem_configuration.has_option(CFG_SEC_OCR, 'ocr_pipeline_config'):
+                    path_config = os.path.abspath(self.odem_configuration.get(CFG_SEC_OCR, 'ocr_pipeline_config'))
             if not os.path.isfile(path_config):
                 raise ODEMException(f"no ocr-pipeline conf {path_config} !")
             pipe_cfg = configparser.ConfigParser()

diff --git a/lib/odem/odem_commons.py b/lib/odem/odem_commons.py
@@ -54,6 +54,7 @@ class ExportFormat(str, Enum):
 CFG_SEC_OCR_OPT_MODEL_COMBINABLE = "model_combinable"
 CFG_SEC_METS = 'mets'
 CFG_SEC_METS_OPT_AGENTS = 'agents'
+CFG_SEC_METS_OPT_ENRICH = 'enrich_fulltext'
 KEY_EXECS = 'n_executors'
 KEY_LANGUAGES = 'language_model'
 KEY_MODEL_MAP = 'model_mapping'

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -74,7 +74,7 @@ def _module_fixture_123456789_27949(tmp_path_factory):
     record = df.OAIRecord('oai:dev.opendata.uni-halle.de:123456789/27949')
     _oproc = odem.ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log')
     _oproc.odem_configuration = fixture_configuration()
-    _oproc.odem_configuration.set('ocr', odem.CFG_SEC_OCR_OPT_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize')
+    _oproc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize')
     _oproc.ocr_files = [os.path.join(trgt_alto, a)
                         for a in os.listdir(trgt_alto)]
     _oproc.mets_file = str(trgt_mets)

diff --git a/tests/test_odem_processing_mets.py b/tests/test_odem_processing_mets.py
@@ -75,7 +75,7 @@ def test_postprocess_mets_agent_odem_fits(post_mets):
     _agent_odem = post_mets.xpath('//mets:agent', namespaces=df.XMLNS)[3]
     _xp_agent_note = 'mets:note/text()'
     _xp_agent_name = 'mets:name/text()'
-    _curr_image = fixture_configuration().get('ocr', 'ocrd_baseimage')
+    _curr_image = fixture_configuration().get(odem.CFG_SEC_OCR, 'ocrd_baseimage')
     assert _agent_odem.xpath(_xp_agent_name, namespaces=df.XMLNS)[0] == f'DFG-OCRD3-ODEM_{_curr_image}'
     _today = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
     assert _today in _agent_odem.xpath(_xp_agent_note, namespaces=df.XMLNS)[0]

diff --git a/tests/test_odem_processing_ocr_files.py b/tests/test_odem_processing_ocr_files.py
@@ -55,7 +55,7 @@ def test_fixture_one_postprocess_ocr_files(fixture_27949: odem.ODEMProcess):
     # arrange
     tmp_path = fixture_27949.work_dir_main
     path_file = tmp_path / 'FULLTEXT' / '00000003.xml'
-    strip_tags = fixture_configuration().getlist('ocr', 'strip_tags') # pylint: disable=no-member
+    strip_tags = fixture_configuration().getlist(odem.CFG_SEC_OCR, 'strip_tags')  # pylint: disable=no-member
 
     # act
     odem.postprocess_ocr_file(path_file, strip_tags)