Skip to content

Commit

Permalink
[app][rfct] increase usage of consts
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed Jun 14, 2024
1 parent 255804f commit a138e25
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 47 deletions.
4 changes: 2 additions & 2 deletions cli_dir_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@
shutil.rmtree(req_dst_dir)
os.makedirs(req_dst_dir, exist_ok=True)

proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
if proc_type is None:
LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. defaults to 'OCRD_PAGE_PARALLEL'")
LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'")
PROCESS: odem.ODEMProcess = odem.ODEMProcess.create(proc_type, None, req_dst_dir, EXECUTORS)
PROCESS.local_mode = True
PROCESS.odem_configuration = CFG
Expand Down
29 changes: 16 additions & 13 deletions cli_mets_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@
print(f"unable to read config from '{CONF_FILE}! exit!")
sys.exit(1)

CREATE_PDF: bool = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
ENRICH_METS_FULLTEXT: bool = CFG.getboolean('export', 'enrich_mets_fulltext', fallback=True)
CREATE_PDF = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)


# set work_dirs and logger
DELETE_BEVOR_EXPORT = []
Expand All @@ -103,18 +103,18 @@

# if valid n_executors via cli, use its value
if EXECUTOR_ARGS and int(EXECUTOR_ARGS) > 0:
CFG.set('ocr', 'n_executors', str(EXECUTOR_ARGS))
EXECUTORS = CFG.getint('ocr', 'n_executors', fallback=DEFAULT_EXECUTORS)
CFG.set(odem.CFG_SEC_OCR, 'n_executors', str(EXECUTOR_ARGS))
EXECUTORS = CFG.getint(odem.CFG_SEC_OCR, 'n_executors', fallback=DEFAULT_EXECUTORS)
if SEQUENTIAL:
EXECUTORS = 1
LOGGER.debug("local work_root: '%s', executors:%s, keep_res:%s, lock:%s",
mets_file_dir, EXECUTORS, MUST_KEEP_RESOURCES, MUST_LOCK)

try:
local_ident = mets_file.stem
proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
proc_type: str = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
if proc_type is None:
LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. defaults to 'OCRD_PAGE_PARALLEL'")
LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'")
record = df.OAIRecord(local_ident)
odem_process: odem.ODEMProcess = odem.ODEMProcess(record, mets_file_dir)
odem_process.the_logger = LOGGER
Expand All @@ -130,7 +130,7 @@
process_resource_monitor.check_vmem()
# process_resource_monitor.monit_disk_space(odem_process.load)
odem_process.inspect_metadata()
if CFG.getboolean('mets','prevalidate', fallback=True):
if CFG.getboolean('mets', 'prevalidate', fallback=True):
odem_process.validate_metadata()
odem_process.clear_existing_entries()
odem_process.language_modelconfig()
Expand All @@ -147,30 +147,33 @@
odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
odem_process.link_ocr_files()
odem_process.postprocess_ocr()
wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True)
if wf_enrich_ocr:
odem_process.link_ocr_files()
if CREATE_PDF:
odem_process.create_pdf()
if CREATE_PDF:
odem_process.create_text_bundle_data()
odem_process.postprocess_mets()
if CFG.getboolean('mets','postvalidate', fallback=True):
if CFG.getboolean('mets', 'postvalidate', fallback=True):
odem_process.validate_metadata()
if odem_process.odem_configuration.has_option('export', 'local_export_dir'):
odem_process.the_logger.info("[%s] start to export data",
odem_process.the_logger.info("[%s] start to export data",
odem_process.process_identifier)
if not MUST_KEEP_RESOURCES and len(DELETE_BEVOR_EXPORT) > 0:
odem_process.delete_before_export(DELETE_BEVOR_EXPORT)
odem_process.export_data()
_mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}'
odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier,
odem_process.duration, _mode, odem_process.statistics)
odem_process.duration, _mode, odem_process.statistics)
LOGGER.info("[%s] odem done in '%s' (%d executors)",
odem_process.process_identifier, odem_process.duration, EXECUTORS)
except odem.ODEMNoTypeForOCRException as type_unknown:
LOGGER.warning("[%s] odem skips '%s'",
LOGGER.warning("[%s] odem skips '%s'",
odem_process.process_identifier, type_unknown.args[0])
except odem.ODEMNoImagesForOCRException as not_ocrable:
LOGGER.warning("[%s] odem no ocrables '%s'",
odem_process.process_identifier, not_ocrable.args)
LOGGER.warning("[%s] odem no ocrables '%s'",
odem_process.process_identifier, not_ocrable.args)
except odem.ODEMException as _odem_exc:
_err_args = {'ODEMException': _odem_exc.args[0]}
LOGGER.error("[%s] odem fails with: '%s'", odem_process.process_identifier, _err_args)
Expand Down
18 changes: 8 additions & 10 deletions cli_oai_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,6 @@ def oai_arg_parser(value):
print(f"[ERROR] unable to read config from '{CONF_FILE}! exit!")
sys.exit(1)

CREATE_PDF: bool = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
ENRICH_METS_FULLTEXT: bool = CFG.getboolean('export', 'enrich_mets_fulltext', fallback=True)

# set work_dirs and logger
LOCAL_WORK_ROOT = CFG.get('global', 'local_work_root')
LOCAL_DELETE_BEFORE_EXPORT = []
Expand Down Expand Up @@ -233,8 +230,8 @@ def oai_arg_parser(value):
# parallel OCR-D instances shall be used
EXECUTOR_ARGS = ARGS.executors
if EXECUTOR_ARGS and int(EXECUTOR_ARGS) > 0:
CFG.set('ocr', 'n_executors', str(EXECUTOR_ARGS))
EXECUTORS = CFG.getint('ocr', 'n_executors', fallback=odem.DEFAULT_EXECUTORS)
CFG.set(odem.CFG_SEC_OCR, 'n_executors', str(EXECUTOR_ARGS))
EXECUTORS = CFG.getint(odem.CFG_SEC_OCR, 'n_executors', fallback=odem.DEFAULT_EXECUTORS)
LOGGER.debug("local work_root: '%s', executors:%s, keep_res:%s, lock:%s",
LOCAL_WORK_ROOT, EXECUTORS, MUST_KEEP_RESOURCES, MUST_LOCK)
DATA_FIELDS = CFG.getlist('global', 'data_fields')
Expand Down Expand Up @@ -266,7 +263,7 @@ def oai_arg_parser(value):
odem_process: odem.ODEMProcess = odem.ODEMProcess(record, req_dst_dir)
odem_process.the_logger = LOGGER
odem_process.the_logger.debug(
"request %s from %s, %s part slots)",
"request %s from %s (%s part slots)",
local_ident,
CLIENT.host, EXECUTORS
)
Expand Down Expand Up @@ -301,7 +298,7 @@ def oai_arg_parser(value):
odem_process.set_local_images()

# NEW NEW NEW
proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
Expand All @@ -312,11 +309,12 @@ def oai_arg_parser(value):
_stats_ocr = odem_process.statistics
odem_process.the_logger.info("[%s] %s", local_ident, _stats_ocr)
odem_process.postprocess_ocr()
if ENRICH_METS_FULLTEXT:
wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True)
if wf_enrich_ocr:
odem_process.link_ocr_files()
if CREATE_PDF:
wf_create_pdf = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
if wf_create_pdf:
odem_process.create_pdf()
if CREATE_PDF:
odem_process.create_text_bundle_data()
odem_process.postprocess_mets()
if CFG.getboolean('mets', 'postvalidate', fallback=True):
Expand Down
12 changes: 7 additions & 5 deletions cli_oai_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def trnfrm(row):
sys.exit(1)

CREATE_PDF: bool = CFG.getboolean('derivans', 'derivans_enabled', fallback=True)
ENRICH_METS_FULLTEXT: bool = CFG.getboolean('export', 'enrich_mets_fulltext', fallback=True)
ENRICH_METS_FULLTEXT: bool = CFG.getboolean(odem.CFG_SEC_METS, 'enrich_mets_fulltext', fallback=True)

# set work_dirs and logger
LOCAL_WORK_ROOT = CFG.get('global', 'local_work_root')
Expand All @@ -126,8 +126,8 @@ def trnfrm(row):

# if valid n_executors via cli, use its value
if EXECUTOR_ARGS and int(EXECUTOR_ARGS) > 0:
CFG.set('ocr', 'n_executors', str(EXECUTOR_ARGS))
EXECUTORS = CFG.getint('ocr', 'n_executors', fallback=DEFAULT_EXECUTORS)
CFG.set(odem.CFG_SEC_OCR, 'n_executors', str(EXECUTOR_ARGS))
EXECUTORS = CFG.getint(odem.CFG_SEC_OCR, 'n_executors', fallback=DEFAULT_EXECUTORS)
if SEQUENTIAL:
EXECUTORS = 1
LOGGER.debug("local work_root: '%s', executors:%s, keep_res:%s, lock:%s",
Expand Down Expand Up @@ -156,9 +156,9 @@ def wrap_save_record_state(status: str, urn, **kwargs):
if os.path.exists(req_dst_dir):
shutil.rmtree(req_dst_dir)

proc_type: str = CFG.get('ocr', 'workflow_type', fallback=None)
proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
if proc_type is None:
LOGGER.warning("no 'workflow_type' config option in section 'ocr' defined. defaults to 'OCRD_PAGE_PARALLEL'")
LOGGER.warning("no 'workflow_type' config option in section ocr defined. defaults to 'OCRD_PAGE_PARALLEL'")
odem_process: ODEMProcess = ODEMProcess(record, req_dst_dir)
odem_process.the_logger = LOGGER
odem_process.the_logger.info("[%s] odem from %s, %d executors", local_ident, OAI_RECORD_FILE, EXECUTORS)
Expand Down Expand Up @@ -196,6 +196,8 @@ def wrap_save_record_state(status: str, urn, **kwargs):
odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
odem_process.link_ocr_files()
odem_process.postprocess_ocr()
if ENRICH_METS_FULLTEXT:
odem_process.link_ocr_files()
if CREATE_PDF:
odem_process.create_pdf()
if CREATE_PDF:
Expand Down
28 changes: 14 additions & 14 deletions lib/odem/ocrd3_odem.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def language_modelconfig(self, languages=None) -> str:

_models = []
model_mappings: dict = self.odem_configuration.getdict( # pylint: disable=no-member
'ocr', 'model_mapping')
CFG_SEC_OCR, 'model_mapping')
self.the_logger.info("[%s] inspect languages '%s'",
self.process_identifier, languages)
if languages is None:
Expand All @@ -262,7 +262,7 @@ def language_modelconfig(self, languages=None) -> str:
_models.append(model)
else:
raise ODEMException(f"'{model}' model config not found !")
_model_conf = '+'.join(_models) if self.odem_configuration.getboolean('ocr', "model_combinable", fallback=True) else _models[0]
_model_conf = '+'.join(_models) if self.odem_configuration.getboolean(CFG_SEC_OCR, "model_combinable", fallback=True) else _models[0]
self._statistics_ocr[STATS_KEY_MODELS] = _model_conf
self.the_logger.info("[%s] map languages '%s' => '%s'",
self.process_identifier, languages, _model_conf)
Expand Down Expand Up @@ -386,7 +386,7 @@ def postprocess_ocr(self):
# inspect each single created ocr file
# drop unwanted elements
# clear punctual regions
strip_tags = self.odem_configuration.getlist('ocr', 'strip_tags')
strip_tags = self.odem_configuration.getlist(CFG_SEC_OCR, 'strip_tags')
for _ocr_file in self.ocr_files:
postprocess_ocr_file(_ocr_file, strip_tags)

Expand Down Expand Up @@ -652,7 +652,7 @@ def process(self, input_data):
"""Create OCR Data"""

ocr_log_conf = os.path.join(
PROJECT_ROOT, self.cfg.get('ocr', 'ocrd_logging'))
PROJECT_ROOT, self.cfg.get(CFG_SEC_OCR, 'ocrd_logging'))

# Prepare workspace with makefile
(image_path, ident) = input_data
Expand Down Expand Up @@ -695,17 +695,17 @@ def process(self, input_data):
profiling = ('n.a.', 0)

container_name: str = f'{self.odem.process_identifier}_{os.path.basename(page_workdir)}'
container_memory_limit: str = self.cfg.get('ocr', 'docker_container_memory_limit', fallback=None)
container_user = self.cfg.get('ocr', 'docker_container_user', fallback=os.getuid())
container_memory_limit: str = self.cfg.get(CFG_SEC_OCR, 'docker_container_memory_limit', fallback=None)
container_user = self.cfg.get(CFG_SEC_OCR, 'docker_container_user', fallback=os.getuid())
container_timeout: int = self.cfg.getint(
'ocr',
CFG_SEC_OCR,
'docker_container_timeout',
fallback=DEFAULT_DOCKER_CONTAINER_TIMEOUT
)
base_image = self.cfg.get('ocr', 'ocrd_baseimage')
ocrd_process_list = self.cfg.getlist('ocr', 'ocrd_process_list')
tesseract_model_rtl: typing.List[str] = self.cfg.getlist('ocr', 'tesseract_model_rtl', fallback=DEFAULT_RTL_MODELS)
ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict('ocr', CFG_SEC_OCR_OPT_RES_VOL, fallback={})
base_image = self.cfg.get(CFG_SEC_OCR, 'ocrd_baseimage')
ocrd_process_list = self.cfg.getlist(CFG_SEC_OCR, 'ocrd_process_list')
tesseract_model_rtl: typing.List[str] = self.cfg.getlist(CFG_SEC_OCR, 'tesseract_model_rtl', fallback=DEFAULT_RTL_MODELS)
ocrd_resources_volumes: typing.Dict[str, str] = self.cfg.getdict(CFG_SEC_OCR, CFG_SEC_OCR_OPT_RES_VOL, fallback={})

if self.odem.local_mode:
container_name = os.path.basename(page_workdir)
Expand Down Expand Up @@ -739,7 +739,7 @@ def process(self, input_data):
_ident, plain_exc, base_image)

os.chdir(self.odem.work_dir_main)
if self.cfg.getboolean('ocr', 'keep_temp_orcd_data', fallback=False) is False:
if self.cfg.getboolean(CFG_SEC_OCR, 'keep_temp_orcd_data', fallback=False) is False:
shutil.rmtree(page_workdir, ignore_errors=True)
return stored, 1, mps, filesize_mb

Expand Down Expand Up @@ -850,8 +850,8 @@ def read_pipeline_config(self, path_config=None) -> configparser.ConfigParser:

if self.pipeline_configuration is None:
if path_config is None:
if self.odem_configuration.has_option('ocr', 'ocr_pipeline_config'):
path_config = os.path.abspath(self.odem_configuration.get('ocr', 'ocr_pipeline_config'))
if self.odem_configuration.has_option(CFG_SEC_OCR, 'ocr_pipeline_config'):
path_config = os.path.abspath(self.odem_configuration.get(CFG_SEC_OCR, 'ocr_pipeline_config'))
if not os.path.isfile(path_config):
raise ODEMException(f"no ocr-pipeline conf {path_config} !")
pipe_cfg = configparser.ConfigParser()
Expand Down
1 change: 1 addition & 0 deletions lib/odem/odem_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class ExportFormat(str, Enum):
CFG_SEC_OCR_OPT_MODEL_COMBINABLE = "model_combinable"
CFG_SEC_METS = 'mets'
CFG_SEC_METS_OPT_AGENTS = 'agents'
CFG_SEC_METS_OPT_ENRICH = 'enrich_fulltext'
KEY_EXECS = 'n_executors'
KEY_LANGUAGES = 'language_model'
KEY_MODEL_MAP = 'model_mapping'
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _module_fixture_123456789_27949(tmp_path_factory):
record = df.OAIRecord('oai:dev.opendata.uni-halle.de:123456789/27949')
_oproc = odem.ODEMProcess(record, work_dir=path_workdir, log_dir=path_workdir / 'log')
_oproc.odem_configuration = fixture_configuration()
_oproc.odem_configuration.set('ocr', odem.CFG_SEC_OCR_OPT_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize')
_oproc.odem_configuration.set(odem.CFG_SEC_OCR, odem.CFG_SEC_OCR_OPT_RES_VOL, f'{_model_dir}:/usr/local/share/ocrd-resources/ocrd-tesserocr-recognize')
_oproc.ocr_files = [os.path.join(trgt_alto, a)
for a in os.listdir(trgt_alto)]
_oproc.mets_file = str(trgt_mets)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_odem_processing_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_postprocess_mets_agent_odem_fits(post_mets):
_agent_odem = post_mets.xpath('//mets:agent', namespaces=df.XMLNS)[3]
_xp_agent_note = 'mets:note/text()'
_xp_agent_name = 'mets:name/text()'
_curr_image = fixture_configuration().get('ocr', 'ocrd_baseimage')
_curr_image = fixture_configuration().get(odem.CFG_SEC_OCR, 'ocrd_baseimage')
assert _agent_odem.xpath(_xp_agent_name, namespaces=df.XMLNS)[0] == f'DFG-OCRD3-ODEM_{_curr_image}'
_today = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
assert _today in _agent_odem.xpath(_xp_agent_note, namespaces=df.XMLNS)[0]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_odem_processing_ocr_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_fixture_one_postprocess_ocr_files(fixture_27949: odem.ODEMProcess):
# arrange
tmp_path = fixture_27949.work_dir_main
path_file = tmp_path / 'FULLTEXT' / '00000003.xml'
strip_tags = fixture_configuration().getlist('ocr', 'strip_tags') # pylint: disable=no-member
strip_tags = fixture_configuration().getlist(odem.CFG_SEC_OCR, 'strip_tags') # pylint: disable=no-member

# act
odem.postprocess_ocr_file(path_file, strip_tags)
Expand Down

0 comments on commit a138e25

Please sign in to comment.