Skip to content

Commit

Permalink
[app][fix] comply digiflow 4
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed Jun 18, 2024
1 parent d6df6ab commit 291988d
Show file tree
Hide file tree
Showing 12 changed files with 311 additions and 268 deletions.
15 changes: 8 additions & 7 deletions cli_dir_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,20 @@
PROCESS.odem_configuration = CFG
PROCESS.the_logger = LOGGER
local_images = PROCESS.get_local_image_paths(image_local_dir=ROOT_PATH)
PROCESS._statistics_ocr[odem.STATS_KEY_N_PAGES] = len(local_images)
PROCESS._statistics_ocr[odem.STATS_KEY_N_OCRABLE] = 0
PROCESS._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS
PROCESS.images_4_ocr = local_images
PROCESS.process_statistics[odem.STATS_KEY_N_PAGES] = len(local_images)
PROCESS.process_statistics[odem.STATS_KEY_N_OCRABLE] = 0
PROCESS.process_statistics[odem.STATS_KEY_N_EXECS] = EXECUTORS
PROCESS.ocr_candidates = local_images
# Type and Value change!!!
# ODEMProcess.single_ocr() needs Tuple[str,str], in non-local
# this is assigned to "PROCESS.images_4_ocr" in ODEMProcess.filter_images()
# thats why we have to manually fit that requirement
PROCESS.images_4_ocr = list(zip(PROCESS.images_4_ocr, [pathlib.Path(i).stem for i in PROCESS.images_4_ocr]))
PROCESS.ocr_candidates = list(zip(PROCESS.ocr_candidates,
[pathlib.Path(i).stem for i in PROCESS.ocr_candidates]))
PROCESS.run()
PROCESS.the_logger.info("[%s] duration: %s (%s)", req_idn,
PROCESS.duration, PROCESS.statistics)
PROCESS.statistics['timedelta'], PROCESS.statistics)
except Exception as exc:
LOGGER.error("odem fails for '%s' after %s with: '%s'",
req_idn, PROCESS.duration, str(exc))
req_idn, PROCESS.statistics['timedelta'], str(exc))
sys.exit(0)
8 changes: 4 additions & 4 deletions cli_mets_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@
if ocr_results is None or len(ocr_results) == 0:
raise odem.ODEMException(f"OCR Process Runner error for {record.identifier}")
odem_process.calculate_statistics_ocr(ocr_results)
odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS
odem_process.process_statistics[odem.STATS_KEY_N_EXECS] = EXECUTORS
odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
odem_process.link_ocr_files()
odem_process.postprocess_ocr()
Expand All @@ -165,9 +165,9 @@
odem_process.export_data()
_mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}'
odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier,
odem_process.duration, _mode, odem_process.statistics)
odem_process.statistics['timedelta'], _mode, odem_process.statistics)
LOGGER.info("[%s] odem done in '%s' (%d executors)",
odem_process.process_identifier, odem_process.duration, EXECUTORS)
odem_process.process_identifier, odem_process.statistics['timedelta'], EXECUTORS)
except odem.ODEMNoTypeForOCRException as type_unknown:
LOGGER.warning("[%s] odem skips '%s'",
odem_process.process_identifier, type_unknown.args[0])
Expand All @@ -179,5 +179,5 @@
LOGGER.error("[%s] odem fails with: '%s'", odem_process.process_identifier, _err_args)
except RuntimeError as exc:
LOGGER.error("odem fails for '%s' after %s with: '%s'",
record, odem_process.duration, str(exc))
record, odem_process.statistics['timedelta'], str(exc))
sys.exit(1)
20 changes: 11 additions & 9 deletions cli_oai_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
import typing

import requests

import digiflow as df
import digiflow.record as df_r

import lib.odem as odem
import lib.odem.monitoring as odem_rm
Expand All @@ -32,7 +34,7 @@
def trnfrm(row):
"""callback function"""
oai_id = row['IDENTIFIER']
oai_record = df.OAIRecord(oai_id)
oai_record = df_r.Record(oai_id)
return oai_record


Expand Down Expand Up @@ -92,13 +94,13 @@ def _request_record(self):
sys.exit(1)
return response.json()

def get_record(self) -> df.OAIRecord:
def get_record(self) -> df_r.Record:
"""Return requested data
as temporary OAI Record but
store internally as plain dictionary"""

self.record_data = self._request_record()
_oai_record = df.OAIRecord(self.record_data[odem.RECORD_IDENTIFIER])
_oai_record = df_r.Record(self.record_data[odem.RECORD_IDENTIFIER])
return _oai_record

def update(self, status, oai_urn, **kwargs):
Expand Down Expand Up @@ -305,7 +307,7 @@ def oai_arg_parser(value):
if ocr_results is None or len(ocr_results) == 0:
raise odem.ODEMException(f"process run error: {record.identifier}")
odem_process.calculate_statistics_ocr(ocr_results)
odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS
odem_process.process_statistics[odem.STATS_KEY_N_EXECS] = EXECUTORS
_stats_ocr = odem_process.statistics
odem_process.the_logger.info("[%s] %s", local_ident, _stats_ocr)
wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True)
Expand All @@ -331,16 +333,16 @@ def oai_arg_parser(value):
# finale
odem_process.clear_resources(remove_all=True)
LOGGER.info("[%s] odem done in '%s' (%d executors)",
odem_process.process_identifier, odem_process.duration, EXECUTORS)
odem_process.process_identifier, odem_process.statistics['timedelta'], EXECUTORS)
except odem.ODEMNoTypeForOCRException as type_unknown:
LOGGER.warning("[%s] odem skips '%s'",
odem_process.process_identifier, type_unknown.args)
LOGGER.warning("[%s] odem skips '%s'",
odem_process.process_identifier, type_unknown.args)
err_dict = {'NoTypeForOCR': type_unknown.args[0]}
CLIENT.update(status=odem.MARK_OCR_SKIP, oai_urn=rec_ident, **err_dict)
odem_process.clear_resources(remove_all=True)
except odem.ODEMNoImagesForOCRException as not_ocrable:
LOGGER.warning("[%s] odem no ocrables '%s'",
odem_process.process_identifier, not_ocrable.args)
LOGGER.warning("[%s] odem no ocrables '%s'",
odem_process.process_identifier, not_ocrable.args)
err_dict = {'NoImagesForOCR': not_ocrable.args[0]}
CLIENT.update(status=odem.MARK_OCR_SKIP, oai_urn=rec_ident, **err_dict)
odem_process.clear_resources(remove_all=True)
Expand Down
22 changes: 11 additions & 11 deletions cli_oai_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import shutil
import sys

import digiflow as df
import digiflow.record as df_r

import lib.odem as odem
import lib.odem.monitoring as odem_rm
Expand All @@ -24,7 +24,7 @@
ODEMProcessImpl,
ODEMException,
get_configparser,
get_logger,
get_logger,
)

DEFAULT_EXECUTORS = 2
Expand All @@ -34,9 +34,9 @@ def trnfrm(row):
oai_id = row[RECORD_IDENTIFIER]
try:
_info = ast.literal_eval(row[RECORD_INFO])
except:
except SyntaxError:
_info = row[RECORD_INFO]
_record = df.OAIRecord(oai_id,)
_record = df_r.Record(oai_id,)
_record.info = _info
return _record

Expand Down Expand Up @@ -134,9 +134,9 @@ def trnfrm(row):
DATA_FIELDS = CFG.getlist('global', 'data_fields')
LOGGER.info("data fields: '%s'", DATA_FIELDS)
LOGGER.info("use records from '%s'", OAI_RECORD_FILE)
handler = df.OAIRecordHandler(
handler = df_r.RecordHandler(
OAI_RECORD_FILE, data_fields=DATA_FIELDS, transform_func=trnfrm)
record: df.OAIRecord = handler.next_record(state=MARK_OCR_OPEN)
record: df_r.Record = handler.next_record(state=MARK_OCR_OPEN)
if not record:
LOGGER.info("no open records in '%s', work done", OAI_RECORD_FILE)
sys.exit(1)
Expand All @@ -163,7 +163,7 @@ def wrap_save_record_state(status: str, urn, **kwargs):
LOCAL_STORE_ROOT = CFG.get('global', 'local_store_root', fallback=None)
if LOCAL_STORE_ROOT is not None:
STORE_DIR = os.path.join(LOCAL_STORE_ROOT, local_ident)
STORE = df.LocalStore(STORE_DIR, req_dst_dir)
STORE = df_r.LocalStore(STORE_DIR, req_dst_dir)
odem_process.store = STORE
process_resource_monitor: odem_rm.ProcessResourceMonitor = odem_rm.ProcessResourceMonitor(
odem_rm.from_configuration(CFG),
Expand All @@ -189,7 +189,7 @@ def wrap_save_record_state(status: str, urn, **kwargs):
if ocr_results is None or len(ocr_results) == 0:
raise ODEMException(f"process run error: {record.identifier}")
odem_process.calculate_statistics_ocr(ocr_results)
odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS
odem_process.process_statistics[odem.STATS_KEY_N_EXECS] = EXECUTORS
odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
# odem_process.link_ocr_files()
# odem_process.postprocess_ocr()
Expand Down Expand Up @@ -222,10 +222,10 @@ def wrap_save_record_state(status: str, urn, **kwargs):
handler.save_record_state(record.identifier, MARK_OCR_DONE, INFO=_info)
_mode = 'sequential' if SEQUENTIAL else f'n_execs:{EXECUTORS}'
odem_process.the_logger.info("[%s] duration: %s/%s (%s)", odem_process.process_identifier,
odem_process.duration, _mode, odem_process.statistics)
odem_process.statistics['timedelta'], _mode, odem_process.statistics)
# finale
LOGGER.info("[%s] odem done in '%s' (%d executors)",
odem_process.process_identifier, odem_process.duration, EXECUTORS)
odem_process.process_identifier, odem_process.statistics['timedelta'], EXECUTORS)
except odem.ODEMNoTypeForOCRException as type_unknown:
# we don't ocr this one
LOGGER.warning("[%s] odem skips '%s'",
Expand All @@ -241,6 +241,6 @@ def wrap_save_record_state(status: str, urn, **kwargs):
handler.save_record_state(record.identifier, MARK_OCR_FAIL, INFO=f'{_err_args}')
except RuntimeError as exc:
LOGGER.error("odem fails for '%s' after %s with: '%s'",
record, odem_process.duration, str(exc))
record, odem_process.statistics['timedelta'], str(exc))
handler.save_record_state(record.identifier, MARK_OCR_FAIL, INFO=f'{str(exc) : exc.args[0]}')
sys.exit(1)
Loading

0 comments on commit 291988d

Please sign in to comment.