Skip to content

Commit

Permalink
[app][fix] fix oai local ocrd
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed Jun 14, 2024
1 parent a6dbb95 commit b00455c
Show file tree
Hide file tree
Showing 11 changed files with 165 additions and 195 deletions.
4 changes: 2 additions & 2 deletions cli_mets_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@
odem_process.set_local_images()

# NEW NEW NEW
odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
odem_pipeline = odem.ODEMWorkflow.create(proc_type, odem_process)
odem_runner = odem.ODEMWorkflowRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
if ocr_results is None or len(ocr_results) == 0:
raise odem.ODEMException(f"OCR Process Runner error for {record.identifier}")
Expand Down
4 changes: 2 additions & 2 deletions cli_oai_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,8 +299,8 @@ def oai_arg_parser(value):

# NEW NEW NEW
proc_type = CFG.get(odem.CFG_SEC_OCR, 'workflow_type', fallback=None)
odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
odem_pipeline = odem.ODEMWorkflow.create(proc_type, odem_process)
odem_runner = odem.ODEMWorkflowRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
if ocr_results is None or len(ocr_results) == 0:
raise odem.ODEMException(f"process run error: {record.identifier}")
Expand Down
10 changes: 5 additions & 5 deletions cli_oai_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,16 +183,16 @@ def wrap_save_record_state(status: str, urn, **kwargs):
odem_process.set_local_images()

# NEW NEW NEW
odem_pipeline = odem.ODEMOCRPipeline.create(proc_type, odem_process)
odem_runner = odem.ODEMPipelineRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
odem_pipeline = odem.ODEMWorkflow.create(proc_type, odem_process)
odem_runner = odem.ODEMWorkflowRunner(local_ident, EXECUTORS, LOGGER, odem_pipeline)
ocr_results = process_resource_monitor.monit_vmem(odem_runner.run)
if ocr_results is None or len(ocr_results) == 0:
raise ODEMException(f"process run error: {record.identifier}")
odem_process.calculate_statistics_ocr(ocr_results)
odem_process._statistics_ocr[odem.STATS_KEY_N_EXECS] = EXECUTORS
odem_process.the_logger.info("[%s] %s", local_ident, odem_process.statistics)
odem_process.link_ocr_files()
odem_process.postprocess_ocr()
# odem_process.link_ocr_files()
# odem_process.postprocess_ocr()
wf_enrich_ocr = CFG.getboolean(odem.CFG_SEC_METS, odem.CFG_SEC_METS_OPT_ENRICH, fallback=True)
if wf_enrich_ocr:
odem_process.link_ocr_files()
Expand All @@ -214,7 +214,7 @@ def wrap_save_record_state(status: str, urn, **kwargs):
odem_process.record.info.update(_kwargs)
_info = f"{odem_process.record.info}"
except:
odem_process.the_logger.error("Can't parse '%s', store info literally",
odem_process.the_logger.warning("Can't parse '%s', store info literally",
odem_process.record.info)
_info = f"{_kwargs}"
else:
Expand Down
4 changes: 2 additions & 2 deletions lib/odem/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from .ocrd3_odem import (
OdemWorkflowProcessType,
ODEMProcess,
ODEMOCRPipeline,
ODEMPipelineRunner,
ODEMWorkflow,
ODEMWorkflowRunner,
OCRDPageParallel,
ODEMTesseract,
)
Expand Down
224 changes: 112 additions & 112 deletions lib/odem/ocrd3_odem.py

Large diffs are not rendered by default.

44 changes: 20 additions & 24 deletions lib/odem/odem_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,13 @@
import os
import socket
import time
import typing

from configparser import (
ConfigParser,
)
from enum import Enum
from pathlib import (
Path
)
from typing import (
Dict,
List,
)

from ocrd_utils import (
initLogging
)

from digiflow import (
OAIRecord,
)
from pathlib import Path

import ocrd_utils
import digiflow as df

#
# ODEM States
Expand Down Expand Up @@ -87,7 +74,7 @@ class ExportFormat(str, Enum):
# ODEM metadata
#
# file groups
FILEGROUP_OCR = 'FULLTEXT'
FILEGROUP_FULLTEXT = 'FULLTEXT'
FILEGROUP_IMG = 'MAX'
# statistic keys
STATS_KEY_LANGS = 'languages'
Expand Down Expand Up @@ -161,7 +148,7 @@ def get_logger(log_dir, log_infix=None, path_log_config=None) -> logging.Logger:
...
"""

initLogging()
ocrd_utils.initLogging()
logging.getLogger('page-to-alto').setLevel('CRITICAL')
_today = time.strftime('%Y-%m-%d', time.localtime())
_host = socket.gethostname()
Expand All @@ -176,7 +163,7 @@ def get_logger(log_dir, log_infix=None, path_log_config=None) -> logging.Logger:
return logging.getLogger('odem')


def merge_args(the_configuration: ConfigParser, the_args) -> List:
def merge_args(the_configuration: configparser.ConfigParser, the_args) -> typing.List:
"""Merge additionally provided arguments into
existing configurations, overwrite these and
communicate the replaced options
Expand All @@ -200,7 +187,7 @@ def merge_args(the_configuration: ConfigParser, the_args) -> List:
return _repls


def to_dict(record: OAIRecord) -> Dict:
def to_dict(record: df.OAIRecord) -> typing.Dict:
"""Serialize OAIRecord into dictionary
as input for JSON format"""

Expand All @@ -212,9 +199,18 @@ def to_dict(record: OAIRecord) -> Dict:
RECORD_TIME: record.state_datetime,
}

def from_dict(data) -> OAIRecord:
def from_dict(data) -> df.OAIRecord:
"""deserialize into OAIRecord"""

_record = OAIRecord(data[RECORD_IDENTIFIER])
_record = df.OAIRecord(data[RECORD_IDENTIFIER])
_record.info = data[RECORD_INFO]
return _record


def list_files(dir_root, sub_dir, format='.xml') -> typing.List:
    """List the entries of a sub directory that carry a given suffix.

    Args:
        dir_root: Root directory that contains ``sub_dir``.
        sub_dir: Name of the directory below ``dir_root`` to inspect.
        format: File suffix including the leading dot (default ``'.xml'``).
            Compared case-sensitively and only against the last suffix,
            so ``'a.tar.gz'`` matches ``'.gz'``, not ``'.tar.gz'``.

    Returns:
        Paths (``dir_root``/``sub_dir``/entry) of all matching entries,
        sorted by name. An empty list if nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory
            cannot be listed (e.g. it does not exist).
    """
    actual_dir = os.path.join(dir_root, sub_dir)
    # os.listdir yields entries in arbitrary order; sort so that callers
    # get a stable, reproducible sequence of files.
    return sorted(
        os.path.join(actual_dir, dir_file)
        for dir_file in os.listdir(actual_dir)
        if Path(dir_file).suffix == format
    )
4 changes: 2 additions & 2 deletions lib/odem/processing/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,10 +264,10 @@ def integrate_ocr_file(xml_tree, ocr_files: typing.List) -> int:
tag_file = f'{{{df.XMLNS["mets"]}}}file'
tag_flocat = f'{{{df.XMLNS["mets"]}}}FLocat'

file_grp_fulltext = ET.Element(tag_file_group, USE=odem_c.FILEGROUP_OCR)
file_grp_fulltext = ET.Element(tag_file_group, USE=odem_c.FILEGROUP_FULLTEXT)
for _ocr_file in ocr_files:
_file_name = os.path.basename(_ocr_file).split('.')[0]
new_id = odem_c.FILEGROUP_OCR + '_' + _file_name
new_id = odem_c.FILEGROUP_FULLTEXT + '_' + _file_name
file_ocr = ET.Element(
tag_file, MIMETYPE="application/alto+xml", ID=new_id)
flocat_href = ET.Element(tag_flocat, LOCTYPE="URL")
Expand Down
31 changes: 9 additions & 22 deletions lib/odem/processing/ocr_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
'alto:Illustration',
'alto:GraphicalElement']

LOCAL_DIR_RESULT = 'PAGE'
# LOCAL_OCRD_RESULT_DIR = 'PAGE'


class ODEMMetadataOcrException(Exception):
Expand Down Expand Up @@ -92,33 +92,20 @@ def postprocess_ocr_file(ocr_file, strip_tags):
mproc.write()


def list_files(dir_root, sub_dir) -> typing.List:
    """List the ``.xml`` entries of a sub directory.

    Args:
        dir_root: Root directory that contains ``sub_dir``.
        sub_dir: Name of the directory below ``dir_root`` to inspect.

    Returns:
        Paths (``dir_root``/``sub_dir``/entry) of all entries whose name
        ends with ``'.xml'``, sorted by name. An empty list if none match.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory
            cannot be listed (e.g. it does not exist).
    """
    _curr_dir = os.path.join(dir_root, sub_dir)
    # os.listdir yields entries in arbitrary order; sort so that callers
    # get a stable, reproducible sequence of files.
    return sorted(
        os.path.join(_curr_dir, _file)
        for _file in os.listdir(_curr_dir)
        if str(_file).endswith('.xml')
    )


def convert_to_output_format(work_dir_root):
def convert_to_output_format(ocrd_results: typing.List, dst_dir):
"""Convert created OCR-Files to required presentation
format (i.e. ALTO)
"""

_converted = []
_fulltext_dir = os.path.join(work_dir_root, odem_c.FILEGROUP_OCR)
if not os.path.isdir(_fulltext_dir):
os.makedirs(_fulltext_dir, exist_ok=True)
_results = list_files(work_dir_root, LOCAL_DIR_RESULT)
for _file in _results:
converted_files = []
for _file in ocrd_results:
the_id = os.path.basename(_file)
output_file = os.path.join(_fulltext_dir, the_id)
converter = opta_c.OcrdPageAltoConverter(page_filename=_file).convert()
output_file = os.path.join(dst_dir, the_id)
converted = opta_c.OcrdPageAltoConverter(page_filename=_file).convert()
with open(output_file, 'w', encoding='utf-8') as output:
output.write(str(converter))
_converted.append(output_file)
return _converted
output.write(str(converted))
converted_files.append(output_file)
return converted_files


def _is_completely_punctuated(a_string):
Expand Down
4 changes: 2 additions & 2 deletions lib/odem/processing/processing_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,10 @@ def integrate_ocr_file(xml_tree, ocr_files: typing.List) -> int:
tag_file = f'{{{df.XMLNS["mets"]}}}file'
tag_flocat = f'{{{df.XMLNS["mets"]}}}FLocat'

file_grp_fulltext = ET.Element(tag_file_group, USE=odem.FILEGROUP_OCR)
file_grp_fulltext = ET.Element(tag_file_group, USE=odem.FILEGROUP_FULLTEXT)
for _ocr_file in ocr_files:
_file_name = os.path.basename(_ocr_file).split('.')[0]
new_id = odem.FILEGROUP_OCR + '_' + _file_name
new_id = odem.FILEGROUP_FULLTEXT + '_' + _file_name
file_ocr = ET.Element(
tag_file, MIMETYPE="application/alto+xml", ID=new_id)
flocat_href = ET.Element(tag_flocat, LOCTYPE="URL")
Expand Down
13 changes: 9 additions & 4 deletions tests/test_ocrd3_odem.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import shutil
import unittest
import unittest.mock

import lxml.etree as ET
import pytest
Expand Down Expand Up @@ -266,22 +267,26 @@ def test_module_fixture_one_images_4_ocr_by_metadata(fixture_27949: odem.ODEMPro
def test_fixture_one_postprocess_ocr_create_text_bundle(fixture_27949: odem.ODEMProcess):
"""Ensure text bundle data created
and present with expected number of text rows
Please note:
Due to workflow modifications the OCR output
is no longer postprocessed, so many too-short,
non-alphabetical lines remain; therefore the
expected number of lines increased from 77 to 111.
"""

# arrange
tmp_path = fixture_27949.work_dir_main

# act
fixture_27949.link_ocr_files()
fixture_27949.postprocess_ocr()
fixture_27949.create_text_bundle_data()

# assert
_txt_bundle_file = tmp_path / '198114125.pdf.txt'
assert os.path.exists(_txt_bundle_file)
assert 77 == fixture_27949.statistics['n_text_lines']
assert 111 == fixture_27949.statistics['n_text_lines']
with open(_txt_bundle_file, encoding='utf-8') as bundle_handle:
assert 77 == len(bundle_handle.readlines())
assert 111 == len(bundle_handle.readlines())


def test_images_4_ocr_properly_filtered(tmp_path):
Expand Down Expand Up @@ -417,7 +422,7 @@ def test_export_flat_zip(tmp_path):

oproc.mets_file = str(trgt_mets)
oproc.inspect_metadata()
_langs = oproc.statistics.get(odem.STATS_KEY_LANGS)
# _langs = oproc.statistics.get(odem.STATS_KEY_LANGS)

# act
zipfilepath, _ = oproc.export_data()
Expand Down
18 changes: 0 additions & 18 deletions tests/test_odem_processing_ocr_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import os

import pytest

import lxml.etree as ET
import digiflow as df

Expand Down Expand Up @@ -32,22 +30,6 @@ def test_module_fixture_one_integrated_ocr_files_fit_identifier(fixture_27949: o
assert not os.path.exists(tmp_path / 'FULLTEXT' / '00000007.xml')


def test_fixture_one_postprocessed_ocr_files_elements(fixture_27949: odem.ODEMProcess):
"""Ensure ocr-file unwanted elements dropped as expected
"""

# arrange
tmp_path = fixture_27949.work_dir_main

# act
# fixture_27949.link_ocr()
fixture_27949.postprocess_ocr()

# assert
ocr_file_03 = ET.parse(str(tmp_path / 'FULLTEXT' / '00000003.xml')).getroot()
assert not ocr_file_03.xpath('//alto:Shape', namespaces=df.XMLNS)


def test_fixture_one_postprocess_ocr_files(fixture_27949: odem.ODEMProcess):
"""Ensure expected replacements done *even* when
diacritics occur several times in a single word"""
Expand Down

0 comments on commit b00455c

Please sign in to comment.