Skip to content

Commit

Permalink
[app][rfct] clean ocr files module
Browse files Browse the repository at this point in the history
  • Loading branch information
M3ssman committed Jun 14, 2024
1 parent 9dbf431 commit d6df6ab
Showing 1 changed file with 7 additions and 11 deletions.
18 changes: 7 additions & 11 deletions lib/odem/processing/ocr_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,12 @@
import digiflow as df
import ocrd_page_to_alto.convert as opta_c

import lib.odem.odem_commons as odem_c


# define probably difficult characters
# very common separator '⸗'
DOUBLE_OBLIQUE_HYPHEN = '\u2E17'
# rare Geviertstrich '—'
# "Geviertstrich": '—'
EM_DASH = '\u2014'

ODEM_PUNCTUATIONS = string.punctuation + EM_DASH + DOUBLE_OBLIQUE_HYPHEN
# create module-wide translator
PUNCT_TRANSLATOR = str.maketrans('', '', ODEM_PUNCTUATIONS)
Expand All @@ -41,8 +39,6 @@
'alto:Illustration',
'alto:GraphicalElement']

# LOCAL_OCRD_RESULT_DIR = 'PAGE'


class ODEMMetadataOcrException(Exception):
"""Mark any problems related to OCR
Expand All @@ -61,19 +57,19 @@ def postprocess_ocr_file(ocr_file, strip_tags):
"""

# the xml cleanup
mproc = df.MetsProcessor(str(ocr_file))
xml_proc: df.XMLProcessor = df.XMLProcessor(ocr_file)
if strip_tags:
mproc.remove(strip_tags)
xml_proc.remove(strip_tags)

# inspect transformation artifacts
_all_text_blocks = mproc.tree.xpath('//alto:TextBlock', namespaces=df.XMLNS)
_all_text_blocks = xml_proc.tree.xpath('//alto:TextBlock', namespaces=df.XMLNS)
for _block in _all_text_blocks:
if 'IDNEXT' in _block.attrib:
del _block.attrib['IDNEXT']

# inspect textual content
# _all_strings = mproc.tree.xpath('//alto:String', namespaces=XMLNS)
_all_strings = mproc.tree.findall('.//alto:String', df.XMLNS)
_all_strings = xml_proc.tree.findall('.//alto:String', df.XMLNS)
for _string_el in _all_strings:
_content = _string_el.attrib['CONTENT'].strip()
if _is_completely_punctuated(_content):
Expand All @@ -89,7 +85,7 @@ def postprocess_ocr_file(ocr_file, strip_tags):
if len(_content) < MINIMUM_WORD_LEN:
# too little content, remove element bottom-up
_uplete(_string_el, _string_el.getparent())
mproc.write()
xml_proc.write()


def convert_to_output_format(ocrd_results: typing.List, dst_dir):
Expand Down

0 comments on commit d6df6ab

Please sign in to comment.