Skip to content

skip input files #1330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/ocrd/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@

# pylint: disable=wrong-import-position

def command_with_replaced_help(*replacements):

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E302

expected 2 blank lines, found 1

Check failure on line 15 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E302

expected 2 blank lines, found 1

class CommandWithReplacedHelp(click.Command):
def get_help(self, ctx):
newhelp : str = super().get_help(ctx)

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E203

whitespace before ':'

Check failure on line 19 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E203

whitespace before ':'
for replacement in replacements:
newhelp = re.sub(*replacement, newhelp)
# print(newhelp)
Expand All @@ -26,13 +26,13 @@

# pylint: enable=wrong-import-position

from ..decorators import ocrd_loglevel

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E305

expected 2 blank lines after class or function definition, found 1

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E402

module level import not at top of file

Check failure on line 29 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E305

expected 2 blank lines after class or function definition, found 1
from .ocrd_tool import ocrd_tool_cli

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E402

module level import not at top of file

Check failure on line 30 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E402

module level import not at top of file
from .workspace import workspace_cli

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E402

module level import not at top of file

Check failure on line 31 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E402

module level import not at top of file
from .process import process_cli

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E402

module level import not at top of file

Check failure on line 32 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E402

module level import not at top of file
from .bashlib import bashlib_cli

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E402

module level import not at top of file

Check failure on line 33 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E402

module level import not at top of file
from .validate import validate_cli

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E402

module level import not at top of file

Check failure on line 34 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E402

module level import not at top of file
from .resmgr import resmgr_cli

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E402

module level import not at top of file

Check failure on line 35 in src/ocrd/cli/__init__.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E402

module level import not at top of file
from .zip import zip_cli
from .log import log_cli
from .network import network_cli
Expand Down Expand Up @@ -67,6 +67,12 @@
\b
{config.describe('OCRD_EXISTING_OUTPUT', wrap_text=False)}
\b
{config.describe('OCRD_MAX_MISSING_OUTPUTS')}
\b
{config.describe('OCRD_MAX_PARALLEL_PAGES')}
\b
{config.describe('OCRD_PROCESSING_PAGE_TIMEOUT')}
\b
{config.describe('OCRD_METS_CACHING')}
\b
{config.describe('OCRD_MAX_PROCESSOR_CACHE')}
Expand Down
34 changes: 21 additions & 13 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@
# this is where the fixes came from:
from loky import Future, ProcessPoolExecutor
import multiprocessing as mp
from threading import Timer
from _thread import interrupt_main
from multiprocessing.pool import ThreadPool

from click import wrap_text
from deprecated import deprecated
Expand Down Expand Up @@ -783,11 +782,16 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
page_id = input_files[input_pos].pageId
self._base_logger.info("processing page %s", page_id)
for i, input_file in enumerate(input_files):
grp = self.input_file_grp.split(',')[i]
if input_file is None:
grp = self.input_file_grp.split(',')[i]
self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
continue
assert isinstance(input_file, get_args(OcrdFileType))
if not input_file.local_filename:
self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
if config.OCRD_MISSING_INPUT == 'ABORT':
raise MissingInputFile(grp, page_id, input_file.mimetype)
continue
self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
try:
page_ = page_from_file(input_file)
Expand All @@ -796,6 +800,9 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None:
except ValueError as err:
# not PAGE and not an image to generate PAGE for
self._base_logger.error(f"non-PAGE input for page {page_id}: {err}")
if not any(input_pcgts):
self._base_logger.warning(f'skipping page {page_id}')
return
output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
if input_files[input_pos].fileGrp == self.output_file_grp:
# input=output fileGrp: re-use ID exactly
Expand Down Expand Up @@ -1107,7 +1114,11 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
self._base_logger.critical(f"Could not find any files for selected pageId {self.page_id}.\n"
f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
ifts = []
for page, ifiles in pages.items():
# use physical page order
for page in self.workspace.mets.physical_pages:
if page not in pages:
continue
ifiles = pages[page]
for i, ifg in enumerate(ifgs):
if not ifiles[i]:
# could be from non-unique with on_error=skip or from true gap
Expand Down Expand Up @@ -1150,18 +1161,15 @@ def _page_worker(timeout, *input_files):
"""
page_id = next((file.pageId for file in input_files
if hasattr(file, 'pageId')), "")
if timeout > 0:
timer = Timer(timeout, interrupt_main)
timer.start()
pool = ThreadPool(processes=1)
try:
_page_worker_processor.process_page_file(*input_files)
#_page_worker_processor.process_page_file(*input_files)
async_result = pool.apply_async(_page_worker_processor.process_page_file, input_files)
async_result.get(timeout or None)
_page_worker_processor.logger.debug("page worker completed for page %s", page_id)
except KeyboardInterrupt:
except mp.TimeoutError:
_page_worker_processor.logger.debug("page worker timed out for page %s", page_id)
raise TimeoutError()
finally:
if timeout > 0:
timer.cancel()
raise

def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None):
"""Generate a string describing the full CLI of this processor including params.
Expand Down
8 changes: 6 additions & 2 deletions tests/cli/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
from io import StringIO
from contextlib import contextmanager
import sys
from packaging.version import Version

from click.testing import CliRunner
import pytest

# pylint: disable=import-error, no-name-in-module
from tests.base import CapturingTestCase as TestCase, assets, copy_of_directory, main

from ocrd_utils import initLogging, pushd_popd, setOverrideLogLevel, disableLogging
from ocrd_utils import initLogging, pushd_popd, setOverrideLogLevel, disableLogging, dist_version
from ocrd.cli.workspace import workspace_cli
from ocrd import Resolver

Expand All @@ -31,7 +32,10 @@ def setUp(self):
disableLogging()
self.maxDiff = None
self.resolver = Resolver()
self.runner = CliRunner(mix_stderr=False)
if Version(dist_version('click')) >= Version('8.2'):
self.runner = CliRunner()
else:
self.runner = CliRunner(mix_stderr=False)

def test_add(self):
"""
Expand Down
23 changes: 18 additions & 5 deletions tests/processor/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from PIL import Image
from io import BytesIO
from contextlib import ExitStack
import multiprocessing as mp

from tempfile import TemporaryDirectory
from pathlib import Path
Expand Down Expand Up @@ -232,8 +233,16 @@ def test_run_input(self):
def test_run_output0(self):
with pushd_popd(tempdir=True) as tempdir:
ws = self.resolver.workspace_from_nothing(directory=tempdir)
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002')
file1 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001',
url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0001_FULLTEXT.xml'))
file2 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002',
url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0002_FULLTEXT.xml'))
run_processor(DummyProcessorWithOutput, workspace=ws,
input_file_grp="GRP1",
output_file_grp="OCR-D-OUT")
assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0, "no output because no download"
ws.download_file(file1)
ws.download_file(file2)
run_processor(DummyProcessorWithOutput, workspace=ws,
input_file_grp="GRP1",
output_file_grp="OCR-D-OUT")
Expand Down Expand Up @@ -303,7 +312,7 @@ def test_run_output_timeout(self):
assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG"))
config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
config.OCRD_PROCESSING_PAGE_TIMEOUT = 1
with pytest.raises(TimeoutError) as exc:
with pytest.raises(mp.TimeoutError) as exc:
run_processor(DummyProcessorWithOutputSleep, workspace=ws,
input_file_grp="OCR-D-IMG",
output_file_grp="OCR-D-OUT",
Expand All @@ -312,8 +321,12 @@ def test_run_output_timeout(self):
def test_run_output_overwrite(self):
with pushd_popd(tempdir=True) as tempdir:
ws = self.resolver.workspace_from_nothing(directory=tempdir)
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002')
file1 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001',
url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0001_FULLTEXT.xml'))
file2 = ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002',
url=assets.path_to('SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0002_FULLTEXT.xml'))
ws.download_file(file1)
ws.download_file(file2)
config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001')
config.OCRD_EXISTING_OUTPUT = 'ABORT'
Expand Down
Loading