Skip to content

Commit 4d1de95

Browse files
authored
Merge pull request #216 from bertsky/migrate-ocrd-v3
Migrate ocrd v3
2 parents dcbd522 + fa18ecd commit 4d1de95

30 files changed, with +1652 and -1702 lines changed

.circleci/config.yml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,12 +22,16 @@ jobs:
2222
- run: make install
2323
- run: make deps-test
2424
- run: mkdir test-results
25-
- run: make test PYTEST_ARGS=--junitxml=test-results/test.xml
25+
- run:
26+
name: run tests
27+
command: make test PYTEST_ARGS="-vv --junitxml=test-results/test.xml"
28+
no_output_timeout: 30m
2629
- store_test_results:
2730
path: test-results
2831
- run: make test-cli
2932
- run: make coverage
3033
- codecov/upload
34+
resource_class: large
3135

3236
build-docker:
3337
docker:
@@ -75,7 +79,7 @@ workflows:
7579
- test-python:
7680
matrix:
7781
parameters:
78-
python-version: ['3.7', '3.8', '3.9', '3.10']
82+
python-version: ['3.8', '3.9', '3.10', '3.11']
7983
- build-docker
8084

8185
deploy:

.dockerignore

Lines changed: 3 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,3 @@
1-
*
2-
!.git
3-
!.gitmodules
4-
!Makefile
5-
!setup.py
6-
!requirements.txt
7-
!requirements_test.txt
8-
!LICENSE
9-
!README.md
10-
!repo/tesserocr
11-
!repo/tesseract
12-
13-
# avoid .git and __pycache__ etc:
14-
!ocrd_tesserocr/**/*.py
15-
!ocrd_tesserocr/**/*.json
16-
!test/**/*.py
17-
1+
test/
2+
dist/
3+
build/

.pylintrc

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,8 +21,5 @@ disable =
2121
wrong-import-order,
2222
duplicate-code
2323

24-
# allow indented whitespace (as required by interpreter):
25-
no-space-check=empty-line
26-
2724
# allow non-snake-case identifiers:
2825
good-names=n,i

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).
55

66
## Unreleased
77

8+
Changed:
9+
10+
* adapt to ocrd 3.0, #216
11+
812
## [0.19.1] - 2024-07-01
913

1014
Fixed:

Dockerfile

Lines changed: 24 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1,52 +1,54 @@
1-
FROM docker.io/ocrd/core:v2.67.2 AS base
2-
# set proper locales
3-
ENV LANG C.UTF-8
4-
ENV LC_ALL C.UTF-8
1+
ARG DOCKER_BASE_IMAGE
2+
FROM $DOCKER_BASE_IMAGE
53
# install ocrd-tesserocr (until here commands for installing tesseract-ocr)
64
ARG VCS_REF
75
ARG BUILD_DATE
86
LABEL \
97
maintainer="https://ocr-d.de/kontakt" \
108
org.label-schema.vcs-ref=$VCS_REF \
119
org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \
12-
org.label-schema.build-date=$BUILD_DATE
10+
org.label-schema.build-date=$BUILD_DATE \
11+
org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
12+
org.opencontainers.image.title="ocrd_tesserocr" \
13+
org.opencontainers.image.description="Tesseract OCR bindings" \
14+
org.opencontainers.image.source="https://github.com/OCR-D/ocrd_tesserocr" \
15+
org.opencontainers.image.documentation="https://github.com/OCR-D/ocrd_tesserocr/blob/${VCS_REF}/README.md" \
16+
org.opencontainers.image.revision=$VCS_REF \
17+
org.opencontainers.image.created=$BUILD_DATE \
18+
org.opencontainers.image.base.name=ocrd/core
1319

14-
ENV PYTHONIOENCODING utf8
1520

1621
# set frontend non-interactive to silence interactive tzdata config
17-
ARG DEBIAN_FRONTEND=noninteractive
22+
ENV DEBIAN_FRONTEND noninteractive
23+
# set proper locales
24+
ENV PYTHONIOENCODING utf8
25+
ENV LANG C.UTF-8
26+
ENV LC_ALL C.UTF-8
1827

1928
# set proper date and timezone in container
2029
RUN echo "Europe/Berlin" > /etc/timezone
2130
RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime
2231
RUN dpkg-reconfigure -f noninteractive tzdata
23-
2432
# diagnostic output - check timezone settings
2533
# RUN cat /etc/timezone
2634

2735
# avoid HOME/.local/share (hard to predict USER here)
2836
# so let XDG_DATA_HOME coincide with fixed system location
2937
# (can still be overridden by derived stages)
3038
ENV XDG_DATA_HOME /usr/local/share
39+
# avoid the need for an extra volume for persistent resource user db
40+
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
3141
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
3242
ENV TESSDATA_PREFIX $XDG_DATA_HOME/tessdata
3343

3444
WORKDIR /build/ocrd_tesserocr
35-
COPY setup.py .
36-
COPY ocrd_tesserocr/ocrd-tool.json .
37-
COPY README.md .
38-
COPY requirements.txt .
39-
COPY requirements_test.txt .
40-
COPY .git .git
41-
COPY .gitmodules .
42-
COPY ocrd_tesserocr ocrd_tesserocr
43-
COPY repo/tesserocr repo/tesserocr
44-
COPY repo/tesseract repo/tesseract
45-
COPY Makefile .
45+
COPY . .
46+
# prepackage ocrd-tool.json as ocrd-all-tool.json
47+
RUN ocrd ocrd-tool ocrd_tesserocr/ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
48+
# install everything and reduce image size
4649
RUN make deps-ubuntu \
47-
&& make -j4 install-tesseract \
48-
&& make -j4 install-tesseract-training \
49-
&& make deps install \
50+
&& make -j4 install GIT_SUBMODULE=: \
51+
&& make -j4 install-tesseract-training GIT_SUBMODULE=: \
5052
&& rm -rf /build/ocrd_tesserocr \
5153
&& apt-get -y remove --auto-remove g++ libtesseract-dev make
5254

Makefile

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ export
33
SHELL = /bin/bash
44
PYTHON = python3
55
PIP = pip3
6+
GIT_SUBMODULE = git submodule
67
LOG_LEVEL = INFO
78
PYTHONIOENCODING=utf8
89
LC_ALL = C.UTF-8
@@ -27,6 +28,7 @@ PYTEST_ARGS =
2728

2829
# Docker container tag
2930
DOCKER_TAG = 'ocrd/tesserocr'
31+
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0
3032

3133
help:
3234
@echo ""
@@ -38,6 +40,7 @@ help:
3840
@echo " install-tesserocr Compile and install Tesserocr"
3941
@echo " deps Install Tesseract/Tesserocr and all Python dependencies"
4042
@echo " install Install this package with all dependencies and download minimal models"
43+
@echo " build Build source and binary distribution"
4144
@echo " deps-test Install Python deps for test via pip"
4245
@echo " test Run unit tests"
4346
@echo " coverage Run unit tests and determine test coverage"
@@ -100,6 +103,7 @@ deps-test:
100103
# Build docker image
101104
docker: repo/tesseract repo/tesserocr
102105
docker build \
106+
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
103107
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
104108
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
105109
-t $(DOCKER_TAG) .
@@ -131,8 +135,8 @@ repo/tesseract/Makefile.in: repo/tesseract
131135
# phony to ensure this recipe is fired (as in empty directory after clone)
132136
.PHONY: repo/tesserocr repo/tesseract repo/assets
133137
repo/tesserocr repo/tesseract repo/assets:
134-
git submodule sync $@
135-
git submodule update --init $@
138+
$(GIT_SUBMODULE) sync $@
139+
$(GIT_SUBMODULE) update --init $@
136140

137141
# Install this package
138142
install: deps
@@ -141,13 +145,16 @@ install: deps
141145
ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
142146
ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
143147

148+
build:
149+
$(PIP) install build
150+
$(PYTHON) -m build .
151+
152+
test test-cli coverage: export OCRD_MISSING_OUTPUT := ABORT
153+
144154
# Run unit tests
145155
test: test/assets deps-test
146156
@# declare -p HTTP_PROXY
147-
#$(PYTHON) -m pytest -n auto --continue-on-collection-errors test $(PYTEST_ARGS)
148-
# workaround for pytest-xdist not isolating setenv calls in click.CliRunner from each other:
149-
$(PYTHON) -m pytest --continue-on-collection-errors test/test_cli.py $(PYTEST_ARGS)
150-
$(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,table,line,word},recognize}.py $(PYTEST_ARGS)
157+
$(PYTHON) -m pytest test --durations=0 --continue-on-collection-errors $(PYTEST_ARGS)
151158

152159
# Run unit tests and determine test coverage
153160
coverage:
@@ -166,7 +173,7 @@ test-cli: test/assets deps-test
166173
ocrd-tesserocr-recognize -l DEBUG -I OCR-D-SEG-LINE -O OCR-D-TESS-OCR -P model deu
167174

168175
.PHONY: test test-cli install deps deps-ubuntu deps-test help
169-
.PHONY: install-tesseract install-tesserocr install-tesseract-training
176+
.PHONY: install-tesseract install-tesserocr install-tesseract-training build
170177

171178
#
172179
# Assets

0 commit comments

Comments
 (0)