Skip to content

Commit 4f61cdd

Browse files
authored
Merge pull request #2 from papermerge/generate-doc-previews-after-ocr
generate document previews after OCR
2 parents f73dea4 + bf8562b commit 4f61cdd

File tree

5 files changed

+63
-2
lines changed

5 files changed

+63
-2
lines changed

README.md

+44
Original file line numberDiff line numberDiff line change
@@ -1 +1,45 @@
11
# OCR Worker
2+
3+
Performs OCR on documents. Optionally, it can download/upload documents
4+
from/to S3 storage.
5+
6+
## Run it:
7+
8+
poetry run task worker
9+
10+
## Configuration
11+
12+
OCR Worker is configured via environment variables.
13+
14+
### PAPERMERGE__DATABASE__URL
15+
16+
Database URL (URI). For details see: [Database URLs](https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls)
17+
The default value is `sqlite:////db/db.sqlite3`.
18+
19+
Example:
20+
21+
export PAPERMERGE__DATABASE__URL=sqlite:////opt/cocodb.sqlite3
22+
23+
### PAPERMERGE__REDIS__URL
24+
25+
Redis URL (URI).
26+
If no value is provided, the worker will not connect to Redis.
27+
28+
Example:
29+
30+
export PAPERMERGE__REDIS__URL=redis://localhost:6379/0
31+
32+
### PAPERMERGE__MAIN__LOGGING_CFG
33+
34+
Example:
35+
36+
export PAPERMERGE__MAIN__LOGGING_CFG=/etc/papermerge/logging.yaml
37+
38+
### PAPERMERGE__MAIN__MEDIA_ROOT
39+
40+
Path to the media root. If no value is provided, the current working directory
41+
is used as the media root.
42+
43+
Example:
44+
45+
export PAPERMERGE__MAIN__MEDIA_ROOT=/opt/media_root

ocrworker/celery_app.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
import logging
12
from celery import Celery
23
from ocrworker import config, utils
34
from celery.signals import setup_logging
45

56

67
settings = config.get_settings()
8+
logger = logging.getLogger(__name__)
79

810
app = Celery(
911
"ocrworker",

ocrworker/constants.py

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
PAGE_PDF = "page.pdf"
99
INDEX_ADD_DOCS = "index_add_docs"
1010
WORKER_OCR_DOCUMENT = "worker_ocr_document"
11+
S3_WORKER_GENERATE_PREVIEW = "s3_worker_generate_preview"

ocrworker/tasks.py

+14
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def ocr_document_task(document_id: str, lang: str):
8080
target_docver_id=target_docver_uuid,
8181
target_page_ids=target_page_uuids,
8282
).set(queue=prefixed(const.OCR))
83+
| generate_preview.s(doc_id=document_id).set(queue=prefixed(const.OCR))
8384
| notify_index_task.s(doc_id=document_id).set(queue=prefixed(const.OCR))
8485
)
8586
# I've tried workflow.apply_async(queue=prefixed(OCR))
@@ -198,6 +199,19 @@ def notify_index_task(_, **kwargs):
198199
)
199200

200201

202+
@shared_task()
203+
def generate_preview(_, **kwargs):
204+
logger.debug(f"Generate thumbnail/page previews for doc_id={kwargs}")
205+
206+
doc_id = kwargs["doc_id"]
207+
208+
celery_app.send_task(
209+
const.S3_WORKER_GENERATE_PREVIEW,
210+
kwargs={"doc_id": doc_id},
211+
route_name="s3preview",
212+
)
213+
214+
201215
def prefixed(name: str) -> str:
202216
pref = settings.papermerge__main__prefix
203217
if pref:

pyproject.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ocrworker"
3-
version = "0.1.0"
3+
version = "0.2.0"
44
description = "OCR Worker"
55
authors = ["Eugen Ciur <[email protected]>"]
66
readme = "README.md"
@@ -32,7 +32,7 @@ databases = ["mysqlclient", "psycopg2"]
3232
ocr = "ocrworker.cli.ocr:app"
3333

3434
[tool.taskipy.tasks]
35-
worker = "celery -A ocrworker.celery_app worker -E -c 8 -Q dev-coco_ocr"
35+
worker = "celery -A ocrworker.celery_app worker -E -c 8 -Q ocr"
3636

3737
[tool.poetry.group.dev.dependencies]
3838
black = "^24.4.2"

0 commit comments

Comments
 (0)