Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hOCR output format #1275

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion easyocr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def parse_args():
parser.add_argument(
"--output_format",
type=str,
choices=["standard", 'dict', 'json'],
choices=["standard", 'dict', 'json', "hocr"],
default='standard',
help="output format.",
)
Expand Down
4 changes: 3 additions & 1 deletion easyocr/easyocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .utils import group_text_box, get_image_list, calculate_md5, get_paragraph,\
download_and_unzip, printProgressBar, diff, reformat_input,\
make_rotated_img_list, set_result_with_confidence,\
reformat_input_batched, merge_to_free
reformat_input_batched, merge_to_free, to_hocr
from .config import *
from bidi.algorithm import get_display
import numpy as np
Expand Down Expand Up @@ -434,6 +434,8 @@ def recognize(self, img_cv_grey, horizontal_list=None, free_list=None,\
return [json.dumps({'boxes':[list(map(int, lst)) for lst in item[0]],'text':item[1],'confident':item[2]}, ensure_ascii=False) for item in result]
elif output_format == 'free_merge':
return merge_to_free(result, free_list)
elif output_format == "hocr":
return to_hocr(result)
else:
return result

Expand Down
44 changes: 44 additions & 0 deletions easyocr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from PIL import Image, JpegImagePlugin
from scipy import ndimage
import hashlib
import html
import sys, os
from zipfile import ZipFile
from .imgproc import loadImage
Expand Down Expand Up @@ -383,6 +384,49 @@ def decode_wordbeamsearch(self, mat, beamWidth=5):
texts.append(string)
return texts

# Header of the hOCR document, up to and including the opening ``ocr_page``
# div. The x0/y0/x1/y1 placeholders are filled in by ``to_hocr`` with the
# bounding box of the whole page. The capabilities meta lists every hOCR
# class the generated document actually uses (``ocr_page`` and ``ocrx_word``).
OCR_PREAMBLE = """
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<meta name='ocr-system' content='EasyOCR' />
<meta name='ocr-capabilities' content='ocr_page ocrx_word'/>
</head>
<body>
<div class="ocr_page" id="page_1" title="image 'image.png'; bbox {x0} {y0} {x1} {y1}; ppageno 0">
""".strip()


# In order to get a browser-renderable HTML file, you can add this before the
# closing </body> tag:
#
# <script src="https://unpkg.com/hocrjs"></script>

# Closing lines of the hOCR document, already split into a list of lines so
# they can be concatenated with the preamble and word lines.
OCR_POSTAMBLE = """ </div>
</body>
</html>
""".splitlines()


def to_hocr(result):
    """Convert recognition results into the lines of an hOCR document.

    Args:
        result: iterable of ``(box, text, confidence)`` triples, where
            ``box`` is a sequence of ``(x, y)`` corner points. The
            confidence value is not written to the output.

    Returns:
        A list of strings, one per output line: the preamble lines, one
        ``ocrx_word`` span per recognised item, then the closing lines.
    """
    content = []
    word_boxes = []
    for box, text, _confidence in result:
        # Use the axis-aligned rectangle enclosing *all* corners. Taking only
        # the first and third corner is wrong for rotated quadrilaterals and
        # can even yield x0 > x1.
        xs = [int(corner[0]) for corner in box]
        ys = [int(corner[1]) for corner in box]
        x0, y0, x1, y1 = min(xs), min(ys), max(xs), max(ys)
        word_boxes.append((x0, y0, x1, y1))
        content.append(
            ' <span class="ocrx_word" title="bbox {x0} {y0} {x1} {y1}">{text}</span>'.format(
                x0=x0, y0=y0, x1=x1, y1=y1, text=html.escape(text)
            )
        )

    if word_boxes:
        page_x0 = min(b[0] for b in word_boxes)
        page_y0 = min(b[1] for b in word_boxes)
        page_x1 = max(b[2] for b in word_boxes)
        page_y1 = max(b[3] for b in word_boxes)
    else:
        # Nothing was recognised: emit an empty page box instead of leaking
        # sentinel values into the document.
        page_x0 = page_y0 = page_x1 = page_y1 = 0

    preamble = OCR_PREAMBLE.format(
        x0=page_x0, y0=page_y0, x1=page_x1, y1=page_y1
    ).splitlines()
    return preamble + content + OCR_POSTAMBLE

def merge_to_free(merge_result, free_list):
merge_result_buf, mr_buf = [], []

Expand Down