Skip to content

Commit ffb1f0b

Browse files
Refactor: Remove OCR related code for entire page OCR (#231)
## Summary One part of the OCR refactor that moves OCR from the inference repo to the unstructured repo. This PR removes all OCR-related code for entire-page OCR, which means all table-related OCR remains the same (it will be moved after the table refactor to accept preprocessed OCR data). ## Test Please see the test description in Unstructured-IO/unstructured#1579, since the two PRs need to work together. ## Note The ingest test won't pass until we merge the unstructured refactor PR --------- Co-authored-by: christinestraub <[email protected]>
1 parent cf15726 commit ffb1f0b

File tree

17 files changed

+30
-1024
lines changed

17 files changed

+30
-1024
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.7.0
2+
3+
* Remove all OCR-related code except the table OCR code
4+
15
## 0.6.6
26

37
* Stop passing ocr_languages parameter into paddle to avoid invalid paddle language code error, this will be fixed until

Dockerfile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \
2020
pip install --no-cache -r requirements/base.txt && \
2121
pip install --no-cache -r requirements/test.txt && \
2222
pip install --no-cache -r requirements/dev.txt && \
23-
pip install "unstructured.PaddleOCR" && \
2423
dnf -y groupremove "Development Tools" && \
2524
dnf clean all
2625

examples/layout_analysis/visualization.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ def run(f_path, scope):
1414
"final": None,
1515
"extracted": {"layout": {"color": "green", "width": 2}},
1616
"inferred": {"inferred_layout": {"color": "blue", "width": 2}},
17-
"ocr": {"ocr_layout": {"color": "yellow", "width": 2}},
1817
}
1918

2019
f_basename = os.path.splitext(os.path.basename(f_path))[0]
@@ -47,8 +46,7 @@ def run(f_path, scope):
4746
write_image(img, output_f_path)
4847

4948
print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: "
50-
f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)} - "
51-
f"n_ocr_elements: {len(page.ocr_layout)}")
49+
f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)}")
5250

5351

5452
if __name__ == '__main__':

test_unstructured_inference/conftest.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,6 @@ def mock_embedded_text_regions():
107107
]
108108

109109

110-
@pytest.fixture()
111-
def mock_ocr_regions():
112-
return [
113-
EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None),
114-
EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None),
115-
EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None),
116-
]
117-
118-
119110
# TODO(alan): Make a better test layout
120111
@pytest.fixture()
121112
def mock_layout(mock_embedded_text_regions):
@@ -130,19 +121,3 @@ def mock_layout(mock_embedded_text_regions):
130121
)
131122
for r in mock_embedded_text_regions
132123
]
133-
134-
135-
@pytest.fixture()
136-
def mock_inferred_layout(mock_embedded_text_regions):
137-
return [
138-
LayoutElement(
139-
r.x1,
140-
r.y1,
141-
r.x2,
142-
r.y2,
143-
text=None,
144-
source=None,
145-
type="Text",
146-
)
147-
for r in mock_embedded_text_regions
148-
]

0 commit comments

Comments
 (0)