Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions docling_core/transforms/serializer/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,23 @@
DocSerializer,
create_ser_result,
)
from docling_core.types.doc.base import CoordOrigin
from docling_core.types.doc.document import (
from docling_core.types.doc import (
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
FormItem,
InlineGroup,
KeyValueItem,
ListGroup,
NodeItem,
PictureItem,
ProvenanceItem,
RefItem,
RichTableCell,
TableItem,
TextItem,
)
from docling_core.types.doc.labels import DocItemLabel


def _bbox_to_polygon_coords(
Expand All @@ -78,7 +79,7 @@ def _bbox_to_polygon_for_item(
doc: DoclingDocument, item: DocItem
) -> Optional[list[float]]:
"""Compute a TOPLEFT-origin polygon for the first provenance of the item."""
if not item.prov:
if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
return None

prov = item.prov[0]
Expand Down Expand Up @@ -189,7 +190,7 @@ def serialize(

# Lists may be represented either as TextItem(ListItem) or via groups;
# we treat any TextItem as a paragraph-like entry.
if item.prov:
if item.prov and isinstance(item.prov[0], ProvenanceItem):
prov = item.prov[0]
page_no = prov.page_no
polygon = _bbox_to_polygon_for_item(doc, item)
Expand Down Expand Up @@ -241,7 +242,7 @@ def serialize(
) -> SerializationResult:
assert isinstance(doc_serializer, AzureDocSerializer)

if not item.prov:
if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
return create_ser_result()

prov = item.prov[0]
Expand Down Expand Up @@ -322,7 +323,7 @@ def serialize(
) -> SerializationResult:
assert isinstance(doc_serializer, AzureDocSerializer)

if not item.prov:
if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
return create_ser_result()

prov = item.prov[0]
Expand All @@ -340,7 +341,11 @@ def serialize(
for foot_ref in item.footnotes:
if isinstance(foot_ref, RefItem):
tgt = foot_ref.resolve(doc)
if isinstance(tgt, TextItem) and tgt.prov:
if (
isinstance(tgt, TextItem)
and tgt.prov
and isinstance(tgt.prov[0], ProvenanceItem)
):
f_poly = _bbox_to_polygon_for_item(doc, tgt)
if f_poly is not None:
foots.append(
Expand Down
25 changes: 19 additions & 6 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@
SerializationResult,
Span,
)
from docling_core.types.doc.document import (
DOCUMENT_TOKENS_EXPORT_LABELS,
from docling_core.types.doc import (
ContentLayer,
DescriptionAnnotation,
DocItem,
DocItemLabel,
DoclingDocument,
FloatingItem,
Formatting,
Expand All @@ -51,12 +51,13 @@
PictureDataType,
PictureItem,
PictureMoleculeData,
ProvenanceItem,
Script,
TableAnnotationType,
TableItem,
TextItem,
)
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS

_DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
Expand Down Expand Up @@ -110,7 +111,11 @@ def _iterate_items(
add_page_breaks=add_page_breaks,
visited=my_visited,
):
if isinstance(it, DocItem) and it.prov:
if (
isinstance(it, DocItem)
and it.prov
and isinstance(it.prov[0], ProvenanceItem)
):
page_no = it.prov[0].page_no
if prev_page_nr is not None and page_no > prev_page_nr:
yield _PageBreakNode(
Expand All @@ -119,7 +124,11 @@ def _iterate_items(
next_page=page_no,
), lvl
break
elif isinstance(item, DocItem) and item.prov:
elif (
isinstance(item, DocItem)
and item.prov
and isinstance(item.prov[0], ProvenanceItem)
):
page_no = item.prov[0].page_no
if prev_page_nr is None or page_no > prev_page_nr:
if prev_page_nr is not None: # close previous range
Expand Down Expand Up @@ -288,7 +297,10 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
params.pages is not None
and (
(not item.prov)
or item.prov[0].page_no not in params.pages
or (
isinstance(item.prov[0], ProvenanceItem)
and item.prov[0].page_no not in params.pages
)
)
)
)
Expand Down Expand Up @@ -635,6 +647,7 @@ def _get_applicable_pages(self) -> Optional[list[int]]:
if (
isinstance(item, DocItem)
and item.prov
and isinstance(item.prov[0], ProvenanceItem)
and (
self.params.pages is None
or item.prov[0].page_no in self.params.pages
Expand Down
14 changes: 8 additions & 6 deletions docling_core/transforms/serializer/doctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@
_should_use_legacy_annotations,
create_ser_result,
)
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.document import (
BoundingBox,
CodeItem,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentToken,
FloatingItem,
FormItem,
GroupItem,
Expand All @@ -40,17 +42,17 @@
ListItem,
NodeItem,
PictureClassificationData,
PictureClassificationLabel,
PictureItem,
PictureMoleculeData,
PictureTabularChartData,
ProvenanceItem,
SectionHeaderItem,
TableData,
TableItem,
TableToken,
TextItem,
)
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken


def _wrap(text: str, wrap_tag: str) -> str:
Expand Down Expand Up @@ -360,7 +362,7 @@ def serialize(
results: list[SerializationResult] = []

page_no = 1
if len(item.prov) > 0:
if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem):
page_no = item.prov[0].page_no

if params.add_location:
Expand All @@ -380,7 +382,7 @@ def serialize(

for cell in item.graph.cells:
cell_txt = ""
if cell.prov is not None:
if cell.prov is not None and isinstance(cell.prov, ProvenanceItem):
if len(doc.pages.keys()):
page_w, page_h = doc.pages[page_no].size.as_tuple()
cell_txt += DocumentToken.get_location(
Expand Down Expand Up @@ -492,7 +494,7 @@ def _get_inline_location_tags(
doc_items: list[DocItem] = []
for it, _ in doc.iterate_items(root=item):
if isinstance(it, DocItem):
for prov in it.prov:
for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)):
boxes.append(prov.bbox)
doc_items.append(it)
if prov is None:
Expand Down
17 changes: 14 additions & 3 deletions docling_core/transforms/visualizer/key_value_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,13 @@
from typing_extensions import override

from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc.document import ContentLayer, DoclingDocument
from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
from docling_core.types.doc import (
ContentLayer,
DoclingDocument,
GraphCellLabel,
GraphLinkLabel,
ProvenanceItem,
)

# ---------------------------------------------------------------------------
# Helper functions / constants
Expand Down Expand Up @@ -78,7 +83,11 @@ def _draw_key_value_layer(
# First draw cells (rectangles + optional labels)
# ------------------------------------------------------------------
for cell in cell_dict.values():
if cell.prov is None or cell.prov.page_no != page_no:
if (
cell.prov is None
or not isinstance(cell.prov, ProvenanceItem)
or cell.prov.page_no != page_no
):
continue # skip cells not on this page or without bbox

tl_bbox = cell.prov.bbox.to_top_left_origin(
Expand Down Expand Up @@ -127,6 +136,8 @@ def _draw_key_value_layer(
if (
src_cell.prov is None
or tgt_cell.prov is None
or not isinstance(src_cell.prov, ProvenanceItem)
or not isinstance(tgt_cell.prov, ProvenanceItem)
or src_cell.prov.page_no != page_no
or tgt_cell.prov.page_no != page_no
):
Expand Down
18 changes: 13 additions & 5 deletions docling_core/transforms/visualizer/layout_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,16 @@
from typing_extensions import override

from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc import DocItemLabel
from docling_core.types.doc.base import CoordOrigin
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling_core.types.doc import (
BoundingRectangle,
ContentLayer,
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
ProvenanceItem,
TextCell,
)


class _TLBoundingRectangle(BoundingRectangle):
Expand Down Expand Up @@ -157,7 +163,9 @@ def _draw_doc_layout(
if len(elem.prov) == 0:
continue # Skip elements without provenances

for prov in elem.prov:
for prov in (
item for item in elem.prov if isinstance(item, ProvenanceItem)
):
page_nr = prov.page_no

if page_nr in my_images:
Expand Down
11 changes: 9 additions & 2 deletions docling_core/transforms/visualizer/reading_order_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
from typing_extensions import override

from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
from docling_core.types.doc.document import (
ContentLayer,
DocItem,
DoclingDocument,
ProvenanceItem,
)


class _NumberDrawingData(BaseModel):
Expand Down Expand Up @@ -102,7 +107,9 @@ def _draw_doc_reading_order(
if len(elem.prov) == 0:
continue # Skip elements without provenances

for prov in elem.prov:
for prov in (
item for item in elem.prov if isinstance(item, ProvenanceItem)
):
page_no = prov.page_no
image = my_images.get(page_no)

Expand Down
11 changes: 8 additions & 3 deletions docling_core/transforms/visualizer/table_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
from typing_extensions import override

from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
from docling_core.types.doc import (
ContentLayer,
DoclingDocument,
ProvenanceItem,
TableItem,
)

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -171,12 +176,12 @@ def _draw_doc_tables(
image = deepcopy(pil_img)
my_images[page_nr] = image

for idx, (elem, _) in enumerate(
for _, (elem, _) in enumerate(
doc.iterate_items(included_content_layers=included_content_layers)
):
if not isinstance(elem, TableItem):
continue
if len(elem.prov) == 0:
if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem):
continue # Skip elements without provenances

if len(elem.prov) == 1:
Expand Down
1 change: 1 addition & 0 deletions docling_core/types/doc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
Script,
SectionHeaderItem,
SummaryMetaField,
TableAnnotationType,
TableCell,
TableData,
TableItem,
Expand Down
Loading
Loading