Skip to content

Commit 4dd9c85

Browse files
committed
fix: rich table triplet serialization
Signed-off-by: Vdaleke <[email protected]>
1 parent 6d48bf5 commit 4dd9c85

File tree

4 files changed

+78
-4
lines changed

4 files changed

+78
-4
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,11 @@ def serialize(
6969
parts.append(cap_res)
7070

7171
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
72-
table_df = item.export_to_dataframe(doc)
72+
table_df = item.export_to_dataframe(
73+
doc,
74+
doc_serializer=doc_serializer,
75+
**kwargs,
76+
)
7377
if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
7478

7579
# copy header as first row and shift all rows by one

docling_core/types/doc/document.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1840,7 +1840,9 @@ def _migrate_annotations_to_meta(self) -> Self:
18401840
return self
18411841

18421842
def export_to_dataframe(
1843-
self, doc: Optional["DoclingDocument"] = None
1843+
self,
1844+
doc: Optional["DoclingDocument"] = None,
1845+
**kwargs: Any,
18441846
) -> pd.DataFrame:
18451847
"""Export the table as a Pandas DataFrame."""
18461848
if doc is None:
@@ -1876,14 +1878,14 @@ def export_to_dataframe(
18761878
columns = ["" for _ in range(self.data.num_cols)]
18771879
for i in range(num_headers):
18781880
for j, cell in enumerate(self.data.grid[i]):
1879-
col_name = cell._get_text(doc=doc)
1881+
col_name = cell._get_text(doc=doc, **kwargs)
18801882
if columns[j] != "":
18811883
col_name = f".{col_name}"
18821884
columns[j] += col_name
18831885

18841886
# Create table data
18851887
table_data = [
1886-
[cell._get_text(doc=doc) for cell in row]
1888+
[cell._get_text(doc=doc, **kwargs) for cell in row]
18871889
for row in self.data.grid[num_headers:]
18881890
]
18891891

test/data/chunker/0c_out_chunks.json

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
{
2+
"root": [
3+
{
4+
"text": "cell 0,0, 1 = cell 0,1. cell 1,0, 1 = <em><p>text in italic</p></em>. <ul>\n<li>list item 1</li>\n<li>list item 2</li>\n</ul>, 1 = cell 2,1. cell 3,0, 1 = inner cell 0,0, 1 = inner cell 0,1. inner cell 0,0, 2 = inner cell 0,2. inner cell 1,0, 1 = inner cell 1,1. inner cell 1,0, 2 = inner cell 1,2. <p>Some text in a generic group.</p>\n<p>More text in the group.</p>, 1 = cell 4,1",
5+
"meta": {
6+
"schema_name": "docling_core.transforms.chunker.DocMeta",
7+
"version": "1.0.0",
8+
"doc_items": [
9+
{
10+
"self_ref": "#/tables/0",
11+
"parent": {
12+
"$ref": "#/body"
13+
},
14+
"children": [
15+
{
16+
"$ref": "#/texts/1"
17+
},
18+
{
19+
"$ref": "#/groups/0"
20+
},
21+
{
22+
"$ref": "#/tables/1"
23+
},
24+
{
25+
"$ref": "#/groups/1"
26+
}
27+
],
28+
"content_layer": "body",
29+
"label": "table",
30+
"prov": []
31+
}
32+
],
33+
"headings": [
34+
"Rich tables"
35+
]
36+
}
37+
}
38+
]
39+
}

test/test_hierarchical_chunker.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,12 +10,15 @@
1010
ChunkingDocSerializer,
1111
ChunkingSerializerProvider,
1212
DocChunk,
13+
TripletTableSerializer,
1314
)
15+
from docling_core.transforms.serializer.html import HTMLDocSerializer
1416
from docling_core.transforms.serializer.markdown import MarkdownTableSerializer
1517
from docling_core.types.doc import DoclingDocument as DLDocument
1618
from docling_core.types.doc.document import DoclingDocument
1719

1820
from .test_data_gen_flag import GEN_TEST_DATA
21+
from .test_docling_doc import _construct_rich_table_doc
1922

2023

2124
def _process(act_data, exp_path_str):
@@ -71,3 +74,29 @@ def get_serializer(self, doc: DoclingDocument):
7174
act_data=act_data,
7275
exp_path_str="test/data/chunker/0b_out_chunks.json",
7376
)
77+
78+
79+
def test_chunk_rich_table_custom_serializer():
80+
doc = _construct_rich_table_doc()
81+
82+
class MySerializerProvider(ChunkingSerializerProvider):
83+
def get_serializer(self, doc: DoclingDocument):
84+
return HTMLDocSerializer(
85+
doc=doc,
86+
table_serializer=TripletTableSerializer(),
87+
)
88+
89+
chunker = HierarchicalChunker(
90+
merge_list_items=True,
91+
serializer_provider=MySerializerProvider(),
92+
)
93+
94+
chunks = chunker.chunk(dl_doc=doc)
95+
act_data = dict(
96+
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
97+
)
98+
99+
_process(
100+
act_data=act_data,
101+
exp_path_str="test/data/chunker/0c_out_chunks.json",
102+
)

0 commit comments

Comments
 (0)