From 9b774b3227a158b7b93098b2a5c2abe5702bdc89 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Sat, 22 Mar 2025 07:20:00 +0100 Subject: [PATCH 1/2] adding unit-tests for docling-mcp Signed-off-by: Peter Staar --- .pre-commit-config.yaml | 1 + docling_mcp/tools/generation.py | 99 +++++++++++++++++++++++++++++++++ pyproject.toml | 3 +- tests/test_generation_tools.py | 49 ++++++++++++++++ 4 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 tests/test_generation_tools.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ec50965..73d7a57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,6 +9,7 @@ repos: # Run the Ruff linter. - id: ruff args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml] + files: ^(docling_mcp|tests)/.*\.py$ - repo: local hooks: - id: system diff --git a/docling_mcp/tools/generation.py b/docling_mcp/tools/generation.py index ae24d57..27aa1e5 100644 --- a/docling_mcp/tools/generation.py +++ b/docling_mcp/tools/generation.py @@ -1,5 +1,15 @@ import hashlib +from io import BytesIO +# from bs4 import BeautifulSoup # , NavigableString, PageElement, Tag +from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat +from docling.datamodel.document import ( + ConversionResult, + DoclingDocument, +) +from docling.document_converter import DocumentConverter + +# from docling.backend.html_backend import HTMLDocumentBackend from docling_core.types.doc.document import ( ContentLayer, DoclingDocument, @@ -13,8 +23,12 @@ ) from docling_mcp.docling_cache import get_cache_dir +from docling_mcp.logger import setup_logger from docling_mcp.shared import local_document_cache, local_stack_cache, mcp +# Create a default project logger +logger = setup_logger() + def hash_string_md5(input_string: str) -> str: """Creates an md5 hash-string from the input string.""" @@ -407,3 +421,88 @@ def add_listitem_to_list_in_docling_document( ) return f"added listitem to list in document with key: {document_key}" + + +@mcp.tool() +def add_table_in_html_format_to_docling_document( + document_key: str, + html_table: str, + table_captions: list[str] = [], + table_footnotes: list[str] = [], +) -> str: + """ + Adds an HTML-formatted table to an existing document in the local document cache. + + This tool parses the provided HTML table string, converts it to a structured table + representation, and adds it to the specified document. It also supports optional + captions and footnotes for the table. + + Args: + document_key (str): The unique identifier for the document in the local cache. + html_table (str): The HTML string representation of the table to add. + table_captions (list[str], optional): A list of caption strings to associate with the table. + table_footnotes (list[str], optional): A list of footnote strings to associate with the table. + + Returns: + str: A confirmation message indicating the table was successfully added. + + Raises: + ValueError: If the specified document_key does not exist in the local cache. + ValueError: If the stack size for the document is zero. + HTMLParseError: If the provided HTML table string cannot be properly parsed. + + Example: + add_table_in_html_format_to_docling_document( + document_key="doc123", + html_table="
NameAge
John30
", + table_captions=["Table 1: Sample demographic data"], + table_footnotes=["Data collected in 2023"] + ) + + Example with rowspan and colspan: + add_table_in_html_format_to_docling_document( + document_key="doc123", + html_table="
Demographics
NameAge
John30
Jane
", + table_captions=["Table 2: Complex demographic data with merged cells"] + ) + """ + if document_key not in local_document_cache: + doc_keys = ", ".join(local_document_cache.keys()) + raise ValueError( + f"document-key: {document_key} is not found. Existing document-keys are: {doc_keys}" + ) + + doc = local_document_cache[document_key] + + if len(local_stack_cache[document_key]) == 0: + raise ValueError( + f"Stack size is zero for document with document-key: {document_key}. Abort document generation" + ) + + html_doc: str = f"{html_table}" + + buff = BytesIO(html_doc.encode("utf-8")) + doc_stream = DocumentStream(name="tmp", stream=buff) + + converter = DocumentConverter(allowed_formats=[InputFormat.HTML]) + conv_result: ConversionResult = converter.convert(doc_stream) + + if ( + conv_result.status == ConversionStatus.SUCCESS + and len(conv_result.document.tables) > 0 + ): + table = doc.add_table(data=conv_result.document.tables[0].data) + + for _ in table_captions: + caption = doc.add_text(label=DocItemLabel.CAPTION, text=_) + table.captions.append(caption.get_ref()) + + for _ in table_footnotes: + footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text=_) + table.footnotes.append(footnote.get_ref()) + else: + raise ValueError( + "Could not parse the html string of the table! Please fix the html and try again!" + ) + + return f"Added table to a document with key: {document_key}" diff --git a/pyproject.toml b/pyproject.toml index 7326a25..3aa9c27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ Changelog = "https://github.com/docling-project/docling-mcp/blob/main/CHANGELOG. target-version = "py39" line-length = 88 respect-gitignore = true +include = ["docling_mcp", "tests"] [tool.ruff.format] skip-magic-trailing-comma = false @@ -110,7 +111,7 @@ ignore = [ [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401"] -"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests +# "tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests [tool.ruff.lint.mccabe] max-complexity = 15 diff --git a/tests/test_generation_tools.py b/tests/test_generation_tools.py new file mode 100644 index 0000000..996c014 --- /dev/null +++ b/tests/test_generation_tools.py @@ -0,0 +1,49 @@ +import re + +# Create a default project logger +from docling_mcp.logger import setup_logger +from docling_mcp.shared import local_document_cache +from docling_mcp.tools.generation import ( # noqa: F401 + add_listitem_to_list_in_docling_document, + add_paragraph_to_docling_document, + add_section_heading_to_docling_document, + add_table_in_html_format_to_docling_document, + add_title_to_docling_document, + close_list_in_docling_document, + create_new_docling_document, + export_docling_document_to_markdown, + open_list_in_docling_document, + save_docling_document, +) + +logger = setup_logger() + + +def test_create_docling_document(): + reply = create_new_docling_document(prompt="test-document") + key = extract_key_from_reply(reply=reply) + + assert key in local_document_cache + + +def extract_key_from_reply(reply: str) -> str: + match = re.search(r"document-key:\s*([a-fA-F0-9]{32})", reply) + if match: + return match.group(1) + + return "" + + +def test_table_in_html_format_to_docling_document(): + reply = create_new_docling_document(prompt="test-document") + key = extract_key_from_reply(reply=reply) + + html_table: str = "
Demographics
NameAge
John30
Jane
" + + reply = add_table_in_html_format_to_docling_document( + document_key=key, + html_table=html_table, + table_captions=["Table 2: Complex demographic data with merged cells"], + ) + + assert reply == f"Added table to a document with key: {key}" From 300b54ba44134c1640160dc2b1d65d4f6ba9bc20 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Fri, 12 Sep 2025 13:15:19 +0200 Subject: [PATCH 2/2] feat: added the extractor Signed-off-by: Peter Staar --- examples/mellea/agent/extractor.py | 50 ++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 examples/mellea/agent/extractor.py diff --git a/examples/mellea/agent/extractor.py b/examples/mellea/agent/extractor.py new file mode 100644 index 0000000..95c7acc --- /dev/null +++ b/examples/mellea/agent/extractor.py @@ -0,0 +1,50 @@ +import copy +import logging +import re +from datetime import datetime +from enum import Enum +from io import BytesIO +from typing import ClassVar +import json + +from pydantic import BaseModel, Field, validator + +from examples.mellea.agent_models import setup_local_session +from examples.mellea.agent.base import DoclingAgentType, BaseDoclingAgent + +# Configure logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +class DoclingExtractingAgent(BaseDoclingAgent): + system_prompt_for_editing_document: ClassVar[str] = ( + SYSTEM_PROMPT_FOR_EDITING_DOCUMENT + ) + system_prompt_for_editing_table: ClassVar[str] = SYSTEM_PROMPT_FOR_EDITING_TABLE + + system_prompt_expert_writer: ClassVar[str] = SYSTEM_PROMPT_EXPERT_WRITER + + def __init__(self, *, model_id: ModelIdentifier, tools: list[Tool]): + super().__init__( + agent_type=DoclingAgentType.DOCLING_DOCUMENT_EXTRACTOR, + model_id=model_id, + tools=tools, + ) + + def run(self, task: str, document: DoclingDocument, **kwargs) -> DoclingDocument: + schema: dict = self._extract_schema_from_task(task=task) + + extractions = [] + for item, level in document.iterate_items(): + if isinstance(item, TextItem): + self._extract_from_text_item( + item=item, schema=schema, extractions=extractions + ) + + return document + + def _extract_schema_from_task(self, task: str) -> dict: + return {}