Skip to content

feat: added confidence score and detected languages to Page classes #387

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,22 @@ def hocr_bounding_box(self) -> Optional[str]:
self.documentai_object, self._page.documentai_object.dimension
)

@cached_property
def confidence(self) -> Optional[float]:
    """Return the confidence score recorded on the wrapped page element.

    The value is read from the ``confidence`` attribute of
    ``self.documentai_object``. Because this is a cached property, the
    lookup is performed once per instance and the result is reused.

    Returns:
        The ``confidence`` attribute of the wrapped object, or ``None``
        when the wrapped object does not expose that attribute.
    """
    element = self.documentai_object
    try:
        return element.confidence
    except AttributeError:
        # Mirror a ``getattr`` default: a missing attribute is not an error.
        return None

@cached_property
def detected_languages(
    self,
) -> Optional[List[documentai.Document.Page.DetectedLanguage]]:
    """Return the detected languages recorded on the wrapped page element.

    The value is read from the ``detected_languages`` attribute of
    ``self.documentai_object``. Because this is a cached property, the
    lookup is performed once per instance and the result is reused.

    Returns:
        The ``detected_languages`` attribute of the wrapped object, or
        ``None`` when the wrapped object does not expose that attribute.
    """
    element = self.documentai_object
    try:
        return element.detected_languages
    except AttributeError:
        # Mirror a ``getattr`` default: a missing attribute is not an error.
        return None

# This field is a cached property to improve export times for hOCR
# as outlined in https://github.com/googleapis/python-documentai-toolbox/issues/312
@cached_property
Expand Down
10 changes: 10 additions & 0 deletions samples/snippets/quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,16 @@ def quickstart_sample(
print(line.text)
for token in page.tokens:
print(token.text)
# Print token confidence
print(f"\tConfidence: {token.confidence:.4f}")
# Print detected languages
if token.detected_languages:
print("\tDetected Languages:")
for lang in token.detected_languages:
confidence_str = f", confidence: {lang.confidence:.4f}" if hasattr(lang, "confidence") else ""
print(f"\t\t- {lang.language_code}{confidence_str}")
else:
print("\tNo language detected")

# Only supported with Form Parser processor
# https://cloud.google.com/document-ai/docs/form-parser
Expand Down
8 changes: 8 additions & 0 deletions samples/snippets/test_quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def test_quickstart_sample_gcs_bucket_prefix(capsys: pytest.CaptureFixture) -> N
assert "Document structure in Cloud Storage" in out
assert "Number of Pages: 1" in out
assert "Number of Entities: 35" in out
assert "Confidence:" in out
assert "Detected Languages:" in out or "No language detected" in out


def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None:
Expand All @@ -48,6 +50,8 @@ def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None:

assert "Number of Pages: 1" in out
assert "Number of Entities: 35" in out
assert "Confidence:" in out
assert "Detected Languages:" in out or "No language detected" in out


def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None:
Expand All @@ -58,6 +62,8 @@ def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None:
assert "Number of Pages: 1" in out
assert "Number of Entities: 0" in out
assert "Form Date" in out
assert "Confidence:" in out
assert "Detected Languages:" in out or "No language detected" in out


def test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) -> None:
Expand All @@ -72,6 +78,8 @@ def test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) ->
assert "Number of Pages: 1" in out
assert "Number of Entities: 0" in out
assert "Form Date" in out
assert "Confidence:" in out
assert "Detected Languages:" in out or "No language detected" in out


def test_quickstart_sample_batch_process_metadata(
Expand Down
44 changes: 44 additions & 0 deletions tests/unit/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,17 @@ def test_Block(docproto):

assert block.paragraphs

# Check confidence value
assert isinstance(block.confidence, float)
assert 0.0 <= block.confidence <= 1.0

# Check detected languages
assert isinstance(block.detected_languages, list)
if block.detected_languages:
for language in block.detected_languages:
assert isinstance(language, documentai.Document.Page.DetectedLanguage)
assert hasattr(language, "language_code")


def test_Paragraph(docproto):
wrapped_page = page.Page(
Expand All @@ -268,6 +279,17 @@ def test_Paragraph(docproto):

assert paragraph.lines

# Check confidence value
assert isinstance(paragraph.confidence, float)
assert 0.0 <= paragraph.confidence <= 1.0

# Check detected languages
assert isinstance(paragraph.detected_languages, list)
if paragraph.detected_languages:
for language in paragraph.detected_languages:
assert isinstance(language, documentai.Document.Page.DetectedLanguage)
assert hasattr(language, "language_code")


def test_Line(docproto):
wrapped_page = page.Page(
Expand All @@ -284,6 +306,17 @@ def test_Line(docproto):

assert line.tokens

# Check confidence value
assert isinstance(line.confidence, float)
assert 0.0 <= line.confidence <= 1.0

# Check detected languages
assert isinstance(line.detected_languages, list)
if line.detected_languages:
for language in line.detected_languages:
assert isinstance(language, documentai.Document.Page.DetectedLanguage)
assert hasattr(language, "language_code")


def test_Token(docproto):
wrapped_page = page.Page(
Expand All @@ -298,6 +331,17 @@ def test_Token(docproto):
assert token.text == "Q.\n"
assert token.hocr_bounding_box == "bbox 585 1781 620 1818"

# Check confidence value
assert isinstance(token.confidence, float)
assert 0.0 <= token.confidence <= 1.0

# Check detected languages
assert isinstance(token.detected_languages, list)
if token.detected_languages:
for language in token.detected_languages:
assert isinstance(language, documentai.Document.Page.DetectedLanguage)
assert hasattr(language, "language_code")

assert token.symbols == []


Expand Down
Loading