
Commit 1f62313

added confidence score and detected languages to Token class and tests

1 parent: b1f3772
6 files changed: +248 -0 lines changed

google/cloud/documentai_toolbox/wrappers/page.py

Lines changed: 18 additions & 0 deletions
@@ -247,11 +247,29 @@ class Token(_BasePageElement):
             Required. The text of the Token.
         symbols (List[Symbol]):
             Optional. The Symbols contained within the Token.
+        confidence (float):
+            Optional. The confidence score of the Token detection.
+        detected_languages (List[documentai.Document.Page.DetectedLanguage]):
+            Optional. A list of detected languages for this Token.
     """

     @cached_property
     def symbols(self) -> List[Symbol]:
         return self._get_children_of_element(self._page.symbols)
+
+    @cached_property
+    def confidence(self) -> float:
+        """
+        The confidence score of the Token detection.
+        """
+        return self.documentai_object.layout.confidence
+
+    @cached_property
+    def detected_languages(self) -> List[documentai.Document.Page.DetectedLanguage]:
+        """
+        A list of detected languages for this Token.
+        """
+        return self.documentai_object.detected_languages


 @dataclasses.dataclass
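Taken together, the two new cached properties expose per-token confidence and detected languages directly from the toolbox wrapper. The snippet below is a minimal usage sketch, not part of this diff; the document path and the 0.8 threshold are placeholders.

from google.cloud.documentai_toolbox import document

# Minimal sketch (not part of this commit); the path below is a placeholder.
wrapped_document = document.Document.from_document_path("path/to/document.json")

for page in wrapped_document.pages:
    # Keep only tokens detected with reasonably high confidence.
    confident_tokens = [t for t in page.tokens if t.confidence >= 0.8]
    print(f"Page {page.page_number}: {len(confident_tokens)} of {len(page.tokens)} tokens at >= 0.8 confidence")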
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import token_confidence_sample


def test_token_confidence_sample(capsys):
    # Use a test document from the resources directory
    test_file_path = os.path.join(
        os.path.dirname(__file__), "resources", "form_with_tables.json"
    )

    # Run the sample
    token_confidence_sample.token_confidence_sample(
        document_path=test_file_path
    )

    # Capture output
    stdout, _ = capsys.readouterr()

    # Check that the output contains expected strings
    assert "Token" in stdout
    assert "Confidence:" in stdout
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import token_detected_languages_sample


def test_token_detected_languages_sample(capsys):
    # Use a test document from the resources directory
    test_file_path = os.path.join(
        os.path.dirname(__file__), "resources", "form_with_tables.json"
    )

    # Run the sample
    token_detected_languages_sample.token_detected_languages_sample(
        document_path=test_file_path
    )

    # Capture output
    stdout, _ = capsys.readouterr()

    # Check that the output contains expected strings
    assert "Token" in stdout
    assert "Detected Languages:" in stdout
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_toolbox_token_confidence]
from typing import Optional

from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# gcs_uri = "gs://bucket/path/to/folder/document.json"


def token_confidence_sample(
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
) -> None:
    """Demonstrates how to access token-level confidence scores.

    Args:
        gcs_uri (Optional[str]):
            URI to a Document JSON file in GCS.
        document_path (Optional[str]):
            Path to a local Document JSON file.
    """
    if gcs_uri:
        # Load a single Document from a Google Cloud Storage URI
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        # Load from local `Document` JSON file
        wrapped_document = document.Document.from_document_path(document_path)
    else:
        raise ValueError("No document source provided.")

    # Display token confidence for the first page
    if wrapped_document.pages:
        page = wrapped_document.pages[0]
        print(f"Page {page.page_number} Tokens:")

        for i, token in enumerate(page.tokens[:10]):  # Limiting to first 10 tokens for brevity
            print(f"Token {i}: '{token.text.strip()}'")
            print(f"  Confidence: {token.confidence:.4f}")
            print()
# [END documentai_toolbox_token_confidence]


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--gcs_uri", help="GCS URI to Document JSON.")
    group.add_argument("--document_path", help="Path to local Document JSON file.")
    args = parser.parse_args()

    token_confidence_sample(
        gcs_uri=args.gcs_uri,
        document_path=args.document_path,
    )
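As a follow-up to the sample above, here is a minimal sketch (not part of this commit) that aggregates the new per-token confidence scores into a page-level average; it assumes wrapped_document was loaded exactly as in the sample.

# Minimal sketch, not part of this commit: page-level average of the new
# per-token confidence scores. Assumes `wrapped_document` was loaded as above.
for page in wrapped_document.pages:
    if page.tokens:
        average_confidence = sum(t.confidence for t in page.tokens) / len(page.tokens)
        print(f"Page {page.page_number}: average token confidence {average_confidence:.4f}")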
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_toolbox_token_detected_languages]
from typing import Optional

from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# gcs_uri = "gs://bucket/path/to/folder/document.json"


def token_detected_languages_sample(
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
) -> None:
    """Demonstrates how to access token-level detected languages.

    Args:
        gcs_uri (Optional[str]):
            URI to a Document JSON file in GCS.
        document_path (Optional[str]):
            Path to a local Document JSON file.
    """
    if gcs_uri:
        # Load a single Document from a Google Cloud Storage URI
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        # Load from local `Document` JSON file
        wrapped_document = document.Document.from_document_path(document_path)
    else:
        raise ValueError("No document source provided.")

    # Display detected languages for tokens in the first page
    if wrapped_document.pages:
        page = wrapped_document.pages[0]
        print(f"Page {page.page_number} Tokens:")

        for i, token in enumerate(page.tokens[:10]):  # Limiting to first 10 tokens for brevity
            print(f"Token {i}: '{token.text.strip()}'")

            if token.detected_languages:
                print("  Detected Languages:")
                for lang in token.detected_languages:
                    confidence_str = f", confidence: {lang.confidence:.4f}" if hasattr(lang, "confidence") else ""
                    print(f"    - {lang.language_code}{confidence_str}")
            else:
                print("  No language detected")
            print()
# [END documentai_toolbox_token_detected_languages]


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--gcs_uri", help="GCS URI to Document JSON.")
    group.add_argument("--document_path", help="Path to local Document JSON file.")
    args = parser.parse_args()

    token_detected_languages_sample(
        gcs_uri=args.gcs_uri,
        document_path=args.document_path,
    )
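The detected_languages property also lends itself to simple aggregation. The sketch below (again, not part of this commit) tallies tokens per detected language code across the whole document, assuming wrapped_document was loaded as in the sample above.

from collections import Counter

# Minimal sketch, not part of this commit: tally tokens by detected language
# code. Assumes `wrapped_document` was loaded as in the sample above.
language_counts = Counter()
for page in wrapped_document.pages:
    for token in page.tokens:
        for lang in token.detected_languages:
            language_counts[lang.language_code] += 1

for language_code, count in language_counts.most_common():
    print(f"{language_code}: {count} token(s)")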

tests/unit/test_page.py

Lines changed: 11 additions & 0 deletions
@@ -297,6 +297,17 @@ def test_Token(docproto):
     # checking cached value
     assert token.text == "Q.\n"
     assert token.hocr_bounding_box == "bbox 585 1781 620 1818"
+
+    # Check confidence value
+    assert isinstance(token.confidence, float)
+    assert 0.0 <= token.confidence <= 1.0
+
+    # Check detected languages
+    assert isinstance(token.detected_languages, list)
+    if token.detected_languages:
+        for language in token.detected_languages:
+            assert isinstance(language, documentai.Document.Page.DetectedLanguage)
+            assert hasattr(language, "language_code")

     assert token.symbols == []