-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathocr_test.py
More file actions
95 lines (81 loc) · 3.28 KB
/
ocr_test.py
File metadata and controls
95 lines (81 loc) · 3.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import sys
import json
from PIL import Image, ImageDraw
import google.generativeai as genai
def _pixel_box(balloon, width, height):
    """Scale a balloon's fractional bounding box to pixel coordinates.

    Args:
        balloon: Mapping with numeric ``xmin``, ``ymin``, ``xmax`` and
            ``ymax`` entries expressed as fractions of the image size.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        ``[left, top, right, bottom]`` in pixels, with ``left <= right``
        and ``top <= bottom``.

    Raises:
        KeyError: If one of the four coordinate keys is missing.
    """
    x_a = balloon["xmin"] * width
    x_b = balloon["xmax"] * width
    y_a = balloon["ymin"] * height
    y_b = balloon["ymax"] * height
    # Nothing guarantees the model returns min <= max. Order the corners,
    # because ImageDraw.rectangle rejects x1 < x0 / y1 < y0 in recent
    # Pillow releases.
    return [min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b)]


def test_ocr(image_path):
    """Run the Gemini OCR prompt on one image and save an annotated copy.

    Sends the image together with the text-detection prompt to Gemini,
    parses the JSON answer, draws a red rectangle around every reported
    text area, prints each transcription, and writes the annotated image
    next to the input as ``<name>_ocr_test.png``.

    Args:
        image_path: Path of the comic panel image to analyse.

    Returns:
        None. Returns early (after printing a diagnostic) when the response
        carries no text or the text is not valid JSON.

    Raises:
        SystemExit: With status 1 when neither ``GEMINI_API_KEY`` nor
            ``API_KEY`` is set in the environment.
    """
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("API_KEY")
    if not api_key:
        print("Set the GEMINI_API_KEY environment variable.")
        sys.exit(1)
    genai.configure(api_key=api_key)

    # Prompt taken from KoogOcrService.
    prompt_instructions = (
        "Analyze this comic panel. Locate ALL areas containing text (speech bubbles, captions, and text boxes). "
        "Do NOT transcribe sound effects (SFX) or onomatopoeia that appear OUTSIDE of speech bubbles (e.g. drawn directly on the artwork). "
        "For each text area provide:\n"
        "  1. The bounding box of the TEXT ITSELF (not the balloon outline).\n"
        "     Express coordinates as FRACTIONS of the image dimensions, between 0.0 and 1.0:\n"
        "     xmin = left edge / image_width,  ymin = top edge / image_height,\n"
        "     xmax = right edge / image_width, ymax = bottom edge / image_height.\n"
        "  2. The exact text transcribed from that area."
    )

    # Use gemini-1.5-pro and force a JSON answer that follows the schema below.
    model = genai.GenerativeModel(
        "gemini-1.5-pro",
        generation_config={
            "response_mime_type": "application/json",
            "response_schema": {
                "type": "object",
                "properties": {
                    "balloons": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "xmin": {"type": "number"},
                                "ymin": {"type": "number"},
                                "xmax": {"type": "number"},
                                "ymax": {"type": "number"},
                                "text": {"type": "string"},
                            },
                            "required": ["xmin", "ymin", "xmax", "ymax", "text"],
                        },
                    }
                },
                "required": ["balloons"],
            },
        },
    )

    print(f"Loading image: {image_path}")
    # The context manager releases the underlying file handle even if the
    # API call fails.
    with Image.open(image_path) as img:
        width, height = img.size

        print("Calling Gemini...")
        response = model.generate_content([prompt_instructions, img])

        # Draw on an independent copy. Modes other than RGB/RGBA are
        # converted so the red outline renders in colour and the PNG writer
        # accepts the image (e.g. CMYK sources).
        if img.mode in ("RGB", "RGBA"):
            canvas = img.copy()
        else:
            canvas = img.convert("RGB")

    # Read the text once, outside the JSON handler: the `.text` accessor
    # itself raises ValueError when the response has no text part, and
    # re-reading it while handling an error would raise a second time.
    try:
        raw_text = response.text
    except ValueError as e:
        print("Gemini returned no text:", e)
        return

    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        print("Raw response:", raw_text)
        return

    print("Drawing bounding boxes...")
    draw = ImageDraw.Draw(canvas)

    balloons = data.get("balloons", [])
    for i, b in enumerate(balloons):
        # Draw the box.
        draw.rectangle(_pixel_box(b, width, height), outline="red", width=3)
        print(
            f"Balloon {i + 1}: {b['text']} @ {b['xmin']:.2f}, {b['ymin']:.2f}, {b['xmax']:.2f}, {b['ymax']:.2f}"
        )

    out_path = os.path.splitext(image_path)[0] + "_ocr_test.png"
    canvas.save(out_path)
    print(f"Saved result to {out_path}")
if __name__ == "__main__":
    # Script entry point: the first positional argument is the image to
    # analyse; without it, show the usage line and exit with status 1.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python ocr_test.py <image_path>")
        sys.exit(1)

    test_ocr(cli_args[0])