Skip to content

Commit d3f16d9

Browse files
feat: Enhance result formatting and emergency tool integration
- Added light level and emotions formatting to vision results for LOD 2+
- Included currency detection in vision results
- Expanded OCR result formatting for specialized document types: medicine labels, receipts, food packaging, and business cards
- Updated prompt builder to include new information on currency, emotions, and lighting changes
- Implemented emergency help tool to provide nearest services and country-specific emergency numbers
- Introduced tests for new functionalities in result formatters and emergency tool
- Enhanced spatial change detection with new rules for vehicle and obstacle detection
1 parent e755dfe commit d3f16d9

14 files changed

+1320
-17
lines changed

agents/ocr_agent.py

Lines changed: 78 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,8 @@
3838
Rules:
3939
1. Extract every piece of readable text — signs, menus, labels, documents, \
4040
screens, handwriting.
41-
2. Classify the text type: "menu", "sign", "document", "label", or "unknown".
41+
2. Classify the text type: "menu", "sign", "document", "label", \
42+
"medicine_label", "receipt", "food_packaging", "business_card", or "unknown".
4243
3. For menus: parse into individual items with prices when visible. Format \
4344
each item as "Item Name - $Price" or just "Item Name" if no price. \
4445
Group items by category when clear (e.g. appetizers, mains, drinks, desserts).
@@ -47,8 +48,32 @@
4748
6. Report confidence based on text clarity (0.0 = unreadable, 1.0 = crystal clear).
4849
7. If no text is visible, return empty results with confidence 0.0.
4950
51+
## Specialized Document Types
52+
53+
For medicine labels / prescription bottles:
54+
- Extract: drug name, dosage/strength, frequency/directions, warnings/side effects, \
55+
expiry date, manufacturer. Populate the medicine_info field.
56+
- Safety-critical: always flag drug interaction warnings and "do not exceed" limits.
57+
58+
For receipts:
59+
- Extract: store/merchant name, individual items with prices, subtotal, tax, total, \
60+
payment method, change given, date/time. Populate the receipt_info field.
61+
- Read items in order from top to bottom.
62+
63+
For food packaging / nutrition labels:
64+
- Extract: product name, allergens (CRITICAL for safety — always extract), \
65+
calories per serving, serving size, key nutrients, ingredients list. \
66+
Populate the nutrition_info field.
67+
- Allergens are safety-critical — always extract and flag prominently.
68+
69+
For business cards:
70+
- Extract: person's name, job title, company, phone number(s), email, \
71+
website, physical address. Populate the contact_info field.
72+
- Preserve exact formatting of phone numbers and emails.
73+
5074
Text priority (extract all, but rank by importance):
51-
1. Safety-critical: warnings, caution signs, traffic signals, hazard labels.
75+
1. Safety-critical: warnings, caution signs, traffic signals, hazard labels, \
76+
allergens, drug warnings.
5277
2. Actionable: prices, opening hours, directions, instructions, dosage info.
5378
3. Informational: names, titles, descriptions, news headlines.
5479
4. Decorative: brand slogans, decorative quotes, background text.
@@ -90,7 +115,8 @@
90115
),
91116
"text_type": types.Schema(
92117
type=types.Type.STRING,
93-
enum=["menu", "sign", "document", "label", "unknown"],
118+
enum=["menu", "sign", "document", "label", "medicine_label",
119+
"receipt", "food_packaging", "business_card", "unknown"],
94120
description="Classification of the dominant text type.",
95121
),
96122
"items": types.Schema(
@@ -102,6 +128,55 @@
102128
type=types.Type.NUMBER,
103129
description="Confidence score from 0.0 to 1.0.",
104130
),
131+
"medicine_info": types.Schema(
132+
type=types.Type.OBJECT,
133+
nullable=True,
134+
properties={
135+
"drug_name": types.Schema(type=types.Type.STRING, description="Name of the medication."),
136+
"dosage": types.Schema(type=types.Type.STRING, description="Dosage/strength, e.g. '200mg'."),
137+
"frequency": types.Schema(type=types.Type.STRING, description="How often to take, e.g. 'every 6 hours'."),
138+
"warnings": types.Schema(type=types.Type.ARRAY, items=types.Schema(type=types.Type.STRING), description="Warnings and side effects."),
139+
"expiry_date": types.Schema(type=types.Type.STRING, nullable=True, description="Expiration date if visible."),
140+
},
141+
description="Structured medicine label information.",
142+
),
143+
"receipt_info": types.Schema(
144+
type=types.Type.OBJECT,
145+
nullable=True,
146+
properties={
147+
"store_name": types.Schema(type=types.Type.STRING, description="Merchant/store name."),
148+
"items": types.Schema(type=types.Type.ARRAY, items=types.Schema(type=types.Type.STRING), description="Line items with prices."),
149+
"total": types.Schema(type=types.Type.STRING, description="Total amount."),
150+
"payment_method": types.Schema(type=types.Type.STRING, nullable=True, description="Payment method if shown."),
151+
"change": types.Schema(type=types.Type.STRING, nullable=True, description="Change given if shown."),
152+
},
153+
description="Structured receipt information.",
154+
),
155+
"nutrition_info": types.Schema(
156+
type=types.Type.OBJECT,
157+
nullable=True,
158+
properties={
159+
"product_name": types.Schema(type=types.Type.STRING, description="Product name."),
160+
"allergens": types.Schema(type=types.Type.ARRAY, items=types.Schema(type=types.Type.STRING), description="Allergen warnings."),
161+
"calories": types.Schema(type=types.Type.STRING, description="Calories per serving."),
162+
"serving_size": types.Schema(type=types.Type.STRING, description="Serving size."),
163+
"ingredients": types.Schema(type=types.Type.STRING, nullable=True, description="Ingredients list."),
164+
},
165+
description="Structured nutrition/food packaging information.",
166+
),
167+
"contact_info": types.Schema(
168+
type=types.Type.OBJECT,
169+
nullable=True,
170+
properties={
171+
"name": types.Schema(type=types.Type.STRING, description="Person's name."),
172+
"title": types.Schema(type=types.Type.STRING, nullable=True, description="Job title."),
173+
"company": types.Schema(type=types.Type.STRING, nullable=True, description="Company name."),
174+
"phone": types.Schema(type=types.Type.STRING, nullable=True, description="Phone number(s)."),
175+
"email": types.Schema(type=types.Type.STRING, nullable=True, description="Email address."),
176+
"address": types.Schema(type=types.Type.STRING, nullable=True, description="Physical address."),
177+
},
178+
description="Structured business card / contact information.",
179+
),
105180
},
106181
required=["text", "text_type", "items", "confidence"],
107182
)

agents/orchestrator.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
convert_to_plus_code,
2222
extract_text_from_camera,
2323
get_accessibility_info,
24+
get_emergency_help,
2425
get_location_info,
2526
get_walking_directions,
2627
google_search,
@@ -183,13 +184,28 @@ def forget_recent_memory(minutes: int = 30) -> dict:
183184
184185
**"Is this area accessible?"** → ``get_accessibility_info`` (tactile paving, ramps, signals)
185186
187+
**"Help!" / "Emergency" / "I need help" / "Call 911"** → ``get_emergency_help`` — \
188+
INTERRUPT priority, always override. Provide emergency number, nearest services, \
189+
and location code immediately. Do not hesitate or ask clarifying questions first.
190+
186191
**General knowledge / fact check** → ``google_search``
187192
188193
**Navigation results** include slope warnings (>8% = ADA threshold) and accessibility info.
189194
190195
### Automatic Injections (No Tool Call Needed)
191196
- ``[VISION ANALYSIS]``: Scene understanding — integrate naturally into speech.
197+
- When currency is detected, read the denomination clearly and naturally: \
198+
"That looks like a twenty-dollar bill" (not "Currency detected: USD 20").
199+
- When emotions are detected, weave them in warmly: "The person across from \
200+
you seems to be smiling" (not "Expression: smiling detected").
201+
- When light conditions change, mention it during transitions: "You've stepped \
202+
into a brightly lit space" or "It's quite dim in here".
203+
- When motion_direction is "approaching" for vehicles, alert with urgency.
192204
- ``[OCR RESULT]``: Safety-critical text detected — read aloud when relevant.
205+
- For medication labels: read drug name, dosage, and warnings clearly.
206+
- For receipts: summarize store, item count, and total.
207+
- For food packaging: always read allergens first (safety-critical), then calories.
208+
- For business cards: read name, title, and contact info.
193209
- ``[FACE ID]``: Recognized faces — weave names naturally: "David is sitting across \
194210
from you" (never "Face recognized: David").
195211
Do NOT mention analysis systems by name.
@@ -234,6 +250,7 @@ def forget_recent_memory(minutes: int = 30) -> dict:
234250
"get_accessibility_info": get_accessibility_info,
235251
"maps_query": maps_query,
236252
"extract_text_from_camera": extract_text_from_camera,
253+
"get_emergency_help": get_emergency_help,
237254
"preload_memory": preload_memory,
238255
"remember_entity": remember_entity,
239256
"what_do_you_remember": what_do_you_remember,

agents/vision_agent.py

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,16 @@
5252
5353
ONLY report immediate physical hazards visible in this image:
5454
- Stairs, steps, drop-offs, curbs
55-
- Approaching vehicles or cyclists
55+
- Approaching vehicles or cyclists — flag with urgency, note direction of \
56+
approach (e.g., "vehicle approaching from 3 o'clock")
5657
- Obstacles in the walking path (poles, furniture, construction)
5758
- Wet/slippery surfaces, uneven ground
5859
- Low-hanging objects at head height (tree branches, awnings, scaffolding)
5960
- Overhead obstructions that a cane would not detect
61+
- Currency or payment-related items in the walking path (dropped money, wallet)
62+
63+
For moving objects (vehicles, cyclists, people), indicate motion direction \
64+
using the motion_direction field: "approaching", "receding", or "crossing".
6065
6166
Format: "[hazard] at [clock position], [distance estimate]"
6267
Examples: "Step down at 12 o'clock, 1 meter" / "Low branch at 11 o'clock, head height"
@@ -79,6 +84,12 @@
7984
3. Signage and wayfinding: readable signs, door numbers, directions.
8085
4. People: approximate count and proximity (not appearance descriptions).
8186
5. Key landmarks: counters, elevators, escalators, seating areas.
87+
6. Currency: identify any visible banknotes or coins. Report denomination \
88+
and count. If multiple bills/coins, estimate the total.
89+
7. Emotions: note obvious facial expressions (smiling, frowning, waving, \
90+
concerned) for people within 3 meters. Use the emotions field.
91+
92+
For moving objects, set motion_direction: "approaching", "receding", or "crossing".
8293
8394
Write the scene description as a natural spoken paragraph, not a bulleted list.
8495
Use clock positions for spatial references.
@@ -96,12 +107,22 @@
96107
Provide a comprehensive description as a natural, flowing narrative:
97108
1. SAFETY: Any hazards (always first priority).
98109
2. Spatial layout: full description of the space, dimensions, and organization.
99-
3. People: count, approximate positions, expressions, activities.
110+
3. People: count, approximate positions, detailed expressions and activities. \
111+
Use the emotions field for each person: note expression (smiling, laughing, \
112+
focused, concerned, puzzled) and what they appear to be doing.
100113
4. Text: all readable text (signs, menus, labels, screens).
101114
5. Objects: notable items, their positions and material qualities.
102-
6. Atmosphere: lighting quality (warm, fluorescent, natural), ambient energy \
103-
(quiet, bustling, peaceful), textures and surfaces, sounds you might infer \
104-
(traffic hum, conversation murmur, birdsong).
115+
6. Atmosphere and light: report the light_level field — describe lighting \
116+
quality in detail (warm incandescent, cool fluorescent, bright daylight, \
117+
dim indoor, dark, specific light sources like desk lamps or overhead fixtures). \
118+
Note ambient energy (quiet, bustling, peaceful), textures and surfaces, \
119+
sounds you might infer (traffic hum, conversation murmur, birdsong).
120+
7. Currency: identify any visible banknotes or coins with denomination, \
121+
country of origin if identifiable, and count. Use the currency_detected field.
122+
8. QR codes / barcodes: note presence and describe any readable content \
123+
or context clues about what the code links to.
124+
125+
For moving objects, set motion_direction: "approaching", "receding", or "crossing".
105126
106127
Use sensory language: "warm light filtering through large windows" rather than \
107128
"well-lit room". Describe textures, temperatures, and spatial feelings.
@@ -191,10 +212,44 @@
191212
type=types.Type.STRING,
192213
description="safety, navigation, interaction, or background.",
193214
),
215+
"motion_direction": types.Schema(
216+
type=types.Type.STRING,
217+
nullable=True,
218+
description="For moving objects: approaching, receding, or crossing.",
219+
),
194220
},
195221
),
196222
description="Structured spatial map of objects with clock positions and distances.",
197223
),
224+
"light_level": types.Schema(
225+
type=types.Type.STRING,
226+
nullable=True,
227+
description="Lighting quality: bright_daylight, overcast, dim_indoor, dark, fluorescent, warm_ambient, etc.",
228+
),
229+
"emotions": types.Schema(
230+
type=types.Type.ARRAY,
231+
nullable=True,
232+
items=types.Schema(
233+
type=types.Type.OBJECT,
234+
properties={
235+
"person_position": types.Schema(
236+
type=types.Type.STRING,
237+
description="Clock position and distance of the person, e.g. '2 o'clock, 2m'.",
238+
),
239+
"expression": types.Schema(
240+
type=types.Type.STRING,
241+
description="Observed facial expression: smiling, frowning, laughing, concerned, neutral, etc.",
242+
),
243+
},
244+
),
245+
description="Detected facial expressions for nearby people.",
246+
),
247+
"currency_detected": types.Schema(
248+
type=types.Type.ARRAY,
249+
nullable=True,
250+
items=types.Schema(type=types.Type.STRING),
251+
description="Identified currency: denomination and count, e.g. 'US $20 bill', '2 euro coins'.",
252+
),
198253
},
199254
required=[
200255
"safety_warnings",

context/spatial_change_detector.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616
class SpatialChange:
1717
"""A detected change between consecutive vision frames."""
1818

19-
change_type: str # "new_person_approaching", "layout_change", "hazard_appeared", "person_left"
19+
change_type: str # "new_person_approaching", "layout_change", "hazard_appeared", "person_left", "vehicle_approaching", "sudden_obstacle", "person_very_close"
2020
severity: str # "safety", "significant", "minor"
2121
details: str
22+
urgency: str = "awareness" # "immediate" (within_reach), "approaching" (1-2m), "awareness" (3m+)
2223

2324

2425
class SpatialChangeDetector:
@@ -100,6 +101,81 @@ def detect(
100101
details=f"Scene composition changed ({len(prev_labels)}→{len(curr_labels)} objects, {overlap:.0%} overlap)",
101102
))
102103

104+
# Rule 5: Approaching vehicle — distance decreased between frames
105+
prev_vehicles = _extract_objects_by_label(previous.get("spatial_objects", []), "vehicle")
106+
curr_vehicles = _extract_objects_by_label(current.get("spatial_objects", []), "vehicle")
107+
for v in curr_vehicles:
108+
dist = v.get("distance_estimate", "")
109+
motion = v.get("motion_direction", "")
110+
clock = v.get("clock_position", "")
111+
is_close = dist in ("within_reach", "1m", "2m")
112+
is_approaching = motion == "approaching"
113+
# Check if vehicle was previously farther away
114+
was_farther = not any(
115+
pv.get("distance_estimate", "") in ("within_reach", "1m", "2m")
116+
for pv in prev_vehicles
117+
) if prev_vehicles else False
118+
if is_close and (is_approaching or was_farther):
119+
urgency = "immediate" if dist == "within_reach" else "approaching"
120+
clock_str = f" from {clock} o'clock" if clock else ""
121+
changes.append(SpatialChange(
122+
change_type="vehicle_approaching",
123+
severity="safety",
124+
details=f"Vehicle approaching{clock_str}, {dist}",
125+
urgency=urgency,
126+
))
127+
128+
# Rule 6: Sudden obstacle in path — new object at 11-1 o'clock within 2m
129+
prev_obj_keys = {
130+
(o.get("label", ""), o.get("clock_position"))
131+
for o in previous.get("spatial_objects", [])
132+
if isinstance(o, dict)
133+
}
134+
for obj in current.get("spatial_objects", []):
135+
if not isinstance(obj, dict):
136+
continue
137+
label = obj.get("label", "")
138+
clock = obj.get("clock_position")
139+
dist = obj.get("distance_estimate", "")
140+
salience = obj.get("salience", "")
141+
obj_key = (label, clock)
142+
if (
143+
obj_key not in prev_obj_keys
144+
and clock in (11, 12, 1)
145+
and dist in ("within_reach", "1m", "2m")
146+
and salience in ("safety", "navigation")
147+
and label not in ("person",) # people handled by Rule 2/7
148+
):
149+
urgency = "immediate" if dist == "within_reach" else "approaching"
150+
changes.append(SpatialChange(
151+
change_type="sudden_obstacle",
152+
severity="safety",
153+
details=f"{label} appeared at {clock} o'clock, {dist}",
154+
urgency=urgency,
155+
))
156+
157+
# Rule 7: Person very close — person at within_reach distance
158+
for obj in current.get("spatial_objects", []):
159+
if not isinstance(obj, dict):
160+
continue
161+
if obj.get("label") == "person" and obj.get("distance_estimate") == "within_reach":
162+
clock = obj.get("clock_position", "")
163+
# Only flag if this person wasn't already within_reach in previous frame
164+
was_close = any(
165+
isinstance(po, dict)
166+
and po.get("label") == "person"
167+
and po.get("distance_estimate") == "within_reach"
168+
for po in previous.get("spatial_objects", [])
169+
)
170+
if not was_close:
171+
clock_str = f" at {clock} o'clock" if clock else ""
172+
changes.append(SpatialChange(
173+
change_type="person_very_close",
174+
severity="safety",
175+
details=f"Person very close{clock_str}",
176+
urgency="immediate",
177+
))
178+
103179
# Sort by severity: safety > significant > minor
104180
severity_order = {"safety": 0, "significant": 1, "minor": 2}
105181
changes.sort(key=lambda c: severity_order.get(c.severity, 3))
@@ -113,3 +189,11 @@ def _extract_labels(spatial_objects: list) -> set[str]:
113189
if isinstance(obj, dict) and obj.get("label"):
114190
labels.add(obj["label"])
115191
return labels
192+
193+
194+
def _extract_objects_by_label(spatial_objects: list, label: str) -> list[dict]:
195+
"""Extract all spatial objects matching a given label."""
196+
return [
197+
obj for obj in spatial_objects
198+
if isinstance(obj, dict) and obj.get("label") == label
199+
]

0 commit comments

Comments
 (0)