Skip to content

Commit 4d9a5e4

Browse files
committed
fix: capture screenshot on browser parse errors and improve click_text docs
- Catch parse_action errors in execute() and auto-capture a screenshot instead of propagating the error without visual context
- Clarify that click_text requires the 'text' param (not 'selector') in the tool description and parameter schema
- Strengthen the continuation nudge: bool → counter (max 3) with a more aggressive message to prevent the LLM from describing instead of acting
1 parent 2f99f54 commit 4d9a5e4

File tree

2 files changed

+46
-14
lines changed

2 files changed

+46
-14
lines changed

crates/cratos-core/src/orchestrator/process.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ impl Orchestrator {
198198
let mut consecutive_all_fail = 0_usize;
199199
let mut total_failure_count = 0_usize;
200200
let mut fallback_sticky = false; // Once fallback is used, stick with it
201-
let mut continuation_nudged = false; // Nudge LLM to continue once if it stops mid-task
201+
let mut continuation_nudge_count = 0_usize; // Nudge LLM to continue if it stops mid-task (max 3)
202202

203203
// Messages accumulate tool call history across iterations
204204
let mut messages = messages;
@@ -447,24 +447,25 @@ impl Orchestrator {
447447
}
448448

449449
// Nudge: LLM returned text-only mid-task (tools were used but task may be incomplete).
450-
// Push it once to re-check if all steps are done before accepting the response.
451-
if !continuation_nudged
450+
// Push up to 3 times to re-check if all steps are done before accepting.
451+
if continuation_nudge_count < 3
452452
&& !tool_call_records.is_empty()
453453
&& iteration > 1
454454
&& iteration < self.config.max_iterations - 1
455455
&& !content_text.trim().is_empty()
456456
{
457-
continuation_nudged = true;
457+
continuation_nudge_count += 1;
458458
warn!(
459459
execution_id = %execution_id,
460460
iteration = iteration,
461+
nudge = continuation_nudge_count,
461462
"Model returned text-only mid-task, nudging to continue"
462463
);
463464
messages.push(Message::assistant(content_text));
464465
messages.push(Message::user(
465-
"The task may not be fully complete. Re-read the user's original request and check \
466-
if any steps remain. If so, continue using tools to finish. \
467-
Only report the final result once every step has been completed.",
466+
"The task is NOT complete yet. You said what you would do but did NOT actually do it. \
467+
Use the browser tool RIGHT NOW to perform the action. Do NOT just describe what to do — \
468+
call the tool. Continue until the user's original request is fully done.",
468469
));
469470
continue;
470471
}

crates/cratos-tools/src/browser/tool.rs

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,11 @@ impl BrowserTool {
3939
"Control the user's real web browser. Key actions: \
4040
'search' -- search a known site (REQUIRES site + query, e.g. site=\"naver_shopping\" query=\"keyword\"). Sites: naver, naver_shopping, coupang, google, youtube, amazon, google_maps. \
4141
'navigate' -- go to a specific URL (REQUIRES url, NOT site). \
42-
'click_text' -- click by visible text (no CSS selector needed). \
43-
Other: get_tabs, click, type, fill, screenshot, get_text, get_html, evaluate, scroll, go_back, reload. \
44-
IMPORTANT: To search a site use 'search' with site+query, NOT 'navigate'.",
42+
'click_text' -- click element by visible text (REQUIRES 'text' param, e.g. text=\"장바구니\"). Do NOT use 'selector' for click_text. \
43+
'click' -- click element by CSS selector (REQUIRES 'selector' param). \
44+
Other: get_tabs, type, fill, screenshot, get_text, get_html, evaluate, scroll, go_back, reload. \
45+
IMPORTANT: To search a site use 'search' with site+query, NOT 'navigate'. \
46+
IMPORTANT: click_text needs 'text' param, click needs 'selector' param — do NOT mix them up.",
4547
)
4648
.with_category(ToolCategory::External)
4749
.with_risk_level(RiskLevel::Medium)
@@ -85,11 +87,11 @@ impl BrowserTool {
8587
},
8688
"selector": {
8789
"type": "string",
88-
"description": "CSS selector for element actions (required for click, type, fill; optional for get_text — omit to read entire page)"
90+
"description": "CSS selector (for click, type, fill, hover, check). NOT for click_text — use 'text' param instead. Optional for get_text (omit to read entire page)."
8991
},
9092
"text": {
9193
"type": "string",
92-
"description": "Text to type (for type action), or visible text to find and click (for click_text action)"
94+
"description": "For click_text: the visible text to find and click (e.g. \"장바구니\", \"구매하기\"). For type: the text to type into the element."
9395
},
9496
"index": {
9597
"type": "integer",
@@ -419,8 +421,37 @@ impl Tool for BrowserTool {
419421
));
420422
}
421423

422-
// Parse the action
423-
let action = self.parse_action(&input)?;
424+
// Parse the action — on failure, capture a screenshot so the LLM can
425+
// visually inspect the current page state alongside the error message.
426+
let action = match self.parse_action(&input) {
427+
Ok(a) => a,
428+
Err(e) => {
429+
let mut screenshot: Option<String> = None;
430+
if let Ok(ss_result) = self
431+
.dispatch(BrowserAction::Screenshot {
432+
path: None,
433+
full_page: false,
434+
selector: None,
435+
})
436+
.await
437+
{
438+
if let Some(ss) = ss_result.screenshot {
439+
info!("Auto-captured screenshot for browser parse error");
440+
screenshot = Some(ss);
441+
}
442+
}
443+
let duration = start.elapsed().as_millis() as u64;
444+
return Ok(ToolResult {
445+
success: false,
446+
output: serde_json::json!({
447+
"error": e.to_string(),
448+
"screenshot": screenshot
449+
}),
450+
error: Some(e.to_string()),
451+
duration_ms: duration,
452+
});
453+
}
454+
};
424455
let is_interactive = action.is_interactive();
425456
debug!(action = ?action.name(), "Executing browser action");
426457

0 commit comments

Comments
 (0)