You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
fix: capture screenshot on browser parse errors and improve click_text docs
- Catch parse_action errors in execute() and auto-capture screenshot
instead of propagating error without visual context
- Clarify click_text requires 'text' param (not 'selector') in tool
description and parameter schema
- Strengthen continuation nudge: bool → counter (max 3) with more
aggressive message to prevent LLM describing instead of acting
Copy file name to clipboard | Expand all lines: crates/cratos-tools/src/browser/tool.rs
+38 -7 | Lines changed: 38 additions & 7 deletions
Original file line number
Diff line number
Diff line change
@@ -39,9 +39,11 @@ impl BrowserTool {
39
39
"Control the user's real web browser. Key actions: \
40
40
'search' -- search a known site (REQUIRES site + query, e.g. site=\"naver_shopping\" query=\"keyword\"). Sites: naver, naver_shopping, coupang, google, youtube, amazon, google_maps. \
41
41
'navigate' -- go to a specific URL (REQUIRES url, NOT site). \
42
-
'click_text' -- click by visible text (no CSS selector needed). \
IMPORTANT: To search a site use 'search' with site+query, NOT 'navigate'. \
46
+
IMPORTANT: click_text needs 'text' param, click needs 'selector' param — do NOT mix them up.",
45
47
)
46
48
.with_category(ToolCategory::External)
47
49
.with_risk_level(RiskLevel::Medium)
@@ -85,11 +87,11 @@ impl BrowserTool {
85
87
},
86
88
"selector":{
87
89
"type":"string",
88
-
"description":"CSS selector for element actions (required for click, type, fill; optional for get_text — omit to read entire page)"
90
+
"description":"CSS selector (for click, type, fill, hover, check). NOT for click_text — use 'text' param instead. Optional for get_text (omit to read entire page)."
89
91
},
90
92
"text":{
91
93
"type":"string",
92
-
"description":"Text to type (for type action), or visible text to find and click (for click_text action)"
94
+
"description":"For click_text: the visible text to find and click (e.g. \"장바구니\", \"구매하기\"). For type: the text to type into the element."
93
95
},
94
96
"index":{
95
97
"type":"integer",
@@ -419,8 +421,37 @@ impl Tool for BrowserTool {
419
421
));
420
422
}
421
423
422
-
// Parse the action
423
-
let action = self.parse_action(&input)?;
424
+
// Parse the action — on failure, capture a screenshot so the LLM can
425
+
// visually inspect the current page state alongside the error message.
426
+
let action = match self.parse_action(&input) {
427
+
Ok(a) => a,
428
+
Err(e) => {
429
+
let mut screenshot: Option<String> = None;
430
+
if let Ok(ss_result) = self
431
+
.dispatch(BrowserAction::Screenshot{
432
+
path:None,
433
+
full_page:false,
434
+
selector:None,
435
+
})
436
+
.await
437
+
{
438
+
if let Some(ss) = ss_result.screenshot {
439
+
info!("Auto-captured screenshot for browser parse error");
0 commit comments