diff --git a/src/apps/desktop/Cargo.toml b/src/apps/desktop/Cargo.toml index 5978b1e4..06c93088 100644 --- a/src/apps/desktop/Cargo.toml +++ b/src/apps/desktop/Cargo.toml @@ -62,3 +62,13 @@ dispatch = "0.2" [target.'cfg(windows)'.dependencies] win32job = { workspace = true } +windows = { version = "0.61.3", features = [ + "Win32_Foundation", + "Win32_System_Com", + "Win32_UI_Accessibility", + "Win32_UI_WindowsAndMessaging", +] } +windows-core = "0.61.2" + +[target.'cfg(target_os = "linux")'.dependencies] +atspi = "0.29" diff --git a/src/apps/desktop/src/api/config_api.rs b/src/apps/desktop/src/api/config_api.rs index b28ab169..a756ce22 100644 --- a/src/apps/desktop/src/api/config_api.rs +++ b/src/apps/desktop/src/api/config_api.rs @@ -271,6 +271,29 @@ pub async fn get_mode_configs(state: State<'_, AppState>) -> Result (u32, u32) { if nw == 0 || nh == 0 { @@ -479,17 +482,24 @@ fn clamp_center_to_native(cx: u32, cy: u32, nw: u32, nh: u32) -> (u32, u32) { (cx, cy) } -/// Top-left and size of the native crop rectangle around `(cx, cy)`, clamped to the bitmap (≤500×500 when the display is large enough). -fn crop_rect_around_point_native(cx: u32, cy: u32, nw: u32, nh: u32) -> (u32, u32, u32, u32) { +/// Top-left and size of the native crop rectangle around `(cx, cy)`, clamped to the bitmap. +/// `half_px` is the distance from center to each edge (see [`clamp_point_crop_half_extent`]). 
+fn crop_rect_around_point_native( + cx: u32, + cy: u32, + nw: u32, + nh: u32, + half_px: u32, +) -> (u32, u32, u32, u32) { let (cx, cy) = clamp_center_to_native(cx, cy, nw, nh); if nw == 0 || nh == 0 { return (0, 0, 1, 1); } - let edge = POINT_CROP_HALF_PX.saturating_mul(2); + let edge = half_px.saturating_mul(2); let tw = edge.min(nw).max(1); let th = edge.min(nh).max(1); - let mut x0 = cx.saturating_sub(POINT_CROP_HALF_PX); - let mut y0 = cy.saturating_sub(POINT_CROP_HALF_PX); + let mut x0 = cx.saturating_sub(half_px); + let mut y0 = cy.saturating_sub(half_px); if x0.saturating_add(tw) > nw { x0 = nw.saturating_sub(tw); } @@ -798,6 +808,122 @@ impl DesktopComputerUseHost { } } + /// Best-effort foreground app + pointer; safe to call from `spawn_blocking`. + fn collect_session_snapshot_sync() -> ComputerUseSessionSnapshot { + #[cfg(target_os = "macos")] + { + return Self::session_snapshot_macos(); + } + #[cfg(target_os = "windows")] + { + return Self::session_snapshot_windows(); + } + #[cfg(target_os = "linux")] + { + return Self::session_snapshot_linux(); + } + #[cfg(not(any( + target_os = "macos", + target_os = "windows", + target_os = "linux" + )))] + { + ComputerUseSessionSnapshot::default() + } + } + + #[cfg(target_os = "macos")] + fn session_snapshot_macos() -> ComputerUseSessionSnapshot { + let pointer = macos::quartz_mouse_location().ok().map(|(x, y)| ComputerUsePointerGlobal { x, y }); + let foreground = Self::macos_foreground_application(); + ComputerUseSessionSnapshot { + foreground_application: foreground, + pointer_global: pointer, + } + } + + #[cfg(target_os = "macos")] + fn macos_foreground_application() -> Option { + let out = std::process::Command::new("/usr/bin/osascript") + .args(["-e", r#"tell application "System Events" + set p to first process whose frontmost is true + return (unix id of p as text) & "|" & (name of p) & "|" & (try (bundle identifier of p as text) on error "" end try) +end tell"#]) + .output() + .ok()?; + if 
/// Windows snapshot: global cursor position plus the foreground window's title and owning PID.
///
/// Every field is best-effort: a failed Win32 call yields `None` for that piece of
/// information instead of a placeholder value.
#[cfg(target_os = "windows")]
fn session_snapshot_windows() -> ComputerUseSessionSnapshot {
    use windows::Win32::Foundation::POINT;
    use windows::Win32::UI::WindowsAndMessaging::{
        GetCursorPos, GetForegroundWindow, GetWindowTextW, GetWindowThreadProcessId,
    };

    let mut pt = POINT::default();
    // SAFETY: `pt` is a valid, writable POINT for the duration of the call.
    let pointer = unsafe { GetCursorPos(&mut pt) }
        .is_ok()
        .then(|| ComputerUsePointerGlobal {
            x: pt.x as f64,
            y: pt.y as f64,
        });

    // SAFETY: GetForegroundWindow takes no arguments and only returns a handle.
    let hwnd = unsafe { GetForegroundWindow() };
    let foreground = if hwnd.is_invalid() {
        None
    } else {
        let mut pid: u32 = 0;
        let mut buf = [0u16; 512];
        // SAFETY: `hwnd` was just returned by the system; `pid` and `buf` are valid
        // writable locations that outlive both calls.
        let copied = unsafe {
            GetWindowThreadProcessId(hwnd, Some(&mut pid));
            GetWindowTextW(hwnd, &mut buf)
        };
        // A non-positive count means "no title"; never reinterpret it as a huge usize.
        let len = usize::try_from(copied).unwrap_or(0).min(buf.len());
        let title = String::from_utf16_lossy(&buf[..len]);
        Some(ComputerUseForegroundApplication {
            name: (!title.is_empty()).then_some(title),
            bundle_id: None,
            // `pid` stays 0 when the lookup fails; report that as unknown rather than PID 0,
            // and refuse to wrap values that do not fit in i32.
            process_id: i32::try_from(pid).ok().filter(|&p| p != 0),
        })
    };

    ComputerUseSessionSnapshot {
        foreground_application: foreground,
        pointer_global: pointer,
    }
}
+ ComputerUseSessionSnapshot::default() + } + fn refinement_from_shot(shot: &ComputerScreenshot) -> ComputerUseScreenshotRefinement { use ComputerUseScreenshotRefinement as R; if let Some(c) = shot.screenshot_crop_center { @@ -1072,8 +1198,10 @@ impl DesktopComputerUseHost { quadrant_navigation_click_ready, persist_nav_focus, ) = if let Some(center) = params.crop_center { + let half = clamp_point_crop_half_extent(params.point_crop_half_extent_native); let (ccx, ccy) = clamp_center_to_native(center.x, center.y, native_w, native_h); - let (x0, y0, tw, th) = crop_rect_around_point_native(center.x, center.y, native_w, native_h); + let (x0, y0, tw, th) = + crop_rect_around_point_native(center.x, center.y, native_w, native_h, half); let cropped = Self::crop_rgb(&full_frame, x0, y0, tw, th)?; let ox = origin_x + x0 as i32; let oy = origin_y + y0 as i32; @@ -1235,8 +1363,9 @@ impl DesktopComputerUseHost { #[cfg(target_os = "macos")] let macos_map_geo = if let Some(center) = params.crop_center { + let half = clamp_point_crop_half_extent(params.point_crop_half_extent_native); let (x0, y0, _, _) = - crop_rect_around_point_native(center.x, center.y, native_w, native_h); + crop_rect_around_point_native(center.x, center.y, native_w, native_h, half); full_geo.with_crop(x0, y0) } else { full_geo.with_crop(ruler_origin_native_x, ruler_origin_native_y) @@ -1289,6 +1418,10 @@ impl DesktopComputerUseHost { let jpeg_bytes = Self::encode_jpeg(&frame, JPEG_QUALITY)?; + let point_crop_half_extent_native = params.crop_center.map(|_| { + clamp_point_crop_half_extent(params.point_crop_half_extent_native) + }); + let shot = ComputerScreenshot { bytes: jpeg_bytes, mime_type: "image/jpeg".to_string(), @@ -1302,6 +1435,7 @@ impl DesktopComputerUseHost { pointer_image_x, pointer_image_y, screenshot_crop_center, + point_crop_half_extent_native, navigation_native_rect: shot_navigation_rect, quadrant_navigation_click_ready, image_content_rect: Some(image_content_rect), @@ -1586,6 +1720,44 @@ impl 
ComputerUseHost for DesktopComputerUseHost { .and_then(|g| *g) } + async fn locate_ui_element_screen_center( + &self, + query: UiElementLocateQuery, + ) -> BitFunResult { + Self::ensure_input_automation_allowed()?; + #[cfg(target_os = "macos")] + { + return tokio::task::spawn_blocking(move || { + crate::computer_use::macos_ax_ui::locate_ui_element_center(&query) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))?; + } + #[cfg(target_os = "windows")] + { + return tokio::task::spawn_blocking(move || { + crate::computer_use::windows_ax_ui::locate_ui_element_center(&query) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))?; + } + #[cfg(target_os = "linux")] + { + return crate::computer_use::linux_ax_ui::locate_ui_element_center(query).await; + } + #[cfg(not(any( + target_os = "macos", + target_os = "windows", + target_os = "linux" + )))] + { + Err(BitFunError::tool( + "Native UI element (accessibility) lookup is not available on this platform." + .to_string(), + )) + } + } + fn map_image_coords_to_pointer_f64(&self, x: i32, y: i32) -> BitFunResult<(f64, f64)> { let guard = self .last_pointer_map @@ -1669,7 +1841,7 @@ impl ComputerUseHost for DesktopComputerUseHost { .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; let Some(map) = *guard else { return Err(BitFunError::tool( - "Run action screenshot first: on macOS, pointer_move_relative / pointer_nudge convert pixel deltas using the last capture scale." + "Run action screenshot first: on macOS, pointer_move_relative / ComputerUseMouseStep convert pixel deltas using the last capture scale." 
.to_string(), )); }; @@ -1773,6 +1945,7 @@ impl ComputerUseHost for DesktopComputerUseHost { .iter() .map(|s| Self::map_key(s)) .collect::>()?; + #[cfg(target_os = "macos")] let chord_has_modifier = keys_for_job.iter().any(|s| { matches!( s.to_lowercase().as_str(), @@ -1830,6 +2003,12 @@ impl ComputerUseHost for DesktopComputerUseHost { Ok(()) } + async fn computer_use_session_snapshot(&self) -> ComputerUseSessionSnapshot { + tokio::task::spawn_blocking(Self::collect_session_snapshot_sync) + .await + .unwrap_or_else(|_| ComputerUseSessionSnapshot::default()) + } + fn computer_use_after_screenshot(&self) { if let Ok(mut g) = self.click_needs_fresh_screenshot.lock() { *g = false; @@ -1862,7 +2041,7 @@ impl ComputerUseHost for DesktopComputerUseHost { }) => {} _ => { return Err(BitFunError::tool( - "Click refused: use a **fine** screenshot basis — either a **~500×500 point crop** (`screenshot_crop_center_x` / `y` in full-display native pixels) **or** keep drilling with `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` is true in the tool result, then `mouse_move` + `click`. Full-screen alone is not enough.".to_string(), + "Click refused: use a **fine** screenshot basis — either a **~500×500 point crop** (`screenshot_crop_center_x` / `y` in full-display native pixels) **or** keep drilling with `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` is true in the tool result, then `ComputerUseMousePrecise` / `ComputerUseMouseStep` + `click`. Full-screen alone is not enough.".to_string(), )); } } diff --git a/src/apps/desktop/src/computer_use/linux_ax_ui.rs b/src/apps/desktop/src/computer_use/linux_ax_ui.rs new file mode 100644 index 00000000..400390f9 --- /dev/null +++ b/src/apps/desktop/src/computer_use/linux_ax_ui.rs @@ -0,0 +1,121 @@ +//! Linux AT-SPI2 (via `atspi`) BFS over accessible objects for stable screen coordinates. +//! +//! 
Requires session D-Bus, `at-spi2` registry, and apps exposing AT-SPI (typical on GNOME/KDE with a11y). + +use crate::computer_use::ui_locate_common; +use atspi::connection::P2P; +use atspi::AccessibilityConnection; +use atspi::CoordType; +use atspi::proxy::accessible::AccessibleProxy; +use atspi::proxy::proxy_ext::ProxyExt; +use bitfun_core::agentic::tools::computer_use_host::{UiElementLocateQuery, UiElementLocateResult}; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use std::collections::VecDeque; + +async fn component_extents_screen(acc: &AccessibleProxy<'_>) -> Option<(i32, i32, i32, i32)> { + let proxies = acc.proxies().await.ok()?; + let comp = proxies.component().await.ok()?; + comp.get_extents(CoordType::Screen).await.ok() +} + +async fn role_match_string(acc: &AccessibleProxy<'_>) -> String { + match acc.get_role_name().await { + Ok(s) if !s.is_empty() => s, + _ => match acc.get_role().await { + Ok(r) => format!("{:?}", r), + Err(_) => String::new(), + }, + } +} + +/// Registry application roots → BFS until first match with non-empty screen extents. 
+pub async fn locate_ui_element_center(query: UiElementLocateQuery) -> BitFunResult { + ui_locate_common::validate_query(&query)?; + let max_depth = query.max_depth.unwrap_or(48).clamp(1, 200); + let max_nodes = 12_000usize; + + let conn = AccessibilityConnection::new() + .await + .map_err(|e| BitFunError::tool(format!("AT-SPI connection: {}.", e)))?; + + let registry_root = conn + .root_accessible_on_registry() + .await + .map_err(|e| BitFunError::tool(format!("AT-SPI registry root: {}.", e)))?; + + let children = registry_root + .get_children() + .await + .map_err(|e| BitFunError::tool(format!("AT-SPI get_children (registry): {}.", e)))?; + + let mut queue = VecDeque::new(); + for c in children { + queue.push_back((c, 0u32)); + } + + let mut visited = 0usize; + + while let Some((obj_ref, depth)) = queue.pop_front() { + if depth > max_depth { + continue; + } + visited += 1; + if visited > max_nodes { + return Err(BitFunError::tool( + "AT-SPI search limit reached; narrow title/role/identifier filters.".to_string(), + )); + } + + let acc = match conn.object_as_accessible(&obj_ref).await { + Ok(a) => a, + Err(_) => continue, + }; + + let name = acc.name().await.unwrap_or_default(); + let ident = acc.accessible_id().await.unwrap_or_default(); + let role = role_match_string(&acc).await; + + let matched = ui_locate_common::matches_filters( + &query, + Some(role.as_str()), + Some(name.as_str()), + Some(ident.as_str()), + ); + if matched { + if let Some((x, y, w, h)) = component_extents_screen(&acc).await { + if w > 0 && h > 0 { + let gx = x as f64 + w as f64 / 2.0; + let gy = y as f64 + h as f64 / 2.0; + let bl = x as f64; + let bt = y as f64; + let bw = w as f64; + let bh = h as f64; + return ui_locate_common::ok_result( + gx, + gy, + bl, + bt, + bw, + bh, + role, + if name.is_empty() { None } else { Some(name) }, + if ident.is_empty() { None } else { Some(ident) }, + ); + } + } + } + + let ch = match acc.get_children().await { + Ok(c) => c, + Err(_) => continue, + }; + 
for child in ch { + queue.push_back((child, depth + 1)); + } + } + + Err(BitFunError::tool( + "No AT-SPI accessible matched the query (try different substrings, enable desktop accessibility services, or use ComputerUse screenshot). Locate uses the same AT-SPI accessibility session as other automation." + .to_string(), + )) +} diff --git a/src/apps/desktop/src/computer_use/macos_ax_ui.rs b/src/apps/desktop/src/computer_use/macos_ax_ui.rs new file mode 100644 index 00000000..d1031724 --- /dev/null +++ b/src/apps/desktop/src/computer_use/macos_ax_ui.rs @@ -0,0 +1,254 @@ +//! macOS Accessibility (AX) tree search for stable UI centers (native “DOM”). +//! +//! Coordinates match CoreGraphics global space used by [`crate::computer_use::DesktopComputerUseHost`]. + +use crate::computer_use::ui_locate_common; +use bitfun_core::agentic::tools::computer_use_host::{UiElementLocateQuery, UiElementLocateResult}; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use core_foundation::array::{CFArray, CFArrayRef}; +use core_foundation::base::{CFTypeRef, TCFType}; +use core_foundation::string::{CFString, CFStringRef}; +use core_graphics::geometry::{CGPoint, CGSize}; +use std::collections::VecDeque; +use std::ffi::c_void; + +type AXUIElementRef = *const c_void; +type AXValueRef = *const c_void; + +#[link(name = "ApplicationServices", kind = "framework")] +unsafe extern "C" { + fn AXUIElementCreateApplication(pid: i32) -> AXUIElementRef; + fn AXUIElementCopyAttributeValue( + element: AXUIElementRef, + attribute: CFStringRef, + value: *mut CFTypeRef, + ) -> i32; + fn AXValueGetType(value: AXValueRef) -> u32; + fn AXValueGetValue(value: AXValueRef, the_type: u32, ptr: *mut c_void) -> bool; +} + +#[link(name = "CoreFoundation", kind = "framework")] +unsafe extern "C" { + fn CFRetain(cf: CFTypeRef) -> CFTypeRef; +} + +const K_AX_VALUE_CGPOINT: u32 = 1; +const K_AX_VALUE_CGSIZE: u32 = 2; + +fn frontmost_pid() -> BitFunResult { + let out = 
std::process::Command::new("/usr/bin/osascript") + .args([ + "-e", + "tell application \"System Events\" to get unix id of first process whose frontmost is true", + ]) + .output() + .map_err(|e| BitFunError::tool(format!("osascript spawn: {}", e)))?; + if !out.status.success() { + return Err(BitFunError::tool(format!( + "osascript failed: {}", + String::from_utf8_lossy(&out.stderr) + ))); + } + let s = String::from_utf8_lossy(&out.stdout); + s.trim() + .parse::() + .map_err(|_| BitFunError::tool("Could not parse frontmost process id.".to_string())) +} + +unsafe fn ax_release(v: CFTypeRef) { + if !v.is_null() { + core_foundation::base::CFRelease(v); + } +} + +unsafe fn ax_copy_attr(elem: AXUIElementRef, key: &str) -> Option { + let mut val: CFTypeRef = std::ptr::null(); + let k = CFString::new(key); + let st = AXUIElementCopyAttributeValue(elem, k.as_concrete_TypeRef(), &mut val); + if st != 0 || val.is_null() { + if !val.is_null() { + ax_release(val); + } + return None; + } + Some(val) +} + +unsafe fn cfstring_to_string(cf: CFTypeRef) -> Option { + if cf.is_null() { + return None; + } + let s = CFString::wrap_under_get_rule(cf as CFStringRef); + Some(s.to_string()) +} + +unsafe fn ax_value_to_point(v: CFTypeRef) -> Option { + let v = v as AXValueRef; + let t = AXValueGetType(v); + if t != K_AX_VALUE_CGPOINT { + return None; + } + let mut pt = CGPoint { x: 0.0, y: 0.0 }; + if !AXValueGetValue(v, K_AX_VALUE_CGPOINT, &mut pt as *mut _ as *mut c_void) { + return None; + } + Some(pt) +} + +unsafe fn ax_value_to_size(v: CFTypeRef) -> Option { + let v = v as AXValueRef; + let t = AXValueGetType(v); + if t != K_AX_VALUE_CGSIZE { + return None; + } + let mut sz = CGSize { + width: 0.0, + height: 0.0, + }; + if !AXValueGetValue(v, K_AX_VALUE_CGSIZE, &mut sz as *mut _ as *mut c_void) { + return None; + } + Some(sz) +} + +unsafe fn read_role_title_id(elem: AXUIElementRef) -> (Option, Option, Option) { + let role = ax_copy_attr(elem, "AXRole").and_then(|v| { + let s = 
cfstring_to_string(v); + ax_release(v); + s + }); + let title = ax_copy_attr(elem, "AXTitle").and_then(|v| { + let s = cfstring_to_string(v); + ax_release(v); + s + }); + let ident = ax_copy_attr(elem, "AXIdentifier").and_then(|v| { + let s = cfstring_to_string(v); + ax_release(v); + s + }); + (role, title, ident) +} + +/// Global center and axis-aligned bounds from `AXPosition` + `AXSize`. +unsafe fn element_frame_global(elem: AXUIElementRef) -> Option<(f64, f64, f64, f64, f64, f64)> { + let pos = ax_copy_attr(elem, "AXPosition")?; + let size = ax_copy_attr(elem, "AXSize")?; + let pt = ax_value_to_point(pos)?; + let sz = ax_value_to_size(size)?; + ax_release(pos); + ax_release(size); + if sz.width <= 0.0 || sz.height <= 0.0 { + return None; + } + let left = pt.x; + let top = pt.y; + let w = sz.width; + let h = sz.height; + Some((left + w / 2.0, top + h / 2.0, left, top, w, h)) +} + +struct Queued { + ax: AXUIElementRef, + depth: u32, +} + +/// Search the **frontmost** app’s accessibility tree (BFS) for the first element matching filters. +pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult { + ui_locate_common::validate_query(query)?; + let max_depth = query.max_depth.unwrap_or(48).clamp(1, 200); + let pid = frontmost_pid()?; + let root = unsafe { AXUIElementCreateApplication(pid) }; + if root.is_null() { + return Err(BitFunError::tool("AXUIElementCreateApplication returned null.".to_string())); + } + let mut q = VecDeque::new(); + q.push_back(Queued { ax: root, depth: 0 }); + let mut visited = 0usize; + let max_nodes = 12_000usize; + + loop { + let Some(cur) = q.pop_front() else { + return Err(BitFunError::tool( + "No accessibility element matched in the **frontmost** app. Filters default to **AND** (`filter_combine` omitted = `all`): every non-empty field must match the **same** node — e.g. 
`title_contains` + `role_substring` together often fails when the control has a **role** but **empty or different AXTitle** (typical for search fields). Try: **`filter_combine`: `\"any\"`**, or **only** `role_substring` (e.g. `TextField`), or **only** `title_contains`; match UI language; ensure the chat app is focused. Or use **`action: screenshot`**. (If Accessibility were denied, you would see a different error.)" + .to_string(), + )); + }; + if cur.depth > max_depth { + unsafe { + ax_release(cur.ax as CFTypeRef); + } + continue; + } + visited += 1; + if visited > max_nodes { + unsafe { + ax_release(cur.ax as CFTypeRef); + } + while let Some(c) = q.pop_front() { + unsafe { + ax_release(c.ax as CFTypeRef); + } + } + return Err(BitFunError::tool( + "Accessibility search limit reached; narrow title/role/identifier filters." + .to_string(), + )); + } + + let (role_s, title_s, id_s) = unsafe { read_role_title_id(cur.ax) }; + let role_ref = role_s.as_deref(); + let title_ref = title_s.as_deref(); + let id_ref = id_s.as_deref(); + + let matched = ui_locate_common::matches_filters(query, role_ref, title_ref, id_ref); + if matched { + if let Some((gx, gy, bl, bt, bw, bh)) = unsafe { element_frame_global(cur.ax) } { + unsafe { + ax_release(cur.ax as CFTypeRef); + } + return ui_locate_common::ok_result( + gx, + gy, + bl, + bt, + bw, + bh, + role_s.unwrap_or_default(), + title_s, + id_s, + ); + } + } + + let children_ref = unsafe { ax_copy_attr(cur.ax, "AXChildren") }; + let next_depth = cur.depth + 1; + unsafe { + ax_release(cur.ax as CFTypeRef); + } + + let Some(ch) = children_ref else { + continue; + }; + unsafe { + let arr = CFArray::<*const c_void>::wrap_under_create_rule(ch as CFArrayRef); + let n = arr.len(); + for i in 0..n { + let Some(child_ref) = arr.get(i) else { + continue; + }; + let child = *child_ref; + if child.is_null() { + continue; + } + let retained = CFRetain(child as CFTypeRef) as AXUIElementRef; + if !retained.is_null() { + q.push_back(Queued { + ax: 
retained, + depth: next_depth, + }); + } + } + } + } +} diff --git a/src/apps/desktop/src/computer_use/mod.rs b/src/apps/desktop/src/computer_use/mod.rs index 4f7a3f9c..9b50d5a9 100644 --- a/src/apps/desktop/src/computer_use/mod.rs +++ b/src/apps/desktop/src/computer_use/mod.rs @@ -1,5 +1,12 @@ //! Desktop Computer use host (screenshots + enigo). mod desktop_host; +mod ui_locate_common; +#[cfg(target_os = "macos")] +mod macos_ax_ui; +#[cfg(target_os = "windows")] +mod windows_ax_ui; +#[cfg(target_os = "linux")] +mod linux_ax_ui; pub use desktop_host::DesktopComputerUseHost; diff --git a/src/apps/desktop/src/computer_use/ui_locate_common.rs b/src/apps/desktop/src/computer_use/ui_locate_common.rs new file mode 100644 index 00000000..2e809f5b --- /dev/null +++ b/src/apps/desktop/src/computer_use/ui_locate_common.rs @@ -0,0 +1,210 @@ +//! Shared validation, filter matching, and global→native pixel mapping for UI locate tools. + +use bitfun_core::agentic::tools::computer_use_host::{UiElementLocateQuery, UiElementLocateResult}; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use screenshots::display_info::DisplayInfo; + +pub fn validate_query(q: &UiElementLocateQuery) -> BitFunResult<()> { + let t = q.title_contains.as_ref().map(|s| !s.trim().is_empty()).unwrap_or(false); + let r = q.role_substring.as_ref().map(|s| !s.trim().is_empty()).unwrap_or(false); + let i = q + .identifier_contains + .as_ref() + .map(|s| !s.trim().is_empty()) + .unwrap_or(false); + if !t && !r && !i { + return Err(BitFunError::tool( + "Provide at least one of: title_contains, role_substring, identifier_contains (non-empty)." 
/// Case-insensitive substring test; an empty `needle` matches any haystack.
fn contains_ci(hay: &str, needle: &str) -> bool {
    needle.is_empty() || hay.to_lowercase().contains(needle.to_lowercase().as_str())
}
{ + matches!( + query.filter_combine.as_deref(), + Some("any") | Some("or") + ) +} + +/// OR semantics: element matches if **at least one** non-empty filter matches. +pub fn matches_filters_any( + query: &UiElementLocateQuery, + role: Option<&str>, + title: Option<&str>, + ident: Option<&str>, +) -> bool { + let mut has_filter = false; + let mut matched = false; + if let Some(ref want) = query.role_substring { + if !want.trim().is_empty() { + has_filter = true; + if contains_ci(role.unwrap_or(""), want.trim()) { + matched = true; + } + } + } + if let Some(ref want) = query.title_contains { + if !want.trim().is_empty() { + has_filter = true; + if contains_ci(title.unwrap_or(""), want.trim()) { + matched = true; + } + } + } + if let Some(ref want) = query.identifier_contains { + if !want.trim().is_empty() { + has_filter = true; + if contains_ci(ident.unwrap_or(""), want.trim()) { + matched = true; + } + } + } + has_filter && matched +} + +/// AND semantics (default): **every** non-empty filter must match the same element. 
+pub fn matches_filters_all( + query: &UiElementLocateQuery, + role: Option<&str>, + title: Option<&str>, + ident: Option<&str>, +) -> bool { + if let Some(ref want) = query.role_substring { + if !want.trim().is_empty() { + let r = role.unwrap_or(""); + if !contains_ci(r, want.trim()) { + return false; + } + } + } + if let Some(ref want) = query.title_contains { + if !want.trim().is_empty() { + let t = title.unwrap_or(""); + if !contains_ci(t, want.trim()) { + return false; + } + } + } + if let Some(ref want) = query.identifier_contains { + if !want.trim().is_empty() { + let i = ident.unwrap_or(""); + if !contains_ci(i, want.trim()) { + return false; + } + } + } + true +} + +pub fn matches_filters( + query: &UiElementLocateQuery, + role: Option<&str>, + title: Option<&str>, + ident: Option<&str>, +) -> bool { + if combine_is_any(query) { + matches_filters_any(query, role, title, ident) + } else { + matches_filters_all(query, role, title, ident) + } +} + +pub fn ok_result( + gx: f64, + gy: f64, + bounds_left: f64, + bounds_top: f64, + bounds_width: f64, + bounds_height: f64, + matched_role: String, + matched_title: Option, + matched_identifier: Option, +) -> BitFunResult { + let (nx, ny) = global_to_native_center(gx, gy)?; + let (nminx, nminy, nmaxx, nmaxy) = if bounds_width > 0.0 && bounds_height > 0.0 { + global_bounds_to_native_minmax(gx, gy, bounds_left, bounds_top, bounds_width, bounds_height)? 
+ } else { + (nx, ny, nx, ny) + }; + Ok(UiElementLocateResult { + global_center_x: gx, + global_center_y: gy, + native_center_x: nx, + native_center_y: ny, + global_bounds_left: bounds_left, + global_bounds_top: bounds_top, + global_bounds_width: bounds_width, + global_bounds_height: bounds_height, + native_bounds_min_x: nminx, + native_bounds_min_y: nminy, + native_bounds_max_x: nmaxx, + native_bounds_max_y: nmaxy, + matched_role, + matched_title, + matched_identifier, + }) +} diff --git a/src/apps/desktop/src/computer_use/windows_ax_ui.rs b/src/apps/desktop/src/computer_use/windows_ax_ui.rs new file mode 100644 index 00000000..1323bee1 --- /dev/null +++ b/src/apps/desktop/src/computer_use/windows_ax_ui.rs @@ -0,0 +1,160 @@ +//! Windows UI Automation (UIA) tree walk for stable screen coordinates. + +use crate::computer_use::ui_locate_common; +use bitfun_core::agentic::tools::computer_use_host::{UiElementLocateQuery, UiElementLocateResult}; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use std::collections::VecDeque; +use windows::Win32::System::Com::{CoCreateInstance, CoInitializeEx, CLSCTX_INPROC_SERVER, COINIT_APARTMENTTHREADED}; +use windows::Win32::UI::Accessibility::{CUIAutomation, IUIAutomation, IUIAutomationElement, IUIAutomationTreeWalker}; +use windows::Win32::UI::WindowsAndMessaging::GetForegroundWindow; + +fn bstr_to_string(b: windows_core::BSTR) -> String { + b.to_string() +} + +fn walker_children( + walker: &IUIAutomationTreeWalker, + parent: &IUIAutomationElement, +) -> BitFunResult> { + let mut out = Vec::new(); + let first = unsafe { walker.GetFirstChildElement(parent) }; + let Ok(mut cur) = first else { + return Ok(out); + }; + loop { + out.push(cur.clone()); + let next = unsafe { walker.GetNextSiblingElement(&cur) }; + match next { + Ok(n) => cur = n, + Err(_) => break, + } + } + Ok(out) +} + +fn localized_control_type_string(elem: &IUIAutomationElement) -> String { + unsafe { + elem.CurrentLocalizedControlType() + 
.map(bstr_to_string) + .unwrap_or_default() + } +} + +/// Foreground window root, then UIA RawViewWalker BFS. +pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult { + ui_locate_common::validate_query(query)?; + let max_depth = query.max_depth.unwrap_or(48).clamp(1, 200); + let max_nodes = 12_000usize; + + unsafe { + let _ = CoInitializeEx(None, COINIT_APARTMENTTHREADED); + } + + let automation: IUIAutomation = unsafe { + CoCreateInstance(&CUIAutomation, None, CLSCTX_INPROC_SERVER).map_err(|e| { + BitFunError::tool(format!( + "UI Automation (CoCreateInstance CUIAutomation): {}.", + e + )) + })? + }; + + let hwnd = unsafe { GetForegroundWindow() }; + if hwnd.is_invalid() { + return Err(BitFunError::tool( + "No foreground window (GetForegroundWindow returned null).".to_string(), + )); + } + + let root = unsafe { + automation.ElementFromHandle(hwnd).map_err(|e| { + BitFunError::tool(format!( + "UI Automation ElementFromHandle failed: {}.", + e + )) + })? + }; + + let walker = unsafe { + automation + .RawViewWalker() + .map_err(|e| BitFunError::tool(format!("UI Automation RawViewWalker: {}.", e)))? + }; + + struct Queued { + el: IUIAutomationElement, + depth: u32, + } + + let mut q = VecDeque::new(); + q.push_back(Queued { el: root, depth: 0 }); + let mut visited = 0usize; + + loop { + let Some(cur) = q.pop_front() else { + return Err(BitFunError::tool( + "No UI element matched in the foreground window for this query. Refine filters or use ComputerUse screenshot. Locate uses the same UI Automation permission as mouse/keyboard automation." 
+ .to_string(), + )); + }; + if cur.depth > max_depth { + continue; + } + visited += 1; + if visited > max_nodes { + return Err(BitFunError::tool( + "UI Automation search limit reached; narrow title/role/identifier filters.".to_string(), + )); + } + + let name = unsafe { cur.el.CurrentName().ok().map(bstr_to_string).unwrap_or_default() }; + let ident = unsafe { + cur.el + .CurrentAutomationId() + .ok() + .map(bstr_to_string) + .unwrap_or_default() + }; + let role = localized_control_type_string(&cur.el); + + let matched = ui_locate_common::matches_filters( + query, + Some(role.as_str()), + Some(name.as_str()), + Some(ident.as_str()), + ); + if matched { + let rect = unsafe { cur.el.CurrentBoundingRectangle() }; + if let Ok(r) = rect { + if r.right > r.left && r.bottom > r.top { + let gx = (r.left + r.right) as f64 / 2.0; + let gy = (r.top + r.bottom) as f64 / 2.0; + let bl = r.left as f64; + let bt = r.top as f64; + let bw = (r.right - r.left) as f64; + let bh = (r.bottom - r.top) as f64; + return ui_locate_common::ok_result( + gx, + gy, + bl, + bt, + bw, + bh, + role, + if name.is_empty() { None } else { Some(name) }, + if ident.is_empty() { None } else { Some(ident) }, + ); + } + } + } + + let children = walker_children(&walker, &cur.el)?; + let next_depth = cur.depth + 1; + for ch in children { + q.push_back(Queued { + el: ch, + depth: next_depth, + }); + } + } +} diff --git a/src/crates/core/src/agentic/agents/claw_mode.rs b/src/crates/core/src/agentic/agents/claw_mode.rs index e2f3e21a..24e453e5 100644 --- a/src/crates/core/src/agentic/agents/claw_mode.rs +++ b/src/crates/core/src/agentic/agents/claw_mode.rs @@ -28,6 +28,11 @@ impl ClawMode { "SessionHistory".to_string(), "Cron".to_string(), "ComputerUse".to_string(), + // Split computer-use tools must be allowlisted here; otherwise the pipeline rejects them + // ("Tool 'ComputerUseMousePrecise' is not in the allowed list") and the model falls back to ComputerUse-only + vision. 
+ "ComputerUseMousePrecise".to_string(), + "ComputerUseMouseStep".to_string(), + "ComputerUseMouseClick".to_string(), ], } } diff --git a/src/crates/core/src/agentic/agents/prompts/claw_mode.md b/src/crates/core/src/agentic/agents/prompts/claw_mode.md index f50b961e..9efa4144 100644 --- a/src/crates/core/src/agentic/agents/prompts/claw_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/claw_mode.md @@ -11,6 +11,7 @@ Narrate only when it helps: multi-step work, complex/challenging problems, sensi Keep narration brief and value-dense; avoid repeating obvious steps. Use plain human language for narration unless in a technical context. When a first-class tool exists for an action, use the tool directly instead of asking the user to run equivalent CLI commands. +**Computer use (desktop automation):** If the user’s request needs **more than one** Computer use tool call (or spans **multiple apps/windows**), first state a **short numbered plan** in plain language: **(a)** whether **`Bash`** / **`TerminalControl`** applies (e.g. macOS `open -a "AppName"` to launch/focus an app), **(b)** **`ComputerUse`** **`action: locate`** for **named** UI targets before pointer moves, **(c)** target **application/window**, **(d)** how you will **verify** focus — **prefer** **`computer_use_context`** from tool results; use **`screenshot`** when you need **pixels** for the next step or when the **host** requires a fresh capture (see **Screenshot cadence** below). Then execute **step-by-step** — this overrides “silent tools” for that automation block only. # Session Coordination For complex coding tasks or office-style multi-step tasks, prefer multi-session coordination over doing everything in the current session. @@ -39,26 +40,41 @@ Prioritize safety and human oversight over completion; if instructions conflict, Do not manipulate or persuade anyone to expand access or disable safeguards. 
Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested. # Computer use (BitFun desktop, when enabled) -When the `ComputerUse` tool is available, you may capture the screen and use mouse/keyboard automation for tasks the user requested. -- **Automation priority (apply before reaching for the mouse):** (1) **`key_chord`** — standard **OS and in-app shortcuts**, including **system clipboard**: **Copy / Cut / Paste / Select all** using the host’s real chords (see Environment Information; e.g. macOS typically **command**+c/x/v/a, Windows/Linux typically **control**+c/x/v/a — match menus, not assumptions from another OS). Prefer **paste** over **`type_text`** when the user wants content inserted from the clipboard, when duplicating existing text, or when pasting long or structured content. (2) **`type_text`** — short literals, fields where paste is blocked, or after shortcuts clearly failed. (3) **Pointer + `screenshot` + `click`** — only when no shortcut or clipboard path fits, or after verifying a shortcut attempt failed. -- **Default path before any `click` (unless a shortcut replaces the click):** After the **first** full-frame `screenshot`, **you must narrow the view with quadrant drill** — each narrowing step is **`action: screenshot`** **plus** **`screenshot_navigate_quadrant`** (`top_left` / `top_right` / `bottom_left` / `bottom_right`). Repeat **one quadrant per call** until the tool JSON shows **`quadrant_navigation_click_ready`: true**, then `mouse_move` + `click`. **Do not skip straight to point crop** (`screenshot_crop_center_*`) from a full-screen shot unless: the click target already fills a large fraction of the frame, quadrant drill is clearly wrong for the UI (e.g. you must jump to a known margin coordinate), or the user explicitly asked for a crop at native x/y. +**What “computer use” means here:** **desktop automation for the user’s task**, not only tools whose names start with `ComputerUse`. 
When the step can be done from the **workspace terminal** (scripts, builds, tests, git, CLIs, macOS `open`/`osascript` where appropriate), use **`Bash`** / **`TerminalControl`** **before** driving the GUI. Do **not** skip the terminal and jump straight to screenshots if a shell command would accomplish the same step. + +**Tool list order (matches the API):** After **`Task`**, **`Bash`**, **`TerminalControl`**, file tools, then **`ComputerUse`** (screenshot + chords + **`locate`**), then **`ComputerUseMousePrecise` / `ComputerUseMouseStep` / `ComputerUseMouseClick`** — **within `ComputerUse`**, prefer **`action: locate`** before ruler-only **`action: screenshot`** when a **named** control can be matched; do **not** open with full-screen **`screenshot`** when **`locate`** can name the target. + +When **Computer use** is enabled, you have **`ComputerUse`** (`action` **`screenshot`** | **`locate`** | **`key_chord`** | **`type_text`** | **`pointer_move_rel`** | **`wait`**) and separate mouse tools **`ComputerUseMousePrecise`**, **`ComputerUseMouseStep`**, **`ComputerUseMouseClick`**. Do **not** treat “computer use” as “only `screenshot` + vision”. + +- **Screenshot cadence (align with BitFun host — avoid spam):** The **desktop host** enforces a **fresh `screenshot`** mainly for **two** cases: **`ComputerUseMouseClick` (`action`: click)** (needs a **fine** view: quadrant drill terminal or point crop — not full-screen-only), and **`key_chord` that includes Return / Enter** when the outcome matters. **Do not** treat `screenshot` as a **heartbeat** after every other action. **Do not** call `screenshot` simply because you just ran **`action: locate`** (JSON-only), **`key_chord` without Enter**, **`type_text`**, or **`wait`** — unless you **need** a JPEG for the next **vision** step (aiming, reading dense UI) or **`computer_use_context`** is missing/ambiguous. 
After **`locate`** success, **prefer** moving with **`coordinate_hints.mouse_precise_screen`** (global coords) or **`ComputerUseMousePrecise`** without an extra full-frame `screenshot` when the next step is not yet a host-guarded click/Enter. **Use** `screenshot` when you need to **see** pixels or before **click / Enter** per the host rules below. + +- **`action: locate` — how to aim filters (not OCR):** `locate` searches the **accessibility tree** (titles, roles, identifiers on the **foreground** window). It does **not** read pixels like OCR; labels drawn only in the bitmap, heavily custom UIs, or some list rows may **never** appear in AX. **Prefer** substrings that match what the app likely exposes — often a **shorter or distinctive fragment** and the **same language as the UI** (do not assume the user’s chat text matches **AXTitle** verbatim). **Filter combination:** by default, non-empty fields are combined with **AND** (same element must satisfy all). Many inputs (e.g. WeChat search) have **`AXTextField`** but **no** `AXTitle` containing “搜索” — then **`title_contains` + `role_substring` together will fail**. Use **`filter_combine`: `any`** so **role OR title** can match, or send **only one** of `title_contains` / `role_substring` / `identifier_contains` on the first try. If a call returns no match, **change the query** before retrying; avoid sending the **same** filters repeatedly. When AX probably will not contain the label (chat bubbles, owner-drawn text, dense feeds), **switch early** to **`screenshot`** and the vision / quadrant path — that is a normal fallback, not a failure to “use locate first” where AX has nothing to match. + +- **macOS — launch or foreground an app:** Prefer **`Bash`** with `open -a "AppName"` (e.g. `open -a WeChat`) instead of Spotlight (Command+Space + `type_text` + Return) when you only need to **start or bring forward** the app — **fewer steps** and fewer **Return/Enter** screenshot-guard failures. 
Reserve Spotlight for when `open -a` is wrong or the user asked for Spotlight. +- **Named rows / chats / list items (WeChat, Slack, Mail, etc.):** **Forbidden:** aiming at a **conversation row** or **named button** using **only** full-screen **`screenshot`** ruler coordinates + **`pointer_move_rel`** / **`ComputerUseMousePrecise`** **before** trying **`ComputerUse`** **`action: locate`**. **Required:** call **`ComputerUse`** with **`action: locate`** and **`title_contains`** / **`role_substring`** matching the **on-screen label** (same language as the app UI, e.g. «Bob») to get **`global_center_*`** / **`coordinate_hints`**, then move or point-crop. If locate fails after refining filters, **then** use vision (quadrant drill / crop). +- **Plan → execute → verify (multi-step):** Before the **first** tool call of a desktop task that needs several steps, output a **numbered plan** (target app/window, verification checkpoints, tool order). Execute **one logical step at a time**; after a step that may **change focus** (app switch, new window, dialog), **prefer** **`computer_use_context`** / **`wait`**; **add `screenshot`** only when you must **see** the new layout or before a **click / Enter** per host rules. Do **not** “stream” many unrelated actions while the foreground app might still be wrong. +- **Foreground safety (`computer_use_context`):** Tool results include **`computer_use_context`** when available: **`foreground_application`** (which app is frontmost), **`pointer_global`** (cursor), **`input_coordinates`** (this call). Treat **`foreground_application`** as ground truth. **`ComputerUse`** **`action: locate`** searches the **foreground** app only — if the wrong app is focused, locate and clicks hit the **wrong** window. If **`foreground_application`** (or the latest **`screenshot`**) does **not** match the **intended** target app, **stop** the current sequence: switch focus first (e.g. 
**`screenshot`** to see the dock/taskbar, **`ComputerUseMousePrecise`** + **`ComputerUseMouseClick`** on the correct icon, or **`key_chord`** for app switch / window cycle **on this host**), then **`wait`** / **`screenshot`** until the **correct** app is frontmost, then continue. Never assume BitFun, the terminal, or a previous app is still focused. +- **Re-plan on failure:** If a tool **errors**, **`locate`** finds **no** match, the last **`screenshot`** shows **unexpected** UI, or **`foreground_application`** is wrong: **do not** keep executing the old plan. **Re-read** **`computer_use_context`**; **add a `screenshot`** only when you need new pixels to revise the plan (fix focus, tighten locate filters, switch to vision/quadrant drill, or **ask the user**). Only retry after the plan is updated — do not stack pointer/text/chord actions on the wrong app. +- **Automation priority (strict order — try higher items before lower):** (1) **Terminal — `Bash` / `TerminalControl`** — anything achievable via **shell in the workspace** (build, test, git, scripts, CLIs). (2) **System shortcuts — `key_chord`** — **OS-wide** actions and **system clipboard** (see Environment Information for modifiers on **this** host). (3) **Application shortcuts — `key_chord`** — **in-app** shortcuts when the correct app/window is focused. (4) **`ComputerUse`** **`action: locate`** — **native accessibility tree** (macOS AX / Windows UIA / Linux AT-SPI) on the **foreground** window: **`title_contains`**, **`role_substring`**, **`identifier_contains`**, centers and **`global_bounds_*` / `native_bounds_*`**. **When locate matches:** you may **move** with **`coordinate_hints`** **without** an immediate full-frame **`screenshot`**; call **`screenshot`** with **`screenshot_crop_center_*`** / **`screenshot_crop_half_extent_native`** from **`coordinate_hints.screenshot_point_crop`** **only when** you need a **JPEG** for vision (e.g. quadrant drill toward a click) — **not** after every successful locate. 
If **`locate` finds nothing**, continue to (5). (5) **Vision — `ComputerUse`** **`action: screenshot`** + **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick`** — only when (1)–(4) cannot complete the step. **Between shortcuts and accessibility locate:** use **`type_text`** only for short or paste-blocked input **after** focus is correct; prefer **`key_chord`** paste when possible. +- **Quadrant drill vs locate:** The **default quadrant-drill + JPEG** workflow below is for **vision-based aiming**. It is **not** a substitute for **`ComputerUse`** **`action: locate`** when you can filter by **name/role** (e.g. “open chat with 尉怡青” → try **`locate`** with **`title_contains`** matching the contact name in **the app’s UI language** before guessing coordinates from a screenshot). +- **Default path before any `ComputerUseMouseClick` (`action`: click) when using the vision path (unless a shortcut replaces it):** After the **first** full-frame `screenshot`, **you must narrow the view with quadrant drill** — each narrowing step is **`action: screenshot`** **plus** **`screenshot_navigate_quadrant`** (`top_left` / `top_right` / `bottom_left` / `bottom_right`). Repeat **one quadrant per call** until the tool JSON shows **`quadrant_navigation_click_ready`: true**, then **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick` (`action`: click)**. **Do not skip straight to point crop** (`screenshot_crop_center_*`) from a full-screen shot unless: the click target already fills a large fraction of the frame, quadrant drill is clearly wrong for the UI (e.g. you must jump to a known margin coordinate), or the user explicitly asked for a crop at native x/y. - **Quadrant drill is never automatic:** The host **does not** split the screen unless **you** pass `screenshot_navigate_quadrant` on that `screenshot` call. 
A plain `screenshot` with **no** `screenshot_navigate_quadrant` only **refreshes** the full display (or the current drill region). **If you never set `screenshot_navigate_quadrant`, you will stay on a wide view and models often mis-click** — follow the default path above. -- **No automatic desktop images:** BitFun does **not** inject extra screenshot messages or attach follow-up JPEGs after other ComputerUse actions. Call **`screenshot`** whenever you need to see the screen: full frame, **`screenshot_navigate_quadrant`** (four-way drill — see tool schema), **`screenshot_reset_navigation`**, or point crop via `screenshot_crop_center_x` / `screenshot_crop_center_y` (**full-display native** pixels). If **`screenshot_navigate_quadrant`** is set, **`screenshot_crop_center_*` are ignored** in that same call (avoid sending both; send **only** fields that apply to the current `action`). +- **No automatic desktop images:** BitFun does **not** inject extra screenshot messages or attach follow-up JPEGs after other ComputerUse actions. Call **`screenshot`** when you **need** pixels for the next decision: full frame, **`screenshot_navigate_quadrant`** (four-way drill — see tool schema), **`screenshot_reset_navigation`**, or point crop via `screenshot_crop_center_x` / `screenshot_crop_center_y` (**full-display native** pixels). **Do not** refresh the full display **habitually** after `locate` / `key_chord` / `type_text` if the **Host-enforced screenshot** rules below do not yet apply. If **`screenshot_navigate_quadrant`** is set, **`screenshot_crop_center_*` are ignored** in that same call (avoid sending both; send **only** fields that apply to the current `action`). - **Host OS and shortcuts:** Before `key_chord`, read **Environment Information** below (Operating System line and the Computer use bullet there). Use modifier names that match **that** host only — do not mix OS conventions (e.g. do not use Windows-style shortcuts when the host is macOS). 
-- **Shortcut-first (required):** When a **standard OS or in-app shortcut** or **clipboard chord** does the same job as a planned pointer path, you **must choose `key_chord` first** — do **not** open Edit menus to click Copy/Paste when **`key_chord`** can do it; do **not** re-type long text with **`type_text`** when **Select all + Copy** or **Paste** achieves the goal. Same for New/Open/Save, Undo/Redo, Find, tab/window close or switch, Quit, Refresh, focus address bar, etc. Reserve **`mouse_move` + crop screenshots + `click`** for when **no** reliable shortcut exists, the control is pointer-only, or after a shortcut clearly failed (then **`screenshot`** and try another approach). Menus in the JPEG often display shortcuts — use them. -- **Never drive blind:** after `key_chord`, `type_text`, or `scroll` when the **next step depends on what is on screen** (app opened, focus changed, dialog appeared, field focused, list scrolled), you **must** run `screenshot` (optionally `wait` a short `ms` first if the UI animates) and **confirm** the state before more shortcuts or clicks. Do **not** chain many shortcuts in one turn without a screenshot in between when failure would mislead the user. -- **Strict rule — no blind Enter, no blind click:** Before **`click`**, you **must** have a **fine** screenshot after the pointer is aligned: **`quadrant_navigation_click_ready`: true** (preferred: **`screenshot` + `screenshot_navigate_quadrant`** each step until the tool JSON says so) **or** a **point-crop `screenshot`** (~500×500 via `screenshot_crop_center_*`) when the exceptions above apply. A **full-screen-only** frame alone does **not** authorize **`click`**. Before **`key_chord` that includes Return or Enter**, you **must** call **`screenshot` first** and **visually confirm** focus and target. The only exception is when the user explicitly asks for an unverified / blind step. 
+- **Shortcut-first (required, after terminal when applicable):** If the step is **not** better done via **`Bash`** / **`TerminalControl`**, then when a **standard OS or in-app shortcut** or **clipboard chord** does the same job as a planned pointer path, you **must choose `key_chord`** — do **not** open Edit menus to click Copy/Paste when **`key_chord`** can do it; do **not** re-type long text with **`type_text`** when **Select all + Copy** or **Paste** achieves the goal. Same for New/Open/Save, Undo/Redo, Find, tab/window close or switch, Quit, Refresh, focus address bar, etc. Reserve **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + crop screenshots + **`ComputerUseMouseClick` (`action`: click)** for when **no** reliable shortcut exists, the control is pointer-only, or after a shortcut clearly failed (then **`screenshot`** and try another approach). Menus in the JPEG often display shortcuts — use them. +- **When to verify with pixels:** If the **next** step is a **host-guarded** **`ComputerUseMouseClick` (click)** or **`key_chord` with Return/Enter**, follow the **Strict rule** below — that is when a **fresh `screenshot`** is **required**. For other steps (`key_chord` without Enter, `type_text`, `locate`, wheel), **prefer** `computer_use_context` and logical continuation; **add `screenshot`** only if you **cannot** reason about the next action (unknown dialog, wrong app suspected, or you need a JPEG to aim). **Do not** insert a full-frame `screenshot` between every pair of non-click actions. +- **Strict rule — no blind Enter, no blind click:** Before **`ComputerUseMouseClick` (`action`: click)**, you **must** have a **fine** screenshot after the pointer is aligned: **`quadrant_navigation_click_ready`: true** (preferred: **`screenshot` + `screenshot_navigate_quadrant`** each step until the tool JSON says so) **or** a **point-crop `screenshot`** (~500×500 via `screenshot_crop_center_*`) when the exceptions above apply. 
A **full-screen-only** frame alone does **not** authorize **`ComputerUseMouseClick` (click)**. Before **`key_chord` that includes Return or Enter**, you **must** call **`screenshot` first** and **visually confirm** focus and target. The only exception is when the user explicitly asks for an unverified / blind step. - For sending messages, payments, destructive actions, or anything sensitive, state the exact steps first and obtain clear user confirmation in chat before executing. - If Computer use is disabled or OS permissions are missing, tell the user what to enable in BitFun settings / system privacy instead of claiming success. - Screenshot results require the session primary model to use Anthropic or OpenAI-compatible API format so the image is attached to the tool result for vision. The JPEG matches **native display resolution** (no downscale): `coordinate_mode` `"image"` uses the same pixel grid as the bitmap. -- **Host-enforced screenshot (two cases):** The desktop host **rejects `click`** until the last `screenshot` after the last pointer move is a **valid fine basis**: **`quadrant_navigation_click_ready`: true** (quadrant drill until the region’s longest side is below the host threshold) **or** a **fresh point-crop** (`screenshot_crop_center_*`, ~500×500). **Full-screen-only** is **not** enough. It **rejects `key_chord` that includes Return or Enter** until a **fresh `screenshot`** since the last pointer move or click. **`mouse_move`** may use **`coordinate_mode` `\"image\"`** on any prior **`screenshot`**. Still **prefer `key_chord`** when it matches the step. +- **Host-enforced screenshot (two cases):** The desktop host **rejects `ComputerUseMouseClick` (click)** until the last `screenshot` after the last pointer move is a **valid fine basis**: **`quadrant_navigation_click_ready`: true** (quadrant drill until the region’s longest side is below the host threshold) **or** a **fresh point-crop** (`screenshot_crop_center_*`, ~500×500). 
**Full-screen-only** is **not** enough. It **rejects `key_chord` that includes Return or Enter** until a **fresh `screenshot`** since the last pointer move or click. **`ComputerUseMousePrecise`** may use **`coordinate_mode` `\"image\"`** on any prior **`screenshot`**. Still **prefer `key_chord`** when it matches the step. - **Rulers vs zoom:** Full-frame JPEGs have **margin rulers** and a **grid** — use them to orient. For small controls, **default to quadrant drill** (`screenshot_navigate_quadrant` on each `screenshot` step); use **point crop** only as a **secondary** option (see default path above). Each quadrant step **adds padding on every side** (clamped) so controls on split lines stay in the JPEG. **Do not** rely only on huge full-display images when a smaller view answers the question. -- **Click guard:** The host **rejects `click`** if there was **`mouse_move` / `pointer_nudge` / `pointer_move_rel` or a previous `click`** since the last `screenshot`, or if the last `screenshot` was **full-screen only** without **`quadrant_navigation_click_ready`**. **`screenshot`** before **Return/Enter** in **`key_chord`** when the outcome matters. -- **`pointer_nudge` / `pointer_move_rel` on macOS:** Deltas are in **screenshot/display pixels**; the host converts using the **last** **`screenshot`**’s scale — take **`screenshot`** first or moves may be wrong. **`mouse_move`** can do the same cardinal nudge in one call: set **`mouse_move_direction`** (`up` / `down` / `left` / `right`) and optional **`mouse_move_relative_pixels`** (default 32, use smaller values e.g. 8–24 for fine alignment); **`x`/`y` are ignored** when direction is set. **Small moves:** prefer **relative** direction + pixels (or `pointer_nudge`) over guessing tiny absolute **`x`/`y`** — vision models are usually more reliable that way. 
-- **Where is the pointer?** Only the latest `screenshot` tells you: **`pointer_image_x` / `pointer_image_y`** (tip in **this** JPEG for `coordinate_mode` `"image"`) and the **synthetic red cursor with gray border** in the image (**tip** = hotspot). Read **`pointer_marker`** in the tool JSON. If those coordinates are **null** and there is **no** overlay, the cursor is **not** on this capture — do not infer position from the image; use **`use_screen_coordinates`** with global coords or move the pointer onto this display. After any `mouse_move` / `pointer_*`, the old screenshot is **stale** until you `screenshot` again. -- After `screenshot`, when the pointer is on this display, the JPEG includes that **red cursor overlay** and the JSON fields above. **`mouse_move` only moves** the pointer (on macOS uses sub-point Quartz for accuracy). **`click` only clicks** at the current pointer (no coordinates). **Default:** **`screenshot` + `screenshot_navigate_quadrant`** (repeat) until **`quadrant_navigation_click_ready`**, then align the **red tip** with **`mouse_move`** on that JPEG and **`click`**. For **small** alignment fixes, **`mouse_move_direction`** + **`mouse_move_relative_pixels`** (or nudge) beats tiny absolute coords; reserve absolute **`x`/`y`** for **larger** jumps. **Fallback:** point-crop `screenshot` when the default path does not fit. Do not aim using only the OS cursor or guesswork. If tool JSON includes **`recommended_next_for_click_targeting`**, follow it. 
-- **Default pointer loop:** (1) `screenshot` (full or after **`screenshot_reset_navigation`**) then **required quadrant drill** until **`quadrant_navigation_click_ready`** (unless a justified point crop); (2) **`mouse_move` with relative direction** / `pointer_nudge` / `pointer_move_rel` for **small** nudges, absolute **`mouse_move`** when you need a **big** reposition; repeat until the **red cursor tip** is on the target; (3) **`screenshot` again** after any pointer move; (4) repeat if needed; (5) only then **`click`** when the last screenshot is **fine** (quadrant terminal or point crop). If the pointer is off the captured display (no red overlay), use `mouse_move` to bring it onto the screen, then continue. Re-screenshot after major UI changes. -- **Shortcut + verify:** Treat `key_chord` / `type_text` like risky steps: if something did not work (wrong window, IME, permission dialog), continuing without a screenshot causes bogus actions. When in doubt, screenshot. Follow **`hierarchical_navigation.shortcut_policy`** in each `screenshot` result together with this section. +- **Click guard:** The host **rejects `ComputerUseMouseClick` (click)** if there was **`ComputerUseMousePrecise` / `ComputerUseMouseStep` / `pointer_move_rel` or a previous click** since the last `screenshot`, or if the last `screenshot` was **full-screen only** without **`quadrant_navigation_click_ready`**. **`screenshot`** before **Return/Enter** in **`key_chord`** when the outcome matters. +- **`ComputerUseMouseStep` / `pointer_move_rel` on macOS:** Deltas are in **screenshot/display pixels**; the host converts using the **last** **`screenshot`**’s scale — take **`screenshot`** first or moves may be wrong. **`ComputerUseMouseStep`** uses **`direction`** (`up` / `down` / `left` / `right`) and optional **`pixels`** (default 32, use smaller values e.g. 8–24 for fine alignment). 
**Small moves:** prefer **`ComputerUseMouseStep`** over guessing tiny absolute **`ComputerUseMousePrecise` `x`/`y`** — vision models are usually more reliable that way. +- **Where is the pointer?** Only the latest `screenshot` tells you: **`pointer_image_x` / `pointer_image_y`** (tip in **this** JPEG for `coordinate_mode` `"image"`) and the **synthetic red cursor with gray border** in the image (**tip** = hotspot). Read **`pointer_marker`** in the tool JSON. If those coordinates are **null** and there is **no** overlay, the cursor is **not** on this capture — do not infer position from the image; use **`use_screen_coordinates`** with global coords or move the pointer onto this display. After any **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** / `pointer_move_rel`, the old screenshot is **stale** until you `screenshot` again. +- After `screenshot`, when the pointer is on this display, the JPEG includes that **red cursor overlay** and the JSON fields above. **`ComputerUseMousePrecise` only moves** to absolute coords (on macOS uses sub-point Quartz for accuracy). **`ComputerUseMouseClick` (`action`: click)** only clicks at the current pointer (no coordinates); **`action`: wheel** scrolls the wheel at the pointer. **Default:** **`screenshot` + `screenshot_navigate_quadrant`** (repeat) until **`quadrant_navigation_click_ready`**, then align the **red tip** with **`ComputerUseMouseStep`** / **`ComputerUseMousePrecise`** on that JPEG and **`ComputerUseMouseClick` (click)**. For **small** alignment fixes, **`ComputerUseMouseStep`** beats tiny absolute coords; reserve **`ComputerUseMousePrecise`** for **larger** jumps. **Fallback:** point-crop `screenshot` when the default path does not fit. Do not aim using only the OS cursor or guesswork. If tool JSON includes **`recommended_next_for_click_targeting`**, follow it. 
+- **Default pointer loop:** (1) `screenshot` (full or after **`screenshot_reset_navigation`**) then **required quadrant drill** until **`quadrant_navigation_click_ready`** (unless a justified point crop); (2) **`ComputerUseMouseStep`** / `pointer_move_rel` for **small** nudges, **`ComputerUseMousePrecise`** when you need a **big** reposition; repeat until the **red cursor tip** is on the target; (3) **`screenshot` again** after any pointer move; (4) repeat if needed; (5) only then **`ComputerUseMouseClick` (click)** when the last screenshot is **fine** (quadrant terminal or point crop). If the pointer is off the captured display (no red overlay), use **`ComputerUseMousePrecise`** to bring it onto the screen, then continue. Re-screenshot after major UI changes. +- **Shortcut + verify:** If a **`key_chord`** / **`type_text`** clearly **failed** (error tool result, or a **later** step shows wrong window), **then** use **`screenshot`** or **`computer_use_context`** to recover — not a blanket screenshot after every chord. Follow **`hierarchical_navigation.shortcut_policy`** in each `screenshot` result together with this section. - On macOS, development builds need Accessibility for the actual debug binary (path is in the error message if input is blocked). 
{CLAW_WORKSPACE} diff --git a/src/crates/core/src/agentic/execution/execution_engine.rs b/src/crates/core/src/agentic/execution/execution_engine.rs index 905889d1..141a8e61 100644 --- a/src/crates/core/src/agentic/execution/execution_engine.rs +++ b/src/crates/core/src/agentic/execution/execution_engine.rs @@ -756,7 +756,11 @@ impl ExecutionEngine { agent_type, allowed_tools.len() ); - self.get_available_tools_and_definitions(&allowed_tools, context.workspace.as_ref()) + self.get_available_tools_and_definitions( + &allowed_tools, + context.workspace.as_ref(), + &agent_type, + ) .await } else { (vec![], None) @@ -1193,17 +1197,17 @@ impl ExecutionEngine { &self, mode_allowed_tools: &[String], workspace: Option<&crate::agentic::WorkspaceBinding>, + agent_type: &str, ) -> (Vec, Option>) { // Use get_all_registered_tools to get all tools including MCP tools let all_tools = get_all_registered_tools().await; // Filter tools: 1) Check if enabled 2) Check if mode allows - let mut enabled_tool_names = Vec::new(); let mut tool_definitions = Vec::new(); let description_context = crate::agentic::tools::framework::ToolUseContext { tool_call_id: None, message_id: None, - agent_type: None, + agent_type: Some(agent_type.to_string()), session_id: None, dialog_turn_id: None, workspace: workspace.cloned(), @@ -1226,8 +1230,6 @@ impl ExecutionEngine { let tool_name = tool.name().to_string(); // MCP tools are automatically allowed (all tools starting with mcp_) if mode_allowed_tools.contains(&tool_name) || tool_name.starts_with("mcp_") { - enabled_tool_names.push(tool_name); - let description = tool .description_with_context(Some(&description_context)) .await @@ -1241,32 +1243,36 @@ impl ExecutionEngine { } } - let tool_ordering = { - let ordering = vec![ - "Task", - "Bash", - "Glob", - "Grep", - "Read", - "Edit", - "Write", - "Delete", - "WebFetch", - "WebSearch", - "TodoWrite", - "Skill", - "Log", - "MermaidInteractive", - ]; - let num_tools = ordering.len(); - ordering - 
.into_iter() - .map(|s| s.to_string()) - .zip(1..=num_tools) - .collect::>() - }; + // Order tools for the model API: terminal → file-ish tools → **`ComputerUse`** (locate / + // screenshot / keys) **before** split mouse tools so the list matches “sense then act”. + let tool_ordering: HashMap = [ + ("Task", 1), + ("Bash", 2), + ("TerminalControl", 3), + ("Glob", 4), + ("Grep", 5), + ("Read", 6), + ("Edit", 7), + ("Write", 8), + ("Delete", 9), + ("WebFetch", 10), + ("WebSearch", 11), + ("TodoWrite", 12), + ("Skill", 13), + ("Log", 14), + ("MermaidInteractive", 15), + ("ComputerUse", 16), + ("ComputerUseMousePrecise", 17), + ("ComputerUseMouseStep", 18), + ("ComputerUseMouseClick", 19), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), v)) + .collect(); tool_definitions.sort_by_key(|tool| tool_ordering.get(&tool.name).unwrap_or(&100)); + let enabled_tool_names: Vec = tool_definitions.iter().map(|d| d.name.clone()).collect(); + (enabled_tool_names, Some(tool_definitions)) } diff --git a/src/crates/core/src/agentic/tools/computer_use_host.rs b/src/crates/core/src/agentic/tools/computer_use_host.rs index a6925795..648f8c9c 100644 --- a/src/crates/core/src/agentic/tools/computer_use_host.rs +++ b/src/crates/core/src/agentic/tools/computer_use_host.rs @@ -1,6 +1,6 @@ //! Host abstraction for desktop automation (implemented in `bitfun-desktop`). -use crate::util::errors::BitFunResult; +use crate::util::errors::{BitFunError, BitFunResult}; use async_trait::async_trait; use serde::{Deserialize, Serialize}; @@ -37,6 +37,8 @@ pub struct ComputerUseScreenshotParams { pub navigate_quadrant: Option, /// Clear stored navigation focus before applying this capture (next quadrant step starts from full display). pub reset_navigation: bool, + /// Half-size of the point crop in **native** pixels (total width/height ≈ `2 * half`). `None` → [`COMPUTER_USE_POINT_CROP_HALF_DEFAULT`]. 
+ pub point_crop_half_extent_native: Option, } /// Longest side of the navigation region must be **strictly below** this to allow `click` without a separate point crop (desktop). @@ -45,6 +47,32 @@ pub const COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE: u32 = 500; /// Native pixels added on **each** side after a quadrant choice before compositing the JPEG (avoids controls sitting exactly on the split line). pub const COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX: u32 = 50; +/// Default **half** extent (native px) for point crop around `screenshot_crop_center_*` → total region up to **500×500**. +pub const COMPUTER_USE_POINT_CROP_HALF_DEFAULT: u32 = 250; + +/// Minimum **half** extent for point crop (native px) — total region **≥ 128×128** when the display is large enough. +pub const COMPUTER_USE_POINT_CROP_HALF_MIN: u32 = 64; + +/// Maximum **half** extent for point crop (native px) — total region **≤ 500×500**. +pub const COMPUTER_USE_POINT_CROP_HALF_MAX: u32 = 250; + +/// Clamp optional model/host request to a valid point-crop half extent. +#[inline] +pub fn clamp_point_crop_half_extent(requested: Option) -> u32 { + let v = requested.unwrap_or(COMPUTER_USE_POINT_CROP_HALF_DEFAULT); + v.clamp(COMPUTER_USE_POINT_CROP_HALF_MIN, COMPUTER_USE_POINT_CROP_HALF_MAX) +} + +/// Suggest a tighter half-extent from AX **native** bounds size (smaller controls → smaller JPEG). +#[inline] +pub fn suggested_point_crop_half_extent_from_native_bounds(native_w: u32, native_h: u32) -> u32 { + let max_edge = native_w.max(native_h).max(1); + let half = max_edge + .saturating_div(2) + .saturating_add(32); + clamp_point_crop_half_extent(Some(half)) +} + /// Snapshot of OS permissions relevant to computer use. #[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] pub struct ComputerUsePermissionSnapshot { @@ -54,6 +82,33 @@ pub struct ComputerUsePermissionSnapshot { pub platform_note: Option, } +/// Frontmost application (for Computer use tool JSON). 
+#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)] +pub struct ComputerUseForegroundApplication { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub name: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub bundle_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub process_id: Option, +} + +/// Mouse cursor position in **global** screen space (host native units, e.g. macOS Quartz points). +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct ComputerUsePointerGlobal { + pub x: f64, + pub y: f64, +} + +/// Foreground app + pointer position after a Computer use action (best-effort per platform). +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] +pub struct ComputerUseSessionSnapshot { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub foreground_application: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub pointer_global: Option, +} + /// Pixel rectangle of the **screen capture** inside the JPEG (excludes white margin and rulers). #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct ComputerUseImageContentRect { @@ -89,13 +144,16 @@ pub struct ComputerScreenshot { /// When set, this JPEG is a crop around this center in **full-display native** pixels (see tool docs). #[serde(default, skip_serializing_if = "Option::is_none")] pub screenshot_crop_center: Option, + /// Half extent used for this point crop (native px); omitted when not a point crop. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub point_crop_half_extent_native: Option, /// Native rectangle corresponding to this JPEG’s content (full display, quadrant drill region, or point-crop bounds). 
#[serde(default, skip_serializing_if = "Option::is_none")] pub navigation_native_rect: Option, - /// When true (desktop), `click` is allowed on this frame without an extra ~500×500 point crop — region is small enough for `mouse_move` + `click`. + /// When true (desktop), `click` is allowed on this frame without an extra ~500×500 point crop — region is small enough for pointer positioning + `click`. #[serde(default, skip_serializing_if = "is_false")] pub quadrant_navigation_click_ready: bool, - /// Screen pixels inside the JPEG (below/left of white margin); `mouse_move` maps this rect to the display. + /// Screen pixels inside the JPEG (below/left of white margin); `ComputerUseMousePrecise` maps this rect to the display. #[serde(default, skip_serializing_if = "Option::is_none")] pub image_content_rect: Option, } @@ -104,6 +162,52 @@ fn is_false(b: &bool) -> bool { !*b } +/// Filter for native accessibility (macOS AX) BFS search — role/title/identifier substrings. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct UiElementLocateQuery { + #[serde(default)] + pub title_contains: Option, + #[serde(default)] + pub role_substring: Option, + #[serde(default)] + pub identifier_contains: Option, + /// BFS depth from the application root (default 48, max 200). + #[serde(default)] + pub max_depth: Option, + /// `"all"` (default): every non-empty filter must match the **same** element (AND). + /// `"any"`: at least one non-empty filter matches (OR) — useful when title and role are not both present on one node (e.g. search field with empty AXTitle). + #[serde(default)] + pub filter_combine: Option, +} + +/// Matched element geometry from the accessibility tree: center plus **axis-aligned bounds** (four corners). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct UiElementLocateResult { + /// Same space as `ComputerUse` `use_screen_coordinates` / host pointer moves. 
+ pub global_center_x: f64, + pub global_center_y: f64, + /// Use with `ComputerUse` `screenshot_crop_center_x` / `y` (full-capture native indices). + pub native_center_x: u32, + pub native_center_y: u32, + /// Element frame in **global** pointer space: top-left `(left, top)`, size `(width, height)`. + /// Four corners: `(left, top)`, `(left+width, top)`, `(left, top+height)`, `(left+width, top+height)`. + pub global_bounds_left: f64, + pub global_bounds_top: f64, + pub global_bounds_width: f64, + pub global_bounds_height: f64, + /// Tight **native** pixel bounds on the capture bitmap (full-display indices), derived from the global frame + /// (mapping uses the display that contains the center; large spans may be approximate). + pub native_bounds_min_x: u32, + pub native_bounds_min_y: u32, + pub native_bounds_max_x: u32, + pub native_bounds_max_y: u32, + pub matched_role: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub matched_title: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub matched_identifier: Option, +} + #[async_trait] pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { async fn permission_snapshot(&self) -> BitFunResult; @@ -132,7 +236,7 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { /// Fails if no screenshot was taken in this process since startup (or since last host reset). fn map_image_coords_to_pointer(&self, x: i32, y: i32) -> BitFunResult<(i32, i32)>; - /// Same as `map_image_coords_to_pointer` but **sub-point** precision (macOS: use for `mouse_move`). + /// Same as `map_image_coords_to_pointer` but **sub-point** precision (macOS: use for `ComputerUseMousePrecise`). 
fn map_image_coords_to_pointer_f64(&self, x: i32, y: i32) -> BitFunResult<(f64, f64)> { let (a, b) = self.map_image_coords_to_pointer(x, y)?; Ok((a as f64, b as f64)) @@ -154,10 +258,10 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { async fn mouse_move(&self, x: i32, y: i32) -> BitFunResult<()>; - /// Move the pointer by `(dx, dy)` in **global screen pixels** (same space as `mouse_move` absolute). + /// Move the pointer by `(dx, dy)` in **global screen pixels** (same space as `ComputerUseMousePrecise` absolute). async fn pointer_move_relative(&self, dx: i32, dy: i32) -> BitFunResult<()>; - /// Click at the **current** pointer position only (does not move). Use `mouse_move` / `pointer_*` first. + /// Click at the **current** pointer position only (does not move). Use `ComputerUseMousePrecise` / `ComputerUseMouseStep` / `pointer_move_rel` first. /// `button`: "left" | "right" | "middle" async fn mouse_click(&self, button: &str) -> BitFunResult<()>; @@ -171,10 +275,16 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { async fn wait_ms(&self, ms: u64) -> BitFunResult<()>; + /// Current frontmost app and global pointer position for tool-result JSON (`computer_use_context`). + /// Default: empty. Desktop overrides with platform queries (typically after each tool action). + async fn computer_use_session_snapshot(&self) -> ComputerUseSessionSnapshot { + ComputerUseSessionSnapshot::default() + } + /// After a successful `screenshot_display`, the model may `mouse_click` (until the pointer moves again). fn computer_use_after_screenshot(&self) {} - /// After `mouse_move` / relative pointer moves: the next `mouse_click` must be preceded by a new screenshot. + /// After `ComputerUseMousePrecise` / `ComputerUseMouseStep` / relative pointer moves: the next `mouse_click` must be preceded by a new screenshot. 
fn computer_use_after_pointer_mutation(&self) {} /// After `mouse_click`, require a fresh screenshot before the next click (unless pointer moved, which also invalidates). @@ -192,6 +302,17 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { fn last_screenshot_refinement(&self) -> Option { None } + + /// Search the frontmost app’s accessibility tree (macOS AX) for a matching control and return a stable center. + /// Default: unsupported outside the desktop host / non-macOS. + async fn locate_ui_element_screen_center( + &self, + _query: UiElementLocateQuery, + ) -> BitFunResult { + Err(BitFunError::tool( + "Native UI element (accessibility) lookup is not available on this host.".to_string(), + )) + } } /// Whether the latest screenshot JPEG was the full display, a point crop, or a quadrant-drill region. diff --git a/src/crates/core/src/agentic/tools/implementations/bash_tool.rs b/src/crates/core/src/agentic/tools/implementations/bash_tool.rs index 1481a9b7..06c2a560 100644 --- a/src/crates/core/src/agentic/tools/implementations/bash_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/bash_tool.rs @@ -254,6 +254,19 @@ Usage notes: )) } + async fn description_with_context( + &self, + context: Option<&ToolUseContext>, + ) -> BitFunResult { + let mut base = self.description().await?; + if context.and_then(|c| c.agent_type.as_deref()) == Some("Claw") { + base.push_str( + "\n\n**Claw (desktop automation):** Prefer this tool for anything achievable from the **workspace shell** (build, test, git, scripts, CLIs). On **macOS**, `open -a \"AppName\"` launches or foregrounds an app with fewer steps than GUI workflows. 
Use **`ComputerUse`** **`action: locate`** for **named** on-screen controls before guessing coordinates from **`action: screenshot`** alone.", + ); + } + Ok(base) + } + fn input_schema(&self) -> Value { json!({ "type": "object", diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs new file mode 100644 index 00000000..14c79415 --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs @@ -0,0 +1,172 @@ +//! Accessibility tree locate — invoked as `ComputerUse` **`action: "locate"`** (same tool as screenshot / keys). + +use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; +use crate::agentic::tools::computer_use_host::{ + suggested_point_crop_half_extent_from_native_bounds, UiElementLocateQuery, +}; +use crate::agentic::tools::implementations::computer_use_tool::computer_use_augment_result_json; +use crate::agentic::tools::framework::{ToolResult, ToolUseContext}; +use crate::service::config::global::GlobalConfigManager; +use crate::util::errors::{BitFunError, BitFunResult}; +use serde_json::{json, Value}; + +/// Runs native UI locate (AX / UIA / AT-SPI) for the foreground app — **`ComputerUse`** `action: "locate"`. +pub(crate) async fn execute_computer_use_locate( + input: &Value, + context: &ToolUseContext, +) -> BitFunResult> { + if context.agent_type.as_deref() != Some("Claw") { + return Err(BitFunError::tool( + "ComputerUse action locate is only available in Claw assistant mode.".to_string(), + )); + } + if context.is_remote() { + return Err(BitFunError::tool( + "ComputerUse action locate cannot run while the session workspace is remote (SSH)." 
+ .to_string(), + )); + } + if !computer_use_desktop_available() { + return Err(BitFunError::tool( + "Computer use is not available on this host.".to_string(), + )); + } + let Ok(service) = GlobalConfigManager::get_service().await else { + return Err(BitFunError::tool( + "Computer use configuration is unavailable.".to_string(), + )); + }; + let ai: crate::service::config::types::AIConfig = + service.get_config(Some("ai")).await.unwrap_or_default(); + if !ai.computer_use_enabled { + return Err(BitFunError::tool( + "Computer use is disabled in BitFun settings.".to_string(), + )); + } + + let host = context.computer_use_host.as_ref().ok_or_else(|| { + BitFunError::tool("Computer use is only available in the BitFun desktop app.".to_string()) + })?; + + let query = UiElementLocateQuery { + title_contains: input + .get("title_contains") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + role_substring: input + .get("role_substring") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + identifier_contains: input + .get("identifier_contains") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + max_depth: input + .get("max_depth") + .and_then(|v| v.as_u64()) + .map(|v| v as u32), + filter_combine: input + .get("filter_combine") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + }; + + let input_coords = json!({ + "kind": "locate", + "title_contains": query.title_contains.clone(), + "role_substring": query.role_substring.clone(), + "identifier_contains": query.identifier_contains.clone(), + "max_depth": query.max_depth, + "filter_combine": query.filter_combine.clone(), + }); + + let res = host.locate_ui_element_screen_center(query).await?; + + let native_w = res + .native_bounds_max_x + .saturating_sub(res.native_bounds_min_x) + .saturating_add(1); + let native_h = res + .native_bounds_max_y + .saturating_sub(res.native_bounds_min_y) + .saturating_add(1); + + let gx = res.global_center_x.round() as i64; + let gy = res.global_center_y.round() as i64; + 
let ncx = res.native_center_x as i64; + let ncy = res.native_center_y as i64; + + let suggested_half = suggested_point_crop_half_extent_from_native_bounds(native_w, native_h); + + let coordinate_hints = json!({ + "mouse_precise_screen": { + "tool": "ComputerUseMousePrecise", + "use_screen_coordinates": true, + "x": gx, + "y": gy, + "note": "Global display coordinates (host native units, e.g. macOS points). No prior screenshot required." + }, + "mouse_precise_image_after_full_screenshot": { + "tool": "ComputerUseMousePrecise", + "use_screen_coordinates": false, + "coordinate_mode": "image", + "x": ncx, + "y": ncy, + "note": "Use only when the last ComputerUse screenshot was full-display; x/y match margin ruler indices on that JPEG. After a point-crop screenshot, image space is the crop — do not reuse these numbers." + }, + "screenshot_point_crop": { + "tool": "ComputerUse", + "action": "screenshot", + "screenshot_crop_center_x": res.native_center_x, + "screenshot_crop_center_y": res.native_center_y, + "screenshot_crop_half_extent_native": suggested_half, + "note": "Copy **`screenshot_crop_center_*`** and **`screenshot_crop_half_extent_native`** into **`ComputerUse`** `action: \"screenshot\"`. Half-extent is derived from `native_extent_*` (tighter on small controls; host clamps)." + }, + "native_extent_px": { + "width": native_w, + "height": native_h, + "note": "Approximate control size in full-display native pixels; prefer smaller ComputerUseMouseStep pixels when width/height are small." 
+ } + }); + + let body = json!({ + "success": true, + "action": "locate", + "global_center_x": res.global_center_x, + "global_center_y": res.global_center_y, + "native_center_x": res.native_center_x, + "native_center_y": res.native_center_y, + "global_bounds_left": res.global_bounds_left, + "global_bounds_top": res.global_bounds_top, + "global_bounds_width": res.global_bounds_width, + "global_bounds_height": res.global_bounds_height, + "native_bounds_min_x": res.native_bounds_min_x, + "native_bounds_min_y": res.native_bounds_min_y, + "native_bounds_max_x": res.native_bounds_max_x, + "native_bounds_max_y": res.native_bounds_max_y, + "native_extent_width": native_w, + "native_extent_height": native_h, + "coordinate_hints": coordinate_hints, + "matched_role": res.matched_role, + "matched_title": res.matched_title, + "matched_identifier": res.matched_identifier, + "recommended_next": "Prefer **`ComputerUse`** `action: screenshot` with fields from `coordinate_hints.screenshot_point_crop` to narrow the JPEG before quadrant drill; then ComputerUseMousePrecise / ComputerUseMouseStep + ComputerUseMouseClick, or use mouse_precise_screen if no screenshot is needed yet." 
+ }); + + let body = computer_use_augment_result_json(host.as_ref(), body, Some(input_coords)).await; + + let summary = format!( + "AX match: role={} native_center=({}, {}) native_bounds=[{}..{}, {}..{}] global_center=({:.1}, {:.1})", + res.matched_role, + res.native_center_x, + res.native_center_y, + res.native_bounds_min_x, + res.native_bounds_max_x, + res.native_bounds_min_y, + res.native_bounds_max_y, + res.global_center_x, + res.global_center_y + ); + + Ok(vec![ToolResult::ok(body, Some(summary))]) +} diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_click_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_click_tool.rs new file mode 100644 index 00000000..69cb00db --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_click_tool.rs @@ -0,0 +1,102 @@ +//! Mouse button click and wheel at the current pointer (Claw Computer use). + +use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; +use crate::agentic::tools::implementations::computer_use_tool::computer_use_execute_mouse_click_tool; +use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; +use crate::service::config::global::GlobalConfigManager; +use crate::util::errors::{BitFunError, BitFunResult}; +use async_trait::async_trait; +use serde_json::{json, Value}; + +pub struct ComputerUseMouseClickTool; + +impl ComputerUseMouseClickTool { + pub fn new() -> Self { + Self + } +} + +#[async_trait] +impl Tool for ComputerUseMouseClickTool { + fn name(&self) -> &str { + "ComputerUseMouseClick" + } + + async fn description(&self) -> BitFunResult { + Ok( + "Click or scroll the **mouse wheel** at the **current** pointer (does not move the pointer). **`action`: `click`** — optional **`button`** (`left` | `right` | `middle`, default left); host enforces a fresh **fine** screenshot basis before click (same as former `ComputerUse` `click`). 
**`action`: `wheel`** — **`delta_x`** / **`delta_y`** (non-zero) for horizontal/vertical wheel ticks at the cursor (same as former `ComputerUse` `scroll`). Position the pointer first with **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** / **`ComputerUse`** `pointer_move_rel`, then **`screenshot`** before click when the host requires it." + .to_string(), + ) + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["click", "wheel"], + "description": "`click` — press a mouse button at the current pointer. `wheel` — scroll wheel at the current pointer (use delta_x/delta_y; host-dependent units)." + }, + "button": { + "type": "string", + "enum": ["left", "right", "middle"], + "description": "For `action` **click** only (default left). Ignored for `wheel`." + }, + "delta_x": { + "type": "integer", + "description": "For `action` **wheel** only: horizontal wheel delta (non-zero with delta_y or alone). Ignored for `click`." + }, + "delta_y": { + "type": "integer", + "description": "For `action` **wheel** only: vertical wheel delta (non-zero with delta_x or alone). Ignored for `click`." 
+ } + }, + "required": ["action"], + "additionalProperties": false + }) + } + + fn is_readonly(&self) -> bool { + false + } + + fn is_concurrency_safe(&self, _input: Option<&Value>) -> bool { + false + } + + fn needs_permissions(&self, _input: Option<&Value>) -> bool { + true + } + + async fn is_enabled(&self) -> bool { + if !computer_use_desktop_available() { + return false; + } + let Ok(service) = GlobalConfigManager::get_service().await else { + return false; + }; + let ai: crate::service::config::types::AIConfig = + service.get_config(Some("ai")).await.unwrap_or_default(); + ai.computer_use_enabled + } + + async fn call_impl(&self, input: &Value, context: &ToolUseContext) -> BitFunResult> { + if context.agent_type.as_deref() != Some("Claw") { + return Err(BitFunError::tool( + "ComputerUseMouseClick is only available in Claw assistant mode.".to_string(), + )); + } + if context.is_remote() { + return Err(BitFunError::tool( + "ComputerUseMouseClick cannot run while the session workspace is remote (SSH)." + .to_string(), + )); + } + let host = context.computer_use_host.as_ref().ok_or_else(|| { + BitFunError::tool("Computer use is only available in the BitFun desktop app.".to_string()) + })?; + + computer_use_execute_mouse_click_tool(host.as_ref(), input).await + } +} diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_precise_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_precise_tool.rs new file mode 100644 index 00000000..d084f547 --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_precise_tool.rs @@ -0,0 +1,97 @@ +//! Absolute pointer positioning for Claw Computer use. 
+ +use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; +use crate::agentic::tools::implementations::computer_use_tool::computer_use_execute_mouse_precise; +use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; +use crate::service::config::global::GlobalConfigManager; +use crate::util::errors::{BitFunError, BitFunResult}; +use async_trait::async_trait; +use serde_json::{json, Value}; + +pub struct ComputerUseMousePreciseTool; + +impl ComputerUseMousePreciseTool { + pub fn new() -> Self { + Self + } +} + +#[async_trait] +impl Tool for ComputerUseMousePreciseTool { + fn name(&self) -> &str { + "ComputerUseMousePrecise" + } + + async fn description(&self) -> BitFunResult { + Ok( + "Move the mouse pointer to **absolute** coordinates. Use **`coordinate_mode`** (`image` = last screenshot JPEG — **preferred for precision**; `normalized` = 0..1000 — **coarse**, avoid for fine alignment) or **`use_screen_coordinates`** for global display units. Same semantics as the former `ComputerUse` `mouse_move` absolute path. For **small** cardinal nudges, prefer **`ComputerUseMouseStep`** instead of tiny absolute x/y.".to_string(), + ) + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "x": { + "type": "integer", + "description": "Target x: in **image** mode, pixel on the latest screenshot JPEG; in **normalized**, 0..=1000 on the captured display; with **use_screen_coordinates**, global display units (host native, e.g. macOS points)." + }, + "y": { "type": "integer", "description": "Target y; same coordinate space as x." }, + "coordinate_mode": { + "type": "string", + "enum": ["image", "normalized"], + "description": "When use_screen_coordinates is false. \"image\" = pixels on the latest screenshot JPEG (use for precise moves). \"normalized\" = 0..=1000 (coarse grid only)." 
+ }, + "use_screen_coordinates": { + "type": "boolean", + "description": "If true, x/y are global display coordinates in the host's native units (on macOS: **points**)." + } + }, + "required": ["x", "y"], + "additionalProperties": false + }) + } + + fn is_readonly(&self) -> bool { + false + } + + fn is_concurrency_safe(&self, _input: Option<&Value>) -> bool { + false + } + + fn needs_permissions(&self, _input: Option<&Value>) -> bool { + true + } + + async fn is_enabled(&self) -> bool { + if !computer_use_desktop_available() { + return false; + } + let Ok(service) = GlobalConfigManager::get_service().await else { + return false; + }; + let ai: crate::service::config::types::AIConfig = + service.get_config(Some("ai")).await.unwrap_or_default(); + ai.computer_use_enabled + } + + async fn call_impl(&self, input: &Value, context: &ToolUseContext) -> BitFunResult> { + if context.agent_type.as_deref() != Some("Claw") { + return Err(BitFunError::tool( + "ComputerUseMousePrecise is only available in Claw assistant mode.".to_string(), + )); + } + if context.is_remote() { + return Err(BitFunError::tool( + "ComputerUseMousePrecise cannot run while the session workspace is remote (SSH)." + .to_string(), + )); + } + let host = context.computer_use_host.as_ref().ok_or_else(|| { + BitFunError::tool("Computer use is only available in the BitFun desktop app.".to_string()) + })?; + + computer_use_execute_mouse_precise(host.as_ref(), input).await + } +} diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_step_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_step_tool.rs new file mode 100644 index 00000000..e9112c4f --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_step_tool.rs @@ -0,0 +1,92 @@ +//! Cardinal pointer step (up/down/left/right) for Claw Computer use. 
+ +use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; +use crate::agentic::tools::implementations::computer_use_tool::computer_use_execute_mouse_step; +use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; +use crate::service::config::global::GlobalConfigManager; +use crate::util::errors::{BitFunError, BitFunResult}; +use async_trait::async_trait; +use serde_json::{json, Value}; + +pub struct ComputerUseMouseStepTool; + +impl ComputerUseMouseStepTool { + pub fn new() -> Self { + Self + } +} + +#[async_trait] +impl Tool for ComputerUseMouseStepTool { + fn name(&self) -> &str { + "ComputerUseMouseStep" + } + + async fn description(&self) -> BitFunResult { + Ok( + "Move the pointer **one cardinal step** (up / down / left / right) by **`pixels`** (default 32, clamped 1..400) in **screenshot/display pixel** space — same as the former `pointer_nudge` and relative `mouse_move_direction`. Take **`screenshot`** first so the host can convert scale (especially on macOS). For arbitrary deltas including diagonals, use **`ComputerUse`** **`pointer_move_rel`**.".to_string(), + ) + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "direction": { + "type": "string", + "enum": ["up", "down", "left", "right"], + "description": "Cardinal direction for the step." + }, + "pixels": { + "type": "integer", + "description": "Distance in screenshot/display pixels (default 32, clamped 1..400). Use smaller values (e.g. 8–24) for fine alignment." 
+ } + }, + "required": ["direction"], + "additionalProperties": false + }) + } + + fn is_readonly(&self) -> bool { + false + } + + fn is_concurrency_safe(&self, _input: Option<&Value>) -> bool { + false + } + + fn needs_permissions(&self, _input: Option<&Value>) -> bool { + true + } + + async fn is_enabled(&self) -> bool { + if !computer_use_desktop_available() { + return false; + } + let Ok(service) = GlobalConfigManager::get_service().await else { + return false; + }; + let ai: crate::service::config::types::AIConfig = + service.get_config(Some("ai")).await.unwrap_or_default(); + ai.computer_use_enabled + } + + async fn call_impl(&self, input: &Value, context: &ToolUseContext) -> BitFunResult> { + if context.agent_type.as_deref() != Some("Claw") { + return Err(BitFunError::tool( + "ComputerUseMouseStep is only available in Claw assistant mode.".to_string(), + )); + } + if context.is_remote() { + return Err(BitFunError::tool( + "ComputerUseMouseStep cannot run while the session workspace is remote (SSH)." + .to_string(), + )); + } + let host = context.computer_use_host.as_ref().ok_or_else(|| { + BitFunError::tool("Computer use is only available in the BitFun desktop app.".to_string()) + })?; + + computer_use_execute_mouse_step(host.as_ref(), input).await + } +} diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs index c8488632..193e5ad0 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs @@ -1,9 +1,11 @@ //! Desktop automation for Claw (Computer use). 
+use super::computer_use_locate::execute_computer_use_locate; use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; use crate::agentic::tools::computer_use_host::{ ComputerScreenshot, ComputerUseNavigateQuadrant, ComputerUseScreenshotParams, ComputerUseScreenshotRefinement, ScreenshotCropCenter, + COMPUTER_USE_POINT_CROP_HALF_MAX, COMPUTER_USE_POINT_CROP_HALF_MIN, COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE, COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX, }; use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; @@ -15,6 +17,26 @@ use base64::{engine::general_purpose::STANDARD as B64, Engine as _}; use log::{debug, warn}; use serde_json::{json, Value}; +/// Merges [`ComputerUseHost::computer_use_session_snapshot`] + optional `input_coordinates` into tool JSON. +pub(crate) async fn computer_use_augment_result_json( + host: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + mut body: Value, + input_coordinates: Option, +) -> Value { + let snap = host.computer_use_session_snapshot().await; + if let Value::Object(map) = &mut body { + map.insert( + "computer_use_context".to_string(), + json!({ + "foreground_application": snap.foreground_application, + "pointer_global": snap.pointer_global, + "input_coordinates": input_coordinates, + }), + ); + } + body +} + /// On-disk copy of each Computer use screenshot (pointer overlay included) for debugging. /// Filenames: `cu__full.jpg` (whole display) or `cu__crop__.jpg` when a point crop was requested. const COMPUTER_USE_DEBUG_SUBDIR: &str = ".bitfun/computer_use_debug"; @@ -164,7 +186,7 @@ impl ComputerUseTool { ) -> BitFunResult<(Value, ToolImageAttachment, String)> { let b64 = B64.encode(&shot.bytes); let pointer_marker_note = match (shot.pointer_image_x, shot.pointer_image_y) { - (Some(_), Some(_)) => "The JPEG includes a **synthetic red cursor with gray border** marking the **actual mouse position** on this bitmap (not the OS arrow). 
The **tip** is the true click hotspot (same pixel as pointer_image_x and pointer_image_y). Use this marker and those numbers for mouse_move — do not ignore them or guess from the OS cursor alone.", + (Some(_), Some(_)) => "The JPEG includes a **synthetic red cursor with gray border** marking the **actual mouse position** on this bitmap (not the OS arrow). The **tip** is the true click hotspot (same pixel as pointer_image_x and pointer_image_y). Use this marker and those numbers for **ComputerUseMousePrecise** — do not ignore them or guess from the OS cursor alone.", _ => "No pointer overlay in this JPEG (pointer_image_x/y null): the cursor is not on this bitmap (e.g. another display). Do not infer position from the image; use global screen coordinates + use_screen_coordinates, or move the pointer onto this display and screenshot again.", }; let mut data = json!({ @@ -183,27 +205,45 @@ impl ComputerUseTool { "pointer_image_y": shot.pointer_image_y, "pointer_marker": pointer_marker_note, "screenshot_crop_center": shot.screenshot_crop_center, + "point_crop_half_extent_native": shot.point_crop_half_extent_native, "navigation_native_rect": shot.navigation_native_rect, "quadrant_navigation_click_ready": shot.quadrant_navigation_click_ready, "debug_screenshot_path": debug_rel, }); let shortcut_policy = format!( - "**First:** `key_chord` for shortcuts **and** system clipboard (copy/cut/paste/select-all per host OS) — avoid Edit-menu clicks and avoid long `type_text` when paste fits. **Then** pointer when shortcuts do not fit (then screenshot). **Default for click prep:** after a full-frame shot, chain `screenshot` + `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` (long edge < {} px). **Do not** skip to `screenshot_crop_center_*` from full screen unless justified. **Quadrant narrowing is never automatic:** each drill step must set `screenshot_navigate_quadrant` on that `screenshot` call; a bare `screenshot` only refreshes. 
Point crop (~500×500) is a **fallback**. **Small pointer tweaks:** prefer `mouse_move` with **`mouse_move_direction`** (+ optional `mouse_move_relative_pixels`) over absolute `x`/`y` — easier for vision models than sub-pixel absolute coords. Fresh screenshot before key_chord that sends Return/Enter.", + "**First:** `key_chord` for shortcuts **and** system clipboard (copy/cut/paste/select-all per host OS) — avoid Edit-menu clicks and avoid long `type_text` when paste fits. **Then** pointer when shortcuts do not fit (then screenshot **only** when you need pixels or before host-guarded click/Enter). **Default for click prep:** after a full-frame shot, chain `screenshot` + `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` (long edge < {} px). **Do not** skip to `screenshot_crop_center_*` from full screen unless justified. **Quadrant narrowing is never automatic:** each drill step must set `screenshot_navigate_quadrant` on that `screenshot` call; a bare `screenshot` only refreshes. Point crop (~500×500) is a **fallback**. **Small pointer tweaks:** prefer **ComputerUseMouseStep** (`direction` + optional `pixels`) over tiny absolute **ComputerUseMousePrecise** `x`/`y` — easier for vision models than sub-pixel absolute coords. 
**Do not** screenshot after every `locate` or non-Enter `key_chord`; **fresh** screenshot **before** `key_chord` that sends Return/Enter (host) and before **click** (host).", COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE ); + let region_crop_size_note = shot + .point_crop_half_extent_native + .map(|h| { + let edge = h.saturating_mul(2); + format!( + "Crop frame (~{}×{} native, half-extent {} px; clamped {}..{}): ", + edge, + edge, + h, + COMPUTER_USE_POINT_CROP_HALF_MIN, + COMPUTER_USE_POINT_CROP_HALF_MAX + ) + }) + .unwrap_or_else(|| "Crop frame (~500×500 native, half-extent 250 px): ".to_string()); let hierarchical_navigation = if shot.screenshot_crop_center.is_some() { json!({ "phase": "region_crop", "image_is_crop_only": true, "shortcut_policy": shortcut_policy, - "instruction": "Crop frame (~500×500): **margin ruler numbers** are **full-capture native** indices (same whole-screen bitmap space as a full-screen shot — not 0..500 local). `coordinate_mode` \"image\" uses **this JPEG’s** pixel grid (content area under the rulers). For another view, call screenshot with new `screenshot_crop_center_*` in that same full-capture space. See shortcut_policy." + "instruction": format!( + "{}**margin ruler numbers** are **full-capture native** indices (same whole-screen bitmap space as a full-screen shot — not local 0..crop). `coordinate_mode` \"image\" uses **this JPEG’s** pixel grid (content area under the rulers). For another view, call screenshot with new `screenshot_crop_center_*` in that same full-capture space; optional `screenshot_crop_half_extent_native` adjusts crop size. See shortcut_policy.", + region_crop_size_note + ) }) } else if shot.quadrant_navigation_click_ready { json!({ "phase": "quadrant_terminal", "image_is_crop_only": true, "shortcut_policy": shortcut_policy, - "instruction": "Region is small enough for precise pointer: **`quadrant_navigation_click_ready`** is true. 
For **small** alignment fixes, prefer **`mouse_move`** with **`mouse_move_direction`** (`up`/`down`/`left`/`right`) and **`mouse_move_relative_pixels`**; use absolute `x`/`y` only for larger jumps. Then `click` (no extra point crop required). After pointer moves, screenshot again before the next click (host)." + "instruction": "Region is small enough for precise pointer: **`quadrant_navigation_click_ready`** is true. For **small** alignment fixes, prefer **`ComputerUseMouseStep`** (`direction`, optional `pixels`); use **`ComputerUseMousePrecise`** absolute `x`/`y` only for larger jumps. Then **`ComputerUseMouseClick`** (`action`: click) (no extra point crop required). After pointer moves, screenshot again before the next click (host)." }) } else if !Self::shot_covers_full_display(shot) { json!({ @@ -220,9 +260,9 @@ impl ComputerUseTool { "phase": "full_display", "image_is_crop_only": false, "host_auto_quadrant": false, - "next_step_for_mouse_click": "**Preferred (A):** next tool call = `screenshot` **with** `screenshot_navigate_quadrant` set (top_left|top_right|bottom_left|bottom_right). Repeat until `quadrant_navigation_click_ready`. **Fallback (B):** `screenshot` with `screenshot_crop_center_x/y` only when quadrant drill is a poor fit. The host never splits the screen unless you pass `screenshot_navigate_quadrant`.", + "next_step_for_mouse_click": "**Preferred (0):** If **`ComputerUse`** **`action: locate`** can match the control, use **`screenshot_crop_center_*`** (+ optional **`screenshot_crop_half_extent_native`**) to **narrow the JPEG** before the quadrant drill. **Preferred (A):** next tool call = `screenshot` **with** `screenshot_navigate_quadrant` set (top_left|top_right|bottom_left|bottom_right). Repeat until `quadrant_navigation_click_ready`. **Fallback (B):** `screenshot` with `screenshot_crop_center_x/y` when quadrant drill is a poor fit. 
The host never splits the screen unless you pass `screenshot_navigate_quadrant`.", "shortcut_policy": shortcut_policy, - "instruction": "Full frame: ruler indices are **full-display native** pixels. **You should start quadrant drill now:** next `screenshot` **must** include **`screenshot_navigate_quadrant`** (pick the quadrant that contains your click target). Repeat one quadrant per call until `quadrant_navigation_click_ready`, then `mouse_move` + `click`. Point crop is **secondary**. **`click` is rejected** on full-screen-only. See `next_step_for_mouse_click`, `recommended_next_for_click_targeting`, shortcut_policy." + "instruction": "Full frame: ruler indices are **full-display native** pixels. **If DOM/AX can locate the target:** use `screenshot_crop_center_*` (+ optional `screenshot_crop_half_extent_native`) first — **before** a long quadrant-only chain. **Otherwise** start quadrant drill: next `screenshot` **must** include **`screenshot_navigate_quadrant`**. Repeat one quadrant per call until `quadrant_navigation_click_ready`, then **ComputerUseMousePrecise** / **ComputerUseMouseStep** + **`ComputerUseMouseClick`** (`action`: click). **`ComputerUseMouseClick` (click) is rejected** on full-screen-only. See `next_step_for_mouse_click`, `recommended_next_for_click_targeting`, shortcut_policy." }) }; if let Some(obj) = data.as_object_mut() { @@ -248,7 +288,7 @@ impl ComputerUseTool { }; let pointer_line = match (shot.pointer_image_x, shot.pointer_image_y) { (Some(px), Some(py)) => format!( - " TRUE POINTER: **red cursor with gray border** (tip = hotspot) in the JPEG marks the mouse at this pixel — coordinate_mode \"image\" mouse_move target x={}, y={}. Align moves so the **tip** sits on your click target, then click. 
Prior screenshot is stale after mouse_move/pointer_* until you screenshot again.", + " TRUE POINTER: **red cursor with gray border** (tip = hotspot) in the JPEG marks the mouse at this pixel — coordinate_mode \"image\" **ComputerUseMousePrecise** target x={}, y={}. Align moves so the **tip** sits on your click target, then **ComputerUseMouseClick** (`action`: click). Prior screenshot is stale after **ComputerUseMousePrecise** / **ComputerUseMouseStep** / `pointer_move_rel` until you screenshot again.", px, py ), _ => " TRUE POINTER: not on this capture (pointer_image_x/y null). No red synthetic cursor — OS mouse may be on another display; use use_screen_coordinates with global coords or bring the pointer here and re-screenshot." @@ -275,7 +315,7 @@ impl ComputerUseTool { ) } else if shot.quadrant_navigation_click_ready { format!( - "Quadrant terminal {}x{} (native region {:?}). **`quadrant_navigation_click_ready`**: use `image` coords on this JPEG, then `mouse_move` + `click`.{}.{}", + "Quadrant terminal {}x{} (native region {:?}). **`quadrant_navigation_click_ready`**: use `image` coords on this JPEG, then **ComputerUseMousePrecise** / **ComputerUseMouseStep** + **`ComputerUseMouseClick`** (`action`: click).{}.{}", shot.image_width, shot.image_height, shot.navigation_native_rect, @@ -362,6 +402,25 @@ impl ComputerUseTool { } } + /// Optional half-extent for point crop (native px); host clamps to [COMPUTER_USE_POINT_CROP_HALF_MIN, MAX]. 
+ fn parse_screenshot_crop_half_extent_native(input: &Value) -> BitFunResult> { + match input.get("screenshot_crop_half_extent_native") { + None => Ok(None), + Some(v) if v.is_null() => Ok(None), + Some(v) => { + let n = v.as_u64().ok_or_else(|| { + BitFunError::tool( + "screenshot_crop_half_extent_native must be a non-negative integer.".to_string(), + ) + })?; + let n = u32::try_from(n).map_err(|_| { + BitFunError::tool("screenshot_crop_half_extent_native is too large.".to_string()) + })?; + Ok(Some(n)) + } + } + } + /// True if the client sent non-null `screenshot_crop_center_x` and/or `y` (often `0` placeholders). fn input_has_screenshot_crop_fields(input: &Value) -> bool { let x = input.get("screenshot_crop_center_x"); @@ -406,54 +465,227 @@ impl ComputerUseTool { crop_center: None, navigate_quadrant: navigate, reset_navigation, + point_crop_half_extent_native: None, }, ignored_crop, )); } let crop = Self::parse_screenshot_crop_center(input)?; + let half = if crop.is_some() { + Self::parse_screenshot_crop_half_extent_native(input)? + } else { + None + }; Ok(( ComputerUseScreenshotParams { crop_center: crop, navigate_quadrant: None, reset_navigation, + point_crop_half_extent_native: half, }, false, )) } - /// Cardinal relative move for `action: mouse_move`. Same pixel space as `pointer_nudge` / `pointer_move_rel`. - fn parse_mouse_move_cardinal(input: &Value) -> BitFunResult> { - let dir_val = match input.get("mouse_move_direction") { - None => return Ok(None), - Some(v) if v.is_null() => return Ok(None), - Some(v) => v, - }; - let dir = dir_val.as_str().ok_or_else(|| { +} + +/// JSON for `snapshot_coordinate_basis` in mouse tool results (last screenshot refinement). 
+fn computer_use_snapshot_coordinate_basis( + host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, +) -> serde_json::Value { + let last_ref = host_ref.last_screenshot_refinement(); + match last_ref { + None => serde_json::Value::Null, + Some(ComputerUseScreenshotRefinement::FullDisplay) => json!("full_display"), + Some(ComputerUseScreenshotRefinement::RegionAroundPoint { + center_x, + center_y, + }) => { + json!({ + "region_crop_center_full_display_native": { "x": center_x, "y": center_y } + }) + } + Some(ComputerUseScreenshotRefinement::QuadrantNavigation { + x0, + y0, + width, + height, + click_ready, + }) => { + json!({ + "quadrant_native_rect": { "x0": x0, "y0": y0, "w": width, "h": height }, + "quadrant_navigation_click_ready": click_ready, + }) + } + } +} + +/// Absolute pointer move (`ComputerUseMousePrecise` tool). +pub(crate) async fn computer_use_execute_mouse_precise( + host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + input: &Value, +) -> BitFunResult> { + let snapshot_basis = computer_use_snapshot_coordinate_basis(host_ref); + let x = req_i32(input, "x")?; + let y = req_i32(input, "y")?; + let mode = ComputerUseTool::coordinate_mode(input); + let use_screen = ComputerUseTool::use_screen_coordinates(input); + let (sx64, sy64) = ComputerUseTool::resolve_xy_f64(host_ref, input, x, y)?; + host_ref.mouse_move_global_f64(sx64, sy64).await?; + let sx = sx64.round() as i32; + let sy = sy64.round() as i32; + let input_coords = json!({ + "kind": "mouse_precise", + "raw": { "x": x, "y": y, "coordinate_mode": mode, "use_screen_coordinates": use_screen }, + "resolved_global": { "x": sx64, "y": sy64 } + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "tool": "ComputerUseMousePrecise", + "positioning": "absolute", + "x": x, + "y": y, + "pointer_x": sx, + "pointer_y": sy, + "coordinate_mode": mode, + "use_screen_coordinates": use_screen, + "snapshot_coordinate_basis": 
snapshot_basis, + }), + Some(input_coords), + ) + .await; + let summary = format!( + "Moved pointer to global screen (~{}, ~{}, sub-point on macOS) (input {:?} {}, {}).", + sx, sy, mode, x, y + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) +} + +/// Cardinal step move (`ComputerUseMouseStep` tool). Same pixel space as `pointer_move_rel`. +pub(crate) async fn computer_use_execute_mouse_step( + host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + input: &Value, +) -> BitFunResult> { + let dir = input + .get("direction") + .and_then(|v| v.as_str()) + .ok_or_else(|| { BitFunError::tool( - "mouse_move_direction must be a string: up, down, left, or right.".to_string(), + "direction is required for ComputerUseMouseStep (up|down|left|right)".to_string(), ) })?; - let px = input - .get("mouse_move_relative_pixels") - .and_then(|v| v.as_i64()) - .map(|v| v as i32) - .unwrap_or(32) - .clamp(1, 400); - let norm = dir.trim().to_ascii_lowercase().replace('-', "_"); - let (dx, dy, label) = match norm.as_str() { - "up" => (0, -px, "up"), - "down" => (0, px, "down"), - "left" => (-px, 0, "left"), - "right" => (px, 0, "right"), - _ => { + let px = input + .get("pixels") + .and_then(|v| v.as_i64()) + .map(|v| v as i32) + .unwrap_or(32) + .clamp(1, 400); + let (dx, dy) = match dir.to_lowercase().as_str() { + "up" => (0, -px), + "down" => (0, px), + "left" => (-px, 0), + "right" => (px, 0), + _ => { + return Err(BitFunError::tool( + "direction must be up, down, left, or right".to_string(), + )); + } + }; + host_ref.pointer_move_relative(dx, dy).await?; + let input_coords = json!({ + "kind": "mouse_step", + "direction": dir, + "pixels": px, + "delta_x": dx, + "delta_y": dy + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "tool": "ComputerUseMouseStep", + "direction": dir, + "pixels": px, + "delta_x": dx, + "delta_y": dy, + }), + Some(input_coords), + ) + .await; + let summary = format!( + "Stepped pointer by ({}, 
{}) px (direction {}, {} px).", + dx, dy, dir, px + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) +} + +/// Click and mouse-wheel at the **current** pointer (`ComputerUseMouseClick` tool). +pub(crate) async fn computer_use_execute_mouse_click_tool( + host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + input: &Value, +) -> BitFunResult> { + let act = input + .get("action") + .and_then(|v| v.as_str()) + .ok_or_else(|| BitFunError::tool("action is required (click or wheel)".to_string()))?; + match act { + "click" => { + let button = input + .get("button") + .and_then(|v| v.as_str()) + .unwrap_or("left"); + host_ref.mouse_click(button).await?; + let input_coords = json!({ "kind": "mouse_click", "action": "click", "button": button }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "tool": "ComputerUseMouseClick", + "action": "click", + "button": button, + }), + Some(input_coords), + ) + .await; + let summary = format!("{} click at current pointer (does not move).", button); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + "wheel" => { + let dx = input.get("delta_x").and_then(|v| v.as_i64()).unwrap_or(0) as i32; + let dy = input.get("delta_y").and_then(|v| v.as_i64()).unwrap_or(0) as i32; + if dx == 0 && dy == 0 { return Err(BitFunError::tool( - "mouse_move_direction must be one of: up, down, left, right.".to_string(), + "wheel requires non-zero delta_x and/or delta_y".to_string(), )); } - }; - Ok(Some((dx, dy, label.to_string(), px))) + host_ref.scroll(dx, dy).await?; + let input_coords = json!({ + "kind": "mouse_click", + "action": "wheel", + "delta_x": dx, + "delta_y": dy + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "tool": "ComputerUseMouseClick", + "action": "wheel", + "delta_x": dx, + "delta_y": dy, + }), + Some(input_coords), + ) + .await; + let summary = format!("Mouse wheel at pointer: delta ({}, {}).", dx, dy); + 
Ok(vec![ToolResult::ok(body, Some(summary))]) + } + _ => Err(BitFunError::tool( + "ComputerUseMouseClick action must be \"click\" or \"wheel\"".to_string(), + )), } - } #[async_trait] @@ -465,27 +697,45 @@ impl Tool for ComputerUseTool { async fn description(&self) -> BitFunResult { let os = Self::host_os_label(); let keys = Self::key_chord_os_hint(); + let hmin = COMPUTER_USE_POINT_CROP_HALF_MIN; + let hmax = COMPUTER_USE_POINT_CROP_HALF_MAX; Ok(format!( "Desktop Computer use (host OS: {}). {} \ -**Automation priority (read order):** (1) **`key_chord`** — OS/app shortcuts **and** **system clipboard** (select all / copy / cut / paste via the host’s real modifier keys — see hint below). Prefer **paste** over **`type_text`** for long or duplicated content and whenever the user expects clipboard behavior; do **not** drive the mouse to Edit → Copy/Paste when chords exist. (2) **`type_text`** — short input, paste-blocked fields, or after chords failed. (3) **Mouse** (`screenshot` drill + `mouse_move` + `click`) — only when shortcuts and clipboard do not apply or after they failed. \ +**Automation priority (read order — same as Claw `claw_mode` “Computer use”):** (1) **Terminal** — **`Bash`** / **`TerminalControl`** — workspace shell; on **macOS** use **`open -a \"AppName\"`** to launch/focus apps (e.g. WeChat) **instead of** Spotlight+Return when possible (do **not** assume “computer use” = only `ComputerUse*` tools). (2) **System shortcuts** — **`key_chord`** for OS-wide actions and **system clipboard** (see hint below). (3) **Application shortcuts** — **`key_chord`** when the right app is focused. 
(4) **This tool — `action: locate`** — **named** controls in the **foreground** app (`AX` / UIA / AT-SPI); when it matches, you may **move** with **`coordinate_hints`** **without** an immediate full-frame **`screenshot`**; use **`action: screenshot`** with **`screenshot_crop_center_*`** / **`screenshot_crop_half_extent_native`** **when** you need a JPEG for vision (host clamps {}..{} per half). (5) **`type_text`** — short input, paste-blocked fields, or after the above failed. (6) **Vision / mouse** — only when (1)–(4) do not suffice. Prefer **paste** over **`type_text`** for long or duplicated content; do **not** drive the mouse to Edit → Copy/Paste when chords exist. **Do not** spam **`screenshot`** between unrelated actions — host mainly requires fresh capture before **click** and **Return/Enter**. \ **`screenshot` image layout (read this):** Every **`screenshot`** returns a JPEG with **white margins on all four sides** showing **numeric coordinate tick labels** (full-capture native pixel indices — the same scale on full-screen and point-crop shots), and a **line grid** drawn on the captured desktop **inside** those margins. Read x/y from the **top/bottom/left/right** margin numbers to aim moves and for **point crop** (`screenshot_crop_center_*`) when that path is justified. The inner bitmap (below the rulers) is the live capture. \ -**Default before `click` (mouse path):** After the **first** full **`screenshot`**, **your next screenshot call should set `screenshot_navigate_quadrant`** (one of `top_left`, `top_right`, `bottom_left`, `bottom_right`) — **do not** jump straight to **`screenshot_crop_center_*`** unless the target is already huge on screen or you have a stable native center from rulers. Chain **`screenshot` + `screenshot_navigate_quadrant`** until **`quadrant_navigation_click_ready`: true** in the tool JSON, then **`mouse_move`** + **`click`**. Tool results may include **`recommended_next_for_click_targeting`** — obey it. 
\ +**Default before `ComputerUseMouseClick` (`action`: click) (mouse path):** After the **first** full **`screenshot`**, **if `action: locate` gave a native center:** use **`screenshot`** with **`screenshot_crop_center_*`** (+ optional **`screenshot_crop_half_extent_native`**) to narrow the view **first**. **Else** set **`screenshot_navigate_quadrant`** (one of `top_left`, `top_right`, `bottom_left`, `bottom_right`) on the next **`screenshot`** — **do not** refresh full screen repeatedly without `screenshot_navigate_quadrant` or a point crop. Chain **`screenshot` + `screenshot_navigate_quadrant`** until **`quadrant_navigation_click_ready`: true** in the tool JSON, then **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick`**. Tool results may include **`recommended_next_for_click_targeting`** — obey it. \ **Shortcut-first (default):** When a **standard OS or in-app shortcut** or **clipboard chord** achieves the same step (e.g. New/Open/Save, Copy/Cut/Paste, Undo/Redo, Find, Close tab/window, Quit, Refresh, tab/window switch, focus address bar, select all), you **must prefer `key_chord`** over moving the pointer and clicking — **do not** default to mouse for actions that have a well-known chord on this host. Use pointer + screenshots when **no** suitable shortcut exists, the target is only reachable by mouse, menus show no shortcut, or a shortcut attempt clearly failed (then **screenshot** and reassess). \ -After `key_chord`, `type_text`, or `scroll`, when the **next step depends on what is on screen**, run **`screenshot`** (optionally `wait` ms first) and verify — do not chain many shortcuts without a screenshot when failure would mislead. 
\ -**No blind submit or click (unchanged):** before **`click`** (any button) and before **`key_chord` that sends Return/Enter** (or any key that submits/confirms), you **must** run **`screenshot` first** and visually confirm focus and target — **never** click or press Enter without a fresh screenshot when the outcome matters. Same discipline after moving the pointer. \ -**Quadrant drill (default zoom for precision; not automatic):** The app **never** splits the screen by itself. After an initial full **`screenshot`**, **each** narrowing step must be **`screenshot` + `screenshot_navigate_quadrant`** ∈ {{`top_left`,`top_right`,`bottom_left`,`bottom_right`}} — omitting that field only **refreshes** full screen (or the current drill region). The host returns the chosen quarter **plus {} px on each side** (clamped); rulers stay **full-display native**. Repeat until **`quadrant_navigation_click_ready`: true** (longest native side < {} px), then **`mouse_move`** and **`click`**. **`screenshot_reset_navigation`**: true restarts from full display. **If `screenshot_navigate_quadrant` is set, `screenshot_crop_center_*` are ignored**. **Point crop** (`screenshot_crop_center_*` only, no `screenshot_navigate_quadrant`) is a **fallback** when quadrant drill is a poor fit. \ -**Screenshot zoom:** When you must **confirm** small text, dense UI, or the **red cursor** tip, **proactively** zoom — **prefer quadrant drill**; use point crop only when justified — **do not** rely only on huge full-display images when a smaller view answers the question. \ -`mouse_move`: **absolute** `x`/`y` (`coordinate_mode` / `use_screen_coordinates`) **or** **relative cardinal** `mouse_move_direction` (`up`|`down`|`left`|`right`) with optional `mouse_move_relative_pixels` (default 32, same screenshot-pixel space as `pointer_nudge`). If `mouse_move_direction` is set, `x`/`y` are ignored. 
**Recommendation:** for **small** pointer adjustments (nudging the red tip onto a control), **prefer relative** `mouse_move_direction` over guessing absolute `x`/`y` — usually more reliable for vision models. `click` only at current pointer (optional button), never moves. \ -**Host (desktop):** Call **`screenshot`** when you need current pixels; there is **no** automatic follow-up capture after other actions. Before **`click`**, after pointer moves, the host requires a fresh **fine** basis: **`quadrant_navigation_click_ready`** (preferred path) **or** a **point crop** — **full-screen-only** is **not** enough. Before **`key_chord`** with **Return/Enter**, a fresh **`screenshot`** (any mode) is required. Numeric fields in each tool result JSON are authoritative for that frame. \ -`pointer_nudge` / `pointer_move_rel` for relative screen-pixel moves (same idea; `mouse_move` can cover cardinal nudges in one call). \ -Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), **grid on the capture**, and a **synthetic mouse marker** when the pointer is on that display (**red** with **gray border**; **tip** = hotspot, same as **`pointer_image_x` / `pointer_image_y`**). On macOS, `mouse_move` uses sub-point Quartz when applicable. Also **wait**. **Per `action`:** send **only** the parameters that apply (e.g. for `screenshot` do not send `keys`, `button`, `x`/`y` for `mouse_move`, etc.) — extra keys may confuse you or the UI. macOS: Accessibility for the running binary.", +**Between non-click steps:** **`computer_use_context`** often suffices; add **`screenshot`** when you need pixels or before **click / Enter** per host rules — **not** after every `key_chord` / `type_text` / `locate`. 
\ +**No blind submit or click (unchanged):** before **`ComputerUseMouseClick` (`action`: click)** (any button) and before **`key_chord` that sends Return/Enter** (or any key that submits/confirms), you **must** run **`screenshot` first** and visually confirm focus and target — **never** click or press Enter without a fresh screenshot when the outcome matters. Same discipline after moving the pointer. \ +**Quadrant drill (vision zoom; not automatic):** The app **never** splits the screen by itself. After an initial full **`screenshot`**, **when DOM is unavailable**, **each** narrowing step is **`screenshot` + `screenshot_navigate_quadrant`** ∈ {{`top_left`,`top_right`,`bottom_left`,`bottom_right`}} — omitting that field only **refreshes** full screen (or the current drill region). The host returns the chosen quarter **plus {} px on each side** (clamped); rulers stay **full-display native**. Repeat until **`quadrant_navigation_click_ready`: true** (longest native side < {} px), then **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** and **`ComputerUseMouseClick` (`action`: click)**. **`screenshot_reset_navigation`**: true restarts from full display. **If `screenshot_navigate_quadrant` is set, `screenshot_crop_center_*` are ignored**. **Point crop** (`screenshot_crop_center_*` ± optional half-extent) is **preferred when DOM supplies `native_center_*`**; otherwise use quadrant drill. \ +**Screenshot zoom:** When you must **confirm** small text, dense UI, or the **red cursor** tip, **proactively** zoom — **DOM + point crop** when possible; else quadrant drill — **do not** rely only on huge full-display images when a smaller view answers the question. \ +**Pointer positioning (separate tools):** **`ComputerUseMousePrecise`** — absolute `x`/`y` with `coordinate_mode` / `use_screen_coordinates`. 
**`ComputerUseMouseStep`** — cardinal `direction` (`up`|`down`|`left`|`right`) and optional `pixels` (default 32, clamped 1..400; same screenshot-pixel space as `pointer_move_rel`). For **small** nudges onto a control, prefer **`ComputerUseMouseStep`** over tiny absolute coords. **`pointer_move_rel`** — arbitrary `delta_x`/`delta_y` when diagonal or non-cardinal deltas are needed. **`ComputerUseMouseClick`** — `action` **`click`** (button at pointer) or **`wheel`** (scroll wheel `delta_x`/`delta_y` at pointer); does not move the pointer. \ +**Host (desktop):** Call **`screenshot`** when you need current pixels; there is **no** automatic follow-up capture after other actions. Before **`ComputerUseMouseClick` (`action`: click)**, after pointer moves, the host requires a fresh **fine** basis: **`quadrant_navigation_click_ready`** (preferred path) **or** a **point crop** — **full-screen-only** is **not** enough. Before **`key_chord`** with **Return/Enter**, a fresh **`screenshot`** (any mode) is required. Numeric fields in each tool result JSON are authoritative for that frame. \ +Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), **grid on the capture**, and a **synthetic mouse marker** when the pointer is on that display (**red** with **gray border**; **tip** = hotspot, same as **`pointer_image_x` / `pointer_image_y`**). On macOS, **`ComputerUseMousePrecise`** uses sub-point Quartz when applicable. Also **wait**. **Per `action`:** send **only** the parameters that apply (e.g. for `screenshot` do not send `keys` or fields meant for **`ComputerUseMousePrecise`**) — extra keys may confuse you or the UI. 
macOS: Accessibility for the running binary.", os, keys, + hmin, + hmax, COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX, COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE )) } + async fn description_with_context( + &self, + context: Option<&ToolUseContext>, + ) -> BitFunResult { + let base = self.description().await?; + if context.and_then(|c| c.agent_type.as_deref()) == Some("Claw") { + Ok(format!( + "**Claw:** **`action: locate`** (accessibility) is the same tool as **`screenshot`** / **`key_chord`**. Use **`locate`** for **named** UI when AX exposes it; **do not** call **`screenshot`** after every **`locate`** / **`key_chord`** / **`type_text`** — only when you need pixels, or before **click** / **Return·Enter** (host). See `claw_mode` **Screenshot cadence**.\n\n{}", + base + )) + } else { + Ok(base) + } + } + fn input_schema(&self) -> Value { let qpad = COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX; json!({ @@ -493,54 +743,56 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** "properties": { "action": { "type": "string", - "enum": ["screenshot", "mouse_move", "click", "pointer_nudge", "pointer_move_rel", "scroll", "key_chord", "type_text", "wait"], - "description": format!("**Before any mouse drill:** try `key_chord` (shortcuts + clipboard copy/cut/paste/select-all per OS). **action `screenshot`:** JPEG with **margin coordinate scales** + **grid** (**full-capture native** indices). **After a full-screen capture, the usual next step before click is (2):** **`screenshot_navigate_quadrant`** — 4-way drill; chosen quadrant **plus {} px per side** (clamped). Repeat until tool JSON `quadrant_navigation_click_ready`. **Modes:** (1) Plain / refresh — same region or full display (no narrowing). (2) **`screenshot_navigate_quadrant`** — **default zoom path** for mouse clicks. (3) **`screenshot_reset_navigation`**: true — full display base. (4) **`screenshot_crop_center_*`** — ~500×500 point crop (**fallback**, not the default from full screen). 
**Precedence:** if `screenshot_navigate_quadrant` is set, **`screenshot_crop_center_*` are ignored**. **Prefer** sending **only** fields relevant to `screenshot` for this call. When **`quadrant_navigation_click_ready`** is true, you may **`mouse_move` + `click`**. **Other actions:** `key_chord` + clipboard before `type_text`; red synthetic cursor when the mouse is on this display.", qpad) + "enum": ["screenshot", "locate", "pointer_move_rel", "key_chord", "type_text", "wait"], + "description": format!("**Same tool, different `action`:** **`locate`** — accessibility tree match on the **foreground** window (JSON only, no JPEG); use **`title_contains`** / **`role_substring`** / **`identifier_contains`** and optional **`filter_combine`**: **`all`** (default, AND) or **`any`** (OR) when one node has role but not title. **Before** ruler-only **`screenshot`** for named rows/buttons. **`screenshot`** — JPEG with **margin coordinate scales** + **grid**. **After `locate` matched:** prefer **`screenshot_crop_center_*`** + optional **`screenshot_crop_half_extent_native`** from the locate result **before** a long quadrant-only chain. **`key_chord`** — shortcuts + clipboard. **Pointer moves:** **`ComputerUseMousePrecise`**, **`ComputerUseMouseStep`**. **Click / wheel:** **`ComputerUseMouseClick`**. **When locate did not match:** **`screenshot_navigate_quadrant`** — 4-way drill; chosen quadrant **plus {} px per side** (clamped). Repeat until tool JSON `quadrant_navigation_click_ready`. **Modes:** (1) Plain / refresh — same region or full display (no narrowing). (2) **`screenshot_navigate_quadrant`**. (3) **`screenshot_reset_navigation`**: true — full display base. (4) **`screenshot_crop_center_*`** ± **`screenshot_crop_half_extent_native`** — point crop. **Precedence:** if `screenshot_navigate_quadrant` is set, **`screenshot_crop_center_*` are ignored**. **Prefer** sending **only** fields relevant to `screenshot` for this call. 
When **`quadrant_navigation_click_ready`** is true, you may **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick`**. **Other actions:** `key_chord` + clipboard before `type_text`; red synthetic cursor when the mouse is on this display.", qpad) }, - "x": { "type": "integer", "description": "For mouse_move **absolute** mode only (omit when `mouse_move_direction` is set): pixel in **image** mode uses the last screenshot JPEG grid; **normalized** 0..=1000 on the captured display; or **use_screen_coordinates** for global px. **Small moves:** prefer `mouse_move_direction` + `mouse_move_relative_pixels` instead of tiny absolute deltas." }, - "y": { "type": "integer", "description": "For mouse_move **absolute** mode only: same as x (including **small moves** → prefer relative). Ignored when `mouse_move_direction` is set." }, - "mouse_move_direction": { + "delta_x": { "type": "integer", "description": "For pointer_move_rel only: horizontal delta in screenshot/display pixels (negative=left). On macOS converted via last screenshot scale; screenshot first." }, + "delta_y": { "type": "integer", "description": "For pointer_move_rel only: vertical delta in screenshot/display pixels (negative=up). On macOS converted via last screenshot scale; screenshot first." }, + "keys": { "type": "array", "items": { "type": "string" }, "description": "For key_chord: **prefer this action** for standard shortcuts **and** **system clipboard** (e.g. select all + copy/cut/paste per host — see tool description OS hint). Do not use mouse menus for Copy/Paste when these chords work. OS-specific key names per Environment Information. If the chord includes **return** / **enter** (submit/confirm), **`screenshot` first** and verify — **no blind Enter.** Otherwise screenshot when the next action depends on UI." 
}, + "text": { "type": "string", "description": "For type_text: short or paste-blocked input only — **prefer `key_chord` paste** (and focus/select chords) when inserting longer or duplicated content from the system clipboard. Then screenshot if you need to confirm focus or field content before further steps." }, + "ms": { "type": "integer", "description": "Wait duration in milliseconds" }, + "title_contains": { "type": "string", - "enum": ["up", "down", "left", "right"], - "description": "For mouse_move **relative** mode (**recommended for small nudges**): move the pointer in this cardinal direction by `mouse_move_relative_pixels` (screenshot/display pixels; macOS converts via last capture — screenshot first). **Takes precedence** over `x`/`y` when set." - }, - "mouse_move_relative_pixels": { - "type": "integer", - "description": "For mouse_move with `mouse_move_direction`: distance in **screenshot/display pixels** (default 32, clamped 1..400). Use smaller values (e.g. 8–24) for fine alignment. Same semantics as `pointer_nudge` `pixels`." + "description": "For **`action: locate`** only: case-insensitive substring on accessible title (AXTitle / etc.). Prefer the **same language as the app UI**. Optional if other filters match." }, - "coordinate_mode": { + "role_substring": { "type": "string", - "enum": ["image", "normalized"], - "description": "For mouse_move when use_screen_coordinates is false. \"image\" = pixels on the latest screenshot JPEG; \"normalized\" = 0..=1000 on the latest capture." + "description": "For **`action: locate`** only: case-insensitive substring on AXRole (e.g. \"Button\", \"AXButton\")." }, - "use_screen_coordinates": { - "type": "boolean", - "description": "For mouse_move only: if true, x/y are global display coordinates in the host’s native units (on macOS: **points**, same space as `pointer_x`/`pointer_y` after a move — not necessarily physical Retina pixels)." 
- }, - "button": { "type": "string", "enum": ["left", "right", "middle"], "description": "For click only (default left). **Desktop:** after the last pointer move, `click` requires a fresh **fine** screenshot: **`quadrant_navigation_click_ready`** true **or** point crop (`screenshot_crop_center_*`). Full-screen-only basis is rejected (host)." }, - "direction": { + "identifier_contains": { "type": "string", - "enum": ["up", "down", "left", "right"], - "description": "For pointer_nudge: cardinal move in screen pixels" + "description": "For **`action: locate`** only: case-insensitive substring on AXIdentifier when present." }, - "pixels": { + "max_depth": { "type": "integer", - "description": "For pointer_nudge: distance in **screenshot/display pixels** (same scale as the last JPEG width/height on that display). On macOS this is converted to CG point deltas using the last capture; take screenshot first." + "minimum": 1, + "maximum": 200, + "description": "For **`action: locate`** only: max BFS depth from the frontmost application root (default 48)." + }, + "filter_combine": { + "type": "string", + "enum": ["all", "any"], + "description": "For **`action: locate`** only: **`all`** (default) — every non-empty filter must match the **same** element (AND). **`any`** — match if **any** non-empty filter matches (OR). Use **`any`** when a field has a **role** (e.g. `AXTextField`) but **empty or different AXTitle** than your `title_contains` (common for search boxes). Prefer **one** filter (`role_substring` alone or `title_contains` alone) when unsure." }, - "delta_x": { "type": "integer", "description": "For pointer_move_rel: horizontal delta in screenshot/display pixels (negative=left). On macOS converted via last screenshot scale; screenshot first. For scroll: horizontal scroll amount (host-dependent)." }, - "delta_y": { "type": "integer", "description": "For pointer_move_rel: vertical delta in screenshot/display pixels (negative=up). 
On macOS converted via last screenshot scale; screenshot first. For scroll: vertical scroll amount (host-dependent)." }, - "keys": { "type": "array", "items": { "type": "string" }, "description": "For key_chord: **prefer this action** for standard shortcuts **and** **system clipboard** (e.g. select all + copy/cut/paste per host — see tool description OS hint). Do not use mouse menus for Copy/Paste when these chords work. OS-specific key names per Environment Information. If the chord includes **return** / **enter** (submit/confirm), **`screenshot` first** and verify — **no blind Enter.** Otherwise screenshot when the next action depends on UI." }, - "text": { "type": "string", "description": "For type_text: short or paste-blocked input only — **prefer `key_chord` paste** (and focus/select chords) when inserting longer or duplicated content from the system clipboard. Then screenshot if you need to confirm focus or field content before further steps." }, - "ms": { "type": "integer", "description": "Wait duration in milliseconds" }, "screenshot_crop_center_x": { "type": "integer", "minimum": 0, - "description": "For action `screenshot` only (point crop): X center in **full-capture native** pixels — same as margin tick labels on a prior full-screen shot. Pair with `screenshot_crop_center_y`. Omit **both** when using `screenshot_navigate_quadrant` or plain refresh. **Ignored** if `screenshot_navigate_quadrant` is set. ~500×500 region when both set." + "description": "For action `screenshot` only (point crop): X center in **full-capture native** pixels — same as margin tick labels on a prior full-screen shot. Pair with `screenshot_crop_center_y`. Optional **`screenshot_crop_half_extent_native`** adjusts crop size (default half=250 → ~500×500). Omit **both** centers when using `screenshot_navigate_quadrant` or plain refresh. **Ignored** if `screenshot_navigate_quadrant` is set." 
}, "screenshot_crop_center_y": { "type": "integer", "minimum": 0, "description": "For action `screenshot` only (point crop): Y center in **full-capture native** pixels; pair with `screenshot_crop_center_x`. Omit **both** for quadrant drill or plain refresh. **Ignored** if `screenshot_navigate_quadrant` is set." }, + "screenshot_crop_half_extent_native": { + "type": "integer", + "minimum": 0, + "description": format!( + "For action `screenshot` only, with **`screenshot_crop_center_*`**: half-size of the crop in **native** pixels (total region ≈ `2 × half`). Host clamps to {}..{}. Omit for default **250** (~500×500). After **`action: locate`**, copy **`coordinate_hints.screenshot_point_crop.screenshot_crop_half_extent_native`** when available for tighter crops around small controls.", + COMPUTER_USE_POINT_CROP_HALF_MIN, + COMPUTER_USE_POINT_CROP_HALF_MAX + ) + }, "screenshot_navigate_quadrant": { "type": "string", "enum": ["top_left", "top_right", "bottom_left", "bottom_right"], @@ -603,6 +855,8 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** .ok_or_else(|| BitFunError::tool("action is required".to_string()))?; match action { + "locate" => execute_computer_use_locate(input, context).await, + "screenshot" => { Self::require_multimodal_tool_output_for_screenshot(context)?; let (params, ignored_crop_for_quadrant) = Self::parse_screenshot_params(input)?; @@ -621,6 +875,19 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** nav_debug, ) .await; + let input_coords = json!({ + "kind": "screenshot", + "screenshot_reset_navigation": params.reset_navigation, + "screenshot_crop_ignored_for_quadrant": ignored_crop_for_quadrant, + "screenshot_crop_center": params.crop_center.map(|c| json!({ "x": c.x, "y": c.y })), + "screenshot_crop_half_extent_native": params.point_crop_half_extent_native, + "screenshot_navigate_quadrant": params.navigate_quadrant.map(|q| match q { + ComputerUseNavigateQuadrant::TopLeft => 
"top_left", + ComputerUseNavigateQuadrant::TopRight => "top_right", + ComputerUseNavigateQuadrant::BottomLeft => "bottom_left", + ComputerUseNavigateQuadrant::BottomRight => "bottom_right", + }), + }); let (mut data, attach, mut hint) = Self::pack_screenshot_tool_output(&shot, debug_rel).await?; if let Some(obj) = data.as_object_mut() { @@ -643,46 +910,10 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** ); } } + let data = computer_use_augment_result_json(host_ref, data, Some(input_coords)).await; Ok(vec![ToolResult::ok_with_images(data, Some(hint), vec![attach])]) } - "pointer_nudge" => { - let dir = input - .get("direction") - .and_then(|v| v.as_str()) - .ok_or_else(|| BitFunError::tool("direction is required for pointer_nudge (up|down|left|right)".to_string()))?; - let px = input - .get("pixels") - .and_then(|v| v.as_i64()) - .map(|v| v as i32) - .unwrap_or(32) - .clamp(1, 400); - let (dx, dy) = match dir.to_lowercase().as_str() { - "up" => (0, -px), - "down" => (0, px), - "left" => (-px, 0), - "right" => (px, 0), - _ => { - return Err(BitFunError::tool( - "direction must be up, down, left, or right".to_string(), - )); - } - }; - host_ref.pointer_move_relative(dx, dy).await?; - let body = json!({ - "success": true, - "action": "pointer_nudge", - "direction": dir, - "pixels": px, - "delta_x": dx, - "delta_y": dy, - }); - let summary = format!( - "Nudged pointer by ({}, {}) px (direction {}, {} px).", - dx, dy, dir, px - ); - Ok(vec![ToolResult::ok(body, Some(summary))]) - } "pointer_move_rel" => { let dx = input.get("delta_x").and_then(|v| v.as_i64()).unwrap_or(0) as i32; let dy = input.get("delta_y").and_then(|v| v.as_i64()).unwrap_or(0) as i32; @@ -693,117 +924,28 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** )); } host_ref.pointer_move_relative(dx, dy).await?; - let body = json!({ - "success": true, - "action": "pointer_move_rel", + let input_coords = json!({ + "kind": "pointer_move_rel", 
"delta_x": dx, "delta_y": dy, }); - let summary = format!( - "Moved pointer relatively by ({}, {}) screen pixels.", - dx, dy - ); - Ok(vec![ToolResult::ok(body, Some(summary))]) - } - "mouse_move" => { - let last_ref = host_ref.last_screenshot_refinement(); - let snapshot_basis = match last_ref { - None => serde_json::Value::Null, - Some(ComputerUseScreenshotRefinement::FullDisplay) => json!("full_display"), - Some(ComputerUseScreenshotRefinement::RegionAroundPoint { - center_x, - center_y, - }) => { - json!({ - "region_crop_center_full_display_native": { "x": center_x, "y": center_y } - }) - } - Some(ComputerUseScreenshotRefinement::QuadrantNavigation { - x0, - y0, - width, - height, - click_ready, - }) => { - json!({ - "quadrant_native_rect": { "x0": x0, "y0": y0, "w": width, "h": height }, - "quadrant_navigation_click_ready": click_ready, - }) - } - }; - - if let Some((dx, dy, dir_label, px_used)) = Self::parse_mouse_move_cardinal(input)? { - host_ref.pointer_move_relative(dx, dy).await?; - let body = json!({ + let body = computer_use_augment_result_json( + host_ref, + json!({ "success": true, - "action": "mouse_move", - "positioning": "relative_cardinal", - "mouse_move_direction": dir_label, - "mouse_move_relative_pixels": px_used, + "action": "pointer_move_rel", "delta_x": dx, "delta_y": dy, - "snapshot_coordinate_basis": snapshot_basis, - }); - let summary = format!( - "mouse_move relative: {} by {} px (delta {}, {}).", - dir_label, px_used, dx, dy - ); - return Ok(vec![ToolResult::ok(body, Some(summary))]); - } - - let x = req_i32(input, "x")?; - let y = req_i32(input, "y")?; - let mode = Self::coordinate_mode(input); - let use_screen = Self::use_screen_coordinates(input); - let (sx64, sy64) = Self::resolve_xy_f64(host_ref, input, x, y)?; - host_ref.mouse_move_global_f64(sx64, sy64).await?; - let sx = sx64.round() as i32; - let sy = sy64.round() as i32; - let body = json!({ - "success": true, - "action": "mouse_move", - "positioning": "absolute", - "x": x, - 
"y": y, - "pointer_x": sx, - "pointer_y": sy, - "coordinate_mode": mode, - "use_screen_coordinates": use_screen, - "snapshot_coordinate_basis": snapshot_basis, - }); + }), + Some(input_coords), + ) + .await; let summary = format!( - "Moved pointer to global screen (~{}, ~{}, sub-point on macOS) (input {:?} {}, {}).", - sx, sy, mode, x, y + "Moved pointer relatively by ({}, {}) screen pixels.", + dx, dy ); Ok(vec![ToolResult::ok(body, Some(summary))]) } - "click" => { - let button = input - .get("button") - .and_then(|v| v.as_str()) - .unwrap_or("left"); - host_ref.mouse_click(button).await?; - let body = json!({ - "success": true, - "action": "click", - "button": button, - }); - let summary = format!("{} click at current pointer (does not move).", button); - Ok(vec![ToolResult::ok(body, Some(summary))]) - } - "scroll" => { - let dx = input.get("delta_x").and_then(|v| v.as_i64()).unwrap_or(0) as i32; - let dy = input.get("delta_y").and_then(|v| v.as_i64()).unwrap_or(0) as i32; - if dx == 0 && dy == 0 { - return Err(BitFunError::tool( - "scroll requires non-zero delta_x and/or delta_y".to_string(), - )); - } - host_ref.scroll(dx, dy).await?; - let body = json!({ "success": true, "action": "scroll", "delta_x": dx, "delta_y": dy }); - let summary = format!("Scrolled by ({}, {}).", dx, dy); - Ok(vec![ToolResult::ok(body, Some(summary))]) - } "key_chord" => { let keys: Vec = input .get("keys") @@ -816,7 +958,13 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** return Err(BitFunError::tool("keys must not be empty".to_string())); } host_ref.key_chord(keys.clone()).await?; - let body = json!({ "success": true, "action": "key_chord", "keys": keys }); + let input_coords = json!({ "kind": "key_chord", "keys": keys }); + let body = computer_use_augment_result_json( + host_ref, + json!({ "success": true, "action": "key_chord", "keys": keys }), + Some(input_coords), + ) + .await; let summary = "Key chord sent.".to_string(); 
Ok(vec![ToolResult::ok(body, Some(summary))]) } @@ -826,7 +974,13 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** .and_then(|v| v.as_str()) .ok_or_else(|| BitFunError::tool("text is required".to_string()))?; host_ref.type_text(text).await?; - let body = json!({ "success": true, "action": "type_text", "chars": text.chars().count() }); + let input_coords = json!({ "kind": "type_text", "char_count": text.chars().count() }); + let body = computer_use_augment_result_json( + host_ref, + json!({ "success": true, "action": "type_text", "chars": text.chars().count() }), + Some(input_coords), + ) + .await; let summary = format!("Typed {} character(s) into the focused target.", text.chars().count()); Ok(vec![ToolResult::ok(body, Some(summary))]) } @@ -836,8 +990,14 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** .and_then(|v| v.as_u64()) .ok_or_else(|| BitFunError::tool("ms is required".to_string()))?; host_ref.wait_ms(ms).await?; - Ok(vec![ToolResult::ok( + let body = computer_use_augment_result_json( + host_ref, json!({ "success": true, "action": "wait", "ms": ms }), + None, + ) + .await; + Ok(vec![ToolResult::ok( + body, Some(format!("Waited {} ms.", ms)), )]) } diff --git a/src/crates/core/src/agentic/tools/implementations/mod.rs b/src/crates/core/src/agentic/tools/implementations/mod.rs index cdb6c7df..9b9e4199 100644 --- a/src/crates/core/src/agentic/tools/implementations/mod.rs +++ b/src/crates/core/src/agentic/tools/implementations/mod.rs @@ -4,6 +4,10 @@ pub mod ask_user_question_tool; pub mod bash_tool; pub mod code_review_tool; pub mod computer_use_tool; +pub mod computer_use_mouse_precise_tool; +pub mod computer_use_mouse_step_tool; +pub mod computer_use_mouse_click_tool; +pub mod computer_use_locate; pub mod cron_tool; pub mod create_plan_tool; pub mod delete_file_tool; @@ -33,6 +37,9 @@ pub use ask_user_question_tool::AskUserQuestionTool; pub use bash_tool::BashTool; pub use 
code_review_tool::CodeReviewTool; pub use computer_use_tool::ComputerUseTool; +pub use computer_use_mouse_precise_tool::ComputerUseMousePreciseTool; +pub use computer_use_mouse_step_tool::ComputerUseMouseStepTool; +pub use computer_use_mouse_click_tool::ComputerUseMouseClickTool; pub use cron_tool::CronTool; pub use create_plan_tool::CreatePlanTool; pub use delete_file_tool::DeleteFileTool; diff --git a/src/crates/core/src/agentic/tools/registry.rs b/src/crates/core/src/agentic/tools/registry.rs index 33488e9c..85262c62 100644 --- a/src/crates/core/src/agentic/tools/registry.rs +++ b/src/crates/core/src/agentic/tools/registry.rs @@ -135,6 +135,9 @@ impl ToolRegistry { self.register_tool(Arc::new(InitMiniAppTool::new())); self.register_tool(Arc::new(ComputerUseTool::new())); + self.register_tool(Arc::new(ComputerUseMousePreciseTool::new())); + self.register_tool(Arc::new(ComputerUseMouseStepTool::new())); + self.register_tool(Arc::new(ComputerUseMouseClickTool::new())); } /// Register a single tool diff --git a/src/crates/webdriver/src/platform/capture.rs b/src/crates/webdriver/src/platform/capture.rs index 0433651b..7b57d7da 100644 --- a/src/crates/webdriver/src/platform/capture.rs +++ b/src/crates/webdriver/src/platform/capture.rs @@ -1,5 +1,3 @@ -use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; -use base64::Engine as _; use tauri::{Runtime, Webview}; use super::types::PrintOptions; @@ -26,6 +24,8 @@ mod imp { use std::time::Duration; use super::*; + use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; + use base64::Engine as _; use block2::RcBlock; use objc2::runtime::AnyObject; use objc2::MainThreadMarker; @@ -198,6 +198,8 @@ mod imp { use std::time::Duration; use super::*; + use base64::engine::general_purpose::STANDARD as BASE64_STANDARD; + use base64::Engine as _; use tokio::sync::oneshot; use webview2_com::Microsoft::Web::WebView2::Win32::{ ICoreWebView2CapturePreviewCompletedHandler, diff --git 
a/src/web-ui/src/app/components/SceneBar/SceneTab.tsx b/src/web-ui/src/app/components/SceneBar/SceneTab.tsx index 0988dd8f..72d82584 100644 --- a/src/web-ui/src/app/components/SceneBar/SceneTab.tsx +++ b/src/web-ui/src/app/components/SceneBar/SceneTab.tsx @@ -59,6 +59,25 @@ const SceneTab: React.FC = ({ } }, [onActivate, tab.id]); + /** Middle-click closes (browser-tab style); skip on pinned tabs and the inline + action. */ + const handleMouseDown = useCallback((e: React.MouseEvent) => { + if (e.button !== 1) return; + if (pinned) return; + const target = e.target as HTMLElement; + if (target.closest('.bitfun-scene-tab__action')) return; + e.preventDefault(); + }, [pinned]); + + const handleAuxClick = useCallback((e: React.MouseEvent) => { + if (e.button !== 1) return; + if (pinned) return; + const target = e.target as HTMLElement; + if (target.closest('.bitfun-scene-tab__action')) return; + e.preventDefault(); + e.stopPropagation(); + onClose(tab.id); + }, [pinned, onClose, tab.id]); + return (
= ({ pinned && 'bitfun-scene-tab--pinned', ].filter(Boolean).join(' ')} onClick={handleClick} + onMouseDown={handleMouseDown} + onAuxClick={handleAuxClick} onKeyDown={handleKeyDown} > {/* Centered content group */} diff --git a/src/web-ui/src/flow_chat/components/modern/ModernFlowChatContainer.tsx b/src/web-ui/src/flow_chat/components/modern/ModernFlowChatContainer.tsx index 3031c27a..68de5084 100644 --- a/src/web-ui/src/flow_chat/components/modern/ModernFlowChatContainer.tsx +++ b/src/web-ui/src/flow_chat/components/modern/ModernFlowChatContainer.tsx @@ -47,7 +47,7 @@ export const ModernFlowChatContainer: React.FC = ( const autoPinnedSessionIdRef = useRef(null); const virtualListRef = useRef(null); const { workspacePath } = useWorkspaceContext(); - const { isBtwSession, btwOrigin, btwParentTitle } = useFlowChatSessionRelationship(activeSession); + const { btwOrigin, btwParentTitle } = useFlowChatSessionRelationship(activeSession); const { exploreGroupStates, onExploreGroupToggle: handleExploreGroupToggle, diff --git a/src/web-ui/src/infrastructure/config/components/SessionConfig.tsx b/src/web-ui/src/infrastructure/config/components/SessionConfig.tsx index 0fe9db22..1e1a6cd1 100644 --- a/src/web-ui/src/infrastructure/config/components/SessionConfig.tsx +++ b/src/web-ui/src/infrastructure/config/components/SessionConfig.tsx @@ -520,7 +520,12 @@ const SessionConfig: React.FC = () => { {computerUseNote} ) : null} - +
{
- +