diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5ca9eb8c..e2cc541b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,7 +70,11 @@ jobs: "$WEBKIT_PKG" \ "$APPINDICATOR_PKG" \ librsvg2-dev \ - patchelf + patchelf \ + libleptonica-dev \ + libtesseract-dev \ + tesseract-ocr \ + tesseract-ocr-eng - uses: dtolnay/rust-toolchain@stable diff --git a/.github/workflows/desktop-package.yml b/.github/workflows/desktop-package.yml index c6a159b5..ec071b8b 100644 --- a/.github/workflows/desktop-package.yml +++ b/.github/workflows/desktop-package.yml @@ -147,7 +147,11 @@ jobs: librsvg2-dev \ patchelf \ fakeroot \ - rpm + rpm \ + libleptonica-dev \ + libtesseract-dev \ + tesseract-ocr \ + tesseract-ocr-eng - name: Setup pnpm uses: pnpm/action-setup@v4 diff --git a/src/apps/desktop/Cargo.toml b/src/apps/desktop/Cargo.toml index 06c93088..5d17c9ef 100644 --- a/src/apps/desktop/Cargo.toml +++ b/src/apps/desktop/Cargo.toml @@ -59,10 +59,19 @@ fontdue = "0.9" core-foundation = "0.9" core-graphics = "0.23" dispatch = "0.2" +objc2 = "0.6" +objc2-foundation = "0.3" +objc2-app-kit = "0.3" +objc2-vision = { version = "0.3.2", features = ["VNRecognizeTextRequest", "VNRequest", "VNObservation", "VNRequestHandler", "VNUtils", "VNTypes", "objc2-core-foundation"] } [target.'cfg(windows)'.dependencies] win32job = { workspace = true } windows = { version = "0.61.3", features = [ + "Foundation", + "Globalization", + "Graphics_Imaging", + "Media_Ocr", + "Storage_Streams", "Win32_Foundation", "Win32_System_Com", "Win32_UI_Accessibility", @@ -72,3 +81,4 @@ windows-core = "0.61.2" [target.'cfg(target_os = "linux")'.dependencies] atspi = "0.29" +leptess = "0.14.0" diff --git a/src/apps/desktop/src/api/config_api.rs b/src/apps/desktop/src/api/config_api.rs index a756ce22..1ab92667 100644 --- a/src/apps/desktop/src/api/config_api.rs +++ b/src/apps/desktop/src/api/config_api.rs @@ -272,27 +272,15 @@ pub async fn get_mode_configs(state: State<'_, AppState>) -> 
Result, } @@ -131,12 +150,7 @@ fn draw_pointer_fallback_cross(img: &mut RgbImage, cx: i32, cy: i32) { } } -// ── Computer-use coordinate grid (100 px step): lines + anti-aliased axis labels (Inter OFL) ── - -const COORD_GRID_DEFAULT_STEP: u32 = 100; -const COORD_GRID_MAJOR_STEP: u32 = 500; -/// Logical scale knob; mapped to TTF pixel size for `fontdue` (`scale * 3.5`). -const COORD_LABEL_SCALE: i32 = 11; +// ── SoM / overlay text: Inter (OFL) via fontdue ── /// Inter (OFL); variable font from google/fonts OFL tree. const COORD_AXIS_FONT_TTF: &[u8] = include_bytes!("../../assets/fonts/Inter-Regular.ttf"); @@ -146,15 +160,10 @@ static COORD_AXIS_FONT: OnceLock = OnceLock::new(); fn coord_axis_font() -> &'static Font { COORD_AXIS_FONT.get_or_init(|| { Font::from_bytes(COORD_AXIS_FONT_TTF, FontSettings::default()) - .expect("Inter TTF embedded for computer-use axis labels") + .expect("Inter TTF embedded for computer-use SoM/overlay labels") }) } -#[inline] -fn coord_label_px() -> f32 { - COORD_LABEL_SCALE as f32 * 3.5 -} - /// Alpha-blend grayscale coverage onto `img` (baseline-anchored glyph). fn coord_blit_glyph( img: &mut RgbImage, @@ -228,243 +237,175 @@ fn coord_draw_text_h(img: &mut RgbImage, mut baseline_x: i32, baseline_y: i32, t } } -/// Vertically center a horizontal digit string on tick `py`. -fn coord_draw_u32_h_centered(img: &mut RgbImage, lx: i32, py: i32, n: u32, fg: Rgb, px: f32) { - let s = n.to_string(); +// ── Set-of-Mark (SoM) label rendering ── + +/// Badge font size for SoM labels (smaller than axis labels). +const SOM_LABEL_PX: f32 = 28.0; +/// Badge background color (bright magenta -- high contrast on most UIs). +const SOM_BG: Rgb = Rgb([230, 40, 120]); +/// Badge text color. +const SOM_FG: Rgb = Rgb([255, 255, 255]); +/// Padding around the label text inside the badge. +const SOM_PAD_X: i32 = 4; +const SOM_PAD_Y: i32 = 2; + +/// Draw SoM numbered labels on the frame at each element's mapped image position. 
+/// `elements`: SoM elements with global coordinates. +/// `margin_l`, `margin_t`: content area offset in the frame. +/// `map_fn`: maps global (f64,f64) -> Option<(i32,i32)> in content-area pixel space. +fn draw_som_labels( + frame: &mut RgbImage, + elements: &[SomElement], + margin_l: u32, + margin_t: u32, + map_fn: F, +) where + F: Fn(f64, f64) -> Option<(i32, i32)>, +{ let font = coord_axis_font(); - let (m_rep, _) = font.rasterize('8', px); - let text_h = m_rep.height as i32; - let baseline_y = py - (m_rep.ymin + text_h / 2); - coord_draw_text_h(img, lx, baseline_y, &s, fg, px); -} - -#[inline] -fn coord_plot(img: &mut RgbImage, x: i32, y: i32, c: Rgb) { - let w = img.width() as i32; - let h = img.height() as i32; - if x >= 0 && x < w && y >= 0 && y < h { - img.put_pixel(x as u32, y as u32, c); - } -} - -fn coord_digit_block_width(digit_count: usize, px: f32) -> i32 { - if digit_count == 0 { - return 0; - } - let s: String = std::iter::repeat('8').take(digit_count).collect(); - coord_measure_str_width(&s, px) -} + let (fw, fh) = frame.dimensions(); -/// Height of a vertical digit stack (top-to-bottom) for `nd` decimal digits. -fn coord_vertical_digit_stack_height(nd: usize, px: f32) -> i32 { - if nd == 0 { - return 0; - } - let font = coord_axis_font(); - let gap = (px * 0.22).ceil().max(1.0) as i32; - let mut tot = 0i32; - for _ in 0..nd { - let (m, _) = font.rasterize('8', px); - tot += m.height as i32 + gap; - } - tot - gap -} + for elem in elements { + let Some((cx, cy)) = map_fn(elem.global_center_x, elem.global_center_y) else { + continue; + }; -/// Draw decimal `n` with digits stacked **top-to-bottom** (high-order digit at top). -/// Column is centered on `center_x` (tick position); narrow horizontal footprint for dense x-axis ticks. 
-fn coord_draw_u32_vertical_stack( - img: &mut RgbImage, - center_x: i32, - top_y: i32, - n: u32, - fg: Rgb, - px: f32, -) { - let s = n.to_string(); - let font = coord_axis_font(); - let gap = (px * 0.22).ceil().max(1.0) as i32; - let mut ty = top_y; - for c in s.chars() { - let (m, bmp) = font.rasterize(c, px); - let top_left_x = center_x - m.width as i32 / 2; - let top_left_y = ty; - let baseline_x = top_left_x - m.xmin as i32; - let baseline_y = top_left_y - m.ymin as i32; - coord_blit_glyph_bold(img, baseline_x, baseline_y, &m, &bmp, fg); - ty += m.height as i32 + gap; - } -} + // Map from content-area space to frame space + let img_x = cx + margin_l as i32; + let img_y = cy + margin_t as i32; + + // Measure label text width + let label_text = elem.label.to_string(); + let text_w = coord_measure_str_width(&label_text, SOM_LABEL_PX); + let (m_rep, _) = font.rasterize('8', SOM_LABEL_PX); + let text_h = m_rep.height as i32; + + let badge_w = text_w + SOM_PAD_X * 2 + 2; // +2 for bold offset + let badge_h = text_h + SOM_PAD_Y * 2; + + // Position badge at top-left of element's bounds (mapped to image), + // but fall back to center if bounds mapping fails + let (badge_x, badge_y) = { + let bx = elem.bounds_left; + let by = elem.bounds_top; + if let Some((bix, biy)) = map_fn(bx, by) { + (bix + margin_l as i32, biy + margin_t as i32) + } else { + // Fall back to center + (img_x - badge_w / 2, img_y - badge_h / 2) + } + }; -fn content_grid_step(min_side: u32) -> u32 { - if min_side < 240 { - 25u32 - } else if min_side < 480 { - 50u32 - } else { - COORD_GRID_DEFAULT_STEP - } -} + // Clamp to frame bounds + let bx0 = badge_x.max(0).min(fw as i32 - badge_w); + let by0 = badge_y.max(0).min(fh as i32 - badge_h); + + // Draw badge background rectangle + for dy in 0..badge_h { + for dx in 0..badge_w { + let px = bx0 + dx; + let py = by0 + dy; + if px >= 0 && px < fw as i32 && py >= 0 && py < fh as i32 { + frame.put_pixel(px as u32, py as u32, SOM_BG); + } + } + } -/// 
Symmetric white margins (left = right, top = bottom) for ruler labels outside the capture. -/// `ruler_origin_*` is the **full-capture native** pixel index of the content’s top-left (0,0 for full screen; crop `x0,y0` for point crops) so label digit width fits large coordinates. -fn computer_use_margins( - cw: u32, - ch: u32, - ruler_origin_x: u32, - ruler_origin_y: u32, -) -> (u32, u32) { - if cw < 2 || ch < 2 { - return (0, 0); + // Draw label text centered in badge + let text_x = bx0 + SOM_PAD_X; + let baseline_y = by0 + SOM_PAD_Y + text_h - (m_rep.ymin.max(0) as i32); + coord_draw_text_h(frame, text_x, baseline_y, &label_text, SOM_FG, SOM_LABEL_PX); } - let px = coord_label_px(); - let tick_len = 14i32; - let pad = 12i32; - let max_val_x = ruler_origin_x.saturating_add(cw.saturating_sub(1)); - let max_val_y = ruler_origin_y.saturating_add(ch.saturating_sub(1)); - let nd_x = (max_val_x.max(1).ilog10() as usize + 1).max(4); - let nd_y = (max_val_y.max(1).ilog10() as usize + 1).max(4); - let nd = nd_x.max(nd_y); - let ml = (coord_digit_block_width(nd, px) + tick_len + pad).max(0) as u32; - // Top/bottom: x-axis labels are vertical stacks — need height for `nd_x` digits. - let x_stack_h = coord_vertical_digit_stack_height(nd_x, px); - let mt = (x_stack_h + tick_len + pad).max(0) as u32; - (ml, mt) } -/// White border, grid lines on the capture only, numeric labels in the margin. -/// `ruler_origin_x/y`: **full-capture native** index of content pixel (0,0) — for a point crop, pass the crop’s `x0,y0` so tick labels match the same **whole-screen bitmap** space as a full-screen shot (not 0..crop_width only). +/// Returns the capture bitmap unchanged (no grid, rulers, or margins). Pointer and SoM overlays are applied later. 
fn compose_computer_use_frame( content: RgbImage, - ruler_origin_x: u32, - ruler_origin_y: u32, + _ruler_origin_x: u32, + _ruler_origin_y: u32, ) -> (RgbImage, u32, u32) { - let cw = content.width(); - let ch = content.height(); - if cw < 2 || ch < 2 { - return (content, 0, 0); - } - let grid_step = content_grid_step(cw.min(ch)); - let (ml, mt) = computer_use_margins(cw, ch, ruler_origin_x, ruler_origin_y); - let mr = ml; - let mb = mt; - let tw = ml + cw + mr; - let th = mt + ch + mb; - let label_px = coord_label_px(); - let tick_len = 14i32; - let pad = 12i32; - - let mut out = RgbImage::new(tw, th); - for p in out.pixels_mut() { - *p = Rgb([255u8, 255, 255]); - } - for yy in 0..ch { - for xx in 0..cw { - out.put_pixel(ml + xx, mt + yy, *content.get_pixel(xx, yy)); - } - } - - let grid = Rgb([52, 52, 68]); - let grid_major = Rgb([95, 95, 118]); - let tick = Rgb([180, 130, 40]); - // Coordinate numerals in white margins — saturated red for visibility. - let label = Rgb([200, 32, 40]); - - let cl = ml as i32; - let ct = mt as i32; - let cr = (ml + cw - 1) as i32; - let cb = (mt + ch - 1) as i32; - let wi = tw as i32; - let hi = th as i32; - - let mut gx = grid_step as i32; - while gx < cw as i32 { - let major = (gx as u32) % COORD_GRID_MAJOR_STEP == 0; - let thick = if major { 2 } else { 1 }; - let c = if major { grid_major } else { grid }; - for t in 0..thick { - let px = cl + gx + t; - if px >= cl && px <= cr { - for py in ct..=cb { - coord_plot(&mut out, px, py, c); - } - } - } - gx += grid_step as i32; - } + (content, 0, 0) +} - let mut gy = grid_step as i32; - while gy < ch as i32 { - let major = (gy as u32) % COORD_GRID_MAJOR_STEP == 0; - let thick = if major { 2 } else { 1 }; - let c = if major { grid_major } else { grid }; - for t in 0..thick { - let py = ct + gy + t; - if py >= ct && py <= cb { - for px in cl..=cr { - coord_plot(&mut out, px, py, c); - } - } - } - gy += grid_step as i32; +fn implicit_confirmation_should_apply(click_needs: bool, params: 
&ComputerUseScreenshotParams) -> bool { + // Applies on **every** bare `screenshot` while confirmation is required — including the + // first capture in a session (`last_shot_refinement` may still be `None`), so click/Enter + // guards get a ~500×500 around the mouse (or `text_caret` when requested) instead of full screen. + // + // **Always** apply when `click_needs` (even during quadrant/point-crop drill): previously we + // skipped implicit crop while `navigation_focus` was Quadrant/PointCrop, which produced large + // confirmation JPEGs; confirmation shots must stay ~500×500 around the pointer/caret. + if !click_needs { + return false; + } + if params.crop_center.is_some() + || params.navigate_quadrant.is_some() + || params.reset_navigation + { + return false; } + true +} - let top_label_y = pad.max(2); - for gxc in (0..cw as i32).step_by(grid_step as usize) { - let tick_x = cl + gxc; - for k in 0..tick_len.min(ct.max(1)) { - coord_plot(&mut out, tick_x, ct - 1 - k, tick); +fn global_to_native_full_pixel_center( + gx: f64, + gy: f64, + native_w: u32, + native_h: u32, + d: &DisplayInfo, +) -> (u32, u32) { + #[cfg(target_os = "macos")] + { + let geo = MacPointerGeo::from_display(native_w, native_h, d); + let lx = gx - geo.disp_ox; + let ly = gy - geo.disp_oy; + if lx < 0.0 || lx >= geo.disp_w || ly < 0.0 || ly >= geo.disp_h { + return clamp_center_to_native(native_w / 2, native_h / 2, native_w, native_h); } - let val = ruler_origin_x.saturating_add(gxc.max(0) as u32); - let col_w = coord_measure_str_width("8", label_px).max(1); - let cx = tick_x.clamp(col_w / 2 + 2, wi - col_w / 2 - 2); - coord_draw_u32_vertical_stack(&mut out, cx, top_label_y, val, label, label_px); + let full_ix = ((lx / geo.disp_w) * geo.full_px_w as f64).floor() as u32; + let full_iy = ((ly / geo.disp_h) * geo.full_px_h as f64).floor() as u32; + clamp_center_to_native(full_ix, full_iy, native_w, native_h) } - - let bot_label_y = cb + tick_len + 4; - for gxc in (0..cw as i32).step_by(grid_step 
as usize) { - let tick_x = cl + gxc; - for k in 0..tick_len { - let y = cb + 1 + k; - if y < hi { - coord_plot(&mut out, tick_x, y, tick); - } + #[cfg(not(target_os = "macos"))] + { + let disp_w = d.width as f64; + let disp_h = d.height as f64; + if disp_w <= 0.0 || disp_h <= 0.0 || native_w == 0 || native_h == 0 { + return (0, 0); } - let val = ruler_origin_x.saturating_add(gxc.max(0) as u32); - let col_w = coord_measure_str_width("8", label_px).max(1); - let cx = tick_x.clamp(col_w / 2 + 2, wi - col_w / 2 - 2); - coord_draw_u32_vertical_stack(&mut out, cx, bot_label_y, val, label, label_px); - } - - let left_numbers_x = pad.max(2); - for gyc in (0..ch as i32).step_by(grid_step as usize) { - let py = ct + gyc; - for k in 0..tick_len.min(cl.max(1)) { - coord_plot(&mut out, cl - 1 - k, py, tick); + let lx = gx - d.x as f64; + let ly = gy - d.y as f64; + if lx < 0.0 || lx >= disp_w || ly < 0.0 || ly >= disp_h { + return clamp_center_to_native(native_w / 2, native_h / 2, native_w, native_h); } - let val = ruler_origin_y.saturating_add(gyc.max(0) as u32); - let s = val.to_string(); - let dw = coord_measure_str_width(&s, label_px); - let lx = left_numbers_x.min(cl - dw - 2).max(2); - coord_draw_u32_h_centered(&mut out, lx, py, val, label, label_px); + let full_ix = ((lx / disp_w) * native_w as f64).floor() as u32; + let full_iy = ((ly / disp_h) * native_h as f64).floor() as u32; + clamp_center_to_native(full_ix, full_iy, native_w, native_h) } +} - let right_text_x = cr + tick_len + 4; - for gyc in (0..ch as i32).step_by(grid_step as usize) { - let py = ct + gyc; - for k in 0..tick_len { - let x = cr + 1 + k; - if x < wi { - coord_plot(&mut out, x, py, tick); - } +#[cfg(target_os = "macos")] +fn implicit_global_center_for_confirmation( + center: ComputerUseImplicitScreenshotCenter, + mx: f64, + my: f64, +) -> (f64, f64) { + match center { + ComputerUseImplicitScreenshotCenter::Mouse => (mx, my), + ComputerUseImplicitScreenshotCenter::TextCaret => { + 
crate::computer_use::macos_ax_ui::global_point_for_text_caret_screenshot(mx, my) } - let val = ruler_origin_y.saturating_add(gyc.max(0) as u32); - let s = val.to_string(); - let dw = coord_measure_str_width(&s, label_px); - let lx = right_text_x.min(wi - dw - 2).max(2); - coord_draw_u32_h_centered(&mut out, lx, py, val, label, label_px); } +} - (out, ml, mt) +#[cfg(not(target_os = "macos"))] +fn implicit_global_center_for_confirmation( + center: ComputerUseImplicitScreenshotCenter, + mx: f64, + my: f64, +) -> (f64, f64) { + let _ = center; + (mx, my) } /// JPEG quality for computer-use screenshots. Native display resolution is preserved (no downscale) @@ -686,13 +627,13 @@ impl MacPointerGeo { #[derive(Clone, Copy, Debug)] struct PointerMap { - /// Composed JPEG size (includes white margin). + /// Screenshot JPEG width/height (same as capture when there is no frame padding). image_w: u32, image_h: u32, - /// Top-left of capture inside the JPEG. + /// Top-left of capture inside the JPEG (0 when there is no padding). content_origin_x: u32, content_origin_y: u32, - /// Native capture pixel size (the screen bitmap, no margin). + /// Native capture pixel size (the cropped/visible bitmap). content_w: u32, content_h: u32, native_w: u32, @@ -742,7 +683,7 @@ impl PointerMap { Ok((center_full_x, center_full_y)) } - /// Normalized 0..=1000 maps to the **capture** (same as pre-margin bitmap; independent of ruler padding). + /// Normalized 0..=1000 maps to the **capture** bitmap. fn map_normalized_to_global_f64(&self, x: i32, y: i32) -> BitFunResult<(f64, f64)> { if self.native_w == 0 || self.native_h == 0 { return Err(BitFunError::tool( @@ -781,15 +722,67 @@ enum ComputerUseNavFocus { }, } -pub struct DesktopComputerUseHost { - last_pointer_map: Mutex>, +/// Unified mutable session state for computer use — one mutex instead of five. +/// State transitions are applied centrally after each action (screenshot, pointer move, click, etc.). 
+#[derive(Debug)] +struct ComputerUseSessionMutableState { + pointer_map: Option, /// When true, a fresh `screenshot_display` is required before `click` and before `key_chord` that sends Return/Enter /// (set after pointer moves / click; cleared after screenshot). - click_needs_fresh_screenshot: Mutex, + click_needs_fresh_screenshot: bool, /// Last `screenshot_display` scope (full screen vs point crop) for tool hints and click rules. - last_shot_refinement: Mutex>, + last_shot_refinement: Option, /// Drill / crop context for the next `screenshot` (see [`ComputerUseNavFocus`]). - navigation_focus: Mutex>, + navigation_focus: Option, + /// Cached full-screen screenshot for fast consecutive crops. + screenshot_cache: Option, + /// After `screenshot`, block `pointer_move_rel` / `ComputerUseMouseStep` until an absolute move + /// from AX/OCR/globals (`mouse_move`, `move_to_text`, `click_element`, `click_label`) clears this. + block_vision_pixel_nudge_after_screenshot: bool, + /// Action optimizer for loop detection, history, and visual verification. + optimizer: ComputerUseOptimizer, +} + +impl ComputerUseSessionMutableState { + fn new() -> Self { + Self { + pointer_map: None, + click_needs_fresh_screenshot: true, + last_shot_refinement: None, + navigation_focus: None, + screenshot_cache: None, + block_vision_pixel_nudge_after_screenshot: false, + optimizer: ComputerUseOptimizer::new(), + } + } + + /// Called after a successful screenshot capture. + fn transition_after_screenshot( + &mut self, + map: PointerMap, + refinement: ComputerUseScreenshotRefinement, + nav_focus: Option, + ) { + self.pointer_map = Some(map); + self.last_shot_refinement = Some(refinement); + self.navigation_focus = nav_focus; + self.click_needs_fresh_screenshot = false; + self.block_vision_pixel_nudge_after_screenshot = true; + } + + /// Called after pointer mutation (move, step, relative), click, scroll, key_chord, or type_text. 
+ fn transition_after_pointer_mutation(&mut self) { + self.click_needs_fresh_screenshot = true; + } + + /// Called after click (same effect as pointer mutation for freshness). + fn transition_after_click(&mut self) { + self.click_needs_fresh_screenshot = true; + } +} + +pub struct DesktopComputerUseHost { + state: Mutex, } impl std::fmt::Debug for DesktopComputerUseHost { @@ -801,10 +794,13 @@ impl std::fmt::Debug for DesktopComputerUseHost { impl DesktopComputerUseHost { pub fn new() -> Self { Self { - last_pointer_map: Mutex::new(None), - click_needs_fresh_screenshot: Mutex::new(true), - last_shot_refinement: Mutex::new(None), - navigation_focus: Mutex::new(None), + state: Mutex::new(ComputerUseSessionMutableState::new()), + } + } + + fn clear_vision_pixel_nudge_block(&self) { + if let Ok(mut s) = self.state.lock() { + s.block_vision_pixel_nudge_after_screenshot = false; } } @@ -1050,8 +1046,21 @@ end tell"#]) "right" => Key::RightArrow, "home" => Key::Home, "end" => Key::End, - "pageup" => Key::PageUp, - "pagedown" => Key::PageDown, + "pageup" | "page_up" => Key::PageUp, + "pagedown" | "page_down" => Key::PageDown, + "capslock" | "caps_lock" => Key::CapsLock, + "f1" => Key::F1, + "f2" => Key::F2, + "f3" => Key::F3, + "f4" => Key::F4, + "f5" => Key::F5, + "f6" => Key::F6, + "f7" => Key::F7, + "f8" => Key::F8, + "f9" => Key::F9, + "f10" => Key::F10, + "f11" => Key::F11, + "f12" => Key::F12, s if s.len() == 1 => { let c = s.chars().next().unwrap(); Key::Unicode(c) @@ -1074,6 +1083,147 @@ end tell"#]) Ok(buf) } + /// JPEG for OCR only: **no** pointer/SoM overlay — raw capture pixels. + const OCR_RAW_JPEG_QUALITY: u8 = 75; + + /// Build [`ComputerScreenshot`] from a raw RGB crop; image pixels map 1:1 to `native_*` at `display_origin_*`. 
+ fn raw_shot_from_rgb_crop( + rgb: RgbImage, + display_origin_x: i32, + display_origin_y: i32, + native_w: u32, + native_h: u32, + ) -> BitFunResult { + let jpeg_bytes = Self::encode_jpeg(&rgb, Self::OCR_RAW_JPEG_QUALITY)?; + let iw = rgb.width(); + let ih = rgb.height(); + Ok(ComputerScreenshot { + bytes: jpeg_bytes, + mime_type: "image/jpeg".to_string(), + image_width: iw, + image_height: ih, + native_width: native_w, + native_height: native_h, + display_origin_x, + display_origin_y, + vision_scale: 1.0_f64, + pointer_image_x: None, + pointer_image_y: None, + screenshot_crop_center: None, + point_crop_half_extent_native: None, + navigation_native_rect: None, + quadrant_navigation_click_ready: false, + image_content_rect: Some(ComputerUseImageContentRect { + left: 0, + top: 0, + width: iw, + height: ih, + }), + som_labels: vec![], + implicit_confirmation_crop_applied: false, + }) + } + + /// Full primary-display region in **global logical coordinates** (same as `CGDisplayBounds` / AX). + fn ocr_full_primary_display_region() -> BitFunResult { + let screen = Screen::from_point(0, 0) + .map_err(|e| BitFunError::tool(format!("Screen capture init (OCR raw): {}", e)))?; + let d = screen.display_info; + Ok(OcrRegionNative { + x0: d.x, + y0: d.y, + width: d.width, + height: d.height, + }) + } + + /// Region to OCR: explicit `ocr_region_native`, else (macOS) frontmost window from AX, else full primary display. 
+ fn ocr_resolve_region_for_capture(region_native: Option) -> BitFunResult { + if let Some(r) = region_native { + return Ok(r); + } + #[cfg(target_os = "macos")] + { + match crate::computer_use::macos_ax_ui::frontmost_window_bounds_global() { + Ok((x0, y0, w, h)) => Ok(OcrRegionNative { x0, y0, width: w, height: h }), + Err(e) => { + warn!( + "computer_use OCR: frontmost window bounds failed ({}); falling back to full primary display.", + e + ); + Self::ocr_full_primary_display_region() + } + } + } + #[cfg(not(target_os = "macos"))] + { + Self::ocr_full_primary_display_region() + } + } + + /// Capture **raw** display pixels (no pointer/SoM overlay), cropped to `region` intersected with the chosen display. + /// + /// `region` and [`DisplayInfo::width`]/[`height`] are **global logical points** (CG / AX). The framebuffer + /// is **physical pixels** on Retina; intersect in point space, then map to pixels like [`MacPointerGeo`]. + fn screenshot_raw_native_region(region: OcrRegionNative) -> BitFunResult { + let cx = region.x0 + region.width as i32 / 2; + let cy = region.y0 + region.height as i32 / 2; + let screen = Screen::from_point(cx, cy) + .or_else(|_| Screen::from_point(0, 0)) + .map_err(|e| BitFunError::tool(format!("Screen capture init (OCR raw): {}", e)))?; + let rgba = screen.capture().map_err(|e| { + BitFunError::tool(format!("Screenshot failed (OCR raw): {}", e)) + })?; + let (full_px_w, full_px_h) = rgba.dimensions(); + let d = screen.display_info; + let disp_w = d.width as f64; + let disp_h = d.height as f64; + if disp_w <= 0.0 || disp_h <= 0.0 || full_px_w == 0 || full_px_h == 0 { + return Err(BitFunError::tool( + "Invalid display geometry for OCR raw crop.".to_string(), + )); + } + let ox = d.x as f64; + let oy = d.y as f64; + let full_rgb = DynamicImage::ImageRgba8(rgba).to_rgb8(); + // Region from AX / user: global logical coords (points). 
+ let rx0 = region.x0 as f64; + let ry0 = region.y0 as f64; + let rw = region.width as f64; + let rh = region.height as f64; + let ix0 = rx0.max(ox); + let iy0 = ry0.max(oy); + let ix1 = (rx0 + rw).min(ox + disp_w); + let iy1 = (ry0 + rh).min(oy + disp_h); + if ix1 <= ix0 || iy1 <= iy0 { + return Err(BitFunError::tool( + "OCR region does not intersect the captured display. Focus the target app or set ocr_region_native." + .to_string(), + )); + } + let px0_f = ((ix0 - ox) / disp_w) * full_px_w as f64; + let py0_f = ((iy0 - oy) / disp_h) * full_px_h as f64; + let px1_f = ((ix1 - ox) / disp_w) * full_px_w as f64; + let py1_f = ((iy1 - oy) / disp_h) * full_px_h as f64; + let px0 = px0_f.floor().max(0.0) as u32; + let py0 = py0_f.floor().max(0.0) as u32; + let px1 = px1_f.ceil().min(full_px_w as f64) as u32; + let py1 = py1_f.ceil().min(full_px_h as f64) as u32; + if px1 <= px0 || py1 <= py0 { + return Err(BitFunError::tool( + "OCR crop rectangle is empty after point-to-pixel mapping.".to_string(), + )); + } + let crop_w = px1 - px0; + let crop_h = py1 - py0; + let cropped = Self::crop_rgb(&full_rgb, px0, py0, crop_w, crop_h)?; + let span_w = ((crop_w as f64 / full_px_w as f64) * disp_w).round().max(1.0) as u32; + let span_h = ((crop_h as f64 / full_px_h as f64) * disp_h).round().max(1.0) as u32; + let origin_gx = (ox + (px0 as f64 / full_px_w as f64) * disp_w).round() as i32; + let origin_gy = (oy + (py0 as f64 / full_px_h as f64) * disp_h).round() as i32; + Self::raw_shot_from_rgb_crop(cropped, origin_gx, origin_gy, span_w, span_h) + } + /// Rasterizes `assets/computer_use_pointer.svg` via **resvg** (vector → antialiased pixmap). /// **Tip** in SVG user space **(0,0)** is placed at `(cx, cy)` = click hotspot. 
fn draw_pointer_marker(img: &mut RgbImage, cx: i32, cy: i32) { @@ -1089,13 +1239,8 @@ end tell"#]) if x0.saturating_add(w) > sw || y0.saturating_add(h) > sh { return Err(BitFunError::tool("Tile crop out of bounds.".to_string())); } - let mut out = RgbImage::new(w, h); - for yy in 0..h { - for xx in 0..w { - out.put_pixel(xx, yy, *src.get_pixel(x0 + xx, y0 + yy)); - } - } - Ok(out) + let view = image::imageops::crop_imm(src, x0, y0, w, h); + Ok(view.to_image()) } /// Pointer position in **scaled image** pixels, if it lies inside the captured display. @@ -1129,9 +1274,13 @@ end tell"#]) Some((ix, iy)) } - fn screenshot_sync_tool( + fn screenshot_sync_tool_with_capture( params: ComputerUseScreenshotParams, nav_in: Option, + rgba: image::RgbaImage, + screen: Screen, + som_elements: Vec, + implicit_confirmation_crop_applied: bool, ) -> BitFunResult<( ComputerScreenshot, PointerMap, @@ -1144,14 +1293,6 @@ end tell"#]) )); } - let screen = Screen::from_point(0, 0) - .map_err(|e| BitFunError::tool(format!("Screen capture init: {}", e)))?; - let rgba = screen.capture().map_err(|e| { - BitFunError::tool(format!( - "Screenshot failed (on macOS grant Screen Recording for BitFun): {}", - e - )) - })?; let (native_w, native_h) = rgba.dimensions(); let origin_x = screen.display_info.x; let origin_y = screen.display_info.y; @@ -1416,6 +1557,44 @@ end tell"#]) } }; + // Draw SoM (Set-of-Mark) numbered labels on the frame + if !som_elements.is_empty() { + #[cfg(target_os = "macos")] + { + let geo = macos_map_geo; + draw_som_labels( + &mut frame, + &som_elements, + margin_l, + margin_t, + |gx, gy| geo.global_to_view_pixel(gx, gy, content_w, content_h), + ); + } + #[cfg(not(target_os = "macos"))] + { + // On non-macOS: map global -> content pixel using the linear mapping + draw_som_labels( + &mut frame, + &som_elements, + margin_l, + margin_t, + |gx, gy| { + let ox = map_origin_x as f64; + let oy = map_origin_y as f64; + let nw = map_native_w as f64; + let nh = map_native_h as f64; + 
if nw <= 0.0 || nh <= 0.0 { return None; } + let rx = (gx - ox) / nw * content_w as f64; + let ry = (gy - oy) / nh * content_h as f64; + if rx < 0.0 || ry < 0.0 || rx >= content_w as f64 || ry >= content_h as f64 { + return None; + } + Some((rx.round() as i32, ry.round() as i32)) + }, + ); + } + } + let jpeg_bytes = Self::encode_jpeg(&frame, JPEG_QUALITY)?; let point_crop_half_extent_native = params.crop_center.map(|_| { @@ -1439,6 +1618,8 @@ end tell"#]) navigation_native_rect: shot_navigation_rect, quadrant_navigation_click_ready, image_content_rect: Some(image_content_rect), + som_labels: som_elements, + implicit_confirmation_crop_applied, }; #[cfg(target_os = "macos")] @@ -1528,18 +1709,88 @@ end tell"#]) } fn computer_use_guard_verified_ui(&self) -> BitFunResult<()> { - let guard = self - .click_needs_fresh_screenshot + let s = self + .state .lock() .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; - if *guard { - return Err(BitFunError::tool( - "Computer use refused: run action screenshot first. After the last pointer move or click you must capture a new screenshot before click or before key_chord that sends Return/Enter.".to_string(), - )); + if s.click_needs_fresh_screenshot { + return Err(BitFunError::tool(STALE_CAPTURE_TOOL_MESSAGE.to_string())); } Ok(()) } + /// Best-effort current mouse position in global screen coordinates. 
+ fn current_mouse_position() -> (f64, f64) { + #[cfg(target_os = "macos")] + { + macos::quartz_mouse_location().unwrap_or((0.0, 0.0)) + } + #[cfg(target_os = "windows")] + { + use windows::Win32::Foundation::POINT; + use windows::Win32::UI::WindowsAndMessaging::GetCursorPos; + unsafe { + let mut pt = POINT::default(); + if GetCursorPos(&mut pt).is_ok() { + (pt.x as f64, pt.y as f64) + } else { + (0.0, 0.0) + } + } + } + #[cfg(target_os = "linux")] + { + match Self::run_enigo_job(|e| { + e.location() + .map_err(|err| BitFunError::tool(format!("pointer location: {}", err))) + }) { + Ok((x, y)) => (x as f64, y as f64), + Err(_) => (0.0, 0.0), + } + } + #[cfg(not(any(target_os = "macos", target_os = "windows", target_os = "linux")))] + { + (0.0, 0.0) + } + } + + /// Resolve a screen capture from cache (if still valid and same screen) or capture fresh. + fn resolve_screenshot_capture( + cached: Option, + mouse_x: f64, + mouse_y: f64, + ) -> BitFunResult<(image::RgbaImage, Screen)> { + let mx = mouse_x.round() as i32; + let my = mouse_y.round() as i32; + + if let Some(cache) = cached { + let screen_id_match = cache.screen.display_info.id + == Screen::from_point(mx, my) + .map(|s| s.display_info.id) + .unwrap_or_default(); + if cache.capture_time.elapsed() < Duration::from_millis(SCREENSHOT_CACHE_TTL_MS) + && screen_id_match + { + debug!( + "Using cached screenshot (age: {}ms)", + cache.capture_time.elapsed().as_millis() + ); + return Ok((cache.rgba, cache.screen)); + } + } + + let screen = Screen::from_point(mx, my) + .or_else(|_| Screen::from_point(0, 0)) + .map_err(|e| BitFunError::tool(format!("Screen capture init: {}", e)))?; + let rgba = screen.capture().map_err(|e| { + BitFunError::tool(format!( + "Screenshot failed (on macOS grant Screen Recording for BitFun): {}", + e + )) + })?; + Ok((rgba, screen)) + } + fn chord_includes_return_or_enter(keys: &[String]) -> bool { keys.iter().any(|s| { matches!( @@ -1638,6 +1889,25 @@ mod macos { } } +impl 
DesktopComputerUseHost { + /// Perform a physical click at the current pointer without running [`ComputerUseHost::computer_use_guard_click_allowed`]. + /// Used after `mouse_move_global_f64` when coordinates came from AX, OCR, or SoM (not from vision model image coords). + async fn mouse_click_at_current_pointer(&self, button: &str) -> BitFunResult<()> { + let button = button.to_string(); + tokio::task::spawn_blocking(move || { + Self::run_enigo_job(|e| { + let b = Self::map_button(&button)?; + e.button(b, Direction::Click) + .map_err(|err| BitFunError::tool(format!("click: {}", err))) + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + ComputerUseHost::computer_use_after_click(self); + Ok(()) + } +} + #[async_trait] impl ComputerUseHost for DesktopComputerUseHost { async fn permission_snapshot(&self) -> BitFunResult { @@ -1646,6 +1916,50 @@ impl ComputerUseHost for DesktopComputerUseHost { .map_err(|e| BitFunError::tool(e.to_string()))?) } + fn computer_use_interaction_state(&self) -> ComputerUseInteractionState { + let s = self.state.lock().unwrap(); + let last_ref = s.last_shot_refinement; + let click_needs_fresh = s.click_needs_fresh_screenshot; + + let (click_ready, screenshot_kind, recommended_next_action) = match last_ref { + Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. }) => ( + !click_needs_fresh, + Some(ComputerUseInteractionScreenshotKind::RegionCrop), + None, + ), + Some(ComputerUseScreenshotRefinement::QuadrantNavigation { click_ready, .. }) if click_ready => ( + !click_needs_fresh, + Some(ComputerUseInteractionScreenshotKind::QuadrantTerminal), + None, + ), + Some(ComputerUseScreenshotRefinement::QuadrantNavigation { .. 
}) => ( + false, + Some(ComputerUseInteractionScreenshotKind::QuadrantDrill), + Some("screenshot_navigate_quadrant_until_click_ready".to_string()), + ), + Some(ComputerUseScreenshotRefinement::FullDisplay) => ( + !click_needs_fresh, + Some(ComputerUseInteractionScreenshotKind::FullDisplay), + if click_needs_fresh { + Some("screenshot".to_string()) + } else { + None + }, + ), + None => (false, None, Some("screenshot".to_string())), + }; + + ComputerUseInteractionState { + click_ready, + enter_ready: !click_needs_fresh, + requires_fresh_screenshot_before_click: click_needs_fresh, + requires_fresh_screenshot_before_enter: click_needs_fresh, + last_screenshot_kind: screenshot_kind, + last_mutation: None, + recommended_next_action, + } + } + async fn request_accessibility_permission(&self) -> BitFunResult<()> { #[cfg(target_os = "macos")] { @@ -1672,52 +1986,146 @@ impl ComputerUseHost for DesktopComputerUseHost { &self, params: ComputerUseScreenshotParams, ) -> BitFunResult { - let nav_snapshot = *self - .navigation_focus - .lock() - .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + let (nav_snapshot, cached, click_needs) = { + let s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + ( + s.navigation_focus, + s.screenshot_cache.clone(), + s.click_needs_fresh_screenshot, + ) + }; + + // Get current mouse position to select the right screen + let (mouse_x, mouse_y) = Self::current_mouse_position(); + + // Resolve capture from cache or fresh + let (rgba, screen) = Self::resolve_screenshot_capture(cached, mouse_x, mouse_y)?; + let (native_w, native_h) = rgba.dimensions(); + + let mut params = params; + let mut implicit_applied = false; + if implicit_confirmation_should_apply(click_needs, ¶ms) { + let center = params + .implicit_confirmation_center + .unwrap_or(ComputerUseImplicitScreenshotCenter::Mouse); + let (gx, gy) = implicit_global_center_for_confirmation(center, mouse_x, mouse_y); + let (cx, cy) = 
global_to_native_full_pixel_center( + gx, + gy, + native_w, + native_h, + &screen.display_info, + ); + params = ComputerUseScreenshotParams { + crop_center: Some(ScreenshotCropCenter { x: cx, y: cy }), + navigate_quadrant: None, + reset_navigation: false, + point_crop_half_extent_native: Some(COMPUTER_USE_POINT_CROP_HALF_DEFAULT), + implicit_confirmation_center: None, + }; + implicit_applied = true; + } + + // Update cache in state + { + let mut s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + s.screenshot_cache = Some(ScreenshotCacheEntry { + rgba: rgba.clone(), + screen: screen.clone(), + capture_time: Instant::now(), + }); + } + + // Enumerate SoM elements (AX tree walk) for label overlay + let som_elements = self.enumerate_som_elements().await; let (shot, map, nav_out) = tokio::task::spawn_blocking(move || { - Self::screenshot_sync_tool(params, nav_snapshot) + Self::screenshot_sync_tool_with_capture( + params, + nav_snapshot, + rgba, + screen, + som_elements, + implicit_applied, + ) }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; - *self - .last_pointer_map - .lock() - .map_err(|e| BitFunError::tool(format!("lock: {}", e)))? = Some(map); - - *self - .navigation_focus - .lock() - .map_err(|e| BitFunError::tool(format!("lock: {}", e)))? = nav_out; - let refinement = Self::refinement_from_shot(&shot); - *self - .last_shot_refinement - .lock() - .map_err(|e| BitFunError::tool(format!("lock: {}", e)))? 
= Some(refinement); - - ComputerUseHost::computer_use_after_screenshot(self); + { + let mut s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + s.transition_after_screenshot(map, refinement, nav_out); + } Ok(shot) } async fn screenshot_peek_full_display(&self) -> BitFunResult { let (shot, _map, _) = tokio::task::spawn_blocking(|| { - Self::screenshot_sync_tool(ComputerUseScreenshotParams::default(), None) + let screen = Screen::from_point(0, 0) + .map_err(|e| BitFunError::tool(format!("Screen capture init (peek): {}", e)))?; + let rgba = screen.capture().map_err(|e| { + BitFunError::tool(format!("Screenshot failed (peek): {}", e)) + })?; + Self::screenshot_sync_tool_with_capture( + ComputerUseScreenshotParams::default(), + None, + rgba, + screen, + vec![], // No SoM labels for peek screenshots + false, + ) }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; Ok(shot) } + async fn ocr_find_text_matches( + &self, + text_query: &str, + region_native: Option, + ) -> BitFunResult> { + let region_opt = region_native.clone(); + let shot = tokio::task::spawn_blocking(move || { + let region = Self::ocr_resolve_region_for_capture(region_opt)?; + Self::screenshot_raw_native_region(region) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + let query = text_query.to_string(); + let desktop_matches = tokio::task::spawn_blocking(move || { + super::screen_ocr::find_text_matches(&shot, &query) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + Ok(desktop_matches + .into_iter() + .map(|m| bitfun_core::agentic::tools::computer_use_host::OcrTextMatch { + text: m.text, + confidence: m.confidence, + center_x: m.center_x, + center_y: m.center_y, + bounds_left: m.bounds_left, + bounds_top: m.bounds_top, + bounds_width: m.bounds_width, + bounds_height: m.bounds_height, + }) + .collect()) + } + fn last_screenshot_refinement(&self) -> Option { - self.last_shot_refinement - .lock() - .ok() - .and_then(|g| *g) + 
self.state.lock().ok().and_then(|s| s.last_shot_refinement) } async fn locate_ui_element_screen_center( @@ -1758,12 +2166,28 @@ impl ComputerUseHost for DesktopComputerUseHost { } } + async fn enumerate_som_elements(&self) -> Vec { + #[cfg(target_os = "macos")] + { + const SOM_MAX_ELEMENTS: usize = 50; + tokio::task::spawn_blocking(move || { + crate::computer_use::macos_ax_ui::enumerate_interactive_elements(SOM_MAX_ELEMENTS) + }) + .await + .unwrap_or_default() + } + #[cfg(not(target_os = "macos"))] + { + vec![] + } + } + fn map_image_coords_to_pointer_f64(&self, x: i32, y: i32) -> BitFunResult<(f64, f64)> { - let guard = self - .last_pointer_map + let s = self + .state .lock() .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; - let Some(map) = *guard else { + let Some(map) = s.pointer_map else { return Err(BitFunError::tool( "No screenshot yet in this session: run action screenshot first, then use x,y in the screenshot image pixel grid (image_width x image_height), or set use_screen_coordinates true with global screen pixels.".to_string(), )); @@ -1777,11 +2201,11 @@ impl ComputerUseHost for DesktopComputerUseHost { } fn map_normalized_coords_to_pointer_f64(&self, x: i32, y: i32) -> BitFunResult<(f64, f64)> { - let guard = self - .last_pointer_map + let s = self + .state .lock() .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; - let Some(map) = *guard else { + let Some(map) = s.pointer_map else { return Err(BitFunError::tool( "No screenshot yet: run screenshot first. 
For coordinate_mode \"normalized\", use x and y each in 0..=1000.".to_string(), )); @@ -1802,6 +2226,7 @@ impl ComputerUseHost for DesktopComputerUseHost { }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; + self.clear_vision_pixel_nudge_block(); ComputerUseHost::computer_use_after_pointer_mutation(self); return Ok(()); } @@ -1812,6 +2237,7 @@ impl ComputerUseHost for DesktopComputerUseHost { } async fn mouse_move(&self, x: i32, y: i32) -> BitFunResult<()> { + debug!("computer_use: mouse_move absolute ({}, {})", x, y); tokio::task::spawn_blocking(move || { Self::run_enigo_job(|e| { e.move_mouse(x, y, Coordinate::Abs) @@ -1820,6 +2246,7 @@ impl ComputerUseHost for DesktopComputerUseHost { }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; + self.clear_vision_pixel_nudge_block(); ComputerUseHost::computer_use_after_pointer_mutation(self); Ok(()) } @@ -1829,17 +2256,29 @@ impl ComputerUseHost for DesktopComputerUseHost { return Ok(()); } + { + let s = self + .state + .lock() + .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; + if s.block_vision_pixel_nudge_after_screenshot { + return Err(BitFunError::tool( + VISION_PIXEL_NUDGE_AFTER_SCREENSHOT_MSG.to_string(), + )); + } + } + #[cfg(target_os = "macos")] { // enigo `Coordinate::Rel` uses `location()` on macOS, which mixes NSEvent + main-display // pixel height — not the same space as `CGEvent` / our screenshot mapping. Use Quartz // position + scale from the last capture (display points per screenshot pixel). let geo = { - let guard = self - .last_pointer_map + let s = self + .state .lock() .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; - let Some(map) = *guard else { + let Some(map) = s.pointer_map else { return Err(BitFunError::tool( "Run action screenshot first: on macOS, pointer_move_relative / ComputerUseMouseStep convert pixel deltas using the last capture scale." 
.to_string(), @@ -1892,18 +2331,45 @@ impl ComputerUseHost for DesktopComputerUseHost { } async fn mouse_click(&self, button: &str) -> BitFunResult<()> { + debug!("computer_use: mouse_click button={}", button); ComputerUseHost::computer_use_guard_click_allowed(self)?; + self.mouse_click_at_current_pointer(button).await + } + + async fn mouse_click_authoritative(&self, button: &str) -> BitFunResult<()> { + debug!("computer_use: mouse_click_authoritative button={}", button); + self.mouse_click_at_current_pointer(button).await + } + + async fn mouse_down(&self, button: &str) -> BitFunResult<()> { + debug!("computer_use: mouse_down button={}", button); let button = button.to_string(); tokio::task::spawn_blocking(move || { Self::run_enigo_job(|e| { let b = Self::map_button(&button)?; - e.button(b, Direction::Click) - .map_err(|err| BitFunError::tool(format!("click: {}", err))) + e.button(b, Direction::Press) + .map_err(|err| BitFunError::tool(format!("mouse_down: {}", err))) }) }) .await .map_err(|e| BitFunError::tool(e.to_string()))??; - ComputerUseHost::computer_use_after_click(self); + ComputerUseHost::computer_use_after_pointer_mutation(self); + Ok(()) + } + + async fn mouse_up(&self, button: &str) -> BitFunResult<()> { + debug!("computer_use: mouse_up button={}", button); + let button = button.to_string(); + tokio::task::spawn_blocking(move || { + Self::run_enigo_job(|e| { + let b = Self::map_button(&button)?; + e.button(b, Direction::Release) + .map_err(|err| BitFunError::tool(format!("mouse_up: {}", err))) + }) + }) + .await + .map_err(|e| BitFunError::tool(e.to_string()))??; + ComputerUseHost::computer_use_after_pointer_mutation(self); Ok(()) } @@ -1935,6 +2401,7 @@ impl ComputerUseHost for DesktopComputerUseHost { if keys.is_empty() { return Ok(()); } + debug!("computer_use: key_chord keys={:?}", keys); if Self::chord_includes_return_or_enter(&keys) { Self::computer_use_guard_verified_ui(self)?; } @@ -2010,41 +2477,89 @@ impl ComputerUseHost for 
DesktopComputerUseHost { } fn computer_use_after_screenshot(&self) { - if let Ok(mut g) = self.click_needs_fresh_screenshot.lock() { - *g = false; - } + // Transition is handled centrally in screenshot_display via transition_after_screenshot. } fn computer_use_after_pointer_mutation(&self) { - if let Ok(mut g) = self.click_needs_fresh_screenshot.lock() { - *g = true; + if let Ok(mut s) = self.state.lock() { + s.transition_after_pointer_mutation(); } } fn computer_use_after_click(&self) { - if let Ok(mut g) = self.click_needs_fresh_screenshot.lock() { - *g = true; + if let Ok(mut s) = self.state.lock() { + s.transition_after_click(); } } fn computer_use_guard_click_allowed(&self) -> BitFunResult<()> { - self.computer_use_guard_verified_ui()?; - let refine = self - .last_shot_refinement + let s = self + .state .lock() .map_err(|e| BitFunError::tool(format!("lock: {}", e)))?; - match *refine { + if s.click_needs_fresh_screenshot { + return Err(BitFunError::tool(STALE_CAPTURE_TOOL_MESSAGE.to_string())); + } + match s.last_shot_refinement { Some(ComputerUseScreenshotRefinement::RegionAroundPoint { .. }) => {} Some(ComputerUseScreenshotRefinement::QuadrantNavigation { click_ready: true, .. }) => {} + // Fresh full-screen JPEG matches the display — valid for image-space `mouse_move` then + // guarded `click` as long as `click_needs_fresh_screenshot` is false above. + Some(ComputerUseScreenshotRefinement::FullDisplay) => {} _ => { return Err(BitFunError::tool( - "Click refused: use a **fine** screenshot basis — either a **~500×500 point crop** (`screenshot_crop_center_x` / `y` in full-display native pixels) **or** keep drilling with `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` is true in the tool result, then `ComputerUseMousePrecise` / `ComputerUseMouseStep` + `click`. 
Full-screen alone is not enough.".to_string(), + "Click refused: use a **fine** screenshot basis — either a **~500×500 point crop** (`screenshot_crop_center_x` / `y` in full-display native pixels) **or** keep drilling with `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` is true in the tool result, then `ComputerUseMousePrecise` / `ComputerUseMouseStep` / **`ComputerUse`** **`mouse_move`** (**`use_screen_coordinates`: true** only) to position, then **`click`**. Or take a **full-screen** `screenshot` (no pointer move since capture), then **`mouse_move`** with globals from tool results, then **`click`**.".to_string(), )); } } Ok(()) } + + fn computer_use_guard_click_allowed_relaxed(&self) -> BitFunResult<()> { + // For AX-based click_element: we only require that no pointer mutation + // happened since the last known state (i.e. we moved the pointer ourselves + // inside click_element, so the flag is not set). No fine-screenshot needed. + // This is intentionally permissive — AX coordinates are authoritative. 
+ Ok(()) + } + + fn record_action(&self, action_type: &str, action_params: &str, success: bool) { + if let Ok(mut s) = self.state.lock() { + s.optimizer.record_action( + action_type.to_string(), + action_params.to_string(), + success, + ); + } + } + + fn update_screenshot_hash(&self, hash: u64) { + if let Ok(mut s) = self.state.lock() { + s.optimizer.update_screenshot_hash(hash); + } + } + + fn detect_action_loop(&self) -> LoopDetectionResult { + if let Ok(s) = self.state.lock() { + s.optimizer.detect_loop() + } else { + LoopDetectionResult { + is_loop: false, + pattern_length: 0, + repetitions: 0, + suggestion: String::new(), + } + } + } + + fn get_action_history(&self) -> Vec { + if let Ok(s) = self.state.lock() { + s.optimizer.get_history() + } else { + vec![] + } + } } diff --git a/src/apps/desktop/src/computer_use/macos_ax_ui.rs b/src/apps/desktop/src/computer_use/macos_ax_ui.rs index d1031724..cd969cd7 100644 --- a/src/apps/desktop/src/computer_use/macos_ax_ui.rs +++ b/src/apps/desktop/src/computer_use/macos_ax_ui.rs @@ -3,7 +3,7 @@ //! Coordinates match CoreGraphics global space used by [`crate::computer_use::DesktopComputerUseHost`]. 
use crate::computer_use::ui_locate_common; -use bitfun_core::agentic::tools::computer_use_host::{UiElementLocateQuery, UiElementLocateResult}; +use bitfun_core::agentic::tools::computer_use_host::{SomElement, UiElementLocateQuery, UiElementLocateResult}; use bitfun_core::util::errors::{BitFunError, BitFunResult}; use core_foundation::array::{CFArray, CFArrayRef}; use core_foundation::base::{CFTypeRef, TCFType}; @@ -17,6 +17,7 @@ type AXValueRef = *const c_void; #[link(name = "ApplicationServices", kind = "framework")] unsafe extern "C" { + fn AXUIElementCreateSystemWide() -> AXUIElementRef; fn AXUIElementCreateApplication(pid: i32) -> AXUIElementRef; fn AXUIElementCopyAttributeValue( element: AXUIElementRef, @@ -151,9 +152,108 @@ unsafe fn element_frame_global(elem: AXUIElementRef) -> Option<(f64, f64, f64, f struct Queued { ax: AXUIElementRef, depth: u32, + /// Parent's role + title for context (e.g. "AXWindow: Settings"). + parent_desc: Option, } -/// Search the **frontmost** app’s accessibility tree (BFS) for the first element matching filters. +/// A candidate match found during BFS, before ranking. +struct CandidateMatch { + gx: f64, + gy: f64, + bounds_left: f64, + bounds_top: f64, + bounds_width: f64, + bounds_height: f64, + role: String, + title: Option, + identifier: Option, + parent_desc: Option, + depth: u32, + /// Whether AXHidden is explicitly false / absent (visible). + is_visible: bool, +} + +impl CandidateMatch { + /// Higher = better. Prefer visible, reasonably-sized, shallower, on-screen elements. 
+ fn rank_score(&self) -> i64 { + let mut score: i64 = 0; + + // Visibility is critical + if !self.is_visible { + score -= 10000; + } + + // Off-screen penalty + if !ui_locate_common::is_element_on_screen( + self.gx, self.gy, self.bounds_width, self.bounds_height, + ) { + score -= 5000; + } + + // Prefer reasonably-sized elements (buttons, text fields) over huge containers + let area = self.bounds_width * self.bounds_height; + if area > 0.0 && area < 50000.0 { + score += 100; // Small interactive element + } else if area >= 50000.0 && area < 200000.0 { + score += 50; // Medium element + } + // Very large elements (>200000 area) get no bonus -- likely containers + + // Prefer shallower elements (closer to the top of the tree = more likely + // to be the "primary" instance vs a deeply nested duplicate) + score -= self.depth as i64; + + // Bonus for elements in focused/active contexts + if let Some(ref pd) = self.parent_desc { + let pd_lower = pd.to_lowercase(); + if pd_lower.contains("sheet") || pd_lower.contains("dialog") || pd_lower.contains("popover") { + score += 200; // Prefer elements in modal dialogs / sheets + } + } + + // Prefer elements with a non-empty title (more likely to be interactive) + if self.title.as_ref().map_or(false, |t| !t.is_empty()) { + score += 20; + } + + score + } + + fn short_description(&self) -> String { + let title_str = self.title.as_deref().unwrap_or(""); + let parent_str = self.parent_desc.as_deref().unwrap_or("?"); + format!( + "role={} title={:?} at ({:.0},{:.0}) size={:.0}x{:.0} parent=[{}]", + self.role, title_str, self.gx, self.gy, + self.bounds_width, self.bounds_height, parent_str + ) + } +} + +/// Check if an AX element has `AXHidden` set to true. 
+unsafe fn is_ax_hidden(elem: AXUIElementRef) -> bool { + let Some(val) = ax_copy_attr(elem, "AXHidden") else { + return false; // No AXHidden attribute = not hidden + }; + // AXHidden is a CFBoolean + let hidden = val as *const c_void == core_foundation::boolean::kCFBooleanTrue as *const c_void; + ax_release(val); + hidden +} + +/// Build a short description string for an element (for use as parent context). +fn element_short_desc(role: Option<&str>, title: Option<&str>) -> String { + let r = role.unwrap_or("?"); + match title { + Some(t) if !t.is_empty() => format!("{}: {}", r, t), + _ => r.to_string(), + } +} + +const MAX_CANDIDATES: usize = 10; + +/// Search the **frontmost** app's accessibility tree (BFS) for elements matching filters. +/// Collects all matches, filters invisible/off-screen ones, ranks by relevance, returns the best. pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult { ui_locate_common::validate_query(query)?; let max_depth = query.max_depth.unwrap_or(48).clamp(1, 200); @@ -162,38 +262,25 @@ pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult = Vec::new(); - loop { - let Some(cur) = q.pop_front() else { - return Err(BitFunError::tool( - "No accessibility element matched in the **frontmost** app. Filters default to **AND** (`filter_combine` omitted = `all`): every non-empty field must match the **same** node — e.g. `title_contains` + `role_substring` together often fails when the control has a **role** but **empty or different AXTitle** (typical for search fields). Try: **`filter_combine`: `\"any\"`**, or **only** `role_substring` (e.g. `TextField`), or **only** `title_contains`; match UI language; ensure the chat app is focused. Or use **`action: screenshot`**. 
(If Accessibility were denied, you would see a different error.)" - .to_string(), - )); - }; + while let Some(cur) = bfs_queue.pop_front() { if cur.depth > max_depth { - unsafe { - ax_release(cur.ax as CFTypeRef); - } + unsafe { ax_release(cur.ax as CFTypeRef); } continue; } visited += 1; if visited > max_nodes { - unsafe { - ax_release(cur.ax as CFTypeRef); - } - while let Some(c) = q.pop_front() { - unsafe { - ax_release(c.ax as CFTypeRef); - } + unsafe { ax_release(cur.ax as CFTypeRef); } + // Drain remaining queue + while let Some(c) = bfs_queue.pop_front() { + unsafe { ax_release(c.ax as CFTypeRef); } } - return Err(BitFunError::tool( - "Accessibility search limit reached; narrow title/role/identifier filters." - .to_string(), - )); + break; } let (role_s, title_s, id_s) = unsafe { read_role_title_id(cur.ax) }; @@ -204,28 +291,34 @@ pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult= MAX_CANDIDATES { + unsafe { ax_release(cur.ax as CFTypeRef); } + while let Some(c) = bfs_queue.pop_front() { + unsafe { ax_release(c.ax as CFTypeRef); } + } + break; } - return ui_locate_common::ok_result( - gx, - gy, - bl, - bt, - bw, - bh, - role_s.unwrap_or_default(), - title_s, - id_s, - ); } } + // Build description for this node to pass as parent context to children + let this_desc = element_short_desc(role_ref, title_ref); + let children_ref = unsafe { ax_copy_attr(cur.ax, "AXChildren") }; let next_depth = cur.depth + 1; - unsafe { - ax_release(cur.ax as CFTypeRef); - } + unsafe { ax_release(cur.ax as CFTypeRef); } let Some(ch) = children_ref else { continue; @@ -234,21 +327,289 @@ pub fn locate_ui_element_center(query: &UiElementLocateQuery) -> BitFunResult::wrap_under_create_rule(ch as CFArrayRef); let n = arr.len(); for i in 0..n { - let Some(child_ref) = arr.get(i) else { - continue; - }; + let Some(child_ref) = arr.get(i) else { continue; }; let child = *child_ref; - if child.is_null() { - continue; - } + if child.is_null() { continue; } let 
retained = CFRetain(child as CFTypeRef) as AXUIElementRef; if !retained.is_null() { - q.push_back(Queued { + bfs_queue.push_back(Queued { ax: retained, depth: next_depth, + parent_desc: Some(this_desc.clone()), }); } } } } + + if candidates.is_empty() { + return Err(BitFunError::tool( + "No accessibility element matched in the frontmost app. Tips: use `filter_combine: \"any\"` for OR matching; use only `role_substring` or only `title_contains`; match UI language; ensure the target app is focused. Or fall back to `screenshot` + vision path." + .to_string(), + )); + } + + // Sort by rank score (descending) + candidates.sort_by(|a, b| b.rank_score().cmp(&a.rank_score())); + + let total = candidates.len() as u32; + let best = &candidates[0]; + + // Build "other matches" summaries for the model to see alternatives + let other_matches: Vec = candidates.iter() + .skip(1) + .take(4) + .map(|c| c.short_description()) + .collect(); + + ui_locate_common::ok_result_with_context( + best.gx, best.gy, + best.bounds_left, best.bounds_top, best.bounds_width, best.bounds_height, + best.role.clone(), + best.title.clone(), + best.identifier.clone(), + best.parent_desc.clone(), + total, + other_matches, + ) +} + + +/// Roles considered "interactive" for Set-of-Mark labeling. +const SOM_INTERACTIVE_ROLES: &[&str] = &[ + "AXButton", "AXTextField", "AXTextArea", "AXCheckBox", + "AXRadioButton", "AXPopUpButton", "AXComboBox", "AXSlider", + "AXLink", "AXMenuItem", "AXMenuBarItem", "AXTab", + "AXDisclosureTriangle", "AXIncrementor", "AXColorWell", + "AXToolbarButton", "AXToggle", "AXSwitch", "AXSegmentedControl", + "AXCell", "AXImage", "AXStaticText", +]; + +fn is_interactive_role(role: &str) -> bool { + SOM_INTERACTIVE_ROLES.iter().any(|r| role.contains(r) || r.contains(role)) +} + +/// Enumerate all visible interactive elements in the frontmost app's AX tree. +/// Returns up to `max_elements` SomElement entries with 1-based label numbers. 
+pub fn enumerate_interactive_elements(max_elements: usize) -> Vec { + let pid = match frontmost_pid() { + Ok(p) => p, + Err(_) => return vec![], + }; + let root = unsafe { AXUIElementCreateApplication(pid) }; + if root.is_null() { + return vec![]; + } + + struct BfsItem { + ax: AXUIElementRef, + depth: u32, + } + + let mut queue = VecDeque::new(); + queue.push_back(BfsItem { ax: root, depth: 0 }); + let max_depth: u32 = 30; + let max_nodes: usize = 8_000; + let mut visited: usize = 0; + let mut results: Vec = Vec::new(); + + while let Some(cur) = queue.pop_front() { + if cur.depth > max_depth || results.len() >= max_elements { + unsafe { ax_release(cur.ax as CFTypeRef); } + continue; + } + visited += 1; + if visited > max_nodes { + unsafe { ax_release(cur.ax as CFTypeRef); } + while let Some(c) = queue.pop_front() { + unsafe { ax_release(c.ax as CFTypeRef); } + } + break; + } + + let (role_s, title_s, id_s) = unsafe { read_role_title_id(cur.ax) }; + let role = role_s.as_deref().unwrap_or(""); + + // Check if this element is interactive and visible + if is_interactive_role(role) { + let hidden = unsafe { is_ax_hidden(cur.ax) }; + if !hidden { + if let Some((gx, gy, bl, bt, bw, bh)) = unsafe { element_frame_global(cur.ax) } { + // Filter: reasonable size (not a giant container, not tiny) + if bw >= 4.0 && bh >= 4.0 && bw <= 2000.0 && bh <= 1000.0 { + // Filter: on-screen (center must be non-negative) + if gx >= 0.0 && gy >= 0.0 { + let label = results.len() as u32 + 1; + results.push(SomElement { + label, + role: role.to_string(), + title: title_s.clone().filter(|s| !s.is_empty()), + identifier: id_s.clone().filter(|s| !s.is_empty()), + global_center_x: gx, + global_center_y: gy, + bounds_left: bl, + bounds_top: bt, + bounds_width: bw, + bounds_height: bh, + }); + if results.len() >= max_elements { + unsafe { ax_release(cur.ax as CFTypeRef); } + while let Some(c) = queue.pop_front() { + unsafe { ax_release(c.ax as CFTypeRef); } + } + break; + } + } + } + } + } + } + 
+ // Enqueue children + let children_ref = unsafe { ax_copy_attr(cur.ax, "AXChildren") }; + let next_depth = cur.depth + 1; + unsafe { ax_release(cur.ax as CFTypeRef); } + + let Some(ch) = children_ref else { continue; }; + unsafe { + let arr = CFArray::<*const c_void>::wrap_under_create_rule(ch as CFArrayRef); + let n = arr.len(); + for i in 0..n { + let Some(child_ref) = arr.get(i) else { continue; }; + let child = *child_ref; + if child.is_null() { continue; } + let retained = CFRetain(child as CFTypeRef) as AXUIElementRef; + if !retained.is_null() { + queue.push_back(BfsItem { ax: retained, depth: next_depth }); + } + } + } + } + + results +} + +// ── Raw OCR: frontmost window bounds (separate from agent screenshot pipeline) ───────────────── + +/// Bounds of the foreground app's focused or main window in global screen coordinates (same space as pointer / screen capture). +/// Used to crop **raw** pixels for Vision OCR without pointer/SoM overlays from the agent screenshot path. +pub fn frontmost_window_bounds_global() -> BitFunResult<(i32, i32, u32, u32)> { + let pid = frontmost_pid()?; + let app = unsafe { AXUIElementCreateApplication(pid) }; + if app.is_null() { + return Err(BitFunError::tool( + "AXUIElementCreateApplication returned null for OCR window bounds.".to_string(), + )); + } + unsafe { + let win = try_frontmost_window_element(app); + ax_release(app as CFTypeRef); + let Some(win) = win else { + return Err(BitFunError::tool( + "No AX window for foreground app (try AXFocusedWindow / AXMainWindow / AXWindows).".to_string(), + )); + }; + let frame = element_frame_global(win).ok_or_else(|| { + ax_release(win as CFTypeRef); + BitFunError::tool("Could not read AXPosition/AXSize for foreground window.".to_string()) + })?; + ax_release(win as CFTypeRef); + let (_, _, bl, bt, bw, bh) = frame; + if bw < 1.0 || bh < 1.0 { + return Err(BitFunError::tool( + "Foreground window has invalid size for OCR.".to_string(), + )); + } + let x0 = bl.floor() as i32; + let y0 
= bt.floor() as i32; + let w = bw.ceil().max(1.0) as u32; + let h = bh.ceil().max(1.0) as u32; + Ok((x0, y0, w, h)) + } +} + +unsafe fn try_frontmost_window_element(app: AXUIElementRef) -> Option { + for key in ["AXFocusedWindow", "AXMainWindow"] { + if let Some(w) = ax_copy_attr(app, key) { + let elem = w as AXUIElementRef; + if !elem.is_null() && element_frame_global(elem).is_some() { + return Some(elem); + } + ax_release(w); + } + } + first_ax_window_from_ax_windows(app) +} + +fn is_text_editing_ax_role(role: &str) -> bool { + matches!( + role, + "AXTextField" | "AXTextArea" | "AXComboBox" | "AXSearchField" | "AXSecureTextField" + ) +} + +unsafe fn ax_focused_element_from_system_wide() -> Option { + let sys = AXUIElementCreateSystemWide(); + if sys.is_null() { + return None; + } + let mut focused: CFTypeRef = std::ptr::null(); + let k = CFString::new("AXFocusedUIElement"); + let st = AXUIElementCopyAttributeValue(sys, k.as_concrete_TypeRef(), &mut focused); + if st != 0 || focused.is_null() { + if !focused.is_null() { + ax_release(focused); + } + return None; + } + Some(focused as AXUIElementRef) +} + +/// Best-effort global (x, y) for a 500×500 screenshot centered near the focused text field (AX element center). +/// Returns `None` if no suitable focused text UI; caller should fall back to the mouse position. 
+pub fn global_point_for_text_caret_screenshot(mx: f64, my: f64) -> (f64, f64) { + unsafe { + let Some(el) = ax_focused_element_from_system_wide() else { + return (mx, my); + }; + let (role, _, _) = read_role_title_id(el); + let Some(role) = role.as_deref() else { + ax_release(el as CFTypeRef); + return (mx, my); + }; + if !is_text_editing_ax_role(role) { + ax_release(el as CFTypeRef); + return (mx, my); + } + let Some((gx, gy, _, _, _, _)) = element_frame_global(el) else { + ax_release(el as CFTypeRef); + return (mx, my); + }; + ax_release(el as CFTypeRef); + (gx, gy) + } +} + +unsafe fn first_ax_window_from_ax_windows(app: AXUIElementRef) -> Option { + let arr_ref = ax_copy_attr(app, "AXWindows")?; + let arr = CFArray::<*const c_void>::wrap_under_create_rule(arr_ref as CFArrayRef); + for i in 0..arr.len() { + let Some(w) = arr.get(i) else { + continue; + }; + let child = *w as AXUIElementRef; + if child.is_null() { + continue; + } + let retained = CFRetain(child as CFTypeRef) as AXUIElementRef; + if retained.is_null() { + continue; + } + let (role, _, _) = read_role_title_id(retained); + if role.as_deref() == Some("AXWindow") && element_frame_global(retained).is_some() { + return Some(retained); + } + ax_release(retained as CFTypeRef); + } + None } diff --git a/src/apps/desktop/src/computer_use/mod.rs b/src/apps/desktop/src/computer_use/mod.rs index 9b50d5a9..efe34a36 100644 --- a/src/apps/desktop/src/computer_use/mod.rs +++ b/src/apps/desktop/src/computer_use/mod.rs @@ -2,6 +2,7 @@ mod desktop_host; mod ui_locate_common; +mod screen_ocr; #[cfg(target_os = "macos")] mod macos_ax_ui; #[cfg(target_os = "windows")] diff --git a/src/apps/desktop/src/computer_use/screen_ocr.rs b/src/apps/desktop/src/computer_use/screen_ocr.rs new file mode 100644 index 00000000..df7e3733 --- /dev/null +++ b/src/apps/desktop/src/computer_use/screen_ocr.rs @@ -0,0 +1,869 @@ +use bitfun_core::agentic::tools::computer_use_host::{ + ComputerScreenshot, ComputerUseImageContentRect, 
OcrRegionNative, +}; +use bitfun_core::infrastructure::try_get_path_manager_arc; +use bitfun_core::util::errors::{BitFunError, BitFunResult}; +use image::codecs::jpeg::JpegEncoder; +use log::{info, warn}; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +use chrono::Utc; + +#[derive(Debug, Clone)] +pub struct OcrTextMatch { + pub text: String, + pub confidence: f32, + pub center_x: f64, + pub center_y: f64, + pub bounds_left: f64, + pub bounds_top: f64, + pub bounds_width: f64, + pub bounds_height: f64, +} + +pub fn find_text_matches( + shot: &ComputerScreenshot, + text_query: &str, +) -> BitFunResult> { + let query = normalize_query(text_query)?; + save_ocr_debug_jpeg(shot, &query); + + #[cfg(target_os = "macos")] + { + return macos::find_text_matches(shot, &query); + } + + #[cfg(target_os = "windows")] + { + return windows_backend::find_text_matches(shot, &query); + } + + #[cfg(target_os = "linux")] + { + return linux_backend::find_text_matches(shot, &query); + } + + #[allow(unreachable_code)] + Err(BitFunError::tool( + "move_to_text OCR is not supported on this platform.".to_string(), + )) +} + +/// If unset or non-zero: write the exact JPEG passed to OCR into `computer_use_debug` under the app data dir (see implementation). Set `BITFUN_COMPUTER_USE_OCR_DEBUG=0` to disable. +fn ocr_debug_save_enabled() -> bool { + match std::env::var("BITFUN_COMPUTER_USE_OCR_DEBUG") { + Ok(v) if v == "0" || v.eq_ignore_ascii_case("false") => false, + _ => true, + } +} + +/// Same directory as agent `screenshot` debug (`workspace/.bitfun/computer_use_debug`), when PathManager is available. 
+fn computer_use_ocr_debug_dir() -> PathBuf { + if let Ok(pm) = try_get_path_manager_arc() { + return pm + .default_assistant_workspace_dir(None) + .join(".bitfun") + .join("computer_use_debug"); + } + dirs::home_dir() + .map(|h| { + h.join(".bitfun") + .join("personal_assistant") + .join("workspace") + .join(".bitfun") + .join("computer_use_debug") + }) + .unwrap_or_else(|| std::env::temp_dir().join("computer_use_debug")) +} + +/// Persists `shot.bytes` (same buffer as Vision / WinRT / Tesseract) before OCR runs. +fn save_ocr_debug_jpeg(shot: &ComputerScreenshot, text_query: &str) { + if !ocr_debug_save_enabled() { + return; + } + let dir = computer_use_ocr_debug_dir(); + if let Err(e) = fs::create_dir_all(&dir) { + warn!("computer_use ocr_debug: create_dir_all {:?}: {}", dir, e); + return; + } + let safe: String = text_query + .chars() + .map(|c| match c { + '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_', + c if c.is_control() => '_', + c => c, + }) + .take(96) + .collect(); + let safe = if safe.trim().is_empty() { + "query".to_string() + } else { + safe + }; + let ts = Utc::now().format("%Y%m%d_%H%M%S"); + let ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0); + let name = format!( + "ocr_{}_{}_{}x{}_{}ms_{}.jpg", + ts, + ms, + shot.image_width, + shot.image_height, + shot.bytes.len(), + safe + ); + let path = dir.join(name); + match fs::File::create(&path).and_then(|mut f| f.write_all(&shot.bytes)) { + Ok(()) => { + info!( + "computer_use ocr_debug: wrote {} bytes to {}", + shot.bytes.len(), + path.display() + ); + } + Err(e) => warn!("computer_use ocr_debug: write {:?}: {}", path, e), + } +} + +fn normalize_query(text_query: &str) -> BitFunResult { + let q = text_query.trim(); + if q.is_empty() { + return Err(BitFunError::tool( + "move_to_text requires a non-empty text_query.".to_string(), + )); + } + Ok(q.to_string()) +} + +fn normalize_for_match(s: &str) -> String { + 
s.split_whitespace() + .collect::<Vec<&str>>() + .join(" ") + .to_lowercase() +} + +fn rank_matches(mut matches: Vec<OcrTextMatch>, query: &str) -> Vec<OcrTextMatch> { + let normalized_query = normalize_for_match(query); + matches.sort_by(|a, b| compare_match(a, b, &normalized_query)); + matches +} + +fn compare_match( + a: &OcrTextMatch, + b: &OcrTextMatch, + normalized_query: &str, +) -> std::cmp::Ordering { + let sa = match_score(a, normalized_query); + let sb = match_score(b, normalized_query); + sb.cmp(&sa) + .then_with(|| { + b.confidence + .partial_cmp(&a.confidence) + .unwrap_or(std::cmp::Ordering::Equal) + }) + .then_with(|| { + let da = normalized_len_delta(&a.text, normalized_query); + let db = normalized_len_delta(&b.text, normalized_query); + da.cmp(&db) + }) +} + +fn match_score(m: &OcrTextMatch, normalized_query: &str) -> i32 { + let text = normalize_for_match(&m.text); + if text == normalized_query { + 4 + } else if text.starts_with(normalized_query) { + 3 + } else if text.contains(normalized_query) { + 2 + } else { + 1 + } +} + +fn normalized_len_delta(text: &str, normalized_query: &str) -> usize { + let l = normalize_for_match(text).chars().count(); + let q = normalized_query.chars().count(); + l.abs_diff(q) +} + +fn filter_and_rank(query: &str, raw_matches: Vec<OcrTextMatch>) -> Vec<OcrTextMatch> { + let normalized_query = normalize_for_match(query); + let filtered = raw_matches + .into_iter() + .filter(|m| normalize_for_match(&m.text).contains(&normalized_query)) + .collect::<Vec<_>>(); + rank_matches(filtered, query) +} + +fn image_content_rect_or_full(shot: &ComputerScreenshot) -> (u32, u32, u32, u32) { + if let Some(rect) = &shot.image_content_rect { + (rect.left, rect.top, rect.width, rect.height) + } else { + (0, 0, shot.image_width, shot.image_height) + } +} + +/// Map a rectangle in **full JPEG pixel space** (top-left origin) to global pointer coordinates. +/// Uses `image_content_rect`: only the inner content area maps linearly to `native_width` × `native_height`.
+pub fn image_box_to_global_match( + shot: &ComputerScreenshot, + text: String, + confidence: f32, + local_left: f64, + local_top: f64, + width: f64, + height: f64, +) -> OcrTextMatch { + let (cl, ct, cw, ch) = image_content_rect_or_full(shot); + let cw = cw as f64; + let ch = ch as f64; + let cl = cl as f64; + let ct = ct as f64; + let rel_x = local_left - cl; + let rel_y = local_top - ct; + let nw = shot.native_width as f64; + let nh = shot.native_height as f64; + let global_left = shot.display_origin_x as f64 + (rel_x / cw.max(1e-9)) * nw; + let global_top = shot.display_origin_y as f64 + (rel_y / ch.max(1e-9)) * nh; + let global_width = (width / cw.max(1e-9)) * nw; + let global_height = (height / ch.max(1e-9)) * nh; + let center_x = global_left + global_width / 2.0; + let center_y = global_top + global_height / 2.0; + OcrTextMatch { + text, + confidence, + center_x, + center_y, + bounds_left: global_left, + bounds_top: global_top, + bounds_width: global_width, + bounds_height: global_height, + } +} + +/// Crop the peek JPEG to a **global native** rectangle intersected with the capture, then rebuild +/// [`ComputerScreenshot`] so OCR and coordinate mapping stay consistent (full frame = content). +/// Unused while OCR uses raw capture only; kept for experiments against cropped JPEG workflows. 
+#[allow(dead_code)] +pub fn crop_shot_to_ocr_region( + shot: ComputerScreenshot, + region: &OcrRegionNative, +) -> BitFunResult { + if region.width == 0 || region.height == 0 { + return Err(BitFunError::tool( + "ocr_region_native width and height must be non-zero.".to_string(), + )); + } + let (cl, ct, cw, ch) = image_content_rect_or_full(&shot); + if cw == 0 || ch == 0 { + return Err(BitFunError::tool( + "Screenshot content rect is empty; cannot crop for OCR.".to_string(), + )); + } + + let ox = shot.display_origin_x as i64; + let oy = shot.display_origin_y as i64; + let nw = shot.native_width as i64; + let nh = shot.native_height as i64; + + let rx0 = region.x0 as i64; + let ry0 = region.y0 as i64; + let rw = region.width as i64; + let rh = region.height as i64; + + let ix0 = rx0.max(ox); + let iy0 = ry0.max(oy); + let ix1 = (rx0 + rw).min(ox + nw); + let iy1 = (ry0 + rh).min(oy + nh); + if ix1 <= ix0 || iy1 <= iy0 { + return Err(BitFunError::tool( + "ocr_region_native does not intersect the captured display. 
Check coordinates (global native pixels).".to_string(), + )); + } + + let jx0 = cl as f64 + ((ix0 - ox) as f64 / nw as f64) * cw as f64; + let jy0 = ct as f64 + ((iy0 - oy) as f64 / nh as f64) * ch as f64; + let jx1 = cl as f64 + ((ix1 - ox) as f64 / nw as f64) * cw as f64; + let jy1 = ct as f64 + ((iy1 - oy) as f64 / nh as f64) * ch as f64; + + let px0 = jx0.floor().max(0.0) as u32; + let py0 = jy0.floor().max(0.0) as u32; + let px1 = jx1.ceil().min(shot.image_width as f64) as u32; + let py1 = jy1.ceil().min(shot.image_height as f64) as u32; + if px1 <= px0 || py1 <= py0 { + return Err(BitFunError::tool( + "OCR crop region is empty after mapping to image pixels.".to_string(), + )); + } + + let dyn_img = image::load_from_memory(&shot.bytes) + .map_err(|e| BitFunError::tool(format!("OCR crop: decode JPEG: {}", e)))?; + let rgb = dyn_img.to_rgb8(); + let img_w = rgb.width(); + let img_h = rgb.height(); + // Clamp to decoded dimensions (must match Vision / mapping; may differ from metadata by 1px). 
+ let px1 = px1.min(img_w); + let py1 = py1.min(img_h); + if px1 <= px0 || py1 <= py0 { + return Err(BitFunError::tool( + "OCR crop region is empty after clamping to decoded JPEG size.".to_string(), + )); + } + let crop_w = px1 - px0; + let crop_h = py1 - py0; + if px0.saturating_add(crop_w) > img_w || py0.saturating_add(crop_h) > img_h { + return Err(BitFunError::tool("OCR crop rectangle is out of image bounds.".to_string())); + } + let cropped_view = image::imageops::crop_imm(&rgb, px0, py0, crop_w, crop_h); + let cropped = cropped_view.to_image(); + + const OCR_CROP_JPEG_QUALITY: u8 = 75; + let mut buf = Vec::new(); + let mut enc = JpegEncoder::new_with_quality(&mut buf, OCR_CROP_JPEG_QUALITY); + enc + .encode( + cropped.as_raw(), + cropped.width(), + cropped.height(), + image::ColorType::Rgb8, + ) + .map_err(|e| BitFunError::tool(format!("OCR crop: encode JPEG: {}", e)))?; + + // Affine mapping must match `image_box_to_global_match`: global = origin + (jpeg_px - content_left) / content_w * native_capture. + // Do **not** use ix0/ix1 (intersection clip) as display_origin/native size — they disagree with floor/ceil JPEG bounds px0..px1. 
+ let cl_f = cl as f64; + let ct_f = ct as f64; + let cw_f = cw as f64; + let ch_f = ch as f64; + let ox_f = shot.display_origin_x as f64; + let oy_f = shot.display_origin_y as f64; + let nw_f = shot.native_width as f64; + let nh_f = shot.native_height as f64; + let native_left = ox_f + (px0 as f64 - cl_f) / cw_f.max(1e-9) * nw_f; + let native_top = oy_f + (py0 as f64 - ct_f) / ch_f.max(1e-9) * nh_f; + let native_right = ox_f + (px1 as f64 - cl_f) / cw_f.max(1e-9) * nw_f; + let native_bottom = oy_f + (py1 as f64 - ct_f) / ch_f.max(1e-9) * nh_f; + let native_w = (native_right - native_left).round().max(1.0) as u32; + let native_h = (native_bottom - native_top).round().max(1.0) as u32; + + Ok(ComputerScreenshot { + bytes: buf, + mime_type: "image/jpeg".to_string(), + image_width: cropped.width(), + image_height: cropped.height(), + native_width: native_w, + native_height: native_h, + display_origin_x: native_left.round() as i32, + display_origin_y: native_top.round() as i32, + vision_scale: shot.vision_scale, + pointer_image_x: None, + pointer_image_y: None, + screenshot_crop_center: None, + point_crop_half_extent_native: None, + navigation_native_rect: None, + quadrant_navigation_click_ready: false, + image_content_rect: Some(ComputerUseImageContentRect { + left: 0, + top: 0, + width: cropped.width(), + height: cropped.height(), + }), + som_labels: vec![], + implicit_confirmation_crop_applied: false, + }) +} + +// --------------------------------------------------------------------------- +// macOS: Vision framework OCR via objc2-vision +// --------------------------------------------------------------------------- +#[cfg(target_os = "macos")] +mod macos { + use super::{ + filter_and_rank, image_box_to_global_match, image_content_rect_or_full, normalize_for_match, + OcrTextMatch, + }; + use bitfun_core::agentic::tools::computer_use_host::ComputerScreenshot; + use bitfun_core::util::errors::{BitFunError, BitFunResult}; + use objc2::msg_send; + use 
objc2::rc::Retained; + use objc2::AnyThread; + use objc2_foundation::{NSArray, NSData, NSDictionary, NSError, NSString}; + use objc2_vision::{ + VNImageOption, VNImageRectForNormalizedRect, VNImageRequestHandler, + VNRecognizeTextRequest, VNRecognizeTextRequestRevision3, VNRecognizedTextObservation, + VNRequest, VNRequestTextRecognitionLevel, + }; + + /// Top-N candidates per observation; Chinese matches often appear below rank 1. + const TOP_CANDIDATES_MAX: usize = 10; + + pub fn find_text_matches( + shot: &ComputerScreenshot, + text_query: &str, + ) -> BitFunResult> { + let (_content_left, _content_top, content_width, content_height) = + image_content_rect_or_full(shot); + if content_width == 0 || content_height == 0 { + return Err(BitFunError::tool( + "Screenshot content rect is empty; cannot run macOS Vision OCR.".to_string(), + )); + } + + let observations = recognize_text_observations(&shot.bytes)?; + let mut raw_matches = Vec::new(); + for obs in &observations { + if let Some(m) = observation_to_match(shot, text_query, obs) { + raw_matches.push(m); + } + } + + let ranked = filter_and_rank(text_query, raw_matches); + if ranked.is_empty() { + return Err(BitFunError::tool(format!( + "No OCR text matched {:?} on screen (macOS Vision found {} text regions total). \ + If the UI is Chinese, try a shorter substring (e.g. one or two characters) or ensure the text is visible in the capture; Vision may mis-read stylized UI.", + text_query, + observations.len() + ))); + } + Ok(ranked) + } + + fn recognize_text_observations( + jpeg_bytes: &[u8], + ) -> BitFunResult>> { + // Create NSData from the raw JPEG bytes. + let ns_data = NSData::with_bytes(jpeg_bytes); + + // Create the text recognition request. + let request = VNRecognizeTextRequest::new(); + // Revision 3: language auto-detection + improved scripts (CJK). 
+ unsafe { + let _: () = msg_send![&*request, setRevision: VNRecognizeTextRequestRevision3]; + } + request.setRecognitionLevel(VNRequestTextRecognitionLevel::Accurate); + request.setUsesLanguageCorrection(true); + request.setAutomaticallyDetectsLanguage(true); + + // Prefer Simplified Chinese, Traditional Chinese, then English (WeChat / mixed UIs). + let zh_hans = NSString::from_str("zh-Hans"); + let zh_hant = NSString::from_str("zh-Hant"); + let en_us = NSString::from_str("en-US"); + let langs = NSArray::from_retained_slice(&[zh_hans, zh_hant, en_us]); + request.setRecognitionLanguages(&langs); + + request.setMinimumTextHeight(0.005); + + // Upcast VNRecognizeTextRequest -> VNImageBasedRequest -> VNRequest + // via Retained::into_super() twice. + let request_as_vn: Retained = + Retained::into_super(Retained::into_super(request.clone())); + + let requests = NSArray::from_retained_slice(&[request_as_vn]); + + // Build VNImageRequestHandler from NSData (JPEG). + let options: Retained> = + NSDictionary::new(); + let handler = VNImageRequestHandler::initWithData_options( + VNImageRequestHandler::alloc(), + &ns_data, + &options, + ); + + // Perform the request synchronously. + handler + .performRequests_error(&requests) + .map_err(ns_error_to_bitfun)?; + + // Collect results. 
+ let results = match request.results() { + Some(arr) => arr, + None => return Ok(Vec::new()), + }; + Ok(results.to_vec()) + } + + fn ns_error_to_bitfun(err: Retained<NSError>) -> BitFunError { + let desc = err.localizedDescription().to_string(); + BitFunError::tool(format!("macOS Vision OCR failed: {}", desc)) + } + + fn observation_to_match( + shot: &ComputerScreenshot, + text_query: &str, + obs: &VNRecognizedTextObservation, + ) -> Option<OcrTextMatch> { + let candidates = obs.topCandidates(TOP_CANDIDATES_MAX as usize); + let n = candidates.len(); + let q_norm = normalize_for_match(text_query); + + let mut chosen_text: Option<String> = None; + let mut chosen_confidence: f32 = 0.0; + + for i in 0..n { + let candidate = unsafe { candidates.objectAtIndex_unchecked(i) }; + let text = candidate.string().to_string(); + if !normalize_for_match(&text).contains(&q_norm) { + continue; + } + let conf = candidate.confidence(); + if chosen_text.is_none() || conf > chosen_confidence { + chosen_text = Some(text); + chosen_confidence = conf; + } + } + + let text = chosen_text?; + + // Vision bounding box is normalized to the **full** image (JPEG), not the content rect. + let bounding = unsafe { obs.boundingBox() }; + let image_rect = unsafe { + VNImageRectForNormalizedRect( + bounding, + shot.image_width as usize, + shot.image_height as usize, + ) + }; + + // image_rect origin is bottom-left in image pixel space; convert to top-left.
+ let local_left = image_rect.origin.x; + let local_top = shot.image_height as f64 - image_rect.origin.y - image_rect.size.height; + let width = image_rect.size.width; + let height = image_rect.size.height; + + Some(image_box_to_global_match( + shot, + text, + chosen_confidence, + local_left, + local_top, + width, + height, + )) + } +} + +// --------------------------------------------------------------------------- +// Windows: Windows.Media.Ocr UWP API +// --------------------------------------------------------------------------- +#[cfg(target_os = "windows")] +mod windows_backend { + use super::{ + filter_and_rank, image_box_to_global_match, image_content_rect_or_full, normalize_for_match, + OcrTextMatch, + }; + use bitfun_core::agentic::tools::computer_use_host::ComputerScreenshot; + use bitfun_core::util::errors::{BitFunError, BitFunResult}; + use windows::core::HSTRING; + use windows::Graphics::Imaging::BitmapDecoder; + use windows::Media::Ocr::{OcrEngine, OcrWord}; + use windows::Storage::Streams::{DataWriter, InMemoryRandomAccessStream}; + use windows::Win32::System::Com::{ + CoIncrementMTAUsage, CoInitializeEx, CoUninitialize, COINIT_APARTMENTTHREADED, + COINIT_DISABLE_OLE1DDE, + }; + + fn w(r: windows::core::Result) -> BitFunResult { + r.map_err(|e| BitFunError::tool(format!("Windows OCR: {}", e))) + } + + pub fn find_text_matches( + shot: &ComputerScreenshot, + text_query: &str, + ) -> BitFunResult> { + let (content_left, content_top, content_width, content_height) = + image_content_rect_or_full(shot); + if content_width == 0 || content_height == 0 { + return Err(BitFunError::tool( + "Screenshot content rect is empty; cannot run Windows OCR.".to_string(), + )); + } + + // Initialize COM apartment for WinRT APIs + // This must run on a thread initialized with COINIT_APARTMENTTHREADED + // Windows.Media.Ocr requires STA thread + let mut co_init = None; + if unsafe { CoIncrementMTAUsage() }.is_err() { + let hr = unsafe { + CoInitializeEx( + None, + 
COINIT_APARTMENTTHREADED | COINIT_DISABLE_OLE1DDE, + ) + }; + if hr.is_err() { + return Err(BitFunError::tool(format!( + "Windows OCR COM initialization failed: {:?}", + hr + ))); + } + co_init = Some(()); + } + + let result = (|| -> BitFunResult> { + // 1. Write JPEG bytes to in-memory stream + let stream = w(InMemoryRandomAccessStream::new())?; + let writer = w(DataWriter::CreateDataWriter(&stream))?; + w(writer.WriteBytes(&shot.bytes))?; + w(w(writer.StoreAsync())?.get())?; + w(w(writer.FlushAsync())?.get())?; + w(writer.DetachStream())?; + + // 2. Decode JPEG to SoftwareBitmap + let decoder = w(w(BitmapDecoder::CreateAsync(&stream))?.get())?; + let software_bitmap = w(w(decoder.GetSoftwareBitmapAsync())?.get())?; + + // 3. Create OCR engine (use user profile languages) + let engine = match OcrEngine::TryCreateFromUserProfileLanguages() { + Ok(e) => e, + Err(_) => { + // Fallback to English if user profile languages fail + let lang = w(windows::Globalization::Language::CreateLanguage(&HSTRING::from( + "en-US", + )))?; + if !w(OcrEngine::IsLanguageSupported(&lang))? { + return Err(BitFunError::tool( + "Windows OCR: No supported language packs installed.".to_string(), + )); + } + w(OcrEngine::TryCreateFromLanguage(&lang))? + } + }; + + // 4. 
Run OCR recognition + let ocr_result = w(w(engine.RecognizeAsync(&software_bitmap))?.get())?; + let lines = w(ocr_result.Lines())?; + let line_count = w(lines.Size())?; + + let mut raw_matches = Vec::new(); + for line in &lines { + let words = w(line.Words())?; + for word in &words { + if let Some(m) = ocr_word_to_match( + shot, + text_query, + &word, + content_left, + content_top, + content_width, + content_height, + ) { + raw_matches.push(m); + } + } + } + + let ranked = filter_and_rank(text_query, raw_matches); + if ranked.is_empty() { + return Err(BitFunError::tool(format!( + "No OCR text matched {:?} on screen (Windows OCR found {} text regions total).", + text_query, + line_count + ))); + } + Ok(ranked) + })(); + + // Uninitialize COM if we initialized it + if co_init.is_some() { + unsafe { CoUninitialize() }; + } + + result + } + + fn ocr_word_to_match( + shot: &ComputerScreenshot, + text_query: &str, + word: &OcrWord, + content_left: u32, + content_top: u32, + _content_width: u32, + _content_height: u32, + ) -> Option { + let text = word.Text().ok()?.to_string(); + + // Pre-filter + if !normalize_for_match(&text).contains(&normalize_for_match(text_query)) { + return None; + } + + // Windows OCR returns bounding rect in pixels, top-left origin, within the image + let rect = word.BoundingRect().ok()?; + let local_left = content_left as f64 + f64::from(rect.X); + let local_top = content_top as f64 + f64::from(rect.Y); + let width = f64::from(rect.Width); + let height = f64::from(rect.Height); + + Some(image_box_to_global_match( + shot, + text, + 0.8, + local_left, + local_top, + width, + height, + )) + } +} + +// --------------------------------------------------------------------------- +// Linux: Tesseract OCR via leptess bindings +// --------------------------------------------------------------------------- +#[cfg(target_os = "linux")] +mod linux_backend { + use super::{ + filter_and_rank, image_box_to_global_match, image_content_rect_or_full, 
normalize_for_match, + OcrTextMatch, + }; + use bitfun_core::agentic::tools::computer_use_host::ComputerScreenshot; + use bitfun_core::util::errors::{BitFunError, BitFunResult}; + use leptess::capi::TessPageIteratorLevel_RIL_WORD; + use leptess::{leptonica, tesseract::TessApi}; + + pub fn find_text_matches( + shot: &ComputerScreenshot, + text_query: &str, + ) -> BitFunResult> { + let (content_left, content_top, content_width, content_height) = + image_content_rect_or_full(shot); + if content_width == 0 || content_height == 0 { + return Err(BitFunError::tool( + "Screenshot content rect is empty; cannot run Linux Tesseract OCR.".to_string(), + )); + } + + // Initialize Tesseract API + // Try system default tessdata path first, then common locations + let mut api = match TessApi::new(None, "eng") { + Ok(api) => api, + Err(_) => { + let paths = [ + "/usr/share/tesseract-ocr/5/tessdata/", + "/usr/share/tesseract-ocr/tessdata/", + "/usr/share/tessdata/", + ]; + let mut api = None; + for path in &paths { + if std::path::Path::new(path).exists() { + if let Ok(a) = TessApi::new(Some(path), "eng") { + api = Some(a); + break; + } + } + } + api.ok_or_else(|| BitFunError::tool( + "Linux OCR: Tesseract initialization failed. Please install tesseract-ocr and tesseract-ocr-eng packages, or ensure TESSDATA_PREFIX is set correctly.".to_string() + ))? 
+ } + }; + + let pix = leptonica::pix_read_mem(&shot.bytes).map_err(|e| { + BitFunError::tool(format!( + "Linux OCR: Failed to decode screenshot image with Leptonica: {}", + e + )) + })?; + + api.set_image(&pix); + if api.recognize() != 0 { + return Err(BitFunError::tool( + "Linux OCR: Tesseract recognition failed.".to_string(), + )); + } + + let boxa = api + .get_component_images(TessPageIteratorLevel_RIL_WORD, true) + .ok_or_else(|| { + BitFunError::tool( + "Linux OCR: Tesseract did not return word regions.".to_string(), + ) + })?; + + let word_region_count = boxa.get_n(); + let mut raw_matches = Vec::new(); + + for b in &boxa { + let g = b.get_geometry(); + if g.w <= 0 || g.h <= 0 { + continue; + } + let x1 = g.x; + let y1 = g.y; + let x2 = g.x + g.w; + let y2 = g.y + g.h; + api.set_rectangle(g.x, g.y, g.w, g.h); + let text = match api.get_utf8_text() { + Ok(t) => t, + Err(_) => continue, + }; + let confidence = api.mean_text_conf() as f32 / 100.0; + if let Some(m) = tesseract_word_to_match( + shot, + text_query, + &text, + confidence, + x1, + y1, + x2, + y2, + content_left, + content_top, + content_width, + content_height, + ) { + raw_matches.push(m); + } + } + + let ranked = filter_and_rank(text_query, raw_matches); + if ranked.is_empty() { + return Err(BitFunError::tool(format!( + "No OCR text matched {:?} on screen (Tesseract found {} word regions total).", + text_query, word_region_count + ))); + } + Ok(ranked) + } + + fn tesseract_word_to_match( + shot: &ComputerScreenshot, + text_query: &str, + text: &str, + confidence: f32, + x1: i32, y1: i32, x2: i32, y2: i32, + content_left: u32, + content_top: u32, + _content_width: u32, + _content_height: u32, + ) -> Option { + // Pre-filter + if !normalize_for_match(text).contains(&normalize_for_match(text_query)) { + return None; + } + + // Tesseract returns bounding box in pixels, top-left origin, within the image + let local_left = content_left as f64 + x1 as f64; + let local_top = content_top as f64 + y1 as f64; 
+ let width = (x2 - x1) as f64; + let height = (y2 - y1) as f64; + + if width <= 0.0 || height <= 0.0 { + return None; + } + + Some(image_box_to_global_match( + shot, + text.to_string(), + confidence, + local_left, + local_top, + width, + height, + )) + } +} diff --git a/src/apps/desktop/src/computer_use/ui_locate_common.rs b/src/apps/desktop/src/computer_use/ui_locate_common.rs index 2e809f5b..e1ccb225 100644 --- a/src/apps/desktop/src/computer_use/ui_locate_common.rs +++ b/src/apps/desktop/src/computer_use/ui_locate_common.rs @@ -173,6 +173,7 @@ pub fn matches_filters( } } +#[allow(dead_code)] // Used by windows_ax_ui / linux_ax_ui (not compiled on macOS) pub fn ok_result( gx: f64, gy: f64, @@ -183,6 +184,27 @@ pub fn ok_result( matched_role: String, matched_title: Option, matched_identifier: Option, +) -> BitFunResult { + ok_result_with_context( + gx, gy, bounds_left, bounds_top, bounds_width, bounds_height, + matched_role, matched_title, matched_identifier, + None, 1, vec![], + ) +} + +pub fn ok_result_with_context( + gx: f64, + gy: f64, + bounds_left: f64, + bounds_top: f64, + bounds_width: f64, + bounds_height: f64, + matched_role: String, + matched_title: Option, + matched_identifier: Option, + parent_context: Option, + total_matches: u32, + other_matches: Vec, ) -> BitFunResult { let (nx, ny) = global_to_native_center(gx, gy)?; let (nminx, nminy, nmaxx, nmaxy) = if bounds_width > 0.0 && bounds_height > 0.0 { @@ -206,5 +228,18 @@ pub fn ok_result( matched_role, matched_title, matched_identifier, + parent_context, + total_matches, + other_matches, }) } + +/// Whether an element's global bounds fall within any visible display. 
+pub fn is_element_on_screen(gx: f64, gy: f64, width: f64, height: f64) -> bool { + // Element must have reasonable size (not a giant container) + if width > 3000.0 || height > 2000.0 { + return false; + } + // Center must be resolvable to a display + DisplayInfo::from_point(gx.round() as i32, gy.round() as i32).is_ok() +} diff --git a/src/crates/core/src/agentic/agents/claw_mode.rs b/src/crates/core/src/agentic/agents/claw_mode.rs index 24e453e5..c50dd177 100644 --- a/src/crates/core/src/agentic/agents/claw_mode.rs +++ b/src/crates/core/src/agentic/agents/claw_mode.rs @@ -27,12 +27,9 @@ impl ClawMode { "SessionMessage".to_string(), "SessionHistory".to_string(), "Cron".to_string(), + // All desktop automation consolidated into ComputerUse + // (click_element, click, mouse_move, scroll, drag, screenshot, locate, etc.) "ComputerUse".to_string(), - // Split computer-use tools must be allowlisted here; otherwise the pipeline rejects them - // ("Tool 'ComputerUseMousePrecise' is not in the allowed list") and the model falls back to ComputerUse-only + vision. - "ComputerUseMousePrecise".to_string(), - "ComputerUseMouseStep".to_string(), - "ComputerUseMouseClick".to_string(), ], } } diff --git a/src/crates/core/src/agentic/agents/prompts/claw_mode.md b/src/crates/core/src/agentic/agents/prompts/claw_mode.md index 9efa4144..4dbc7040 100644 --- a/src/crates/core/src/agentic/agents/prompts/claw_mode.md +++ b/src/crates/core/src/agentic/agents/prompts/claw_mode.md @@ -11,7 +11,7 @@ Narrate only when it helps: multi-step work, complex/challenging problems, sensi Keep narration brief and value-dense; avoid repeating obvious steps. Use plain human language for narration unless in a technical context. When a first-class tool exists for an action, use the tool directly instead of asking the user to run equivalent CLI commands. 
-**Computer use (desktop automation):** If the user’s request needs **more than one** Computer use tool call (or spans **multiple apps/windows**), first state a **short numbered plan** in plain language: **(a)** whether **`Bash`** / **`TerminalControl`** applies (e.g. macOS `open -a "AppName"` to launch/focus an app), **(b)** **`ComputerUse`** **`action: locate`** for **named** UI targets before pointer moves, **(c)** target **application/window**, **(d)** how you will **verify** focus — **prefer** **`computer_use_context`** from tool results; use **`screenshot`** when you need **pixels** for the next step or when the **host** requires a fresh capture (see **Screenshot cadence** below). Then execute **step-by-step** — this overrides “silent tools” for that automation block only. +**Computer use (desktop automation):** If the user's request needs **more than one** ComputerUse call (or spans **multiple apps/windows**), first state a **short numbered plan**: (a) whether `Bash` applies (e.g. `open -a "AppName"`), (b) which `click_element` / `move_to_text` / `locate` calls to try, (c) target app/window, (d) how you will verify focus. Then execute step-by-step. # Session Coordination For complex coding tasks or office-style multi-step tasks, prefer multi-session coordination over doing everything in the current session. @@ -40,42 +40,90 @@ Prioritize safety and human oversight over completion; if instructions conflict, Do not manipulate or persuade anyone to expand access or disable safeguards. Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested. # Computer use (BitFun desktop, when enabled) -**What “computer use” means here:** **desktop automation for the user’s task**, not only tools whose names start with `ComputerUse`. 
When the step can be done from the **workspace terminal** (scripts, builds, tests, git, CLIs, macOS `open`/`osascript` where appropriate), use **`Bash`** / **`TerminalControl`** **before** driving the GUI. Do **not** skip the terminal and jump straight to screenshots if a shell command would accomplish the same step. - -**Tool list order (matches the API):** After **`Task`**, **`Bash`**, **`TerminalControl`**, file tools, then **`ComputerUse`** (screenshot + chords + **`locate`**), then **`ComputerUseMousePrecise` / `ComputerUseMouseStep` / `ComputerUseMouseClick`** — **within `ComputerUse`**, prefer **`action: locate`** before ruler-only **`action: screenshot`** when a **named** control can be matched; do **not** open with full-screen **`screenshot`** when **`locate`** can name the target. - -When **Computer use** is enabled, you have **`ComputerUse`** (`action` **`screenshot`** | **`locate`** | **`key_chord`** | **`type_text`** | **`pointer_move_rel`** | **`wait`**) and separate mouse tools **`ComputerUseMousePrecise`**, **`ComputerUseMouseStep`**, **`ComputerUseMouseClick`**. Do **not** treat “computer use” as “only `screenshot` + vision”. - -- **Screenshot cadence (align with BitFun host — avoid spam):** The **desktop host** enforces a **fresh `screenshot`** mainly for **two** cases: **`ComputerUseMouseClick` (`action`: click)** (needs a **fine** view: quadrant drill terminal or point crop — not full-screen-only), and **`key_chord` that includes Return / Enter** when the outcome matters. **Do not** treat `screenshot` as a **heartbeat** after every other action. **Do not** call `screenshot` simply because you just ran **`action: locate`** (JSON-only), **`key_chord` without Enter**, **`type_text`**, or **`wait`** — unless you **need** a JPEG for the next **vision** step (aiming, reading dense UI) or **`computer_use_context`** is missing/ambiguous. 
After **`locate`** success, **prefer** moving with **`coordinate_hints.mouse_precise_screen`** (global coords) or **`ComputerUseMousePrecise`** without an extra full-frame `screenshot` when the next step is not yet a host-guarded click/Enter. **Use** `screenshot` when you need to **see** pixels or before **click / Enter** per the host rules below. - -- **`action: locate` — how to aim filters (not OCR):** `locate` searches the **accessibility tree** (titles, roles, identifiers on the **foreground** window). It does **not** read pixels like OCR; labels drawn only in the bitmap, heavily custom UIs, or some list rows may **never** appear in AX. **Prefer** substrings that match what the app likely exposes — often a **shorter or distinctive fragment** and the **same language as the UI** (do not assume the user’s chat text matches **AXTitle** verbatim). **Filter combination:** by default, non-empty fields are combined with **AND** (same element must satisfy all). Many inputs (e.g. WeChat search) have **`AXTextField`** but **no** `AXTitle` containing “搜索” — then **`title_contains` + `role_substring` together will fail**. Use **`filter_combine`: `any`** so **role OR title** can match, or send **only one** of `title_contains` / `role_substring` / `identifier_contains` on the first try. If a call returns no match, **change the query** before retrying; avoid sending the **same** filters repeatedly. When AX probably will not contain the label (chat bubbles, owner-drawn text, dense feeds), **switch early** to **`screenshot`** and the vision / quadrant path — that is a normal fallback, not a failure to “use locate first” where AX has nothing to match. - -- **macOS — launch or foreground an app:** Prefer **`Bash`** with `open -a "AppName"` (e.g. `open -a WeChat`) instead of Spotlight (Command+Space + `type_text` + Return) when you only need to **start or bring forward** the app — **fewer steps** and fewer **Return/Enter** screenshot-guard failures. 
Reserve Spotlight for when `open -a` is wrong or the user asked for Spotlight. -- **Named rows / chats / list items (WeChat, Slack, Mail, etc.):** **Forbidden:** aiming at a **conversation row** or **named button** using **only** full-screen **`screenshot`** ruler coordinates + **`pointer_move_rel`** / **`ComputerUseMousePrecise`** **before** trying **`ComputerUse`** **`action: locate`**. **Required:** call **`ComputerUse`** with **`action: locate`** and **`title_contains`** / **`role_substring`** matching the **on-screen label** (same language as the app UI, e.g. «Bob») to get **`global_center_*`** / **`coordinate_hints`**, then move or point-crop. If locate fails after refining filters, **then** use vision (quadrant drill / crop). -- **Plan → execute → verify (multi-step):** Before the **first** tool call of a desktop task that needs several steps, output a **numbered plan** (target app/window, verification checkpoints, tool order). Execute **one logical step at a time**; after a step that may **change focus** (app switch, new window, dialog), **prefer** **`computer_use_context`** / **`wait`**; **add `screenshot`** only when you must **see** the new layout or before a **click / Enter** per host rules. Do **not** “stream” many unrelated actions while the foreground app might still be wrong. -- **Foreground safety (`computer_use_context`):** Tool results include **`computer_use_context`** when available: **`foreground_application`** (which app is frontmost), **`pointer_global`** (cursor), **`input_coordinates`** (this call). Treat **`foreground_application`** as ground truth. **`ComputerUse`** **`action: locate`** searches the **foreground** app only — if the wrong app is focused, locate and clicks hit the **wrong** window. If **`foreground_application`** (or the latest **`screenshot`**) does **not** match the **intended** target app, **stop** the current sequence: switch focus first (e.g. 
**`screenshot`** to see the dock/taskbar, **`ComputerUseMousePrecise`** + **`ComputerUseMouseClick`** on the correct icon, or **`key_chord`** for app switch / window cycle **on this host**), then **`wait`** / **`screenshot`** until the **correct** app is frontmost, then continue. Never assume BitFun, the terminal, or a previous app is still focused. -- **Re-plan on failure:** If a tool **errors**, **`locate`** finds **no** match, the last **`screenshot`** shows **unexpected** UI, or **`foreground_application`** is wrong: **do not** keep executing the old plan. **Re-read** **`computer_use_context`**; **add a `screenshot`** only when you need new pixels to revise the plan (fix focus, tighten locate filters, switch to vision/quadrant drill, or **ask the user**). Only retry after the plan is updated — do not stack pointer/text/chord actions on the wrong app. -- **Automation priority (strict order — try higher items before lower):** (1) **Terminal — `Bash` / `TerminalControl`** — anything achievable via **shell in the workspace** (build, test, git, scripts, CLIs). (2) **System shortcuts — `key_chord`** — **OS-wide** actions and **system clipboard** (see Environment Information for modifiers on **this** host). (3) **Application shortcuts — `key_chord`** — **in-app** shortcuts when the correct app/window is focused. (4) **`ComputerUse`** **`action: locate`** — **native accessibility tree** (macOS AX / Windows UIA / Linux AT-SPI) on the **foreground** window: **`title_contains`**, **`role_substring`**, **`identifier_contains`**, centers and **`global_bounds_*` / `native_bounds_*`**. **When locate matches:** you may **move** with **`coordinate_hints`** **without** an immediate full-frame **`screenshot`**; call **`screenshot`** with **`screenshot_crop_center_*`** / **`screenshot_crop_half_extent_native`** from **`coordinate_hints.screenshot_point_crop`** **only when** you need a **JPEG** for vision (e.g. quadrant drill toward a click) — **not** after every successful locate. 
If **`locate` finds nothing**, continue to (5). (5) **Vision — `ComputerUse`** **`action: screenshot`** + **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick`** — only when (1)–(4) cannot complete the step. **Between shortcuts and accessibility locate:** use **`type_text`** only for short or paste-blocked input **after** focus is correct; prefer **`key_chord`** paste when possible. -- **Quadrant drill vs locate:** The **default quadrant-drill + JPEG** workflow below is for **vision-based aiming**. It is **not** a substitute for **`ComputerUse`** **`action: locate`** when you can filter by **name/role** (e.g. “open chat with 尉怡青” → try **`locate`** with **`title_contains`** matching the contact name in **the app’s UI language** before guessing coordinates from a screenshot). -- **Default path before any `ComputerUseMouseClick` (`action`: click) when using the vision path (unless a shortcut replaces it):** After the **first** full-frame `screenshot`, **you must narrow the view with quadrant drill** — each narrowing step is **`action: screenshot`** **plus** **`screenshot_navigate_quadrant`** (`top_left` / `top_right` / `bottom_left` / `bottom_right`). Repeat **one quadrant per call** until the tool JSON shows **`quadrant_navigation_click_ready`: true**, then **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick` (`action`: click)**. **Do not skip straight to point crop** (`screenshot_crop_center_*`) from a full-screen shot unless: the click target already fills a large fraction of the frame, quadrant drill is clearly wrong for the UI (e.g. you must jump to a known margin coordinate), or the user explicitly asked for a crop at native x/y. -- **Quadrant drill is never automatic:** The host **does not** split the screen unless **you** pass `screenshot_navigate_quadrant` on that `screenshot` call. 
A plain `screenshot` with **no** `screenshot_navigate_quadrant` only **refreshes** the full display (or the current drill region). **If you never set `screenshot_navigate_quadrant`, you will stay on a wide view and models often mis-click** — follow the default path above. -- **No automatic desktop images:** BitFun does **not** inject extra screenshot messages or attach follow-up JPEGs after other ComputerUse actions. Call **`screenshot`** when you **need** pixels for the next decision: full frame, **`screenshot_navigate_quadrant`** (four-way drill — see tool schema), **`screenshot_reset_navigation`**, or point crop via `screenshot_crop_center_x` / `screenshot_crop_center_y` (**full-display native** pixels). **Do not** refresh the full display **habitually** after `locate` / `key_chord` / `type_text` if the **Host-enforced screenshot** rules above do not yet apply. If **`screenshot_navigate_quadrant`** is set, **`screenshot_crop_center_*` are ignored** in that same call (avoid sending both; send **only** fields that apply to the current `action`). -- **Host OS and shortcuts:** Before `key_chord`, read **Environment Information** below (Operating System line and the Computer use bullet there). Use modifier names that match **that** host only — do not mix OS conventions (e.g. do not use Windows-style shortcuts when the host is macOS). -- **Shortcut-first (required, after terminal when applicable):** If the step is **not** better done via **`Bash`** / **`TerminalControl`**, then when a **standard OS or in-app shortcut** or **clipboard chord** does the same job as a planned pointer path, you **must choose `key_chord`** — do **not** open Edit menus to click Copy/Paste when **`key_chord`** can do it; do **not** re-type long text with **`type_text`** when **Select all + Copy** or **Paste** achieves the goal. Same for New/Open/Save, Undo/Redo, Find, tab/window close or switch, Quit, Refresh, focus address bar, etc. 
Reserve **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + crop screenshots + **`ComputerUseMouseClick` (`action`: click)** for when **no** reliable shortcut exists, the control is pointer-only, or after a shortcut clearly failed (then **`screenshot`** and try another approach). Menus in the JPEG often display shortcuts — use them. -- **When to verify with pixels:** If the **next** step is a **host-guarded** **`ComputerUseMouseClick` (click)** or **`key_chord` with Return/Enter**, follow the **Strict rule** below — that is when a **fresh `screenshot`** is **required**. For other steps (`key_chord` without Enter, `type_text`, `locate`, wheel), **prefer** `computer_use_context` and logical continuation; **add `screenshot`** only if you **cannot** reason about the next action (unknown dialog, wrong app suspected, or you need a JPEG to aim). **Do not** insert a full-frame `screenshot` between every pair of non-click actions. -- **Strict rule — no blind Enter, no blind click:** Before **`ComputerUseMouseClick` (`action`: click)**, you **must** have a **fine** screenshot after the pointer is aligned: **`quadrant_navigation_click_ready`: true** (preferred: **`screenshot` + `screenshot_navigate_quadrant`** each step until the tool JSON says so) **or** a **point-crop `screenshot`** (~500×500 via `screenshot_crop_center_*`) when the exceptions above apply. A **full-screen-only** frame alone does **not** authorize **`ComputerUseMouseClick` (click)**. Before **`key_chord` that includes Return or Enter**, you **must** call **`screenshot` first** and **visually confirm** focus and target. The only exception is when the user explicitly asks for an unverified / blind step. -- For sending messages, payments, destructive actions, or anything sensitive, state the exact steps first and obtain clear user confirmation in chat before executing. 
-- If Computer use is disabled or OS permissions are missing, tell the user what to enable in BitFun settings / system privacy instead of claiming success. -- Screenshot results require the session primary model to use Anthropic or OpenAI-compatible API format so the image is attached to the tool result for vision. The JPEG matches **native display resolution** (no downscale): `coordinate_mode` `"image"` uses the same pixel grid as the bitmap. -- **Host-enforced screenshot (two cases):** The desktop host **rejects `ComputerUseMouseClick` (click)** until the last `screenshot` after the last pointer move is a **valid fine basis**: **`quadrant_navigation_click_ready`: true** (quadrant drill until the region’s longest side is below the host threshold) **or** a **fresh point-crop** (`screenshot_crop_center_*`, ~500×500). **Full-screen-only** is **not** enough. It **rejects `key_chord` that includes Return or Enter** until a **fresh `screenshot`** since the last pointer move or click. **`ComputerUseMousePrecise`** may use **`coordinate_mode` `\"image\"`** on any prior **`screenshot`**. Still **prefer `key_chord`** when it matches the step. -- **Rulers vs zoom:** Full-frame JPEGs have **margin rulers** and a **grid** — use them to orient. For small controls, **default to quadrant drill** (`screenshot_navigate_quadrant` on each `screenshot` step); use **point crop** only as a **secondary** option (see default path above). Each quadrant step **adds padding on every side** (clamped) so controls on split lines stay in the JPEG. **Do not** rely only on huge full-display images when a smaller view answers the question. -- **Click guard:** The host **rejects `ComputerUseMouseClick` (click)** if there was **`ComputerUseMousePrecise` / `ComputerUseMouseStep` / `pointer_move_rel` or a previous click** since the last `screenshot`, or if the last `screenshot` was **full-screen only** without **`quadrant_navigation_click_ready`**. 
**`screenshot`** before **Return/Enter** in **`key_chord`** when the outcome matters. -- **`ComputerUseMouseStep` / `pointer_move_rel` on macOS:** Deltas are in **screenshot/display pixels**; the host converts using the **last** **`screenshot`**’s scale — take **`screenshot`** first or moves may be wrong. **`ComputerUseMouseStep`** uses **`direction`** (`up` / `down` / `left` / `right`) and optional **`pixels`** (default 32, use smaller values e.g. 8–24 for fine alignment). **Small moves:** prefer **`ComputerUseMouseStep`** over guessing tiny absolute **`ComputerUseMousePrecise` `x`/`y`** — vision models are usually more reliable that way. -- **Where is the pointer?** Only the latest `screenshot` tells you: **`pointer_image_x` / `pointer_image_y`** (tip in **this** JPEG for `coordinate_mode` `"image"`) and the **synthetic red cursor with gray border** in the image (**tip** = hotspot). Read **`pointer_marker`** in the tool JSON. If those coordinates are **null** and there is **no** overlay, the cursor is **not** on this capture — do not infer position from the image; use **`use_screen_coordinates`** with global coords or move the pointer onto this display. After any **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** / `pointer_move_rel`, the old screenshot is **stale** until you `screenshot` again. -- After `screenshot`, when the pointer is on this display, the JPEG includes that **red cursor overlay** and the JSON fields above. **`ComputerUseMousePrecise` only moves** to absolute coords (on macOS uses sub-point Quartz for accuracy). **`ComputerUseMouseClick` (`action`: click)** only clicks at the current pointer (no coordinates); **`action`: wheel** scrolls the wheel at the pointer. **Default:** **`screenshot` + `screenshot_navigate_quadrant`** (repeat) until **`quadrant_navigation_click_ready`**, then align the **red tip** with **`ComputerUseMouseStep`** / **`ComputerUseMousePrecise`** on that JPEG and **`ComputerUseMouseClick` (click)**. 
For **small** alignment fixes, **`ComputerUseMouseStep`** beats tiny absolute coords; reserve **`ComputerUseMousePrecise`** for **larger** jumps. **Fallback:** point-crop `screenshot` when the default path does not fit. Do not aim using only the OS cursor or guesswork. If tool JSON includes **`recommended_next_for_click_targeting`**, follow it. -- **Default pointer loop:** (1) `screenshot` (full or after **`screenshot_reset_navigation`**) then **required quadrant drill** until **`quadrant_navigation_click_ready`** (unless a justified point crop); (2) **`ComputerUseMouseStep`** / `pointer_move_rel` for **small** nudges, **`ComputerUseMousePrecise`** when you need a **big** reposition; repeat until the **red cursor tip** is on the target; (3) **`screenshot` again** after any pointer move; (4) repeat if needed; (5) only then **`ComputerUseMouseClick` (click)** when the last screenshot is **fine** (quadrant terminal or point crop). If the pointer is off the captured display (no red overlay), use **`ComputerUseMousePrecise`** to bring it onto the screen, then continue. Re-screenshot after major UI changes. -- **Shortcut + verify:** If a **`key_chord`** / **`type_text`** clearly **failed** (error tool result, or a **later** step shows wrong window), **then** use **`screenshot`** or **`computer_use_context`** to recover — not a blanket screenshot after every chord. Follow **`hierarchical_navigation.shortcut_policy`** in each `screenshot` result together with this section. -- On macOS, development builds need Accessibility for the actual debug binary (path is in the error message if input is blocked). +Everything is in one tool: **`ComputerUse`** with these actions: `click_element`, `click_label`, `move_to_text`, `click`, `mouse_move`, `scroll`, `drag`, `screenshot`, `locate`, `key_chord`, `type_text`, `pointer_move_rel`, `wait`. 
+ +## Automation priority (try higher first) +**Targeting rule:** Prefer **non-screenshot** targeting before any workflow that depends on **new** screenshots for pointing. **`screenshot` + quadrant / crop + `mouse_move` + `click` is the lowest-priority targeting path** — use only when AX, OCR, and (if already available) SoM labels are insufficient. + +1. **`Bash` / `TerminalControl`** -- shell commands, scripts, `open -a "App"` on macOS to launch/focus apps. +2. **`key_chord`** -- OS and app keyboard shortcuts, clipboard (copy/cut/paste). Prefer over mouse when a shortcut exists. **No** mandatory screenshot before non-Enter chords (see Screenshot policy). +3. **`click_element`** -- accessibility (AX/UIA/AT-SPI): locate + move + click in one call. **Bypasses screenshot guard.** Use when filters can match the control. +4. **`move_to_text`** (OCR) -- match **visible on-screen text** and **move the pointer** to it (no click, no keys). **Does not require a prior model-driven `screenshot` for targeting** (host captures internally). Use **`click`** in a separate step if you need a mouse press. Use **before** `screenshot` drill or **`mouse_move` + `click`** whenever distinctive text is visible in the **same language as the UI**. Prefer this over SoM/vision when you have not yet taken a screenshot or when labels are missing. +5. **`click_label`** -- if a **previous** `screenshot` already returned numbered Set-of-Mark labels, click by number. **Requires** that screenshot step first; still **prefer `move_to_text` over starting a long screenshot-only drill** when readable text is enough. +6. **`locate`** -- find an element without clicking (JSON + coordinates). No screenshot required for the lookup itself. +7. **`screenshot`** (confirm UI / SoM / drill only) + **`mouse_move`** (**`use_screen_coordinates`: true**, globals from **`locate`** / **`move_to_text`** / tool JSON) + **`click`** -- **last resort** when AX/OCR/SoM are insufficient. 
**Never** derive `mouse_move` targets from JPEG pixels. **`click`** still needs a valid host basis (host). +8. **`mouse_move`**, **`scroll`**, **`drag`**, **`type_text`**, **`pointer_move_rel`**, **`ComputerUseMouseStep`**, **`wait`** -- manipulate without mandatory pre-screenshot (see Screenshot policy; host may still require refresh before a later **`click`** or Enter **`key_chord`**). **`mouse_move` / `drag`:** globals only (`use_screen_coordinates`: true). **`pointer_move_rel` / `ComputerUseMouseStep`:** the **desktop host refuses** these as the **next** action after **`screenshot`** — reposition with **`move_to_text`**, **`mouse_move`**, **`click_element`**, or **`click_label`** first (do not nudge from the JPEG). + +## `click_element` (preferred for most accessibility-backed clicks) +Use `click_element` when the target has a known accessible title or role. It locates the element via AX tree, moves the pointer to its center, and clicks -- all in one call. No screenshot or quadrant drill needed. Supports `button` (left/right/middle) and `num_clicks` (1/2/3 for single/double/triple click). + +**Filter tips:** Use `title_contains` and/or `role_substring` in the **same language as the app UI**. Use `filter_combine: "any"` when fields might not overlap (e.g. text fields with no title). If no match, refine the query or fall back to SoM / OCR / vision path. Prefer short, distinctive substrings. If a call returns no match, **change the query** before retrying. + +**When `click_element` won't work:** Chat apps (e.g. WeChat), Electron/web views, owner-drawn controls, and minimal AX trees often omit or misname roles/titles (your filter may not match even when the control is visible). **Do not** repeat the same `title_contains`/`role_substring` more than twice — switch to **`move_to_text`** on visible chrome (tabs, buttons, search hints) or screenshot + `click_label` / quadrant workflow. That is expected, not a bug. 
+ +## Screenshot policy (host-enforced) +**Mandatory fresh screenshot / valid fine-capture basis applies only to:** +- **`click`** (at current pointer — **`click` never accepts x/y**) — the host may require a **fine** capture basis (point crop, quadrant terminal, or full-frame per host rules); use point crop or quadrant drill until `quadrant_navigation_click_ready` when needed, **or** use `click_element` / `click_label` / `move_to_text` instead of guessing pixels. +- **`key_chord` that includes `return` or `enter` / `kp_enter`** — requires a fresh screenshot since the last pointer-changing action (host). + +**Not** subject to “must screenshot first” by themselves: `mouse_move`, `scroll`, `drag`, `type_text`, `locate`, `wait`, `pointer_move_rel`, `key_chord` **without** Enter/Return, and **`move_to_text`** / **`click_element`** / **`click_label`** (they bypass the click guard or do not use it). + +**Cadence:** Take **`screenshot`** when you need **visual confirmation**, SoM labels, or the host requires a fresh capture before **`click`** / Enter. When confirmation is required, the host applies **~500×500** around the mouse or text caret (including during quadrant drill) unless you force full-frame with **`screenshot_reset_navigation`**. Do **not** add extra screenshots before ordinary moves, typing, or non-Enter shortcuts “just in case.” + +## Screenshot path (lowest targeting tier) +After **`click_element`** and **`move_to_text`** are exhausted or inappropriate, use **`screenshot`** for **confirmation** and SoM — not for inventing move coordinates. + +When you **do** take a `screenshot`, inspect JSON: +- If `som_labels` is present, **`click_label`** is preferred. +- **Do not** read pixel coordinates off the JPEG for **`mouse_move`** — use **`locate`**, **`move_to_text`**, or globals from tool results with **`use_screen_coordinates`: true**. 
+ +## `move_to_text` (OCR — high priority, not a last resort) +Use **`move_to_text`** when visible text identifies the target and AX is weak or unknown. It **only moves the cursor**; add **`click`** afterward if you need a press. **Call it before** chaining multiple `screenshot` + quadrant steps when a short substring would suffice. + +Pass a substring in the **same language as the UI**. If multiple matches, refine `text_query`. + +**vs globals:** Prefer **`move_to_text`** (then **`click`** if needed) over **`mouse_move` + `click`** when text is visible. **`mouse_move`** must use **`use_screen_coordinates`: true** with numbers from **`locate`** / **`move_to_text`** / **`pointer_global`** — never JPEG guesses. + +## Vision / drill path (last resort) +When `click_element`, **`move_to_text`**, and (if applicable) `click_label` cannot complete the step: +1. `screenshot` (confirm state; host may return ~500×500 when a guarded action is pending) +2. optional `screenshot_navigate_quadrant` or `screenshot_crop_center_*` until `quadrant_navigation_click_ready` or a tight crop +3. **`mouse_move`** with **`use_screen_coordinates`: true** (globals from **`locate`** or prior tool JSON) / `pointer_move_rel` as needed +4. `screenshot` if the host requires an updated basis after large pointer moves (for the next **`click`**) +5. `click` + +**Quadrant drill is never automatic** unless you pass `screenshot_navigate_quadrant` on `screenshot`. + +## Think before you act (Chain-of-Thought) +Before **every** ComputerUse action, briefly state in your response: +1. **See:** What you observe on the current screen (or from the last screenshot/tool result). +2. **Plan:** What you intend to do and why. +3. **Expect:** What the expected result should be (e.g. "button changes color", "new dialog appears", "text field gains focus"). + +After the action, compare the actual result against your expectation. If they differ, pause and reassess before continuing. 
This prevents blind repetition and helps catch errors early. + +## Loop detection and recovery +The system automatically tracks your action history. If `loop_warning` appears in a tool result: +- **Stop the current approach immediately.** Do not repeat the same action sequence. +- **Read the suggestion** in the `loop_warning` field and follow it. +- **Try a different strategy:** switch from vision to accessibility (`click_element`) or OCR (`move_to_text`), from mouse to keyboard shortcuts, or vice versa. +- **If stuck after trying alternatives:** explain what you attempted and ask the user for guidance rather than continuing to loop. + +## Key rules +- **macOS apps:** Use `open -a "AppName"` via Bash to launch/focus, not Spotlight. +- **Foreground safety:** Check `computer_use_context.foreground_application` -- if wrong app is focused, fix focus first. `locate` and `click_element` search the **foreground** app only. +- **Targeting order:** `click_element` → **`move_to_text`** (when text is visible) → **`click_label`** if SoM is already on a screenshot → **screenshot** drill / crop + **`mouse_move`** + **`click`** last. +- **Screenshot cadence:** Only when you need pixels, SoM, or a **fine** basis before guarded **`click`**; and always immediately before **`key_chord`** with Enter/Return (host). **Do not** treat `screenshot` as the default next step after every non-click action. +- **No blind Enter:** Fresh `screenshot` required before `key_chord` with Return/Enter only (not before other chords). +- **Shortcut-first:** Use `key_chord` for Copy/Paste/Save/Undo etc. Do not click menus when shortcuts exist. Menus in screenshots often display shortcuts -- use them. +- **Re-plan on failure:** If `locate`/`click_element` misses or screenshot shows unexpected UI, stop and reassess. Do not retry the same approach more than twice. +- **Sensitive actions:** For messages, payments, or destructive actions, state steps and get user confirmation first. 
+- **Pointer info:** After `screenshot`, `pointer_image_x/y` and the red synthetic cursor show pointer position. Optional follow-up `screenshot` after large pointer moves if you need pixels before a guarded **`click`**. +- **Screenshot layout:** JPEGs are for **confirmation** (optional pointer + SoM). **Do not** use JPEG pixel indices for **`mouse_move`** — the host disables image/normalized moves; use **global** coordinates only. +- **Multi-step plans:** For tasks spanning multiple apps/steps, output a numbered plan before starting. +- **Host OS:** Use modifier names matching this host (see Environment Information). Do not mix OS conventions. +- On macOS, development builds need Accessibility permission for the debug binary. +- If Computer use is disabled or OS permissions are missing, tell the user what to enable. {CLAW_WORKSPACE} {ENV_INFO} @@ -83,4 +131,4 @@ When **Computer use** is enabled, you have **`ComputerUse`** (`action` **`screen {AGENT_MEMORY} {RULES} {MEMORIES} -{PROJECT_CONTEXT_FILES:exclude=review} \ No newline at end of file +{PROJECT_CONTEXT_FILES:exclude=review} diff --git a/src/crates/core/src/agentic/execution/execution_engine.rs b/src/crates/core/src/agentic/execution/execution_engine.rs index 14ef2831..46e20461 100644 --- a/src/crates/core/src/agentic/execution/execution_engine.rs +++ b/src/crates/core/src/agentic/execution/execution_engine.rs @@ -1264,9 +1264,6 @@ impl ExecutionEngine { ("Log", 14), ("MermaidInteractive", 15), ("ComputerUse", 16), - ("ComputerUseMousePrecise", 17), - ("ComputerUseMouseStep", 18), - ("ComputerUseMouseClick", 19), ] .into_iter() .map(|(k, v)| (k.to_string(), v)) diff --git a/src/crates/core/src/agentic/tools/computer_use_host.rs b/src/crates/core/src/agentic/tools/computer_use_host.rs index 648f8c9c..4c90516f 100644 --- a/src/crates/core/src/agentic/tools/computer_use_host.rs +++ b/src/crates/core/src/agentic/tools/computer_use_host.rs @@ -1,10 +1,12 @@ //! 
Host abstraction for desktop automation (implemented in `bitfun-desktop`). +// Re-export optimizer types so downstream crates can import from computer_use_host. +pub use crate::agentic::tools::computer_use_optimizer::{ActionRecord, LoopDetectionResult}; use crate::util::errors::{BitFunError, BitFunResult}; use async_trait::async_trait; use serde::{Deserialize, Serialize}; -/// Center of a **point crop** in **full-display native capture pixels** (same origin as ruler indices on a full-screen computer-use shot). +/// Center of a **point crop** in **full-display native capture pixels** (same origin as full-screen computer-use JPEG pixels). #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] pub struct ScreenshotCropCenter { pub x: u32, @@ -30,6 +32,16 @@ pub enum ComputerUseNavigateQuadrant { BottomRight, } +/// Center for host-applied **implicit** 500×500 confirmation crops (when a fresh screenshot is required). +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ComputerUseImplicitScreenshotCenter { + #[default] + Mouse, + /// Best-effort focused text field / insertion area (macOS AX); other platforms fall back to mouse. + TextCaret, +} + /// Parameters for [`ComputerUseHost::screenshot_display`]. #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] pub struct ComputerUseScreenshotParams { @@ -39,6 +51,8 @@ pub struct ComputerUseScreenshotParams { pub reset_navigation: bool, /// Half-size of the point crop in **native** pixels (total width/height ≈ `2 * half`). `None` → [`COMPUTER_USE_POINT_CROP_HALF_DEFAULT`]. pub point_crop_half_extent_native: Option<u32>, + /// For `action: screenshot`: when the host applies an implicit 500×500 crop, use mouse vs text-focus center (see desktop host). + pub implicit_confirmation_center: Option<ComputerUseImplicitScreenshotCenter>, } /// Longest side of the navigation region must be **strictly below** this to allow `click` without a separate point crop (desktop).
@@ -109,7 +123,7 @@ pub struct ComputerUseSessionSnapshot { pub pointer_global: Option, } -/// Pixel rectangle of the **screen capture** inside the JPEG (excludes white margin and rulers). +/// Pixel rectangle of the **screen capture** in JPEG image coordinates (offset is zero when there is no frame padding). #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct ComputerUseImageContentRect { pub left: u32, @@ -153,15 +167,46 @@ pub struct ComputerScreenshot { /// When true (desktop), `click` is allowed on this frame without an extra ~500×500 point crop — region is small enough for pointer positioning + `click`. #[serde(default, skip_serializing_if = "is_false")] pub quadrant_navigation_click_ready: bool, - /// Screen pixels inside the JPEG (below/left of white margin); `ComputerUseMousePrecise` maps this rect to the display. + /// Screen capture rectangle in JPEG pixel coordinates (offset zero when there is no frame padding); `ComputerUseMousePrecise` maps this rect to the display. #[serde(default, skip_serializing_if = "Option::is_none")] pub image_content_rect: Option, + /// Set-of-Mark labels: numbered interactive elements overlaid on the screenshot. + /// When non-empty, the model can use `click_label` with a label number instead of coordinates. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub som_labels: Vec, + /// Desktop: this JPEG was produced by implicit 500×500 confirmation crop (mouse or text focus center). + #[serde(default, skip_serializing_if = "is_false")] + pub implicit_confirmation_crop_applied: bool, } fn is_false(b: &bool) -> bool { !*b } +/// Optional **global native** rectangle (same space as pointer / `display_origin` + capture) to limit +/// OCR to a screen region (e.g. one app window) and avoid matching text in other windows. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct OcrRegionNative { + pub x0: i32, + pub y0: i32, + pub width: u32, + pub height: u32, +} + +/// A single OCR text match with global display coordinates. +/// Returned by [`ComputerUseHost::ocr_find_text_matches`]. +#[derive(Debug, Clone)] +pub struct OcrTextMatch { + pub text: String, + pub confidence: f32, + pub center_x: f64, + pub center_y: f64, + pub bounds_left: f64, + pub bounds_top: f64, + pub bounds_width: f64, + pub bounds_height: f64, +} + /// Filter for native accessibility (macOS AX) BFS search — role/title/identifier substrings. #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct UiElementLocateQuery { @@ -206,6 +251,16 @@ pub struct UiElementLocateResult { pub matched_title: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub matched_identifier: Option, + /// Parent element role + title for disambiguation (e.g. "AXWindow: Settings"). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub parent_context: Option, + /// Total number of elements that matched the query (before ranking). + /// If > 1, the model should consider whether this is the right one. + #[serde(default)] + pub total_matches: u32, + /// Brief descriptions of other matches (up to 4) for disambiguation. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub other_matches: Vec, } #[async_trait] @@ -232,6 +287,20 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { .await } + /// OCR on **raw display pixels** (no pointer/SoM overlay). Desktop captures only the relevant region: + /// optional `region_native`, else on macOS the frontmost window from Accessibility, else the primary display. + /// Default returns a "not implemented" error. Desktop overrides with Vision (macOS), WinRT OCR (Windows), or Tesseract (Linux). 
+ async fn ocr_find_text_matches( + &self, + text_query: &str, + region_native: Option, + ) -> BitFunResult> { + let _ = (text_query, region_native); + Err(BitFunError::tool( + "OCR text recognition is not available on this host.".to_string(), + )) + } + /// Map `(x, y)` from the **last** screenshot's image pixel grid to global pointer pixels. /// Fails if no screenshot was taken in this process since startup (or since last host reset). fn map_image_coords_to_pointer(&self, x: i32, y: i32) -> BitFunResult<(i32, i32)>; @@ -263,8 +332,28 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { /// Click at the **current** pointer position only (does not move). Use `ComputerUseMousePrecise` / `ComputerUseMouseStep` / `pointer_move_rel` first. /// `button`: "left" | "right" | "middle" + /// On desktop, enforces the vision fine-screenshot guard (unlike [`mouse_click_authoritative`](Self::mouse_click_authoritative)). async fn mouse_click(&self, button: &str) -> BitFunResult<()>; + /// Click at the current pointer after the host has moved it to a **trusted** target (`click_element`, `click_label`, `move_to_text`). + /// Skips the vision fine-screenshot / stale-pointer guard that [`mouse_click`](Self::mouse_click) applies after a pointer move. + /// Default: delegates to [`mouse_click`](Self::mouse_click). + async fn mouse_click_authoritative(&self, button: &str) -> BitFunResult<()> { + self.mouse_click(button).await + } + + /// Press a mouse button and hold it at the current pointer position. + /// `button`: "left" | "right" | "middle" + async fn mouse_down(&self, _button: &str) -> BitFunResult<()> { + Err(BitFunError::tool("mouse_down is not supported on this host.".to_string())) + } + + /// Release a mouse button at the current pointer position. 
+ /// `button`: "left" | "right" | "middle" + async fn mouse_up(&self, _button: &str) -> BitFunResult<()> { + Err(BitFunError::tool("mouse_up is not supported on this host.".to_string())) + } + async fn scroll(&self, delta_x: i32, delta_y: i32) -> BitFunResult<()>; /// Press key combination; names like "command", "control", "shift", "alt", "return", "tab", "escape", "space", or single letters. @@ -297,12 +386,24 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { Ok(()) } + /// Relaxed click guard for AX-based `click_element`: skips the fine-screenshot requirement. + /// AX coordinates are authoritative, so no quadrant drill or point crop is needed. + fn computer_use_guard_click_allowed_relaxed(&self) -> BitFunResult<()> { + Ok(()) + } + /// What the **last** `screenshot_display` captured (e.g. coordinate hints for the model). /// Default: unknown (`None`). Desktop sets after each `screenshot_display`. fn last_screenshot_refinement(&self) -> Option { None } + /// Derive structured interaction readiness and guidance from the current session state. + /// Default: empty/default state. Desktop overrides with state-driven implementation. + fn computer_use_interaction_state(&self) -> ComputerUseInteractionState { + ComputerUseInteractionState::default() + } + /// Search the frontmost app’s accessibility tree (macOS AX) for a matching control and return a stable center. /// Default: unsupported outside the desktop host / non-macOS. async fn locate_ui_element_screen_center( @@ -313,8 +414,67 @@ pub trait ComputerUseHost: Send + Sync + std::fmt::Debug { "Native UI element (accessibility) lookup is not available on this host.".to_string(), )) } + + /// Enumerate all visible interactive UI elements for Set-of-Mark (SoM) overlay. + /// Returns elements suitable for numbered label annotation on screenshots. + /// Default: empty (no SoM support). 
+ async fn enumerate_som_elements(&self) -> Vec { + vec![] + } + + /// Record a completed action for loop detection and history tracking. + /// Default: no-op. Desktop host overrides with optimizer integration. + fn record_action(&self, _action_type: &str, _action_params: &str, _success: bool) {} + + /// Update the screenshot hash for visual change detection. + /// Default: no-op. Desktop host overrides with optimizer integration. + fn update_screenshot_hash(&self, _hash: u64) {} + + /// Check if the agent is stuck in a repeating action loop. + /// Returns a detection result with suggestions if a loop is found. + /// Default: no loop detected. + fn detect_action_loop(&self) -> LoopDetectionResult { + LoopDetectionResult { + is_loop: false, + pattern_length: 0, + repetitions: 0, + suggestion: String::new(), + } + } + + /// Get action history for context and backtracking. + /// Default: empty history. + fn get_action_history(&self) -> Vec { + vec![] + } +} + +/// A visible interactive UI element discovered via the accessibility tree, +/// used for Set-of-Mark (SoM) numbered label overlay on screenshots. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SomElement { + /// 1-based label number rendered on the screenshot. + pub label: u32, + /// AX role (e.g. "AXButton", "AXTextField"). + pub role: String, + /// AX title (visible label text), if any. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub title: Option, + /// AX identifier, if any. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub identifier: Option, + /// Global screen center X (host pointer space). + pub global_center_x: f64, + /// Global screen center Y (host pointer space). + pub global_center_y: f64, + /// Element bounds in global screen space. + pub bounds_left: f64, + pub bounds_top: f64, + pub bounds_width: f64, + pub bounds_height: f64, } + /// Whether the latest screenshot JPEG was the full display, a point crop, or a quadrant-drill region. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ComputerUseScreenshotRefinement { @@ -330,4 +490,80 @@ pub enum ComputerUseScreenshotRefinement { }, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerUseInteractionScreenshotKind { + FullDisplay, + RegionCrop, + QuadrantDrill, + QuadrantTerminal, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputerUseLastMutationKind { + Screenshot, + PointerMove, + Click, + Scroll, + KeyChord, + TypeText, + Wait, + Locate, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] +pub struct ComputerUseInteractionState { + pub click_ready: bool, + pub enter_ready: bool, + pub requires_fresh_screenshot_before_click: bool, + pub requires_fresh_screenshot_before_enter: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub last_screenshot_kind: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub last_mutation: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub recommended_next_action: Option, +} + pub type ComputerUseHostRef = std::sync::Arc; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn interaction_state_serializes_expected_shape() { + let state = ComputerUseInteractionState { + click_ready: false, + enter_ready: true, + requires_fresh_screenshot_before_click: true, + requires_fresh_screenshot_before_enter: false, + last_screenshot_kind: Some(ComputerUseInteractionScreenshotKind::FullDisplay), + last_mutation: Some(ComputerUseLastMutationKind::Screenshot), + recommended_next_action: Some("screenshot_navigate_quadrant".to_string()), + }; + + let value = serde_json::to_value(&state).expect("serialize interaction state"); + + assert_eq!(value["click_ready"], serde_json::json!(false)); + assert_eq!(value["enter_ready"], serde_json::json!(true)); + assert_eq!( + 
value["requires_fresh_screenshot_before_click"], + serde_json::json!(true) + ); + assert_eq!( + value["requires_fresh_screenshot_before_enter"], + serde_json::json!(false) + ); + assert_eq!( + value["last_screenshot_kind"], + serde_json::json!("full_display") + ); + assert_eq!(value["last_mutation"], serde_json::json!("screenshot")); + assert_eq!( + value["recommended_next_action"], + serde_json::json!("screenshot_navigate_quadrant") + ); + } +} diff --git a/src/crates/core/src/agentic/tools/computer_use_optimizer.rs b/src/crates/core/src/agentic/tools/computer_use_optimizer.rs new file mode 100644 index 00000000..e10426b5 --- /dev/null +++ b/src/crates/core/src/agentic/tools/computer_use_optimizer.rs @@ -0,0 +1,195 @@ +//! Computer Use optimization: action verification, loop detection, and retry logic. + +use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; +use std::time::{SystemTime, UNIX_EPOCH}; + +/// Maximum actions to track in history +const MAX_HISTORY_SIZE: usize = 50; + +/// Loop detection window (check last N actions) +const LOOP_DETECTION_WINDOW: usize = 10; + +/// Maximum identical action sequences before triggering loop detection +const MAX_LOOP_REPETITIONS: usize = 3; + +/// Action record for history tracking +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ActionRecord { + pub timestamp_ms: u64, + pub action_type: String, + pub action_params: String, + pub success: bool, + pub screenshot_hash: Option, +} + +/// Loop detection result +#[derive(Debug, Clone)] +pub struct LoopDetectionResult { + pub is_loop: bool, + pub pattern_length: usize, + pub repetitions: usize, + pub suggestion: String, +} + +/// Computer Use session optimizer +#[derive(Debug)] +pub struct ComputerUseOptimizer { + action_history: VecDeque, + last_screenshot_hash: Option, +} + +impl ComputerUseOptimizer { + pub fn new() -> Self { + Self { + action_history: VecDeque::with_capacity(MAX_HISTORY_SIZE), + last_screenshot_hash: None, + } + } + + /// Record 
an action in history + pub fn record_action( + &mut self, + action_type: String, + action_params: String, + success: bool, + ) { + let timestamp_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + + let record = ActionRecord { + timestamp_ms, + action_type, + action_params, + success, + screenshot_hash: self.last_screenshot_hash, + }; + + self.action_history.push_back(record); + if self.action_history.len() > MAX_HISTORY_SIZE { + self.action_history.pop_front(); + } + } + + /// Update screenshot hash for visual change detection + pub fn update_screenshot_hash(&mut self, hash: u64) { + self.last_screenshot_hash = Some(hash); + } + + /// Detect if agent is stuck in a loop + pub fn detect_loop(&self) -> LoopDetectionResult { + if self.action_history.len() < LOOP_DETECTION_WINDOW { + return LoopDetectionResult { + is_loop: false, + pattern_length: 0, + repetitions: 0, + suggestion: String::new(), + }; + } + + // Check for repeating action patterns + for pattern_len in 2..=5 { + if let Some(result) = self.check_pattern_repetition(pattern_len) { + if result.repetitions >= MAX_LOOP_REPETITIONS { + return result; + } + } + } + + // Check for screenshot stagnation (same view, different actions) + if self.check_screenshot_stagnation() { + return LoopDetectionResult { + is_loop: true, + pattern_length: 0, + repetitions: 0, + suggestion: "Screen state unchanged after multiple actions. 
Try a different approach or use accessibility tree instead of vision.".to_string(), + }; + } + + LoopDetectionResult { + is_loop: false, + pattern_length: 0, + repetitions: 0, + suggestion: String::new(), + } + } + + fn check_pattern_repetition(&self, pattern_len: usize) -> Option { + let recent: Vec<_> = self.action_history.iter().rev().take(LOOP_DETECTION_WINDOW).collect(); + if recent.len() < pattern_len * MAX_LOOP_REPETITIONS { + return None; + } + + let pattern: Vec<_> = recent.iter().take(pattern_len).map(|r| &r.action_type).collect(); + let mut reps = 1; + + for chunk in recent.chunks(pattern_len).skip(1) { + if chunk.len() != pattern_len { + break; + } + let chunk_types: Vec<_> = chunk.iter().map(|r| &r.action_type).collect(); + if chunk_types == pattern { + reps += 1; + } else { + break; + } + } + + if reps >= MAX_LOOP_REPETITIONS { + Some(LoopDetectionResult { + is_loop: true, + pattern_length: pattern_len, + repetitions: reps, + suggestion: format!( + "Detected repeating pattern of {} actions (repeated {} times). 
Try: 1) Use accessibility tree (click_element/locate) instead of vision, 2) Use keyboard shortcuts instead of mouse, 3) Take a fresh screenshot to verify current state.", + pattern_len, reps + ), + }) + } else { + None + } + } + + fn check_screenshot_stagnation(&self) -> bool { + let recent: Vec<_> = self.action_history.iter().rev().take(6).collect(); + if recent.len() < 6 { + return false; + } + + // Check if last 6 actions had same screenshot hash (no visual change) + if let Some(first_hash) = recent[0].screenshot_hash { + recent.iter().skip(1).all(|r| r.screenshot_hash == Some(first_hash)) + } else { + false + } + } + + /// Get action history for backtracking + pub fn get_history(&self) -> Vec { + self.action_history.iter().cloned().collect() + } + + /// Clear history (for new task) + pub fn clear_history(&mut self) { + self.action_history.clear(); + self.last_screenshot_hash = None; + } +} + +impl Default for ComputerUseOptimizer { + fn default() -> Self { + Self::new() + } +} + +/// Simple hash function for screenshot comparison +pub fn hash_screenshot_bytes(bytes: &[u8]) -> u64 { + let mut hash: u64 = 0xcbf29ce484222325; + for &byte in bytes.iter().step_by(1000) { + hash ^= byte as u64; + hash = hash.wrapping_mul(0x100000001b3); + } + hash +} diff --git a/src/crates/core/src/agentic/tools/computer_use_verification.rs b/src/crates/core/src/agentic/tools/computer_use_verification.rs new file mode 100644 index 00000000..5e9d31d0 --- /dev/null +++ b/src/crates/core/src/agentic/tools/computer_use_verification.rs @@ -0,0 +1,102 @@ +//! Post-action verification and smart retry logic. 
+ +use crate::util::errors::BitFunError; +use serde::{Deserialize, Serialize}; + +/// Verification result after an action +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VerificationResult { + pub verified: bool, + pub visual_change_detected: bool, + pub change_percentage: f32, + pub suggestion: Option, +} + +/// Retry strategy for failed actions +#[derive(Debug, Clone)] +pub struct RetryStrategy { + pub max_attempts: u32, + pub current_attempt: u32, + pub should_retry: bool, + pub retry_delay_ms: u64, +} + +impl RetryStrategy { + pub fn new(max_attempts: u32) -> Self { + Self { + max_attempts, + current_attempt: 0, + should_retry: true, + retry_delay_ms: 500, + } + } + + pub fn next_attempt(&mut self) -> bool { + self.current_attempt += 1; + self.should_retry = self.current_attempt < self.max_attempts; + self.should_retry + } + + pub fn is_exhausted(&self) -> bool { + self.current_attempt >= self.max_attempts + } +} + +/// Compare two screenshot hashes to detect visual changes +pub fn detect_visual_change(hash_before: u64, hash_after: u64) -> VerificationResult { + let changed = hash_before != hash_after; + + // Simple change detection based on hash difference + let change_pct = if changed { 100.0 } else { 0.0 }; + + VerificationResult { + verified: changed, + visual_change_detected: changed, + change_percentage: change_pct, + suggestion: if !changed { + Some("No visual change detected. Action may have failed or UI did not update. 
Consider: 1) Retry the action, 2) Verify element is clickable, 3) Try keyboard shortcut instead.".to_string()) + } else { + None + }, + } +} + +/// Determine if an action should be retried based on error type +pub fn should_retry_action(error: &BitFunError, action_type: &str) -> bool { + let error_msg = error.to_string().to_lowercase(); + + // Retry on transient errors + if error_msg.contains("timeout") + || error_msg.contains("not found") + || error_msg.contains("element moved") + || error_msg.contains("stale") { + return true; + } + + // Don't retry on permission or configuration errors + if error_msg.contains("permission") + || error_msg.contains("not enabled") + || error_msg.contains("not available") { + return false; + } + + // Retry click/locate actions by default + matches!(action_type, "click" | "click_element" | "click_label" | "locate") +} + +/// Generate retry suggestion based on failure context +pub fn generate_retry_suggestion(action_type: &str, attempt: u32) -> String { + match action_type { + "click" | "click_element" => { + if attempt == 1 { + "First retry: Taking fresh screenshot to verify element position.".to_string() + } else { + "Retry failed. Try: 1) Use accessibility tree (click_element), 2) Use keyboard shortcut, 3) Verify element is visible and clickable.".to_string() + } + } + "locate" => { + "Element not found. 
Try: 1) Broaden search criteria (use filter_combine: 'any'), 2) Use only role_substring or title_contains, 3) Verify app is focused.".to_string() + } + _ => format!("Retry attempt {} for action: {}", attempt, action_type), + } +} diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_input.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_input.rs new file mode 100644 index 00000000..2533058f --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_input.rs @@ -0,0 +1,223 @@ +use crate::agentic::tools::computer_use_host::{ + ComputerUseImplicitScreenshotCenter, ComputerUseNavigateQuadrant, ComputerUseScreenshotParams, + ScreenshotCropCenter, +}; +use crate::util::errors::{BitFunError, BitFunResult}; +use serde_json::Value; + +pub fn use_screen_coordinates(input: &Value) -> bool { + input + .get("use_screen_coordinates") + .and_then(|v| v.as_bool()) + .unwrap_or(false) +} + +/// Rejects JPEG/normalized coordinates for pointer moves — vision-derived positions are unreliable. +/// Use `use_screen_coordinates: true` with globals from OCR/AX tools, or non-coordinate actions. +pub fn ensure_pointer_move_uses_screen_coordinates_only(input: &Value) -> BitFunResult<()> { + if use_screen_coordinates(input) { + return Ok(()); + } + Err(BitFunError::tool( + "Positioning from screenshot pixels (coordinate_mode image/normalized) is disabled: do not guess coordinates from vision. Set use_screen_coordinates: true with global display coordinates from move_to_text (global_center_x/y), locate, click_element, or pointer_image_x/y from the last screenshot JSON; or use move_to_text, click_element, click_label, pointer_move_rel, ComputerUseMouseStep. 
Screenshots are for confirmation only.".to_string(), + )) +} + +pub fn coordinate_mode(input: &Value) -> &str { + input + .get("coordinate_mode") + .and_then(|v| v.as_str()) + .unwrap_or("image") +} + +pub fn parse_screenshot_crop_center(input: &Value) -> BitFunResult> { + let xv = input.get("screenshot_crop_center_x"); + let yv = input.get("screenshot_crop_center_y"); + let x_none = xv.is_none() || xv.is_some_and(|v| v.is_null()); + let y_none = yv.is_none() || yv.is_some_and(|v| v.is_null()); + + match (x_none, y_none) { + (true, true) => Ok(None), + (false, false) => { + let x = xv + .and_then(|v| v.as_u64()) + .ok_or_else(|| BitFunError::tool("screenshot_crop_center_x must be a non-negative integer (full-display native pixels).".to_string()))?; + let y = yv + .and_then(|v| v.as_u64()) + .ok_or_else(|| BitFunError::tool("screenshot_crop_center_y must be a non-negative integer (full-display native pixels).".to_string()))?; + Ok(Some(ScreenshotCropCenter { + x: u32::try_from(x) + .map_err(|_| BitFunError::tool("screenshot_crop_center_x is too large.".to_string()))?, + y: u32::try_from(y) + .map_err(|_| BitFunError::tool("screenshot_crop_center_y is too large.".to_string()))?, + })) + } + _ => Err(BitFunError::tool( + "screenshot_crop_center_x and screenshot_crop_center_y must both be set or both omitted for action screenshot.".to_string(), + )), + } +} + +pub fn parse_screenshot_crop_half_extent_native(input: &Value) -> BitFunResult> { + match input.get("screenshot_crop_half_extent_native") { + None => Ok(None), + Some(v) if v.is_null() => Ok(None), + Some(v) => { + let n = v + .as_u64() + .ok_or_else(|| BitFunError::tool("screenshot_crop_half_extent_native must be a non-negative integer.".to_string()))?; + Ok(Some( + u32::try_from(n) + .map_err(|_| BitFunError::tool("screenshot_crop_half_extent_native is too large.".to_string()))?, + )) + } + } +} + +pub fn input_has_screenshot_crop_fields(input: &Value) -> bool { + let x = input.get("screenshot_crop_center_x"); 
+ let y = input.get("screenshot_crop_center_y"); + x.is_some_and(|v| !v.is_null()) || y.is_some_and(|v| !v.is_null()) +} + +pub fn parse_screenshot_implicit_center( + input: &Value, +) -> BitFunResult> { + match input + .get("screenshot_implicit_center") + .and_then(|v| v.as_str()) + .map(str::trim) + { + None | Some("") => Ok(None), + Some("mouse") => Ok(Some(ComputerUseImplicitScreenshotCenter::Mouse)), + Some("text_caret") => Ok(Some(ComputerUseImplicitScreenshotCenter::TextCaret)), + Some(other) => Err(BitFunError::tool(format!( + "screenshot_implicit_center must be \"mouse\" or \"text_caret\", got {:?}", + other + ))), + } +} + +pub fn parse_screenshot_navigate_quadrant( + input: &Value, +) -> BitFunResult> { + let value = input + .get("screenshot_navigate_quadrant") + .filter(|x| !x.is_null()) + .and_then(|x| x.as_str()); + let Some(s) = value else { + return Ok(None); + }; + + let n = s.trim().to_ascii_lowercase().replace('-', "_"); + Ok(Some(match n.as_str() { + "top_left" | "topleft" | "upper_left" => ComputerUseNavigateQuadrant::TopLeft, + "top_right" | "topright" | "upper_right" => ComputerUseNavigateQuadrant::TopRight, + "bottom_left" | "bottomleft" | "lower_left" => ComputerUseNavigateQuadrant::BottomLeft, + "bottom_right" | "bottomright" | "lower_right" => ComputerUseNavigateQuadrant::BottomRight, + _ => { + return Err(BitFunError::tool( + "screenshot_navigate_quadrant must be one of: top_left, top_right, bottom_left, bottom_right.".to_string(), + )); + } + })) +} + +pub fn parse_screenshot_params(input: &Value) -> BitFunResult<(ComputerUseScreenshotParams, bool)> { + let navigate = parse_screenshot_navigate_quadrant(input)?; + let reset_navigation = input + .get("screenshot_reset_navigation") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + let implicit_center = parse_screenshot_implicit_center(input)?; + + if navigate.is_some() { + let ignored_crop = input_has_screenshot_crop_fields(input); + return Ok(( + ComputerUseScreenshotParams { + 
crop_center: None, + navigate_quadrant: navigate, + reset_navigation, + point_crop_half_extent_native: None, + implicit_confirmation_center: implicit_center, + }, + ignored_crop, + )); + } + + let crop = parse_screenshot_crop_center(input)?; + let half = if crop.is_some() { + parse_screenshot_crop_half_extent_native(input)? + } else { + None + }; + + Ok(( + ComputerUseScreenshotParams { + crop_center: crop, + navigate_quadrant: None, + reset_navigation, + point_crop_half_extent_native: half, + implicit_confirmation_center: implicit_center, + }, + false, + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn screenshot_params_prefer_quadrant_over_crop_fields() { + let input = json!({ + "screenshot_navigate_quadrant": "top_left", + "screenshot_crop_center_x": 120, + "screenshot_crop_center_y": 340, + "screenshot_reset_navigation": true, + }); + + let (params, ignored_crop) = + parse_screenshot_params(&input).expect("parse screenshot params"); + + assert_eq!( + params.navigate_quadrant, + Some(ComputerUseNavigateQuadrant::TopLeft) + ); + assert_eq!(params.crop_center, None); + assert!(params.reset_navigation); + assert!(ignored_crop); + } + + #[test] + fn screenshot_params_parse_crop_half_extent_only_with_crop() { + let input = json!({ + "screenshot_crop_center_x": 33, + "screenshot_crop_center_y": 44, + "screenshot_crop_half_extent_native": 180 + }); + + let (params, ignored_crop) = + parse_screenshot_params(&input).expect("parse screenshot params"); + + let crop = params.crop_center.expect("crop center"); + assert_eq!(crop.x, 33); + assert_eq!(crop.y, 44); + assert_eq!(params.point_crop_half_extent_native, Some(180)); + assert!(!ignored_crop); + } + + #[test] + fn screenshot_params_parse_implicit_center() { + let input = json!({ + "screenshot_implicit_center": "text_caret" + }); + let (params, _) = parse_screenshot_params(&input).expect("parse"); + assert_eq!( + params.implicit_confirmation_center, + 
Some(ComputerUseImplicitScreenshotCenter::TextCaret) + ); + } +} diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs index 14c79415..9f54a6b8 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_locate.rs @@ -1,4 +1,4 @@ -//! Accessibility tree locate — invoked as `ComputerUse` **`action: "locate"`** (same tool as screenshot / keys). +//! Accessibility tree locate -- invoked as `ComputerUse` **`action: "locate"`** (same tool as screenshot / keys). use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; use crate::agentic::tools::computer_use_host::{ @@ -10,7 +10,7 @@ use crate::service::config::global::GlobalConfigManager; use crate::util::errors::{BitFunError, BitFunResult}; use serde_json::{json, Value}; -/// Runs native UI locate (AX / UIA / AT-SPI) for the foreground app — **`ComputerUse`** `action: "locate"`. +/// Runs native UI locate (AX / UIA / AT-SPI) for the foreground app -- `ComputerUse` `action: "locate"`. pub(crate) async fn execute_computer_use_locate( input: &Value, context: &ToolUseContext, @@ -93,43 +93,35 @@ pub(crate) async fn execute_computer_use_locate( let gx = res.global_center_x.round() as i64; let gy = res.global_center_y.round() as i64; - let ncx = res.native_center_x as i64; - let ncy = res.native_center_y as i64; let suggested_half = suggested_point_crop_half_extent_from_native_bounds(native_w, native_h); let coordinate_hints = json!({ - "mouse_precise_screen": { - "tool": "ComputerUseMousePrecise", + "click_element": { + "action": "click_element", + "note": "Fastest path: use click_element with the same locate filters. No screenshot needed." 
+ }, + "mouse_move_screen": { + "action": "mouse_move", "use_screen_coordinates": true, "x": gx, "y": gy, - "note": "Global display coordinates (host native units, e.g. macOS points). No prior screenshot required." - }, - "mouse_precise_image_after_full_screenshot": { - "tool": "ComputerUseMousePrecise", - "use_screen_coordinates": false, - "coordinate_mode": "image", - "x": ncx, - "y": ncy, - "note": "Use only when the last ComputerUse screenshot was full-display; x/y match margin ruler indices on that JPEG. After a point-crop screenshot, image space is the crop — do not reuse these numbers." + "note": "Global display coordinates (host native units). No prior screenshot required." }, "screenshot_point_crop": { - "tool": "ComputerUse", "action": "screenshot", "screenshot_crop_center_x": res.native_center_x, "screenshot_crop_center_y": res.native_center_y, "screenshot_crop_half_extent_native": suggested_half, - "note": "Copy **`screenshot_crop_center_*`** and **`screenshot_crop_half_extent_native`** into **`ComputerUse`** `action: \"screenshot\"`. Half-extent is derived from `native_extent_*` (tighter on small controls; host clamps)." + "note": "Point-crop screenshot centered on the element for visual verification." }, "native_extent_px": { "width": native_w, "height": native_h, - "note": "Approximate control size in full-display native pixels; prefer smaller ComputerUseMouseStep pixels when width/height are small." 
} }); - let body = json!({ + let mut body = json!({ "success": true, "action": "locate", "global_center_x": res.global_center_x, @@ -150,13 +142,32 @@ pub(crate) async fn execute_computer_use_locate( "matched_role": res.matched_role, "matched_title": res.matched_title, "matched_identifier": res.matched_identifier, - "recommended_next": "Prefer **`ComputerUse`** `action: screenshot` with fields from `coordinate_hints.screenshot_point_crop` to narrow the JPEG before quadrant drill; then ComputerUseMousePrecise / ComputerUseMouseStep + ComputerUseMouseClick, or use mouse_precise_screen if no screenshot is needed yet." }); + // Include disambiguation info when multiple matches were found + if res.total_matches > 1 { + body["total_matches"] = json!(res.total_matches); + body["warning"] = json!(format!( + "{} elements matched; returning the best-ranked one. See `other_matches` for alternatives.", + res.total_matches + )); + } + if let Some(ref pc) = res.parent_context { + body["parent_context"] = json!(pc); + } + if !res.other_matches.is_empty() { + body["other_matches"] = json!(res.other_matches); + } + let body = computer_use_augment_result_json(host.as_ref(), body, Some(input_coords)).await; + let match_info = if res.total_matches > 1 { + format!(" ({} matches, best ranked)", res.total_matches) + } else { + String::new() + }; let summary = format!( - "AX match: role={} native_center=({}, {}) native_bounds=[{}..{}, {}..{}] global_center=({:.1}, {:.1})", + "AX match: role={} native_center=({}, {}) native_bounds=[{}..{}, {}..{}] global_center=({:.1}, {:.1}){}", res.matched_role, res.native_center_x, res.native_center_y, @@ -165,7 +176,8 @@ pub(crate) async fn execute_computer_use_locate( res.native_bounds_min_y, res.native_bounds_max_y, res.global_center_x, - res.global_center_y + res.global_center_y, + match_info, ); Ok(vec![ToolResult::ok(body, Some(summary))]) diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_click_tool.rs 
b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_click_tool.rs index 69cb00db..ccf3f3ff 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_click_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_click_tool.rs @@ -24,7 +24,7 @@ impl Tool for ComputerUseMouseClickTool { async fn description(&self) -> BitFunResult { Ok( - "Click or scroll the **mouse wheel** at the **current** pointer (does not move the pointer). **`action`: `click`** — optional **`button`** (`left` | `right` | `middle`, default left); host enforces a fresh **fine** screenshot basis before click (same as former `ComputerUse` `click`). **`action`: `wheel`** — **`delta_x`** / **`delta_y`** (non-zero) for horizontal/vertical wheel ticks at the cursor (same as former `ComputerUse` `scroll`). Position the pointer first with **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** / **`ComputerUse`** `pointer_move_rel`, then **`screenshot`** before click when the host requires it." + "Click or scroll the **mouse wheel** at the **current** pointer (does not move the pointer). **`action`: `click`** — optional **`button`** (`left` | `right` | `middle`, default left), optional **`num_clicks`** (1 = single click default, 2 = double click, 3 = triple click); host enforces a fresh **fine** screenshot basis before click (same as former `ComputerUse` `click`). **`action`: `wheel`** — **`delta_x`** / **`delta_y`** (non-zero) for horizontal/vertical wheel ticks at the cursor (same as former `ComputerUse` `scroll`). Position the pointer first with **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** / **`ComputerUse`** `pointer_move_rel`, then **`screenshot`** before click when the host requires it." .to_string(), ) } @@ -43,6 +43,12 @@ impl Tool for ComputerUseMouseClickTool { "enum": ["left", "right", "middle"], "description": "For `action` **click** only (default left). Ignored for `wheel`." 
}, + "num_clicks": { + "type": "integer", + "minimum": 1, + "maximum": 3, + "description": "For `action` **click** only: number of clicks (1 = single click, 2 = double click for opening files / selecting words, 3 = triple click for selecting lines). Default 1." + }, "delta_x": { "type": "integer", "description": "For `action` **wheel** only: horizontal wheel delta (non-zero with delta_y or alone). Ignored for `click`." diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_precise_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_precise_tool.rs index d084f547..6b2f7e67 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_precise_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_precise_tool.rs @@ -24,7 +24,7 @@ impl Tool for ComputerUseMousePreciseTool { async fn description(&self) -> BitFunResult { Ok( - "Move the mouse pointer to **absolute** coordinates. Use **`coordinate_mode`** (`image` = last screenshot JPEG — **preferred for precision**; `normalized` = 0..1000 — **coarse**, avoid for fine alignment) or **`use_screen_coordinates`** for global display units. Same semantics as the former `ComputerUse` `mouse_move` absolute path. For **small** cardinal nudges, prefer **`ComputerUseMouseStep`** instead of tiny absolute x/y.".to_string(), + "Move the mouse pointer to **absolute global** coordinates only: set **`use_screen_coordinates`: true** (macOS: **points**). **Do not** use `coordinate_mode` image/normalized — that path is disabled (vision-derived positions are unreliable). Use numbers from **`move_to_text`**, **`locate`**, AX tools, or **`pointer_global`** in tool JSON. Same as `ComputerUse` **`mouse_move`**. 
For **small** cardinal nudges, prefer **ComputerUseMouseStep**.".to_string(), ) } @@ -34,20 +34,20 @@ impl Tool for ComputerUseMousePreciseTool { "properties": { "x": { "type": "integer", - "description": "Target x: in **image** mode, pixel on the latest screenshot JPEG; in **normalized**, 0..=1000 on the captured display; with **use_screen_coordinates**, global display units (host native, e.g. macOS points)." + "description": "Target x in **global display** units — requires **use_screen_coordinates**: true (e.g. from move_to_text global_center_x, locate, pointer_global.x)." }, - "y": { "type": "integer", "description": "Target y; same coordinate space as x." }, + "y": { "type": "integer", "description": "Target y; same as x (global display units)." }, "coordinate_mode": { "type": "string", "enum": ["image", "normalized"], - "description": "When use_screen_coordinates is false. \"image\" = pixels on the latest screenshot JPEG (use for precise moves). \"normalized\" = 0..=1000 (coarse grid only)." + "description": "Ignored — image/normalized positioning is disabled; always use **use_screen_coordinates**: true." }, "use_screen_coordinates": { "type": "boolean", - "description": "If true, x/y are global display coordinates in the host's native units (on macOS: **points**)." + "description": "**Must be true.** x/y are global display coordinates (macOS: **points**)." 
} }, - "required": ["x", "y"], + "required": ["x", "y", "use_screen_coordinates"], "additionalProperties": false }) } diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_step_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_step_tool.rs index e9112c4f..086a9e3c 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_step_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_mouse_step_tool.rs @@ -24,7 +24,7 @@ impl Tool for ComputerUseMouseStepTool { async fn description(&self) -> BitFunResult { Ok( - "Move the pointer **one cardinal step** (up / down / left / right) by **`pixels`** (default 32, clamped 1..400) in **screenshot/display pixel** space — same as the former `pointer_nudge` and relative `mouse_move_direction`. Take **`screenshot`** first so the host can convert scale (especially on macOS). For arbitrary deltas including diagonals, use **`ComputerUse`** **`pointer_move_rel`**.".to_string(), + "Move the pointer **one cardinal step** (up / down / left / right) by **`pixels`** (default 32, clamped 1..400) — same as **`ComputerUse`** **`pointer_move_rel`** on macOS scale. **Host blocks this immediately after a `screenshot`** until you reposition with **`move_to_text`**, **`mouse_move`** (`use_screen_coordinates`: true), **`click_element`**, or **`click_label`** (do not nudge from the JPEG). 
For diagonals, use **`ComputerUse`** **`pointer_move_rel`**.".to_string(), ) } diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs new file mode 100644 index 00000000..541c7e8b --- /dev/null +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_result.rs @@ -0,0 +1,121 @@ +use crate::agentic::tools::computer_use_host::{ + ComputerScreenshot, ComputerUseInteractionState, +}; +use serde_json::{json, Value}; + +pub fn append_interaction_state(body: &mut Value, interaction: &ComputerUseInteractionState) { + if let Value::Object(map) = body { + map.insert( + "interaction_state".to_string(), + json!(interaction), + ); + } +} + +pub fn build_screenshot_body( + shot: &ComputerScreenshot, + debug_rel: Option, + interaction: &ComputerUseInteractionState, +) -> Value { + let mut data = json!({ + "success": true, + "mime_type": shot.mime_type, + "image_width": shot.image_width, + "image_height": shot.image_height, + "display_width_px": shot.image_width, + "display_height_px": shot.image_height, + "native_width": shot.native_width, + "native_height": shot.native_height, + "display_origin_x": shot.display_origin_x, + "display_origin_y": shot.display_origin_y, + "vision_scale": shot.vision_scale, + "pointer_image_x": shot.pointer_image_x, + "pointer_image_y": shot.pointer_image_y, + "screenshot_crop_center": shot.screenshot_crop_center, + "point_crop_half_extent_native": shot.point_crop_half_extent_native, + "navigation_native_rect": shot.navigation_native_rect, + "quadrant_navigation_click_ready": shot.quadrant_navigation_click_ready, + "implicit_confirmation_crop_applied": shot.implicit_confirmation_crop_applied, + "debug_screenshot_path": debug_rel, + }); + append_interaction_state(&mut data, interaction); + data +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agentic::tools::computer_use_host::{ + ComputerUseInteractionScreenshotKind, 
ComputerUseImageContentRect, + }; + + #[test] + fn append_interaction_state_includes_structured_block() { + let mut body = json!({ "success": true }); + let interaction = ComputerUseInteractionState { + click_ready: false, + enter_ready: true, + requires_fresh_screenshot_before_click: true, + requires_fresh_screenshot_before_enter: false, + last_screenshot_kind: Some(ComputerUseInteractionScreenshotKind::FullDisplay), + last_mutation: None, + recommended_next_action: Some("screenshot_navigate_quadrant".to_string()), + }; + + append_interaction_state(&mut body, &interaction); + + assert_eq!(body["interaction_state"]["click_ready"], json!(false)); + assert_eq!(body["interaction_state"]["enter_ready"], json!(true)); + assert_eq!( + body["interaction_state"]["recommended_next_action"], + json!("screenshot_navigate_quadrant") + ); + } + + #[test] + fn screenshot_body_keeps_existing_fields_and_adds_interaction_state() { + let shot = ComputerScreenshot { + bytes: vec![1, 2, 3], + mime_type: "image/jpeg".to_string(), + image_width: 100, + image_height: 80, + native_width: 100, + native_height: 80, + display_origin_x: 0, + display_origin_y: 0, + vision_scale: 1.0, + pointer_image_x: Some(10), + pointer_image_y: Some(11), + screenshot_crop_center: None, + point_crop_half_extent_native: None, + navigation_native_rect: None, + quadrant_navigation_click_ready: false, + image_content_rect: Some(ComputerUseImageContentRect { + left: 1, + top: 2, + width: 98, + height: 76, + }), + som_labels: vec![], + implicit_confirmation_crop_applied: false, + }; + let interaction = ComputerUseInteractionState { + click_ready: false, + enter_ready: true, + requires_fresh_screenshot_before_click: true, + requires_fresh_screenshot_before_enter: false, + last_screenshot_kind: Some(ComputerUseInteractionScreenshotKind::FullDisplay), + last_mutation: None, + recommended_next_action: Some("screenshot_navigate_quadrant".to_string()), + }; + + let body = build_screenshot_body(&shot, None, 
&interaction); + + assert_eq!(body["success"], json!(true)); + assert_eq!(body["mime_type"], json!("image/jpeg")); + assert_eq!( + body["interaction_state"]["last_screenshot_kind"], + json!("full_display") + ); + } +} diff --git a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs index 193e5ad0..36d5bd36 100644 --- a/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/computer_use_tool.rs @@ -1,13 +1,18 @@ //! Desktop automation for Claw (Computer use). +use super::computer_use_input::{ + coordinate_mode, ensure_pointer_move_uses_screen_coordinates_only, parse_screenshot_params, + use_screen_coordinates, +}; use super::computer_use_locate::execute_computer_use_locate; use crate::agentic::tools::computer_use_capability::computer_use_desktop_available; use crate::agentic::tools::computer_use_host::{ - ComputerScreenshot, ComputerUseNavigateQuadrant, ComputerUseScreenshotParams, - ComputerUseScreenshotRefinement, ScreenshotCropCenter, + ComputerScreenshot, ComputerUseNavigateQuadrant, ComputerUseScreenshotRefinement, + ScreenshotCropCenter, UiElementLocateQuery, COMPUTER_USE_POINT_CROP_HALF_MAX, COMPUTER_USE_POINT_CROP_HALF_MIN, COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE, COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX, }; +use crate::agentic::tools::computer_use_optimizer::hash_screenshot_bytes; use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; use crate::service::config::global::GlobalConfigManager; use crate::util::errors::{BitFunError, BitFunResult}; @@ -18,12 +23,35 @@ use log::{debug, warn}; use serde_json::{json, Value}; /// Merges [`ComputerUseHost::computer_use_session_snapshot`] + optional `input_coordinates` into tool JSON. +/// Also records the action for loop detection and adds loop warnings if detected. 
pub(crate) async fn computer_use_augment_result_json( host: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, mut body: Value, input_coordinates: Option, ) -> Value { let snap = host.computer_use_session_snapshot().await; + let interaction = host.computer_use_interaction_state(); + + // Record action for loop detection + let action_type = body + .get("action") + .or_else(|| body.get("tool")) + .and_then(|v| v.as_str()) + .unwrap_or("unknown") + .to_string(); + let action_params = input_coordinates + .as_ref() + .map(|v| v.to_string()) + .unwrap_or_default(); + let success = body + .get("success") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + host.record_action(&action_type, &action_params, success); + + // Check for action loops + let loop_result = host.detect_action_loop(); + if let Value::Object(map) = &mut body { map.insert( "computer_use_context".to_string(), @@ -33,6 +61,23 @@ pub(crate) async fn computer_use_augment_result_json( "input_coordinates": input_coordinates, }), ); + map.insert( + "interaction_state".to_string(), + json!(interaction), + ); + + // Add loop detection warning if a loop is detected + if loop_result.is_loop { + map.insert( + "loop_warning".to_string(), + json!({ + "detected": true, + "pattern_length": loop_result.pattern_length, + "repetitions": loop_result.repetitions, + "suggestion": loop_result.suggestion, + }), + ); + } } body } @@ -73,38 +118,42 @@ impl ComputerUseTool { )) } - fn use_screen_coordinates(input: &Value) -> bool { - input - .get("use_screen_coordinates") - .and_then(|v| v.as_bool()) - .unwrap_or(false) - } - - /// `image` (default): x,y are pixel indices in the attached screenshot (`image_width` x `image_height`). - /// `normalized`: x,y each in 0..=1000 across the captured display (coarser but easier for models). 
- fn coordinate_mode(input: &Value) -> &str { - input - .get("coordinate_mode") - .and_then(|v| v.as_str()) - .unwrap_or("image") - } - fn resolve_xy_f64( host: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, input: &Value, x: i32, y: i32, ) -> BitFunResult<(f64, f64)> { - if Self::use_screen_coordinates(input) { + if use_screen_coordinates(input) { return Ok((x as f64, y as f64)); } - if Self::coordinate_mode(input) == "normalized" { + if coordinate_mode(input) == "normalized" { host.map_normalized_coords_to_pointer_f64(x, y) } else { host.map_image_coords_to_pointer_f64(x, y) } } + /// `click` must not carry coordinate fields — use `mouse_move` (or `move_to_text`, etc.) separately. + fn ensure_click_has_no_coordinate_fields(input: &Value) -> BitFunResult<()> { + if input.get("x").is_some() || input.get("y").is_some() { + return Err(BitFunError::tool( + "click does not accept x or y. Position with move_to_text, click_element, or `mouse_move` with use_screen_coordinates: true (globals from tool results), then `click` with only button and num_clicks.".to_string(), + )); + } + if input.get("coordinate_mode").is_some() { + return Err(BitFunError::tool( + "click does not accept coordinate_mode. Use `mouse_move` with use_screen_coordinates: true, then `click`.".to_string(), + )); + } + if input.get("use_screen_coordinates").is_some() { + return Err(BitFunError::tool( + "click does not accept use_screen_coordinates. Use `mouse_move` with use_screen_coordinates, then `click`.".to_string(), + )); + } + Ok(()) + } + /// Runtime host OS label for tool description (desktop session matches this process). 
fn host_os_label() -> &'static str { match std::env::consts::OS { @@ -124,6 +173,29 @@ impl ComputerUseTool { } } + async fn find_text_on_screen( + host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, + text_query: &str, + region_native: Option, + ) -> BitFunResult> { + let matches = host_ref + .ocr_find_text_matches(text_query, region_native) + .await?; + Ok(matches + .into_iter() + .map(|m| ScreenOcrTextMatch { + text: m.text, + confidence: m.confidence, + center_x: m.center_x, + center_y: m.center_y, + bounds_left: m.bounds_left, + bounds_top: m.bounds_top, + bounds_width: m.bounds_width, + bounds_height: m.bounds_height, + }) + .collect()) + } + /// Writes the exact JPEG sent to the model (including pointer overlay) under the workspace for debugging. async fn try_save_screenshot_for_debug( bytes: &[u8], @@ -186,8 +258,16 @@ impl ComputerUseTool { ) -> BitFunResult<(Value, ToolImageAttachment, String)> { let b64 = B64.encode(&shot.bytes); let pointer_marker_note = match (shot.pointer_image_x, shot.pointer_image_y) { - (Some(_), Some(_)) => "The JPEG includes a **synthetic red cursor with gray border** marking the **actual mouse position** on this bitmap (not the OS arrow). The **tip** is the true click hotspot (same pixel as pointer_image_x and pointer_image_y). Use this marker and those numbers for **ComputerUseMousePrecise** — do not ignore them or guess from the OS cursor alone.", - _ => "No pointer overlay in this JPEG (pointer_image_x/y null): the cursor is not on this bitmap (e.g. another display). Do not infer position from the image; use global screen coordinates + use_screen_coordinates, or move the pointer onto this display and screenshot again.", + (Some(_), Some(_)) => "The JPEG includes a **synthetic red cursor with gray border** marking the **actual mouse position** on this bitmap (not the OS arrow). 
The **tip** is the true hotspot for **visual confirmation** only — **do not** use JPEG pixel indices for `mouse_move`; use `use_screen_coordinates: true` with globals from tool results (`pointer_global`, `move_to_text` global_center_*, `locate`, AX) or `move_to_text` / `click_element`.", + _ => "No pointer overlay in this JPEG (pointer_image_x/y null): the cursor is not on this bitmap (e.g. another display). Do not infer position from the image; use global coordinates with `use_screen_coordinates: true`, or move the pointer onto this display and screenshot again.", + }; + let som_note = if shot.som_labels.is_empty() { + "No Set-of-Mark labels on this screenshot.".to_string() + } else { + format!( + "Set-of-Mark labels are overlaid on the screenshot: use `click_label` with a label number from 1..={}. Prefer this over raw coordinate clicks when the target has a visible label.", + shot.som_labels.len() + ) }; let mut data = json!({ "success": true, @@ -208,10 +288,12 @@ impl ComputerUseTool { "point_crop_half_extent_native": shot.point_crop_half_extent_native, "navigation_native_rect": shot.navigation_native_rect, "quadrant_navigation_click_ready": shot.quadrant_navigation_click_ready, + "implicit_confirmation_crop_applied": shot.implicit_confirmation_crop_applied, "debug_screenshot_path": debug_rel, + "som_label_note": som_note, }); let shortcut_policy = format!( - "**First:** `key_chord` for shortcuts **and** system clipboard (copy/cut/paste/select-all per host OS) — avoid Edit-menu clicks and avoid long `type_text` when paste fits. **Then** pointer when shortcuts do not fit (then screenshot **only** when you need pixels or before host-guarded click/Enter). **Default for click prep:** after a full-frame shot, chain `screenshot` + `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` (long edge < {} px). **Do not** skip to `screenshot_crop_center_*` from full screen unless justified. 
**Quadrant narrowing is never automatic:** each drill step must set `screenshot_navigate_quadrant` on that `screenshot` call; a bare `screenshot` only refreshes. Point crop (~500×500) is a **fallback**. **Small pointer tweaks:** prefer **ComputerUseMouseStep** (`direction` + optional `pixels`) over tiny absolute **ComputerUseMousePrecise** `x`/`y` — easier for vision models than sub-pixel absolute coords. **Do not** screenshot after every `locate` or non-Enter `key_chord`; **fresh** screenshot **before** `key_chord` that sends Return/Enter (host) and before **click** (host).", + "**Targeting priority:** `click_element` → **`move_to_text`** (OCR + move; no prior `screenshot` for targeting) → **`click_label`** if SoM exists on a shot → **`screenshot`** (confirm / drill) + **`mouse_move`** (**`use_screen_coordinates`: true only**) + **`click`** last. **Screenshots are for confirmation and navigation — do not guess move targets from JPEG pixels.** **`click`** never moves the pointer. **Host-only mandatory screenshot:** before **`click`** or Enter **`key_chord`** when the pointer changed since the last capture — **not** before `mouse_move`, `scroll`, `type_text`, `locate`, `wait`, or non-Enter `key_chord`. **Valid basis for a guarded `click`:** `FullDisplay`, `quadrant_navigation_click_ready`, or point crop; or bare **`screenshot`** after a pointer-changing action (**~500×500** implicit confirmation around mouse/caret). **`mouse_move`** must use **global** coordinates (from `move_to_text` global_center_*, `locate`, AX, or `pointer_global`). **Bare confirmation `screenshot`:** whenever the host still requires a capture before **`click`** or Enter **`key_chord`** (`requires_fresh_screenshot_*`), a bare `screenshot` (no crop / no reset) is **~500×500** centered on **mouse** (`screenshot_implicit_center` default `mouse`) — **including during quadrant drill** and the **first** such capture in a session. 
Before Enter in a text field, set **`screenshot_implicit_center`: `text_caret`**. Use **`screenshot_reset_navigation`**: true for a **full-screen** capture instead. **If AX failed:** try **`move_to_text`** before a long screenshot drill. **Optional refinement** for tiny targets: `screenshot_navigate_quadrant` until `quadrant_navigation_click_ready` (long edge < {} px) or point crop. Small moves: **ComputerUseMouseStep** over tiny **ComputerUseMousePrecise** (screen globals only).", COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE ); let region_crop_size_note = shot @@ -234,7 +316,7 @@ impl ComputerUseTool { "image_is_crop_only": true, "shortcut_policy": shortcut_policy, "instruction": format!( - "{}**margin ruler numbers** are **full-capture native** indices (same whole-screen bitmap space as a full-screen shot — not local 0..crop). `coordinate_mode` \"image\" uses **this JPEG’s** pixel grid (content area under the rulers). For another view, call screenshot with new `screenshot_crop_center_*` in that same full-capture space; optional `screenshot_crop_half_extent_native` adjusts crop size. See shortcut_policy.", + "{}**Image pixel (0,0)** is the **top-left of this crop** in **full-capture native** space (same whole-screen bitmap as a full-screen shot — not local 0..crop only). This view is for **confirmation / drill** — do **not** use JPEG pixels for `mouse_move`. For another view, call screenshot with new `screenshot_crop_center_*` in that same full-capture space; optional `screenshot_crop_half_extent_native` adjusts crop size. See shortcut_policy.", region_crop_size_note ) }) @@ -243,7 +325,7 @@ impl ComputerUseTool { "phase": "quadrant_terminal", "image_is_crop_only": true, "shortcut_policy": shortcut_policy, - "instruction": "Region is small enough for precise pointer: **`quadrant_navigation_click_ready`** is true. 
For **small** alignment fixes, prefer **`ComputerUseMouseStep`** (`direction`, optional `pixels`); use **`ComputerUseMousePrecise`** absolute `x`/`y` only for larger jumps. Then **`ComputerUseMouseClick`** (`action`: click) (no extra point crop required). After pointer moves, screenshot again before the next click (host)." + "instruction": "Region is small enough for precise pointer: **`quadrant_navigation_click_ready`** is true. **Do not** use **`ComputerUseMouseStep`** / **`pointer_move_rel`** immediately after a **`screenshot`** (host blocks — vision nudges are wrong). First **`move_to_text`**, **`mouse_move`** (`use_screen_coordinates`: true), or **`click_element`**, then optional **`ComputerUseMouseStep`** / **`ComputerUseMousePrecise`**. Then **`ComputerUseMouseClick`** (`action`: click). Host requires a **fresh** screenshot before the next **`click`** or Enter **`key_chord`** if pointer state changed since last capture (see shortcut_policy)." }) } else if !Self::shot_covers_full_display(shot) { json!({ @@ -251,7 +333,7 @@ impl ComputerUseTool { "image_is_crop_only": true, "shortcut_policy": shortcut_policy, "instruction": format!( - "**Keep drilling (default):** call **`screenshot`** again with **`screenshot_navigate_quadrant`**: `top_left` | `top_right` | `bottom_left` | `bottom_right` — pick the tile that contains your target. The host expands the chosen quadrant by **{} px** on each side (clamped) so split-edge controls stay in-frame. Repeat until `quadrant_navigation_click_ready`. To restart from the full display, set **`screenshot_reset_navigation`**: true on the next screenshot. Ruler numbers stay **full-display native**. See shortcut_policy.", + "**Keep drilling (default):** call **`screenshot`** again with **`screenshot_navigate_quadrant`**: `top_left` | `top_right` | `bottom_left` | `bottom_right` — pick the tile that contains your target. The host expands the chosen quadrant by **{} px** on each side (clamped) so split-edge controls stay in-frame. 
Repeat until `quadrant_navigation_click_ready`. To restart from the full display, set **`screenshot_reset_navigation`**: true on the next screenshot. Coordinates remain **full-display native**. See shortcut_policy.", COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX ) }) @@ -260,9 +342,9 @@ impl ComputerUseTool { "phase": "full_display", "image_is_crop_only": false, "host_auto_quadrant": false, - "next_step_for_mouse_click": "**Preferred (0):** If **`ComputerUse`** **`action: locate`** can match the control, use **`screenshot_crop_center_*`** (+ optional **`screenshot_crop_half_extent_native`**) to **narrow the JPEG** before the quadrant drill. **Preferred (A):** next tool call = `screenshot` **with** `screenshot_navigate_quadrant` set (top_left|top_right|bottom_left|bottom_right). Repeat until `quadrant_navigation_click_ready`. **Fallback (B):** `screenshot` with `screenshot_crop_center_x/y` when quadrant drill is a poor fit. The host never splits the screen unless you pass `screenshot_navigate_quadrant`.", + "next_step_for_mouse_click": "**First:** **`move_to_text`** if visible text can name the target (OCR + move pointer; then **`click`** if you need a press). **If you must move by globals:** **`mouse_move`** with **`use_screen_coordinates`: true** and coordinates from **`locate`**, **`move_to_text`**, or **`pointer_global`** — **not** from guessing JPEG pixels. Then **`click`** when the host allows (`interaction_state.click_ready`). **Optional refinement:** `screenshot_crop_center_*`, quadrant drill, or **`screenshot_navigate_quadrant`** for smaller targets. Host never splits the screen unless you pass `screenshot_navigate_quadrant`.", "shortcut_policy": shortcut_policy, - "instruction": "Full frame: ruler indices are **full-display native** pixels. **If DOM/AX can locate the target:** use `screenshot_crop_center_*` (+ optional `screenshot_crop_half_extent_native`) first — **before** a long quadrant-only chain. 
**Otherwise** start quadrant drill: next `screenshot` **must** include **`screenshot_navigate_quadrant`**. Repeat one quadrant per call until `quadrant_navigation_click_ready`, then **ComputerUseMousePrecise** / **ComputerUseMouseStep** + **`ComputerUseMouseClick`** (`action`: click). **`ComputerUseMouseClick` (click) is rejected** on full-screen-only. See `next_step_for_mouse_click`, `recommended_next_for_click_targeting`, shortcut_policy." + "instruction": "Full frame: JPEG aligns with **full-display native** space for **visual confirmation** only. **Prefer `move_to_text`** when readable text exists (then **`click`**). **Do not** derive `mouse_move` targets from this bitmap — use **`use_screen_coordinates`: true** with globals from tools, or AX/OCR actions. Then **`click`** when host allows (`click_ready`). For tiny targets, optionally narrow with `screenshot_crop_center_*` or quadrant drill. **`screenshot`**-heavy paths are **last** for targeting. See `next_step_for_mouse_click`, `recommended_next_for_click_targeting`, shortcut_policy." 
}) }; if let Some(obj) = data.as_object_mut() { @@ -270,16 +352,41 @@ impl ComputerUseTool { "hierarchical_navigation".to_string(), hierarchical_navigation, ); - if shot.screenshot_crop_center.is_none() && !shot.quadrant_navigation_click_ready { - let rec = if Self::shot_covers_full_display(shot) { - "screenshot_navigate_quadrant" - } else { - "screenshot_navigate_quadrant_until_click_ready" - }; + if !shot.som_labels.is_empty() { + let som_labels = shot + .som_labels + .iter() + .map(|e| json!({ + "label": e.label, + "role": e.role, + "title": e.title, + "identifier": e.identifier, + })) + .collect::>(); + obj.insert("som_labels".to_string(), Value::Array(som_labels)); obj.insert( "recommended_next_for_click_targeting".to_string(), - Value::String(rec.to_string()), + Value::String("click_label".to_string()), ); + } else if shot.screenshot_crop_center.is_none() && !shot.quadrant_navigation_click_ready { + if Self::shot_covers_full_display(shot) { + obj.insert( + "recommended_next_for_click_targeting".to_string(), + Value::String( + "move_to_text_then_click_or_mouse_move_screen_globals_then_click" + .to_string(), + ), + ); + } else { + let rec = format!( + "move_to_text_first_then_{}", + "screenshot_navigate_quadrant_until_click_ready" + ); + obj.insert( + "recommended_next_for_click_targeting".to_string(), + Value::String(rec), + ); + } } } let attach = ToolImageAttachment { @@ -288,7 +395,7 @@ impl ComputerUseTool { }; let pointer_line = match (shot.pointer_image_x, shot.pointer_image_y) { (Some(px), Some(py)) => format!( - " TRUE POINTER: **red cursor with gray border** (tip = hotspot) in the JPEG marks the mouse at this pixel — coordinate_mode \"image\" **ComputerUseMousePrecise** target x={}, y={}. Align moves so the **tip** sits on your click target, then **ComputerUseMouseClick** (`action`: click). 
Prior screenshot is stale after **ComputerUseMousePrecise** / **ComputerUseMouseStep** / `pointer_move_rel` until you screenshot again.", + " TRUE POINTER: **red cursor with gray border** (tip = hotspot) in the JPEG at image x={}, y={} — **confirmation only**; use **`mouse_move`** with **`use_screen_coordinates`: true** using globals from tool JSON (`pointer_global`, `move_to_text`, `locate`), then **`click`**. **Do not** use **`pointer_move_rel`** / **ComputerUseMouseStep** as the next action after this **`screenshot`** (host blocks). Prior screenshot is stale after **ComputerUseMousePrecise** / **ComputerUseMouseStep** / `pointer_move_rel` until you screenshot again.", px, py ), _ => " TRUE POINTER: not on this capture (pointer_image_x/y null). No red synthetic cursor — OS mouse may be on another display; use use_screen_coordinates with global coords or bring the pointer here and re-screenshot." @@ -305,7 +412,7 @@ impl ComputerUseTool { .unwrap_or_default(); let hint = if let Some(c) = shot.screenshot_crop_center { format!( - "Region crop screenshot {}x{} around full-display native center ({}, {}). Use `image` coords in **this** bitmap only.{}.{} After pointer moves, screenshot again before click (host).", + "Region crop screenshot {}x{} around full-display native center ({}, {}). **Confirm** UI state here — do **not** use JPEG pixels for `mouse_move`.{}.{} After pointer moves, screenshot again before click (host).", shot.image_width, shot.image_height, c.x, @@ -315,7 +422,7 @@ impl ComputerUseTool { ) } else if shot.quadrant_navigation_click_ready { format!( - "Quadrant terminal {}x{} (native region {:?}). **`quadrant_navigation_click_ready`**: use `image` coords on this JPEG, then **ComputerUseMousePrecise** / **ComputerUseMouseStep** + **`ComputerUseMouseClick`** (`action`: click).{}.{}", + "Quadrant terminal {}x{} (native region {:?}). 
**`quadrant_navigation_click_ready`**: align with **ComputerUseMouseStep** / **`mouse_move`** (**`use_screen_coordinates`: true** only) / **ComputerUseMousePrecise**, then **`ComputerUseMouseClick`** (`action`: click) — **`click`** has no coordinates.{}.{}", shot.image_width, shot.image_height, shot.navigation_native_rect, @@ -335,7 +442,7 @@ impl ComputerUseTool { let nx = shot.native_width.saturating_sub(1); let ny = shot.native_height.saturating_sub(1); format!( - "Full screenshot {}x{} (vision_scale={}). Rulers + grid: **native** 0..={} x 0..={}. **Quadrant drill is not automatic** — the next narrowing step must set **`screenshot_navigate_quadrant`** on `screenshot` (repeat until `quadrant_navigation_click_ready`), or use point crop (`screenshot_crop_center_*`).{}.{} After pointer moves, fresh fine screenshot before click; Return/Enter in key_chord needs fresh screenshot (host).", + "Full screenshot {}x{} (vision_scale={}). **Display native** range **0..={}** x **0..={}** (JPEG matches this rect for **confirmation**). **Targeting:** prefer **`move_to_text`** when text is visible; **`screenshot` + SoM/quad** is lowest priority. If SoM labels are visible, prefer `click_label`. **`mouse_move`** uses **`use_screen_coordinates`: true** with globals from tools — **not** JPEG guesses; then **`click`** when allowed (see `interaction_state`). 
**Only** guarded **`click`** / Enter **`key_chord`** need a fresh capture after pointer moves (see shortcut_policy).{}.{}", shot.image_width, shot.image_height, shot.vision_scale, @@ -363,130 +470,6 @@ impl ComputerUseTool { } } - fn parse_screenshot_crop_center(input: &Value) -> BitFunResult> { - let xv = input.get("screenshot_crop_center_x"); - let yv = input.get("screenshot_crop_center_y"); - let x_none = xv.map_or(true, |v| v.is_null()); - let y_none = yv.map_or(true, |v| v.is_null()); - match (x_none, y_none) { - (true, true) => Ok(None), - (false, false) => { - let x = xv - .and_then(|v| v.as_u64()) - .ok_or_else(|| { - BitFunError::tool( - "screenshot_crop_center_x must be a non-negative integer (full-display native pixels)." - .to_string(), - ) - })?; - let y = yv - .and_then(|v| v.as_u64()) - .ok_or_else(|| { - BitFunError::tool( - "screenshot_crop_center_y must be a non-negative integer (full-display native pixels)." - .to_string(), - ) - })?; - let x = u32::try_from(x).map_err(|_| { - BitFunError::tool("screenshot_crop_center_x is too large.".to_string()) - })?; - let y = u32::try_from(y).map_err(|_| { - BitFunError::tool("screenshot_crop_center_y is too large.".to_string()) - })?; - Ok(Some(ScreenshotCropCenter { x, y })) - } - _ => Err(BitFunError::tool( - "screenshot_crop_center_x and screenshot_crop_center_y must both be set or both omitted for action screenshot." - .to_string(), - )), - } - } - - /// Optional half-extent for point crop (native px); host clamps to [COMPUTER_USE_POINT_CROP_HALF_MIN, MAX]. 
- fn parse_screenshot_crop_half_extent_native(input: &Value) -> BitFunResult> { - match input.get("screenshot_crop_half_extent_native") { - None => Ok(None), - Some(v) if v.is_null() => Ok(None), - Some(v) => { - let n = v.as_u64().ok_or_else(|| { - BitFunError::tool( - "screenshot_crop_half_extent_native must be a non-negative integer.".to_string(), - ) - })?; - let n = u32::try_from(n).map_err(|_| { - BitFunError::tool("screenshot_crop_half_extent_native is too large.".to_string()) - })?; - Ok(Some(n)) - } - } - } - - /// True if the client sent non-null `screenshot_crop_center_x` and/or `y` (often `0` placeholders). - fn input_has_screenshot_crop_fields(input: &Value) -> bool { - let x = input.get("screenshot_crop_center_x"); - let y = input.get("screenshot_crop_center_y"); - x.map_or(false, |v| !v.is_null()) || y.map_or(false, |v| !v.is_null()) - } - - fn parse_screenshot_navigate_quadrant(input: &Value) -> BitFunResult> { - let v = input - .get("screenshot_navigate_quadrant") - .filter(|x| !x.is_null()) - .and_then(|x| x.as_str()); - let Some(s) = v else { - return Ok(None); - }; - let n = s.trim().to_ascii_lowercase().replace('-', "_"); - Ok(Some(match n.as_str() { - "top_left" | "topleft" | "upper_left" => ComputerUseNavigateQuadrant::TopLeft, - "top_right" | "topright" | "upper_right" => ComputerUseNavigateQuadrant::TopRight, - "bottom_left" | "bottomleft" | "lower_left" => ComputerUseNavigateQuadrant::BottomLeft, - "bottom_right" | "bottomright" | "lower_right" => ComputerUseNavigateQuadrant::BottomRight, - _ => { - return Err(BitFunError::tool( - "screenshot_navigate_quadrant must be one of: top_left, top_right, bottom_left, bottom_right." - .to_string(), - )); - } - })) - } - - /// Second return value: crop fields were present but ignored because quadrant navigation wins. 
- fn parse_screenshot_params(input: &Value) -> BitFunResult<(ComputerUseScreenshotParams, bool)> { - let navigate = Self::parse_screenshot_navigate_quadrant(input)?; - let reset_navigation = input - .get("screenshot_reset_navigation") - .and_then(|v| v.as_bool()) - .unwrap_or(false); - if navigate.is_some() { - let ignored_crop = Self::input_has_screenshot_crop_fields(input); - return Ok(( - ComputerUseScreenshotParams { - crop_center: None, - navigate_quadrant: navigate, - reset_navigation, - point_crop_half_extent_native: None, - }, - ignored_crop, - )); - } - let crop = Self::parse_screenshot_crop_center(input)?; - let half = if crop.is_some() { - Self::parse_screenshot_crop_half_extent_native(input)? - } else { - None - }; - Ok(( - ComputerUseScreenshotParams { - crop_center: crop, - navigate_quadrant: None, - reset_navigation, - point_crop_half_extent_native: half, - }, - false, - )) - } - } /// JSON for `snapshot_coordinate_basis` in mouse tool results (last screenshot refinement). 
@@ -525,11 +508,12 @@ pub(crate) async fn computer_use_execute_mouse_precise( host_ref: &dyn crate::agentic::tools::computer_use_host::ComputerUseHost, input: &Value, ) -> BitFunResult> { + ensure_pointer_move_uses_screen_coordinates_only(input)?; let snapshot_basis = computer_use_snapshot_coordinate_basis(host_ref); let x = req_i32(input, "x")?; let y = req_i32(input, "y")?; - let mode = ComputerUseTool::coordinate_mode(input); - let use_screen = ComputerUseTool::use_screen_coordinates(input); + let mode = coordinate_mode(input); + let use_screen = use_screen_coordinates(input); let (sx64, sy64) = ComputerUseTool::resolve_xy_f64(host_ref, input, x, y)?; host_ref.mouse_move_global_f64(sx64, sy64).await?; let sx = sx64.round() as i32; @@ -636,8 +620,20 @@ pub(crate) async fn computer_use_execute_mouse_click_tool( .get("button") .and_then(|v| v.as_str()) .unwrap_or("left"); - host_ref.mouse_click(button).await?; - let input_coords = json!({ "kind": "mouse_click", "action": "click", "button": button }); + let num_clicks = input + .get("num_clicks") + .and_then(|v| v.as_u64()) + .unwrap_or(1) + .clamp(1, 3) as u32; + for _ in 0..num_clicks { + host_ref.mouse_click(button).await?; + } + let click_label = match num_clicks { + 2 => "double", + 3 => "triple", + _ => "single", + }; + let input_coords = json!({ "kind": "mouse_click", "action": "click", "button": button, "num_clicks": num_clicks }); let body = computer_use_augment_result_json( host_ref, json!({ @@ -645,11 +641,12 @@ pub(crate) async fn computer_use_execute_mouse_click_tool( "tool": "ComputerUseMouseClick", "action": "click", "button": button, + "num_clicks": num_clicks, }), Some(input_coords), ) .await; - let summary = format!("{} click at current pointer (does not move).", button); + let summary = format!("{} {} click at current pointer (does not move).", button, click_label); Ok(vec![ToolResult::ok(body, Some(summary))]) } "wheel" => { @@ -688,6 +685,70 @@ pub(crate) async fn 
computer_use_execute_mouse_click_tool( } } +/// Helper: build `UiElementLocateQuery` from tool input JSON. +fn parse_locate_query(input: &Value) -> UiElementLocateQuery { + UiElementLocateQuery { + title_contains: input.get("title_contains").and_then(|v| v.as_str()).map(|s| s.to_string()), + role_substring: input.get("role_substring").and_then(|v| v.as_str()).map(|s| s.to_string()), + identifier_contains: input.get("identifier_contains").and_then(|v| v.as_str()).map(|s| s.to_string()), + max_depth: input.get("max_depth").and_then(|v| v.as_u64()).map(|v| v as u32), + filter_combine: input.get("filter_combine").and_then(|v| v.as_str()).map(|s| s.to_string()), + } +} + +fn parse_ocr_region_native( + input: &Value, +) -> BitFunResult> { + let v = input.get("ocr_region_native").or_else(|| input.get("ocr_region")); + let Some(val) = v else { + return Ok(None); + }; + if val.is_null() { + return Ok(None); + } + let o = val.as_object().ok_or_else(|| { + BitFunError::tool( + "ocr_region_native must be an object { x0, y0, width, height } in global native pixels." + .to_string(), + ) + })?; + let x0 = o + .get("x0") + .and_then(|x| x.as_i64()) + .ok_or_else(|| BitFunError::tool("ocr_region_native.x0 (integer) is required.".to_string()))? + as i32; + let y0 = o + .get("y0") + .and_then(|x| x.as_i64()) + .ok_or_else(|| BitFunError::tool("ocr_region_native.y0 (integer) is required.".to_string()))? + as i32; + let width = o + .get("width") + .and_then(|x| x.as_u64()) + .ok_or_else(|| { + BitFunError::tool("ocr_region_native.width (positive integer) is required.".to_string()) + })? as u32; + let height = o + .get("height") + .and_then(|x| x.as_u64()) + .ok_or_else(|| { + BitFunError::tool("ocr_region_native.height (positive integer) is required.".to_string()) + })? 
as u32; + if width == 0 || height == 0 { + return Err(BitFunError::tool( + "ocr_region_native width and height must be greater than zero.".to_string(), + )); + } + Ok(Some( + crate::agentic::tools::computer_use_host::OcrRegionNative { + x0, + y0, + width, + height, + }, + )) +} + #[async_trait] impl Tool for ComputerUseTool { fn name(&self) -> &str { @@ -697,111 +758,76 @@ impl Tool for ComputerUseTool { async fn description(&self) -> BitFunResult { let os = Self::host_os_label(); let keys = Self::key_chord_os_hint(); - let hmin = COMPUTER_USE_POINT_CROP_HALF_MIN; - let hmax = COMPUTER_USE_POINT_CROP_HALF_MAX; Ok(format!( - "Desktop Computer use (host OS: {}). {} \ -**Automation priority (read order — same as Claw `claw_mode` “Computer use”):** (1) **Terminal** — **`Bash`** / **`TerminalControl`** — workspace shell; on **macOS** use **`open -a \"AppName\"`** to launch/focus apps (e.g. WeChat) **instead of** Spotlight+Return when possible (do **not** assume “computer use” = only `ComputerUse*` tools). (2) **System shortcuts** — **`key_chord`** for OS-wide actions and **system clipboard** (see hint below). (3) **Application shortcuts** — **`key_chord`** when the right app is focused. (4) **This tool — `action: locate`** — **named** controls in the **foreground** app (`AX` / UIA / AT-SPI); when it matches, you may **move** with **`coordinate_hints`** **without** an immediate full-frame **`screenshot`**; use **`action: screenshot`** with **`screenshot_crop_center_*`** / **`screenshot_crop_half_extent_native`** **when** you need a JPEG for vision (host clamps {}..{} per half). (5) **`type_text`** — short input, paste-blocked fields, or after the above failed. (6) **Vision / mouse** — only when (1)–(4) do not suffice. Prefer **paste** over **`type_text`** for long or duplicated content; do **not** drive the mouse to Edit → Copy/Paste when chords exist. 
**Do not** spam **`screenshot`** between unrelated actions — host mainly requires fresh capture before **click** and **Return/Enter**. \ -**`screenshot` image layout (read this):** Every **`screenshot`** returns a JPEG with **white margins on all four sides** showing **numeric coordinate tick labels** (full-capture native pixel indices — the same scale on full-screen and point-crop shots), and a **line grid** drawn on the captured desktop **inside** those margins. Read x/y from the **top/bottom/left/right** margin numbers to aim moves and for **point crop** (`screenshot_crop_center_*`) when that path is justified. The inner bitmap (below the rulers) is the live capture. \ -**Default before `ComputerUseMouseClick` (`action`: click) (mouse path):** After the **first** full **`screenshot`**, **if `action: locate` gave a native center:** use **`screenshot`** with **`screenshot_crop_center_*`** (+ optional **`screenshot_crop_half_extent_native`**) to narrow the view **first**. **Else** set **`screenshot_navigate_quadrant`** (one of `top_left`, `top_right`, `bottom_left`, `bottom_right`) on the next **`screenshot`** — **do not** refresh full screen repeatedly without `screenshot_navigate_quadrant` or a point crop. Chain **`screenshot` + `screenshot_navigate_quadrant`** until **`quadrant_navigation_click_ready`: true** in the tool JSON, then **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick`**. Tool results may include **`recommended_next_for_click_targeting`** — obey it. \ -**Shortcut-first (default):** When a **standard OS or in-app shortcut** or **clipboard chord** achieves the same step (e.g. New/Open/Save, Copy/Cut/Paste, Undo/Redo, Find, Close tab/window, Quit, Refresh, tab/window switch, focus address bar, select all), you **must prefer `key_chord`** over moving the pointer and clicking — **do not** default to mouse for actions that have a well-known chord on this host. 
Use pointer + screenshots when **no** suitable shortcut exists, the target is only reachable by mouse, menus show no shortcut, or a shortcut attempt clearly failed (then **screenshot** and reassess). \ -**Between non-click steps:** **`computer_use_context`** often suffices; add **`screenshot`** when you need pixels or before **click / Enter** per host rules — **not** after every `key_chord` / `type_text` / `locate`. \ -**No blind submit or click (unchanged):** before **`ComputerUseMouseClick` (`action`: click)** (any button) and before **`key_chord` that sends Return/Enter** (or any key that submits/confirms), you **must** run **`screenshot` first** and visually confirm focus and target — **never** click or press Enter without a fresh screenshot when the outcome matters. Same discipline after moving the pointer. \ -**Quadrant drill (vision zoom; not automatic):** The app **never** splits the screen by itself. After an initial full **`screenshot`**, **when DOM is unavailable**, **each** narrowing step is **`screenshot` + `screenshot_navigate_quadrant`** ∈ {{`top_left`,`top_right`,`bottom_left`,`bottom_right`}} — omitting that field only **refreshes** full screen (or the current drill region). The host returns the chosen quarter **plus {} px on each side** (clamped); rulers stay **full-display native**. Repeat until **`quadrant_navigation_click_ready`: true** (longest native side < {} px), then **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** and **`ComputerUseMouseClick` (`action`: click)**. **`screenshot_reset_navigation`**: true restarts from full display. **If `screenshot_navigate_quadrant` is set, `screenshot_crop_center_*` are ignored**. **Point crop** (`screenshot_crop_center_*` ± optional half-extent) is **preferred when DOM supplies `native_center_*`**; otherwise use quadrant drill. 
\ -**Screenshot zoom:** When you must **confirm** small text, dense UI, or the **red cursor** tip, **proactively** zoom — **DOM + point crop** when possible; else quadrant drill — **do not** rely only on huge full-display images when a smaller view answers the question. \ -**Pointer positioning (separate tools):** **`ComputerUseMousePrecise`** — absolute `x`/`y` with `coordinate_mode` / `use_screen_coordinates`. **`ComputerUseMouseStep`** — cardinal `direction` (`up`|`down`|`left`|`right`) and optional `pixels` (default 32, clamped 1..400; same screenshot-pixel space as `pointer_move_rel`). For **small** nudges onto a control, prefer **`ComputerUseMouseStep`** over tiny absolute coords. **`pointer_move_rel`** — arbitrary `delta_x`/`delta_y` when diagonal or non-cardinal deltas are needed. **`ComputerUseMouseClick`** — `action` **`click`** (button at pointer) or **`wheel`** (scroll wheel `delta_x`/`delta_y` at pointer); does not move the pointer. \ -**Host (desktop):** Call **`screenshot`** when you need current pixels; there is **no** automatic follow-up capture after other actions. Before **`ComputerUseMouseClick` (`action`: click)**, after pointer moves, the host requires a fresh **fine** basis: **`quadrant_navigation_click_ready`** (preferred path) **or** a **point crop** — **full-screen-only** is **not** enough. Before **`key_chord`** with **Return/Enter**, a fresh **`screenshot`** (any mode) is required. Numeric fields in each tool result JSON are authoritative for that frame. \ -Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), **grid on the capture**, and a **synthetic mouse marker** when the pointer is on that display (**red** with **gray border**; **tip** = hotspot, same as **`pointer_image_x` / `pointer_image_y`**). On macOS, **`ComputerUseMousePrecise`** uses sub-point Quartz when applicable. Also **wait**. **Per `action`:** send **only** the parameters that apply (e.g. 
for `screenshot` do not send `keys` or fields meant for **`ComputerUseMousePrecise`**) — extra keys may confuse you or the UI. macOS: Accessibility for the running binary.", - os, - keys, - hmin, - hmax, - COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX, - COMPUTER_USE_QUADRANT_CLICK_READY_MAX_LONG_EDGE + "Desktop automation (host OS: {}). {} All actions in one tool. Send only parameters that apply to the chosen `action`. \ +**Targeting priority:** `click_element` → **`move_to_text`** (OCR + move pointer only) → `click_label` (when SoM exists) → **`screenshot`** (confirm / drill) + **`mouse_move`** (**`use_screen_coordinates`: true only**) + **`click`** last. **Screenshots are for confirmation — do not guess move targets from JPEG pixels.** \ +**`click_element`:** Accessibility tree (AX/UIA/AT-SPI) locate + click. Provide `title_contains` / `role_substring` / `identifier_contains`. Bypasses coordinate screenshot guard. \ +**`move_to_text`:** OCR-match visible text (`text_query`) and **move the pointer** to it (no click, no keys); **no prior `screenshot` required for targeting** (host captures **raw** pixels for Vision — no agent screenshot overlays; on macOS defaults to the **frontmost window** unless **`ocr_region_native`** overrides). Use **`click`** afterward if you need a mouse press. Prefer after `click_element` misses when text is visible. \ +**`click_label`:** After `screenshot` with `som_labels`, click by label number. Bypasses coordinate guard. \ +**`click`:** Press at **current pointer only** — **never** pass `x`, `y`, `coordinate_mode`, or `use_screen_coordinates`. Position first with **`move_to_text`**, **`mouse_move`** (**globals only**), or **`click_element`**. After pointer moves, **`screenshot`** again before the next guarded **`click`** when the host requires it. \ +**`mouse_move` / `drag`:** **`use_screen_coordinates`: true** required — global coordinates from **`move_to_text`**, **`locate`**, AX, or **`pointer_global`**; never JPEG pixel guesses. 
\ +**`scroll` / `type_text` / `pointer_move_rel` / `wait` / `locate`:** No mandatory pre-screenshot by themselves. **`pointer_move_rel`** (and **ComputerUseMouseStep**) are **blocked immediately after `screenshot`** until **`move_to_text`**, **`mouse_move`** (globals), **`click_element`**, or **`click_label`** — do not nudge from the JPEG. \ +**`key_chord`:** Press key combination. **Mandatory fresh screenshot only** when chord includes Return/Enter. \ +**`screenshot`:** JPEG for **confirmation** (optional pointer + SoM). When the host requires a fresh capture before **`click`** or Enter **`key_chord`**, a bare `screenshot` is **~500×500** around the **mouse** or **caret** (also during quadrant drill). Use **`screenshot_reset_navigation`**: true to force **full-screen** for wide context. \ +**`type_text`:** Type text; prefer clipboard for long content.", + os, keys, )) } async fn description_with_context( &self, - context: Option<&ToolUseContext>, + _context: Option<&ToolUseContext>, ) -> BitFunResult { - let base = self.description().await?; - if context.and_then(|c| c.agent_type.as_deref()) == Some("Claw") { - Ok(format!( - "**Claw:** **`action: locate`** (accessibility) is the same tool as **`screenshot`** / **`key_chord`**. Use **`locate`** for **named** UI when AX exposes it; **do not** call **`screenshot`** after every **`locate`** / **`key_chord`** / **`type_text`** — only when you need pixels, or before **click** / **Return·Enter** (host). 
See `claw_mode` **Screenshot cadence**.\n\n{}", - base - )) - } else { - Ok(base) - } + self.description().await } fn input_schema(&self) -> Value { - let qpad = COMPUTER_USE_QUADRANT_EDGE_EXPAND_PX; json!({ "type": "object", "properties": { "action": { "type": "string", - "enum": ["screenshot", "locate", "pointer_move_rel", "key_chord", "type_text", "wait"], - "description": format!("**Same tool, different `action`:** **`locate`** — accessibility tree match on the **foreground** window (JSON only, no JPEG); use **`title_contains`** / **`role_substring`** / **`identifier_contains`** and optional **`filter_combine`**: **`all`** (default, AND) or **`any`** (OR) when one node has role but not title. **Before** ruler-only **`screenshot`** for named rows/buttons. **`screenshot`** — JPEG with **margin coordinate scales** + **grid**. **After `locate` matched:** prefer **`screenshot_crop_center_*`** + optional **`screenshot_crop_half_extent_native`** from the locate result **before** a long quadrant-only chain. **`key_chord`** — shortcuts + clipboard. **Pointer moves:** **`ComputerUseMousePrecise`**, **`ComputerUseMouseStep`**. **Click / wheel:** **`ComputerUseMouseClick`**. **When locate did not match:** **`screenshot_navigate_quadrant`** — 4-way drill; chosen quadrant **plus {} px per side** (clamped). Repeat until tool JSON `quadrant_navigation_click_ready`. **Modes:** (1) Plain / refresh — same region or full display (no narrowing). (2) **`screenshot_navigate_quadrant`**. (3) **`screenshot_reset_navigation`**: true — full display base. (4) **`screenshot_crop_center_*`** ± **`screenshot_crop_half_extent_native`** — point crop. **Precedence:** if `screenshot_navigate_quadrant` is set, **`screenshot_crop_center_*` are ignored**. **Prefer** sending **only** fields relevant to `screenshot` for this call. When **`quadrant_navigation_click_ready`** is true, you may **`ComputerUseMousePrecise`** / **`ComputerUseMouseStep`** + **`ComputerUseMouseClick`**. 
**Other actions:** `key_chord` + clipboard before `type_text`; red synthetic cursor when the mouse is on this display.", qpad) - }, - "delta_x": { "type": "integer", "description": "For pointer_move_rel only: horizontal delta in screenshot/display pixels (negative=left). On macOS converted via last screenshot scale; screenshot first." }, - "delta_y": { "type": "integer", "description": "For pointer_move_rel only: vertical delta in screenshot/display pixels (negative=up). On macOS converted via last screenshot scale; screenshot first." }, - "keys": { "type": "array", "items": { "type": "string" }, "description": "For key_chord: **prefer this action** for standard shortcuts **and** **system clipboard** (e.g. select all + copy/cut/paste per host — see tool description OS hint). Do not use mouse menus for Copy/Paste when these chords work. OS-specific key names per Environment Information. If the chord includes **return** / **enter** (submit/confirm), **`screenshot` first** and verify — **no blind Enter.** Otherwise screenshot when the next action depends on UI." }, - "text": { "type": "string", "description": "For type_text: short or paste-blocked input only — **prefer `key_chord` paste** (and focus/select chords) when inserting longer or duplicated content from the system clipboard. Then screenshot if you need to confirm focus or field content before further steps." }, - "ms": { "type": "integer", "description": "Wait duration in milliseconds" }, - "title_contains": { - "type": "string", - "description": "For **`action: locate`** only: case-insensitive substring on accessible title (AXTitle / etc.). Prefer the **same language as the app UI**. Optional if other filters match." - }, - "role_substring": { - "type": "string", - "description": "For **`action: locate`** only: case-insensitive substring on AXRole (e.g. \"Button\", \"AXButton\")." 
- }, - "identifier_contains": { - "type": "string", - "description": "For **`action: locate`** only: case-insensitive substring on AXIdentifier when present." - }, - "max_depth": { - "type": "integer", - "minimum": 1, - "maximum": 200, - "description": "For **`action: locate`** only: max BFS depth from the frontmost application root (default 48)." - }, - "filter_combine": { - "type": "string", - "enum": ["all", "any"], - "description": "For **`action: locate`** only: **`all`** (default) — every non-empty filter must match the **same** element (AND). **`any`** — match if **any** non-empty filter matches (OR). Use **`any`** when a field has a **role** (e.g. `AXTextField`) but **empty or different AXTitle** than your `title_contains` (common for search boxes). Prefer **one** filter (`role_substring` alone or `title_contains` alone) when unsure." - }, - "screenshot_crop_center_x": { - "type": "integer", - "minimum": 0, - "description": "For action `screenshot` only (point crop): X center in **full-capture native** pixels — same as margin tick labels on a prior full-screen shot. Pair with `screenshot_crop_center_y`. Optional **`screenshot_crop_half_extent_native`** adjusts crop size (default half=250 → ~500×500). Omit **both** centers when using `screenshot_navigate_quadrant` or plain refresh. **Ignored** if `screenshot_navigate_quadrant` is set." - }, - "screenshot_crop_center_y": { - "type": "integer", - "minimum": 0, - "description": "For action `screenshot` only (point crop): Y center in **full-capture native** pixels; pair with `screenshot_crop_center_x`. Omit **both** for quadrant drill or plain refresh. **Ignored** if `screenshot_navigate_quadrant` is set." - }, - "screenshot_crop_half_extent_native": { - "type": "integer", - "minimum": 0, - "description": format!( - "For action `screenshot` only, with **`screenshot_crop_center_*`**: half-size of the crop in **native** pixels (total region ≈ `2 × half`). Host clamps to {}..{}. Omit for default **250** (~500×500). 
After **`action: locate`**, copy **`coordinate_hints.screenshot_point_crop.screenshot_crop_half_extent_native`** when available for tighter crops around small controls.", - COMPUTER_USE_POINT_CROP_HALF_MIN, - COMPUTER_USE_POINT_CROP_HALF_MAX - ) + "enum": ["screenshot", "click_element", "click_label", "move_to_text", "click", "mouse_move", "scroll", "drag", "locate", "key_chord", "type_text", "pointer_move_rel", "wait"], + "description": "The action to perform. `click_element` = find UI element by accessibility + click (preferred for named controls). `click_label` = click a numbered Set-of-Mark label from the latest screenshot. `move_to_text` = OCR visible text and move pointer **only** (no click). **After `click_element` fails, prefer `move_to_text` (visible substring, UI language) over vision guesses.** `click` = press at **current pointer only** — **do not** pass `x`, `y`, `coordinate_mode`, or `use_screen_coordinates` (use `mouse_move` first). `mouse_move` = absolute move with **`use_screen_coordinates`: true** (globals from tools — **no** JPEG pixel mode). `scroll` = mouse wheel. `drag` = drag between two globals (**`use_screen_coordinates`: true**). `screenshot` = confirmation JPEG (host may apply ~500×500 when required). `locate` = find UI element (no click). `key_chord` = keyboard shortcut. `type_text` = type string. `pointer_move_rel` = relative move — **host blocks right after `screenshot`** until a trusted absolute move (`move_to_text`, `mouse_move`, `click_element`, `click_label`). `wait` = pause." }, - "screenshot_navigate_quadrant": { - "type": "string", - "enum": ["top_left", "top_right", "bottom_left", "bottom_right"], - "description": format!("For action `screenshot` only: **set this on the next screenshot after a full-frame shot** (default path before click). Pick one quadrant of the **current** region (or full display after reset); host returns that tile + **{} px** padding per side (clamped). 
Enum: `top_left`, `top_right`, `bottom_left`, `bottom_right`. **Takes precedence:** any `screenshot_crop_center_*` in the same call are **ignored**.", qpad) + "x": { "type": "integer", "description": "For `mouse_move` and `drag`: X in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, + "y": { "type": "integer", "description": "For `mouse_move` and `drag`: Y in **global display** units when **`use_screen_coordinates`: true** (required). **Not** for `click`." }, + "coordinate_mode": { "type": "string", "enum": ["image", "normalized"], "description": "Ignored for `mouse_move` / `drag` — host rejects image/normalized positioning; always set **`use_screen_coordinates`: true**." }, + "use_screen_coordinates": { "type": "boolean", "description": "For `mouse_move`, `drag`: **must be true** — global display coordinates (e.g. macOS points) from `move_to_text`, `locate`, AX, or `pointer_global`. **Not** for `click`." }, + "button": { "type": "string", "enum": ["left", "right", "middle"], "description": "For `click`, `click_element`, `drag`: mouse button (default left)." }, + "num_clicks": { "type": "integer", "minimum": 1, "maximum": 3, "description": "For `click`, `click_element`: 1=single (default), 2=double, 3=triple click." }, + "delta_x": { "type": "integer", "description": "For `pointer_move_rel`: horizontal delta (negative=left). **Not** allowed as the first move after `screenshot` (host). For `scroll`: horizontal wheel delta." }, + "delta_y": { "type": "integer", "description": "For `pointer_move_rel`: vertical delta (negative=up). **Not** allowed as the first move after `screenshot` (host). For `scroll`: vertical wheel delta." }, + "start_x": { "type": "integer", "description": "For `drag`: start X coordinate." }, + "start_y": { "type": "integer", "description": "For `drag`: start Y coordinate." }, + "end_x": { "type": "integer", "description": "For `drag`: end X coordinate." 
}, + "end_y": { "type": "integer", "description": "For `drag`: end Y coordinate." }, + "keys": { "type": "array", "items": { "type": "string" }, "description": "For `key_chord`: key names to press together. Use OS-appropriate modifier names. Host requires a fresh screenshot only before chords that include Return/Enter (not before other chords)." }, + "text": { "type": "string", "description": "For `type_text`: text to type. Prefer clipboard paste (key_chord) for long content." }, + "ms": { "type": "integer", "description": "For `wait`: duration in milliseconds." }, + "label": { "type": "integer", "minimum": 1, "description": "For `click_label`: 1-based Set-of-Mark label number from the latest screenshot." }, + "text_query": { "type": "string", "description": "For `move_to_text`: visible text to OCR-match on screen (case-insensitive substring)." }, + "ocr_region_native": { + "type": "object", + "description": "For `move_to_text`: optional global native rectangle for OCR. If omitted, macOS uses the frontmost window bounds from Accessibility; other OSes use the primary display. Overrides the automatic region when set. Requires x0, y0, width, height.", + "properties": { + "x0": { "type": "integer", "description": "Top-left X in global screen coordinates (macOS: same logical space as CGDisplayBounds / pointer; not physical Retina pixels)." }, + "y0": { "type": "integer", "description": "Top-left Y in global screen coordinates (macOS: logical, Y-down)." }, + "width": { "type": "integer", "minimum": 1, "description": "Width in the same coordinate unit as x0/y0 (logical on macOS)." }, + "height": { "type": "integer", "minimum": 1, "description": "Height in the same coordinate unit as x0/y0 (logical on macOS)." } + } }, - "screenshot_reset_navigation": { - "type": "boolean", - "description": "For action `screenshot` only: if true, clear quadrant navigation before this capture so the base region is the **full** display (then apply `screenshot_navigate_quadrant` if set)." 
- } + "title_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring match on accessible title (AXTitle). Use same language as the app UI." }, + "role_substring": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXRole (e.g. \"Button\", \"TextField\")." }, + "identifier_contains": { "type": "string", "description": "For `locate`, `click_element`: case-insensitive substring on AXIdentifier." }, + "max_depth": { "type": "integer", "minimum": 1, "maximum": 200, "description": "For `locate`, `click_element`: max BFS depth (default 48)." }, + "filter_combine": { "type": "string", "enum": ["all", "any"], "description": "For `locate`, `click_element`: `all` (default, AND) or `any` (OR) for filter combination." }, + "screenshot_crop_center_x": { "type": "integer", "minimum": 0, "description": "For `screenshot`: point crop X center in full-capture native pixels." }, + "screenshot_crop_center_y": { "type": "integer", "minimum": 0, "description": "For `screenshot`: point crop Y center in full-capture native pixels." }, + "screenshot_crop_half_extent_native": { "type": "integer", "minimum": 0, "description": "For `screenshot`: half-size of point crop in native pixels (default 250)." }, + "screenshot_navigate_quadrant": { "type": "string", "enum": ["top_left", "top_right", "bottom_left", "bottom_right"], "description": "For `screenshot`: zoom into quadrant. Repeat until `quadrant_navigation_click_ready` is true." }, + "screenshot_reset_navigation": { "type": "boolean", "description": "For `screenshot`: reset to full display before this capture." 
}, + "screenshot_implicit_center": { "type": "string", "enum": ["mouse", "text_caret"], "description": "For `screenshot` when `requires_fresh_screenshot_before_click` / `requires_fresh_screenshot_before_enter` is true: center the implicit ~500×500 on the mouse (`mouse`, default) or on the focused text control (`text_caret`, macOS AX; falls back to mouse). Applies to the **first** confirmation capture too. Ignored when you set `screenshot_crop_center_*` / `screenshot_navigate_quadrant` / `screenshot_reset_navigation`." } }, "required": ["action"], "additionalProperties": false @@ -857,9 +883,366 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** match action { "locate" => execute_computer_use_locate(input, context).await, + // ---- NEW: click_element (locate + move + click in one call) ---- + "click_element" => { + let query = parse_locate_query(input); + if query.title_contains.is_none() && query.role_substring.is_none() && query.identifier_contains.is_none() { + return Err(BitFunError::tool( + "click_element requires at least one of title_contains, role_substring, or identifier_contains.".to_string(), + )); + } + let button = input.get("button").and_then(|v| v.as_str()).unwrap_or("left"); + let num_clicks = input.get("num_clicks").and_then(|v| v.as_u64()).unwrap_or(1).clamp(1, 3) as u32; + + let res = host_ref.locate_ui_element_screen_center(query.clone()).await?; + + // Move pointer to AX center using global screen coordinates (authoritative). + host_ref.mouse_move_global_f64(res.global_center_x, res.global_center_y).await?; + + // Relaxed guard: AX coordinates are authoritative, no fine-screenshot needed. 
+ host_ref.computer_use_guard_click_allowed_relaxed()?; + + for _ in 0..num_clicks { + host_ref.mouse_click_authoritative(button).await?; + } + + let click_label = match num_clicks { 2 => "double", 3 => "triple", _ => "single" }; + let input_coords = json!({ + "kind": "click_element", + "query": { + "title_contains": query.title_contains, + "role_substring": query.role_substring, + "identifier_contains": query.identifier_contains, + "filter_combine": query.filter_combine, + }, + "button": button, + "num_clicks": num_clicks, + }); + let mut result_json = json!({ + "success": true, + "action": "click_element", + "matched_role": res.matched_role, + "matched_title": res.matched_title, + "matched_identifier": res.matched_identifier, + "global_center_x": res.global_center_x, + "global_center_y": res.global_center_y, + "button": button, + "num_clicks": num_clicks, + }); + if let Some(ref pc) = res.parent_context { + result_json["parent_context"] = json!(pc); + } + if res.total_matches > 1 { + result_json["total_matches"] = json!(res.total_matches); + result_json["warning"] = json!(format!( + "{} elements matched; clicked the best-ranked one. See other_matches if wrong.", + res.total_matches + )); + } + if !res.other_matches.is_empty() { + result_json["other_matches"] = json!(res.other_matches); + } + let body = computer_use_augment_result_json( + host_ref, + result_json, + Some(input_coords), + ) + .await; + let match_info = if res.total_matches > 1 { + format!(" ({} matches)", res.total_matches) + } else { + String::new() + }; + let summary = format!( + "AX click_element: {} {} click on role={} at ({:.0}, {:.0}).{}", + button, click_label, res.matched_role, res.global_center_x, res.global_center_y, + match_info, + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + + "click_label" => { + let label = input + .get("label") + .and_then(|v| v.as_u64()) + .ok_or_else(|| BitFunError::tool("click_label requires integer field `label`.".to_string()))? 
+ as u32; + if label == 0 { + return Err(BitFunError::tool("click_label label must be >= 1.".to_string())); + } + let button = input.get("button").and_then(|v| v.as_str()).unwrap_or("left"); + let num_clicks = input.get("num_clicks").and_then(|v| v.as_u64()).unwrap_or(1).clamp(1, 3) as u32; + + let latest_shot = host_ref.screenshot_peek_full_display().await?; + let matched = latest_shot + .som_labels + .iter() + .find(|e| e.label == label) + .cloned() + .ok_or_else(|| BitFunError::tool(format!( + "No SoM label {} found. Take a fresh screenshot first and use one of the returned som_labels.", + label + )))?; + + host_ref.mouse_move_global_f64(matched.global_center_x, matched.global_center_y).await?; + host_ref.computer_use_guard_click_allowed_relaxed()?; + for _ in 0..num_clicks { + host_ref.mouse_click_authoritative(button).await?; + } + + let input_coords = json!({ + "kind": "click_label", + "label": label, + "button": button, + "num_clicks": num_clicks, + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "action": "click_label", + "label": label, + "matched_role": matched.role, + "matched_title": matched.title, + "matched_identifier": matched.identifier, + "global_center_x": matched.global_center_x, + "global_center_y": matched.global_center_y, + "button": button, + "num_clicks": num_clicks, + }), + Some(input_coords), + ) + .await; + let summary = format!( + "SoM click_label: label={} role={} at ({:.0}, {:.0}).", + label, matched.role, matched.global_center_x, matched.global_center_y + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + + "move_to_text" => { + let text_query = input + .get("text_query") + .and_then(|v| v.as_str()) + .map(str::trim) + .filter(|s| !s.is_empty()) + .ok_or_else(|| { + BitFunError::tool( + "move_to_text requires non-empty string field `text_query`.".to_string(), + ) + })?; + let ocr_region_native = parse_ocr_region_native(input)?; + + { + let matches = Self::find_text_on_screen( + 
host_ref, + text_query, + ocr_region_native.clone(), + ) + .await?; + let matched = matches.first().cloned().ok_or_else(|| { + BitFunError::tool(format!( + "move_to_text found no visible OCR match for {:?}. Take a fresh screenshot and try a shorter or more distinctive substring, or use click_label / click_element.", + text_query + )) + })?; + + host_ref + .mouse_move_global_f64(matched.center_x, matched.center_y) + .await?; + + let other_matches = matches + .iter() + .skip(1) + .take(4) + .map(|m| { + json!({ + "text": m.text, + "confidence": m.confidence, + "center_x": m.center_x, + "center_y": m.center_y, + }) + }) + .collect::>(); + + let input_coords = json!({ + "kind": "move_to_text", + "text_query": text_query, + "ocr_region_native": &ocr_region_native, + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "action": "move_to_text", + "text_query": text_query, + "ocr_region_native": ocr_region_native, + "matched_text": matched.text, + "confidence": matched.confidence, + "global_center_x": matched.center_x, + "global_center_y": matched.center_y, + "bounds_left": matched.bounds_left, + "bounds_top": matched.bounds_top, + "bounds_width": matched.bounds_width, + "bounds_height": matched.bounds_height, + "total_matches": matches.len(), + "other_matches": other_matches, + }), + Some(input_coords), + ) + .await; + let summary = format!( + "OCR move_to_text: matched {:?} at ({:.0}, {:.0}).", + matched.text, matched.center_x, matched.center_y + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + } + + // ---- click: current pointer only; use `mouse_move` / `move_to_text` separately ---- + "click" => { + Self::ensure_click_has_no_coordinate_fields(input)?; + + let button = input.get("button").and_then(|v| v.as_str()).unwrap_or("left"); + let num_clicks = input.get("num_clicks").and_then(|v| v.as_u64()).unwrap_or(1).clamp(1, 3) as u32; + + host_ref.computer_use_guard_click_allowed()?; + + for _ in 0..num_clicks { + 
host_ref.mouse_click_authoritative(button).await?; + } + + let click_label = match num_clicks { 2 => "double", 3 => "triple", _ => "single" }; + let input_coords = json!({ + "kind": "click", + "button": button, + "num_clicks": num_clicks, + "at_current_pointer_only": true, + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "action": "click", + "button": button, + "num_clicks": num_clicks, + }), + Some(input_coords), + ) + .await; + let summary = format!( + "{} {} click at current pointer only (no move).", + button, click_label + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + + // ---- NEW: mouse_move (absolute pointer move, consolidated from ComputerUseMousePrecise) ---- + "mouse_move" => { + ensure_pointer_move_uses_screen_coordinates_only(input)?; + let x = req_i32(input, "x")?; + let y = req_i32(input, "y")?; + let (sx64, sy64) = Self::resolve_xy_f64(host_ref, input, x, y)?; + host_ref.mouse_move_global_f64(sx64, sy64).await?; + let mode = coordinate_mode(input); + let use_screen = use_screen_coordinates(input); + let input_coords = json!({ + "kind": "mouse_move", + "raw": { "x": x, "y": y, "coordinate_mode": mode, "use_screen_coordinates": use_screen }, + "resolved_global": { "x": sx64, "y": sy64 }, + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "action": "mouse_move", + "x": x, "y": y, + "pointer_x": sx64.round() as i32, + "pointer_y": sy64.round() as i32, + "coordinate_mode": mode, + "use_screen_coordinates": use_screen, + }), + Some(input_coords), + ) + .await; + let summary = format!( + "Moved pointer to (~{}, ~{}).", + sx64.round() as i32, sy64.round() as i32 + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + + // ---- NEW: scroll (consolidated from ComputerUseMouseClick wheel action) ---- + "scroll" => { + let dx = input.get("delta_x").and_then(|v| v.as_i64()).unwrap_or(0) as i32; + let dy = input.get("delta_y").and_then(|v| 
v.as_i64()).unwrap_or(0) as i32; + if dx == 0 && dy == 0 { + return Err(BitFunError::tool( + "scroll requires non-zero delta_x and/or delta_y".to_string(), + )); + } + host_ref.scroll(dx, dy).await?; + let input_coords = json!({ "kind": "scroll", "delta_x": dx, "delta_y": dy }); + let body = computer_use_augment_result_json( + host_ref, + json!({ "success": true, "action": "scroll", "delta_x": dx, "delta_y": dy }), + Some(input_coords), + ) + .await; + let summary = format!("Scrolled ({}, {}).", dx, dy); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + + // ---- NEW: drag (mouse_down at start + move to end + mouse_up) ---- + "drag" => { + ensure_pointer_move_uses_screen_coordinates_only(input)?; + let start_x = req_i32(input, "start_x")?; + let start_y = req_i32(input, "start_y")?; + let end_x = req_i32(input, "end_x")?; + let end_y = req_i32(input, "end_y")?; + let button = input.get("button").and_then(|v| v.as_str()).unwrap_or("left"); + + let (sx0, sy0) = Self::resolve_xy_f64(host_ref, input, start_x, start_y)?; + let (sx1, sy1) = Self::resolve_xy_f64(host_ref, input, end_x, end_y)?; + + // Move to start, press, move to end, release. + host_ref.mouse_move_global_f64(sx0, sy0).await?; + host_ref.mouse_down(button).await?; + // Small pause for apps that need time to register the press. 
+ host_ref.wait_ms(50).await?; + host_ref.mouse_move_global_f64(sx1, sy1).await?; + host_ref.wait_ms(50).await?; + host_ref.mouse_up(button).await?; + + let input_coords = json!({ + "kind": "drag", + "start": { "x": start_x, "y": start_y }, + "end": { "x": end_x, "y": end_y }, + "button": button, + }); + let body = computer_use_augment_result_json( + host_ref, + json!({ + "success": true, + "action": "drag", + "start_global": { "x": sx0.round() as i32, "y": sy0.round() as i32 }, + "end_global": { "x": sx1.round() as i32, "y": sy1.round() as i32 }, + "button": button, + }), + Some(input_coords), + ) + .await; + let summary = format!( + "Dragged from (~{}, ~{}) to (~{}, ~{}).", + sx0.round() as i32, sy0.round() as i32, + sx1.round() as i32, sy1.round() as i32, + ); + Ok(vec![ToolResult::ok(body, Some(summary))]) + } + "screenshot" => { Self::require_multimodal_tool_output_for_screenshot(context)?; - let (params, ignored_crop_for_quadrant) = Self::parse_screenshot_params(input)?; + let (params, ignored_crop_for_quadrant) = parse_screenshot_params(input)?; let crop_for_debug = params.crop_center; let nav_debug = params.navigate_quadrant.map(|q| match q { ComputerUseNavigateQuadrant::TopLeft => "nav_tl", @@ -868,6 +1251,10 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** ComputerUseNavigateQuadrant::BottomRight => "nav_br", }); let shot = host_ref.screenshot_display(params).await?; + // Update screenshot hash for visual change detection + let shot_hash = hash_screenshot_bytes(&shot.bytes); + host_ref.update_screenshot_hash(shot_hash); + let crop_for_debug = shot.screenshot_crop_center.or(crop_for_debug); let debug_rel = Self::try_save_screenshot_for_debug( &shot.bytes, context, @@ -879,8 +1266,9 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** "kind": "screenshot", "screenshot_reset_navigation": params.reset_navigation, "screenshot_crop_ignored_for_quadrant": ignored_crop_for_quadrant, - 
"screenshot_crop_center": params.crop_center.map(|c| json!({ "x": c.x, "y": c.y })), - "screenshot_crop_half_extent_native": params.point_crop_half_extent_native, + "screenshot_crop_center": shot.screenshot_crop_center.map(|c| json!({ "x": c.x, "y": c.y })), + "screenshot_crop_half_extent_native": shot.point_crop_half_extent_native, + "screenshot_implicit_confirmation_crop_applied": shot.implicit_confirmation_crop_applied, "screenshot_navigate_quadrant": params.navigate_quadrant.map(|q| match q { ComputerUseNavigateQuadrant::TopLeft => "top_left", ComputerUseNavigateQuadrant::TopRight => "top_right", @@ -1006,6 +1394,18 @@ Each **`screenshot`** JPEG: **four-side margin coordinate scales** (numbers), ** } } +#[derive(Debug, Clone)] +struct ScreenOcrTextMatch { + text: String, + confidence: f32, + center_x: f64, + center_y: f64, + bounds_left: f64, + bounds_top: f64, + bounds_width: f64, + bounds_height: f64, +} + fn req_i32(input: &Value, key: &str) -> BitFunResult { input .get(key) diff --git a/src/crates/core/src/agentic/tools/implementations/mod.rs b/src/crates/core/src/agentic/tools/implementations/mod.rs index 9b9e4199..91b1ea28 100644 --- a/src/crates/core/src/agentic/tools/implementations/mod.rs +++ b/src/crates/core/src/agentic/tools/implementations/mod.rs @@ -4,6 +4,8 @@ pub mod ask_user_question_tool; pub mod bash_tool; pub mod code_review_tool; pub mod computer_use_tool; +pub mod computer_use_input; +pub mod computer_use_result; pub mod computer_use_mouse_precise_tool; pub mod computer_use_mouse_step_tool; pub mod computer_use_mouse_click_tool; diff --git a/src/crates/core/src/agentic/tools/mod.rs b/src/crates/core/src/agentic/tools/mod.rs index b452f5df..3bf92eeb 100644 --- a/src/crates/core/src/agentic/tools/mod.rs +++ b/src/crates/core/src/agentic/tools/mod.rs @@ -2,6 +2,8 @@ pub mod computer_use_capability; pub mod computer_use_host; +pub mod computer_use_optimizer; +pub mod computer_use_verification; pub mod framework; pub mod image_context; pub mod 
implementations; diff --git a/src/crates/core/src/agentic/tools/registry.rs b/src/crates/core/src/agentic/tools/registry.rs index 85262c62..a0915c7c 100644 --- a/src/crates/core/src/agentic/tools/registry.rs +++ b/src/crates/core/src/agentic/tools/registry.rs @@ -134,10 +134,10 @@ impl ToolRegistry { // MiniApp Agent tool (single InitMiniApp) self.register_tool(Arc::new(InitMiniAppTool::new())); + // All desktop automation consolidated into ComputerUse (click_element, click, mouse_move, + // scroll, drag, screenshot, locate, key_chord, type_text, pointer_move_rel, wait). + // The separate ComputerUseMousePrecise/Step/Click tools are no longer registered. self.register_tool(Arc::new(ComputerUseTool::new())); - self.register_tool(Arc::new(ComputerUseMousePreciseTool::new())); - self.register_tool(Arc::new(ComputerUseMouseStepTool::new())); - self.register_tool(Arc::new(ComputerUseMouseClickTool::new())); } /// Register a single tool diff --git a/src/crates/core/src/service/remote_ssh/manager.rs b/src/crates/core/src/service/remote_ssh/manager.rs index 6a59fa9f..b830d616 100644 --- a/src/crates/core/src/service/remote_ssh/manager.rs +++ b/src/crates/core/src/service/remote_ssh/manager.rs @@ -905,8 +905,31 @@ impl SSHConnectionManager { } log::info!("Authentication successful for user {}", config.username); - // Get server info - let server_info = Self::get_server_info_internal(&handle).await; + // Get server info (prefer full probe; fall back to $HOME only so SFTP `~` works when uname fails) + let mut server_info = Self::get_server_info_internal(&handle).await; + if server_info + .as_ref() + .map(|s| s.home_dir.trim().is_empty()) + .unwrap_or(true) + { + if let Ok((stdout, _, status)) = Self::execute_command_internal(&handle, "echo $HOME").await { + if status == 0 { + let home = stdout.trim().to_string(); + if !home.is_empty() { + match &mut server_info { + Some(si) => si.home_dir = home, + None => { + server_info = Some(ServerInfo { + os_type: "unknown".to_string(), 
+ hostname: "unknown".to_string(), + home_dir: home, + }); + } + } + } + } + } + } let connection_id = config.id.clone(); @@ -1041,6 +1064,40 @@ impl SSHConnectionManager { // SFTP Operations // ============================================================================ + /// Expand leading `~` using the remote user's home from [`ServerInfo`] (SFTP paths are not shell-expanded). + pub async fn resolve_sftp_path(&self, connection_id: &str, path: &str) -> anyhow::Result { + let path = path.trim(); + if path.is_empty() { + return Err(anyhow!("Empty remote path")); + } + if path == "~" || path.starts_with("~/") { + let guard = self.connections.read().await; + let home = guard + .get(connection_id) + .and_then(|c| c.server_info.as_ref()) + .map(|s| s.home_dir.trim()) + .filter(|h| !h.is_empty()); + let home = match home { + Some(h) => h.to_string(), + None => { + return Err(anyhow!( + "Cannot use '~' in remote path: home directory is not available for this connection" + )); + } + }; + if path == "~" || path == "~/" { + return Ok(home); + } + let rest = path[2..].trim_start_matches('/'); + if rest.is_empty() { + return Ok(home); + } + Ok(format!("{}/{}", home.trim_end_matches('/'), rest)) + } else { + Ok(path.to_string()) + } + } + /// Get or create SFTP session for a connection pub async fn get_sftp(&self, connection_id: &str) -> anyhow::Result> { // First check if we have an existing SFTP session @@ -1088,8 +1145,9 @@ impl SSHConnectionManager { /// Read a file via SFTP pub async fn sftp_read(&self, connection_id: &str, path: &str) -> anyhow::Result> { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; - let mut file = sftp.open(path).await + let mut file = sftp.open(&path).await .map_err(|e| anyhow!("Failed to open remote file '{}': {}", path, e))?; let mut buffer = Vec::new(); @@ -1102,8 +1160,9 @@ impl SSHConnectionManager { /// Write a file via SFTP pub async fn sftp_write(&self, connection_id: &str, 
path: &str, content: &[u8]) -> anyhow::Result<()> { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; - let mut file = sftp.create(path).await + let mut file = sftp.create(&path).await .map_err(|e| anyhow!("Failed to create remote file '{}': {}", path, e))?; use tokio::io::AsyncWriteExt; @@ -1118,72 +1177,81 @@ impl SSHConnectionManager { /// Read directory via SFTP pub async fn sftp_read_dir(&self, connection_id: &str, path: &str) -> anyhow::Result { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; - let entries = sftp.read_dir(path).await + let entries = sftp.read_dir(&path).await .map_err(|e| anyhow!("Failed to read directory '{}': {}", path, e))?; Ok(entries) } /// Create directory via SFTP pub async fn sftp_mkdir(&self, connection_id: &str, path: &str) -> anyhow::Result<()> { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; - sftp.create_dir(path).await + sftp.create_dir(&path).await .map_err(|e| anyhow!("Failed to create directory '{}': {}", path, e))?; Ok(()) } /// Create directory and all parents via SFTP pub async fn sftp_mkdir_all(&self, connection_id: &str, path: &str) -> anyhow::Result<()> { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; // Check if path exists - match sftp.as_ref().try_exists(path).await { + match sftp.as_ref().try_exists(&path).await { Ok(true) => return Ok(()), // Already exists Ok(false) => {} Err(_) => {} } // Try to create - sftp.as_ref().create_dir(path).await + sftp.as_ref().create_dir(&path).await .map_err(|e| anyhow!("Failed to create directory '{}': {}", path, e))?; Ok(()) } /// Remove file via SFTP pub async fn sftp_remove(&self, connection_id: &str, path: &str) -> anyhow::Result<()> { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = 
self.get_sftp(connection_id).await?; - sftp.remove_file(path).await + sftp.remove_file(&path).await .map_err(|e| anyhow!("Failed to remove file '{}': {}", path, e))?; Ok(()) } /// Remove directory via SFTP pub async fn sftp_rmdir(&self, connection_id: &str, path: &str) -> anyhow::Result<()> { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; - sftp.remove_dir(path).await + sftp.remove_dir(&path).await .map_err(|e| anyhow!("Failed to remove directory '{}': {}", path, e))?; Ok(()) } /// Rename/move via SFTP pub async fn sftp_rename(&self, connection_id: &str, old_path: &str, new_path: &str) -> anyhow::Result<()> { + let old_path = self.resolve_sftp_path(connection_id, old_path).await?; + let new_path = self.resolve_sftp_path(connection_id, new_path).await?; let sftp = self.get_sftp(connection_id).await?; - sftp.rename(old_path, new_path).await + sftp.rename(&old_path, &new_path).await .map_err(|e| anyhow!("Failed to rename '{}' to '{}': {}", old_path, new_path, e))?; Ok(()) } /// Check if path exists via SFTP pub async fn sftp_exists(&self, connection_id: &str, path: &str) -> anyhow::Result { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; - sftp.as_ref().try_exists(path).await + sftp.as_ref().try_exists(&path).await .map_err(|e| anyhow!("Failed to check if '{}' exists: {}", path, e)) } /// Get file metadata via SFTP pub async fn sftp_stat(&self, connection_id: &str, path: &str) -> anyhow::Result { + let path = self.resolve_sftp_path(connection_id, path).await?; let sftp = self.get_sftp(connection_id).await?; - sftp.as_ref().metadata(path).await + sftp.as_ref().metadata(&path).await .map_err(|e| anyhow!("Failed to stat '{}': {}", path, e)) } diff --git a/src/crates/core/src/service/remote_ssh/remote_fs.rs b/src/crates/core/src/service/remote_ssh/remote_fs.rs index 2b66d498..b59ad90c 100644 --- 
a/src/crates/core/src/service/remote_ssh/remote_fs.rs +++ b/src/crates/core/src/service/remote_ssh/remote_fs.rs @@ -64,6 +64,7 @@ impl RemoteFileService { /// Read directory contents via SFTP pub async fn read_dir(&self, connection_id: &str, path: &str) -> anyhow::Result> { let manager = self.get_manager(connection_id).await?; + let path_resolved = manager.resolve_sftp_path(connection_id, path).await?; let mut entries = manager.sftp_read_dir(connection_id, path).await?; let mut result = Vec::new(); @@ -76,10 +77,10 @@ impl RemoteFileService { continue; } - let full_path = if path.ends_with('/') { - format!("{}{}", path, name) + let full_path = if path_resolved.ends_with('/') { + format!("{}{}", path_resolved, name) } else { - format!("{}/{}", path, name) + format!("{}/{}", path_resolved, name) }; let metadata = entry.metadata(); diff --git a/src/crates/core/src/service/remote_ssh/remote_terminal.rs b/src/crates/core/src/service/remote_ssh/remote_terminal.rs index ea439760..5d49fa2c 100644 --- a/src/crates/core/src/service/remote_ssh/remote_terminal.rs +++ b/src/crates/core/src/service/remote_ssh/remote_terminal.rs @@ -13,6 +13,11 @@ use std::collections::HashMap; use std::sync::Arc; use tokio::io::AsyncWriteExt; use tokio::sync::{broadcast, mpsc, RwLock}; +use tokio::time::{timeout, Duration}; + +/// `pwd` can hang on some hosts (e.g. path resolution touching an unreachable `/`) while the shell still works; +/// treat timeout the same as error and fall back to `~` for the initial `cd`. +const REMOTE_PWD_PROBE_TIMEOUT: Duration = Duration::from_secs(5); fn shell_escape(s: &str) -> String { if s.chars().all(|c| c.is_alphanumeric() || c == '/' || c == '.' 
|| c == '-' || c == '_') { @@ -105,9 +110,41 @@ impl RemoteTerminalManager { let cwd = if let Some(dir) = initial_cwd { dir.to_string() } else { - match manager.execute_command(connection_id, "pwd").await { - Ok((output, _, _)) => output.trim().to_string(), - Err(_) => "/".to_string(), + match timeout( + REMOTE_PWD_PROBE_TIMEOUT, + manager.execute_command(connection_id, "pwd"), + ) + .await + { + Ok(Ok((output, _, status))) => { + let out = output.trim(); + if status == 0 && !out.is_empty() { + out.to_string() + } else { + log::debug!( + "remote_terminal: pwd empty or non-zero exit (status={}); using ~, connection_id={}", + status, + connection_id + ); + "~".to_string() + } + } + Ok(Err(e)) => { + log::debug!( + "remote_terminal: pwd error: {}; using ~, connection_id={}", + e, + connection_id + ); + "~".to_string() + } + Err(_elapsed) => { + log::debug!( + "remote_terminal: pwd timed out after {:?}; using ~, connection_id={}", + REMOTE_PWD_PROBE_TIMEOUT, + connection_id + ); + "~".to_string() + } } }; @@ -150,9 +187,14 @@ impl RemoteTerminalManager { tokio::spawn(async move { log::info!("Remote PTY owner task started: session_id={}", task_session_id); - // cd to workspace directory silently - if initial_cd != "/" { - let cd_cmd = format!("cd {} && clear\n", shell_escape(&initial_cd)); + // cd to workspace directory silently (avoid `/` default — some hosts block listing `/`) + if initial_cd != "/" && !initial_cd.is_empty() { + let cd_arg = if initial_cd == "~" || initial_cd.starts_with("~/") { + initial_cd.clone() + } else { + shell_escape(&initial_cd) + }; + let cd_cmd = format!("cd {} && clear\n", cd_arg); if let Err(e) = writer.write_all(cd_cmd.as_bytes()).await { log::warn!("Failed to cd to initial directory: {}", e); } diff --git a/src/web-ui/src/app/components/NavPanel/MainNav.tsx b/src/web-ui/src/app/components/NavPanel/MainNav.tsx index 57d4fd0e..f6c85d01 100644 --- a/src/web-ui/src/app/components/NavPanel/MainNav.tsx +++ 
b/src/web-ui/src/app/components/NavPanel/MainNav.tsx @@ -625,6 +625,8 @@ const MainNav: React.FC = ({ {sshRemote.showFileBrowser && sshRemote.connectionId && ( { sshRemote.setShowFileBrowser(false); diff --git a/src/web-ui/src/app/components/RemoteConnectDialog/RemoteConnectDisclaimer.tsx b/src/web-ui/src/app/components/RemoteConnectDialog/RemoteConnectDisclaimer.tsx index cf7a8ab5..902f2c91 100644 --- a/src/web-ui/src/app/components/RemoteConnectDialog/RemoteConnectDisclaimer.tsx +++ b/src/web-ui/src/app/components/RemoteConnectDialog/RemoteConnectDisclaimer.tsx @@ -46,7 +46,7 @@ export const RemoteConnectDisclaimerContent: React.FC{t('remoteConnect.disclaimerIntro')}

    -
  1. {t('remoteConnect.disclaimerItemBeta')}
  2. +
  3. {t('remoteConnect.disclaimerItemGeneralRisk')}
  4. {t('remoteConnect.disclaimerItemSecurity')}
  5. {t('remoteConnect.disclaimerItemEncryption')}
  6. {t('remoteConnect.disclaimerItemOpenSource')}
  7. diff --git a/src/web-ui/src/component-library/components/Markdown/Markdown.scss b/src/web-ui/src/component-library/components/Markdown/Markdown.scss index c489078c..10822294 100644 --- a/src/web-ui/src/component-library/components/Markdown/Markdown.scss +++ b/src/web-ui/src/component-library/components/Markdown/Markdown.scss @@ -1,6 +1,8 @@ /* Markdown renderer styles */ .markdown-renderer { --markdown-font-mono: "Fira Code", "JetBrains Mono", Consolas, "Courier New", monospace; + --markdown-block-gap: 0.65rem; + --markdown-code-bg-elevated: color-mix(in srgb, var(--color-bg-primary) 92%, #ffffff 8%); color: var(--color-text-primary); line-height: var(--line-height-relaxed); @@ -33,7 +35,7 @@ .markdown-renderer > * + * { - margin-top: 0.25rem; + margin-top: var(--markdown-block-gap); } @@ -50,7 +52,7 @@ .markdown-renderer p + p { - margin-top: 0.25rem; + margin-top: 0.5rem; } @@ -71,9 +73,9 @@ .markdown-renderer p { margin-top: 0; - margin-bottom: 0.5rem; + margin-bottom: 0.65rem; display: block; - line-height: 1.5; + line-height: 1.62; font-size: 0.9rem; color: var(--color-text-primary); } @@ -106,7 +108,7 @@ margin: 1.5rem 0 1rem 0; padding: 0 0 0.6rem 0; color: var(--color-text-primary); - border-bottom: 1px solid rgba(255, 255, 255, 0.12); + border-bottom: 1px solid var(--border-color, rgba(255, 255, 255, 0.12)); } @@ -117,7 +119,7 @@ margin: 1.25rem 0 0.75rem 0; padding-bottom: 0.4rem; color: var(--color-text-primary); - border-bottom: 1px solid rgba(255, 255, 255, 0.1); + border-bottom: 1px solid var(--border-color, rgba(255, 255, 255, 0.1)); } @@ -210,20 +212,20 @@ .markdown-renderer .inline-code { - padding: 0.1em 0.4em; - margin: 0 0.05em; - font-size: 0.9em; - background: rgba(255, 255, 255, 0.05); - border: 1px solid rgba(255, 255, 255, 0.1); - border-radius: 4px; + padding: 0.12em 0.38em; + margin: 0 0.04em; + font-size: 0.88em; + background: rgba(255, 255, 255, 0.04); + border: 1px solid rgba(255, 255, 255, 0.07); + border-radius: 5px; 
font-family: var(--markdown-font-mono); white-space: nowrap; vertical-align: baseline; - line-height: 1.4; + line-height: 1.45; user-select: text !important; - color: #a8b4c8; + color: color-mix(in srgb, var(--color-text-primary) 88%, #a8b4c8 12%); font-weight: 500; - transition: all 0.15s ease; + transition: background-color 0.15s ease, border-color 0.15s ease, color 0.15s ease; box-decoration-break: clone; -webkit-box-decoration-break: clone; } @@ -242,41 +244,71 @@ .markdown-renderer .code-block-wrapper { - margin: 0.25rem 0.3rem; - border-radius: 4px; - background: var(--color-bg-primary); - border: 1px dashed rgba(255, 255, 255, 0.1); - box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2); + display: flex; + flex-direction: column; + margin: 0.75rem 0; + border-radius: 8px; + background: var(--markdown-code-bg-elevated); + border: 1px solid var(--border-color, rgba(255, 255, 255, 0.09)); + box-shadow: 0 1px 0 rgba(255, 255, 255, 0.04) inset, 0 8px 24px rgba(0, 0, 0, 0.22); overflow: hidden; position: relative; - transition: all 0.3s ease; + transition: border-color 0.2s ease, box-shadow 0.2s ease; } .markdown-renderer .code-block-wrapper:hover { - border: 1px solid rgba(255, 255, 255, 0.15); - border-color: rgba(255, 255, 255, 0.15); + border-color: color-mix(in srgb, var(--border-color, rgba(255, 255, 255, 0.09)) 70%, var(--primary-color, #3b82f6) 30%); + box-shadow: 0 1px 0 rgba(255, 255, 255, 0.05) inset, 0 10px 28px rgba(0, 0, 0, 0.26); +} + + +.markdown-renderer .code-block-toolbar { + display: flex; + align-items: center; + justify-content: space-between; + gap: 0.5rem; + min-height: 2.25rem; + padding: 0.35rem 0.5rem 0.35rem 0.75rem; + flex-shrink: 0; + background: color-mix(in srgb, var(--color-bg-primary) 94%, #ffffff 6%); + border-bottom: 1px solid var(--border-color, rgba(255, 255, 255, 0.08)); +} + + +.markdown-renderer .code-block-lang { + font-family: var(--font-family-sans); + font-size: 0.6875rem; + font-weight: 600; + letter-spacing: 0.06em; + 
text-transform: uppercase; + color: var(--color-text-muted); + user-select: none; +} + + +.markdown-renderer .code-block-body { + min-width: 0; + overflow-x: auto; } .markdown-renderer .copy-button { - position: absolute; - top: 0.35rem; - transform: none; - right: 0.75rem; - padding: 0.5rem; + position: relative; + flex-shrink: 0; + padding: 0.35rem; background: transparent; border: none; border-radius: 6px; - color: rgba(255, 255, 255, 0.5); + color: var(--color-text-muted); cursor: pointer; - transition: color 0.25s cubic-bezier(0.4, 0, 0.2, 1), background 0.25s cubic-bezier(0.4, 0, 0.2, 1), opacity 0.25s cubic-bezier(0.4, 0, 0.2, 1), transform 0.25s cubic-bezier(0.4, 0, 0.2, 1); + transition: color 0.2s ease, background-color 0.2s ease; display: flex; align-items: center; justify-content: center; width: 32px; height: 32px; - opacity: 0; + opacity: 0.9; z-index: 10; } @@ -290,45 +322,30 @@ } .markdown-renderer .copy-button:hover { - background: transparent; - color: #3b82f6; - transform: scale(1.1); + background: rgba(255, 255, 255, 0.06); + color: var(--primary-color, #3b82f6); } .markdown-renderer .copy-button:active { - transform: scale(1); color: #2563eb; } -.markdown-renderer .code-block-wrapper--single-line .copy-button { - top: 50%; - transform: translateY(-50%); -} - -.markdown-renderer .code-block-wrapper--single-line .copy-button:hover { - transform: translateY(-50%) scale(1.1); -} - -.markdown-renderer .code-block-wrapper--single-line .copy-button:active { - transform: translateY(-50%) scale(1); -} - .markdown-renderer .code-block-wrapper pre[class*="language-"] { margin: 0 !important; border: none !important; - border-radius: 8px !important; - padding: 1.25rem !important; + border-radius: 0 0 8px 8px !important; + padding: 1rem 1rem 1rem 0.75rem !important; background: var(--color-bg-primary) !important; box-shadow: none !important; - font-size: 0.7rem !important; + font-size: 0.875rem !important; } .markdown-renderer .code-block-wrapper pre code { 
font-family: var(--markdown-font-mono, "Fira Code", "JetBrains Mono", Consolas, "Courier New", monospace) !important; background: var(--color-bg-primary) !important; - font-size: 0.7rem !important; + font-size: 0.875rem !important; } @@ -336,13 +353,13 @@ background: var(--color-bg-primary) !important; border: none !important; box-shadow: none !important; - font-size: 0.5rem !important; + font-size: 0.875rem !important; } .markdown-renderer .code-block-wrapper code[style] { background: var(--color-bg-primary) !important; border: none !important; - font-size: 0.8rem !important; + font-size: 0.875rem !important; } .markdown-renderer .inline-code:hover { @@ -355,24 +372,21 @@ .markdown-renderer pre { padding: 1.25rem; overflow: auto; - font-size: 0.3rem; - line-height: 1.5; + font-size: 0.875rem; + line-height: 1.55; background: var(--color-bg-primary); - border: 1px dashed rgba(255, 255, 255, 0.1); - border-radius: 4px; - margin: 0.25rem 0.3rem; - box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2); + border: 1px solid var(--border-color, rgba(255, 255, 255, 0.09)); + border-radius: 8px; + margin: 0.65rem 0; + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.18); position: relative; - transition: all 0.3s ease; + transition: border-color 0.2s ease, box-shadow 0.2s ease; } .markdown-renderer pre:hover { - border: 1px solid rgba(255, 255, 255, 0.15); - box-shadow: - 0 8px 24px rgba(0, 0, 0, 0.4), - inset 0 1px 0 rgba(255, 255, 255, 0.1), - inset 0 -1px 0 rgba(255, 255, 255, 0.05); + border-color: color-mix(in srgb, var(--border-color, rgba(255, 255, 255, 0.09)) 75%, var(--primary-color, #3b82f6) 25%); + box-shadow: 0 6px 20px rgba(0, 0, 0, 0.22); } @@ -381,7 +395,7 @@ background: transparent !important; border: none !important; border-radius: 0 !important; - margin: 1rem 0.3rem !important; + margin: 0.75rem 0 !important; box-shadow: none !important; position: static !important; } @@ -422,7 +436,7 @@ font-family: var(--markdown-font-mono); user-select: text !important; box-shadow: none; - 
font-size: 0.7rem !important; + font-size: 0.875rem !important; color: inherit; font-weight: inherit; } @@ -599,8 +613,7 @@ .markdown-renderer strong { font-weight: 650; - color: #f5f5f5; - text-shadow: 0 0 1px rgba(255, 255, 255, 0.15); + color: var(--color-text-primary); } @@ -714,6 +727,15 @@ border-color: rgba(0, 0, 0, 0.15); } + .code-block-toolbar { + background: #eef1f6; + border-bottom-color: rgba(15, 23, 42, 0.1); + } + + .code-block-lang { + color: #64748b; + } + pre { background: #f6f8fa; border-color: rgba(0, 0, 0, 0.1); @@ -730,6 +752,7 @@ .copy-button:hover { color: #0969da; + background: rgba(15, 23, 42, 0.06); } @@ -872,14 +895,6 @@ } -.markdown-renderer pre:hover { - box-shadow: - 0 8px 24px rgba(0, 0, 0, 0.4), - inset 0 1px 0 rgba(255, 255, 255, 0.1), - inset 0 -1px 0 rgba(255, 255, 255, 0.05); - border-color: rgba(255, 255, 255, 0.15); -} - .markdown-renderer blockquote:hover { background: rgba(255, 255, 255, 0.03); border-left-color: rgba(255, 255, 255, 0.2); diff --git a/src/web-ui/src/component-library/components/Markdown/Markdown.tsx b/src/web-ui/src/component-library/components/Markdown/Markdown.tsx index 0836eb3f..3de1d3dd 100644 --- a/src/web-ui/src/component-library/components/Markdown/Markdown.tsx +++ b/src/web-ui/src/component-library/components/Markdown/Markdown.tsx @@ -422,6 +422,60 @@ function isEditorOpenableFilePath(filePath: string): boolean { return EDITOR_OPENABLE_EXTENSIONS.has(fileName.slice(dotIdx + 1)); } +/** Human-readable label for Prism language ids (code block toolbar). 
*/ +function formatCodeLanguageLabel(lang: string): string { + if (!lang) return 'Text'; + const key = lang.toLowerCase(); + const aliases: Record = { + js: 'JavaScript', + jsx: 'JavaScript', + mjs: 'JavaScript', + cjs: 'JavaScript', + ts: 'TypeScript', + tsx: 'TSX', + py: 'Python', + rs: 'Rust', + go: 'Go', + rb: 'Ruby', + sh: 'Shell', + bash: 'Bash', + zsh: 'Zsh', + fish: 'Fish', + md: 'Markdown', + yml: 'YAML', + yaml: 'YAML', + json: 'JSON', + html: 'HTML', + css: 'CSS', + scss: 'SCSS', + sass: 'Sass', + less: 'Less', + cpp: 'C++', + cxx: 'C++', + hpp: 'C++', + hxx: 'C++', + cc: 'C++', + c: 'C', + cs: 'C#', + fs: 'F#', + swift: 'Swift', + kt: 'Kotlin', + java: 'Java', + sql: 'SQL', + graphql: 'GraphQL', + dockerfile: 'Dockerfile', + makefile: 'Makefile', + toml: 'TOML', + xml: 'XML', + rust: 'Rust', + typescript: 'TypeScript', + javascript: 'JavaScript', + }; + if (aliases[key]) return aliases[key]; + const raw = lang.replace(/[_-]/g, ' '); + return raw.charAt(0).toUpperCase() + raw.slice(1).toLowerCase(); +} + const CopyButton: React.FC<{ code: string }> = ({ code }) => { const { t } = useI18n('components'); const [copied, setCopied] = useState(false); @@ -598,16 +652,20 @@ export const Markdown = React.memo(({ return (
    - +
    + {formatCodeLanguageLabel(normalizedLang)} + +
    +
    (({ > {code} +
    ); }, diff --git a/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx b/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx index ddee181f..14228b65 100644 --- a/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx +++ b/src/web-ui/src/features/ssh-remote/RemoteFileBrowser.tsx @@ -26,7 +26,10 @@ import './RemoteFileBrowser.scss'; interface RemoteFileBrowserProps { connectionId: string; + /** Defaults to `~` (remote home) to avoid listing `/` on restricted hosts. */ initialPath?: string; + /** Used by the Home button; defaults to `initialPath`. */ + homePath?: string; onSelect: (path: string) => void; onCancel: () => void; } @@ -48,20 +51,43 @@ function joinRemotePath(dir: string, fileName: string): string { if (!dir || dir === '/') { return `/${name}`; } + if (dir === '~') { + return name ? `~/${name}` : '~'; + } const base = dir.endsWith('/') ? dir.slice(0, -1) : dir; return `${base}/${name}`; } +/** Parent directory for remote paths (supports `~` and absolute POSIX paths). */ +function getRemoteParentPath(path: string): string | null { + if (path === '/' || path === '~') return null; + if (path.startsWith('~/')) { + const rest = path.slice(2); + const parts = rest.split('/').filter(Boolean); + if (parts.length === 0) return null; + parts.pop(); + if (parts.length === 0) return '~'; + return `~/${parts.join('/')}`; + } + const parts = path.split('/').filter(Boolean); + if (parts.length === 0) return null; + if (parts.length === 1) return '/'; + parts.pop(); + return `/${parts.join('/')}`; +} + function isTauriDesktop(): boolean { return typeof window !== 'undefined' && '__TAURI__' in window; } export const RemoteFileBrowser: React.FC = ({ connectionId, - initialPath = '/', + initialPath = '~', + homePath, onSelect, onCancel, }) => { + const homeAnchor = homePath ?? 
initialPath; const { t } = useI18n('common'); const [currentPath, setCurrentPath] = useState(initialPath); const [pathInputValue, setPathInputValue] = useState(initialPath); @@ -131,7 +157,12 @@ export const RemoteFileBrowser: React.FC = ({ if (e.key === 'Enter') { const val = pathInputValue.trim(); if (val) { - navigateTo(val.startsWith('/') ? val : `/${val}`); + const nav = val.startsWith('~') + ? val + : val.startsWith('/') + ? val + : `/${val}`; + navigateTo(nav); } } else if (e.key === 'Escape') { setPathInputValue(currentPath); @@ -221,7 +252,7 @@ export const RemoteFileBrowser: React.FC = ({ return; } - const parentPath = getParentPath(renameEntry.path) || '/'; + const parentPath = getRemoteParentPath(renameEntry.path) ?? '/'; const newPath = parentPath.endsWith('/') ? `${parentPath}${renameValue.trim()}` : `${parentPath}/${renameValue.trim()}`; @@ -235,13 +266,6 @@ export const RemoteFileBrowser: React.FC = ({ } }; - const getParentPath = (path: string): string | null => { - if (path === '/') return null; - const parts = path.split('/').filter(Boolean); - parts.pop(); - return '/' + parts.join('/'); - }; - const handleDownloadEntry = async (entry: RemoteFileEntry) => { if (entry.isDir) return; if (!isTauriDesktop()) { @@ -327,7 +351,22 @@ export const RemoteFileBrowser: React.FC = ({ return ; }; - const pathParts = currentPath.split('/').filter(Boolean); + const pathParts = (() => { + if (currentPath === '/' || currentPath === '') return []; + if (currentPath === '~') return ['~']; + if (currentPath.startsWith('~/')) { + return ['~', ...currentPath.slice(2).split('/').filter(Boolean)]; + } + return currentPath.split('/').filter(Boolean); + })(); + + const pathAtSegment = (index: number) => { + if (pathParts[0] === '~') { + if (index === 0) return '~'; + return `~/${pathParts.slice(1, index + 1).join('/')}`; + } + return `/${pathParts.slice(0, index + 1).join('/')}`; + }; return (
    @@ -366,8 +405,8 @@ export const RemoteFileBrowser: React.FC = ({ > @@ -376,13 +415,13 @@ export const RemoteFileBrowser: React.FC = ({ / ) : ( pathParts.map((part, index) => { - const path = '/' + pathParts.slice(0, index + 1).join('/'); + const segPath = pathAtSegment(index); const isLast = index === pathParts.length - 1; return ( - + @@ -407,9 +446,12 @@ export const RemoteFileBrowser: React.FC = ({ @@ -462,10 +504,10 @@ export const RemoteFileBrowser: React.FC = ({ {/* Parent directory link */} - {currentPath !== '/' && ( + {getRemoteParentPath(currentPath) !== null && ( { - const parent = getParentPath(currentPath); + const parent = getRemoteParentPath(currentPath); if (parent !== null) navigateTo(parent); }} className="remote-file-browser__row remote-file-browser__row--parent" diff --git a/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx b/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx index f1b845f3..cd3252b7 100644 --- a/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx +++ b/src/web-ui/src/features/ssh-remote/SSHRemoteProvider.tsx @@ -47,6 +47,8 @@ interface SSHContextValue { showConnectionDialog: boolean; showFileBrowser: boolean; error: string | null; + /** Default path for remote folder picker (`~` or resolved `$HOME` from server). 
*/ + remoteFileBrowserInitialPath: string; // Actions connect: (connectionId: string, config: SSHConnectionConfig) => Promise; @@ -85,6 +87,7 @@ export const SSHRemoteProvider: React.FC = ({ children } const [showFileBrowser, setShowFileBrowser] = useState(false); const [error, setError] = useState(null); const [connectionError, setConnectionError] = useState(null); + const [remoteFileBrowserInitialPath, setRemoteFileBrowserInitialPath] = useState('~'); // Per-workspace connection statuses (keyed by connectionId) const [workspaceStatuses, setWorkspaceStatuses] = useState>({}); const heartbeatInterval = useRef(null); @@ -394,6 +397,10 @@ export const SSHRemoteProvider: React.FC = ({ children } if (result.success && result.connectionId) { log.info('SSH connection successful', { connectionId: result.connectionId }); + const home = result.serverInfo?.homeDir?.trim(); + setRemoteFileBrowserInitialPath( + home && home.length > 0 ? normalizeRemoteWorkspacePath(home) : '~' + ); setStatus('connected'); setIsConnected(true); setConnectionId(result.connectionId); @@ -446,6 +453,7 @@ export const SSHRemoteProvider: React.FC = ({ children } setRemoteWorkspace(null); setIsConnected(false); setShowFileBrowser(false); + setRemoteFileBrowserInitialPath('~'); if (currentRemoteWorkspace) { setWorkspaceStatus(currentRemoteWorkspace.connectionId, 'disconnected'); @@ -515,6 +523,7 @@ export const SSHRemoteProvider: React.FC = ({ children } showConnectionDialog, showFileBrowser, error, + remoteFileBrowserInitialPath, connect, disconnect, openWorkspace, diff --git a/src/web-ui/src/flow_chat/components/FlowTextBlock.scss b/src/web-ui/src/flow_chat/components/FlowTextBlock.scss index a1ab6863..db778b27 100644 --- a/src/web-ui/src/flow_chat/components/FlowTextBlock.scss +++ b/src/web-ui/src/flow_chat/components/FlowTextBlock.scss @@ -4,7 +4,21 @@ .flow-text-block { width: 100%; - + + /* Chat: slightly roomier reading measure than generic markdown preview */ + .markdown-renderer { + 
font-size: 0.9375rem; + line-height: 1.65; + letter-spacing: 0.01em; + } + + .markdown-renderer p, + .markdown-renderer li, + .markdown-renderer ul, + .markdown-renderer ol { + font-size: inherit; + } + .text-content { color: var(--color-text-primary); line-height: 1.6; diff --git a/src/web-ui/src/flow_chat/utils/sessionOrdering.test.ts b/src/web-ui/src/flow_chat/utils/sessionOrdering.test.ts index 25ec85fc..b2a8f83a 100644 --- a/src/web-ui/src/flow_chat/utils/sessionOrdering.test.ts +++ b/src/web-ui/src/flow_chat/utils/sessionOrdering.test.ts @@ -1,6 +1,10 @@ import { describe, expect, it } from 'vitest'; import type { Session } from '../types/flow-chat'; -import { compareSessionsForDisplay, getSessionSortTimestamp } from './sessionOrdering'; +import { + compareSessionsForDisplay, + getSessionSortTimestamp, + sessionBelongsToWorkspaceNavRow, +} from './sessionOrdering'; function createSession(overrides: Partial = {}): Session { return { @@ -55,4 +59,29 @@ describe('sessionOrdering', () => { const orderedIds = [...sessions].sort(compareSessionsForDisplay).map(session => session.sessionId); expect(orderedIds).toEqual(['a', 'b']); }); + + it('remote SSH: same host but different remote root does not share nav row', () => { + const conn = 'ssh-user@myserver.example.com:22'; + const host = 'myserver.example.com'; + const rowPath = '/home/u/project-a'; + const otherPath = '/home/u/project-b'; + + const sessionA = { + workspacePath: rowPath, + remoteConnectionId: conn, + remoteSshHost: host, + }; + const sessionB = { + workspacePath: otherPath, + remoteConnectionId: conn, + remoteSshHost: host, + }; + + expect( + sessionBelongsToWorkspaceNavRow(sessionA, rowPath, conn, host) + ).toBe(true); + expect( + sessionBelongsToWorkspaceNavRow(sessionB, rowPath, conn, host) + ).toBe(false); + }); }); diff --git a/src/web-ui/src/flow_chat/utils/sessionOrdering.ts b/src/web-ui/src/flow_chat/utils/sessionOrdering.ts index d4353795..191fb93b 100644 --- 
a/src/web-ui/src/flow_chat/utils/sessionOrdering.ts +++ b/src/web-ui/src/flow_chat/utils/sessionOrdering.ts @@ -20,8 +20,8 @@ function effectiveWorkspaceSshHost( /** * Whether a persisted session belongs to a nav row for this workspace. - * Remote mirror lists sessions by host+path on disk; metadata `workspacePath` / `remoteSshHost` can be stale, - * so we must match by SSH host (from metadata or embedded in connection id) before rejecting on path alone. + * Remote workspaces are scoped by **SSH host + normalized remote root** (and connection id when present). + * We must never treat "same host" as sufficient: two tabs to the same server at `/a` vs `/b` are distinct. */ export function sessionBelongsToWorkspaceNavRow( session: Pick, @@ -40,10 +40,11 @@ export function sessionBelongsToWorkspaceNavRow( const wsConnHost = hostFromSshConnectionId(wsConn); if (wsHostEff.length > 0) { - if (sessHost === wsHostEff) { + // Host match alone is insufficient (same server, different remote folders). 
+ if (sessHost === wsHostEff && sp === wp) { return true; } - if (sessConnHost === wsHostEff) { + if (sessConnHost === wsHostEff && sp === wp) { return true; } if (sessConnHost && wsConnHost && sessConnHost === wsConnHost) { diff --git a/src/web-ui/src/locales/en-US/common.json b/src/web-ui/src/locales/en-US/common.json index 4cc9a2db..59a22bda 100644 --- a/src/web-ui/src/locales/en-US/common.json +++ b/src/web-ui/src/locales/en-US/common.json @@ -52,7 +52,7 @@ "showChatPanel": "Show Chat Panel", "hideChatPanel": "Hide Chat Panel", "switchToToolbar": "Floating window mode", - "remoteConnect": "Remote Control (Beta)", + "remoteConnect": "Remote Control", "modeSwitchAriaLabel": "View mode switch", "modeCowork": "Cowork", "modeCoder": "Coder", @@ -368,7 +368,7 @@ "errorCreateFailed": "Failed to create project" }, "remoteConnect": { - "title": "Remote Control (Beta)", + "title": "Remote Control", "tabLan": "LAN", "tabBitfunServer": "BitFun Server", "tabNgrok": "NAT Traversal", @@ -434,7 +434,7 @@ "openNgrokSetup": "Open ngrok setup page", "disclaimerTitle": "Remote Connect Disclaimer", "disclaimerIntro": "Before enabling Remote Connect, please read and accept the following:", - "disclaimerItemBeta": "Remote Connect is currently in Beta. It may contain undiscovered security vulnerabilities, functional defects, or incompatible changes. Please use it with full awareness of the risks.", + "disclaimerItemGeneralRisk": "Remote Connect may contain undiscovered security vulnerabilities, functional defects, or incompatible changes. Please use it with full awareness of the risks.", "disclaimerItemSecurity": "Remote Connect enables network communication paths (including but not limited to LAN, third-party relay, self-hosted relay, bot channels, and other pathways). 
Use it only on trusted devices and networks.", "disclaimerItemEncryption": "Remote message payloads are protected with end-to-end encryption (X25519 ECDH + AES-256-GCM with ephemeral key pairs per session); relay servers cannot decrypt message content. However, required metadata (such as device name, connection state, service endpoint, and other connection-context details) is not covered by business-message encryption and may still be visible to network paths, service nodes, or other infrastructure.", "disclaimerItemOpenSource": "BitFun's Remote Connect encryption implementation is fully open-source. You are free to audit the source code to verify its security.", @@ -942,6 +942,8 @@ "pickPrivateKeyDialogTitle": "Select SSH private key", "passphrase": "Passphrase", "passphraseOptional": "Leave empty if none", + "homeFolder": "Home folder", + "clickToEditPath": "Click to edit path", "selectWorkspace": "Select Workspace Directory", "openWorkspace": "Open as Workspace", "selected": "Selected", diff --git a/src/web-ui/src/locales/zh-CN/common.json b/src/web-ui/src/locales/zh-CN/common.json index bcc6105d..31ed7f84 100644 --- a/src/web-ui/src/locales/zh-CN/common.json +++ b/src/web-ui/src/locales/zh-CN/common.json @@ -52,7 +52,7 @@ "showChatPanel": "显示聊天面板", "hideChatPanel": "隐藏聊天面板", "switchToToolbar": "悬浮窗模式", - "remoteConnect": "远程控制 (Beta)", + "remoteConnect": "远程控制", "modeSwitchAriaLabel": "视图模式切换", "modeCowork": "Cowork", "modeCoder": "Coder", @@ -368,7 +368,7 @@ "errorCreateFailed": "创建工程失败" }, "remoteConnect": { - "title": "远程控制 (Beta)", + "title": "远程控制", "tabLan": "局域网", "tabBitfunServer": "BitFun服务器", "tabNgrok": "内网穿透", @@ -434,7 +434,7 @@ "openNgrokSetup": "打开 ngrok 安装与配置页面", "disclaimerTitle": "远程连接免责声明", "disclaimerIntro": "启用远程连接前,请确认你已理解并接受以下事项:", - "disclaimerItemBeta": "远程连接目前为 Beta 版本,可能存在未发现的安全漏洞、功能缺陷或不兼容变更,请在充分了解风险后使用。", + "disclaimerItemGeneralRisk": "远程连接可能存在未发现的安全漏洞、功能缺陷或不兼容变更,请在充分了解风险后使用。", "disclaimerItemSecurity": 
"远程连接会开启与网络通信相关的能力(包括但不限于局域网、第三方中继、自建服务、机器人通道等),请仅在可信网络和可信设备上使用。", "disclaimerItemEncryption": "远程连接采用端到端加密(X25519 ECDH + AES-256-GCM,每次会话生成临时密钥对)传输业务消息,中继服务器无法解密消息内容;但设备名称、连接状态、服务地址等必要元数据及其他连接上下文信息不属于业务消息密文范畴,仍可能被网络路径、服务节点或其他基础设施感知。", "disclaimerItemOpenSource": "BitFun 远程连接的加密实现完全开源,你可以自行审计源码以验证安全性。", @@ -942,6 +942,8 @@ "pickPrivateKeyDialogTitle": "选择 SSH 私钥", "passphrase": "密码短语", "passphraseOptional": "留空表示无密码短语", + "homeFolder": "主目录", + "clickToEditPath": "点击编辑路径", "selectWorkspace": "选择工作区目录", "openWorkspace": "打开为工作区", "selected": "已选择",