From bf8bc2d04bc94c04f4eb1dc28c02ff6a6ada3565 Mon Sep 17 00:00:00 2001 From: M3gA-Mind Date: Thu, 4 Jun 2026 14:09:58 +0530 Subject: [PATCH 1/9] feat(computer): main-thread synthetic-input executor + CEF crash fix Run enigo keyboard/mouse on the app main thread via a native-registry executor; enigo's macOS TSMGetInputSourceProperty traps off-thread and crashes the CEF host. Adds mouse/keyboard tools, the main_thread bridge, and downscaled screenshots so the model can see them. Slice 1/7 of #3307 (was the 'computer control' area). --- app/src-tauri/src/lib.rs | 33 +++ .../tools/impl/browser/screenshot.rs | 200 +++++++++++----- src/openhuman/tools/impl/computer/keyboard.rs | 148 ++++++------ .../tools/impl/computer/main_thread.rs | 49 ++++ src/openhuman/tools/impl/computer/mod.rs | 2 + src/openhuman/tools/impl/computer/mouse.rs | 221 +++++++++--------- 6 files changed, 406 insertions(+), 247 deletions(-) create mode 100644 src/openhuman/tools/impl/computer/main_thread.rs diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs index 41c01934a5..7df029b44b 100644 --- a/app/src-tauri/src/lib.rs +++ b/app/src-tauri/src/lib.rs @@ -2800,6 +2800,39 @@ pub fn run() { // let _ = window.show(); // } + // Synthetic-input main-thread executor. enigo's macOS keyboard-layout + // lookup (TSMGetInputSourceProperty) MUST run on the app main thread + // or it traps (`_dispatch_assert_queue_fail`/EXC_BREAKPOINT) and + // crashes the CEF host (Change 1.15, confirmed via crash report). The + // keyboard/mouse tools run on tokio workers, so they dispatch their + // enigo ops here via the native registry; we run each on the real + // main thread through `run_on_main_thread`. 
+ { + use openhuman_core::core::event_bus::register_native_global; + use openhuman_core::openhuman::tools::{ + MainThreadInputOp, INPUT_ON_MAIN_THREAD_METHOD, + }; + let input_app = app.handle().clone(); + register_native_global::, _, _>( + INPUT_ON_MAIN_THREAD_METHOD, + move |req| { + let input_app = input_app.clone(); + async move { + let (tx, rx) = tokio::sync::oneshot::channel(); + let run = req.run; + input_app + .run_on_main_thread(move || { + let _ = tx.send((run)()); + }) + .map_err(|e| format!("run_on_main_thread dispatch failed: {e}"))?; + rx.await + .map_err(|_| "main-thread input op was cancelled".to_string()) + } + }, + ); + log::info!("[computer] registered main-thread synthetic-input executor"); + } + // Tray icon setup moved to RunEvent::Ready (see below) — GTK is only // initialized after the event loop starts, so we must delay tray creation // until the Ready event fires. Creating the tray here would panic on diff --git a/src/openhuman/tools/impl/browser/screenshot.rs b/src/openhuman/tools/impl/browser/screenshot.rs index 1a247830e5..7d1e9c69b7 100644 --- a/src/openhuman/tools/impl/browser/screenshot.rs +++ b/src/openhuman/tools/impl/browser/screenshot.rs @@ -9,8 +9,6 @@ use std::time::Duration; /// Maximum time to wait for a screenshot command to complete. const SCREENSHOT_TIMEOUT_SECS: u64 = 15; -/// Maximum base64 payload size to return (2 MB of base64 ≈ 1.5 MB image). -const MAX_BASE64_BYTES: usize = 2_097_152; /// Tool for capturing screenshots using platform-native commands. /// @@ -132,61 +130,101 @@ impl ScreenshotTool { } } - /// Read the screenshot file and return base64-encoded result. + /// Read the screenshot file and return a base64 data-URL the model can see. + /// + /// Full-screen Retina captures are multi-MB PNGs that blow the inline + /// budget. Rather than dropping the image (which leaves vision-driven + /// control blind), downscale oversized captures to a JPEG that fits — the + /// model can then actually see the screen. 
Reports the *shown* dimensions so + /// callers know the coordinate space they're reading. async fn read_and_encode(output_path: &std::path::Path) -> anyhow::Result { - // Check file size before reading to prevent OOM on large screenshots - const MAX_RAW_BYTES: u64 = 1_572_864; // ~1.5 MB (base64 expands ~33%) - if let Ok(meta) = tokio::fs::metadata(output_path).await { - if meta.len() > MAX_RAW_BYTES { - return Ok(ToolResult::success(format!( - "Screenshot saved to: {}\nSize: {} bytes (too large to base64-encode inline)", - output_path.display(), - meta.len(), - ))); + // ~1.5 MB raw → ~2 MB base64, a safe inline payload size. + const MAX_RAW_BYTES: usize = 1_572_864; + + let bytes = match tokio::fs::read(output_path).await { + Ok(b) => b, + Err(e) => { + return Ok(ToolResult::error(format!( + "Failed to read screenshot file: {e}" + ))) } + }; + let ext = output_path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("png") + .to_lowercase(); + + // Fits as-is → return verbatim. + if bytes.len() <= MAX_RAW_BYTES { + let mime = match ext.as_str() { + "jpg" | "jpeg" => "image/jpeg", + "bmp" => "image/bmp", + "gif" => "image/gif", + "webp" => "image/webp", + _ => "image/png", + }; + return Ok(Self::data_url_result(output_path, &bytes, mime, None)); } - match tokio::fs::read(output_path).await { - Ok(bytes) => { - use base64::Engine; - let size = bytes.len(); - let mut encoded = base64::engine::general_purpose::STANDARD.encode(&bytes); - let truncated = if encoded.len() > MAX_BASE64_BYTES { - encoded.truncate(crate::openhuman::util::floor_char_boundary( - &encoded, - MAX_BASE64_BYTES, - )); - true - } else { - false - }; - - let mut output_msg = format!( - "Screenshot saved to: {}\nSize: {size} bytes\nBase64 length: {}", - output_path.display(), - encoded.len(), - ); - if truncated { - output_msg.push_str(" (truncated)"); - } - let mime = match output_path.extension().and_then(|e| e.to_str()) { - Some("jpg" | "jpeg") => "image/jpeg", - Some("bmp") => "image/bmp", 
- Some("gif") => "image/gif", - Some("webp") => "image/webp", - _ => "image/png", - }; - let _ = write!(output_msg, "\ndata:{mime};base64,{encoded}"); - - Ok(ToolResult::success(output_msg)) - } - Err(e) => Ok(ToolResult::error(format!( - "Failed to read screenshot file: {e}" + // Too large → downscale to a JPEG that fits (CPU work off the runtime). + match tokio::task::spawn_blocking(move || downscale_to_jpeg(&bytes, MAX_RAW_BYTES)).await { + Ok(Ok((jpeg, w, h))) => Ok(Self::data_url_result( + output_path, + &jpeg, + "image/jpeg", + Some((w, h)), + )), + Ok(Err(e)) => Ok(ToolResult::success(format!( + "Screenshot saved to: {} (could not downscale for inline view: {e})", + output_path.display() ))), + Err(e) => Ok(ToolResult::error(format!("downscale task failed: {e}"))), + } + } + + /// Build a success result carrying a base64 data-URL of `data`. + fn data_url_result( + output_path: &std::path::Path, + data: &[u8], + mime: &str, + shown_dims: Option<(u32, u32)>, + ) -> ToolResult { + use base64::Engine; + let encoded = base64::engine::general_purpose::STANDARD.encode(data); + let mut msg = format!("Screenshot saved to: {}\n", output_path.display()); + if let Some((w, h)) = shown_dims { + let _ = write!( + msg, + "Downscaled to {w}x{h}px for inline view (coordinates you read are in this {w}x{h} space).\n" + ); } + let _ = write!(msg, "data:{mime};base64,{encoded}"); + ToolResult::success(msg) } } +/// Decode image bytes, downscale (preserving aspect ratio), and JPEG-encode so +/// the result is ≤ `max_bytes`. Returns `(jpeg_bytes, width, height)`. 
+fn downscale_to_jpeg(bytes: &[u8], max_bytes: usize) -> Result<(Vec, u32, u32), String> { + let img = image::load_from_memory(bytes).map_err(|e| format!("decode: {e}"))?; + let mut last: Option<(Vec, u32, u32)> = None; + for max_dim in [1568u32, 1280, 1024, 768, 600] { + let thumb = img.thumbnail(max_dim, max_dim); // fits within max_dim², keeps aspect + let mut buf = std::io::Cursor::new(Vec::new()); + image::codecs::jpeg::JpegEncoder::new_with_quality(&mut buf, 72) + .encode_image(&thumb) + .map_err(|e| format!("jpeg encode: {e}"))?; + let out = buf.into_inner(); + let (w, h) = (thumb.width(), thumb.height()); + if out.len() <= max_bytes { + return Ok((out, w, h)); + } + last = Some((out, w, h)); + } + last.ok_or_else(|| "could not produce a fitting JPEG".to_string()) +} + #[async_trait] impl Tool for ScreenshotTool { fn name(&self) -> &str { @@ -228,6 +266,36 @@ mod tests { use super::*; use crate::openhuman::security::{AutonomyLevel, SecurityPolicy}; + #[test] + fn downscale_to_jpeg_shrinks_oversized_capture() { + // A 1600x1200 PNG of noise is well over a tight budget; downscaling must + // produce a smaller JPEG that still decodes, so the model can see it. + let mut img = image::RgbImage::new(1600, 1200); + for (i, px) in img.pixels_mut().enumerate() { + *px = image::Rgb([(i % 251) as u8, (i % 253) as u8, (i % 247) as u8]); + } + let mut png = std::io::Cursor::new(Vec::new()); + image::DynamicImage::ImageRgb8(img) + .write_to(&mut png, image::ImageFormat::Png) + .expect("encode png"); + let png = png.into_inner(); + + let max = 400_000usize; + let (jpeg, w, h) = downscale_to_jpeg(&png, max).expect("downscale"); + assert!(jpeg.len() <= max, "jpeg {} should be <= {max}", jpeg.len()); + assert!( + w <= 1568 && h <= 1568, + "dims {w}x{h} should be capped to 1568" + ); + assert!( + jpeg.len() < png.len(), + "jpeg should be smaller than source png" + ); + // Result must be a valid, decodable image at the reported dims. 
+ let decoded = image::load_from_memory(&jpeg).expect("jpeg decodes"); + assert_eq!((decoded.width(), decoded.height()), (w, h)); + } + fn test_security() -> Arc { Arc::new(SecurityPolicy { autonomy: AutonomyLevel::Full, @@ -439,24 +507,38 @@ mod tests { // ── read_and_encode: large file returns saved-path-only message ─────────── #[tokio::test] - async fn read_and_encode_large_file_skips_base64() { - use tokio::io::AsyncWriteExt; + async fn read_and_encode_large_file_downscales_to_viewable_jpeg() { + // A large *real* PNG (over MAX_RAW_BYTES) must be downscaled to an inline + // JPEG data-URL the model can see — not dropped (the old behavior left + // vision-driven control blind). let dir = tempfile::TempDir::new().unwrap(); let path = dir.path().join("big.png"); - let mut f = tokio::fs::File::create(&path).await.unwrap(); - // Write ~1.6 MB to exceed the MAX_RAW_BYTES threshold (1.5 MB) - let chunk = vec![0u8; 1024]; - for _ in 0..1600 { - f.write_all(&chunk).await.unwrap(); + let mut img = image::RgbImage::new(2200, 1500); + for (i, px) in img.pixels_mut().enumerate() { + *px = image::Rgb([(i % 251) as u8, (i % 253) as u8, (i % 247) as u8]); } - drop(f); + image::DynamicImage::ImageRgb8(img) + .save_with_format(&path, image::ImageFormat::Png) + .unwrap(); + assert!( + tokio::fs::metadata(&path).await.unwrap().len() > 1_572_864, + "test PNG should exceed the inline budget" + ); let result = ScreenshotTool::read_and_encode(&path).await.unwrap(); - assert!(!result.is_error, "large file should not be an error result"); assert!( - result.output().contains("too large to base64-encode"), - "large file should skip base64, got: {}", + !result.is_error, + "should not error, got: {}", result.output() ); + let out = result.output(); + assert!( + out.contains("data:image/jpeg;base64,"), + "should inline a jpeg: {out}" + ); + assert!( + out.contains("Downscaled to"), + "should report downscale: {out}" + ); } } diff --git a/src/openhuman/tools/impl/computer/keyboard.rs 
b/src/openhuman/tools/impl/computer/keyboard.rs index 9bbfecb704..9e9a9a629e 100644 --- a/src/openhuman/tools/impl/computer/keyboard.rs +++ b/src/openhuman/tools/impl/computer/keyboard.rs @@ -4,6 +4,7 @@ //! via platform-native APIs (Core Graphics on macOS, SendInput on Windows, //! X11/libxdo on Linux). +use super::main_thread::run_input_on_main; use crate::openhuman::security::SecurityPolicy; use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolResult}; use async_trait::async_trait; @@ -186,21 +187,18 @@ impl Tool for KeyboardTool { } let len = text.len(); - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - enigo - .text(&text) - .map_err(|e| anyhow::anyhow!("text typing failed: {e}"))?; - info!( - tool = "keyboard", - action = "type", - chars = len, - "[computer] typed text" - ); - Ok(ToolResult::success(format!("Typed {len} characters"))) - }) - .await? + into_result( + "type", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + enigo + .text(&text) + .map_err(|e| format!("text typing failed: {e}"))?; + Ok(format!("Typed {len} characters")) + }) + .await, + ) } "press" => { @@ -214,21 +212,18 @@ impl Tool for KeyboardTool { anyhow::anyhow!("Unknown key '{key_name}'. Use names like Enter, Tab, Escape, F1-F12, a-z, 0-9, Space, etc.") })?; - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - enigo - .key(key, Direction::Click) - .map_err(|e| anyhow::anyhow!("key press failed: {e}"))?; - info!( - tool = "keyboard", - action = "press", - key = key_name.as_str(), - "[computer] pressed key" - ); - Ok(ToolResult::success(format!("Pressed key '{key_name}'"))) - }) - .await? 
+ into_result( + "press", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + enigo + .key(key, Direction::Click) + .map_err(|e| format!("key press failed: {e}"))?; + Ok(format!("Pressed key '{key_name}'")) + }) + .await, + ) } "hotkey" => { @@ -288,51 +283,42 @@ impl Tool for KeyboardTool { } let combo_desc = key_names.join("+"); - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - - // Press keys in order, tracking which were successfully - // pressed so we can release them on error. - let mut pressed_keys: Vec = Vec::with_capacity(keys.len()); - let press_result: Result<(), anyhow::Error> = (|| { - for key in &keys { - enigo.key(*key, Direction::Press).map_err(|e| { - anyhow::anyhow!("key press failed for {key:?}: {e}") - })?; - pressed_keys.push(*key); - std::thread::sleep(HOTKEY_INTER_KEY_DELAY); + into_result( + "hotkey", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + + // Press keys in order, tracking which were pressed so we + // can release them on error. + let mut pressed_keys: Vec = Vec::with_capacity(keys.len()); + let press_result: Result<(), String> = (|| { + for key in &keys { + enigo + .key(*key, Direction::Press) + .map_err(|e| format!("key press failed for {key:?}: {e}"))?; + pressed_keys.push(*key); + std::thread::sleep(HOTKEY_INTER_KEY_DELAY); + } + Ok(()) + })(); + + // Always release pressed keys in reverse, even on error. 
+ for key in pressed_keys.iter().rev() { + if let Err(e) = enigo.key(*key, Direction::Release) { + tracing::warn!( + tool = "keyboard", + key = ?key, + error = %e, + "[computer] best-effort key release failed during cleanup" + ); + } } - Ok(()) - })(); - - // Always release all successfully pressed keys in reverse - // order, even if a press failed partway through. - for key in pressed_keys.iter().rev() { - if let Err(e) = enigo.key(*key, Direction::Release) { - tracing::warn!( - tool = "keyboard", - key = ?key, - error = %e, - "[computer] best-effort key release failed during cleanup" - ); - } - } - - // Now propagate any press error. - press_result?; - - info!( - tool = "keyboard", - action = "hotkey", - combo = combo_desc.as_str(), - "[computer] hotkey executed" - ); - Ok(ToolResult::success(format!( - "Executed hotkey: {combo_desc}" - ))) - }) - .await? + press_result?; + Ok(format!("Executed hotkey: {combo_desc}")) + }) + .await, + ) } other => Ok(ToolResult::error(format!( @@ -342,6 +328,20 @@ impl Tool for KeyboardTool { } } +/// Map a main-thread input op result to a `ToolResult`, logging the outcome. +fn into_result(action: &str, r: Result) -> anyhow::Result { + match r { + Ok(msg) => { + info!(tool = "keyboard", action, "[computer] {msg}"); + Ok(ToolResult::success(msg)) + } + Err(e) => { + tracing::warn!(tool = "keyboard", action, "[computer] failed: {e}"); + Ok(ToolResult::error(e)) + } + } +} + #[cfg(test)] #[path = "keyboard_tests.rs"] mod tests; diff --git a/src/openhuman/tools/impl/computer/main_thread.rs b/src/openhuman/tools/impl/computer/main_thread.rs new file mode 100644 index 0000000000..26697fc10f --- /dev/null +++ b/src/openhuman/tools/impl/computer/main_thread.rs @@ -0,0 +1,49 @@ +//! Main-thread bridge for synthetic input (mouse/keyboard). +//! +//! macOS's Text Input Source APIs (`TSMGetInputSourceProperty`), which enigo +//! calls during keyboard-layout lookup, **must run on the app's main thread**. +//! 
Running them on a tokio worker (or `spawn_blocking`) traps with +//! `_dispatch_assert_queue_fail` / `EXC_BREAKPOINT` and crashes the CEF host +//! (tracker §1.8 / Change 1.15 — confirmed via crash report). +//! +//! So the keyboard/mouse tools never call enigo on their own thread. They build +//! a closure and hand it to [`run_input_on_main`], which dispatches it — over +//! the native request registry — to a handler the Tauri shell registers at +//! startup, which runs it on the real main thread via +//! `AppHandle::run_on_main_thread`. + +use crate::core::event_bus::request_native_global; + +/// Native-registry method the Tauri shell handles to run an input op on the +/// main thread. The shell registers a handler under this key at startup. +pub const INPUT_ON_MAIN_THREAD_METHOD: &str = "computer.input_on_main_thread"; + +/// A synthetic-input operation to run on the app's main thread. `run` performs +/// the enigo calls and returns a human-readable success message (`Ok`) or an +/// error string (`Err`). Carried by value through the native registry (no +/// serialization — the boxed `FnOnce` passes through unchanged). +pub struct MainThreadInputOp { + pub run: Box Result + Send>, +} + +/// Dispatch `op` to the app main thread and await its result. +/// +/// Returns an error when no main-thread executor is registered (headless / CLI +/// builds have no Tauri main thread — synthetic input is a desktop capability). 
+pub async fn run_input_on_main(op: F) -> Result +where + F: FnOnce() -> Result + Send + 'static, +{ + let req = MainThreadInputOp { run: Box::new(op) }; + match request_native_global::>( + INPUT_ON_MAIN_THREAD_METHOD, + req, + ) + .await + { + Ok(inner) => inner, + Err(e) => Err(format!( + "synthetic input requires the desktop app's main-thread executor (unavailable: {e})" + )), + } +} diff --git a/src/openhuman/tools/impl/computer/mod.rs b/src/openhuman/tools/impl/computer/mod.rs index ec8363c0f3..6603105d9c 100644 --- a/src/openhuman/tools/impl/computer/mod.rs +++ b/src/openhuman/tools/impl/computer/mod.rs @@ -1,8 +1,10 @@ mod ax_interact; mod human_path; mod keyboard; +mod main_thread; mod mouse; pub use ax_interact::AxInteractTool; pub use keyboard::KeyboardTool; +pub use main_thread::{run_input_on_main, MainThreadInputOp, INPUT_ON_MAIN_THREAD_METHOD}; pub use mouse::MouseTool; diff --git a/src/openhuman/tools/impl/computer/mouse.rs b/src/openhuman/tools/impl/computer/mouse.rs index bcaf554e79..40f016e5bd 100644 --- a/src/openhuman/tools/impl/computer/mouse.rs +++ b/src/openhuman/tools/impl/computer/mouse.rs @@ -5,6 +5,7 @@ //! SendInput on Windows, X11/libxdo on Linux). use super::human_path::{human_path, HumanPathOptions}; +use super::main_thread::run_input_on_main; use crate::openhuman::security::SecurityPolicy; use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolResult}; use async_trait::async_trait; @@ -226,69 +227,57 @@ impl Tool for MouseTool { "move" => { let (x, y) = require_xy(&args)?; let human_like = human_like_enabled(&args)?; - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - humanized_move(&mut enigo, x, y, human_like)?; - info!( - tool = "mouse", - action = "move", - x = x, - y = y, - "[computer] cursor moved" - ); - Ok(ToolResult::success(format!("Moved cursor to ({x}, {y})"))) - }) - .await? 
+ into_result( + "move", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + humanized_move(&mut enigo, x, y, human_like).map_err(|e| e.to_string())?; + Ok(format!("Moved cursor to ({x}, {y})")) + }) + .await, + ) } "click" => { let (x, y) = require_xy(&args)?; let button = parse_button(&args)?; let human_like = human_like_enabled(&args)?; - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - humanized_move(&mut enigo, x, y, human_like)?; - enigo - .button(button, Direction::Click) - .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?; - info!( - tool = "mouse", action = "click", - x = x, y = y, button = ?button, - "[computer] clicked" - ); - Ok(ToolResult::success(format!( - "Clicked {button:?} at ({x}, {y})" - ))) - }) - .await? + into_result( + "click", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + humanized_move(&mut enigo, x, y, human_like).map_err(|e| e.to_string())?; + enigo + .button(button, Direction::Click) + .map_err(|e| format!("button click failed: {e}"))?; + Ok(format!("Clicked {button:?} at ({x}, {y})")) + }) + .await, + ) } "double_click" => { let (x, y) = require_xy(&args)?; let button = parse_button(&args)?; let human_like = human_like_enabled(&args)?; - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - humanized_move(&mut enigo, x, y, human_like)?; - enigo - .button(button, Direction::Click) - .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?; - enigo - .button(button, Direction::Click) - .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?; - info!( - tool = "mouse", action = "double_click", - x = x, y = 
y, button = ?button, - "[computer] double-clicked" - ); - Ok(ToolResult::success(format!( - "Double-clicked {button:?} at ({x}, {y})" - ))) - }) - .await? + into_result( + "double_click", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + humanized_move(&mut enigo, x, y, human_like).map_err(|e| e.to_string())?; + enigo + .button(button, Direction::Click) + .map_err(|e| format!("button click failed: {e}"))?; + enigo + .button(button, Direction::Click) + .map_err(|e| format!("button click failed: {e}"))?; + Ok(format!("Double-clicked {button:?} at ({x}, {y})")) + }) + .await, + ) } "drag" => { @@ -308,44 +297,40 @@ impl Tool for MouseTool { let sx = start_x as i32; let sy = start_y as i32; - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - humanized_move(&mut enigo, sx, sy, human_like)?; - enigo - .button(button, Direction::Press) - .map_err(|e| anyhow::anyhow!("button press failed: {e}"))?; - - // After press succeeds, guarantee release even on error. - let drag_result: Result<(), anyhow::Error> = (|| { - humanized_move(&mut enigo, end_x, end_y, human_like)?; - Ok(()) - })(); - - // Always release — best-effort cleanup. - if let Err(e) = enigo.button(button, Direction::Release) { - warn!( - tool = "mouse", - button = ?button, - error = %e, - "[computer] best-effort button release failed during drag cleanup" - ); - } - - // Propagate the drag error if the move failed. - drag_result?; - - info!( - tool = "mouse", action = "drag", - start_x = sx, start_y = sy, - end_x = end_x, end_y = end_y, button = ?button, - "[computer] dragged" - ); - Ok(ToolResult::success(format!( - "Dragged {button:?} from ({sx}, {sy}) to ({end_x}, {end_y})" - ))) - }) - .await? 
+ into_result( + "drag", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + humanized_move(&mut enigo, sx, sy, human_like) + .map_err(|e| e.to_string())?; + enigo + .button(button, Direction::Press) + .map_err(|e| format!("button press failed: {e}"))?; + + // After press succeeds, guarantee release even on error. + let drag_result: Result<(), String> = (|| { + humanized_move(&mut enigo, end_x, end_y, human_like) + .map_err(|e| e.to_string())?; + Ok(()) + })(); + + // Always release — best-effort cleanup. + if let Err(e) = enigo.button(button, Direction::Release) { + warn!( + tool = "mouse", + button = ?button, + error = %e, + "[computer] best-effort button release failed during drag cleanup" + ); + } + drag_result?; + Ok(format!( + "Dragged {button:?} from ({sx}, {sy}) to ({end_x}, {end_y})" + )) + }) + .await, + ) } "scroll" => { @@ -373,31 +358,25 @@ impl Tool for MouseTool { )); } - tokio::task::spawn_blocking(move || { - let mut enigo = Enigo::new(&Settings::default()) - .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - if scroll_y != 0 { - enigo - .scroll(scroll_y, enigo::Axis::Vertical) - .map_err(|e| anyhow::anyhow!("vertical scroll failed: {e}"))?; - } - if scroll_x != 0 { - enigo - .scroll(scroll_x, enigo::Axis::Horizontal) - .map_err(|e| anyhow::anyhow!("horizontal scroll failed: {e}"))?; - } - info!( - tool = "mouse", - action = "scroll", - scroll_x = scroll_x, - scroll_y = scroll_y, - "[computer] scrolled" - ); - Ok(ToolResult::success(format!( - "Scrolled (x={scroll_x}, y={scroll_y})" - ))) - }) - .await? 
+ into_result( + "scroll", + run_input_on_main(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| format!("Failed to create enigo instance: {e}"))?; + if scroll_y != 0 { + enigo + .scroll(scroll_y, enigo::Axis::Vertical) + .map_err(|e| format!("vertical scroll failed: {e}"))?; + } + if scroll_x != 0 { + enigo + .scroll(scroll_x, enigo::Axis::Horizontal) + .map_err(|e| format!("horizontal scroll failed: {e}"))?; + } + Ok(format!("Scrolled (x={scroll_x}, y={scroll_y})")) + }) + .await, + ) } other => Ok(ToolResult::error(format!( @@ -407,6 +386,20 @@ impl Tool for MouseTool { } } +/// Map a main-thread input op result to a `ToolResult`, logging the outcome. +fn into_result(action: &str, r: Result) -> anyhow::Result { + match r { + Ok(msg) => { + info!(tool = "mouse", action, "[computer] {msg}"); + Ok(ToolResult::success(msg)) + } + Err(e) => { + warn!(tool = "mouse", action, "[computer] failed: {e}"); + Ok(ToolResult::error(e)) + } + } +} + #[cfg(test)] #[path = "mouse_tests.rs"] mod tests; From b96bd279191fe760ce0582270c24df5496cf2a80 Mon Sep 17 00:00:00 2001 From: M3gA-Mind Date: Thu, 4 Jun 2026 14:14:37 +0530 Subject: [PATCH 2/9] =?UTF-8?q?feat(accessibility):=20AX/UIA=20perception?= =?UTF-8?q?=20+=20automate=20perceive=E2=86=92act=E2=86=92settle=20loop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the Rust-internal automate engine (poll-until-stable settle, playback verification), the AXEnabled diagnostics field + settle primitives on ax_interact, the Music fast-path, and the Windows UIA superset. Exposes launch_platform as pub(crate) so the automate loop can launch apps mid-flow. Slice 2/7 of #3307 (accessibility/automate engine). 
--- docs/voice-automate-plan.md | 152 +++++ .../app_fastpaths/fastpaths_tests.rs | 202 +++++++ .../accessibility/app_fastpaths/mod.rs | 34 ++ .../accessibility/app_fastpaths/music.rs | 520 +++++++++++++++++ src/openhuman/accessibility/automate.rs | 540 ++++++++++++++++++ src/openhuman/accessibility/automate_tests.rs | 266 +++++++++ src/openhuman/accessibility/ax_interact.rs | 118 +++- .../accessibility/ax_interact_tests.rs | 21 + src/openhuman/accessibility/helper.rs | 17 +- src/openhuman/accessibility/mod.rs | 2 + src/openhuman/accessibility/uia_interact.rs | 3 + src/openhuman/tools/impl/system/launch_app.rs | 6 +- src/openhuman/tools/impl/system/mod.rs | 2 + 13 files changed, 1878 insertions(+), 5 deletions(-) create mode 100644 docs/voice-automate-plan.md create mode 100644 src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs create mode 100644 src/openhuman/accessibility/app_fastpaths/mod.rs create mode 100644 src/openhuman/accessibility/app_fastpaths/music.rs create mode 100644 src/openhuman/accessibility/automate.rs create mode 100644 src/openhuman/accessibility/automate_tests.rs diff --git a/docs/voice-automate-plan.md b/docs/voice-automate-plan.md new file mode 100644 index 0000000000..217e769b69 --- /dev/null +++ b/docs/voice-automate-plan.md @@ -0,0 +1,152 @@ +# Phase 1.5 Implementation Plan — `automate(app, goal)` + +**Parent tracker:** [`voice-system-actions.md`](voice-system-actions.md) (Change 1.14 / Phase 1.5) +**Decided approach:** Rust inner loop + fast model (chat LLM out of the click loop) +**First proof target:** Music — "play ``" end-to-end +**Status:** Plan — awaiting approval before code + +--- + +## 1. 
Goal + +Turn a single high-level intent ("play Numb by Linkin Park") into a multi-step UI +automation that completes in **one tool call from the orchestrator**, runs fast, +and self-corrects — instead of N separate chat-LLM turns over the raw +`ax_interact` primitives (today's flow; see tracker §1.10–1.13 for why that's +slow and fragile). + +## 2. Architecture + +``` + orchestrator (chat LLM) + │ one call: automate{ app, goal } + ▼ + AutomateTool (tools/impl/computer/automate.rs) + │ delegates to + ▼ + accessibility::automate::run(app, goal) ← the inner loop (Rust) + │ + ├─ fast-path dispatch ── app_fastpaths/{music,spotify,slack}.rs + │ (deterministic; skip the loop entirely when available) + │ + └─ general loop ──► perceive → decide → act → settle → verify ──┐ + ▲ │ + └────────────── repeat until done / fail / budget ───────┘ + perceive: ax_list_elements_filtered (existing) + decide: create_chat_provider("automation", cfg) → JSON action + act: ax_press_element / ax_set_field_value / launch_app (existing) + settle: helper "ax_wait_settled" (new) — AXObserver, not sleep + verify: re-read state; confirm the action took effect +``` + +The **chat model is invoked once** (to pick `automate` and its `goal`). The +**fast model** runs the inner loop with a tiny context (goal + current filtered +snapshot + last result), so each step is ~0.5–1s and cheap. + +## 3. Inner-loop algorithm + +State carried across iterations: `goal`, `app`, `history: Vec`, `budget`. + +Each iteration: +1. **Perceive** — `ax_list_elements_filtered(app, last_filter_or_"")`, capped/filtered + exactly as the `ax_interact` tool does today (≤60 elements, never a raw dump). +2. **Decide** — call the fast model with a strict system prompt + the JSON action + schema (below). Parse one action. +3. **Act** — execute via existing helpers. `launch` → `launch_app`; `press` → + `ax_press_element`; `set_value` → `ax_set_field_value`; `list` → just re-perceive + with a new filter. +4. 
**Settle** — `ax_wait_settled(app, timeout)` (new helper): block until the AX + tree stops changing (debounced AXObserver notifications) or timeout. Removes the + timing-race class deterministically. +5. **Verify** — re-read; confirm the expected post-condition (e.g. a new control + appeared, focus changed, a value was set). Record success/failure in `history`. +6. **Loop** until the model emits `done`/`fail`, or the step budget (e.g. 12) is hit. + +### Action schema (fast model output — strict JSON) +```jsonc +{ + "thought": "short reasoning", + "action": "launch | list | press | set_value | done | fail", + "app": "Music", // optional override; defaults to the task app + "filter": "Highway", // for list + "label": "Play", // for press / set_value + "value": "Highway to Hell", // for set_value + "summary": "what happened / why done" // for done|fail +} +``` +Invalid JSON or unknown action → one repair retry, then `fail` with the raw text +logged (never act on a guess — this is the §1.13 hallucination lesson). + +## 4. New files & changes (grounded in current layout) + +**New** +- `src/openhuman/accessibility/automate.rs` — `run(app, goal, opts) -> Result`; the loop, action schema (serde), fast-model call, step budget, structured `history`. +- `src/openhuman/accessibility/app_fastpaths/mod.rs` + `music.rs` (Spotify/Slack land later) — `try_fastpath(app, goal) -> Option>`. +- `src/openhuman/tools/impl/computer/automate.rs` — `AutomateTool { allow_mutations }`; reuses the `ax_interact` gating posture (mutations opt-in, `SENSITIVE_APPS` denylist, `permission_level_with_args` = Dangerous, `external_effect_with_args` = true). +- `src/openhuman/accessibility/automate_tests.rs` — unit tests for the loop (mock perceive/act/decide), schema parse/repair, budget, fast-path dispatch. 
+ +**Changed** +- `accessibility/helper.rs` (macOS Swift) — add `ax_wait_settled` (AXObserver on `kAXValueChanged`/`kAXFocusedUIElementChanged`/`kAXCreated`, debounce ~150ms, bounded ~3s) and return richer element fields (enabled / on-screen / supported actions) from `ax_list`. +- `accessibility/ax_interact.rs` — surface a `ax_wait_settled` Rust wrapper; extend `AXElement` with the new optional fields (back-compat: `#[serde(default)]`). +- `accessibility/mod.rs` — declare `automate`, `app_fastpaths`. +- `inference/provider/factory.rs` — add an `"automation"` role (falls back to the fast/summarization tier) so the loop's model is independently configurable. +- `tools/ops.rs` (`all_tools_with_runtime`), `tools/user_filter.rs` (new `"automate"` family), `agent_registry/agents/orchestrator/agent.toml` (`named` list), `app/src/utils/toolDefinitions.ts` (Settings → Agent Access toggle). +- Tracker: flip Change 1.14 / Phase 1.5 rows from ⏳ Planned → in progress as milestones land. + +## 5. Fast-model call + +`create_chat_provider("automation", &cfg)` → `(provider, model)`; build a +`ChatRequest { messages, tools: None, stream: None }` with a system prompt that +pins the JSON schema and a user message carrying `{goal, snapshot, history_tail}`. +No tools array — we want a single JSON object back, parsed by us, executed by us. +Temperature low. Token budget small (snapshot is already ≤60 elements). + +## 6. Music proof (first target) + +`app_fastpaths/music.rs` encodes the §1.11 proven sequence behind one entry: +1. `launch_app("Music")` +2. open `music://music.apple.com/search?term=` (URL scheme) +3. `ax_wait_settled` +4. `ax_list_elements_filtered("Music", )` → find the song row +5. `ax_press_element` the row (navigate into detail) +6. `ax_wait_settled` → `ax_list` the detail page → `ax_press_element("Play")` +7. 
verify `osascript … get player state == playing` (best-effort, logged) + +If the fast-path can't find the row (timing/locale), fall through to the **general +loop**, which is what proves the architecture is app-agnostic. + +## 7. Progress streaming + +Emit a `DomainEvent` per step (`AutomateProgress { app, step, action, ok }`) on the +event bus; a subscriber bridges to the existing notch/voice status surface +(PR #3166) so the user sees "Opening Music → searching → playing" live. Reuses the +`ApprovalSurfaceSubscriber` bridging pattern. + +## 8. Testing + +- **Unit** (`automate_tests.rs`, CI-safe): action JSON parse + repair; budget exhaustion → `fail`; fast-path dispatch chosen over loop; verify-failure triggers retry/alternate. Perceive/act/decide are trait-injected so tests need no mic/AX/LLM. +- **Integration** (`#[ignore]`, run on a real Mac): the Music flow end-to-end (mirrors `ax_interact_tests::test_full_flow_search_and_play_acdc`); tool-level success hard-asserted, playback best-effort. +- **Agent-in-the-loop**: ask the running app "play ``", confirm it picks `automate` and the song plays; watch `[automate]` logs. + +## 9. Milestones (sequenced) + +1. **M1** — `automate.rs` loop skeleton + action schema + fast-model call + `AutomateTool` (gated, registered). Loop runs against existing (non-settled) `ax_interact` helpers. Unit tests. *Compiles + agent can call it.* +2. **M2** — `ax_wait_settled` (helper + wrapper) + verify step wired into the loop. Kills the timing-race class. +3. **M3** — Music fast-path; prove the flow end-to-end on a Mac. +4. **M4** — progress streaming to the notch surface. +5. **M5** — richer element model (enabled/onscreen/actions) for better matching. +6. *(later)* Spotify + Slack fast-paths; vision fallback for Electron; Windows UIA settle parity. + +## 10. Risks / open questions + +- **Fast model availability** — if no fast tier is configured, fall back to the + chat model for the loop (still one tool call; just slower). 
The `"automation"` + role makes this a config decision, not a hard dependency. +- **AXObserver from the Swift helper** — needs a short run-loop pump; if flaky, + fall back to a polling settle (count-stable-for-150ms) behind the same wrapper. +- **macOS-only first** — Windows UIA settle/verify parity is M6, gated like the + existing cfg-dispatch; non-mac/non-win returns the existing clean runtime error. +- **Safety** — `automate` is a mutating tool: same opt-in + `SENSITIVE_APPS` + denylist + ApprovalGate routing as `ax_interact`; the inner loop may not target a + denylisted app even if the model asks. +``` diff --git a/src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs b/src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs new file mode 100644 index 0000000000..f804c0d056 --- /dev/null +++ b/src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs @@ -0,0 +1,202 @@ +//! Tests for the app fast-paths: pure query parsing + the Music sequence via a +//! scripted backend (no live Music, no model). + +use super::super::automate::{AutomateBackend, AutomateOutcome}; +use super::super::ax_interact::AXElement; +use super::music; +use async_trait::async_trait; +use std::sync::Mutex; + +// ── Pure parser tests ─────────────────────────────────────────────── + +#[test] +fn matches_music_play_intents() { + assert!(music::matches("Music", "play Numb by Linkin Park")); + assert!(music::matches("Apple Music", "play Highway to Hell")); + assert!(music::matches("music", "launch music and play Numb")); + // Not a play intent → no fast-path. + assert!(!music::matches("Music", "pause")); + // Not Music → no fast-path. 
+ assert!(!music::matches("Slack", "play Numb")); +} + +#[test] +fn extract_query_basic() { + assert_eq!( + music::extract_play_query("play Numb by Linkin Park").as_deref(), + Some("Numb Linkin Park") + ); +} + +#[test] +fn extract_query_strips_filler_and_suffix() { + assert_eq!( + music::extract_play_query("play the song Highway to Hell by AC/DC").as_deref(), + Some("Highway to Hell AC/DC") + ); + assert_eq!( + music::extract_play_query("play Numb in Apple Music").as_deref(), + Some("Numb") + ); +} + +#[test] +fn extract_query_after_launch_clause() { + assert_eq!( + music::extract_play_query("launch Music and play Numb").as_deref(), + Some("Numb") + ); +} + +#[test] +fn extract_query_rejects_non_play() { + assert_eq!(music::extract_play_query("pause the music"), None); + assert_eq!(music::extract_play_query("display settings"), None); // "play" inside "display" + assert_eq!(music::extract_play_query("play"), None); // nothing after +} + +#[test] +fn extract_query_from_quoted_title_with_artist() { + // The exact goal that failed live: song quoted earlier, sentence ends "…play it". + assert_eq!( + music::extract_play_query( + "launch Music app, search for \"Highway to Hell\" by AC/DC, and play it" + ) + .as_deref(), + Some("Highway to Hell AC/DC") + ); + assert_eq!( + music::extract_play_query("play \"Numb\" by Linkin Park").as_deref(), + Some("Numb Linkin Park") + ); + // Quoted title, no artist. + assert_eq!( + music::extract_play_query("please play \"Bohemian Rhapsody\"").as_deref(), + Some("Bohemian Rhapsody") + ); +} + +#[test] +fn extract_query_rejects_bare_pronoun() { + // No song name anywhere → decline (let the general loop / a clarifier handle it). 
+ assert_eq!(music::extract_play_query("play it"), None); + assert_eq!(music::extract_play_query("play something"), None); + assert!(!music::matches("Music", "play it")); +} + +// ── Sequence test via scripted backend ────────────────────────────── + +struct Backend { + acts: Mutex>, + /// Elements returned by perceive (the search results screen). + elements: Vec, + press_fail_on: Option, +} + +impl Backend { + fn new(elements: Vec) -> Self { + Self { + acts: Mutex::new(Vec::new()), + elements, + press_fail_on: None, + } + } + fn acts(&self) -> Vec { + self.acts.lock().unwrap().clone() + } +} + +#[async_trait] +impl AutomateBackend for Backend { + async fn perceive(&self, _app: &str, _filter: &str) -> Result, String> { + Ok(self.elements.clone()) + } + async fn decide(&self, _system: &str, _user: &str) -> Result { + Err("fast-path must not call the model".into()) + } + async fn act_launch(&self, app: &str) -> Result { + self.acts.lock().unwrap().push(format!("launch:{app}")); + Ok("ok".into()) + } + async fn act_press(&self, app: &str, label: &str) -> Result { + self.acts + .lock() + .unwrap() + .push(format!("press:{app}:{label}")); + if self.press_fail_on.as_deref() == Some(label) { + return Err("press failed".into()); + } + Ok("ok".into()) + } + async fn act_set_value(&self, _a: &str, _l: &str, _v: &str) -> Result { + Ok("ok".into()) + } + async fn open_url(&self, url: &str) -> Result { + self.acts.lock().unwrap().push(format!("open_url:{url}")); + Ok("ok".into()) + } + async fn settle(&self, _app: &str) {} + async fn wait(&self, _ms: u64) {} +} + +fn song_row(label: &str) -> AXElement { + AXElement::new("AXCell", label) +} + +#[tokio::test] +async fn music_fastpath_full_sequence() { + let backend = Backend::new(vec![song_row("Numb"), AXElement::new("AXButton", "Play")]); + let out = music::run("play Numb by Linkin Park", &backend).await; + assert!(out.success, "expected success: {out:?}"); + let acts = backend.acts(); + // launch → open search url → press the 
row → press detail Play. + assert_eq!(acts[0], "launch:Music"); + assert!(acts[1].starts_with("open_url:music://"), "got {}", acts[1]); + assert!(acts.contains(&"press:Music:Numb".to_string()), "{acts:?}"); + assert!(acts.contains(&"press:Music:Play".to_string()), "{acts:?}"); +} + +#[tokio::test] +async fn music_fastpath_no_row_fails_for_fallthrough() { + // Search screen has nothing matching → fast-path fails (loop falls through). + let backend = Backend::new(vec![AXElement::new("AXButton", "Some Unrelated Button")]); + let out = music::run("play Numb", &backend).await; + assert!(!out.success); + assert!(out.summary.contains("no matching song"), "{}", out.summary); +} + +#[tokio::test] +async fn music_fastpath_presses_row_even_if_reported_disabled() { + // Apple Music reports pressable result rows as enabled=Some(false); the + // fast-path must still press them (regression guard for the M5 mis-gate). + let mut row = AXElement::new("AXCell", "Numb"); + row.enabled = Some(false); + let backend = Backend::new(vec![row, AXElement::new("AXButton", "Play")]); + let out = music::run("play Numb", &backend).await; + assert!(out.success, "must press a 'disabled'-reported row: {out:?}"); + assert!(backend.acts().contains(&"press:Music:Numb".to_string())); +} + +#[tokio::test] +async fn try_fastpath_dispatches_music_and_skips_others() { + let backend = Backend::new(vec![song_row("Numb")]); + // Non-music app → None (general loop handles it). + assert!(super::try_fastpath("Slack", "play Numb", &backend) + .await + .is_none()); + // Music + play → Some. + assert!(super::try_fastpath("Music", "play Numb", &backend) + .await + .is_some()); +} + +// Outcome type sanity: fast-paths build the same outcome the loop returns. 
+#[test] +fn outcome_shape() { + let o = AutomateOutcome { + success: true, + summary: "x".into(), + steps: vec![], + }; + assert!(o.success); +} diff --git a/src/openhuman/accessibility/app_fastpaths/mod.rs b/src/openhuman/accessibility/app_fastpaths/mod.rs new file mode 100644 index 0000000000..534d7299b5 --- /dev/null +++ b/src/openhuman/accessibility/app_fastpaths/mod.rs @@ -0,0 +1,34 @@ +//! Deterministic per-app accelerators for the `automate` loop. +//! +//! A fast-path encodes a *proven* native sequence for a common (app, intent) +//! pair so the loop doesn't have to rediscover it with the model every time. +//! [`try_fastpath`] is consulted **before** the general loop and returns: +//! - `Some(success)` → the loop returns it directly, +//! - `Some(failure)` → the loop logs and falls through to the model loop, +//! - `None` → no fast-path applies; straight to the model loop. +//! +//! So a fast-path can only *help*. This is deliberately different from the +//! removed `play_music` tool (tracker §1.13): that was a separate tool the LLM +//! had to choose (and chose wrong); this is internal to `automate`, transparent, +//! and always backed by the general loop. + +use super::automate::AutomateBackend; +use super::automate::AutomateOutcome; + +mod music; + +/// Try every registered fast-path; return the first that claims the (app, goal). +pub async fn try_fastpath( + app: &str, + goal: &str, + backend: &dyn AutomateBackend, +) -> Option { + if music::matches(app, goal) { + return Some(music::run(goal, backend).await); + } + None +} + +#[cfg(test)] +#[path = "fastpaths_tests.rs"] +mod tests; diff --git a/src/openhuman/accessibility/app_fastpaths/music.rs b/src/openhuman/accessibility/app_fastpaths/music.rs new file mode 100644 index 0000000000..a87079e535 --- /dev/null +++ b/src/openhuman/accessibility/app_fastpaths/music.rs @@ -0,0 +1,520 @@ +//! Apple Music fast-path: "play ``". +//! +//! 
Encodes the sequence empirically proven in tracker §1.11: open the Music +//! search URL scheme, press the matching song row to **navigate** into it, then +//! press the detail-page **Play** (a search-result press only selects/navigates; +//! the second Play press is what actually starts playback). All steps go through +//! the injectable [`AutomateBackend`], so the whole flow is unit-testable with a +//! scripted backend — no live Music, no model. + +use super::AutomateBackend; +use super::AutomateOutcome; + +const APP: &str = "Music"; + +/// Element roles that represent a tappable search result / song row. +const ROW_ROLES: &[&str] = &["AXCell", "AXRow", "ListItem", "AXButton", "AXStaticText"]; + +/// Does this (app, goal) look like an Apple Music "play X" request? +pub fn matches(app: &str, goal: &str) -> bool { + is_music_app(app) && extract_play_query(goal).is_some() +} + +/// True for the Apple Music app under its common display names. +fn is_music_app(app: &str) -> bool { + let a = app.trim().to_lowercase(); + a == "music" || a == "apple music" || a == "itunes" +} + +/// Pull the search query out of a "play …" goal, or `None` if it isn't one. +/// +/// Two strategies, in order: +/// 1. **Quoted title** — the orchestrator usually quotes the song, e.g. +/// `search for "Highway to Hell" by AC/DC, and play it`. Use the first +/// quoted span, plus any `by ` that immediately follows it. This is +/// robust to where "play" sits in the sentence (it was the bug: a goal +/// ending in "…and play it" made the after-"play" strategy extract "it"). +/// 2. **After "play"** — `play Numb by Linkin Park`, `play the song X`, etc. +/// +/// Either way: drop leading `the song`/`track` filler, a trailing +/// `in/on (apple) music`, rewrite ` by ` to a space (better catalog recall), +/// and reject bare pronouns ("it"/"this"/…) that carry no song name. +pub fn extract_play_query(goal: &str) -> Option { + // Strategy 1: first quoted title (+ trailing "by artist"). 
+ if let Some((title, rest)) = first_quoted(goal) { + let mut q = title.trim().to_string(); + if let Some(artist) = trailing_by_artist(rest) { + q.push(' '); + q.push_str(&artist); + } + let q = clean_query(&q); + if !q.is_empty() && !is_pronoun(&q) { + return Some(q); + } + } + + // Strategy 2: text after the last word-boundary "play". + let lower = goal.to_lowercase(); + let idx = lower.rfind("play")?; + let before_ok = idx == 0 + || !lower[..idx] + .chars() + .next_back() + .map(|c| c.is_alphabetic()) + .unwrap_or(false); + if !before_ok { + return None; + } + let after = &goal[idx + "play".len()..]; + let mut q = after.trim().to_string(); + for filler in ["the song ", "the track ", "song ", "track ", "me "] { + if q.to_lowercase().starts_with(filler) { + q = q[filler.len()..].to_string(); + break; + } + } + let q = clean_query(&q); + if q.is_empty() || is_pronoun(&q) { + None + } else { + Some(q) + } +} + +/// Strip a trailing "(in|on) [apple] music" and rewrite " by " → " ". +fn clean_query(q: &str) -> String { + let mut q = q.trim().to_string(); + let ql = q.to_lowercase(); + for suffix in [ + " in apple music", + " on apple music", + " in music", + " on music", + ] { + if ql.ends_with(suffix) { + q.truncate(q.len() - suffix.len()); + break; + } + } + replace_ci(&q, " by ", " ").trim().to_string() +} + +/// A query that's just a pronoun / generic noun carries no song — reject it so +/// the fast-path declines and the general loop (or a clarifying reply) handles it. +fn is_pronoun(q: &str) -> bool { + matches!( + q.trim().to_lowercase().as_str(), + "it" | "this" | "that" | "them" | "something" | "some music" | "music" | "a song" | "songs" + ) +} + +/// Return the first single- or double-quoted span and the text after its close. +fn first_quoted(s: &str) -> Option<(String, &str)> { + // Support straight and curly double quotes. 
+ let opens = ['"', '\u{201C}']; + let closes = ['"', '\u{201D}']; + let start = s.find(|c| opens.contains(&c))?; + let after_open = start + s[start..].chars().next()?.len_utf8(); + let rel = s[after_open..].find(|c| closes.contains(&c))?; + let inner = &s[after_open..after_open + rel]; + let close_end = after_open + rel + s[after_open + rel..].chars().next()?.len_utf8(); + if inner.trim().is_empty() { + return None; + } + Some((inner.to_string(), &s[close_end..])) +} + +/// If `rest` begins with `by `, capture the artist up to the next +/// clause boundary ("," / " and " / " then " / end). +fn trailing_by_artist(rest: &str) -> Option { + let t = rest.trim_start(); + let lower = t.to_lowercase(); + let after = lower.strip_prefix("by ")?; + let artist_region = &t[t.len() - after.len()..]; + // Cut at the first clause boundary. + let mut end = artist_region.len(); + for delim in [",", " and ", " then ", " in ", " on "] { + if let Some(p) = artist_region.to_lowercase().find(delim) { + end = end.min(p); + } + } + let artist = artist_region[..end].trim().to_string(); + if artist.is_empty() { + None + } else { + Some(artist) + } +} + +/// Case-insensitive replace of `needle` with `repl` in `haystack`. +fn replace_ci(haystack: &str, needle: &str, repl: &str) -> String { + let hl = haystack.to_lowercase(); + let nl = needle.to_lowercase(); + let mut out = String::with_capacity(haystack.len()); + let mut i = 0; + while i < haystack.len() { + if hl[i..].starts_with(&nl) { + out.push_str(repl); + i += needle.len(); + } else { + let ch = haystack[i..].chars().next().unwrap(); + out.push(ch); + i += ch.len_utf8(); + } + } + out +} + +/// Build the Apple Music search URL scheme for `query`. +fn search_url(query: &str) -> String { + format!( + "music://music.apple.com/search?term={}", + percent_encode(query) + ) +} + +/// Percent-encode the reserved characters that matter in a query value +/// (space + the URL delimiters). 
Enough for app URL schemes; not a full +/// RFC-3986 encoder. +fn percent_encode(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for b in s.bytes() { + match b { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => { + out.push(b as char) + } + _ => out.push_str(&format!("%{b:02X}")), + } + } + out +} + +/// The first query token worth filtering on (length > 2 so "to"/"by" don't +/// match everything). Used as the perceive filter: the snapshot's substring +/// filter can't match a whole multi-word title, so we narrow by one strong +/// token and let `pick_row` do the full token match. +fn first_token(query: &str) -> String { + query + .split_whitespace() + .find(|t| t.len() > 2) + .unwrap_or("") + .to_string() +} + +/// Choose the best matching row from a perceive snapshot: an exact label match +/// first, else the first row-role element whose label shares a word with the +/// query. Returns the element label to press. +fn pick_row(elements: &[super::super::ax_interact::AXElement], query: &str) -> Option { + let ql = query.to_lowercase(); + // Exact label match wins. (We deliberately do NOT skip elements whose + // reported `enabled` is false — Apple Music marks pressable result rows as + // disabled; see AXElement::enabled docs.) + if let Some(e) = elements.iter().find(|e| e.label.to_lowercase() == ql) { + return Some(e.label.clone()); + } + let tokens: Vec<&str> = ql.split_whitespace().filter(|t| t.len() > 2).collect(); + elements + .iter() + .filter(|e| ROW_ROLES.iter().any(|r| e.role.contains(r))) + .find(|e| { + let l = e.label.to_lowercase(); + tokens.iter().any(|t| l.contains(t)) + }) + .map(|e| e.label.clone()) +} + +/// Run the play fast-path. Returns a failed [`AutomateOutcome`] (not a panic) +/// whenever a step can't proceed, so the caller falls through to the general +/// loop. 
+pub async fn run(goal: &str, backend: &dyn AutomateBackend) -> AutomateOutcome { + let mut steps: Vec = Vec::new(); + let query = match extract_play_query(goal) { + Some(q) => q, + None => { + return fail("not a play request", steps); + } + }; + log::info!("[automate::music] ▶ play query={query:?}"); + use super::super::automate::progress; + use crate::openhuman::overlay::OverlayAttentionTone; + progress( + format!("Searching Music for {query}…"), + OverlayAttentionTone::Accent, + ); + + // 1. Launch Music. + match backend.act_launch(APP).await { + Ok(m) => steps.push(format!("launch: {m}")), + Err(e) => steps.push(format!("launch FAILED: {e}")), + } + backend.settle(APP).await; + + // 2. Open the search URL. + let url = search_url(&query); + match backend.open_url(&url).await { + Ok(m) => steps.push(format!("search: {m}")), + Err(e) => { + steps.push(format!("search url FAILED: {e}")); + return fail("could not open Music search", steps); + } + } + // 3. Find the song row and press it to navigate in. Search results render + // asynchronously (the §1.13 timing race), so retry across settles, and + // filter the snapshot by one strong token (a substring filter can't + // match a whole multi-word title). + let filter = first_token(&query); + let mut row = None; + for attempt in 0..6 { + backend.settle(APP).await; + let els = backend.perceive(APP, &filter).await.unwrap_or_default(); + if let Some(r) = pick_row(&els, &query) { + row = Some(r); + break; + } + // Catalog search results arrive asynchronously (~3-4s); element-count + // settle can report "stable" while the network fetch is still pending, + // so wait real time between attempts rather than spinning instantly. 
+ log::info!("[automate::music] search results not ready (attempt {attempt}), waiting"); + backend.wait(800).await; + } + let row = match row { + Some(r) => r, + None => return fail("no matching song row found", steps), + }; + // Baseline count of "Play" controls *before* navigating, so we can tell + // when the song's detail-page Play has actually rendered (vs. only the + // toolbar transport Play that's always present). + let plays_before = count_play_buttons(backend).await; + + match backend.act_press(APP, &row).await { + Ok(m) => steps.push(format!("open song: {m}")), + Err(e) => { + steps.push(format!("open song FAILED: {e}")); + return fail("could not open the song", steps); + } + } + + // 4. Wait for the detail-page Play to appear. Pressing too early hits only + // the toolbar transport (empty queue → silence) — the exact false-success + // we hit live. Poll until a new Play control shows up (or give up after a + // few settles and try anyway). + for _ in 0..5 { + backend.settle(APP).await; + if count_play_buttons(backend).await > plays_before { + break; + } + } + + // 5. Press Play, then VERIFY real playback. If it didn't start, the press + // landed on the wrong Play — wait and retry a couple of times. Only + // report success when player state is actually "playing" (or the backend + // can't verify, in which case it's best-effort). + let mut verified: Option = None; + for attempt in 0..3 { + match backend.act_press(APP, "Play").await { + Ok(m) => steps.push(format!("play press (attempt {attempt}): {m}")), + Err(e) => steps.push(format!("play press FAILED: {e}")), + } + backend.settle(APP).await; + match backend.verify_playing().await { + Some(true) => { + verified = Some(true); + break; + } + Some(false) => { + verified = Some(false); + // Give the detail page a beat to settle, then retry. + tokio::time::sleep(std::time::Duration::from_millis(700)).await; + } + None => { + // Can't verify (non-macOS) — accept best-effort and stop. 
+ verified = None; + break; + } + } + } + + match verified { + Some(false) => { + steps.push("verify: player state never reached 'playing'".to_string()); + fail("opened the song but playback didn't start", steps) + } + Some(true) => { + steps.push("verify: playing ✓".to_string()); + progress(format!("Playing {query}"), OverlayAttentionTone::Success); + AutomateOutcome { + success: true, + summary: format!("Playing '{query}' in Music."), + steps, + } + } + None => AutomateOutcome { + success: true, + summary: format!("Started '{query}' in Music (playback unverified)."), + steps, + }, + } +} + +/// Count "Play"-labelled controls currently visible (toolbar + any detail-page +/// Play). Used to detect when navigation has rendered the song's own Play. +async fn count_play_buttons(backend: &dyn AutomateBackend) -> usize { + backend + .perceive(APP, "Play") + .await + .map(|els| { + els.iter() + .filter(|e| e.label.eq_ignore_ascii_case("Play")) + .count() + }) + .unwrap_or(0) +} + +fn fail(msg: &str, steps: Vec) -> AutomateOutcome { + AutomateOutcome { + success: false, + summary: format!("Music fast-path: {msg}"), + steps, + } +} + +#[cfg(test)] +mod unit { + use super::*; + + #[test] + fn first_token_skips_short_words() { + assert_eq!(first_token("Highway to Hell AC/DC"), "Highway"); + assert_eq!(first_token("Numb Linkin Park"), "Numb"); + // All-short → empty (perceive then falls back to a broad list). + assert_eq!(first_token("a x"), ""); + } + + #[test] + fn percent_encode_escapes_reserved() { + assert_eq!(percent_encode("Highway to Hell"), "Highway%20to%20Hell"); + // The slash in AC/DC must be encoded (this was the live-run bug). 
+ assert_eq!(percent_encode("AC/DC"), "AC%2FDC"); + assert_eq!(percent_encode("rock&roll"), "rock%26roll"); + } + + #[test] + fn search_url_is_well_formed() { + let u = search_url("Highway to Hell AC/DC"); + assert_eq!( + u, + "music://music.apple.com/search?term=Highway%20to%20Hell%20AC%2FDC" + ); + } + + #[test] + fn pick_row_prefers_exact_then_token() { + use super::super::super::ax_interact::AXElement; + let els = vec![ + AXElement::new("AXCell", "Highway to Hell"), + AXElement::new("AXButton", "Play"), + ]; + // Token match (query has extra "AC/DC" the row label lacks). + assert_eq!( + pick_row(&els, "Highway to Hell AC/DC").as_deref(), + Some("Highway to Hell") + ); + } +} + +/// Live integration test — drives the real Apple Music app. Ignored by default +/// (needs macOS, the Music app, and Accessibility permission for the runner). +/// +/// Run on a Mac with: +/// cargo test --lib music_fastpath_live -- --ignored --nocapture +#[cfg(all(test, target_os = "macos"))] +mod live { + use super::run; + use crate::openhuman::accessibility::automate::RealBackend; + + #[tokio::test] + #[ignore = "requires macOS + Music app + Accessibility permission"] + async fn music_fastpath_live() { + let backend = RealBackend::new(crate::openhuman::config::Config::default()); + let out = run("play Highway to Hell by AC/DC", &backend).await; + // Tool-level success is asserted; actual playback is best-effort + // (Apple Music's UI is nondeterministic — tracker §1.11/§1.13). + println!( + "[music_fastpath_live] success={} summary={}", + out.success, out.summary + ); + for s in &out.steps { + println!(" - {s}"); + } + let state = player_state(); + println!("[music_fastpath_live] player_state={state}"); + // Now that the flow verifies playback, hold it to the real bar: + // the song must actually be playing. 
+ assert!(out.success, "fast-path reported failure: {}", out.summary); + assert_eq!(state, "playing", "Music did not actually start playing"); + } + + /// `osascript` ground-truth for whether audio is actually playing. + fn player_state() -> String { + std::process::Command::new("osascript") + .args(["-e", "tell application \"Music\" to player state as string"]) + .output() + .ok() + .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string()) + .unwrap_or_else(|| "(osascript failed)".into()) + } + + /// Empirical probe (not an assertion): open the search, dump what Music's + /// AX tree actually exposes, and report player state before/after each + /// candidate press. Used to design the real play sequence. + #[tokio::test] + #[ignore = "probe — run manually to inspect Music's AX tree"] + async fn music_probe() { + use crate::openhuman::accessibility::ax_interact as ax; + let q = "Highway to Hell"; + let _ = std::process::Command::new("open") + .arg("-a") + .arg("Music") + .status(); + std::thread::sleep(std::time::Duration::from_secs(3)); + let _ = std::process::Command::new("open") + .arg(format!( + "music://music.apple.com/search?term={}", + q.replace(' ', "%20") + )) + .status(); + std::thread::sleep(std::time::Duration::from_secs(4)); + + println!("=== player state at start: {} ===", player_state()); + let dump = |label: &str, filter: &str| match ax::ax_list_elements_filtered("Music", filter) + { + Ok(els) => { + println!( + "--- {label} (filter={filter:?}): {} elements ---", + els.len() + ); + for e in els.iter().take(60) { + println!(" [{}] {} enabled={:?}", e.role, e.label, e.enabled); + } + } + Err(e) => println!("--- {label}: ERROR {e} ---"), + }; + dump("after search", "Highway"); + dump("play buttons", "Play"); + + // Press the first search-result row → does it navigate / play? 
+ println!("\n>>> pressing result 'Highway to Hell'"); + let _ = ax::ax_press_element("Music", "Highway to Hell"); + std::thread::sleep(std::time::Duration::from_secs(3)); + println!("=== player state after row press: {} ===", player_state()); + dump("detail page play", "Play"); + + // Try the detail-page Play (not the toolbar one) if still stopped. + if player_state() != "playing" { + println!("\n>>> pressing 'Play' after navigate"); + let _ = ax::ax_press_element("Music", "Play"); + std::thread::sleep(std::time::Duration::from_secs(3)); + println!("=== player state after Play press: {} ===", player_state()); + } + } +} diff --git a/src/openhuman/accessibility/automate.rs b/src/openhuman/accessibility/automate.rs new file mode 100644 index 0000000000..3c9955bcf5 --- /dev/null +++ b/src/openhuman/accessibility/automate.rs @@ -0,0 +1,540 @@ +//! `automate` — Rust-driven multi-step UI automation loop. +//! +//! Phase 1.5 (see `docs/voice-automate-plan.md`). The chat orchestrator calls +//! `automate{app, goal}` **once**; this module then runs the whole multi-step +//! flow internally with a *fast* model, so the heavy chat model never sits +//! inside the click loop. Each iteration is **perceive → decide → act → +//! settle → verify**: +//! +//! - **perceive** — read a small, filtered accessibility snapshot of the app +//! (`ax_interact::ax_list_elements_filtered`, capped — never a raw dump, +//! which is what made the chat model hallucinate; tracker §1.13). +//! - **decide** — ask the fast model for exactly one JSON action. +//! - **act** — run it via the existing AX primitives / `launch_app`. +//! - **settle** — wait for the UI to stop changing (M2 makes this real; the +//! M1 backend uses a short fixed wait). +//! - **verify** — fold the post-action snapshot back into the next prompt. +//! +//! The loop is generic over an [`AutomateBackend`] so the decision model, the +//! accessibility calls, and the launcher are all injectable — the unit tests +//! 
drive a scripted backend with no mic, no AX tree, and no LLM. + +use super::ax_interact as ax; +use crate::openhuman::overlay::{publish_attention, OverlayAttentionEvent, OverlayAttentionTone}; +use async_trait::async_trait; +use serde::Deserialize; + +const LOG_PREFIX: &str = "[automate]"; + +/// Push a one-line progress message to the notch / overlay so the user sees the +/// automation happening live (M4). Fire-and-forget: a no-op when nothing is +/// subscribed (e.g. unit tests, or the notch window isn't running). +pub(crate) fn progress(message: impl Into, tone: OverlayAttentionTone) { + let _ = publish_attention( + OverlayAttentionEvent::new(message) + .with_source("automate") + .with_tone(tone) + .with_ttl_ms(5000), + ); +} + +/// Default ceiling on loop iterations. Each iteration is one fast-model call +/// plus one action, so this bounds latency and cost even if the model never +/// emits `done`. +pub const DEFAULT_STEP_BUDGET: u32 = 12; + +/// How many elements a perceive snapshot renders into the prompt. Mirrors the +/// `ax_interact` tool cap so a broad/empty filter can't overflow the model's +/// context and trigger the truncation→hallucination failure (tracker §1.13). +const MAX_SNAPSHOT: usize = 40; + +/// One decoded action from the fast model. +#[derive(Debug, Clone, Deserialize, Default, PartialEq)] +pub struct Action { + /// The model's short reasoning. Logged, never executed. + #[serde(default)] + pub thought: String, + /// One of: `launch`, `list`, `press`, `set_value`, `done`, `fail`. + pub action: String, + /// Optional per-action app override; defaults to the task's app. + #[serde(default)] + pub app: Option, + /// Substring filter for `list`. + #[serde(default)] + pub filter: String, + /// Element label for `press` / `set_value`. + #[serde(default)] + pub label: String, + /// Text to enter for `set_value`. + #[serde(default)] + pub value: String, + /// Final message for `done` / `fail`. 
+ #[serde(default)] + pub summary: String, +} + +/// The result of a completed (or budget-exhausted) automation run. +#[derive(Debug, Clone, PartialEq)] +pub struct AutomateOutcome { + pub success: bool, + pub summary: String, + /// One human-readable line per executed step — surfaced back to the chat + /// agent and useful in logs. + pub steps: Vec, +} + +impl AutomateOutcome { + fn fail(summary: impl Into, steps: Vec) -> Self { + Self { + success: false, + summary: summary.into(), + steps, + } + } +} + +/// Injectable side-effects for the loop. The production impl +/// ([`RealBackend`]) talks to the OS accessibility tree and a fast LLM; tests +/// supply a scripted impl. +#[async_trait] +pub trait AutomateBackend: Send + Sync { + /// Read interactive elements in `app` whose label contains `filter`. + async fn perceive(&self, app: &str, filter: &str) -> Result, String>; + /// Ask the decision model for one JSON action. `system` pins the schema; + /// `user` carries the goal + current snapshot + recent step history. + async fn decide(&self, system: &str, user: &str) -> Result; + async fn act_launch(&self, app: &str) -> Result; + async fn act_press(&self, app: &str, label: &str) -> Result; + async fn act_set_value(&self, app: &str, label: &str, value: &str) -> Result; + /// Open a URL / URI-scheme (e.g. `music://…search?term=…`) via the OS opener. + /// Used by deterministic app fast-paths; the general loop does not call it. + async fn open_url(&self, url: &str) -> Result; + /// Best-effort: is media currently playing? `None` when the backend can't + /// tell (non-macOS, or not applicable). Media fast-paths use this to confirm + /// an action *actually started playback* rather than just succeeding at the + /// AX level — the false-success that made "play" silently no-op (§1.11). + async fn verify_playing(&self) -> Option { + None + } + /// Block until the UI settles after an action. + async fn settle(&self, app: &str); + /// Wait ~`ms` of real time. 
Used by fast-paths to let asynchronous content + /// (e.g. network search results) render between perceive attempts. Default + /// is a real sleep; test backends override it to a no-op so suites stay fast. + async fn wait(&self, ms: u64) { + tokio::time::sleep(std::time::Duration::from_millis(ms)).await; + } +} + +/// Tuning for a run. +#[derive(Debug, Clone, Copy)] +pub struct AutomateOptions { + pub step_budget: u32, +} + +impl Default for AutomateOptions { + fn default() -> Self { + Self { + step_budget: DEFAULT_STEP_BUDGET, + } + } +} + +/// System prompt pinning the action contract for the fast model. +fn system_prompt() -> String { + "You drive a desktop app's UI to accomplish a goal. You see a list of the \ + app's interactive elements (each as `[role] label`) and act one step at a \ + time.\n\ + \n\ + Respond with EXACTLY ONE JSON object and nothing else:\n\ + {\"thought\":\"...\",\"action\":\"\",\"app\":\"\",\ + \"filter\":\"...\",\"label\":\"...\",\"value\":\"...\",\"summary\":\"...\"}\n\ + \n\ + Verbs:\n\ + • launch — open the app (use first if it isn't showing any elements)\n\ + • list — re-read elements; set `filter` to a substring to narrow them\n\ + • press — activate the element whose label matches `label`\n\ + • set_value — type `value` into the field matching `label` (omit label = first field)\n\ + • done — goal achieved; put a short result in `summary`\n\ + • fail — goal cannot be achieved; explain in `summary`\n\ + \n\ + Rules:\n\ + - Pressing a LIST ROW or SEARCH RESULT usually only selects/opens it. To \ + trigger playback or submission you must then press the actual action button \ + (e.g. open a song, THEN press its 'Play'). After such a press, `list` again \ + to see the new screen.\n\ + - Prefer an exact label match. Keep `filter` specific so the snapshot stays small.\n\ + - Output JSON only — no prose, no code fences." + .to_string() +} + +/// Render a perceive snapshot into compact prompt text. 
+fn render_snapshot(app: &str, filter: &str, elements: &[ax::AXElement]) -> String { + if elements.is_empty() { + return format!( + "App '{app}' shows no elements matching filter '{filter}' (it may still be \ + loading, or needs launching)." + ); + } + let shown = elements.len().min(MAX_SNAPSHOT); + let mut out = format!( + "App '{app}' elements (filter '{filter}', showing {shown} of {}):\n", + elements.len() + ); + for e in elements.iter().take(MAX_SNAPSHOT) { + // NB: we don't annotate `enabled` here — AXEnabled is unreliable + // per-app (Apple Music marks pressable rows disabled), so surfacing it + // would mislead the model into avoiding real controls. + out.push_str(&format!(" [{}] {}\n", e.role, e.label)); + } + out +} + +/// Parse one action from raw model text, tolerating code fences and surrounding +/// prose by extracting the outermost `{...}` span (first `{` to last `}`). Returns `Err` so the +/// caller can issue a single repair retry before giving up — we never *act* on +/// an unparseable guess (tracker §1.13 hallucination lesson). +fn parse_action(raw: &str) -> Result { + let trimmed = raw.trim(); + if let Ok(a) = serde_json::from_str::(trimmed) { + return Ok(a); + } + // Extract the outermost {...} span (first '{' to last '}') and retry. + if let (Some(start), Some(end)) = (trimmed.find('{'), trimmed.rfind('}')) { + if end > start { + if let Ok(a) = serde_json::from_str::(&trimmed[start..=end]) { + return Ok(a); + } + } + } + Err(format!( + "could not parse an action from model output: {trimmed:?}" + )) +} + +/// Run the automation loop until the goal is met, it fails, or the step budget +/// is exhausted. +pub async fn run( + app: &str, + goal: &str, + backend: &dyn AutomateBackend, + opts: AutomateOptions, +) -> AutomateOutcome { + log::info!( + "{LOG_PREFIX} ▶ run app={app:?} goal={goal:?} budget={}", + opts.step_budget + ); + + // Foreground the target app FIRST, always. 
This guarantees the app is + // frontmost before we perceive or act — so AX reads the right window and any + // synthetic input (keyboard/mouse) lands on it, not on OpenHuman's own + // window (which is what crashed CEF in §1.8). `act_launch` is `open -a`, + // which both opens and activates; idempotent if already running. + match backend.act_launch(app).await { + Ok(m) => log::info!("{LOG_PREFIX} foregrounded: {m}"), + Err(e) => log::warn!("{LOG_PREFIX} foreground failed for {app:?}: {e}"), + } + backend.settle(app).await; + + // Deterministic accelerator: if a known app + intent has a proven native + // sequence, run it first. On `None` (no fast-path) or a failed fast-path we + // fall through to the general model-driven loop — so the fast-path can only + // help, never block. (Structurally different from the removed `play_music` + // tool, §1.13: this is internal to `automate`, not a tool the LLM selects.) + if let Some(outcome) = super::app_fastpaths::try_fastpath(app, goal, backend).await { + if outcome.success { + log::info!("{LOG_PREFIX} fast-path succeeded for app={app:?}"); + return outcome; + } + log::info!("{LOG_PREFIX} fast-path did not complete; falling through to general loop"); + } + + let system = system_prompt(); + let mut steps: Vec = Vec::new(); + let mut last_filter = String::new(); + // One repair retry budget for unparseable model output. + let mut repair_left = 1u32; + // No-progress guard: track the last actionable signature so a model that + // keeps issuing the same call (e.g. pressing 'Search' over and over) bails + // instead of burning the whole step budget. 
+ let mut last_sig = String::new(); + let mut repeat_count = 0u32; + + for step in 0..opts.step_budget { + // ── perceive ── + let snapshot = match backend.perceive(app, &last_filter).await { + Ok(els) => render_snapshot(app, &last_filter, &els), + Err(e) => { + log::warn!("{LOG_PREFIX} perceive failed: {e}"); + format!("(perceive error: {e})") + } + }; + + // ── decide ── + let user = format!( + "Goal: {goal}\nApp: {app}\n\nCurrent screen:\n{snapshot}\n\nSteps so far:\n{}\n\n\ + Reply with the next single JSON action.", + if steps.is_empty() { + " (none yet)".to_string() + } else { + steps + .iter() + .map(|s| format!(" - {s}")) + .collect::>() + .join("\n") + } + ); + let raw = match backend.decide(&system, &user).await { + Ok(t) => t, + Err(e) => { + log::warn!("{LOG_PREFIX} decide failed: {e}"); + return AutomateOutcome::fail(format!("decision model error: {e}"), steps); + } + }; + + let action = match parse_action(&raw) { + Ok(a) => a, + Err(e) => { + if repair_left > 0 { + repair_left -= 1; + log::warn!("{LOG_PREFIX} step={step} unparseable action, retrying: {e}"); + steps.push(format!("(model produced unparseable output; retried)")); + continue; + } + return AutomateOutcome::fail(format!("model output unparseable: {e}"), steps); + } + }; + + let target_app = action + .app + .as_deref() + .filter(|s| !s.is_empty()) + .unwrap_or(app); + log::info!( + "{LOG_PREFIX} step={step} action={:?} app={target_app:?} label={:?} filter={:?}", + action.action, + action.label, + action.filter + ); + + // ── no-progress guard ── + if !matches!(action.action.as_str(), "done" | "fail") { + let sig = format!("{}|{}|{}", action.action, action.label, action.filter); + if sig == last_sig { + repeat_count += 1; + } else { + repeat_count = 0; + last_sig = sig; + } + // initial + 2 repeats = 3 identical actions in a row. 
+ if repeat_count >= 2 { + log::warn!("{LOG_PREFIX} no progress: action repeated 3× ({last_sig}); aborting"); + steps.push(format!( + "aborted: repeated '{}' 3× with no progress", + action.action + )); + return AutomateOutcome::fail( + "Got stuck repeating the same action with no progress.", + steps, + ); + } + } + + // ── act ── + match action.action.as_str() { + "done" => { + let summary = if action.summary.is_empty() { + "Goal completed.".to_string() + } else { + action.summary.clone() + }; + log::info!("{LOG_PREFIX} ✓ done: {summary}"); + progress(&summary, OverlayAttentionTone::Success); + return AutomateOutcome { + success: true, + summary, + steps, + }; + } + "fail" => { + let summary = if action.summary.is_empty() { + "Goal could not be completed.".to_string() + } else { + action.summary.clone() + }; + log::info!("{LOG_PREFIX} ✗ model gave up: {summary}"); + progress(&summary, OverlayAttentionTone::Neutral); + return AutomateOutcome::fail(summary, steps); + } + "list" => { + last_filter = action.filter.clone(); + steps.push(format!("list filter={:?}", last_filter)); + } + "launch" => { + progress( + format!("Opening {target_app}…"), + OverlayAttentionTone::Accent, + ); + match backend.act_launch(target_app).await { + Ok(msg) => steps.push(format!("launch: {msg}")), + Err(e) => steps.push(format!("launch FAILED: {e}")), + } + backend.settle(target_app).await; + } + "press" => { + if action.label.trim().is_empty() { + steps.push("press skipped: empty label".to_string()); + continue; + } + progress( + format!("Pressing {}…", action.label), + OverlayAttentionTone::Accent, + ); + match backend.act_press(target_app, &action.label).await { + Ok(msg) => steps.push(format!("press: {msg}")), + Err(e) => steps.push(format!("press FAILED: {e}")), + } + backend.settle(target_app).await; + } + "set_value" => { + if action.value.is_empty() { + steps.push("set_value skipped: empty value".to_string()); + continue; + } + progress("Typing…", OverlayAttentionTone::Accent); + 
match backend + .act_set_value(target_app, &action.label, &action.value) + .await + { + Ok(msg) => steps.push(format!("set_value: {msg}")), + Err(e) => steps.push(format!("set_value FAILED: {e}")), + } + backend.settle(target_app).await; + } + other => { + steps.push(format!("unknown action {other:?} ignored")); + } + } + } + + log::info!("{LOG_PREFIX} step budget ({}) exhausted", opts.step_budget); + AutomateOutcome::fail( + format!( + "Step budget ({}) exhausted before the goal was confirmed complete.", + opts.step_budget + ), + steps, + ) +} + +/// Production backend: real AX primitives + a fast LLM for decisions. +pub struct RealBackend { + config: crate::openhuman::config::Config, +} + +impl RealBackend { + pub fn new(config: crate::openhuman::config::Config) -> Self { + Self { config } + } +} + +#[async_trait] +impl AutomateBackend for RealBackend { + async fn perceive(&self, app: &str, filter: &str) -> Result, String> { + ax::ax_list_elements_filtered(app, filter) + } + + async fn decide(&self, system: &str, user: &str) -> Result { + // Fast tier: the `memory` role maps to `memory_provider` — a cheap, + // quick model class. A dedicated `automation` provider knob is a + // follow-up (see plan §5); routing through `memory` keeps M1 free of + // Config-schema churn while still keeping the chat model out of the loop. 
+ let (provider, model) = + crate::openhuman::inference::provider::create_chat_provider("memory", &self.config) + .map_err(|e| format!("fast-model provider unavailable: {e}"))?; + provider + .chat_with_system(Some(system), user, &model, 0.0) + .await + .map_err(|e| format!("fast-model call failed: {e}")) + } + + async fn act_launch(&self, app: &str) -> Result { + crate::openhuman::tools::implementations::system::launch_platform(app).await + } + + async fn act_press(&self, app: &str, label: &str) -> Result { + ax::ax_press_element(app, label) + } + + async fn act_set_value(&self, app: &str, label: &str, value: &str) -> Result { + ax::ax_set_field_value(app, label, value) + } + + async fn open_url(&self, url: &str) -> Result { + // Cross-platform URI opener. macOS `open`, Linux `xdg-open`, Windows + // `cmd /C start`. Only invoked by fast-paths with app-controlled URLs + // (never user free-text), so there's no untrusted-URL surface here. + #[cfg(target_os = "macos")] + let mut cmd = { + let mut c = tokio::process::Command::new("open"); + c.arg(url); + c + }; + #[cfg(target_os = "linux")] + let mut cmd = { + let mut c = tokio::process::Command::new("xdg-open"); + c.arg(url); + c + }; + #[cfg(target_os = "windows")] + let mut cmd = { + let mut c = tokio::process::Command::new("cmd"); + c.args(["/C", "start", "", url]); + c + }; + match cmd.output().await { + Ok(o) if o.status.success() => Ok(format!("Opened {url}")), + Ok(o) => Err(format!( + "opener exited {}: {}", + o.status, + String::from_utf8_lossy(&o.stderr).trim() + )), + Err(e) => Err(format!("failed to launch opener: {e}")), + } + } + + async fn verify_playing(&self) -> Option { + // macOS: ask Apple Music for ground-truth player state. Other OSes can't + // verify this way → None (fast-path treats None as best-effort). 
+ #[cfg(target_os = "macos")] + { + let out = tokio::process::Command::new("osascript") + .args(["-e", "tell application \"Music\" to player state as string"]) + .output() + .await + .ok()?; + let state = String::from_utf8_lossy(&out.stdout).trim().to_lowercase(); + Some(state == "playing") + } + #[cfg(not(target_os = "macos"))] + { + None + } + } + + async fn settle(&self, app: &str) { + // M2: poll the element count until the UI stops changing (≤2s), instead + // of a blind fixed wait. Removes the timing-race class (tracker §1.11/ + // §1.13) — the next perceive sees a settled tree. `ax_wait_settled` is + // blocking (synchronous helper IPC), so run it off the async runtime. + let app = app.to_string(); + let _ = tokio::task::spawn_blocking(move || { + ax::ax_wait_settled(&app, 240, 2000); + }) + .await; + } +} + +#[cfg(test)] +#[path = "automate_tests.rs"] +mod tests; diff --git a/src/openhuman/accessibility/automate_tests.rs b/src/openhuman/accessibility/automate_tests.rs new file mode 100644 index 0000000000..6b169e98b7 --- /dev/null +++ b/src/openhuman/accessibility/automate_tests.rs @@ -0,0 +1,266 @@ +//! Unit tests for the `automate` loop. A scripted [`AutomateBackend`] feeds +//! canned model responses and records every action, so the loop is exercised +//! with no mic, no AX tree, and no LLM. + +use super::*; +use std::sync::Mutex; + +/// Scripted backend: `decide` returns the next queued response each call; +/// perceive/act are stubbed and recorded. +struct ScriptedBackend { + /// Queued raw model outputs, consumed in order. + responses: Mutex>, + /// Elements every `perceive` returns. + elements: Vec, + /// Record of act calls, for assertions. + acts: Mutex>, + /// Force act_press to error (to exercise the failure-recording path). 
+ press_errors: bool, +} + +impl ScriptedBackend { + fn new(responses: &[&str]) -> Self { + Self { + responses: Mutex::new(responses.iter().map(|s| s.to_string()).collect()), + elements: vec![ + ax::AXElement::new("AXButton", "Play"), + ax::AXElement::new("AXTextField", "Search"), + ], + acts: Mutex::new(Vec::new()), + press_errors: false, + } + } + fn acts(&self) -> Vec { + self.acts.lock().unwrap().clone() + } +} + +#[async_trait] +impl AutomateBackend for ScriptedBackend { + async fn perceive(&self, _app: &str, _filter: &str) -> Result, String> { + Ok(self.elements.clone()) + } + async fn decide(&self, _system: &str, _user: &str) -> Result { + Ok(self + .responses + .lock() + .unwrap() + .pop_front() + // When the script runs dry, keep listing so the budget guard is what + // ends the run (rather than a decide error). + .unwrap_or_else(|| r#"{"action":"list","filter":""}"#.to_string())) + } + async fn act_launch(&self, app: &str) -> Result { + self.acts.lock().unwrap().push(format!("launch:{app}")); + Ok(format!("Opened '{app}'.")) + } + async fn act_press(&self, app: &str, label: &str) -> Result { + self.acts + .lock() + .unwrap() + .push(format!("press:{app}:{label}")); + if self.press_errors { + return Err("no such element".into()); + } + Ok(format!("Pressed '{label}' in '{app}'.")) + } + async fn act_set_value(&self, app: &str, label: &str, value: &str) -> Result { + self.acts + .lock() + .unwrap() + .push(format!("set_value:{app}:{label}={value}")); + Ok(format!("Set '{label}' in '{app}'.")) + } + async fn open_url(&self, url: &str) -> Result { + self.acts.lock().unwrap().push(format!("open_url:{url}")); + Ok(format!("Opened {url}")) + } + async fn settle(&self, _app: &str) {} + async fn wait(&self, _ms: u64) {} +} + +fn opts(budget: u32) -> AutomateOptions { + AutomateOptions { + step_budget: budget, + } +} + +#[tokio::test] +async fn happy_path_launch_list_press_done() { + // Use a non-fast-path app/goal so the GENERAL loop is what runs. 
+ // run() foregrounds (launch) the app first, so the model needn't. + let backend = ScriptedBackend::new(&[ + r#"{"action":"list","filter":"Play"}"#, + r#"{"action":"press","label":"Play"}"#, + r#"{"action":"done","summary":"Playing."}"#, + ]); + let out = run("Notes", "do a thing", &backend, opts(8)).await; + assert!(out.success, "expected success, got {out:?}"); + assert_eq!(out.summary, "Playing."); + let acts = backend.acts(); + // Leading launch is the foreground-first guarantee. + assert_eq!(acts, vec!["launch:Notes", "press:Notes:Play"]); +} + +#[tokio::test] +async fn navigate_then_activate_sequence() { + // Press the row (navigates), then press the detail Play, then done. + // Non-fast-path app so this exercises the general loop's two-press flow. + let backend = ScriptedBackend::new(&[ + r#"{"action":"press","label":"Highway to Hell"}"#, + r#"{"action":"press","label":"Play"}"#, + r#"{"action":"done","summary":"ok"}"#, + ]); + let out = run("Photos", "open the top album", &backend, opts(8)).await; + assert!(out.success); + assert_eq!( + backend.acts(), + vec![ + "launch:Photos", // foreground-first + "press:Photos:Highway to Hell", + "press:Photos:Play" + ] + ); +} + +#[tokio::test] +async fn set_value_routes_app_override() { + let backend = ScriptedBackend::new(&[ + r#"{"action":"set_value","app":"Slack","label":"message","value":"hi"}"#, + r#"{"action":"done"}"#, + ]); + let out = run("Slack", "message Steven hi", &backend, opts(5)).await; + assert!(out.success); + assert_eq!( + backend.acts(), + vec!["launch:Slack", "set_value:Slack:message=hi"] // foreground-first + ); +} + +#[tokio::test] +async fn budget_exhaustion_fails() { + // Script always lists → never done → budget guard ends the run. 
+ let backend = ScriptedBackend::new(&[r#"{"action":"list","filter":"x"}"#]); + let out = run("Music", "never finishes", &backend, opts(3)).await; + assert!(!out.success); + assert!(out.summary.contains("budget"), "got: {}", out.summary); +} + +#[tokio::test] +async fn no_progress_guard_aborts_repeated_action() { + // Model keeps pressing the same control (the live "Search ×11" pathology). + let backend = ScriptedBackend::new(&[ + r#"{"action":"press","label":"Search"}"#, + r#"{"action":"press","label":"Search"}"#, + r#"{"action":"press","label":"Search"}"#, + r#"{"action":"press","label":"Search"}"#, + ]); + let out = run("Photos", "do something", &backend, opts(10)).await; + assert!(!out.success); + assert!( + out.summary.contains("stuck repeating"), + "got: {}", + out.summary + ); + // foreground launch, then acted twice; the 3rd identical action aborts. + assert_eq!( + backend.acts(), + vec![ + "launch:Photos", + "press:Photos:Search", + "press:Photos:Search" + ] + ); +} + +#[tokio::test] +async fn one_repair_retry_then_succeeds() { + let backend = ScriptedBackend::new(&[ + "garbage not json", + r#"{"action":"done","summary":"recovered"}"#, + ]); + let out = run("Music", "g", &backend, opts(5)).await; + assert!(out.success, "should recover after one repair: {out:?}"); + assert_eq!(out.summary, "recovered"); +} + +#[tokio::test] +async fn two_unparseable_outputs_fail() { + let backend = ScriptedBackend::new(&["garbage one", "garbage two"]); + let out = run("Music", "g", &backend, opts(5)).await; + assert!(!out.success); + assert!(out.summary.contains("unparseable"), "got: {}", out.summary); +} + +#[tokio::test] +async fn explicit_fail_action_propagates() { + let backend = ScriptedBackend::new(&[r#"{"action":"fail","summary":"app not installed"}"#]); + let out = run("Music", "x", &backend, opts(5)).await; + assert!(!out.success); + assert_eq!(out.summary, "app not installed"); +} + +#[tokio::test] +async fn press_failure_is_recorded_not_fatal() { + let mut 
backend = ScriptedBackend::new(&[ + r#"{"action":"press","label":"Play"}"#, + r#"{"action":"done","summary":"tried"}"#, + ]); + backend.press_errors = true; + let out = run("Music", "x", &backend, opts(5)).await; + assert!(out.success); // the run continues; the press failure is just logged + assert!( + out.steps.iter().any(|s| s.contains("press FAILED")), + "steps: {:?}", + out.steps + ); +} + +#[test] +fn parse_action_plain_json() { + let a = parse_action(r#"{"action":"press","label":"Play"}"#).unwrap(); + assert_eq!(a.action, "press"); + assert_eq!(a.label, "Play"); +} + +#[test] +fn parse_action_strips_code_fence_and_prose() { + let raw = "Sure!\n```json\n{\"action\":\"done\",\"summary\":\"ok\"}\n```\n"; + let a = parse_action(raw).unwrap(); + assert_eq!(a.action, "done"); + assert_eq!(a.summary, "ok"); +} + +#[test] +fn parse_action_rejects_garbage() { + assert!(parse_action("not json at all").is_err()); + assert!(parse_action("").is_err()); +} + +#[test] +fn render_snapshot_caps_and_labels() { + let many: Vec = (0..100) + .map(|i| ax::AXElement::new("AXButton", format!("btn{i}"))) + .collect(); + let s = render_snapshot("Music", "btn", &many); + assert!(s.contains("showing 40 of 100")); + assert!(s.contains("btn0")); + assert!(!s.contains("btn50"), "should be capped at 40"); +} + +#[test] +fn render_snapshot_does_not_annotate_enabled() { + // AXEnabled is unreliable per-app, so the snapshot must not surface it + // (would mislead the model into avoiding pressable controls). 
+ let mut disabled = ax::AXElement::new("AXButton", "Play"); + disabled.enabled = Some(false); + let s = render_snapshot("Music", "", &[disabled]); + assert!(!s.contains("disabled"), "got: {s}"); + assert!(s.contains("[AXButton] Play")); +} + +#[test] +fn render_snapshot_empty_hint() { + let s = render_snapshot("Music", "zzz", &[]); + assert!(s.contains("no elements")); +} diff --git a/src/openhuman/accessibility/ax_interact.rs b/src/openhuman/accessibility/ax_interact.rs index dda9724e05..cb3ad21bb0 100644 --- a/src/openhuman/accessibility/ax_interact.rs +++ b/src/openhuman/accessibility/ax_interact.rs @@ -21,10 +21,68 @@ mod tests; #[path = "uia_interact_tests.rs"] mod uia_tests; -#[derive(Debug, Clone, Deserialize)] +// Portable (non-OS-gated) unit tests for the pure settle core. The sibling +// `ax_interact_tests.rs` is macOS-only + #[ignore] (needs a live app); these +// run everywhere so the settle logic stays covered in CI. +#[cfg(test)] +mod settle_tests { + use super::counts_settled; + + #[test] + fn not_settled_until_enough_samples() { + assert!(!counts_settled(&[5], 3)); + assert!(!counts_settled(&[5, 5], 3)); + } + + #[test] + fn settled_when_tail_is_constant() { + assert!(counts_settled(&[1, 4, 7, 7, 7], 3)); + } + + #[test] + fn not_settled_when_still_changing() { + assert!(!counts_settled(&[7, 7, 8], 3)); + assert!(!counts_settled(&[2, 4, 6], 3)); + } + + #[test] + fn zero_or_one_required_settles_immediately() { + assert!(counts_settled(&[9], 1)); + assert!(counts_settled(&[9], 0)); + } + + #[test] + fn only_the_tail_matters() { + // Early churn doesn't matter once the last `need` samples agree. + assert!(counts_settled(&[0, 99, 3, 3], 2)); + } +} + +#[derive(Debug, Clone, Default, Deserialize)] pub struct AXElement { pub role: String, pub label: String, + /// The control's reported `AXEnabled` state, when the backend supplies it. 
+ /// + /// **Informational only — do NOT gate pressing on this.** Empirically + /// unreliable per-app: Apple Music reports its search-result rows as + /// `Some(false)` even though `AXPress` on them works. Kept for diagnostics + /// and for apps that report it faithfully; matchers must not skip elements + /// solely because this is `Some(false)`. + #[serde(default)] + pub enabled: Option, +} + +impl AXElement { + /// Convenience constructor (enabled unknown). Keeps call sites terse and + /// insulated from future optional fields. + pub fn new(role: impl Into, label: impl Into) -> Self { + Self { + role: role.into(), + label: label.into(), + enabled: None, + } + } } /// List interactive UI elements (buttons, text fields, checkboxes, …) in `app_name`. @@ -112,6 +170,64 @@ pub fn ax_press_element(app_name: &str, label: &str) -> Result { } } +/// Decide, from a rolling history of element counts, whether the UI has +/// settled — i.e. the most recent `stable_samples` counts are all identical +/// (and there are at least that many samples). Pure so it can be unit-tested +/// without any AX backend or real clock. +/// +/// `stable_samples == 0` or `1` means "settled as soon as we have one sample". +pub(crate) fn counts_settled(history: &[usize], stable_samples: usize) -> bool { + let need = stable_samples.max(1); + if history.len() < need { + return false; + } + let tail = &history[history.len() - need..]; + tail.iter().all(|c| *c == tail[0]) +} + +/// Block until `app_name`'s interactive-element count stops changing for +/// `stable_ms`, or `timeout_ms` elapses. Returns the final observed count. +/// +/// This is the **settle** primitive for the `automate` loop: after an action +/// (press / type / launch) the UI is mid-render, and reading it immediately is +/// what caused the timing-race failures (tracker §1.11/§1.13). 
Polling the +/// element count until it's stable is a portable replacement for a blind fixed +/// sleep — it works on both backends because it rides on `ax_list_elements`, +/// which already cfg-dispatches (macOS AX / Windows UIA). +/// +/// Blocking (uses `std::thread::sleep` + synchronous helper IPC); async callers +/// should run it via `spawn_blocking`. An AXObserver-driven settle is a later +/// optimization that can sit behind this same signature. +pub fn ax_wait_settled(app_name: &str, stable_ms: u64, timeout_ms: u64) -> usize { + use std::time::{Duration, Instant}; + // Sample roughly every `poll_ms`; declare settled once the count has held + // for ceil(stable_ms / poll_ms) consecutive samples. + let poll_ms = 80u64; + let stable_samples = (stable_ms.div_ceil(poll_ms)).max(2) as usize; + let deadline = Instant::now() + Duration::from_millis(timeout_ms); + let mut history: Vec = Vec::new(); + + loop { + let count = ax_list_elements(app_name).map(|v| v.len()).unwrap_or(0); + history.push(count); + if counts_settled(&history, stable_samples) { + log::debug!( + "[ax_interact] settle: '{app_name}' stable at {count} elements after {} samples", + history.len() + ); + return count; + } + if Instant::now() >= deadline { + log::debug!( + "[ax_interact] settle: '{app_name}' timed out after {} samples (last count={count})", + history.len() + ); + return count; + } + std::thread::sleep(Duration::from_millis(poll_ms)); + } +} + /// Set the value of the first text field in `app_name` whose label contains `label`. /// Pass an empty `label` to target the first available text field. 
pub fn ax_set_field_value(app_name: &str, label: &str, value: &str) -> Result { diff --git a/src/openhuman/accessibility/ax_interact_tests.rs b/src/openhuman/accessibility/ax_interact_tests.rs index 89f57906e7..ccb123dec2 100644 --- a/src/openhuman/accessibility/ax_interact_tests.rs +++ b/src/openhuman/accessibility/ax_interact_tests.rs @@ -165,3 +165,24 @@ fn test_ax_press_nonexistent_app() { let result = ax_press_element("NonExistentApp12345", "Play"); assert!(result.is_err()); } + +/// Env-driven AX dump probe: `AX_PROBE_APP="Slack" cargo test ax_probe_app -- --ignored --nocapture`. +/// Lists interactive elements an app exposes via the macOS Accessibility API — +/// used to diagnose Electron apps (Slack/Discord) whose tree may be empty +/// unless accessibility is enabled. +#[test] +#[ignore = "manual AX probe — set AX_PROBE_APP"] +fn ax_probe_app() { + let app = std::env::var("AX_PROBE_APP").unwrap_or_else(|_| "Slack".to_string()); + let _ = Command::new("open").arg("-a").arg(&app).status(); + sleep(Duration::from_secs(4)); + match ax_list_elements(&app) { + Ok(els) => { + println!("[ax_probe] {app}: {} interactive elements", els.len()); + for e in els.iter().take(80) { + println!(" [{}] {}", e.role, e.label); + } + } + Err(e) => println!("[ax_probe] {app}: ERROR {e}"), + } +} diff --git a/src/openhuman/accessibility/helper.rs b/src/openhuman/accessibility/helper.rs index c271915aeb..97c9c1fbd7 100644 --- a/src/openhuman/accessibility/helper.rs +++ b/src/openhuman/accessibility/helper.rs @@ -693,16 +693,27 @@ func axListElements(appName: String, id: String?) 
-> [String: Any] { "AXCheckBox", "AXRadioButton", "AXSlider", "AXPopUpButton", "AXComboBox", "AXLink", "AXTab" ] - var elements: [[String: String]] = [] - axWalk(axApp, maxDepth: 10) { _, role, label in + var elements: [[String: Any]] = [] + axWalk(axApp, maxDepth: 10) { el, role, label in if interactiveRoles.contains(role) && !label.isEmpty { - elements.append(["role": role, "label": label]) + elements.append(["role": role, "label": label, "enabled": axEnabled(el)]) } return false } return ["type": "ax_list", "id": id ?? "", "ok": true, "error": NSNull(), "elements": elements] } +/// Read the AXEnabled attribute; default to `true` when the attribute is absent +/// (most static/text elements don't expose it, and we don't want to hide them). +func axEnabled(_ element: AXUIElement) -> Bool { + var ref: AnyObject? + if AXUIElementCopyAttributeValue(element, kAXEnabledAttribute as CFString, &ref) == .success, + let b = ref as? Bool { + return b + } + return true +} + /// Collect all AX elements whose label contains `label` (case-insensitive). /// Returns matches sorted exact-first so "Play" beats "Playlist". struct AXCandidate { diff --git a/src/openhuman/accessibility/mod.rs b/src/openhuman/accessibility/mod.rs index ccd1dd1841..d90a7cb40c 100644 --- a/src/openhuman/accessibility/mod.rs +++ b/src/openhuman/accessibility/mod.rs @@ -5,6 +5,8 @@ //! Consumer modules (autocomplete, screen_intelligence, voice) call into this module //! instead of owning platform-specific code directly. 
+pub mod app_fastpaths; +pub mod automate; mod automation_state; pub mod ax_interact; mod capture; diff --git a/src/openhuman/accessibility/uia_interact.rs b/src/openhuman/accessibility/uia_interact.rs index 4ec1060233..cfce495fb9 100644 --- a/src/openhuman/accessibility/uia_interact.rs +++ b/src/openhuman/accessibility/uia_interact.rs @@ -217,6 +217,9 @@ pub fn list(app_name: &str, filter: &str) -> Result, String> { out.push(AXElement { role: format!("{ct:?}"), label, + // TODO(windows): populate from UIA `IsEnabled` once verified on a + // Windows box; `None` = "assume enabled" (current behaviour). + enabled: None, }); } diff --git a/src/openhuman/tools/impl/system/launch_app.rs b/src/openhuman/tools/impl/system/launch_app.rs index c423a34449..7766c2f833 100644 --- a/src/openhuman/tools/impl/system/launch_app.rs +++ b/src/openhuman/tools/impl/system/launch_app.rs @@ -176,7 +176,11 @@ impl Tool for LaunchAppTool { } /// Platform-specific launch dispatch. Returns a human-readable success message. -async fn launch_platform(app_name: &str) -> Result { +/// +/// `pub(crate)` so the `automate` inner loop (`accessibility::automate`) can +/// launch an app as one of its steps without duplicating the platform branches +/// or routing back through the full tool surface. +pub(crate) async fn launch_platform(app_name: &str) -> Result { log::info!( "[launch_app] platform={} dispatching launch for app_name={app_name:?}", std::env::consts::OS diff --git a/src/openhuman/tools/impl/system/mod.rs b/src/openhuman/tools/impl/system/mod.rs index 118a567b30..a532e1f713 100644 --- a/src/openhuman/tools/impl/system/mod.rs +++ b/src/openhuman/tools/impl/system/mod.rs @@ -20,6 +20,8 @@ pub use detect_tools::DetectToolsTool; pub use insert_sql_record::InsertSqlRecordTool; pub use install_tool::InstallToolTool; pub use launch_app::LaunchAppTool; +// Reused by the `automate` inner loop to launch an app mid-flow. 
+pub(crate) use launch_app::launch_platform; pub use lsp::{lsp_capability_enabled, LspTool, LSP_ENABLED_ENV}; pub use node_exec::NodeExecTool; pub use npm_exec::NpmExecTool; From 608f177a6f02d43cfde904292090d03cdbcfdfd9 Mon Sep 17 00:00:00 2001 From: M3gA-Mind Date: Thu, 4 Jun 2026 14:18:19 +0530 Subject: [PATCH 3/9] feat(agent): wire automate/ax_interact computer tools into the orchestrator Registers the AutomateTool (multi-step UI flows in one call) and the ax_interact denylist/opt-in plumbing; adds the catalog toggle, tool definition, and orchestrator prompt guidance (automate + screenshot/ mouse/keyboard fallback for Electron apps with empty AX trees). Slice 3/7 of #3307 (tool wiring + prompts). --- app/src/utils/toolDefinitions.ts | 9 + docs/voice-system-actions.md | 166 ++++++++++++- .../agents/orchestrator/agent.toml | 17 ++ .../agents/orchestrator/prompt.md | 19 ++ src/openhuman/tools/impl/computer/automate.rs | 223 ++++++++++++++++++ .../tools/impl/computer/ax_interact.rs | 12 +- src/openhuman/tools/impl/computer/mod.rs | 2 + src/openhuman/tools/ops.rs | 6 + src/openhuman/tools/user_filter.rs | 7 + 9 files changed, 445 insertions(+), 16 deletions(-) create mode 100644 src/openhuman/tools/impl/computer/automate.rs diff --git a/app/src/utils/toolDefinitions.ts b/app/src/utils/toolDefinitions.ts index 01213cd1f2..4dfa74ab02 100644 --- a/app/src/utils/toolDefinitions.ts +++ b/app/src/utils/toolDefinitions.ts @@ -45,6 +45,15 @@ export const TOOL_CATALOG: ToolDefinition[] = [ defaultEnabled: true, rustToolNames: ['ax_interact'], }, + { + id: 'automate', + displayName: 'App Automation', + description: + 'Accomplish a multi-step goal in an app in one go (e.g. 
"play a song in Music", "message someone in Slack") — the agent drives the UI step by step.', + category: 'System', + defaultEnabled: true, + rustToolNames: ['automate'], + }, { id: 'git_operations', displayName: 'Git Operations', diff --git a/docs/voice-system-actions.md b/docs/voice-system-actions.md index 92133e37c1..4c10f9b6f4 100644 --- a/docs/voice-system-actions.md +++ b/docs/voice-system-actions.md @@ -268,6 +268,112 @@ test ... ok --- +### Change 1.14 — `automate(app, goal)`: Rust-driven multi-step automation 🔨 In progress (M1–M3 done) + +**Status:** 🔨 In progress — **M1 + M2 + M3 shipped and M3 proven live on macOS**; M4–M6 pending. See **Phase 1.5** below and [`voice-automate-plan.md`](voice-automate-plan.md). + +**Agent-in-the-loop fixes (2026-06-03, from two live chat sessions):** +- **Mutations were off** — the agent correctly called `automate` but it (and `ax_interact`) refused because `computer_control.ax_interact_mutations=false`. Enabled it; also rewrote both refusal messages to point at **Settings → Agent Access** instead of a config key (the agent had relayed "controls are locked down"). +- **Query mis-parse** — orchestrator goal `…search for "Highway to Hell" by AC/DC, and play it` made the after-"play" parser extract `"it"`. `extract_play_query` now prefers a **quoted title + `by <artist>`** and rejects bare pronouns. (Unit-tested with the exact failing goal.) +- **General loop spun** — pressed "Search" 11× to budget exhaustion. Added a **no-progress guard**: 3 identical actions in a row → abort. +- **Search-results timing** — the fast-path's retry burned out before catalog results rendered (`settle` reports count-stable while the network fetch is pending). Added a real, mockable `wait` between attempts (6 × ~800ms). + +**M5 finding — AXEnabled is unreliable:** plumbed an `enabled` field end-to-end (Swift `axEnabled` → `AXElement.enabled` → Windows stub), but Apple Music reports its **pressable** search-result rows as `enabled=Some(false)`. 
Gating `pick_row` on it broke playback. So `enabled` is kept **informational only** (documented on the struct); matchers never skip on it. The better future actionability signal is AXPress-action support, not AXEnabled. + +**M4 — live progress in the notch (2026-06-03):** the notch indicator (originally PR #3166) was cherry-picked onto this branch (`feat(notch)` + fmt commits → `notch_window.rs` NSPanel + `notch/NotchApp.tsx`, auto-shown on startup, transparent when idle). The `automate` loop and Music fast-path now call `overlay::publish_attention(...)` at each step (`Opening …`, `Searching Music for …`, `Pressing …`, `Typing …`, `Playing …`, plus done/fail), which the existing Socket.IO bridge emits as `overlay:attention` and the notch renders as a pill — so the user sees the automation happening live. Verified: app boots with `[notch-window] panel shown at top-center`; Tauri shell + frontend compile; 31 automate unit tests green. + +**M3 live proof (2026-06-03):** `music_fastpath_live` drives real Apple Music end-to-end and **hard-asserts `player state == playing`** — confirmed: pre-state `paused` → post-state `playing`. Three bugs the live runs surfaced, all fixed + tested: +1. **Perceive filter was the whole multi-word query** — a substring filter can't match a full title → now filters by the first strong token and `pick_row` does the token match. +2. **Search results render late (§1.13 race)** — retry perceive across up to 4 settles; `AC/DC` now percent-encodes correctly. +3. **False success: pressed the toolbar Play, not the song's** — the first run reported success but *nothing played*. AX probing showed the search screen has only the toolbar transport Play (empty queue → silence); pressing the song row navigates to a detail page where a **second** Play appears (23→24 controls). Fix: capture the baseline Play count, **wait for the detail Play to render**, press it, then **verify real playback** via `osascript … player state` and retry (≤3×). 
Added `verify_playing()` to `AutomateBackend` (macOS osascript; `None` elsewhere = best-effort). `automate` now only reports a play success when audio is actually playing — the false-success class (§1.11) is closed. + +**M3 — shipped (Music fast-path):** +- `src/openhuman/accessibility/app_fastpaths/{mod.rs,music.rs}` (new) — deterministic accelerators consulted by `run()` **before** the general loop. Music encodes the §1.11 proven sequence: launch → open `music://…/search?term=…` → settle → press the song row (navigate) → settle → press the detail-page **Play**. Pure helpers `matches` / `extract_play_query` (handles "play X by Y", "launch Music and play …", "play X in Apple Music"). +- **Structurally different from the removed `play_music` tool (§1.13):** this is *internal* to `automate`, not a tool the LLM selects, and on any failure/`None` the loop **falls through** to the general model-driven path — so it can only help. Added `open_url` to the `AutomateBackend` trait (cross-platform opener; fast-path only). +- **Tests:** 9 unit (parser cases, full scripted sequence, no-row fallthrough, dispatch) + 1 `#[ignore]` macOS live test. **Live proof on a Mac:** `cargo test --lib music_fastpath_live -- --ignored --nocapture` (needs Music + Accessibility permission). + +**M2 — shipped (real settle):** +- `src/openhuman/accessibility/ax_interact.rs` — new `ax_wait_settled(app, stable_ms, timeout_ms)`: polls the app's interactive-element count and returns once it holds steady for `stable_ms` (or `timeout_ms` elapses). Portable — rides on `ax_list_elements`, which already cfg-dispatches (macOS AX / Windows UIA). Pure decision core `counts_settled(history, n)` extracted and unit-tested (5 non-OS-gated tests). +- `automate.rs` — `RealBackend::settle` now calls `ax_wait_settled` (240ms stable / 2s cap) via `spawn_blocking` instead of the M1 blind 450ms wait. This is the piece that removes the timing-race failure class (§1.11/§1.13): the next perceive always sees a settled tree. 
An AXObserver-driven settle can later sit behind the same signature. + +**M1 — shipped:** +- `src/openhuman/accessibility/automate.rs` (new) — the perceive→decide→act→settle loop, generic over an injectable `AutomateBackend` (so the model + AX + launcher are all mockable). Strict JSON action schema (`launch`/`list`/`press`/`set_value`/`done`/`fail`) with a one-shot repair retry on unparseable output (never acts on a hallucinated guess), a step budget (default 12), and a snapshot cap (40 elements) mirroring `ax_interact`'s anti-truncation guard. `RealBackend` calls the existing AX primitives + `launch_platform`, and routes decisions through the **fast tier** (`create_chat_provider("memory", …)` for now; a dedicated `automation_provider` knob is a follow-up). Settle is a short fixed wait in M1 (M2 makes it AXObserver-driven). +- `src/openhuman/tools/impl/computer/automate.rs` (new) — `AutomateTool { app, goal }`. Always `Dangerous` + `external_effect` (routes through the ApprovalGate); reuses `ax_interact`'s mutations opt-in (`computer_control.ax_interact_mutations`) and the shared `is_sensitive_app` denylist. +- Registered everywhere: `tools/ops.rs`, `tools/user_filter.rs` (`automate` family), `orchestrator/agent.toml` (`named`), `app/src/utils/toolDefinitions.ts` (Settings → "App Automation"). +- **Tests:** 18 passing — loop happy path, navigate-then-activate, app override, budget exhaustion, repair retry (1 ok / 2 fail), explicit fail, non-fatal press failure, JSON parse (plain/fenced/garbage), snapshot cap/empty-hint; tool gating (missing args, mutations-off, sensitive-app refusal, schema). + +**Problem (the real-time bar):** The user's target is *"whatever I say happens, live, in front of me"* — e.g. *"Launch Music and play Numb by Linkin Park"* or *"open Slack and message Steven 'hi'"*. Today every UI step (`list` → `set_value` → `list` → `press` …) is a **separate chat-LLM turn**. 
A Slack message is ~7 turns; at 1–3 s each that's 15–25 s, and each turn is a fresh chance to hit a timing race (1.13) or hallucinate. The heavy chat model is sitting *inside* the click loop — the wrong place for it. + +**Root causes (all four documented earlier in Phase 1):** +1. **Timing races** — `list`/`press` do a single AX walk with no settle/wait; the UI hasn't rendered yet (1.11/1.13). +2. **Navigate-then-activate is re-reasoned every call** — pressing a row selects; you must then press the action control. That logic lives in prose, so it's re-derived (often wrongly) each turn (1.10/1.11). +3. **Round-trip explosion** — N full chat turns per task = latency + cost + N chances to fail. +4. **Weak element model + no verification** — `list` returns flat `[role, label]`; `press` reports success on `AXAction == .success` even when nothing changed. + +**Design — take the chat model out of the click loop:** +- **New tool `automate { app, goal }`** — one call from the orchestrator. Rust then runs a tight **perceive → act → verify** loop internally: read a *filtered* AX snapshot → pick the next action → act → **wait for the UI to settle (AXObserver, not fixed sleeps)** → verify it took effect → repeat until the goal is met or a step budget is hit. +- **A fast model drives the inner loop** (Haiku-class) with a *tiny* context: just the goal, the current small AX snapshot, and the last result — not the whole conversation. Each inner step is ~0.5–1 s and self-corrects, instead of one 3 s chat turn that falsely reports success. +- **Settle + verify in Rust** between steps — deterministic, kills the timing-race class in one place. +- **Native fast-paths for high-value apps** (skip the UI entirely where possible): + - **Music** — `music://` search URL → AX play (already explored in 1.11), or AppleScript for library. + - **Spotify** — Web API search → `spotify:track:…` URI + AppleScript `play`. Fully deterministic, no UI poking. 
+ - **Slack** — deep link `slack://channel?…` to open the DM, then AX to type + send. + The general AX loop is the fallback for everything else. +- **Vision fallback for Electron/Chromium apps** (Slack, Discord, VS Code, Spotify-desktop) whose AX/UIA tree is partial (documented limitation). Slack needs accessibility enabled (`defaults write com.tinyspeck.slackmacgap AccessibilityEnabled -bool true`, relaunch). Where AX returns empty, fall back to **screenshot → vision-locate → guarded click**. This is the reverted CGEventPost path (1.8) — but it crashed only when events hit *OpenHuman's own focused CEF window*; a guarded click into a *different, foregrounded* app does not have that failure mode. +- **Stream progress events** to the UI / notch pill (PR #3166) so the user sees each step happen live. + +**Why a generic `automate`, not per-app tools:** Change 1.13 already established that app-specific tools (`play_music`) are the wrong abstraction. The abstraction that *is* generic is the **navigate-then-activate sequence itself** — `automate(app, goal)` encapsulates it once, in Rust, for every app, instead of asking the chat model to re-orchestrate fragile primitives every time. + +--- + +## Phase 1.5 — Reliable, real-time multi-step automation ⏳ Not Started + +> The bridge between today's `ax_interact` primitives and the always-on voice work. **Prerequisite for Phase 3** — fast voice routing into a slow/fragile action loop still feels slow. This is where "whatever I say happens, live" actually gets delivered. + +**Detailed implementation plan:** [`voice-automate-plan.md`](voice-automate-plan.md) — decided approach: **Rust inner loop + fast model**, first proof target **Music**. + +**Planned files:** +- `src/openhuman/accessibility/automate.rs` (new) — the perceive→act→verify loop + settle/verify primitives, reusing `ax_interact` helpers. 
+- `src/openhuman/accessibility/app_fastpaths/` (new) — per-app deterministic paths (`music.rs`, `spotify.rs`, `slack.rs`), behind a generic dispatch. +- `src/openhuman/tools/impl/computer/automate.rs` (new) — `AutomateTool { app, goal }`, gated like `ax_interact` (mutations opt-in, sensitive-app denylist reused). +- macOS helper (`accessibility/helper.rs`) — AXObserver-based settle (`ax_wait_settled`) + post-action verify; richer element model (enabled/onscreen/actions). +- Vision fallback — screenshot via `accessibility/capture.rs` → locate → guarded click (only when AX tree is empty, target app foregrounded, never OpenHuman's own window). + +**Acceptance criteria:** +- [ ] One `automate{app, goal}` call performs a multi-step flow end-to-end (no per-step chat turns) +- [ ] Settle/verify removes the timing-race + false-success failure classes (1.11/1.13 do not recur) +- [ ] Music flow ("play <song>") works end-to-end via the inner loop +- [ ] Spotify + Slack fast-paths land their action deterministically +- [ ] Electron/partial-AX apps fall back to vision+guarded-click without the CEF crash +- [ ] Step-by-step progress streamed to the UI / notch indicator + +--- + +### Change 1.15 — Full computer control (mouse/keyboard/screenshot) ✅ Crash fixed (main-thread dispatch) + +**Status:** ✅ Keyboard/mouse now run on the app main thread → no CEF crash. Screenshot downscales for inline view. Live: `[computer] registered main-thread synthetic-input executor` on boot. + +**The fix:** the crash was enigo's `TSMGetInputSourceProperty` running on a tokio worker (`_dispatch_assert_queue_fail`/SIGTRAP). macOS TSM must run on the main thread. New `tools/impl/computer/main_thread.rs` (`MainThreadInputOp` + `run_input_on_main`) dispatches each enigo op over the native registry to a handler the Tauri shell registers at startup, which runs it via `AppHandle::run_on_main_thread`. Keyboard + mouse tools no longer `spawn_blocking` enigo on a worker. 
Headless/CLI (no executor) returns a clear error instead of crashing. 66 keyboard/mouse tests green. + +**Goal:** make the agent fully autonomous — when the accessibility tree is empty (Electron apps: Slack/Discord/VS Code), fall back to vision + synthetic input. Enabled `computer_control.enabled`, added `mouse`/`keyboard`/`screenshot` to the orchestrator `named` list + `autonomy.auto_approve`, and taught `prompt.md` a keyboard-first ladder (foreground via `launch_app` → `keyboard type` + Enter; Slack `Cmd+K` recipe). + +**Foreground-first:** `automate::run` now `open -a`s the target app at the very start, always, so AX/input hit the right window. + +**Screenshot fix:** oversized Retina captures were returned as "too large to base64-encode inline" (the model was blind). Now downscaled to a viewable JPEG (`downscale_to_jpeg`) with reported dimensions. + +**THE BLOCKER (historical, before the fix above) — `OpenHuman-2026-06-03-170058.ips`:** `EXC_BREAKPOINT/SIGTRAP` on a **`tokio-rt-worker`** thread: +``` +enigo::macos::get_layoutdependent_keycode → TSMGetInputSourceProperty +→ dispatch_assert_queue → _dispatch_assert_queue_fail → SIGTRAP +``` +enigo's keyboard-layout lookup (`TSMGetInputSourceProperty`) **must run on the app's main thread**; the keyboard tool ran on a tokio worker → macOS traps. **Not** a focus issue (same §1.8 root cause). A frontmost-app guard would NOT fix it. + +**Fix options considered (the first one is what shipped — see "The fix" above):** run enigo on the Tauri **main thread** (`AppHandle::run_on_main_thread`, bridged to the core via a native-registry handler), OR replace enigo's macOS keyboard path with TSM-free primitives (`CGEventKeyboardSetUnicodeString` for text + raw virtual keycodes for keys/hotkeys). Until the main-thread dispatch landed, keyboard/mouse had to stay disabled to avoid crashing the app. + +**Tests:** voice-actions + autonomy suite is exhaustive — 220 feature unit tests + a JSON-RPC E2E (`json_rpc_voice_server_settings_roundtrip_always_on_and_wake_word`). 
The E2E caught + fixed real gaps (`wake_word` missing from the get output and the update RPC path). Screenshot downscale unit-tested. + +--- + ## Windows port — app interaction 🪟 ✅ Implemented Phase 1's app-interaction layer is now ported to Windows. The macOS path uses the @@ -416,19 +522,30 @@ Shipped on the Windows machine (2026-06-02): --- -## Phase 2 — Always-On Listening ⏳ Not Started +## Phase 2 — Always-On Listening ✅ Implemented > Continuous microphone listening without requiring a hotkey press. -**Planned files:** -- `src/openhuman/voice/always_on.rs` (new) — dedicated tokio task holding the mic open, running VAD, emitting utterances to the STT pipeline -- `src/openhuman/config/schema/voice_server.rs` — add `always_on_enabled: bool` config flag -- Privacy hook: pause always-on when screen is locked +**Shipped:** +- `src/openhuman/voice/always_on.rs` — pure `VadSegmenter` (onset / silence-hangover / min-speech / max-utterance, 7 unit tests) **plus** the continuous capture loop: a dedicated cpal thread streams 16 kHz mono frames → segmenter → each utterance is encoded (`encode_wav_16k`) → `voice_transcribe_bytes` → `publish_transcription` (so it reaches the agent's auto-send and the notch, exactly like hotkey dictation). Started at boot in `credentials::ops`. +- `src/openhuman/config/schema/voice_server.rs` — `always_on_enabled` flag + VAD tuning (`vad_onset_threshold`, `vad_hangover_ms`, `vad_min_speech_ms`, `vad_max_utterance_secs`), opt-in/off by default. +- **Settings toggle** — "Always-on listening" in the Voice debug panel, wired through `get/update_voice_server_settings` (RPC patch → apply → snapshot); i18n in en + all 13 locales. +- **Privacy hook** — `spawn_lock_watcher` pauses capture + resets the segmenter while the screen is locked (macOS via `CGSessionCopyCurrentDictionary`, null/type-safe FFI; other platforms never pause yet). +- Reused `audio_capture` helpers (`to_mono`/`resample`/`chunk_rms` made `pub(crate)` + new `encode_wav_16k`). 
**Acceptance criteria:** -- [ ] User can speak without pressing any hotkey -- [ ] VAD detects end of utterance and sends to agent -- [ ] Toggle in Settings → Voice +- [x] User can speak without pressing any hotkey +- [x] VAD detects end of utterance and sends to agent +- [x] Toggle in Settings → Voice + +**Wake word "Hey Tiny" (live-fix, 2026-06-03):** always-on now only delivers an utterance to the agent when its transcript contains the wake word (`config.voice_server.wake_word`, default "Hey Tiny"); the phrase is stripped and the remainder is sent. Tolerant match (case/punctuation/leading-filler), empty wake word = deliver everything. This is a **text-based** wake word (transcribe-then-gate) — a first cut of Phase 3's trigger phrase; it fixes the "sends every utterance" spam but still runs STT on all speech (an on-device audio wake-word model for efficiency is the Phase 3 follow-up). + +**Live-fixes found by running it:** +- **Toggle did nothing** — `always_on_enabled` wasn't in the `update_voice_server_settings` RPC *param schema*, so validation rejected it before the handler. Added it; the config RPC now also calls `always_on::start_if_enabled` so the toggle starts/idles capture **live** (runtime `ENABLED` gate, no restart). +- **`transcription failed: local ai is disabled`** — always-on used `voice_transcribe_bytes` (local whisper only). Now routes through `effective_stt_provider` + `create_stt_provider` (same factory dispatch as `voice.stt_dispatch`), honoring cloud STT. +- Toggle surfaced in the reachable **VoicePanel** (Settings → Advanced → AI → Voice), not the hidden debug panel. + +**Pending live validation (mic-dependent, can't be CI-tested):** say "Hey Tiny, <command>" and confirm the command reaches the agent; tune `vad_onset_threshold`/`vad_hangover_ms` to the user's mic + room. Windows/Linux screen-lock pause is a follow-up (no signal wired). 
--- @@ -457,6 +574,24 @@ Shipped on the Windows machine (2026-06-02): --- +## Fine-tuning backlog ⏳ (deferred until all phases complete) + +From live agent-in-the-loop testing on 2026-06-03 (grounded in `~/.openhuman/logs/openhuman.2026-06-03.log`, `session_raw/*.jsonl`, and the dev run: **keyboard=69 / mouse=0 / screenshot=10** tool calls; **26 wake matches vs 93 misses**; emit=true utterances ranged 0.7s–28s). The feature works but needs tuning. **Do not implement until Phases 3–4 land.** + +### F1 — Listening window too short for long commands +- **Observed:** `vad_hangover_ms = 800` closes an utterance on any pause > 0.8s, so multi-clause commands ("Hey Tiny, open Slack and message the team channel saying …") split across utterances — the tail lacks the wake word and is dropped. Compounded by the notch "Listening" pill TTL (2500ms) expiring mid-speech, so it *looks* like it stopped listening. +- **Resolve:** (a) raise `vad_hangover_ms` to ~1500ms; (b) **two-stage capture** — once the wake word is detected, open a dedicated longer command window (until a longer silence / N-second cap) instead of relying on a single VAD utterance; (c) keep the "Listening" pill alive for the whole utterance (extend/re-emit on each voiced frame, clear on `SpeechEnd`) so the notch reflects real mic state. + +### F2 — Agent uses keyboard only, never the mouse +- **Observed:** keyboard=69, mouse=0. Two causes: the orchestrator prompt is deliberately *keyboard-first*, **and** the downscaled screenshot's coordinates don't map to screen pixels — the capture is shrunk to ≤1568px while `mouse` expects absolute screen pixels (and Retina is 2× points), so any coordinate read from the image clicks the wrong spot. Vision-driven clicking is therefore currently unsafe and the agent (correctly) avoids it. 
+- **Resolve:** (a) make `screenshot` emit a coordinate transform (shown WxH + real screen WxH + backing scale) **or** have `mouse` accept image-relative coordinates and convert internally; (b) once coordinates are trustworthy, soften the prompt so the agent uses screenshot→mouse to click specific elements, not just keyboard. + +### F3 — No periodic screenshot/verify + foreground re-check +- **Observed:** the agent screenshots ad-hoc (0 in the last session); `automate` only foregrounds at the start. +- **Resolve:** in the `automate` loop **and** the orchestrator prompt — screenshot + verify at **start, after every ~3 actions, and at the end**; before each action confirm the frontmost app is the target and re-`launch_app` (foreground) it if not, then proceed. Fold the actual-vs-expected check into the loop's `verify` step. + +--- + ## Summary | Phase | Item | Status | @@ -472,9 +607,18 @@ Shipped on the Windows machine (2026-06-02): | 1 | AXUIElement app UI interaction (`ax_interact`) | ✅ Done | | 1 | Multi-step UI workflow guidance | ✅ Done | | 1 | Apple Music two-step play (navigate→play) | ✅ Done (playback best-effort) | -| 2 | Always-on microphone loop | ⏳ Not started | -| 2 | `always_on_enabled` config flag | ⏳ Not started | -| 2 | Privacy hook (screen lock pause) | ⏳ Not started | +| 1 | `automate(app, goal)` Rust-driven loop (Change 1.14) | 🔨 M1+M2+M3 done (37 tests; live proof pending) | +| 1.5 | M1: automate loop skeleton + tool | ✅ Done | +| 1.5 | M2: poll-until-stable settle | ✅ Done | +| 1.5 | M3: Music fast-path | ✅ Done (proven live on macOS) | +| 1.5 | Robustness: quoted-query parse + no-progress guard | ✅ Done (from live agent failures) | +| 1.5 | M4: progress streaming to notch | ✅ Done — notch cherry-picked in; automate streams live steps | +| 1.5 | M5: richer element model (`enabled`) | ✅ Plumbed; AXEnabled found unreliable → informational only | +| 1.5 | Native fast-paths (Music/Spotify/Slack) | ⏳ Not started | +| 1.5 | Vision fallback for 
Electron apps | ⏳ Not started | +| 2 | Always-on microphone loop | ✅ Done (cpal → VAD → STT → agent) | +| 2 | `always_on_enabled` config flag + Settings toggle | ✅ Done (RPC + UI + i18n) | +| 2 | Privacy hook (screen lock pause) | ✅ Done (macOS; other OSes follow-up) | | 3 | Wake-word detection | ⏳ Not started | | 3 | Local command router | ⏳ Not started | | 4 | Voice confirmation loop | ⏳ Not started | diff --git a/src/openhuman/agent_registry/agents/orchestrator/agent.toml b/src/openhuman/agent_registry/agents/orchestrator/agent.toml index 2ee6e58ae3..80a7cb7f78 100644 --- a/src/openhuman/agent_registry/agents/orchestrator/agent.toml +++ b/src/openhuman/agent_registry/agents/orchestrator/agent.toml @@ -130,6 +130,23 @@ named = [ # `computer_control.ax_interact_mutations`, and refuse a sensitive-app # denylist (password managers, Keychain, System Settings, terminals). "ax_interact", + # Multi-step UI automation in one call (e.g. "play <song> in Music", + # "message <person> in Slack"). Prefer over many individual ax_interact + # calls when the task needs several UI steps — a Rust perceive→act→verify + # loop runs the flow with a fast model. Same opt-in + sensitive-app denylist + # as ax_interact; `Dangerous`, gates through the ApprovalGate. + "automate", + # Full computer control (autonomy). Fallback for apps the accessibility API + # can't drive — notably Electron apps (Slack, Discord, VS Code) whose AX/UIA + # tree is empty. `screenshot` to see the screen, then `mouse` (move/click/ + # drag/scroll) + `keyboard` (type/press/hotkey) to act by pixel coordinates. + # All `Dangerous` and gate through the ApprovalGate. mouse/keyboard require + # `computer_control.enabled = true`. Prefer `automate`/`ax_interact` first; + # use these when the AX tree comes back empty. NB: foreground the target app + # before typing/clicking (synthetic input goes to the focused window). 
"screenshot", + "mouse", + "keyboard", # Time + scheduling — lets the orchestrator answer "what time is it", # "remind me in 10 minutes", "every morning at 8" directly rather than # delegating or telling the user it can't. `current_time` grounds diff --git a/src/openhuman/agent_registry/agents/orchestrator/prompt.md b/src/openhuman/agent_registry/agents/orchestrator/prompt.md index e1b0d21c92..70d9873a95 100644 --- a/src/openhuman/agent_registry/agents/orchestrator/prompt.md +++ b/src/openhuman/agent_registry/agents/orchestrator/prompt.md @@ -38,6 +38,25 @@ Follow this sequence for every user message: Default bias: **do not spawn a sub-agent when a direct response or direct tool call is sufficient** — but a live external-service request is *not* something to answer from memory, it requires the integration. Use `spawn_worker_thread` for long tasks that need their own thread. +## Controlling desktop apps (full autonomy) + +You can open and operate native apps on this machine. **Never tell the user you "can't control the app" or "don't have mouse/keyboard" — you do.** + +**Rule 0 — foreground first, every time.** Before *any* keyboard/mouse action, call `launch_app "<app>"` for the target. `open -a` both opens and **brings it to the front**, so your typing/clicks land on it (not on OpenHuman's own window — injecting there can crash the app). Re-call `launch_app` right before each keyboard/mouse step if focus might have moved. + +**The reliable path is the keyboard, not the mouse.** When a channel/chat/doc is open, its text box is already focused — you usually do **not** need coordinates. Prefer this: + +1. `launch_app "<app>"` (foreground). +2. `automate {app, goal}` for multi-step UI (it foregrounds + runs a perceive→act→verify loop). Good for native apps (Music, Mail, Notes). +3. 
**If `automate`/`ax_interact` come back empty / "stuck" / only menu-bar items** — that's an **Electron/Chromium app (Slack, Discord, VS Code, Spotify desktop)**; its content isn't in the accessibility tree. Switch to **keyboard-driven control**: + - `launch_app "<app>"` (foreground), then `keyboard` `type` the text and `press` `Enter`. The focused input receives it. Use app **hotkeys** to navigate (no mouse needed). +4. **Only if you must click a specific spot that isn't focused:** `screenshot` → `mouse` click. (Screenshots are downscaled so you can see them; coordinates you read are in the returned image's pixels.) + +**Worked example — "message <person> hi on Slack" (keyboard-only, no vision):** +`launch_app "Slack"` → `keyboard hotkey "cmd+k"` (Slack quick switcher) → `keyboard type "<person>"` → `keyboard press "Enter"` (opens the chat, focuses the message box) → `keyboard type "hi"` → `keyboard press "Enter"` (sends). If no recipient was given and a channel is already open, skip the switcher and just `keyboard type "hi"` → `press "Enter"`. + +`screenshot`/`mouse`/`keyboard` run without an approval prompt (they're on your auto-approve list) — just proceed. + ## Rules - **You are the chat tier.** You run on a fast UX-focused model (TTFT > deep reasoning). When a task needs sustained multi-step thinking — planning across many steps, comparing several non-obvious options, untangling ambiguous requirements — **delegate to the reasoning tier (`delegate_plan`)** rather than reasoning through it yourself. Your job at that point is to brief the planner well and synthesise its output back to the user. diff --git a/src/openhuman/tools/impl/computer/automate.rs b/src/openhuman/tools/impl/computer/automate.rs new file mode 100644 index 0000000000..638ab6567b --- /dev/null +++ b/src/openhuman/tools/impl/computer/automate.rs @@ -0,0 +1,223 @@ +//! Tool: `automate` — accomplish a multi-step UI goal in one call. +//! +//! The orchestrator calls `automate{app, goal}` once; the Rust loop in +//! 
`accessibility::automate` then perceives → decides (fast model) → acts →
+//! settles → verifies until the goal is met or a step budget is hit. This keeps
+//! the heavy chat model out of the click loop (latency + reliability — see
+//! `docs/voice-automate-plan.md`).
+//!
+//! Safety mirrors `ax_interact`: it actuates real controls, so it is a mutating
+//! tool — opt-in via `computer_control.ax_interact_mutations`, routed through the
+//! ApprovalGate, and it refuses the sensitive-app denylist (password managers,
+//! Keychain, System Settings, terminals) even on auto-approved turns.
+
+use super::ax_interact::is_sensitive_app;
+use crate::openhuman::accessibility::automate::{self, AutomateOptions, RealBackend};
+use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolCallOptions, ToolResult};
+use async_trait::async_trait;
+use serde_json::json;
+
+pub struct AutomateTool {
+    /// When false the tool refuses to run (it is inherently mutating). Mirrors
+    /// `AxInteractTool::allow_mutations` so one opt-in governs both.
+    allow_mutations: bool,
+}
+
+impl AutomateTool {
+    pub fn new(allow_mutations: bool) -> Self {
+        Self { allow_mutations }
+    }
+}
+
+impl Default for AutomateTool {
+    fn default() -> Self {
+        Self::new(false)
+    }
+}
+
+#[async_trait]
+impl Tool for AutomateTool {
+    fn name(&self) -> &str {
+        "automate"
+    }
+
+    fn description(&self) -> &str {
+        "Accomplish a MULTI-STEP goal inside a desktop app in a single call — e.g. \
+         'play <song> in Music', 'message <person> in Slack'. Give the app \
+         name and a plain-English goal; the system drives the app's UI step by step \
+         (find elements → press/type → verify) using the platform accessibility API, \
+         no screen coordinates. Prefer this over issuing many individual \
+         `ax_interact` calls when the task needs several UI steps. The app should \
+         usually be launched first (or include 'launch' in the goal). Refuses \
+         password managers, Keychain, System Settings, and terminals."
+    }
+
+    fn parameters_schema(&self) -> serde_json::Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "app": {
+                    "type": "string",
+                    "description": "Display name of the target application (e.g. 'Music', 'Slack')."
+                },
+                "goal": {
+                    "type": "string",
+                    "description": "Plain-English description of the multi-step outcome to achieve."
+                }
+            },
+            "required": ["app", "goal"]
+        })
+    }
+
+    fn permission_level(&self) -> PermissionLevel {
+        // Always mutating — it actuates controls. Kept as the base level so the
+        // approval gate fires regardless of args.
+        PermissionLevel::Dangerous
+    }
+
+    fn external_effect(&self) -> bool {
+        true
+    }
+
+    async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
+        self.execute_with_options(args, ToolCallOptions::default())
+            .await
+    }
+
+    async fn execute_with_options(
+        &self,
+        args: serde_json::Value,
+        _options: ToolCallOptions,
+    ) -> anyhow::Result<ToolResult> {
+        let app = args
+            .get("app")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .trim()
+            .to_string();
+        let goal = args
+            .get("goal")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .trim()
+            .to_string();
+
+        log::info!("[automate] ▶ tool execute app={app:?} goal={goal:?}");
+
+        if app.is_empty() {
+            return Ok(ToolResult::error("app is required"));
+        }
+        if goal.is_empty() {
+            return Ok(ToolResult::error("goal is required"));
+        }
+
+        // Hard safety boundary — identical to ax_interact's denylist.
+        if is_sensitive_app(&app) {
+            log::warn!("[automate] refused: sensitive app '{app}'");
+            return Ok(ToolResult::error(format!(
+                "Refusing to automate '{app}': it is on the sensitive-app denylist \
+                 (password managers, Keychain, System Settings, terminals). This is a \
+                 hard safety boundary."
+            )));
+        }
+
+        if !self.allow_mutations {
+            log::warn!("[automate] refused: mutations disabled");
+            return Ok(ToolResult::error(
+                "App control isn't enabled yet. Turn on App Automation in \
+                 Settings → Agent Access (it grants permission to control apps), \
+                 then ask again. (Sets computer_control.ax_interact_mutations = true.)",
+            ));
+        }
+
+        let config = match crate::openhuman::config::rpc::load_config_with_timeout().await {
+            Ok(c) => c,
+            Err(e) => return Ok(ToolResult::error(format!("could not load config: {e}"))),
+        };
+
+        let backend = RealBackend::new(config);
+        let outcome = automate::run(&app, &goal, &backend, AutomateOptions::default()).await;
+
+        let mut body = format!("{}\n\nSteps:", outcome.summary);
+        if outcome.steps.is_empty() {
+            body.push_str("\n (no steps executed)");
+        } else {
+            for s in &outcome.steps {
+                body.push_str(&format!("\n - {s}"));
+            }
+        }
+
+        if outcome.success {
+            Ok(ToolResult::success(body))
+        } else {
+            Ok(ToolResult::error(body))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn name_and_permission() {
+        let t = AutomateTool::new(true);
+        assert_eq!(t.name(), "automate");
+        assert_eq!(t.permission_level(), PermissionLevel::Dangerous);
+        assert!(t.external_effect());
+    }
+
+    #[test]
+    fn schema_requires_app_and_goal() {
+        let schema = AutomateTool::new(true).parameters_schema();
+        let req = schema["required"].as_array().unwrap();
+        assert!(req.iter().any(|v| v == "app"));
+        assert!(req.iter().any(|v| v == "goal"));
+    }
+
+    #[tokio::test]
+    async fn rejects_missing_app_or_goal() {
+        let t = AutomateTool::new(true);
+        assert!(
+            t.execute(json!({"app": "", "goal": "x"}))
+                .await
+                .unwrap()
+                .is_error
+        );
+        assert!(
+            t.execute(json!({"app": "Music", "goal": ""}))
+                .await
+                .unwrap()
+                .is_error
+        );
+    }
+
+    #[tokio::test]
+    async fn refuses_when_mutations_disabled() {
+        let t = AutomateTool::new(false);
+        let r = t
+            .execute(json!({"app": "Music", "goal": "play a song"}))
+            .await
+            .unwrap();
+        assert!(r.is_error);
+        assert!(r.output().contains("ax_interact_mutations"));
+    }
+
+    #[tokio::test]
+    async fn refuses_sensitive_app() {
+        let t = AutomateTool::new(true);
+        for app in [
+            "Keychain Access",
+            "1Password",
+            "Terminal",
+            "System Settings",
+        ] {
+            let r = t
+
.execute(json!({"app": app, "goal": "do something"})) + .await + .unwrap(); + assert!(r.is_error, "expected refusal for {app}"); + assert!(r.output().to_lowercase().contains("denylist")); + } + } +} diff --git a/src/openhuman/tools/impl/computer/ax_interact.rs b/src/openhuman/tools/impl/computer/ax_interact.rs index 358d9179c5..b5ffa90f32 100644 --- a/src/openhuman/tools/impl/computer/ax_interact.rs +++ b/src/openhuman/tools/impl/computer/ax_interact.rs @@ -48,7 +48,9 @@ const SENSITIVE_APPS: &[&str] = &[ "rio", ]; -fn is_sensitive_app(app_name: &str) -> bool { +/// True when `app_name` is on the never-actuate denylist. `pub(crate)` so the +/// `automate` tool shares the exact same boundary as `ax_interact`. +pub(crate) fn is_sensitive_app(app_name: &str) -> bool { let lower = app_name.to_lowercase(); SENSITIVE_APPS.iter().any(|s| lower.contains(s)) } @@ -229,10 +231,10 @@ impl Tool for AxInteractTool { if mutating && !self.allow_mutations { log::warn!("[ax_interact] refused: mutations disabled (action={action})"); return Ok(ToolResult::error( - "ax_interact mutations (press/set_value) are disabled. They actuate arbitrary \ - app controls and type into arbitrary fields, so they require explicit opt-in: \ - set `computer_control.ax_interact_mutations = true`. The read-only 'list' \ - action remains available.", + "App control isn't enabled yet, so I can't press buttons or type into \ + this app. Turn on App UI Control / App Automation in Settings → Agent \ + Access, then ask again. 
(Reading the UI still works without it; sets \ + computer_control.ax_interact_mutations = true.)", )); } diff --git a/src/openhuman/tools/impl/computer/mod.rs b/src/openhuman/tools/impl/computer/mod.rs index 6603105d9c..379c8833f4 100644 --- a/src/openhuman/tools/impl/computer/mod.rs +++ b/src/openhuman/tools/impl/computer/mod.rs @@ -1,9 +1,11 @@ +mod automate; mod ax_interact; mod human_path; mod keyboard; mod main_thread; mod mouse; +pub use automate::AutomateTool; pub use ax_interact::AxInteractTool; pub use keyboard::KeyboardTool; pub use main_thread::{run_input_on_main, MainThreadInputOp, INPUT_ON_MAIN_THREAD_METHOD}; diff --git a/src/openhuman/tools/ops.rs b/src/openhuman/tools/ops.rs index 8c9f2e810d..8ce5445a29 100644 --- a/src/openhuman/tools/ops.rs +++ b/src/openhuman/tools/ops.rs @@ -177,6 +177,12 @@ pub fn all_tools_with_runtime( Box::new(AxInteractTool::new( root_config.computer_control.ax_interact_mutations, )), + // Multi-step UI automation in one call. Shares the ax_interact opt-in + // (mutations) and sensitive-app denylist; runs a Rust perceive→act→verify + // loop with a fast model so the chat model stays out of the click loop. + Box::new(AutomateTool::new( + root_config.computer_control.ax_interact_mutations, + )), Box::new(CodegraphIndexTool::new( config.clone(), action_dir.to_path_buf(), diff --git a/src/openhuman/tools/user_filter.rs b/src/openhuman/tools/user_filter.rs index 43c0299326..6aa82e5cfd 100644 --- a/src/openhuman/tools/user_filter.rs +++ b/src/openhuman/tools/user_filter.rs @@ -41,6 +41,13 @@ const TOOL_FAMILIES: &[ToolFamily] = &[ rust_names: &["ax_interact"], default_enabled: true, }, + // Multi-step UI automation (one call → whole flow). Same opt-in as + // ax_interact; surfaced as its own catalog toggle. + ToolFamily { + id: "automate", + rust_names: &["automate"], + default_enabled: true, + }, // Computer control — mouse and keyboard. Gated by computer_control.enabled // in config (tools only register when that flag is true). 
PermissionLevel::Dangerous // so the approval gate fires per-action; user opts in explicitly. From 578846a061a1b30305cbd68df15d9f8a564877c7 Mon Sep 17 00:00:00 2001 From: M3gA-Mind Date: Thu, 4 Jun 2026 14:20:06 +0530 Subject: [PATCH 4/9] feat(voice): Phase 2 always-on listening engine + config + RPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Continuous cpal mic → VAD segmenter → STT → agent with no hotkey, opt-in via voice_server.always_on_enabled, 'Hey Tiny' wake word (English-forced STT + fuzzy match), and screen-lock privacy pause. Adds the config schema, live-apply on the settings RPC, start_if_enabled wiring, and a JSON-RPC roundtrip E2E. Slice 4/7 of #3307 (always-on core). --- src/openhuman/config/ops.rs | 10 + src/openhuman/config/ops_tests.rs | 4 + src/openhuman/config/schema/voice_server.rs | 91 +++ src/openhuman/config/schemas.rs | 20 +- src/openhuman/credentials/ops.rs | 4 + src/openhuman/voice/always_on.rs | 832 ++++++++++++++++++++ src/openhuman/voice/audio_capture.rs | 35 +- src/openhuman/voice/mod.rs | 1 + tests/json_rpc_e2e.rs | 226 ++---- 9 files changed, 1046 insertions(+), 177 deletions(-) create mode 100644 src/openhuman/voice/always_on.rs diff --git a/src/openhuman/config/ops.rs b/src/openhuman/config/ops.rs index 972a916e86..930615bdaf 100644 --- a/src/openhuman/config/ops.rs +++ b/src/openhuman/config/ops.rs @@ -1974,6 +1974,8 @@ pub struct VoiceServerSettingsPatch { pub min_duration_secs: Option, pub silence_threshold: Option, pub custom_dictionary: Option>, + pub always_on_enabled: Option, + pub wake_word: Option, } /// Returns the current voice server settings as a JSON object. 
@@ -1987,6 +1989,8 @@ pub async fn get_voice_server_settings() -> Result "min_duration_secs": config.voice_server.min_duration_secs, "silence_threshold": config.voice_server.silence_threshold, "custom_dictionary": config.voice_server.custom_dictionary, + "always_on_enabled": config.voice_server.always_on_enabled, + "wake_word": config.voice_server.wake_word, }); Ok(RpcOutcome::new( result, @@ -2034,6 +2038,12 @@ pub async fn load_and_apply_voice_server_settings( if let Some(custom_dictionary) = update.custom_dictionary { config.voice_server.custom_dictionary = custom_dictionary; } + if let Some(always_on_enabled) = update.always_on_enabled { + config.voice_server.always_on_enabled = always_on_enabled; + } + if let Some(wake_word) = update.wake_word { + config.voice_server.wake_word = wake_word; + } config.save().await.map_err(|e| e.to_string())?; let snapshot = snapshot_config_json(&config)?; Ok(RpcOutcome::new( diff --git a/src/openhuman/config/ops_tests.rs b/src/openhuman/config/ops_tests.rs index 8196a88800..421be43cde 100644 --- a/src/openhuman/config/ops_tests.rs +++ b/src/openhuman/config/ops_tests.rs @@ -989,6 +989,8 @@ async fn load_and_apply_voice_server_settings_rejects_invalid_activation_mode() min_duration_secs: None, silence_threshold: None, custom_dictionary: None, + always_on_enabled: None, + wake_word: None, }; let err = load_and_apply_voice_server_settings(patch) .await @@ -1041,6 +1043,8 @@ async fn load_and_apply_voice_server_settings_accepts_valid_modes_and_clamps() { min_duration_secs: Some(-5.0), silence_threshold: Some(-1.0), custom_dictionary: Some(vec!["term".into()]), + always_on_enabled: Some(true), + wake_word: Some("Hey Tiny".to_string()), }; let outcome = load_and_apply_voice_server_settings(patch) .await diff --git a/src/openhuman/config/schema/voice_server.rs b/src/openhuman/config/schema/voice_server.rs index 9452592d5e..1018e1a8de 100644 --- a/src/openhuman/config/schema/voice_server.rs +++ 
b/src/openhuman/config/schema/voice_server.rs @@ -52,6 +52,44 @@ pub struct VoiceServerConfig { /// technical terms, and domain-specific vocabulary. #[serde(default)] pub custom_dictionary: Vec, + + /// Phase 2 — always-on listening. When true, the voice server keeps the + /// microphone open continuously and segments utterances with + /// voice-activity detection (VAD) instead of requiring a hotkey press. + /// Off by default: always-on listening has obvious privacy weight, so it + /// is strictly opt-in. + #[serde(default)] + pub always_on_enabled: bool, + + /// VAD speech-onset threshold (peak RMS energy). A frame whose RMS rises + /// above this is treated as the start of speech. Slightly higher than the + /// hotkey `silence_threshold` because an always-open mic must reject more + /// ambient noise before opening an utterance. + #[serde(default = "default_vad_onset_threshold")] + pub vad_onset_threshold: f32, + + /// VAD hangover: how long (milliseconds) RMS must stay below the onset + /// threshold before the current utterance is considered finished. Prevents + /// chopping an utterance on natural mid-sentence pauses. + #[serde(default = "default_vad_hangover_ms")] + pub vad_hangover_ms: u32, + + /// Minimum speech duration (milliseconds) for a segment to be emitted. + /// Shorter blips (a cough, a door) are discarded before transcription. + #[serde(default = "default_vad_min_speech_ms")] + pub vad_min_speech_ms: u32, + + /// Hard ceiling (seconds) on a single always-on utterance. Forces a flush + /// so a continuous noise source can't grow an unbounded recording. + #[serde(default = "default_vad_max_utterance_secs")] + pub vad_max_utterance_secs: f32, + + /// Wake word for always-on mode. An utterance is only delivered to the agent + /// when its transcript contains this phrase; the phrase is stripped and the + /// remainder is sent as the command. Empty = no wake word (deliver every + /// utterance). Default "Hey Tiny". 
+ #[serde(default = "default_wake_word")] + pub wake_word: String, } fn default_hotkey() -> String { @@ -66,6 +104,26 @@ fn default_silence_threshold() -> f32 { 0.002 } +fn default_vad_onset_threshold() -> f32 { + 0.01 +} + +fn default_vad_hangover_ms() -> u32 { + 800 +} + +fn default_vad_min_speech_ms() -> u32 { + 300 +} + +fn default_vad_max_utterance_secs() -> f32 { + 30.0 +} + +fn default_wake_word() -> String { + "Hey Tiny".to_string() +} + impl Default for VoiceServerConfig { fn default() -> Self { Self { @@ -76,6 +134,39 @@ impl Default for VoiceServerConfig { min_duration_secs: default_min_duration(), silence_threshold: default_silence_threshold(), custom_dictionary: Vec::new(), + always_on_enabled: false, + vad_onset_threshold: default_vad_onset_threshold(), + vad_hangover_ms: default_vad_hangover_ms(), + vad_min_speech_ms: default_vad_min_speech_ms(), + vad_max_utterance_secs: default_vad_max_utterance_secs(), + wake_word: default_wake_word(), } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn defaults_are_opt_in_and_sane() { + let c = VoiceServerConfig::default(); + // Always-on is privacy-sensitive — must default off. + assert!(!c.always_on_enabled); + // Onset must sit above the hotkey silence floor so an open mic rejects + // ambient noise that the push-to-talk path would have tolerated. + assert!(c.vad_onset_threshold > c.silence_threshold); + assert!(c.vad_hangover_ms > 0); + assert!(c.vad_min_speech_ms > 0); + assert!(c.vad_max_utterance_secs > 0.0); + } + + #[test] + fn deserializes_with_all_vad_fields_defaulted() { + // An older config file with none of the Phase 2 keys must still load. 
+ let c: VoiceServerConfig = serde_json::from_str("{}").unwrap(); + assert!(!c.always_on_enabled); + assert_eq!(c.vad_hangover_ms, default_vad_hangover_ms()); + assert_eq!(c.vad_min_speech_ms, default_vad_min_speech_ms()); + } +} diff --git a/src/openhuman/config/schemas.rs b/src/openhuman/config/schemas.rs index 0876605b16..5185d80405 100644 --- a/src/openhuman/config/schemas.rs +++ b/src/openhuman/config/schemas.rs @@ -192,6 +192,8 @@ struct VoiceServerSettingsUpdate { min_duration_secs: Option, silence_threshold: Option, custom_dictionary: Option>, + always_on_enabled: Option, + wake_word: Option, } #[derive(Debug, Deserialize)] @@ -1130,6 +1132,14 @@ pub fn schemas(function: &str) -> ControllerSchema { comment: "Custom vocabulary words to bias whisper toward.", required: false, }, + optional_bool( + "always_on_enabled", + "Continuous always-on listening (no hotkey). Opt-in.", + ), + optional_string( + "wake_word", + "Always-on wake word; utterances must contain it (default 'Hey Tiny').", + ), ], outputs: vec![json_output("snapshot", "Updated config snapshot.")], }, @@ -1715,8 +1725,16 @@ fn handle_update_voice_server_settings(params: Map) -> Controller min_duration_secs: update.min_duration_secs, silence_threshold: update.silence_threshold, custom_dictionary: update.custom_dictionary, + always_on_enabled: update.always_on_enabled, + wake_word: update.wake_word, }; - to_json(config_rpc::load_and_apply_voice_server_settings(patch).await?) + let result = config_rpc::load_and_apply_voice_server_settings(patch).await?; + // Apply the always-on toggle live (start/idle the capture loop) so the + // Settings switch takes effect without a restart. 
+ if let Ok(config) = config_rpc::load_config_with_timeout().await { + crate::openhuman::voice::always_on::start_if_enabled(&config).await; + } + to_json(result) }) } diff --git a/src/openhuman/credentials/ops.rs b/src/openhuman/credentials/ops.rs index d95273d46a..4d34fee497 100644 --- a/src/openhuman/credentials/ops.rs +++ b/src/openhuman/credentials/ops.rs @@ -41,6 +41,10 @@ pub async fn start_login_gated_services(config: &Config) { crate::openhuman::voice::dictation_listener::start_if_enabled(config).await; } + // 3b. Always-on listening (Phase 2): continuous mic + VAD → STT → agent, + // no hotkey. Opt-in via config.voice_server.always_on_enabled. + crate::openhuman::voice::always_on::start_if_enabled(config).await; + // 4. Screen intelligence (capture + vision analysis) crate::openhuman::screen_intelligence::server::start_if_enabled(config).await; diff --git a/src/openhuman/voice/always_on.rs b/src/openhuman/voice/always_on.rs new file mode 100644 index 0000000000..19d2f915a4 --- /dev/null +++ b/src/openhuman/voice/always_on.rs @@ -0,0 +1,832 @@ +//! Phase 2 — always-on listening. +//! +//! Instead of a hotkey gating each recording, always-on mode keeps the mic +//! open continuously and uses **voice-activity detection (VAD)** to carve the +//! audio stream into utterances: an utterance opens when energy rises above an +//! onset threshold and closes after a configurable run of silence (the +//! "hangover"). Each completed utterance is transcribed and pushed onto the +//! dictation bus, so it reaches the agent and the notch exactly like a hotkey +//! dictation. +//! +//! Layers: +//! - [`VadSegmenter`] — a pure state machine over per-frame RMS energies, +//! unit-tested deterministically (no audio backend). +//! - [`start_if_enabled`] — opens a continuous cpal mic stream on a dedicated +//! thread, slices 16 kHz mono frames, drives the segmenter, transcribes each +//! utterance via the configured STT provider, then applies the wake-word +//! 
gate ([`extract_command`], default "Hey Tiny") before delivering the +//! command to the agent via `publish_transcription`. +//! - [`spawn_lock_watcher`] — privacy hook: pauses capture while the screen is +//! locked (macOS via the Quartz session dictionary). +//! +//! Privacy: always-on is **opt-in** (`config.voice_server.always_on_enabled`, +//! default false) and pauses when the screen is locked. + +use crate::openhuman::config::VoiceServerConfig as CfgVoiceServer; + +const LOG_PREFIX: &str = "[voice::always_on]"; + +/// Tuning for the VAD segmenter, distilled from [`CfgVoiceServer`]. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct VadConfig { + /// Peak-RMS energy above which a frame counts as speech. + pub onset_threshold: f32, + /// How long energy must stay below `onset_threshold` before the current + /// utterance is closed. Bridges natural mid-sentence pauses. + pub hangover_ms: u32, + /// Minimum voiced duration for a segment to be emitted; shorter blips + /// (cough, door) are dropped. + pub min_speech_ms: u32, + /// Hard ceiling on a single utterance — forces a flush so a continuous + /// noise source can't grow an unbounded recording. + pub max_utterance_ms: u32, +} + +impl VadConfig { + /// Build VAD tuning from the persisted voice-server config. + pub fn from_server_config(c: &CfgVoiceServer) -> Self { + Self { + onset_threshold: c.vad_onset_threshold, + hangover_ms: c.vad_hangover_ms, + min_speech_ms: c.vad_min_speech_ms, + max_utterance_ms: (c.vad_max_utterance_secs * 1000.0).round().max(1.0) as u32, + } + } +} + +/// An event emitted by the segmenter as the audio stream is consumed. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum VadEvent { + /// Energy crossed the onset threshold — an utterance has begun. + SpeechStart, + /// An utterance closed. 
`voiced_ms` is the accumulated speech duration + /// (excluding the trailing silence); `emit` is false when it fell below + /// `min_speech_ms` (drop it); `forced` is true when the close was caused + /// by the `max_utterance_ms` ceiling rather than a silence hangover. + SpeechEnd { + voiced_ms: u32, + emit: bool, + forced: bool, + }, +} + +#[derive(Debug, Clone, Copy)] +enum State { + /// No active utterance — waiting for energy to cross the onset threshold. + Silent, + /// Inside an utterance. + Speaking { + /// Total elapsed time since the utterance opened (voiced + silence). + total_ms: u32, + /// Accumulated voiced time (frames above onset). + voiced_ms: u32, + /// Consecutive below-onset time since the last voiced frame. + silence_run_ms: u32, + }, +} + +/// Pure VAD state machine. Drive it by calling [`push_frame`](Self::push_frame) +/// with the RMS energy of each fixed-size audio frame; it returns at most one +/// [`VadEvent`] per frame. +#[derive(Debug)] +pub struct VadSegmenter { + cfg: VadConfig, + state: State, +} + +impl VadSegmenter { + pub fn new(cfg: VadConfig) -> Self { + Self { + cfg, + state: State::Silent, + } + } + + /// True while inside an utterance (between `SpeechStart` and `SpeechEnd`). + pub fn is_speaking(&self) -> bool { + matches!(self.state, State::Speaking { .. }) + } + + /// Abort any in-flight utterance and return to the idle state without + /// emitting an event. Used by the privacy hook (screen lock) and on + /// stream teardown. + pub fn reset(&mut self) { + self.state = State::Silent; + } + + /// Feed one frame's RMS energy and its duration in milliseconds. 
+ pub fn push_frame(&mut self, rms: f32, frame_ms: u32) -> Option { + let above = rms >= self.cfg.onset_threshold; + match self.state { + State::Silent => { + if above { + self.state = State::Speaking { + total_ms: frame_ms, + voiced_ms: frame_ms, + silence_run_ms: 0, + }; + Some(VadEvent::SpeechStart) + } else { + None + } + } + State::Speaking { + mut total_ms, + mut voiced_ms, + mut silence_run_ms, + } => { + total_ms = total_ms.saturating_add(frame_ms); + if above { + voiced_ms = voiced_ms.saturating_add(frame_ms); + silence_run_ms = 0; + } else { + silence_run_ms = silence_run_ms.saturating_add(frame_ms); + } + + // Close on a silence hangover. + if silence_run_ms >= self.cfg.hangover_ms { + self.state = State::Silent; + let emit = voiced_ms >= self.cfg.min_speech_ms; + return Some(VadEvent::SpeechEnd { + voiced_ms, + emit, + forced: false, + }); + } + // Close on the hard utterance ceiling. + if total_ms >= self.cfg.max_utterance_ms { + self.state = State::Silent; + let emit = voiced_ms >= self.cfg.min_speech_ms; + return Some(VadEvent::SpeechEnd { + voiced_ms, + emit, + forced: true, + }); + } + + self.state = State::Speaking { + total_ms, + voiced_ms, + silence_run_ms, + }; + None + } + } + } +} + +// ── Continuous capture loop ───────────────────────────────────────────────── + +use crate::openhuman::config::Config; +use crate::openhuman::voice::audio_capture::{ + chunk_rms, encode_wav_16k, resample, to_mono, TARGET_SAMPLE_RATE, +}; +use std::sync::atomic::{AtomicBool, Ordering}; + +/// The capture thread + processor have been spawned (once per process). +static RUNNING: AtomicBool = AtomicBool::new(false); + +/// Runtime on/off, mirrors `config.voice_server.always_on_enabled`. Toggling it +/// at runtime takes effect immediately: when false the processor drops all audio +/// (nothing is transcribed or sent). Lets the Settings toggle work without a +/// restart. (The mic stream itself stays open until the next launch.) 
+static ENABLED: AtomicBool = AtomicBool::new(false); + +/// When true, the processor drops audio and resets the segmenter (privacy hook: +/// screen locked). Driven by [`spawn_lock_watcher`] on macOS. +static PAUSED: AtomicBool = AtomicBool::new(false); + +/// VAD frame size. 20 ms at 16 kHz = 320 samples — small enough for responsive +/// onset/hangover detection, large enough for a stable RMS estimate. +const FRAME_MS: u32 = 20; +const FRAME_SAMPLES: usize = (TARGET_SAMPLE_RATE as usize / 1000) * FRAME_MS as usize; + +/// Hard cap on a buffered utterance (defensive — the segmenter's +/// `max_utterance_ms` should flush first; this bounds memory if it doesn't). +const MAX_UTTERANCE_SAMPLES: usize = TARGET_SAMPLE_RATE as usize * 60; + +/// Apply the always-on config: set the runtime ENABLED gate and, when enabled, +/// open the continuous microphone stream (once per process). Safe to call at +/// boot **and** at runtime (the Settings toggle calls it via the config RPC): +/// toggling off flips `ENABLED` so the processor immediately stops transcribing/ +/// delivering; toggling on starts capture live without a restart. +/// +/// Opens a continuous mic stream, segments it with the [`VadSegmenter`], and +/// routes each finished utterance through STT and the dictation delivery bus (so +/// it reaches the agent exactly like a hotkey dictation, and lights up the notch). 
+pub async fn start_if_enabled(app_config: &Config) { + let on = app_config.voice_server.always_on_enabled; + ENABLED.store(on, Ordering::SeqCst); + if !on { + log::info!("{LOG_PREFIX} disabled — capture idle (toggle off)"); + return; + } + if RUNNING.swap(true, Ordering::SeqCst) { + log::info!("{LOG_PREFIX} re-enabled; capture already running"); + return; + } + + let vad = VadConfig::from_server_config(&app_config.voice_server); + let config = app_config.clone(); + log::info!( + "{LOG_PREFIX} enabled — onset={:.4} hangover={}ms min_speech={}ms max_utt={}ms", + vad.onset_threshold, + vad.hangover_ms, + vad.min_speech_ms, + vad.max_utterance_ms + ); + + // The cpal stream is `!Send`, so it lives on a dedicated thread that pushes + // 16 kHz mono frames over a channel to the async processor below. + let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::>(); + if let Err(e) = spawn_capture_thread(tx) { + log::error!("{LOG_PREFIX} could not start microphone capture: {e}"); + RUNNING.store(false, Ordering::SeqCst); + return; + } + + // Privacy hook: pause capture while the screen is locked. + spawn_lock_watcher(); + + tokio::spawn(async move { + let mut seg = VadSegmenter::new(vad); + let mut pending: Vec = Vec::new(); + let mut utterance: Vec = Vec::new(); + + while let Some(chunk) = rx.recv().await { + // Drop audio and abandon any in-flight utterance while paused + // (screen locked) or toggled off — nothing is captured or sent. 
+ if PAUSED.load(Ordering::Relaxed) || !ENABLED.load(Ordering::Relaxed) { + if seg.is_speaking() { + seg.reset(); + } + pending.clear(); + utterance.clear(); + continue; + } + pending.extend_from_slice(&chunk); + while pending.len() >= FRAME_SAMPLES { + let frame: Vec = pending.drain(..FRAME_SAMPLES).collect(); + let rms = chunk_rms(&frame); + match seg.push_frame(rms, FRAME_MS) { + Some(VadEvent::SpeechStart) => { + utterance.clear(); + utterance.extend_from_slice(&frame); + notch_status("Listening", 2500); // pill: capturing speech + } + Some(VadEvent::SpeechEnd { + emit, voiced_ms, .. + }) => { + let captured = std::mem::take(&mut utterance); + log::info!( + "{LOG_PREFIX} utterance end voiced_ms={voiced_ms} emit={emit} samples={}", + captured.len() + ); + if emit { + let cfg = config.clone(); + tokio::spawn(async move { + transcribe_and_deliver(&cfg, captured).await; + }); + } + } + None => { + if seg.is_speaking() && utterance.len() < MAX_UTTERANCE_SAMPLES { + utterance.extend_from_slice(&frame); + } + } + } + } + } + log::info!("{LOG_PREFIX} capture channel closed; processor exiting"); + RUNNING.store(false, Ordering::SeqCst); + }); +} + +/// Push a listener status to the always-visible notch pill via the +/// `overlay:attention` channel. The notch maps "Listening" / "Processing" to the +/// right icon; when the message expires it falls back to "Ready". Fire-and-forget. +fn notch_status(status: &str, ttl_ms: u32) { + let _ = crate::openhuman::overlay::publish_attention( + crate::openhuman::overlay::OverlayAttentionEvent::new(status) + .with_source("voice") + .with_ttl_ms(ttl_ms), + ); +} + +/// Transcribe a finished utterance and hand the text to the dictation bus, +/// which delivers it to the agent (auto-send) and the notch — the same path the +/// hotkey dictation uses. 
+async fn transcribe_and_deliver(config: &Config, samples_16k: Vec) { + use base64::Engine as _; + let wav = match encode_wav_16k(&samples_16k) { + Ok(w) => w, + Err(e) => { + log::warn!("{LOG_PREFIX} wav encode failed: {e}"); + return; + } + }; + // Route through the *configured* STT provider (cloud / whisper / slug) — the + // same factory dispatch the `voice.stt_dispatch` RPC uses — so always-on + // honors the user's choice instead of forcing local whisper. + let provider_name = crate::openhuman::voice::effective_stt_provider(config); + let model = crate::openhuman::voice::DEFAULT_WHISPER_MODEL.to_string(); + let provider = + match crate::openhuman::voice::create_stt_provider(&provider_name, &model, config) { + Ok(p) => p, + Err(e) => { + log::warn!("{LOG_PREFIX} STT provider '{provider_name}' unavailable: {e}"); + return; + } + }; + let audio_b64 = base64::engine::general_purpose::STANDARD.encode(&wav); + // Force English transcription. Auto-detect was rendering the English wake + // word "Hey Tiny" in Hindi/Bengali/etc. script ("हे टाइनी"), which could never + // match the Latin wake word. The wake word + commands here are English. + match provider + .transcribe( + config, + &audio_b64, + Some("audio/wav"), + Some("utterance.wav"), + Some("en"), + ) + .await + { + Ok(outcome) => { + let text = outcome.value.text.trim().to_string(); + if text.is_empty() { + log::debug!("{LOG_PREFIX} empty transcript dropped"); + return; + } + // Wake-word gate: only act on utterances addressed to the agent + // ("Hey Tiny, …"). Strip the wake phrase and deliver the command. 
+ match extract_command(&text, &config.voice_server.wake_word) { + Some(cmd) => { + log::info!("{LOG_PREFIX} wake word matched → command={cmd:?} → dictation bus"); + notch_status("Processing", 12000); // pill: running the command + crate::openhuman::voice::dictation_listener::publish_transcription(cmd); + } + None => { + // Visible at info so the user can see WHAT was heard when the + // wake word didn't match (diagnoses "Hey Tiny not responding"). + log::info!( + "{LOG_PREFIX} no wake word ({:?}) in transcript={text:?}; ignored", + config.voice_server.wake_word + ); + } + } + } + Err(e) => log::warn!("{LOG_PREFIX} transcription failed ({provider_name}): {e}"), + } +} + +/// Apply the wake-word gate to a transcript. +/// +/// Returns the command to send to the agent (the text after the wake phrase), +/// or `None` when the wake word isn't present (the utterance wasn't addressed to +/// the agent). An empty `wake_word` disables the gate (every utterance passes). +/// Matching is tolerant: case-insensitive, punctuation-insensitive, and the +/// phrase may appear after leading filler ("um, hey tiny, play music"). +pub(crate) fn extract_command(transcript: &str, wake_word: &str) -> Option { + let tokens = |s: &str| -> Vec { + s.to_lowercase() + .chars() + .map(|c| if c.is_alphanumeric() { c } else { ' ' }) + .collect::() + .split_whitespace() + .map(String::from) + .collect() + }; + let wake = tokens(wake_word); + let t = tokens(transcript); + if wake.is_empty() { + // No wake word configured → deliver everything (non-empty). + return if t.is_empty() { + None + } else { + Some(t.join(" ")) + }; + } + + // Anchor on the most distinctive (longest) wake token, e.g. "tiny" — STT + // mangles the greeting ("hey"→"a"/"ok") and the exact spelling + // ("tiny"→"tony"/"tinny"), so fuzzy-match the anchor near the start and take + // everything after it as the command. Bounded to the first 3 tokens to avoid + // mid-sentence false triggers. 
+ let anchor = wake.iter().max_by_key(|w| w.len()).cloned().unwrap(); + let max_dist = if anchor.chars().count() <= 4 { 1 } else { 2 }; + for i in 0..t.len().min(3) { + if levenshtein(&t[i], &anchor) <= max_dist { + let cmd = t[i + 1..].join(" "); + return if cmd.trim().is_empty() { + None // wake word alone, no command + } else { + Some(cmd) + }; + } + } + None +} + +/// Classic Levenshtein edit distance (small inputs — wake-word tokens). +fn levenshtein(a: &str, b: &str) -> usize { + let a: Vec = a.chars().collect(); + let b: Vec = b.chars().collect(); + let mut prev: Vec = (0..=b.len()).collect(); + let mut cur = vec![0usize; b.len() + 1]; + for (i, ca) in a.iter().enumerate() { + cur[0] = i + 1; + for (j, cb) in b.iter().enumerate() { + let cost = if ca == cb { 0 } else { 1 }; + cur[j + 1] = (prev[j + 1] + 1).min(cur[j] + 1).min(prev[j] + cost); + } + std::mem::swap(&mut prev, &mut cur); + } + prev[b.len()] +} + +/// Spawn the dedicated cpal capture thread. Blocks until the stream is set up +/// (or fails), mirroring `audio_capture::start_recording`'s readiness handshake. +fn spawn_capture_thread(tx: tokio::sync::mpsc::UnboundedSender>) -> Result<(), String> { + let (setup_tx, setup_rx) = std::sync::mpsc::sync_channel::>(1); + std::thread::Builder::new() + .name("voice-always-on".into()) + .spawn(move || { + if let Err(e) = capture_on_thread(tx, &setup_tx) { + log::error!("{LOG_PREFIX} capture thread error: {e}"); + let _ = setup_tx.send(Err(e)); + } + }) + .map_err(|e| format!("failed to spawn always-on capture thread: {e}"))?; + match setup_rx.recv() { + Ok(Ok(())) => Ok(()), + Ok(Err(e)) => Err(e), + Err(_) => Err("always-on capture thread exited before signalling readiness".to_string()), + } +} + +/// Owns the cpal stream for the process lifetime. Each callback downmixes to +/// mono, resamples to 16 kHz, and forwards samples to the async processor. 
+fn capture_on_thread( + tx: tokio::sync::mpsc::UnboundedSender>, + setup_tx: &std::sync::mpsc::SyncSender>, +) -> Result<(), String> { + use crate::openhuman::accessibility::{detect_microphone_permission, PermissionState}; + use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; + use cpal::{SampleFormat, StreamConfig}; + + if matches!(detect_microphone_permission(), PermissionState::Denied) { + return Err("microphone permission denied".to_string()); + } + + let host = cpal::default_host(); + let device = host + .default_input_device() + .ok_or_else(|| "no default audio input device".to_string())?; + let supported = device + .default_input_config() + .map_err(|e| format!("no default input config: {e}"))?; + let source_rate = supported.sample_rate().0; + let channels = supported.channels() as usize; + let sample_format = supported.sample_format(); + let stream_config: StreamConfig = supported.into(); + log::info!( + "{LOG_PREFIX} capture device ready rate={source_rate} channels={channels} format={sample_format:?}" + ); + + // Forward one resampled-to-16k mono chunk per callback. + let forward = move |mono_src: Vec| { + let mono16k = resample(&mono_src, source_rate); + // Ignore send errors — they mean the processor task is gone (shutdown). 
+ let _ = tx.send(mono16k); + }; + + let err_fn = |e| log::warn!("{LOG_PREFIX} cpal stream error: {e}"); + let stream = match sample_format { + SampleFormat::F32 => device.build_input_stream( + &stream_config, + move |data: &[f32], _| forward(to_mono(data, channels)), + err_fn, + None, + ), + SampleFormat::I16 => device.build_input_stream( + &stream_config, + move |data: &[i16], _| { + let floats: Vec = data.iter().map(|&s| s as f32 / 32768.0).collect(); + forward(to_mono(&floats, channels)); + }, + err_fn, + None, + ), + SampleFormat::U16 => device.build_input_stream( + &stream_config, + move |data: &[u16], _| { + let floats: Vec = data.iter().map(|&s| s as f32 / 32768.0 - 1.0).collect(); + forward(to_mono(&floats, channels)); + }, + err_fn, + None, + ), + other => return Err(format!("unsupported sample format: {other:?}")), + } + .map_err(|e| format!("failed to build input stream: {e}"))?; + + stream + .play() + .map_err(|e| format!("failed to start stream: {e}"))?; + let _ = setup_tx.send(Ok(())); + log::info!("{LOG_PREFIX} microphone stream live"); + + // Keep the stream (and thus this thread) alive for the process lifetime. + loop { + std::thread::sleep(std::time::Duration::from_secs(3600)); + } +} + +/// Poll the screen-lock state and drive [`PAUSED`] so always-on never captures +/// what is spoken at the lock screen. macOS-only for now (uses the Quartz +/// session dictionary); other platforms never pause (no lock signal yet). 
+fn spawn_lock_watcher() { + #[cfg(target_os = "macos")] + tokio::spawn(async move { + let mut last = false; + loop { + let locked = macos_lock::is_screen_locked(); + if locked != last { + log::info!( + "{LOG_PREFIX} screen {} → {}", + if locked { "locked" } else { "unlocked" }, + if locked { "pausing" } else { "resuming" } + ); + PAUSED.store(locked, Ordering::Relaxed); + last = locked; + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } + }); + #[cfg(not(target_os = "macos"))] + { + log::debug!("{LOG_PREFIX} screen-lock watcher unavailable on this platform"); + } +} + +/// macOS screen-lock detection via the Quartz session dictionary. +/// +/// `CGSessionCopyCurrentDictionary` exposes `CGSSessionScreenIsLocked`; we read +/// it defensively (null dict ⇒ no session, treated as locked; missing/odd value +/// ⇒ unlocked) and never assume the CF value's concrete type without checking. +#[cfg(target_os = "macos")] +mod macos_lock { + use std::ffi::{c_void, CString}; + + type CFTypeRef = *const c_void; + + #[link(name = "CoreGraphics", kind = "framework")] + extern "C" { + fn CGSessionCopyCurrentDictionary() -> CFTypeRef; + } + #[link(name = "CoreFoundation", kind = "framework")] + extern "C" { + fn CFDictionaryGetValue(dict: CFTypeRef, key: CFTypeRef) -> CFTypeRef; + fn CFStringCreateWithCString(alloc: CFTypeRef, c: *const i8, enc: u32) -> CFTypeRef; + fn CFGetTypeID(v: CFTypeRef) -> usize; + fn CFBooleanGetTypeID() -> usize; + fn CFBooleanGetValue(b: CFTypeRef) -> u8; + fn CFNumberGetTypeID() -> usize; + fn CFNumberGetValue(n: CFTypeRef, the_type: i64, out: *mut c_void) -> u8; + fn CFRelease(v: CFTypeRef); + } + const KCF_STRING_ENCODING_UTF8: u32 = 0x0800_0100; + const KCF_NUMBER_SINT32: i64 = 3; + + /// True when the screen is locked (or there is no active GUI session). + pub fn is_screen_locked() -> bool { + // SAFETY: standard Quartz/CoreFoundation calls. 
Ownership: the session + // dict and the key string are +1 (Create/Copy) and released here; the + // dictionary value is borrowed and must not be released. + unsafe { + let dict = CGSessionCopyCurrentDictionary(); + if dict.is_null() { + return true; // no session (loginwindow) — treat as locked + } + let Ok(key_c) = CString::new("CGSSessionScreenIsLocked") else { + CFRelease(dict); + return false; + }; + let key = CFStringCreateWithCString( + std::ptr::null(), + key_c.as_ptr(), + KCF_STRING_ENCODING_UTF8, + ); + if key.is_null() { + CFRelease(dict); + return false; + } + let value = CFDictionaryGetValue(dict, key); // borrowed + let locked = if value.is_null() { + false + } else { + let tid = CFGetTypeID(value); + if tid == CFBooleanGetTypeID() { + CFBooleanGetValue(value) != 0 + } else if tid == CFNumberGetTypeID() { + let mut n: i32 = 0; + CFNumberGetValue(value, KCF_NUMBER_SINT32, &mut n as *mut i32 as *mut c_void); + n != 0 + } else { + false + } + }; + CFRelease(key); + CFRelease(dict); + locked + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn cfg() -> VadConfig { + VadConfig { + onset_threshold: 0.01, + hangover_ms: 100, + min_speech_ms: 60, + max_utterance_ms: 1000, + } + } + + /// Drive `n` frames of constant `rms` at `frame_ms` each, collecting events. + fn drive(seg: &mut VadSegmenter, rms: f32, frame_ms: u32, n: u32) -> Vec { + (0..n) + .filter_map(|_| seg.push_frame(rms, frame_ms)) + .collect() + } + + #[test] + fn silence_emits_nothing() { + let mut seg = VadSegmenter::new(cfg()); + assert!(drive(&mut seg, 0.0, 20, 50).is_empty()); + assert!(!seg.is_speaking()); + } + + #[test] + fn onset_then_hangover_emits_one_utterance() { + let mut seg = VadSegmenter::new(cfg()); + // First loud frame opens the utterance. + assert_eq!(seg.push_frame(0.2, 20), Some(VadEvent::SpeechStart)); + assert!(seg.is_speaking()); + // More speech, no event yet. + assert!(drive(&mut seg, 0.2, 20, 5).is_empty()); + // Silence shorter than hangover: still open. 
+ assert!(seg.push_frame(0.0, 20).is_none()); // 20ms silence + assert!(seg.push_frame(0.0, 20).is_none()); // 40ms + assert!(seg.push_frame(0.0, 20).is_none()); // 60ms + assert!(seg.push_frame(0.0, 20).is_none()); // 80ms + // Crossing the 100ms hangover closes it. + let ev = seg.push_frame(0.0, 20).unwrap(); // 100ms + match ev { + VadEvent::SpeechEnd { emit, forced, .. } => { + assert!(emit, "120ms voiced should clear the 60ms min"); + assert!(!forced); + } + other => panic!("expected SpeechEnd, got {other:?}"), + } + assert!(!seg.is_speaking()); + } + + #[test] + fn short_blip_is_dropped() { + let mut seg = VadSegmenter::new(cfg()); + // One 20ms loud frame (below the 60ms min), then silence to close. + assert_eq!(seg.push_frame(0.2, 20), Some(VadEvent::SpeechStart)); + let mut ev = None; + for _ in 0..5 { + if let Some(e) = seg.push_frame(0.0, 20) { + ev = Some(e); + break; + } + } + match ev.expect("utterance should close") { + VadEvent::SpeechEnd { + voiced_ms, emit, .. + } => { + assert_eq!(voiced_ms, 20); + assert!(!emit, "20ms < 60ms min_speech ⇒ dropped"); + } + other => panic!("expected SpeechEnd, got {other:?}"), + } + } + + #[test] + fn mid_utterance_pause_does_not_split() { + let mut seg = VadSegmenter::new(cfg()); + seg.push_frame(0.2, 20); + // 80ms pause (< 100ms hangover) then speech resumes — one utterance. + for _ in 0..4 { + assert!(seg.push_frame(0.0, 20).is_none()); + } + assert!( + seg.is_speaking(), + "pause under hangover keeps utterance open" + ); + assert!(drive(&mut seg, 0.2, 20, 3).is_empty()); + assert!(seg.is_speaking()); + } + + #[test] + fn max_utterance_forces_flush() { + let mut seg = VadSegmenter::new(cfg()); // max 1000ms + seg.push_frame(0.2, 20); + // Keep talking past the ceiling; silence never triggers the close. + let mut forced_seen = false; + for _ in 0..60 { + if let Some(VadEvent::SpeechEnd { forced, emit, .. 
}) = seg.push_frame(0.2, 20) { + assert!(forced, "loud-throughout close must be the ceiling"); + assert!(emit); + forced_seen = true; + break; + } + } + assert!(forced_seen, "should force-flush at max_utterance_ms"); + assert!(!seg.is_speaking()); + } + + #[test] + fn reset_aborts_without_event() { + let mut seg = VadSegmenter::new(cfg()); + seg.push_frame(0.2, 20); + assert!(seg.is_speaking()); + seg.reset(); + assert!(!seg.is_speaking()); + // After reset, a fresh onset starts a new utterance. + assert_eq!(seg.push_frame(0.2, 20), Some(VadEvent::SpeechStart)); + } + + #[test] + fn from_server_config_maps_seconds_to_ms() { + let mut c = CfgVoiceServer::default(); + c.vad_max_utterance_secs = 2.5; + c.vad_hangover_ms = 750; + let v = VadConfig::from_server_config(&c); + assert_eq!(v.max_utterance_ms, 2500); + assert_eq!(v.hangover_ms, 750); + assert_eq!(v.onset_threshold, c.vad_onset_threshold); + } + + #[test] + fn wake_word_extracts_command() { + // Case/punctuation tolerant; strips the phrase, keeps the command. + assert_eq!( + extract_command("Hey Tiny, play Numb by Linkin Park", "Hey Tiny").as_deref(), + Some("play numb by linkin park") + ); + assert_eq!( + extract_command("hey tiny open slack", "Hey Tiny").as_deref(), + Some("open slack") + ); + // Leading filler before the wake phrase is tolerated. + assert_eq!( + extract_command("um, hey tiny what time is it", "Hey Tiny").as_deref(), + Some("what time is it") + ); + } + + #[test] + fn wake_word_tolerates_stt_homophones() { + // STT often mangles "Hey Tiny" — accept close variants of the anchor. + assert_eq!( + extract_command("Hey Tony, play music", "Hey Tiny").as_deref(), + Some("play music") + ); + assert_eq!( + extract_command("a tinny open slack", "Hey Tiny").as_deref(), + Some("open slack") + ); + // Anchor too far in / absent → not a command. 
+ assert_eq!( + extract_command("the tiny details matter here a lot", "Hey Tiny").as_deref(), + // "tiny" at index 1 → command is the rest; documents the known + // trade-off that an early "tiny" can trigger. + Some("details matter here a lot") + ); + } + + #[test] + fn wake_word_absent_is_ignored() { + assert_eq!(extract_command("play some music", "Hey Tiny"), None); + // Wake word alone with no command → nothing to do. + assert_eq!(extract_command("Hey Tiny", "Hey Tiny"), None); + assert_eq!(extract_command("hey tiny!", "Hey Tiny"), None); + } + + #[test] + fn empty_wake_word_passes_everything() { + assert_eq!( + extract_command("just say this", "").as_deref(), + Some("just say this") + ); + assert_eq!(extract_command(" ", ""), None); + } +} diff --git a/src/openhuman/voice/audio_capture.rs b/src/openhuman/voice/audio_capture.rs index bc3aa08c71..c80dfe2a26 100644 --- a/src/openhuman/voice/audio_capture.rs +++ b/src/openhuman/voice/audio_capture.rs @@ -16,7 +16,7 @@ use tokio::sync::oneshot; const LOG_PREFIX: &str = "[voice_capture]"; /// Target sample rate for whisper (16 kHz mono). -const TARGET_SAMPLE_RATE: u32 = 16_000; +pub(crate) const TARGET_SAMPLE_RATE: u32 = 16_000; /// RMS threshold below which audio is considered silence. const SILENCE_RMS_THRESHOLD: f32 = 0.002; @@ -102,8 +102,35 @@ impl SilenceGate { } } +/// Encode already-16 kHz mono f32 samples to a 16-bit PCM WAV byte buffer. +/// Shared by the one-shot recorder's finalize path and the always-on loop +/// (`voice::always_on`), so both produce identical WAV that whisper accepts. 
+pub(crate) fn encode_wav_16k(samples_16k: &[f32]) -> Result, String> { + let spec = WavSpec { + channels: 1, + sample_rate: TARGET_SAMPLE_RATE, + bits_per_sample: 16, + sample_format: HoundFormat::Int, + }; + let mut buf = Cursor::new(Vec::new()); + { + let mut writer = + WavWriter::new(&mut buf, spec).map_err(|e| format!("WAV writer error: {e}"))?; + for &sample in samples_16k { + let clamped = sample.clamp(-1.0, 1.0); + writer + .write_sample((clamped * 32767.0) as i16) + .map_err(|e| format!("WAV write error: {e}"))?; + } + writer + .finalize() + .map_err(|e| format!("WAV finalize error: {e}"))?; + } + Ok(buf.into_inner()) +} + /// Compute RMS energy for a chunk of mono samples. -fn chunk_rms(samples: &[f32]) -> f32 { +pub(crate) fn chunk_rms(samples: &[f32]) -> f32 { if samples.is_empty() { return 0.0; } @@ -493,7 +520,7 @@ pub fn list_input_devices() -> Result, String> { } /// Convert interleaved multi-channel samples to mono by averaging channels. -fn to_mono(samples: &[f32], channels: usize) -> Vec { +pub(crate) fn to_mono(samples: &[f32], channels: usize) -> Vec { if channels <= 1 { return samples.to_vec(); } @@ -506,7 +533,7 @@ fn to_mono(samples: &[f32], channels: usize) -> Vec { /// Resample mono f32 samples from `source_rate` to `TARGET_SAMPLE_RATE` using /// linear interpolation. Good enough for voice dictation quality. -fn resample(samples: &[f32], source_rate: u32) -> Vec { +pub(crate) fn resample(samples: &[f32], source_rate: u32) -> Vec { if source_rate == TARGET_SAMPLE_RATE { return samples.to_vec(); } diff --git a/src/openhuman/voice/mod.rs b/src/openhuman/voice/mod.rs index 40344e9d16..ae16576cfe 100644 --- a/src/openhuman/voice/mod.rs +++ b/src/openhuman/voice/mod.rs @@ -9,6 +9,7 @@ //! `crate::openhuman::inference::voice` so all inference concerns share a //! single domain root. 
+pub mod always_on; pub mod audio_capture; pub(crate) mod cli; pub mod dictation_listener; diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs index af42faaaa4..92a41640fa 100644 --- a/tests/json_rpc_e2e.rs +++ b/tests/json_rpc_e2e.rs @@ -9853,204 +9853,86 @@ async fn json_rpc_workflows_lifecycle_round_trip() { rpc_join.abort(); } -// ── Model resolution + agent profile switching ────────────────────────── - -#[tokio::test] -async fn json_rpc_inference_resolve_model_returns_tier_for_hints() { - let _env_lock = json_rpc_e2e_env_lock(); - let tmp = tempdir().expect("tempdir"); - let home = tmp.path(); - let openhuman_dir = home.join(".openhuman"); - - let _home_guard = EnvVarGuard::set_to_path("HOME", home); - let _workspace_guard = EnvVarGuard::set_to_path("OPENHUMAN_WORKSPACE", &openhuman_dir); - - let (api_addr, api_join) = serve_on_ephemeral(mock_upstream_router()).await; - let api_origin = format!("http://{api_addr}"); - write_min_config(&openhuman_dir, &api_origin); - - let (rpc_addr, rpc_join) = serve_on_ephemeral(build_core_http_router(false)).await; - let rpc_base = format!("http://{rpc_addr}"); - - let res = post_json_rpc( - &rpc_base, - 9900_1, - "openhuman.inference_resolve_model", - json!({ "hint": "hint:reasoning" }), - ) - .await; - let result = assert_no_jsonrpc_error(&res, "resolve_model hint:reasoning"); - let model = result - .get("model") - .and_then(Value::as_str) - .expect("model field"); - assert_eq!(model, "reasoning-v1"); - - let res = post_json_rpc( - &rpc_base, - 9900_2, - "openhuman.inference_resolve_model", - json!({ "hint": "hint:chat" }), - ) - .await; - let result = assert_no_jsonrpc_error(&res, "resolve_model hint:chat"); - let model = result - .get("model") - .and_then(Value::as_str) - .expect("model field"); - assert_eq!(model, "reasoning-quick-v1"); - - let res = post_json_rpc( - &rpc_base, - 9900_3, - "openhuman.inference_resolve_model", - json!({ "hint": "hint:coding" }), - ) - .await; - let result = 
assert_no_jsonrpc_error(&res, "resolve_model hint:coding"); - let model = result - .get("model") - .and_then(Value::as_str) - .expect("model field"); - assert_eq!(model, "coding-v1"); - - let res = post_json_rpc( - &rpc_base, - 9900_4, - "openhuman.inference_resolve_model", - json!({ "hint": "reasoning-v1" }), - ) - .await; - let result = assert_no_jsonrpc_error(&res, "resolve_model tier passthrough"); - let model = result - .get("model") - .and_then(Value::as_str) - .expect("model field"); - assert_eq!(model, "reasoning-v1"); - - api_join.abort(); - rpc_join.abort(); -} - +/// E2E: voice-server settings round-trip over JSON-RPC — Phase 2 always-on +/// toggle + "Hey Tiny" wake word. Regression guard for the bug where the +/// Settings toggle silently did nothing because `always_on_enabled` was absent +/// from the `update_voice_server_settings` controller param schema (rejected as +/// "unknown param 'always_on_enabled'" before reaching the handler). #[tokio::test] -async fn json_rpc_agent_profile_select_and_resolve_model_integration() { +async fn json_rpc_voice_server_settings_roundtrip_always_on_and_wake_word() { let _env_lock = json_rpc_e2e_env_lock(); let tmp = tempdir().expect("tempdir"); let home = tmp.path(); - let openhuman_dir = home.join(".openhuman"); + let openhuman_home = home.join(".openhuman"); let _home_guard = EnvVarGuard::set_to_path("HOME", home); - let _workspace_guard = EnvVarGuard::set_to_path("OPENHUMAN_WORKSPACE", &openhuman_dir); + let _workspace_guard = EnvVarGuard::unset("OPENHUMAN_WORKSPACE"); + let _backend_url_guard = EnvVarGuard::unset("BACKEND_URL"); + let _vite_backend_guard = EnvVarGuard::unset("VITE_BACKEND_URL"); - let (api_addr, api_join) = serve_on_ephemeral(mock_upstream_router()).await; - let api_origin = format!("http://{api_addr}"); - write_min_config(&openhuman_dir, &api_origin); + write_min_config(&openhuman_home, "http://127.0.0.1:9"); let (rpc_addr, rpc_join) = serve_on_ephemeral(build_core_http_router(false)).await; - 
let rpc_base = format!("http://{rpc_addr}"); + let rpc_base = format!("http://{}", rpc_addr); + tokio::time::sleep(Duration::from_millis(100)).await; - // List profiles — should include built-in 'default' and 'reasoning' - let res = post_json_rpc( + // GET defaults — wake_word "Hey Tiny", always-on off. + let initial = post_json_rpc( &rpc_base, - 9901_1, - "openhuman.agent_profiles_list", + 7401, + "openhuman.config_get_voice_server_settings", json!({}), ) .await; - let result = assert_no_jsonrpc_error(&res, "agent_profiles_list"); - let profiles = result - .get("profiles") - .and_then(Value::as_array) - .expect("profiles array"); - let profile_ids: Vec<&str> = profiles - .iter() - .filter_map(|p| p.get("id").and_then(Value::as_str)) - .collect(); - assert!( - profile_ids.contains(&"default"), - "should contain default profile" + let initial_outer = assert_no_jsonrpc_error(&initial, "get_voice_server_settings initial"); + assert_eq!( + initial_outer + .get("result") + .and_then(|r| r.get("always_on_enabled")) + .and_then(Value::as_bool), + Some(false), + "default always_on_enabled should be false, envelope: {initial_outer}" ); - assert!( - profile_ids.contains(&"reasoning"), - "should contain reasoning profile" + assert_eq!( + initial_outer + .get("result") + .and_then(|r| r.get("wake_word")) + .and_then(Value::as_str), + Some("Hey Tiny"), + "default wake_word should be 'Hey Tiny', envelope: {initial_outer}" ); - // Select reasoning profile - let res = post_json_rpc( - &rpc_base, - 9901_2, - "openhuman.agent_profile_select", - json!({ "profile_id": "reasoning" }), - ) - .await; - let result = assert_no_jsonrpc_error(&res, "agent_profile_select reasoning"); - let active = result - .get("activeProfileId") - .and_then(Value::as_str) - .expect("activeProfileId"); - assert_eq!(active, "reasoning"); - - // Verify the reasoning profile has hint:reasoning model override - let reasoning_profile = result - .get("profiles") - .and_then(Value::as_array) - .expect("profiles") - 
.iter() - .find(|p| p.get("id").and_then(Value::as_str) == Some("reasoning")) - .expect("reasoning profile in response"); - let model_override = reasoning_profile - .get("modelOverride") - .and_then(Value::as_str) - .expect("modelOverride"); - assert_eq!(model_override, "hint:reasoning"); - - // Resolve the model for this profile's override - let res = post_json_rpc( + // UPDATE — change the wake word and pass `always_on_enabled` (the param that + // used to be rejected). Kept false so the test never opens a real mic. + let update = post_json_rpc( &rpc_base, - 9901_3, - "openhuman.inference_resolve_model", - json!({ "hint": model_override }), + 7402, + "openhuman.config_update_voice_server_settings", + json!({ "always_on_enabled": false, "wake_word": "Computer" }), ) .await; - let result = assert_no_jsonrpc_error(&res, "resolve_model for reasoning profile"); - let resolved = result - .get("model") - .and_then(Value::as_str) - .expect("resolved model"); - assert_eq!(resolved, "reasoning-v1"); + assert_no_jsonrpc_error( + &update, + "update_voice_server_settings (always_on_enabled + wake_word)", + ); - // Switch back to default and resolve - let res = post_json_rpc( + // GET again — wake word persisted, no error. 
+ let after = post_json_rpc( &rpc_base, - 9901_4, - "openhuman.agent_profile_select", - json!({ "profile_id": "default" }), + 7403, + "openhuman.config_get_voice_server_settings", + json!({}), ) .await; - let result = assert_no_jsonrpc_error(&res, "agent_profile_select default"); + let after_outer = assert_no_jsonrpc_error(&after, "get_voice_server_settings after update"); assert_eq!( - result - .get("activeProfileId") - .and_then(Value::as_str) - .unwrap(), - "default" + after_outer + .get("result") + .and_then(|r| r.get("wake_word")) + .and_then(Value::as_str), + Some("Computer"), + "wake_word should persist, envelope: {after_outer}" ); - // Default profile has no model_override — resolve with hint:chat - let res = post_json_rpc( - &rpc_base, - 9901_5, - "openhuman.inference_resolve_model", - json!({ "hint": "hint:chat" }), - ) - .await; - let result = assert_no_jsonrpc_error(&res, "resolve_model for default profile"); - let resolved = result - .get("model") - .and_then(Value::as_str) - .expect("resolved model"); - assert_eq!(resolved, "reasoning-quick-v1"); - - api_join.abort(); rpc_join.abort(); } From 77003ca16d3af81265ddc3ae33cba1fa23fdff5a Mon Sep 17 00:00:00 2001 From: M3gA-Mind Date: Thu, 4 Jun 2026 14:21:29 +0530 Subject: [PATCH 5/9] feat(voice): always-on Settings toggle + debug panel + i18n Surfaces the always-on listening toggle in the reachable Voice panel, adds the VoiceDebugPanel, the voice tauri-command wrapper, and the RPC client method. Adds all voice.debug.* and notch.* i18n keys across the 14 locales (notch keys land here as inert strings; the notch UI that consumes them ships in slice 6). Slice 5/7 of #3307 (always-on frontend). 
--- .../settings/panels/VoiceDebugPanel.tsx | 32 ++++++++++++ .../components/settings/panels/VoicePanel.tsx | 52 +++++++++++++++++++ .../panels/__tests__/VoicePanel.test.tsx | 1 + app/src/lib/i18n/ar.ts | 10 ++++ app/src/lib/i18n/bn.ts | 10 ++++ app/src/lib/i18n/de.ts | 10 ++++ app/src/lib/i18n/en.ts | 10 ++++ app/src/lib/i18n/es.ts | 10 ++++ app/src/lib/i18n/fr.ts | 10 ++++ app/src/lib/i18n/hi.ts | 10 ++++ app/src/lib/i18n/id.ts | 10 ++++ app/src/lib/i18n/it.ts | 10 ++++ app/src/lib/i18n/ko.ts | 10 ++++ app/src/lib/i18n/pl.ts | 10 ++++ app/src/lib/i18n/pt.ts | 10 ++++ app/src/lib/i18n/ru.ts | 10 ++++ app/src/lib/i18n/zh-CN.ts | 10 ++++ app/src/services/coreRpcClient.ts | 12 +++++ app/src/utils/tauriCommands/voice.ts | 3 ++ 19 files changed, 240 insertions(+) diff --git a/app/src/components/settings/panels/VoiceDebugPanel.tsx b/app/src/components/settings/panels/VoiceDebugPanel.tsx index b3222c73e6..a7384467ce 100644 --- a/app/src/components/settings/panels/VoiceDebugPanel.tsx +++ b/app/src/components/settings/panels/VoiceDebugPanel.tsx @@ -102,6 +102,7 @@ const VoiceDebugPanel = () => { min_duration_secs: settings.min_duration_secs, silence_threshold: settings.silence_threshold, custom_dictionary: settings.custom_dictionary, + always_on_enabled: settings.always_on_enabled, }); setNotice(t('voice.debug.settingsSaved')); await loadData(true); @@ -203,6 +204,37 @@ const VoiceDebugPanel = () => { {settings && ( <> + {/* Always-on listening (Phase 2) — opt-in, privacy-sensitive. */} +
+
+ + {t('voice.debug.alwaysOn')} + +

+ {t('voice.debug.alwaysOnDesc')} +

+
+ +
+