From bf8bc2d04bc94c04f4eb1dc28c02ff6a6ada3565 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 14:09:58 +0530
Subject: [PATCH 1/9] feat(computer): main-thread synthetic-input executor +
 CEF crash fix

Run enigo keyboard/mouse on the app main thread via a native-registry
executor; enigo's macOS TSMGetInputSourceProperty traps off-thread and
crashes the CEF host. Routes the existing mouse/keyboard tools through
the new main_thread bridge, and downscales oversized screenshots so the
model can still see them.

Slice 1/7 of #3307 (was the 'computer control' area).
---
 app/src-tauri/src/lib.rs                      |  33 +++
 .../tools/impl/browser/screenshot.rs          | 200 +++++++++++-----
 src/openhuman/tools/impl/computer/keyboard.rs | 148 ++++++------
 .../tools/impl/computer/main_thread.rs        |  49 ++++
 src/openhuman/tools/impl/computer/mod.rs      |   2 +
 src/openhuman/tools/impl/computer/mouse.rs    | 221 +++++++++---------
 6 files changed, 406 insertions(+), 247 deletions(-)
 create mode 100644 src/openhuman/tools/impl/computer/main_thread.rs

diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs
index 41c01934a5..7df029b44b 100644
--- a/app/src-tauri/src/lib.rs
+++ b/app/src-tauri/src/lib.rs
@@ -2800,6 +2800,39 @@ pub fn run() {
             //       let _ = window.show();
             //   }
 
+            // Synthetic-input main-thread executor. enigo's macOS keyboard-layout
+            // lookup (TSMGetInputSourceProperty) MUST run on the app main thread
+            // or it traps (`_dispatch_assert_queue_fail`/EXC_BREAKPOINT) and
+            // crashes the CEF host (Change 1.15, confirmed via crash report). The
+            // keyboard/mouse tools run on tokio workers, so they dispatch their
+            // enigo ops here via the native registry; we run each on the real
+            // main thread through `run_on_main_thread`.
+            {
+                use openhuman_core::core::event_bus::register_native_global;
+                use openhuman_core::openhuman::tools::{
+                    MainThreadInputOp, INPUT_ON_MAIN_THREAD_METHOD,
+                };
+                let input_app = app.handle().clone();
+                register_native_global::<MainThreadInputOp, Result<String, String>, _, _>(
+                    INPUT_ON_MAIN_THREAD_METHOD,
+                    move |req| {
+                        let input_app = input_app.clone();
+                        async move {
+                            let (tx, rx) = tokio::sync::oneshot::channel();
+                            let run = req.run;
+                            input_app
+                                .run_on_main_thread(move || {
+                                    let _ = tx.send((run)());
+                                })
+                                .map_err(|e| format!("run_on_main_thread dispatch failed: {e}"))?;
+                            rx.await
+                                .map_err(|_| "main-thread input op was cancelled".to_string())
+                        }
+                    },
+                );
+                log::info!("[computer] registered main-thread synthetic-input executor");
+            }
+
             // Tray icon setup moved to RunEvent::Ready (see below) — GTK is only
             // initialized after the event loop starts, so we must delay tray creation
             // until the Ready event fires. Creating the tray here would panic on
diff --git a/src/openhuman/tools/impl/browser/screenshot.rs b/src/openhuman/tools/impl/browser/screenshot.rs
index 1a247830e5..7d1e9c69b7 100644
--- a/src/openhuman/tools/impl/browser/screenshot.rs
+++ b/src/openhuman/tools/impl/browser/screenshot.rs
@@ -9,8 +9,6 @@ use std::time::Duration;
 
 /// Maximum time to wait for a screenshot command to complete.
 const SCREENSHOT_TIMEOUT_SECS: u64 = 15;
-/// Maximum base64 payload size to return (2 MB of base64 ≈ 1.5 MB image).
-const MAX_BASE64_BYTES: usize = 2_097_152;
 
 /// Tool for capturing screenshots using platform-native commands.
 ///
@@ -132,61 +130,101 @@ impl ScreenshotTool {
         }
     }
 
-    /// Read the screenshot file and return base64-encoded result.
+    /// Read the screenshot file and return a base64 data-URL the model can see.
+    ///
+    /// Captures within the inline budget are returned verbatim. Full-screen
+    /// Retina captures are multi-MB PNGs that blow that budget; rather than
+    /// dropping the image (which leaves vision-driven control blind), they are
+    /// downscaled to a JPEG and the *shown* dimensions are reported so callers
+    /// know the coordinate space. If downscaling fails, only the path is returned.
     async fn read_and_encode(output_path: &std::path::Path) -> anyhow::Result<ToolResult> {
-        // Check file size before reading to prevent OOM on large screenshots
-        const MAX_RAW_BYTES: u64 = 1_572_864; // ~1.5 MB (base64 expands ~33%)
-        if let Ok(meta) = tokio::fs::metadata(output_path).await {
-            if meta.len() > MAX_RAW_BYTES {
-                return Ok(ToolResult::success(format!(
-                    "Screenshot saved to: {}\nSize: {} bytes (too large to base64-encode inline)",
-                    output_path.display(),
-                    meta.len(),
-                )));
+        // ~1.5 MB raw → ~2 MB base64, a safe inline payload size.
+        const MAX_RAW_BYTES: usize = 1_572_864;
+
+        let bytes = match tokio::fs::read(output_path).await {
+            Ok(b) => b,
+            Err(e) => {
+                return Ok(ToolResult::error(format!(
+                    "Failed to read screenshot file: {e}"
+                )))
             }
+        };
+        let ext = output_path
+            .extension()
+            .and_then(|e| e.to_str())
+            .unwrap_or("png")
+            .to_lowercase();
+
+        // Fits as-is → return verbatim.
+        if bytes.len() <= MAX_RAW_BYTES {
+            let mime = match ext.as_str() {
+                "jpg" | "jpeg" => "image/jpeg",
+                "bmp" => "image/bmp",
+                "gif" => "image/gif",
+                "webp" => "image/webp",
+                _ => "image/png",
+            };
+            return Ok(Self::data_url_result(output_path, &bytes, mime, None));
         }
 
-        match tokio::fs::read(output_path).await {
-            Ok(bytes) => {
-                use base64::Engine;
-                let size = bytes.len();
-                let mut encoded = base64::engine::general_purpose::STANDARD.encode(&bytes);
-                let truncated = if encoded.len() > MAX_BASE64_BYTES {
-                    encoded.truncate(crate::openhuman::util::floor_char_boundary(
-                        &encoded,
-                        MAX_BASE64_BYTES,
-                    ));
-                    true
-                } else {
-                    false
-                };
-
-                let mut output_msg = format!(
-                    "Screenshot saved to: {}\nSize: {size} bytes\nBase64 length: {}",
-                    output_path.display(),
-                    encoded.len(),
-                );
-                if truncated {
-                    output_msg.push_str(" (truncated)");
-                }
-                let mime = match output_path.extension().and_then(|e| e.to_str()) {
-                    Some("jpg" | "jpeg") => "image/jpeg",
-                    Some("bmp") => "image/bmp",
-                    Some("gif") => "image/gif",
-                    Some("webp") => "image/webp",
-                    _ => "image/png",
-                };
-                let _ = write!(output_msg, "\ndata:{mime};base64,{encoded}");
-
-                Ok(ToolResult::success(output_msg))
-            }
-            Err(e) => Ok(ToolResult::error(format!(
-                "Failed to read screenshot file: {e}"
+        // Too large → downscale to a JPEG that fits (CPU work off the runtime).
+        match tokio::task::spawn_blocking(move || downscale_to_jpeg(&bytes, MAX_RAW_BYTES)).await {
+            Ok(Ok((jpeg, w, h))) => Ok(Self::data_url_result(
+                output_path,
+                &jpeg,
+                "image/jpeg",
+                Some((w, h)),
+            )),
+            Ok(Err(e)) => Ok(ToolResult::success(format!(
+                "Screenshot saved to: {} (could not downscale for inline view: {e})",
+                output_path.display()
             ))),
+            Err(e) => Ok(ToolResult::error(format!("downscale task failed: {e}"))),
+        }
+    }
+
+    /// Build a success result carrying a base64 data-URL of `data`.
+    fn data_url_result(
+        output_path: &std::path::Path,
+        data: &[u8],
+        mime: &str,
+        shown_dims: Option<(u32, u32)>,
+    ) -> ToolResult {
+        use base64::Engine;
+        let encoded = base64::engine::general_purpose::STANDARD.encode(data);
+        let mut msg = format!("Screenshot saved to: {}\n", output_path.display());
+        if let Some((w, h)) = shown_dims {
+            let _ = write!(
+                msg,
+                "Downscaled to {w}x{h}px for inline view (coordinates you read are in this {w}x{h} space).\n"
+            );
         }
+        let _ = write!(msg, "data:{mime};base64,{encoded}");
+        ToolResult::success(msg)
     }
 }
 
+/// Decode image bytes, downscale (preserving aspect ratio), and JPEG-encode so
+/// the result is ≤ `max_bytes`. Returns `(jpeg_bytes, width, height)`, or an
+/// error when decoding/encoding fails or even the smallest attempt is too big.
+fn downscale_to_jpeg(bytes: &[u8], max_bytes: usize) -> Result<(Vec<u8>, u32, u32), String> {
+    let img = image::load_from_memory(bytes).map_err(|e| format!("decode: {e}"))?;
+    for max_dim in [1568u32, 1280, 1024, 768, 600] {
+        let thumb = img.thumbnail(max_dim, max_dim); // fits within max_dim², keeps aspect
+        let mut buf = std::io::Cursor::new(Vec::new());
+        image::codecs::jpeg::JpegEncoder::new_with_quality(&mut buf, 72)
+            .encode_image(&thumb)
+            .map_err(|e| format!("jpeg encode: {e}"))?;
+        let out = buf.into_inner();
+        if out.len() <= max_bytes {
+            return Ok((out, thumb.width(), thumb.height()));
+        }
+    }
+    // Never hand back an over-budget image: the caller falls back to reporting
+    // the saved path when this returns `Err`.
+    Err("could not produce a fitting JPEG".to_string())
+}
+
 #[async_trait]
 impl Tool for ScreenshotTool {
     fn name(&self) -> &str {
@@ -228,6 +266,36 @@ mod tests {
     use super::*;
     use crate::openhuman::security::{AutonomyLevel, SecurityPolicy};
 
+    #[test]
+    fn downscale_to_jpeg_shrinks_oversized_capture() {
+        // A 1600x1200 PNG of noise is well over a tight budget; downscaling must
+        // produce a smaller JPEG that still decodes, so the model can see it.
+        let mut img = image::RgbImage::new(1600, 1200);
+        for (i, px) in img.pixels_mut().enumerate() {
+            *px = image::Rgb([(i % 251) as u8, (i % 253) as u8, (i % 247) as u8]);
+        }
+        let mut png = std::io::Cursor::new(Vec::new());
+        image::DynamicImage::ImageRgb8(img)
+            .write_to(&mut png, image::ImageFormat::Png)
+            .expect("encode png");
+        let png = png.into_inner();
+
+        let max = 400_000usize;
+        let (jpeg, w, h) = downscale_to_jpeg(&png, max).expect("downscale");
+        assert!(jpeg.len() <= max, "jpeg {} should be <= {max}", jpeg.len());
+        assert!(
+            w <= 1568 && h <= 1568,
+            "dims {w}x{h} should be capped to 1568"
+        );
+        assert!(
+            jpeg.len() < png.len(),
+            "jpeg should be smaller than source png"
+        );
+        // Result must be a valid, decodable image at the reported dims.
+        let decoded = image::load_from_memory(&jpeg).expect("jpeg decodes");
+        assert_eq!((decoded.width(), decoded.height()), (w, h));
+    }
+
     fn test_security() -> Arc<SecurityPolicy> {
         Arc::new(SecurityPolicy {
             autonomy: AutonomyLevel::Full,
@@ -439,24 +507,38 @@ mod tests {
     // ── read_and_encode: large file returns saved-path-only message ───────────
 
     #[tokio::test]
-    async fn read_and_encode_large_file_skips_base64() {
-        use tokio::io::AsyncWriteExt;
+    async fn read_and_encode_large_file_downscales_to_viewable_jpeg() {
+        // A large *real* PNG (over MAX_RAW_BYTES) must be downscaled to an inline
+        // JPEG data-URL the model can see — not dropped (the old behavior left
+        // vision-driven control blind).
         let dir = tempfile::TempDir::new().unwrap();
         let path = dir.path().join("big.png");
-        let mut f = tokio::fs::File::create(&path).await.unwrap();
-        // Write ~1.6 MB to exceed the MAX_RAW_BYTES threshold (1.5 MB)
-        let chunk = vec![0u8; 1024];
-        for _ in 0..1600 {
-            f.write_all(&chunk).await.unwrap();
+        let mut img = image::RgbImage::new(2200, 1500);
+        for (i, px) in img.pixels_mut().enumerate() {
+            *px = image::Rgb([(i % 251) as u8, (i % 253) as u8, (i % 247) as u8]);
         }
-        drop(f);
+        image::DynamicImage::ImageRgb8(img)
+            .save_with_format(&path, image::ImageFormat::Png)
+            .unwrap();
+        assert!(
+            tokio::fs::metadata(&path).await.unwrap().len() > 1_572_864,
+            "test PNG should exceed the inline budget"
+        );
 
         let result = ScreenshotTool::read_and_encode(&path).await.unwrap();
-        assert!(!result.is_error, "large file should not be an error result");
         assert!(
-            result.output().contains("too large to base64-encode"),
-            "large file should skip base64, got: {}",
+            !result.is_error,
+            "should not error, got: {}",
             result.output()
         );
+        let out = result.output();
+        assert!(
+            out.contains("data:image/jpeg;base64,"),
+            "should inline a jpeg: {out}"
+        );
+        assert!(
+            out.contains("Downscaled to"),
+            "should report downscale: {out}"
+        );
     }
 }
diff --git a/src/openhuman/tools/impl/computer/keyboard.rs b/src/openhuman/tools/impl/computer/keyboard.rs
index 9bbfecb704..9e9a9a629e 100644
--- a/src/openhuman/tools/impl/computer/keyboard.rs
+++ b/src/openhuman/tools/impl/computer/keyboard.rs
@@ -4,6 +4,7 @@
 //! via platform-native APIs (Core Graphics on macOS, SendInput on Windows,
 //! X11/libxdo on Linux).
 
+use super::main_thread::run_input_on_main;
 use crate::openhuman::security::SecurityPolicy;
 use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolResult};
 use async_trait::async_trait;
@@ -186,21 +187,18 @@ impl Tool for KeyboardTool {
                 }
 
                 let len = text.len();
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-                    enigo
-                        .text(&text)
-                        .map_err(|e| anyhow::anyhow!("text typing failed: {e}"))?;
-                    info!(
-                        tool = "keyboard",
-                        action = "type",
-                        chars = len,
-                        "[computer] typed text"
-                    );
-                    Ok(ToolResult::success(format!("Typed {len} characters")))
-                })
-                .await?
+                into_result(
+                    "type",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+                        enigo
+                            .text(&text)
+                            .map_err(|e| format!("text typing failed: {e}"))?;
+                        Ok(format!("Typed {len} characters"))
+                    })
+                    .await,
+                )
             }
 
             "press" => {
@@ -214,21 +212,18 @@ impl Tool for KeyboardTool {
                     anyhow::anyhow!("Unknown key '{key_name}'. Use names like Enter, Tab, Escape, F1-F12, a-z, 0-9, Space, etc.")
                 })?;
 
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-                    enigo
-                        .key(key, Direction::Click)
-                        .map_err(|e| anyhow::anyhow!("key press failed: {e}"))?;
-                    info!(
-                        tool = "keyboard",
-                        action = "press",
-                        key = key_name.as_str(),
-                        "[computer] pressed key"
-                    );
-                    Ok(ToolResult::success(format!("Pressed key '{key_name}'")))
-                })
-                .await?
+                into_result(
+                    "press",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+                        enigo
+                            .key(key, Direction::Click)
+                            .map_err(|e| format!("key press failed: {e}"))?;
+                        Ok(format!("Pressed key '{key_name}'"))
+                    })
+                    .await,
+                )
             }
 
             "hotkey" => {
@@ -288,51 +283,42 @@ impl Tool for KeyboardTool {
                 }
 
                 let combo_desc = key_names.join("+");
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-
-                    // Press keys in order, tracking which were successfully
-                    // pressed so we can release them on error.
-                    let mut pressed_keys: Vec<Key> = Vec::with_capacity(keys.len());
-                    let press_result: Result<(), anyhow::Error> = (|| {
-                        for key in &keys {
-                            enigo.key(*key, Direction::Press).map_err(|e| {
-                                anyhow::anyhow!("key press failed for {key:?}: {e}")
-                            })?;
-                            pressed_keys.push(*key);
-                            std::thread::sleep(HOTKEY_INTER_KEY_DELAY);
+                into_result(
+                    "hotkey",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+
+                        // Press keys in order, tracking which were pressed so we
+                        // can release them on error.
+                        let mut pressed_keys: Vec<Key> = Vec::with_capacity(keys.len());
+                        let press_result: Result<(), String> = (|| {
+                            for key in &keys {
+                                enigo
+                                    .key(*key, Direction::Press)
+                                    .map_err(|e| format!("key press failed for {key:?}: {e}"))?;
+                                pressed_keys.push(*key);
+                                std::thread::sleep(HOTKEY_INTER_KEY_DELAY);
+                            }
+                            Ok(())
+                        })();
+
+                        // Always release pressed keys in reverse, even on error.
+                        for key in pressed_keys.iter().rev() {
+                            if let Err(e) = enigo.key(*key, Direction::Release) {
+                                tracing::warn!(
+                                    tool = "keyboard",
+                                    key = ?key,
+                                    error = %e,
+                                    "[computer] best-effort key release failed during cleanup"
+                                );
+                            }
                         }
-                        Ok(())
-                    })();
-
-                    // Always release all successfully pressed keys in reverse
-                    // order, even if a press failed partway through.
-                    for key in pressed_keys.iter().rev() {
-                        if let Err(e) = enigo.key(*key, Direction::Release) {
-                            tracing::warn!(
-                                tool = "keyboard",
-                                key = ?key,
-                                error = %e,
-                                "[computer] best-effort key release failed during cleanup"
-                            );
-                        }
-                    }
-
-                    // Now propagate any press error.
-                    press_result?;
-
-                    info!(
-                        tool = "keyboard",
-                        action = "hotkey",
-                        combo = combo_desc.as_str(),
-                        "[computer] hotkey executed"
-                    );
-                    Ok(ToolResult::success(format!(
-                        "Executed hotkey: {combo_desc}"
-                    )))
-                })
-                .await?
+                        press_result?;
+                        Ok(format!("Executed hotkey: {combo_desc}"))
+                    })
+                    .await,
+                )
             }
 
             other => Ok(ToolResult::error(format!(
@@ -342,6 +328,20 @@ impl Tool for KeyboardTool {
     }
 }
 
+/// Map a main-thread input op result to a `ToolResult`, logging the outcome.
+fn into_result(action: &str, r: Result<String, String>) -> anyhow::Result<ToolResult> {
+    match r {
+        Ok(msg) => {
+            info!(tool = "keyboard", action, "[computer] {msg}");
+            Ok(ToolResult::success(msg))
+        }
+        Err(e) => {
+            tracing::warn!(tool = "keyboard", action, "[computer] failed: {e}");
+            Ok(ToolResult::error(e))
+        }
+    }
+}
+
 #[cfg(test)]
 #[path = "keyboard_tests.rs"]
 mod tests;
diff --git a/src/openhuman/tools/impl/computer/main_thread.rs b/src/openhuman/tools/impl/computer/main_thread.rs
new file mode 100644
index 0000000000..26697fc10f
--- /dev/null
+++ b/src/openhuman/tools/impl/computer/main_thread.rs
@@ -0,0 +1,49 @@
+//! Main-thread bridge for synthetic input (mouse/keyboard).
+//!
+//! macOS's Text Input Source APIs (`TSMGetInputSourceProperty`), which enigo
+//! calls during keyboard-layout lookup, **must run on the app's main thread**.
+//! Running them on a tokio worker (or `spawn_blocking`) traps with
+//! `_dispatch_assert_queue_fail` / `EXC_BREAKPOINT` and crashes the CEF host
+//! (tracker §1.8 / Change 1.15 — confirmed via crash report).
+//!
+//! So the keyboard/mouse tools never call enigo on their own thread. They build
+//! a closure and hand it to [`run_input_on_main`], which dispatches it — over
+//! the native request registry — to a handler the Tauri shell registers at
+//! startup, which runs it on the real main thread via
+//! `AppHandle::run_on_main_thread`.
+
+use crate::core::event_bus::request_native_global;
+
+/// Native-registry method the Tauri shell handles to run an input op on the
+/// main thread. The shell registers a handler under this key at startup.
+pub const INPUT_ON_MAIN_THREAD_METHOD: &str = "computer.input_on_main_thread";
+
+/// A synthetic-input operation to run on the app's main thread. `run` performs
+/// the enigo calls and returns a human-readable success message (`Ok`) or an
+/// error string (`Err`). Carried by value through the native registry (no
+/// serialization — the boxed `FnOnce` passes through unchanged).
+pub struct MainThreadInputOp {
+    pub run: Box<dyn FnOnce() -> Result<String, String> + Send>,
+}
+
+/// Dispatch `op` to the app main thread and return its `Ok`/`Err` unchanged.
+///
+/// Errors if the native request itself fails, e.g. no executor is registered
+/// (headless / CLI builds have no Tauri main thread — input is desktop-only).
+pub async fn run_input_on_main<F>(op: F) -> Result<String, String>
+where
+    F: FnOnce() -> Result<String, String> + Send + 'static,
+{
+    let req = MainThreadInputOp { run: Box::new(op) };
+    match request_native_global::<MainThreadInputOp, Result<String, String>>(
+        INPUT_ON_MAIN_THREAD_METHOD,
+        req,
+    )
+    .await
+    {
+        Ok(inner) => inner,
+        Err(e) => Err(format!(
+            "synthetic input requires the desktop app's main-thread executor (unavailable: {e})"
+        )),
+    }
+}
diff --git a/src/openhuman/tools/impl/computer/mod.rs b/src/openhuman/tools/impl/computer/mod.rs
index ec8363c0f3..6603105d9c 100644
--- a/src/openhuman/tools/impl/computer/mod.rs
+++ b/src/openhuman/tools/impl/computer/mod.rs
@@ -1,8 +1,10 @@
 mod ax_interact;
 mod human_path;
 mod keyboard;
+mod main_thread;
 mod mouse;
 
 pub use ax_interact::AxInteractTool;
 pub use keyboard::KeyboardTool;
+pub use main_thread::{run_input_on_main, MainThreadInputOp, INPUT_ON_MAIN_THREAD_METHOD};
 pub use mouse::MouseTool;
diff --git a/src/openhuman/tools/impl/computer/mouse.rs b/src/openhuman/tools/impl/computer/mouse.rs
index bcaf554e79..40f016e5bd 100644
--- a/src/openhuman/tools/impl/computer/mouse.rs
+++ b/src/openhuman/tools/impl/computer/mouse.rs
@@ -5,6 +5,7 @@
 //! SendInput on Windows, X11/libxdo on Linux).
 
 use super::human_path::{human_path, HumanPathOptions};
+use super::main_thread::run_input_on_main;
 use crate::openhuman::security::SecurityPolicy;
 use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolResult};
 use async_trait::async_trait;
@@ -226,69 +227,57 @@ impl Tool for MouseTool {
             "move" => {
                 let (x, y) = require_xy(&args)?;
                 let human_like = human_like_enabled(&args)?;
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-                    humanized_move(&mut enigo, x, y, human_like)?;
-                    info!(
-                        tool = "mouse",
-                        action = "move",
-                        x = x,
-                        y = y,
-                        "[computer] cursor moved"
-                    );
-                    Ok(ToolResult::success(format!("Moved cursor to ({x}, {y})")))
-                })
-                .await?
+                into_result(
+                    "move",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+                        humanized_move(&mut enigo, x, y, human_like).map_err(|e| e.to_string())?;
+                        Ok(format!("Moved cursor to ({x}, {y})"))
+                    })
+                    .await,
+                )
             }
 
             "click" => {
                 let (x, y) = require_xy(&args)?;
                 let button = parse_button(&args)?;
                 let human_like = human_like_enabled(&args)?;
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-                    humanized_move(&mut enigo, x, y, human_like)?;
-                    enigo
-                        .button(button, Direction::Click)
-                        .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?;
-                    info!(
-                        tool = "mouse", action = "click",
-                        x = x, y = y, button = ?button,
-                        "[computer] clicked"
-                    );
-                    Ok(ToolResult::success(format!(
-                        "Clicked {button:?} at ({x}, {y})"
-                    )))
-                })
-                .await?
+                into_result(
+                    "click",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+                        humanized_move(&mut enigo, x, y, human_like).map_err(|e| e.to_string())?;
+                        enigo
+                            .button(button, Direction::Click)
+                            .map_err(|e| format!("button click failed: {e}"))?;
+                        Ok(format!("Clicked {button:?} at ({x}, {y})"))
+                    })
+                    .await,
+                )
             }
 
             "double_click" => {
                 let (x, y) = require_xy(&args)?;
                 let button = parse_button(&args)?;
                 let human_like = human_like_enabled(&args)?;
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-                    humanized_move(&mut enigo, x, y, human_like)?;
-                    enigo
-                        .button(button, Direction::Click)
-                        .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?;
-                    enigo
-                        .button(button, Direction::Click)
-                        .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?;
-                    info!(
-                        tool = "mouse", action = "double_click",
-                        x = x, y = y, button = ?button,
-                        "[computer] double-clicked"
-                    );
-                    Ok(ToolResult::success(format!(
-                        "Double-clicked {button:?} at ({x}, {y})"
-                    )))
-                })
-                .await?
+                into_result(
+                    "double_click",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+                        humanized_move(&mut enigo, x, y, human_like).map_err(|e| e.to_string())?;
+                        enigo
+                            .button(button, Direction::Click)
+                            .map_err(|e| format!("button click failed: {e}"))?;
+                        enigo
+                            .button(button, Direction::Click)
+                            .map_err(|e| format!("button click failed: {e}"))?;
+                        Ok(format!("Double-clicked {button:?} at ({x}, {y})"))
+                    })
+                    .await,
+                )
             }
 
             "drag" => {
@@ -308,44 +297,40 @@ impl Tool for MouseTool {
                 let sx = start_x as i32;
                 let sy = start_y as i32;
 
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-                    humanized_move(&mut enigo, sx, sy, human_like)?;
-                    enigo
-                        .button(button, Direction::Press)
-                        .map_err(|e| anyhow::anyhow!("button press failed: {e}"))?;
-
-                    // After press succeeds, guarantee release even on error.
-                    let drag_result: Result<(), anyhow::Error> = (|| {
-                        humanized_move(&mut enigo, end_x, end_y, human_like)?;
-                        Ok(())
-                    })();
-
-                    // Always release — best-effort cleanup.
-                    if let Err(e) = enigo.button(button, Direction::Release) {
-                        warn!(
-                            tool = "mouse",
-                            button = ?button,
-                            error = %e,
-                            "[computer] best-effort button release failed during drag cleanup"
-                        );
-                    }
-
-                    // Propagate the drag error if the move failed.
-                    drag_result?;
-
-                    info!(
-                        tool = "mouse", action = "drag",
-                        start_x = sx, start_y = sy,
-                        end_x = end_x, end_y = end_y, button = ?button,
-                        "[computer] dragged"
-                    );
-                    Ok(ToolResult::success(format!(
-                        "Dragged {button:?} from ({sx}, {sy}) to ({end_x}, {end_y})"
-                    )))
-                })
-                .await?
+                into_result(
+                    "drag",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+                        humanized_move(&mut enigo, sx, sy, human_like)
+                            .map_err(|e| e.to_string())?;
+                        enigo
+                            .button(button, Direction::Press)
+                            .map_err(|e| format!("button press failed: {e}"))?;
+
+                        // After press succeeds, guarantee release even on error.
+                        let drag_result: Result<(), String> = (|| {
+                            humanized_move(&mut enigo, end_x, end_y, human_like)
+                                .map_err(|e| e.to_string())?;
+                            Ok(())
+                        })();
+
+                        // Always release — best-effort cleanup.
+                        if let Err(e) = enigo.button(button, Direction::Release) {
+                            warn!(
+                                tool = "mouse",
+                                button = ?button,
+                                error = %e,
+                                "[computer] best-effort button release failed during drag cleanup"
+                            );
+                        }
+                        drag_result?;
+                        Ok(format!(
+                            "Dragged {button:?} from ({sx}, {sy}) to ({end_x}, {end_y})"
+                        ))
+                    })
+                    .await,
+                )
             }
 
             "scroll" => {
@@ -373,31 +358,25 @@ impl Tool for MouseTool {
                     ));
                 }
 
-                tokio::task::spawn_blocking(move || {
-                    let mut enigo = Enigo::new(&Settings::default())
-                        .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?;
-                    if scroll_y != 0 {
-                        enigo
-                            .scroll(scroll_y, enigo::Axis::Vertical)
-                            .map_err(|e| anyhow::anyhow!("vertical scroll failed: {e}"))?;
-                    }
-                    if scroll_x != 0 {
-                        enigo
-                            .scroll(scroll_x, enigo::Axis::Horizontal)
-                            .map_err(|e| anyhow::anyhow!("horizontal scroll failed: {e}"))?;
-                    }
-                    info!(
-                        tool = "mouse",
-                        action = "scroll",
-                        scroll_x = scroll_x,
-                        scroll_y = scroll_y,
-                        "[computer] scrolled"
-                    );
-                    Ok(ToolResult::success(format!(
-                        "Scrolled (x={scroll_x}, y={scroll_y})"
-                    )))
-                })
-                .await?
+                into_result(
+                    "scroll",
+                    run_input_on_main(move || {
+                        let mut enigo = Enigo::new(&Settings::default())
+                            .map_err(|e| format!("Failed to create enigo instance: {e}"))?;
+                        if scroll_y != 0 {
+                            enigo
+                                .scroll(scroll_y, enigo::Axis::Vertical)
+                                .map_err(|e| format!("vertical scroll failed: {e}"))?;
+                        }
+                        if scroll_x != 0 {
+                            enigo
+                                .scroll(scroll_x, enigo::Axis::Horizontal)
+                                .map_err(|e| format!("horizontal scroll failed: {e}"))?;
+                        }
+                        Ok(format!("Scrolled (x={scroll_x}, y={scroll_y})"))
+                    })
+                    .await,
+                )
             }
 
             other => Ok(ToolResult::error(format!(
@@ -407,6 +386,20 @@ impl Tool for MouseTool {
     }
 }
 
+/// Log the outcome of a main-thread input op and wrap it in a `ToolResult` (always `Ok`).
+fn into_result(action: &str, r: Result<String, String>) -> anyhow::Result<ToolResult> {
+    Ok(match r {
+        Ok(msg) => {
+            info!(tool = "mouse", action, "[computer] {msg}");
+            ToolResult::success(msg)
+        }
+        Err(e) => {
+            warn!(tool = "mouse", action, "[computer] failed: {e}");
+            ToolResult::error(e)
+        }
+    })
+}
+
 #[cfg(test)]
 #[path = "mouse_tests.rs"]
 mod tests;

From b96bd279191fe760ce0582270c24df5496cf2a80 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 14:14:37 +0530
Subject: [PATCH 2/9] =?UTF-8?q?feat(accessibility):=20AX/UIA=20perception?=
 =?UTF-8?q?=20+=20automate=20perceive=E2=86=92act=E2=86=92settle=20loop?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the Rust-internal automate engine (poll-until-stable settle, playback
verification), the AXEnabled diagnostics field + settle primitives on
ax_interact, the Music fast-path, and the Windows UIA superset. Exposes
launch_platform as pub(crate) so the automate loop can launch apps mid-flow.

Slice 2/7 of #3307 (accessibility/automate engine).
---
 docs/voice-automate-plan.md                   | 152 +++++
 .../app_fastpaths/fastpaths_tests.rs          | 202 +++++++
 .../accessibility/app_fastpaths/mod.rs        |  34 ++
 .../accessibility/app_fastpaths/music.rs      | 520 +++++++++++++++++
 src/openhuman/accessibility/automate.rs       | 540 ++++++++++++++++++
 src/openhuman/accessibility/automate_tests.rs | 266 +++++++++
 src/openhuman/accessibility/ax_interact.rs    | 118 +++-
 .../accessibility/ax_interact_tests.rs        |  21 +
 src/openhuman/accessibility/helper.rs         |  17 +-
 src/openhuman/accessibility/mod.rs            |   2 +
 src/openhuman/accessibility/uia_interact.rs   |   3 +
 src/openhuman/tools/impl/system/launch_app.rs |   6 +-
 src/openhuman/tools/impl/system/mod.rs        |   2 +
 13 files changed, 1878 insertions(+), 5 deletions(-)
 create mode 100644 docs/voice-automate-plan.md
 create mode 100644 src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs
 create mode 100644 src/openhuman/accessibility/app_fastpaths/mod.rs
 create mode 100644 src/openhuman/accessibility/app_fastpaths/music.rs
 create mode 100644 src/openhuman/accessibility/automate.rs
 create mode 100644 src/openhuman/accessibility/automate_tests.rs

diff --git a/docs/voice-automate-plan.md b/docs/voice-automate-plan.md
new file mode 100644
index 0000000000..217e769b69
--- /dev/null
+++ b/docs/voice-automate-plan.md
@@ -0,0 +1,152 @@
+# Phase 1.5 Implementation Plan — `automate(app, goal)`
+
+**Parent tracker:** [`voice-system-actions.md`](voice-system-actions.md) (Change 1.14 / Phase 1.5)
+**Decided approach:** Rust inner loop + fast model (chat LLM out of the click loop)
+**First proof target:** Music — "play `<song>`" end-to-end
+**Status:** Plan — awaiting approval before code
+
+---
+
+## 1. Goal
+
+Turn a single high-level intent ("play Numb by Linkin Park") into a multi-step UI
+automation that completes in **one tool call from the orchestrator**, runs fast,
+and self-corrects — instead of N separate chat-LLM turns over the raw
+`ax_interact` primitives (today's flow; see tracker §1.10–1.13 for why that's
+slow and fragile).
+
+## 2. Architecture
+
+```
+ orchestrator (chat LLM)
+        │  one call: automate{ app, goal }
+        ▼
+ AutomateTool (tools/impl/computer/automate.rs)
+        │  delegates to
+        ▼
+ accessibility::automate::run(app, goal)         ← the inner loop (Rust)
+        │
+        ├─ fast-path dispatch ── app_fastpaths/{music,spotify,slack}.rs
+        │      (deterministic; skip the loop entirely when available)
+        │
+        └─ general loop ──► perceive → decide → act → settle → verify ──┐
+               ▲                                                         │
+               └────────────── repeat until done / fail / budget ───────┘
+                 perceive: ax_list_elements_filtered (existing)
+                 decide:   create_chat_provider("automation", cfg) → JSON action
+                 act:      ax_press_element / ax_set_field_value / launch_app (existing)
+                 settle:   helper "ax_wait_settled" (new) — AXObserver, not sleep
+                 verify:   re-read state; confirm the action took effect
+```
+
+The **chat model is invoked once** (to pick `automate` and its `goal`). The
+**fast model** runs the inner loop with a tiny context (goal + current filtered
+snapshot + last result), so each step is ~0.5–1s and cheap.
+
+## 3. Inner-loop algorithm
+
+State carried across iterations: `goal`, `app`, `history: Vec<Step>`, `budget`.
+
+Each iteration:
+1. **Perceive** — `ax_list_elements_filtered(app, last_filter_or_"")`, capped/filtered
+   exactly as the `ax_interact` tool does today (≤60 elements, never a raw dump).
+2. **Decide** — call the fast model with a strict system prompt + the JSON action
+   schema (below). Parse one action.
+3. **Act** — execute via existing helpers. `launch` → `launch_app`; `press` →
+   `ax_press_element`; `set_value` → `ax_set_field_value`; `list` → just re-perceive
+   with a new filter.
+4. **Settle** — `ax_wait_settled(app, timeout)` (new helper): block until the AX
+   tree stops changing (debounced AXObserver notifications) or timeout. Removes the
+   timing-race class deterministically.
+5. **Verify** — re-read; confirm the expected post-condition (e.g. a new control
+   appeared, focus changed, a value was set). Record success/failure in `history`.
+6. **Loop** until the model emits `done`/`fail`, or the step budget (e.g. 12) is hit.
+
+### Action schema (fast model output — strict JSON)
+```jsonc
+{
+  "thought": "short reasoning",
+  "action": "launch | list | press | set_value | done | fail",
+  "app": "Music",            // optional override; defaults to the task app
+  "filter": "Highway",       // for list
+  "label": "Play",           // for press / set_value
+  "value": "Highway to Hell", // for set_value
+  "summary": "what happened / why done"  // for done|fail
+}
+```
+Invalid JSON or unknown action → one repair retry, then `fail` with the raw text
+logged (never act on a guess — this is the §1.13 hallucination lesson).
+
+## 4. New files & changes (grounded in current layout)
+
+**New**
+- `src/openhuman/accessibility/automate.rs` — `run(app, goal, opts) -> Result<AutomateOutcome, String>`; the loop, action schema (serde), fast-model call, step budget, structured `history`.
+- `src/openhuman/accessibility/app_fastpaths/mod.rs` + `music.rs` (Spotify/Slack land later) — `try_fastpath(app, goal) -> Option<Result<…>>`.
+- `src/openhuman/tools/impl/computer/automate.rs` — `AutomateTool { allow_mutations }`; reuses the `ax_interact` gating posture (mutations opt-in, `SENSITIVE_APPS` denylist, `permission_level_with_args` = Dangerous, `external_effect_with_args` = true).
+- `src/openhuman/accessibility/automate_tests.rs` — unit tests for the loop (mock perceive/act/decide), schema parse/repair, budget, fast-path dispatch.
+
+**Changed**
+- `accessibility/helper.rs` (macOS Swift) — add `ax_wait_settled` (AXObserver on `kAXValueChanged`/`kAXFocusedUIElementChanged`/`kAXCreated`, debounce ~150ms, bounded ~3s) and return richer element fields (enabled / on-screen / supported actions) from `ax_list`.
+- `accessibility/ax_interact.rs` — surface a `ax_wait_settled` Rust wrapper; extend `AXElement` with the new optional fields (back-compat: `#[serde(default)]`).
+- `accessibility/mod.rs` — declare `automate`, `app_fastpaths`.
+- `inference/provider/factory.rs` — add an `"automation"` role (falls back to the fast/summarization tier) so the loop's model is independently configurable.
+- `tools/ops.rs` (`all_tools_with_runtime`), `tools/user_filter.rs` (new `"automate"` family), `agent_registry/agents/orchestrator/agent.toml` (`named` list), `app/src/utils/toolDefinitions.ts` (Settings → Agent Access toggle).
+- Tracker: flip Change 1.14 / Phase 1.5 rows from ⏳ Planned → in progress as milestones land.
+
+## 5. Fast-model call
+
+`create_chat_provider("automation", &cfg)` → `(provider, model)`; build a
+`ChatRequest { messages, tools: None, stream: None }` with a system prompt that
+pins the JSON schema and a user message carrying `{goal, snapshot, history_tail}`.
+No tools array — we want a single JSON object back, parsed by us, executed by us.
+Temperature low. Token budget small (snapshot is already ≤60 elements).
+
+## 6. Music proof (first target)
+
+`app_fastpaths/music.rs` encodes the §1.11 proven sequence behind one entry:
+1. `launch_app("Music")`
+2. open `music://music.apple.com/search?term=<query>` (URL scheme)
+3. `ax_wait_settled`
+4. `ax_list_elements_filtered("Music", <query>)` → find the song row
+5. `ax_press_element` the row (navigate into detail)
+6. `ax_wait_settled` → `ax_list` the detail page → `ax_press_element("Play")`
+7. verify `osascript … get player state == playing` (best-effort, logged)
+
+If the fast-path can't find the row (timing/locale), fall through to the **general
+loop**, which is what proves the architecture is app-agnostic.
+
+## 7. Progress streaming
+
+Emit a `DomainEvent` per step (`AutomateProgress { app, step, action, ok }`) on the
+event bus; a subscriber bridges to the existing notch/voice status surface
+(PR #3166) so the user sees "Opening Music → searching → playing" live. Reuses the
+`ApprovalSurfaceSubscriber` bridging pattern.
+
+## 8. Testing
+
+- **Unit** (`automate_tests.rs`, CI-safe): action JSON parse + repair; budget exhaustion → `fail`; fast-path dispatch chosen over loop; verify-failure triggers retry/alternate. Perceive/act/decide are trait-injected so tests need no mic/AX/LLM.
+- **Integration** (`#[ignore]`, run on a real Mac): the Music flow end-to-end (mirrors `ax_interact_tests::test_full_flow_search_and_play_acdc`); tool-level success hard-asserted, playback best-effort.
+- **Agent-in-the-loop**: ask the running app "play `<song>`", confirm it picks `automate` and the song plays; watch `[automate]` logs.
+
+## 9. Milestones (sequenced)
+
+1. **M1** — `automate.rs` loop skeleton + action schema + fast-model call + `AutomateTool` (gated, registered). Loop runs against existing (non-settled) `ax_interact` helpers. Unit tests. *Compiles + agent can call it.*
+2. **M2** — `ax_wait_settled` (helper + wrapper) + verify step wired into the loop. Kills the timing-race class.
+3. **M3** — Music fast-path; prove the flow end-to-end on a Mac.
+4. **M4** — progress streaming to the notch surface.
+5. **M5** — richer element model (enabled/onscreen/actions) for better matching.
+6. *(later)* Spotify + Slack fast-paths; vision fallback for Electron; Windows UIA settle parity.
+
+## 10. Risks / open questions
+
+- **Fast model availability** — if no fast tier is configured, fall back to the
+  chat model for the loop (still one tool call; just slower). The `"automation"`
+  role makes this a config decision, not a hard dependency.
+- **AXObserver from the Swift helper** — needs a short run-loop pump; if flaky,
+  fall back to a polling settle (count-stable-for-150ms) behind the same wrapper.
+- **macOS-only first** — Windows UIA settle/verify parity is deferred (§9, item 6), gated
+  like the existing cfg-dispatch; non-mac/non-win returns the existing clean runtime error.
+- **Safety** — `automate` is a mutating tool: same opt-in + `SENSITIVE_APPS`
+  denylist + ApprovalGate routing as `ax_interact`; the inner loop may not target a
+  denylisted app even if the model asks.
+
diff --git a/src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs b/src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs
new file mode 100644
index 0000000000..f804c0d056
--- /dev/null
+++ b/src/openhuman/accessibility/app_fastpaths/fastpaths_tests.rs
@@ -0,0 +1,202 @@
+//! Tests for the app fast-paths: pure query parsing + the Music sequence via a
+//! scripted backend (no live Music, no model).
+
+use super::super::automate::{AutomateBackend, AutomateOutcome};
+use super::super::ax_interact::AXElement;
+use super::music;
+use async_trait::async_trait;
+use std::sync::Mutex;
+
+// ── Pure parser tests ───────────────────────────────────────────────
+
+#[test]
+fn matches_music_play_intents() {
+    assert!(music::matches("Music", "play Numb by Linkin Park"));
+    assert!(music::matches("Apple Music", "play Highway to Hell"));
+    assert!(music::matches("music", "launch music and play Numb"));
+    // Not a play intent → no fast-path.
+    assert!(!music::matches("Music", "pause"));
+    // Not Music → no fast-path.
+    assert!(!music::matches("Slack", "play Numb"));
+}
+
+#[test]
+fn extract_query_basic() {
+    assert_eq!(
+        music::extract_play_query("play Numb by Linkin Park").as_deref(),
+        Some("Numb Linkin Park")
+    );
+}
+
+#[test]
+fn extract_query_strips_filler_and_suffix() {
+    assert_eq!(
+        music::extract_play_query("play the song Highway to Hell by AC/DC").as_deref(),
+        Some("Highway to Hell AC/DC")
+    );
+    assert_eq!(
+        music::extract_play_query("play Numb in Apple Music").as_deref(),
+        Some("Numb")
+    );
+}
+
+#[test]
+fn extract_query_after_launch_clause() {
+    assert_eq!(
+        music::extract_play_query("launch Music and play Numb").as_deref(),
+        Some("Numb")
+    );
+}
+
+#[test]
+fn extract_query_rejects_non_play() {
+    assert_eq!(music::extract_play_query("pause the music"), None);
+    assert_eq!(music::extract_play_query("display settings"), None); // "play" inside "display"
+    assert_eq!(music::extract_play_query("play"), None); // nothing after
+}
+
+#[test]
+fn extract_query_from_quoted_title_with_artist() {
+    // The exact goal that failed live: song quoted earlier, sentence ends "…play it".
+    assert_eq!(
+        music::extract_play_query(
+            "launch Music app, search for \"Highway to Hell\" by AC/DC, and play it"
+        )
+        .as_deref(),
+        Some("Highway to Hell AC/DC")
+    );
+    assert_eq!(
+        music::extract_play_query("play \"Numb\" by Linkin Park").as_deref(),
+        Some("Numb Linkin Park")
+    );
+    // Quoted title, no artist.
+    assert_eq!(
+        music::extract_play_query("please play \"Bohemian Rhapsody\"").as_deref(),
+        Some("Bohemian Rhapsody")
+    );
+}
+
+#[test]
+fn extract_query_rejects_bare_pronoun() {
+    // No song name anywhere → decline (let the general loop / a clarifier handle it).
+    assert_eq!(music::extract_play_query("play it"), None);
+    assert_eq!(music::extract_play_query("play something"), None);
+    assert!(!music::matches("Music", "play it"));
+}
+
+// ── Sequence test via scripted backend ──────────────────────────────
+
+struct Backend {
+    acts: Mutex<Vec<String>>,
+    /// Elements returned by perceive (the search results screen).
+    elements: Vec<AXElement>,
+    press_fail_on: Option<String>,
+}
+
+impl Backend {
+    fn new(elements: Vec<AXElement>) -> Self {
+        Self {
+            acts: Mutex::new(Vec::new()),
+            elements,
+            press_fail_on: None,
+        }
+    }
+    fn acts(&self) -> Vec<String> {
+        self.acts.lock().unwrap().clone()
+    }
+}
+
+#[async_trait]
+impl AutomateBackend for Backend {
+    async fn perceive(&self, _app: &str, _filter: &str) -> Result<Vec<AXElement>, String> {
+        Ok(self.elements.clone())
+    }
+    async fn decide(&self, _system: &str, _user: &str) -> Result<String, String> {
+        Err("fast-path must not call the model".into())
+    }
+    async fn act_launch(&self, app: &str) -> Result<String, String> {
+        self.acts.lock().unwrap().push(format!("launch:{app}"));
+        Ok("ok".into())
+    }
+    async fn act_press(&self, app: &str, label: &str) -> Result<String, String> {
+        self.acts
+            .lock()
+            .unwrap()
+            .push(format!("press:{app}:{label}"));
+        if self.press_fail_on.as_deref() == Some(label) {
+            return Err("press failed".into());
+        }
+        Ok("ok".into())
+    }
+    async fn act_set_value(&self, _a: &str, _l: &str, _v: &str) -> Result<String, String> {
+        Ok("ok".into())
+    }
+    async fn open_url(&self, url: &str) -> Result<String, String> {
+        self.acts.lock().unwrap().push(format!("open_url:{url}"));
+        Ok("ok".into())
+    }
+    async fn settle(&self, _app: &str) {}
+    async fn wait(&self, _ms: u64) {}
+}
+
+fn song_row(label: &str) -> AXElement {
+    AXElement::new("AXCell", label)
+}
+
+#[tokio::test]
+async fn music_fastpath_full_sequence() {
+    let backend = Backend::new(vec![song_row("Numb"), AXElement::new("AXButton", "Play")]);
+    let out = music::run("play Numb by Linkin Park", &backend).await;
+    assert!(out.success, "expected success: {out:?}");
+    let acts = backend.acts();
+    // launch → open search url → press the row → press detail Play.
+    assert_eq!(acts[0], "launch:Music");
+    assert!(acts[1].starts_with("open_url:music://"), "got {}", acts[1]);
+    assert!(acts.contains(&"press:Music:Numb".to_string()), "{acts:?}");
+    assert!(acts.contains(&"press:Music:Play".to_string()), "{acts:?}");
+}
+
+#[tokio::test]
+async fn music_fastpath_no_row_fails_for_fallthrough() {
+    // Search screen has nothing matching → fast-path fails (loop falls through).
+    let backend = Backend::new(vec![AXElement::new("AXButton", "Some Unrelated Button")]);
+    let out = music::run("play Numb", &backend).await;
+    assert!(!out.success);
+    assert!(out.summary.contains("no matching song"), "{}", out.summary);
+}
+
+#[tokio::test]
+async fn music_fastpath_presses_row_even_if_reported_disabled() {
+    // Apple Music reports pressable result rows as enabled=Some(false); the
+    // fast-path must still press them (regression guard for the M5 mis-gate).
+    let mut row = AXElement::new("AXCell", "Numb");
+    row.enabled = Some(false);
+    let backend = Backend::new(vec![row, AXElement::new("AXButton", "Play")]);
+    let out = music::run("play Numb", &backend).await;
+    assert!(out.success, "must press a 'disabled'-reported row: {out:?}");
+    assert!(backend.acts().contains(&"press:Music:Numb".to_string()));
+}
+
+#[tokio::test]
+async fn try_fastpath_dispatches_music_and_skips_others() {
+    let backend = Backend::new(vec![song_row("Numb")]);
+    // Non-music app → None (general loop handles it).
+    assert!(super::try_fastpath("Slack", "play Numb", &backend)
+        .await
+        .is_none());
+    // Music + play → Some.
+    assert!(super::try_fastpath("Music", "play Numb", &backend)
+        .await
+        .is_some());
+}
+
+// Outcome type sanity: fast-paths build the same outcome the loop returns.
+#[test]
+fn outcome_shape() {
+    let o = AutomateOutcome {
+        success: true,
+        summary: "x".into(),
+        steps: vec![],
+    };
+    assert!(o.success);
+}
diff --git a/src/openhuman/accessibility/app_fastpaths/mod.rs b/src/openhuman/accessibility/app_fastpaths/mod.rs
new file mode 100644
index 0000000000..534d7299b5
--- /dev/null
+++ b/src/openhuman/accessibility/app_fastpaths/mod.rs
@@ -0,0 +1,34 @@
+//! Deterministic per-app accelerators for the `automate` loop.
+//!
+//! A fast-path encodes a *proven* native sequence for a common (app, intent)
+//! pair so the loop doesn't have to rediscover it with the model every time.
+//! [`try_fastpath`] is consulted **before** the general loop and returns:
+//!   - `Some(success)`  → the loop returns it directly,
+//!   - `Some(failure)`  → the loop logs and falls through to the model loop,
+//!   - `None`           → no fast-path applies; straight to the model loop.
+//!
+//! So a fast-path can only *help*. This is deliberately different from the
+//! removed `play_music` tool (tracker §1.13): that was a separate tool the LLM
+//! had to choose (and chose wrong); this is internal to `automate`, transparent,
+//! and always backed by the general loop.
+
+use super::automate::AutomateBackend;
+use super::automate::AutomateOutcome;
+
+mod music;
+
+/// Consult the registered fast-paths; `None` when none of them claims this (app, goal).
+pub async fn try_fastpath(
+    app: &str,
+    goal: &str,
+    backend: &dyn AutomateBackend,
+) -> Option<AutomateOutcome> {
+    if !music::matches(app, goal) {
+        return None;
+    }
+    Some(music::run(goal, backend).await)
+}
+
+#[cfg(test)]
+#[path = "fastpaths_tests.rs"]
+mod tests;
diff --git a/src/openhuman/accessibility/app_fastpaths/music.rs b/src/openhuman/accessibility/app_fastpaths/music.rs
new file mode 100644
index 0000000000..a87079e535
--- /dev/null
+++ b/src/openhuman/accessibility/app_fastpaths/music.rs
@@ -0,0 +1,520 @@
+//! Apple Music fast-path: "play `<song>`".
+//!
+//! Encodes the sequence empirically proven in tracker §1.11: open the Music
+//! search URL scheme, press the matching song row to **navigate** into it, then
+//! press the detail-page **Play** (a search-result press only selects/navigates;
+//! the second Play press is what actually starts playback). All steps go through
+//! the injectable [`AutomateBackend`], so the whole flow is unit-testable with a
+//! scripted backend — no live Music, no model.
+
+use super::AutomateBackend;
+use super::AutomateOutcome;
+
+const APP: &str = "Music";
+
+/// Element roles that represent a tappable search result / song row.
+const ROW_ROLES: &[&str] = &["AXCell", "AXRow", "ListItem", "AXButton", "AXStaticText"];
+
+/// True when `app` is Apple Music and `goal` yields a usable "play X" query.
+pub fn matches(app: &str, goal: &str) -> bool {
+    is_music_app(app) && extract_play_query(goal).is_some()
+}
+
+/// Recognise Apple Music by its common display names (trimmed, case-insensitive).
+fn is_music_app(app: &str) -> bool {
+    let name = app.trim().to_lowercase();
+    matches!(name.as_str(), "music" | "apple music" | "itunes")
+}
+
+/// Pull the search query out of a "play …" goal, or `None` if it isn't one.
+///
+/// Two strategies, in order:
+/// 1. **Quoted title** — the orchestrator usually quotes the song, e.g.
+///    `search for "Highway to Hell" by AC/DC, and play it`. Use the first
+///    quoted span, plus any `by <artist>` that immediately follows it. This is
+///    robust to where "play" sits in the sentence (it was the bug: a goal
+///    ending in "…and play it" made the after-"play" strategy extract "it").
+/// 2. **After "play"** — `play Numb by Linkin Park`, `play the song X`, etc.
+///    Uses the last "play" that stands alone as a word (no letter directly
+///    before or after), so "replay" / "Playing" / "playlist" are skipped.
+///
+/// Either way: drop leading `the song`/`track` filler, a trailing
+/// `in/on (apple) music`, rewrite ` by ` to a space (better catalog recall),
+/// and reject bare pronouns ("it"/"this"/…) that carry no song name.
+pub fn extract_play_query(goal: &str) -> Option<String> {
+    // Strategy 1: first quoted title (+ trailing "by artist").
+    if let Some((title, rest)) = first_quoted(goal) {
+        let mut q = title.trim().to_string();
+        if let Some(artist) = trailing_by_artist(rest) {
+            q.push(' ');
+            q.push_str(&artist);
+        }
+        let q = clean_query(&q);
+        if !q.is_empty() && !is_pronoun(&q) {
+            return Some(q);
+        }
+    }
+
+    // Strategy 2: text after the last whole-word "play". ASCII-only folding
+    // keeps byte offsets identical to `goal`, so offsets found in `lower` can
+    // slice `goal` safely (a full `to_lowercase` can change byte lengths and
+    // leave the offsets misaligned or off a char boundary).
+    let lower = goal.to_ascii_lowercase();
+    let letter = |c: Option<char>| c.is_some_and(char::is_alphabetic);
+    let (idx, word) = lower.rmatch_indices("play").find(|&(i, w)| {
+        !letter(goal[..i].chars().next_back()) && !letter(goal[i + w.len()..].chars().next())
+    })?;
+    // Strip at most one leading filler phrase (ASCII-case-insensitive); `get`
+    // declines instead of panicking when the prefix would split a character.
+    let mut q = goal[idx + word.len()..].trim();
+    for filler in ["the song ", "the track ", "song ", "track ", "me "] {
+        if q.get(..filler.len()).is_some_and(|p| p.eq_ignore_ascii_case(filler)) {
+            q = &q[filler.len()..];
+            break;
+        }
+    }
+    let q = clean_query(q);
+    if q.is_empty() || is_pronoun(&q) {
+        None
+    } else {
+        Some(q)
+    }
+}
+
+/// Drop one trailing "(in|on) [apple] music" phrase, then collapse " by " into
+/// a single space (case-insensitively) and trim the result.
+fn clean_query(q: &str) -> String {
+    const SUFFIXES: [&str; 4] = [
+        " in apple music",
+        " on apple music",
+        " in music",
+        " on music",
+    ];
+    let trimmed = q.trim();
+    let folded = trimmed.to_lowercase();
+    let kept = match SUFFIXES.iter().find(|s| folded.ends_with(**s)) {
+        Some(s) => &trimmed[..trimmed.len() - s.len()],
+        None => trimmed,
+    };
+    replace_ci(kept, " by ", " ").trim().to_string()
+}
+
+/// True when the query is only a pronoun / generic noun: it names no song, so the
+/// fast-path should decline and leave it to the general loop (or a clarifying reply).
+fn is_pronoun(q: &str) -> bool {
+    const EMPTY_WORDS: [&str; 9] = [
+        "it", "this", "that", "them", "something", "some music", "music", "a song", "songs",
+    ];
+    EMPTY_WORDS.contains(&q.trim().to_lowercase().as_str())
+}
+
+/// Return the first double-quoted span (straight `"` or curly U+201C/U+201D) and
+/// the text after its closing quote. Single quotes are not treated as quotes.
+fn first_quoted(s: &str) -> Option<(String, &str)> {
+    let opens = ['"', '\u{201C}'];
+    let closes = ['"', '\u{201D}'];
+    // Offsets come from `char_indices`, so every slice bound is a char boundary.
+    let (start, open) = s.char_indices().find(|(_, c)| opens.contains(c))?;
+    let body = &s[start + open.len_utf8()..];
+    let (rel, close) = body.char_indices().find(|(_, c)| closes.contains(c))?;
+    let inner = &body[..rel];
+    if inner.trim().is_empty() {
+        return None;
+    }
+    Some((inner.to_string(), &body[rel + close.len_utf8()..]))
+}
+
+/// If `rest` begins with `by <artist>`, capture the artist up to the next
+/// clause boundary ("," / " and " / " then " / " in " / " on " / end).
+/// Matching is ASCII-case-insensitive; `None` when there is no `by` or no artist.
+fn trailing_by_artist(rest: &str) -> Option<String> {
+    let t = rest.trim_start();
+    // ASCII-only folding keeps byte offsets identical to `t`, so positions found
+    // in the folded copy are always valid slice bounds for the original text
+    // (a full `to_lowercase` can change byte lengths and misplace the cut).
+    let lower = t.to_ascii_lowercase();
+    let folded = lower.strip_prefix("by ")?;
+    let artist_region = &t[t.len() - folded.len()..];
+    // Cut at the earliest clause boundary; the folded copy is built once, not
+    // once per delimiter.
+    let end = [",", " and ", " then ", " in ", " on "]
+        .iter()
+        .filter_map(|delim| folded.find(*delim))
+        .min()
+        .unwrap_or(artist_region.len());
+    let artist = artist_region[..end].trim();
+    (!artist.is_empty()).then(|| artist.to_string())
+}
+
+/// ASCII-case-insensitive replace of every `needle` in `haystack` with `repl`.
+/// An empty `needle` matches nothing, so `haystack` is returned unchanged.
+fn replace_ci(haystack: &str, needle: &str, repl: &str) -> String {
+    if needle.is_empty() {
+        return haystack.to_string();
+    }
+    // ASCII folding preserves byte lengths, so offsets in `hl` are valid in `haystack`.
+    let (hl, nl) = (haystack.to_ascii_lowercase(), needle.to_ascii_lowercase());
+    let mut out = String::with_capacity(haystack.len());
+    let mut copied = 0;
+    for (at, hit) in hl.match_indices(nl.as_str()) {
+        out.push_str(&haystack[copied..at]);
+        out.push_str(repl);
+        copied = at + hit.len();
+    }
+    out.push_str(&haystack[copied..]);
+    out
+}
+
+/// Apple Music search deep link for `query`; the term is percent-encoded so
+/// spaces and URL delimiters survive the hand-off to the OS opener.
+fn search_url(query: &str) -> String {
+    const PREFIX: &str = "music://music.apple.com/search?term=";
+    let term = percent_encode(query);
+    [PREFIX, term.as_str()].concat()
+}
+
+/// Percent-encode every byte outside the RFC 3986 "unreserved" set (ASCII
+/// letters, digits, `-`, `_`, `.`, `~`) as uppercase `%XX`.
+fn percent_encode(s: &str) -> String {
+    const HEX: &[u8; 16] = b"0123456789ABCDEF";
+    let mut encoded = String::with_capacity(s.len());
+    for byte in s.bytes() {
+        if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~') {
+            encoded.push(char::from(byte));
+        } else {
+            let (hi, lo) = (usize::from(byte >> 4), usize::from(byte & 0x0F));
+            encoded.extend([b'%', HEX[hi], HEX[lo]].map(char::from));
+        }
+    }
+    encoded
+}
+
+/// First whitespace-separated word of `query` longer than two bytes (so
+/// "to"/"by" are skipped), or "" if there is none. It seeds the perceive
+/// filter; the full multi-word comparison is left to `pick_row`.
+fn first_token(query: &str) -> String {
+    for word in query.split_whitespace() {
+        if word.len() > 2 {
+            return word.to_string();
+        }
+    }
+    String::new()
+}
+
+/// Pick the label to press from a perceive snapshot. Any element whose label
+/// equals the query (case-insensitively) wins outright; otherwise take the
+/// first row-role element whose label contains one of the query's words that
+/// is longer than two bytes. `None` when nothing qualifies.
+fn pick_row(elements: &[super::super::ax_interact::AXElement], query: &str) -> Option<String> {
+    let wanted = query.to_lowercase();
+    // Elements reporting `enabled == false` are intentionally NOT skipped — Apple
+    // Music marks pressable result rows as disabled; see AXElement::enabled docs.
+    let exact = elements.iter().find(|el| el.label.to_lowercase() == wanted);
+    if let Some(el) = exact {
+        return Some(el.label.clone());
+    }
+    let words: Vec<&str> = wanted.split_whitespace().filter(|w| w.len() > 2).collect();
+    for el in elements.iter().filter(|el| ROW_ROLES.iter().any(|r| el.role.contains(r))) {
+        let label = el.label.to_lowercase();
+        if words.iter().any(|w| label.contains(w)) {
+            return Some(el.label.clone());
+        }
+    }
+    None
+}
+
+/// Run the play fast-path. Returns a failed [`AutomateOutcome`] (not a panic)
+/// whenever a step can't proceed, so the caller falls through to the general
+/// loop. All real-time waits go through the backend so tests can no-op them.
+pub async fn run(goal: &str, backend: &dyn AutomateBackend) -> AutomateOutcome {
+    let mut steps: Vec<String> = Vec::new();
+    let query = match extract_play_query(goal) {
+        Some(q) => q,
+        None => {
+            return fail("not a play request", steps);
+        }
+    };
+    log::info!("[automate::music] ▶ play query={query:?}");
+    use super::super::automate::progress;
+    use crate::openhuman::overlay::OverlayAttentionTone;
+    progress(
+        format!("Searching Music for {query}…"),
+        OverlayAttentionTone::Accent,
+    );
+
+    // 1. Launch Music.
+    match backend.act_launch(APP).await {
+        Ok(m) => steps.push(format!("launch: {m}")),
+        Err(e) => steps.push(format!("launch FAILED: {e}")),
+    }
+    backend.settle(APP).await;
+
+    // 2. Open the search URL.
+    let url = search_url(&query);
+    match backend.open_url(&url).await {
+        Ok(m) => steps.push(format!("search: {m}")),
+        Err(e) => {
+            steps.push(format!("search url FAILED: {e}"));
+            return fail("could not open Music search", steps);
+        }
+    }
+    // 3. Find the song row and press it to navigate in. Search results render
+    //    asynchronously (the §1.13 timing race), so retry across settles, and
+    //    filter the snapshot by one strong token (a substring filter can't
+    //    match a whole multi-word title).
+    let filter = first_token(&query);
+    let mut row = None;
+    for attempt in 0..6 {
+        backend.settle(APP).await;
+        let els = backend.perceive(APP, &filter).await.unwrap_or_default();
+        if let Some(r) = pick_row(&els, &query) {
+            row = Some(r);
+            break;
+        }
+        // Catalog search results arrive asynchronously (~3-4s); element-count
+        // settle can report "stable" while the network fetch is still pending,
+        // so wait real time between attempts rather than spinning instantly.
+        log::info!("[automate::music] search results not ready (attempt {attempt}), waiting");
+        backend.wait(800).await;
+    }
+    let row = match row {
+        Some(r) => r,
+        None => return fail("no matching song row found", steps),
+    };
+    // Baseline count of "Play" controls *before* navigating, so we can tell
+    // when the song's detail-page Play has actually rendered (vs. only the
+    // toolbar transport Play that's always present).
+    let plays_before = count_play_buttons(backend).await;
+
+    match backend.act_press(APP, &row).await {
+        Ok(m) => steps.push(format!("open song: {m}")),
+        Err(e) => {
+            steps.push(format!("open song FAILED: {e}"));
+            return fail("could not open the song", steps);
+        }
+    }
+
+    // 4. Wait for the detail-page Play to appear. Pressing too early hits only
+    //    the toolbar transport (empty queue → silence) — the exact false-success
+    //    we hit live. Poll until a new Play control shows up (or give up after a
+    //    few settles and try anyway).
+    for _ in 0..5 {
+        backend.settle(APP).await;
+        if count_play_buttons(backend).await > plays_before {
+            break;
+        }
+    }
+
+    // 5. Press Play, then VERIFY real playback. If it didn't start, the press
+    //    landed on the wrong Play — wait and retry a couple of times. Only
+    //    report success when player state is actually "playing" (or the backend
+    //    can't verify, in which case it's best-effort).
+    let mut verified: Option<bool> = None;
+    for attempt in 0..3 {
+        match backend.act_press(APP, "Play").await {
+            Ok(m) => steps.push(format!("play press (attempt {attempt}): {m}")),
+            Err(e) => steps.push(format!("play press FAILED: {e}")),
+        }
+        backend.settle(APP).await;
+        match backend.verify_playing().await {
+            Some(true) => {
+                verified = Some(true);
+                break;
+            }
+            Some(false) => {
+                verified = Some(false);
+                // Give the detail page a beat, then retry (via `wait` so tests can no-op it).
+                backend.wait(700).await;
+            }
+            None => {
+                // Can't verify (non-macOS) — accept best-effort and stop.
+                verified = None;
+                break;
+            }
+        }
+    }
+
+    match verified {
+        Some(false) => {
+            steps.push("verify: player state never reached 'playing'".to_string());
+            fail("opened the song but playback didn't start", steps)
+        }
+        Some(true) => {
+            steps.push("verify: playing ✓".to_string());
+            progress(format!("Playing {query}"), OverlayAttentionTone::Success);
+            AutomateOutcome {
+                success: true,
+                summary: format!("Playing '{query}' in Music."),
+                steps,
+            }
+        }
+        None => AutomateOutcome {
+            success: true,
+            summary: format!("Started '{query}' in Music (playback unverified)."),
+            steps,
+        },
+    }
+}
+
+/// Count the elements labelled exactly "Play" (ASCII case-insensitive) in a
+/// `perceive(APP, "Play")` snapshot — the toolbar transport plus any
+/// detail-page Play. Lets the caller detect when navigation has rendered the
+/// song's own Play; a failed perceive counts as zero.
+async fn count_play_buttons(backend: &dyn AutomateBackend) -> usize {
+    match backend.perceive(APP, "Play").await {
+        Ok(elements) => elements
+            .iter()
+            .filter(|el| el.label.eq_ignore_ascii_case("Play"))
+            .count(),
+        Err(_) => 0,
+    }
+}
+
+fn fail(msg: &str, steps: Vec<String>) -> AutomateOutcome {
+    AutomateOutcome {
+        success: false, // `automate::run` falls through to the general loop on this
+        summary: format!("Music fast-path: {msg}"), // prefix marks the fast-path as the source
+        steps,
+    }
+}
+
+#[cfg(test)]
+mod unit {
+    use super::*;
+
+    #[test]
+    fn first_token_skips_short_words() {
+        assert_eq!(first_token("Highway to Hell AC/DC"), "Highway");
+        assert_eq!(first_token("Numb Linkin Park"), "Numb");
+        // All-short → empty string, which `run` then passes to perceive as the filter.
+        assert_eq!(first_token("a x"), "");
+    }
+
+    #[test]
+    fn percent_encode_escapes_reserved() {
+        assert_eq!(percent_encode("Highway to Hell"), "Highway%20to%20Hell");
+        // The slash in AC/DC must be encoded (this was the live-run bug).
+        assert_eq!(percent_encode("AC/DC"), "AC%2FDC");
+        assert_eq!(percent_encode("rock&roll"), "rock%26roll");
+    }
+
+    #[test]
+    fn search_url_is_well_formed() {
+        let u = search_url("Highway to Hell AC/DC");
+        assert_eq!(
+            u,
+            "music://music.apple.com/search?term=Highway%20to%20Hell%20AC%2FDC"
+        );
+    }
+
+    #[test]
+    fn pick_row_prefers_exact_then_token() {
+        use super::super::super::ax_interact::AXElement;
+        let els = vec![
+            AXElement::new("AXCell", "Highway to Hell"),
+            AXElement::new("AXButton", "Play"),
+        ];
+        // Token-match path only: the query's extra "AC/DC" rules out an exact label match.
+        assert_eq!(
+            pick_row(&els, "Highway to Hell AC/DC").as_deref(),
+            Some("Highway to Hell")
+        );
+    }
+}
+
+/// Live integration test — drives the real Apple Music app. Ignored by default
+/// (needs macOS, the Music app, and Accessibility permission for the runner).
+///
+/// Run on a Mac with:
+///   cargo test --lib music_fastpath_live -- --ignored --nocapture
+#[cfg(all(test, target_os = "macos"))]
+mod live {
+    use super::run;
+    use crate::openhuman::accessibility::automate::RealBackend;
+
+    #[tokio::test]
+    #[ignore = "requires macOS + Music app + Accessibility permission"]
+    async fn music_fastpath_live() {
+        let backend = RealBackend::new(crate::openhuman::config::Config::default());
+        let out = run("play Highway to Hell by AC/DC", &backend).await;
+        // Print the outcome and steps first so a failing run is still diagnosable;
+        // the assertions below then require real playback, not just tool success.
+        println!(
+            "[music_fastpath_live] success={} summary={}",
+            out.success, out.summary
+        );
+        for s in &out.steps {
+            println!("  - {s}");
+        }
+        let state = player_state();
+        println!("[music_fastpath_live] player_state={state}");
+        // Now that the flow verifies playback, hold it to the real bar:
+        // the song must actually be playing.
+        assert!(out.success, "fast-path reported failure: {}", out.summary);
+        assert_eq!(state, "playing", "Music did not actually start playing");
+    }
+
+    /// `osascript` ground-truth for whether audio is actually playing.
+    fn player_state() -> String {
+        std::process::Command::new("osascript")
+            .args(["-e", "tell application \"Music\" to player state as string"])
+            .output()
+            .ok()
+            .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
+            .unwrap_or_else(|| "(osascript failed)".into())
+    }
+
+    /// Empirical probe (not an assertion): open the search, dump what Music's
+    /// AX tree actually exposes, and report player state before/after each
+    /// candidate press. Used to design the real play sequence.
+    #[tokio::test]
+    #[ignore = "probe — run manually to inspect Music's AX tree"]
+    async fn music_probe() {
+        use crate::openhuman::accessibility::ax_interact as ax;
+        let q = "Highway to Hell";
+        let _ = std::process::Command::new("open")
+            .arg("-a")
+            .arg("Music")
+            .status();
+        std::thread::sleep(std::time::Duration::from_secs(3));
+        let _ = std::process::Command::new("open")
+            .arg(format!(
+                "music://music.apple.com/search?term={}",
+                q.replace(' ', "%20")
+            ))
+            .status();
+        std::thread::sleep(std::time::Duration::from_secs(4));
+
+        println!("=== player state at start: {} ===", player_state());
+        let dump = |label: &str, filter: &str| match ax::ax_list_elements_filtered("Music", filter)
+        {
+            Ok(els) => {
+                println!(
+                    "--- {label} (filter={filter:?}): {} elements ---",
+                    els.len()
+                );
+                for e in els.iter().take(60) {
+                    println!("   [{}] {} enabled={:?}", e.role, e.label, e.enabled);
+                }
+            }
+            Err(e) => println!("--- {label}: ERROR {e} ---"),
+        };
+        dump("after search", "Highway");
+        dump("play buttons", "Play");
+
+        // Press the first search-result row → does it navigate / play?
+        println!("\n>>> pressing result 'Highway to Hell'");
+        let _ = ax::ax_press_element("Music", "Highway to Hell");
+        std::thread::sleep(std::time::Duration::from_secs(3));
+        println!("=== player state after row press: {} ===", player_state());
+        dump("detail page play", "Play");
+
+        // Try the detail-page Play (not the toolbar one) if still stopped.
+        if player_state() != "playing" {
+            println!("\n>>> pressing 'Play' after navigate");
+            let _ = ax::ax_press_element("Music", "Play");
+            std::thread::sleep(std::time::Duration::from_secs(3));
+            println!("=== player state after Play press: {} ===", player_state());
+        }
+    }
+}
diff --git a/src/openhuman/accessibility/automate.rs b/src/openhuman/accessibility/automate.rs
new file mode 100644
index 0000000000..3c9955bcf5
--- /dev/null
+++ b/src/openhuman/accessibility/automate.rs
@@ -0,0 +1,540 @@
+//! `automate` — Rust-driven multi-step UI automation loop.
+//!
+//! Phase 1.5 (see `docs/voice-automate-plan.md`). The chat orchestrator calls
+//! `automate{app, goal}` **once**; this module then runs the whole multi-step
+//! flow internally with a *fast* model, so the heavy chat model never sits
+//! inside the click loop. Each iteration is **perceive → decide → act →
+//! settle → verify**:
+//!
+//!   - **perceive** — read a small, filtered accessibility snapshot of the app
+//!     (`ax_interact::ax_list_elements_filtered`, capped — never a raw dump,
+//!     which is what made the chat model hallucinate; tracker §1.13).
+//!   - **decide** — ask the fast model for exactly one JSON action.
+//!   - **act**     — run it via the existing AX primitives / `launch_app`.
+//!   - **settle**  — wait for the UI to stop changing (M2 makes this real; the
+//!     M1 backend uses a short fixed wait).
+//!   - **verify**  — fold the post-action snapshot back into the next prompt.
+//!
+//! The loop is generic over an [`AutomateBackend`] so the decision model, the
+//! accessibility calls, and the launcher are all injectable — the unit tests
+//! drive a scripted backend with no mic, no AX tree, and no LLM.
+
+use super::ax_interact as ax;
+use crate::openhuman::overlay::{publish_attention, OverlayAttentionEvent, OverlayAttentionTone};
+use async_trait::async_trait;
+use serde::Deserialize;
+
+const LOG_PREFIX: &str = "[automate]";
+
+/// Publish a one-line progress message (source "automate", TTL 5000 ms) to
+/// the notch / overlay so the user can follow the automation live (M4).
+/// Fire-and-forget: the publish result is deliberately discarded.
+pub(crate) fn progress(message: impl Into<String>, tone: OverlayAttentionTone) {
+    let event = OverlayAttentionEvent::new(message)
+        .with_source("automate")
+        .with_tone(tone)
+        .with_ttl_ms(5000);
+    // Nothing useful to do if the publish fails, so the result is dropped.
+    let _ = publish_attention(event);
+}
+
+/// Default ceiling on loop iterations. Each iteration is one fast-model call
+/// plus one action, so this bounds latency and cost even if the model never
+/// emits `done`.
+pub const DEFAULT_STEP_BUDGET: u32 = 12;
+
+/// How many elements a perceive snapshot renders into the prompt. Mirrors the
+/// `ax_interact` tool cap so a broad/empty filter can't overflow the model's
+/// context and trigger the truncation→hallucination failure (tracker §1.13).
+const MAX_SNAPSHOT: usize = 40;
+
+/// One decoded action from the fast model; every field except `action` is optional.
+#[derive(Debug, Clone, Deserialize, Default, PartialEq)]
+pub struct Action {
+    /// The model's short reasoning. Logged, never executed.
+    #[serde(default)]
+    pub thought: String,
+    /// Required verb (no serde default): `launch`, `list`, `press`, `set_value`, `done`, `fail`.
+    pub action: String,
+    /// Optional per-action app override; defaults to the task's app.
+    #[serde(default)]
+    pub app: Option<String>,
+    /// Substring filter for `list`.
+    #[serde(default)]
+    pub filter: String,
+    /// Element label for `press` / `set_value`.
+    #[serde(default)]
+    pub label: String,
+    /// Text to enter for `set_value`.
+    #[serde(default)]
+    pub value: String,
+    /// Final message for `done` / `fail`.
+    #[serde(default)]
+    pub summary: String,
+}
+
+/// The result of a completed (or budget-exhausted) automation run.
+#[derive(Debug, Clone, PartialEq)]
+pub struct AutomateOutcome {
+    pub success: bool,
+    pub summary: String,
+    /// One human-readable line per executed step (for the chat agent and logs).
+    pub steps: Vec<String>,
+}
+
+impl AutomateOutcome {
+    /// Build a failed outcome (`success: false`) from a summary and the steps so far.
+    fn fail(summary: impl Into<String>, steps: Vec<String>) -> Self {
+        Self {
+            success: false,
+            summary: summary.into(),
+            steps,
+        }
+    }
+}
+
+/// Injectable side-effects for the loop: production ([`RealBackend`]) talks to the
+/// OS accessibility tree and a fast LLM; tests supply a scripted impl.
+#[async_trait]
+pub trait AutomateBackend: Send + Sync {
+    /// Read interactive elements in `app` whose label contains `filter`.
+    async fn perceive(&self, app: &str, filter: &str) -> Result<Vec<ax::AXElement>, String>;
+    /// Ask the decision model for one JSON action. `system` pins the schema;
+    /// `user` carries the goal + current snapshot + recent step history.
+    async fn decide(&self, system: &str, user: &str) -> Result<String, String>;
+    /// Launch `app` (callers also use it to foreground); `Ok` is a status message.
+    async fn act_launch(&self, app: &str) -> Result<String, String>;
+    /// Press the element in `app` matching `label`; `Ok` is a status message.
+    async fn act_press(&self, app: &str, label: &str) -> Result<String, String>;
+    /// Enter `value` into the field in `app` matching `label`.
+    async fn act_set_value(&self, app: &str, label: &str, value: &str) -> Result<String, String>;
+    /// Open a URL / URI-scheme (e.g. `music://…search?term=…`) via the OS opener.
+    /// Used by deterministic app fast-paths; the general loop does not call it.
+    async fn open_url(&self, url: &str) -> Result<String, String>;
+    /// Best-effort: is media currently playing? `None` (the default) when the
+    /// backend can't tell. Media fast-paths use this to confirm playback really
+    /// started rather than trusting AX-level success (the §1.11 false-success).
+    async fn verify_playing(&self) -> Option<bool> {
+        None
+    }
+    /// Block until the UI settles after an action.
+    async fn settle(&self, app: &str);
+    /// Wait ~`ms` of real time so asynchronous content (e.g. network search results)
+    /// can render. Default is a real sleep; test backends override it to a no-op.
+    async fn wait(&self, ms: u64) {
+        tokio::time::sleep(std::time::Duration::from_millis(ms)).await;
+    }
+}
+
+/// Per-run tuning: `step_budget` caps loop iterations (default [`DEFAULT_STEP_BUDGET`]).
+#[derive(Debug, Clone, Copy)]
+pub struct AutomateOptions {
+    pub step_budget: u32,
+}
+
+impl Default for AutomateOptions {
+    fn default() -> Self {
+        Self {
+            step_budget: DEFAULT_STEP_BUDGET,
+        }
+    }
+}
+
+/// System prompt pinning the action contract (JSON keys mirror [`Action`]) for the fast model.
+fn system_prompt() -> String {
+    "You drive a desktop app's UI to accomplish a goal. You see a list of the \
+     app's interactive elements (each as `[role] label`) and act one step at a \
+     time.\n\
+     \n\
+     Respond with EXACTLY ONE JSON object and nothing else:\n\
+     {\"thought\":\"...\",\"action\":\"<verb>\",\"app\":\"<optional>\",\
+     \"filter\":\"...\",\"label\":\"...\",\"value\":\"...\",\"summary\":\"...\"}\n\
+     \n\
+     Verbs:\n\
+     • launch     — open the app (use first if it isn't showing any elements)\n\
+     • list       — re-read elements; set `filter` to a substring to narrow them\n\
+     • press      — activate the element whose label matches `label`\n\
+     • set_value  — type `value` into the field matching `label` (omit label = first field)\n\
+     • done       — goal achieved; put a short result in `summary`\n\
+     • fail       — goal cannot be achieved; explain in `summary`\n\
+     \n\
+     Rules:\n\
+     - Pressing a LIST ROW or SEARCH RESULT usually only selects/opens it. To \
+     trigger playback or submission you must then press the actual action button \
+     (e.g. open a song, THEN press its 'Play'). After such a press, `list` again \
+     to see the new screen.\n\
+     - Prefer an exact label match. Keep `filter` specific so the snapshot stays small.\n\
+     - Output JSON only — no prose, no code fences."
+        .to_string()
+}
+
+/// Turn a perceive snapshot into compact prompt text: a header line followed by
+/// one `[role] label` line per element, capped at `MAX_SNAPSHOT` entries.
+fn render_snapshot(app: &str, filter: &str, elements: &[ax::AXElement]) -> String {
+    let total = elements.len();
+    if total == 0 {
+        return format!(
+            "App '{app}' shows no elements matching filter '{filter}' (it may still be \
+             loading, or needs launching)."
+        );
+    }
+    let shown = total.min(MAX_SNAPSHOT);
+    let header = format!(
+        "App '{app}' elements (filter '{filter}', showing {shown} of {total}):\n"
+    );
+    // `enabled` is deliberately left out: AXEnabled is unreliable per-app (Apple Music
+    // marks pressable rows disabled), so it would steer the model off real controls.
+    elements[..shown].iter().fold(header, |mut text, el| {
+        text.push_str(&format!("  [{}] {}\n", el.role, el.label));
+        text
+    })
+}
+
+/// Parse one action from raw model text, tolerating code fences and surrounding
+/// prose by decoding the first balanced `{...}` block. Returns `Err` so the
+/// caller can issue a single repair retry before giving up — we never *act* on
+/// an unparseable guess (tracker §1.13 hallucination lesson).
+fn parse_action(raw: &str) -> Result<Action, String> {
+    let trimmed = raw.trim();
+    if let Ok(a) = serde_json::from_str::<Action>(trimmed) {
+        return Ok(a);
+    }
+    // Stream-decode exactly one value from the first `{`: string-aware, and any
+    // trailing prose/fences (even ones containing `}`) are simply left unread.
+    if let Some(tail) = trimmed.find('{').map(|start| &trimmed[start..]) {
+        let de = serde_json::Deserializer::from_str(tail);
+        if let Some(Ok(a)) = de.into_iter::<Action>().next() {
+            return Ok(a);
+        }
+    }
+    Err(format!(
+        "could not parse an action from model output: {trimmed:?}"
+    ))
+}
+
+/// Run the automation loop until the goal is met, it fails, or the step budget
+/// is exhausted.
+pub async fn run(
+    app: &str,
+    goal: &str,
+    backend: &dyn AutomateBackend,
+    opts: AutomateOptions,
+) -> AutomateOutcome {
+    log::info!(
+        "{LOG_PREFIX} ▶ run app={app:?} goal={goal:?} budget={}",
+        opts.step_budget
+    );
+
+    // Foreground the target app FIRST, always. This guarantees the app is
+    // frontmost before we perceive or act — so AX reads the right window and any
+    // synthetic input (keyboard/mouse) lands on it, not on OpenHuman's own
+    // window (which is what crashed CEF in §1.8). `act_launch` is `open -a`,
+    // which both opens and activates; idempotent if already running.
+    match backend.act_launch(app).await {
+        Ok(m) => log::info!("{LOG_PREFIX} foregrounded: {m}"),
+        Err(e) => log::warn!("{LOG_PREFIX} foreground failed for {app:?}: {e}"),
+    }
+    backend.settle(app).await;
+
+    // Deterministic accelerator: if a known app + intent has a proven native
+    // sequence, run it first. On `None` (no fast-path) or a failed fast-path we
+    // fall through to the general model-driven loop — so the fast-path can only
+    // help, never block. (Structurally different from the removed `play_music`
+    // tool, §1.13: this is internal to `automate`, not a tool the LLM selects.)
+    if let Some(outcome) = super::app_fastpaths::try_fastpath(app, goal, backend).await {
+        if outcome.success {
+            log::info!("{LOG_PREFIX} fast-path succeeded for app={app:?}");
+            return outcome;
+        }
+        log::info!("{LOG_PREFIX} fast-path did not complete; falling through to general loop");
+    }
+
+    let system = system_prompt();
+    let mut steps: Vec<String> = Vec::new();
+    let mut last_filter = String::new();
+    // One repair retry budget for unparseable model output.
+    let mut repair_left = 1u32;
+    // No-progress guard: track the last actionable signature so a model that
+    // keeps issuing the same call (e.g. pressing 'Search' over and over) bails
+    // instead of burning the whole step budget.
+    let mut last_sig = String::new();
+    let mut repeat_count = 0u32;
+
+    for step in 0..opts.step_budget {
+        // ── perceive ──
+        let snapshot = match backend.perceive(app, &last_filter).await {
+            Ok(els) => render_snapshot(app, &last_filter, &els),
+            Err(e) => {
+                log::warn!("{LOG_PREFIX} perceive failed: {e}");
+                format!("(perceive error: {e})")
+            }
+        };
+
+        // ── decide ──
+        let user = format!(
+            "Goal: {goal}\nApp: {app}\n\nCurrent screen:\n{snapshot}\n\nSteps so far:\n{}\n\n\
+             Reply with the next single JSON action.",
+            if steps.is_empty() {
+                "  (none yet)".to_string()
+            } else {
+                steps
+                    .iter()
+                    .map(|s| format!("  - {s}"))
+                    .collect::<Vec<_>>()
+                    .join("\n")
+            }
+        );
+        let raw = match backend.decide(&system, &user).await {
+            Ok(t) => t,
+            Err(e) => {
+                log::warn!("{LOG_PREFIX} decide failed: {e}");
+                return AutomateOutcome::fail(format!("decision model error: {e}"), steps);
+            }
+        };
+
+        let action = match parse_action(&raw) {
+            Ok(a) => a,
+            Err(e) => {
+                if repair_left > 0 {
+                    repair_left -= 1;
+                    log::warn!("{LOG_PREFIX} step={step} unparseable action, retrying: {e}");
+                    steps.push(format!("(model produced unparseable output; retried)"));
+                    continue;
+                }
+                return AutomateOutcome::fail(format!("model output unparseable: {e}"), steps);
+            }
+        };
+
+        let target_app = action
+            .app
+            .as_deref()
+            .filter(|s| !s.is_empty())
+            .unwrap_or(app);
+        log::info!(
+            "{LOG_PREFIX} step={step} action={:?} app={target_app:?} label={:?} filter={:?}",
+            action.action,
+            action.label,
+            action.filter
+        );
+
+        // ── no-progress guard ──
+        if !matches!(action.action.as_str(), "done" | "fail") {
+            let sig = format!("{}|{}|{}", action.action, action.label, action.filter);
+            if sig == last_sig {
+                repeat_count += 1;
+            } else {
+                repeat_count = 0;
+                last_sig = sig;
+            }
+            // initial + 2 repeats = 3 identical actions in a row.
+            if repeat_count >= 2 {
+                log::warn!("{LOG_PREFIX} no progress: action repeated 3× ({last_sig}); aborting");
+                steps.push(format!(
+                    "aborted: repeated '{}' 3× with no progress",
+                    action.action
+                ));
+                return AutomateOutcome::fail(
+                    "Got stuck repeating the same action with no progress.",
+                    steps,
+                );
+            }
+        }
+
+        // ── act ──
+        match action.action.as_str() {
+            "done" => {
+                let summary = if action.summary.is_empty() {
+                    "Goal completed.".to_string()
+                } else {
+                    action.summary.clone()
+                };
+                log::info!("{LOG_PREFIX} ✓ done: {summary}");
+                progress(&summary, OverlayAttentionTone::Success);
+                return AutomateOutcome {
+                    success: true,
+                    summary,
+                    steps,
+                };
+            }
+            "fail" => {
+                let summary = if action.summary.is_empty() {
+                    "Goal could not be completed.".to_string()
+                } else {
+                    action.summary.clone()
+                };
+                log::info!("{LOG_PREFIX} ✗ model gave up: {summary}");
+                progress(&summary, OverlayAttentionTone::Neutral);
+                return AutomateOutcome::fail(summary, steps);
+            }
+            "list" => {
+                last_filter = action.filter.clone();
+                steps.push(format!("list filter={:?}", last_filter));
+            }
+            "launch" => {
+                progress(
+                    format!("Opening {target_app}…"),
+                    OverlayAttentionTone::Accent,
+                );
+                match backend.act_launch(target_app).await {
+                    Ok(msg) => steps.push(format!("launch: {msg}")),
+                    Err(e) => steps.push(format!("launch FAILED: {e}")),
+                }
+                backend.settle(target_app).await;
+            }
+            "press" => {
+                if action.label.trim().is_empty() {
+                    steps.push("press skipped: empty label".to_string());
+                    continue;
+                }
+                progress(
+                    format!("Pressing {}…", action.label),
+                    OverlayAttentionTone::Accent,
+                );
+                match backend.act_press(target_app, &action.label).await {
+                    Ok(msg) => steps.push(format!("press: {msg}")),
+                    Err(e) => steps.push(format!("press FAILED: {e}")),
+                }
+                backend.settle(target_app).await;
+            }
+            "set_value" => {
+                if action.value.is_empty() {
+                    steps.push("set_value skipped: empty value".to_string());
+                    continue;
+                }
+                progress("Typing…", OverlayAttentionTone::Accent);
+                match backend
+                    .act_set_value(target_app, &action.label, &action.value)
+                    .await
+                {
+                    Ok(msg) => steps.push(format!("set_value: {msg}")),
+                    Err(e) => steps.push(format!("set_value FAILED: {e}")),
+                }
+                backend.settle(target_app).await;
+            }
+            other => {
+                steps.push(format!("unknown action {other:?} ignored"));
+            }
+        }
+    }
+
+    log::info!("{LOG_PREFIX} step budget ({}) exhausted", opts.step_budget);
+    AutomateOutcome::fail(
+        format!(
+            "Step budget ({}) exhausted before the goal was confirmed complete.",
+            opts.step_budget
+        ),
+        steps,
+    )
+}
+
+/// Production [`AutomateBackend`]: real AX primitives plus a fast LLM for decisions.
+pub struct RealBackend {
+    config: crate::openhuman::config::Config, // handed to `create_chat_provider` by `decide`
+}
+
+impl RealBackend {
+    pub fn new(config: crate::openhuman::config::Config) -> Self {
+        RealBackend { config } // stores the config as given; nothing else happens here
+    }
+}
+
+#[async_trait]
+impl AutomateBackend for RealBackend {
+    /// List `app`'s elements matching `filter` via `ax::ax_list_elements_filtered`.
+    async fn perceive(&self, app: &str, filter: &str) -> Result<Vec<ax::AXElement>, String> {
+        ax::ax_list_elements_filtered(app, filter)
+    }
+
+    /// Ask the fast model for the next action and return its raw reply text.
+    async fn decide(&self, system: &str, user: &str) -> Result<String, String> {
+        // Fast tier: the `memory` role maps to `memory_provider` — a cheap,
+        // quick model class. A dedicated `automation` provider knob is a
+        // follow-up (see plan §5); routing through `memory` keeps M1 free of
+        // Config-schema churn while still keeping the chat model out of the loop.
+        let (provider, model) =
+            crate::openhuman::inference::provider::create_chat_provider("memory", &self.config)
+                .map_err(|e| format!("fast-model provider unavailable: {e}"))?;
+        provider
+            .chat_with_system(Some(system), user, &model, 0.0)
+            .await
+            .map_err(|e| format!("fast-model call failed: {e}"))
+    }
+
+    /// Launch `app` through the platform dispatch shared with the launch-app tool.
+    async fn act_launch(&self, app: &str) -> Result<String, String> {
+        crate::openhuman::tools::implementations::system::launch_platform(app).await
+    }
+
+    /// Press the element matching `label` in `app` via `ax::ax_press_element`.
+    async fn act_press(&self, app: &str, label: &str) -> Result<String, String> {
+        ax::ax_press_element(app, label)
+    }
+
+    /// Write `value` into the text field matching `label` (empty = first field).
+    async fn act_set_value(&self, app: &str, label: &str, value: &str) -> Result<String, String> {
+        ax::ax_set_field_value(app, label, value)
+    }
+
+    /// Cross-platform URI opener: macOS `open`, other Unix `xdg-open`, Windows
+    /// `cmd /C start`. Only invoked by fast-paths with app-controlled URLs
+    /// (never user free-text), so there's no untrusted-URL surface here.
+    async fn open_url(&self, url: &str) -> Result<String, String> {
+        #[cfg(target_os = "macos")]
+        let mut cmd = tokio::process::Command::new("open");
+        #[cfg(all(unix, not(target_os = "macos")))]
+        let mut cmd = tokio::process::Command::new("xdg-open");
+        #[cfg(target_os = "windows")]
+        let mut cmd = tokio::process::Command::new("cmd");
+        #[cfg(target_os = "windows")]
+        cmd.args(["/C", "start", ""]);
+        cmd.arg(url);
+        match cmd.output().await {
+            Ok(o) if o.status.success() => Ok(format!("Opened {url}")),
+            Ok(o) => Err(format!(
+                "opener exited {}: {}",
+                o.status,
+                String::from_utf8_lossy(&o.stderr).trim()
+            )),
+            Err(e) => Err(format!("failed to launch opener: {e}")),
+        }
+    }
+
+    /// macOS: ask Apple Music for ground-truth player state. Other OSes can't
+    /// verify this way → None (fast-path treats None as best-effort).
+    async fn verify_playing(&self) -> Option<bool> {
+        #[cfg(target_os = "macos")]
+        {
+            let out = tokio::process::Command::new("osascript")
+                .args(["-e", "tell application \"Music\" to player state as string"])
+                .output()
+                .await
+                .ok()?;
+            // A probe that exited non-zero tells us nothing: "unknown", not "paused".
+            if !out.status.success() {
+                return None;
+            }
+            let state = String::from_utf8_lossy(&out.stdout).trim().to_lowercase();
+            Some(state == "playing")
+        }
+        #[cfg(not(target_os = "macos"))]
+        {
+            None
+        }
+    }
+
+    /// M2: poll the element count until the UI stops changing (≤2s) rather than
+    /// sleeping a blind fixed interval. Removes the timing-race class (tracker
+    /// §1.11/§1.13) — the next perceive sees a settled tree. `ax_wait_settled`
+    /// is blocking (synchronous helper IPC), so it runs off the async runtime.
+    async fn settle(&self, app: &str) {
+        let app = app.to_string();
+        let _ = tokio::task::spawn_blocking(move || {
+            ax::ax_wait_settled(&app, 240, 2000);
+        })
+        .await;
+    }
+}
+
+#[cfg(test)]
+#[path = "automate_tests.rs"]
+mod tests;
diff --git a/src/openhuman/accessibility/automate_tests.rs b/src/openhuman/accessibility/automate_tests.rs
new file mode 100644
index 0000000000..6b169e98b7
--- /dev/null
+++ b/src/openhuman/accessibility/automate_tests.rs
@@ -0,0 +1,266 @@
+//! Unit tests for the `automate` loop. A scripted [`AutomateBackend`] feeds
+//! canned model responses and records every action, so the loop is exercised
+//! with no mic, no AX tree, and no LLM.
+
+use super::*;
+use std::sync::Mutex;
+
+/// Scripted backend: `decide` replays the queued responses, one per call;
+/// `perceive` returns fixed elements, and act/`open_url` calls are recorded.
+struct ScriptedBackend {
+    /// Queued raw model outputs, consumed front-to-back by `decide`.
+    responses: Mutex<std::collections::VecDeque<String>>,
+    /// Elements every `perceive` returns (cloned on each call).
+    elements: Vec<ax::AXElement>,
+    /// Record of act calls (e.g. `press:app:label`), for assertions.
+    acts: Mutex<Vec<String>>,
+    /// Force act_press to error (to exercise the failure-recording path).
+    press_errors: bool,
+}
+
+impl ScriptedBackend {
+    fn new(responses: &[&str]) -> Self {
+        Self {
+            responses: Mutex::new(responses.iter().map(|raw| String::from(*raw)).collect()),
+            elements: vec![
+                ax::AXElement::new("AXButton", "Play"),
+                ax::AXElement::new("AXTextField", "Search"),
+            ],
+            acts: Mutex::default(), // nothing recorded yet
+            press_errors: false,    // tests flip this to make `act_press` fail
+        }
+    }
+    fn acts(&self) -> Vec<String> {
+        self.acts.lock().unwrap().to_vec() // owned copy of everything recorded so far
+    }
+}
+
+#[async_trait]
+impl AutomateBackend for ScriptedBackend {
+    /// Hands back a copy of the canned elements, whatever the app or filter.
+    async fn perceive(&self, _app: &str, _filter: &str) -> Result<Vec<ax::AXElement>, String> {
+        Ok(self.elements.clone())
+    }
+    /// Replays the next scripted model reply.
+    async fn decide(&self, _system: &str, _user: &str) -> Result<String, String> {
+        let queued = self.responses.lock().unwrap().pop_front();
+        // When the script runs dry, keep listing so the budget guard is what
+        // ends the run (rather than a decide error).
+        Ok(queued.unwrap_or_else(|| r#"{"action":"list","filter":""}"#.to_string()))
+    }
+    /// Records the launch; always succeeds.
+    async fn act_launch(&self, app: &str) -> Result<String, String> {
+        self.acts.lock().unwrap().push(format!("launch:{app}"));
+        Ok(format!("Opened '{app}'."))
+    }
+    /// Records the press, then errors when `press_errors` is set.
+    async fn act_press(&self, app: &str, label: &str) -> Result<String, String> {
+        let entry = format!("press:{app}:{label}");
+        self.acts.lock().unwrap().push(entry);
+        if self.press_errors {
+            Err("no such element".into())
+        } else {
+            Ok(format!("Pressed '{label}' in '{app}'."))
+        }
+    }
+    /// Records the field write; always succeeds.
+    async fn act_set_value(&self, app: &str, label: &str, value: &str) -> Result<String, String> {
+        let entry = format!("set_value:{app}:{label}={value}");
+        self.acts.lock().unwrap().push(entry);
+        Ok(format!("Set '{label}' in '{app}'."))
+    }
+    /// Records the URL instead of opening anything.
+    async fn open_url(&self, url: &str) -> Result<String, String> {
+        self.acts.lock().unwrap().push(format!("open_url:{url}"));
+        Ok(format!("Opened {url}"))
+    }
+    /// Settling and waiting are instant no-ops under test.
+    async fn settle(&self, _app: &str) {}
+    async fn wait(&self, _ms: u64) {}
+}
+
+/// Options carrying nothing but the given step budget.
+fn opts(budget: u32) -> AutomateOptions {
+    let step_budget = budget;
+    AutomateOptions { step_budget }
+}
+
+#[tokio::test]
+async fn happy_path_launch_list_press_done() {
+    // A non-fast-path app/goal, so the GENERAL loop is what runs.
+    // run() launches (foregrounds) the app first, so the script needn't.
+    let scripted = ScriptedBackend::new(&[
+        r#"{"action":"list","filter":"Play"}"#,
+        r#"{"action":"press","label":"Play"}"#,
+        r#"{"action":"done","summary":"Playing."}"#,
+    ]);
+    let out = run("Notes", "do a thing", &scripted, opts(8)).await;
+    assert!(out.success, "expected success, got {out:?}");
+    assert_eq!(out.summary, "Playing.");
+    // The leading launch is run()'s foreground-first guarantee.
+    let expected = vec!["launch:Notes", "press:Notes:Play"];
+    assert_eq!(scripted.acts(), expected);
+}
+
+#[tokio::test]
+async fn navigate_then_activate_sequence() {
+    // Two presses in a row: first the row (navigates), then the detail Play.
+    // "Photos" is a non-fast-path app, so the general loop drives both.
+    let scripted = ScriptedBackend::new(&[
+        r#"{"action":"press","label":"Highway to Hell"}"#,
+        r#"{"action":"press","label":"Play"}"#,
+        r#"{"action":"done","summary":"ok"}"#,
+    ]);
+    let out = run("Photos", "open the top album", &scripted, opts(8)).await;
+    assert!(out.success);
+    let recorded = scripted.acts();
+    // Script order must be preserved, behind run()'s own leading launch.
+    let expected = vec![
+        "launch:Photos", // foreground-first
+        "press:Photos:Highway to Hell",
+        "press:Photos:Play",
+    ];
+    assert_eq!(recorded, expected);
+}
+
+#[tokio::test]
+async fn set_value_routes_app_override() {
+    // The action names its own `app`; the write must be routed to that app.
+    let scripted = ScriptedBackend::new(&[
+        r#"{"action":"set_value","app":"Slack","label":"message","value":"hi"}"#,
+        r#"{"action":"done"}"#,
+    ]);
+    let out = run("Slack", "message Steven hi", &scripted, opts(5)).await;
+    assert!(out.success);
+    // run() foregrounds Slack first, then the scripted write follows.
+    let expected = vec!["launch:Slack", "set_value:Slack:message=hi"];
+    assert_eq!(scripted.acts(), expected);
+}
+
+#[tokio::test]
+async fn budget_exhaustion_fails() {
+    // The script never says done, so only the step budget can stop the run.
+    let scripted = ScriptedBackend::new(&[r#"{"action":"list","filter":"x"}"#]);
+    let out = run("Music", "never finishes", &scripted, opts(3)).await;
+    assert!(!out.success);
+    assert!(out.summary.contains("budget"), "got: {}", out.summary);
+}
+
+#[tokio::test]
+async fn no_progress_guard_aborts_repeated_action() {
+    // Model keeps pressing the same control (the live "Search ×11" pathology).
+    let press_search = r#"{"action":"press","label":"Search"}"#;
+    let scripted = ScriptedBackend::new(&[press_search; 4]);
+    let out = run("Photos", "do something", &scripted, opts(10)).await;
+
+    // The run must fail, and for the right reason.
+    assert!(!out.success);
+    assert!(
+        out.summary.contains("stuck repeating"),
+        "got: {}",
+        out.summary
+    );
+
+    // Foreground launch, then two real presses; the 3rd identical action
+    // trips the guard before it is acted on.
+    assert_eq!(
+        scripted.acts(),
+        vec![
+            "launch:Photos",
+            "press:Photos:Search",
+            "press:Photos:Search"
+        ]
+    );
+}
+
+#[tokio::test]
+async fn one_repair_retry_then_succeeds() {
+    // The first reply is unparseable and uses up the single repair retry;
+    // the second is a valid `done`, so the run still succeeds.
+    let script = ["garbage not json", r#"{"action":"done","summary":"recovered"}"#];
+    let scripted = ScriptedBackend::new(&script);
+    let out = run("Music", "g", &scripted, opts(5)).await;
+    assert!(out.success, "should recover after one repair: {out:?}");
+    assert_eq!(out.summary, "recovered");
+}
+
+#[tokio::test]
+async fn two_unparseable_outputs_fail() {
+    let scripted = ScriptedBackend::new(&["garbage one", "garbage two"]);
+    let out = run("Music", "g", &scripted, opts(5)).await; // only one repair retry exists
+    assert!(!out.success);
+    assert!(out.summary.contains("unparseable"), "got: {}", out.summary);
+}
+
+#[tokio::test]
+async fn explicit_fail_action_propagates() {
+    let scripted = ScriptedBackend::new(&[r#"{"action":"fail","summary":"app not installed"}"#]);
+    let out = run("Music", "x", &scripted, opts(5)).await;
+    assert!(!out.success);
+    assert_eq!(out.summary, "app not installed"); // the model's own summary, verbatim
+}
+
+#[tokio::test]
+async fn press_failure_is_recorded_not_fatal() {
+    // Same scripted backend as elsewhere, but with every press forced to fail.
+    let scripted = ScriptedBackend {
+        press_errors: true,
+        ..ScriptedBackend::new(&[
+            r#"{"action":"press","label":"Play"}"#,
+            r#"{"action":"done","summary":"tried"}"#,
+        ])
+    };
+    let out = run("Music", "x", &scripted, opts(5)).await;
+    assert!(out.success); // the run continues; the press failure only lands in `steps`
+    let recorded_failure = out.steps.iter().any(|s| s.contains("press FAILED"));
+    assert!(recorded_failure, "steps: {:?}", out.steps);
+}
+
+#[test]
+fn parse_action_plain_json() {
+    // Bare JSON with no fence or prose around it must parse as-is.
+    let parsed = parse_action(r#"{"action":"press","label":"Play"}"#).unwrap();
+    assert_eq!((parsed.action.as_str(), parsed.label.as_str()), ("press", "Play"));
+}
+
+#[test]
+fn parse_action_strips_code_fence_and_prose() {
+    // Chatty preamble plus a fenced `json` block around the payload.
+    let raw = "Sure!\n```json\n{\"action\":\"done\",\"summary\":\"ok\"}\n```\n";
+    let parsed = parse_action(raw).unwrap();
+    assert_eq!((parsed.action.as_str(), parsed.summary.as_str()), ("done", "ok"));
+}
+
+#[test]
+fn parse_action_rejects_garbage() {
+    let rejected = |raw: &str| parse_action(raw).is_err();
+    assert!(rejected("not json at all") && rejected(""));
+}
+
+#[test]
+fn render_snapshot_caps_and_labels() {
+    // 100 matching buttons go in; the snapshot must stop at the 40-element cap.
+    let button = |i: u32| ax::AXElement::new("AXButton", format!("btn{i}"));
+    let buttons: Vec<ax::AXElement> = (0..100).map(button).collect();
+    let snapshot = render_snapshot("Music", "btn", &buttons);
+    assert!(snapshot.contains("showing 40 of 100"));
+    assert!(snapshot.contains("btn0"));
+    assert!(!snapshot.contains("btn50"), "should be capped at 40");
+}
+
+#[test]
+fn render_snapshot_does_not_annotate_enabled() {
+    // AXEnabled is unreliable per-app; surfacing it in the snapshot would
+    // mislead the model into avoiding controls that are in fact pressable.
+    let mut greyed_out = ax::AXElement::new("AXButton", "Play");
+    greyed_out.enabled = Some(false);
+    let s = render_snapshot("Music", "", &[greyed_out]);
+    assert!(!s.contains("disabled"), "got: {s}");
+    assert!(s.contains("[AXButton] Play"));
+}
+
+#[test]
+fn render_snapshot_empty_hint() {
+    let snapshot = render_snapshot("Music", "zzz", &[]); // no elements were perceived
+    assert!(snapshot.contains("no elements"));
+}
diff --git a/src/openhuman/accessibility/ax_interact.rs b/src/openhuman/accessibility/ax_interact.rs
index dda9724e05..cb3ad21bb0 100644
--- a/src/openhuman/accessibility/ax_interact.rs
+++ b/src/openhuman/accessibility/ax_interact.rs
@@ -21,10 +21,68 @@ mod tests;
 #[path = "uia_interact_tests.rs"]
 mod uia_tests;
 
-#[derive(Debug, Clone, Deserialize)]
+// OS-independent unit tests for the pure settle predicate. The sibling
+// `ax_interact_tests.rs` is macOS-only + #[ignore] (needs a live app); these
+// run on every platform, which keeps the settle logic covered in CI.
+#[cfg(test)]
+mod settle_tests {
+    use super::counts_settled;
+
+    #[test]
+    fn not_settled_until_enough_samples() {
+        let too_short: [&[usize]; 2] = [&[5], &[5, 5]];
+        assert!(too_short.iter().all(|h| !counts_settled(h, 3)));
+    }
+
+    #[test]
+    fn settled_when_tail_is_constant() {
+        assert!(counts_settled(&[1, 4, 7, 7, 7], 3));
+    }
+
+    #[test]
+    fn not_settled_when_still_changing() {
+        let churning: [[usize; 3]; 2] = [[7, 7, 8], [2, 4, 6]];
+        assert!(churning.iter().all(|h| !counts_settled(h, 3)));
+    }
+
+    #[test]
+    fn zero_or_one_required_settles_immediately() {
+        // `0` is treated like `1`: a single sample is already enough.
+        assert!([1, 0].iter().all(|&need| counts_settled(&[9], need)));
+    }
+
+    #[test]
+    fn only_the_tail_matters() {
+        // Earlier churn is irrelevant once the last `need` samples agree.
+        assert!(counts_settled(&[0, 99, 3, 3], 2));
+    }
+}
+
+#[derive(Debug, Clone, Default, Deserialize)]
 pub struct AXElement {
     pub role: String,
     pub label: String,
+    /// The control's reported `AXEnabled` state; `None` when the backend does
+    /// not supply it (the Windows UIA lister currently always sends `None`).
+    ///
+    /// **Informational only — do NOT gate pressing on this.** Empirically unreliable
+    /// per-app: Apple Music reports its search-result rows as `Some(false)` even though
+    /// `AXPress` on them works. Kept for diagnostics and for apps that report it
+    /// faithfully; matchers must not skip elements solely because this is `Some(false)`.
+    #[serde(default)]
+    pub enabled: Option<bool>,
+}
+
+impl AXElement {
+    /// Build an element from its role and label, leaving `enabled` as `None`
+    /// (unknown). Call sites stay short and are shielded from any optional
+    /// fields added later.
+    pub fn new(role: impl Into<String>, label: impl Into<String>) -> Self {
+        let role = role.into();
+        let label = label.into();
+        let enabled = None;
+        Self { role, label, enabled }
+    }
 }
 
 /// List interactive UI elements (buttons, text fields, checkboxes, …) in `app_name`.
@@ -112,6 +170,64 @@ pub fn ax_press_element(app_name: &str, label: &str) -> Result<String, String> {
     }
 }
 
+/// Report whether the UI has settled, judging only by a rolling history of
+/// element counts: true when at least `stable_samples` samples exist and the
+/// newest `stable_samples` of them are all the same value. Kept pure so tests
+/// need neither an AX backend nor a real clock.
+///
+/// A `stable_samples` of `0` or `1` settles as soon as one sample is present.
+pub(crate) fn counts_settled(history: &[usize], stable_samples: usize) -> bool {
+    let window = stable_samples.max(1);
+    // Too few samples yet: `checked_sub` yields `None` instead of underflowing.
+    let Some(start) = history.len().checked_sub(window) else {
+        return false;
+    };
+    history[start..].windows(2).all(|pair| pair[0] == pair[1])
+}
+
+/// Wait (blocking) until the number of interactive elements in `app_name` has
+/// stayed constant for roughly `stable_ms`, or until `timeout_ms` has passed,
+/// and return the last count that was sampled.
+///
+/// The `automate` loop uses this as its **settle** step: right after a press /
+/// type / launch the UI is still rendering, and perceiving it too early caused
+/// the timing races tracked in §1.11/§1.13. Watching the count level off
+/// replaces a blind fixed sleep and is portable, because it samples through
+/// `ax_list_elements`, which cfg-dispatches to macOS AX or Windows UIA.
+///
+/// Uses `std::thread::sleep` and synchronous helper IPC, so async callers
+/// should wrap it in `spawn_blocking`. An AXObserver-driven settle could later
+/// replace the polling behind this same signature.
+pub fn ax_wait_settled(app_name: &str, stable_ms: u64, timeout_ms: u64) -> usize {
+    use std::time::{Duration, Instant};
+    // Sampling cadence; the tree counts as settled once the count has held for
+    // max(2, ceil(stable_ms / POLL_MS)) consecutive samples.
+    const POLL_MS: u64 = 80;
+    let needed = stable_ms.div_ceil(POLL_MS).max(2) as usize;
+    let give_up_at = Instant::now() + Duration::from_millis(timeout_ms);
+    let mut samples: Vec<usize> = Vec::new();
+
+    loop {
+        // A failed listing is recorded as a count of 0 rather than ending the wait.
+        let count = ax_list_elements(app_name).map_or(0, |els| els.len());
+        samples.push(count);
+        if counts_settled(&samples, needed) {
+            log::debug!(
+                "[ax_interact] settle: '{app_name}' stable at {count} elements after {} samples",
+                samples.len()
+            );
+            break count;
+        } else if Instant::now() >= give_up_at {
+            log::debug!(
+                "[ax_interact] settle: '{app_name}' timed out after {} samples (last count={count})",
+                samples.len()
+            );
+            break count;
+        }
+        std::thread::sleep(Duration::from_millis(POLL_MS));
+    }
+}
+
 /// Set the value of the first text field in `app_name` whose label contains `label`.
 /// Pass an empty `label` to target the first available text field.
 pub fn ax_set_field_value(app_name: &str, label: &str, value: &str) -> Result<String, String> {
diff --git a/src/openhuman/accessibility/ax_interact_tests.rs b/src/openhuman/accessibility/ax_interact_tests.rs
index 89f57906e7..ccb123dec2 100644
--- a/src/openhuman/accessibility/ax_interact_tests.rs
+++ b/src/openhuman/accessibility/ax_interact_tests.rs
@@ -165,3 +165,24 @@ fn test_ax_press_nonexistent_app() {
     let result = ax_press_element("NonExistentApp12345", "Play");
     assert!(result.is_err());
 }
+
+/// Manual AX dump: `AX_PROBE_APP="Slack" cargo test ax_probe_app -- --ignored --nocapture`.
+/// Opens the app, waits 4 s, then prints up to 80 of the interactive elements it
+/// exposes through the macOS Accessibility API. Meant for diagnosing Electron apps
+/// (Slack/Discord), whose tree may be empty unless accessibility is enabled.
+#[test]
+#[ignore = "manual AX probe — set AX_PROBE_APP"]
+fn ax_probe_app() {
+    let app = std::env::var("AX_PROBE_APP").unwrap_or_else(|_| String::from("Slack"));
+    let _ = Command::new("open").arg("-a").arg(&app).status();
+    sleep(Duration::from_secs(4));
+    match ax_list_elements(&app) {
+        Err(e) => println!("[ax_probe] {app}: ERROR {e}"),
+        Ok(els) => {
+            println!("[ax_probe] {app}: {} interactive elements", els.len());
+            for el in els.iter().take(80) {
+                println!("   [{}] {}", el.role, el.label);
+            }
+        }
+    }
+}
diff --git a/src/openhuman/accessibility/helper.rs b/src/openhuman/accessibility/helper.rs
index c271915aeb..97c9c1fbd7 100644
--- a/src/openhuman/accessibility/helper.rs
+++ b/src/openhuman/accessibility/helper.rs
@@ -693,16 +693,27 @@ func axListElements(appName: String, id: String?) -> [String: Any] {
         "AXCheckBox", "AXRadioButton", "AXSlider", "AXPopUpButton",
         "AXComboBox", "AXLink", "AXTab"
     ]
-    var elements: [[String: String]] = []
-    axWalk(axApp, maxDepth: 10) { _, role, label in
+    var elements: [[String: Any]] = []
+    axWalk(axApp, maxDepth: 10) { el, role, label in
         if interactiveRoles.contains(role) && !label.isEmpty {
-            elements.append(["role": role, "label": label])
+            elements.append(["role": role, "label": label, "enabled": axEnabled(el)])
         }
         return false
     }
     return ["type": "ax_list", "id": id ?? "", "ok": true, "error": NSNull(), "elements": elements]
 }
 
+/// Report an element's AXEnabled flag. A missing (or non-Bool) attribute counts as
+/// enabled: most static/text elements don't expose it and shouldn't be hidden.
+func axEnabled(_ element: AXUIElement) -> Bool {
+    var raw: AnyObject?
+    let status = AXUIElementCopyAttributeValue(element, kAXEnabledAttribute as CFString, &raw)
+    guard status == .success, let flag = raw as? Bool else {
+        return true
+    }
+    return flag
+}
+
 /// Collect all AX elements whose label contains `label` (case-insensitive).
 /// Returns matches sorted exact-first so "Play" beats "Playlist".
 struct AXCandidate {
diff --git a/src/openhuman/accessibility/mod.rs b/src/openhuman/accessibility/mod.rs
index ccd1dd1841..d90a7cb40c 100644
--- a/src/openhuman/accessibility/mod.rs
+++ b/src/openhuman/accessibility/mod.rs
@@ -5,6 +5,8 @@
 //! Consumer modules (autocomplete, screen_intelligence, voice) call into this module
 //! instead of owning platform-specific code directly.
 
+pub mod app_fastpaths;
+pub mod automate;
 mod automation_state;
 pub mod ax_interact;
 mod capture;
diff --git a/src/openhuman/accessibility/uia_interact.rs b/src/openhuman/accessibility/uia_interact.rs
index 4ec1060233..cfce495fb9 100644
--- a/src/openhuman/accessibility/uia_interact.rs
+++ b/src/openhuman/accessibility/uia_interact.rs
@@ -217,6 +217,9 @@ pub fn list(app_name: &str, filter: &str) -> Result<Vec<AXElement>, String> {
         out.push(AXElement {
             role: format!("{ct:?}"),
             label,
+            // TODO(windows): populate from UIA `IsEnabled` once verified on a
+            // Windows box; `None` = "assume enabled" (current behaviour).
+            enabled: None,
         });
     }
 
diff --git a/src/openhuman/tools/impl/system/launch_app.rs b/src/openhuman/tools/impl/system/launch_app.rs
index c423a34449..7766c2f833 100644
--- a/src/openhuman/tools/impl/system/launch_app.rs
+++ b/src/openhuman/tools/impl/system/launch_app.rs
@@ -176,7 +176,11 @@ impl Tool for LaunchAppTool {
 }
 
 /// Platform-specific launch dispatch. Returns a human-readable success message.
-async fn launch_platform(app_name: &str) -> Result<String, String> {
+///
+/// `pub(crate)` so the `automate` inner loop (`accessibility::automate`) can
+/// launch an app as one of its steps without duplicating the platform branches
+/// or routing back through the full tool surface.
+pub(crate) async fn launch_platform(app_name: &str) -> Result<String, String> {
     log::info!(
         "[launch_app] platform={} dispatching launch for app_name={app_name:?}",
         std::env::consts::OS
diff --git a/src/openhuman/tools/impl/system/mod.rs b/src/openhuman/tools/impl/system/mod.rs
index 118a567b30..a532e1f713 100644
--- a/src/openhuman/tools/impl/system/mod.rs
+++ b/src/openhuman/tools/impl/system/mod.rs
@@ -20,6 +20,8 @@ pub use detect_tools::DetectToolsTool;
 pub use insert_sql_record::InsertSqlRecordTool;
 pub use install_tool::InstallToolTool;
 pub use launch_app::LaunchAppTool;
+// Reused by the `automate` inner loop to launch an app mid-flow.
+pub(crate) use launch_app::launch_platform;
 pub use lsp::{lsp_capability_enabled, LspTool, LSP_ENABLED_ENV};
 pub use node_exec::NodeExecTool;
 pub use npm_exec::NpmExecTool;

From 608f177a6f02d43cfde904292090d03cdbcfdfd9 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 14:18:19 +0530
Subject: [PATCH 3/9] feat(agent): wire automate/ax_interact computer tools
 into the orchestrator

Registers the AutomateTool (multi-step UI flows in one call) and the
ax_interact denylist/opt-in plumbing; adds the catalog toggle, tool
definition, and orchestrator prompt guidance (automate + screenshot/
mouse/keyboard fallback for Electron apps with empty AX trees).

Slice 3/7 of #3307 (tool wiring + prompts).
---
 app/src/utils/toolDefinitions.ts              |   9 +
 docs/voice-system-actions.md                  | 166 ++++++++++++-
 .../agents/orchestrator/agent.toml            |  17 ++
 .../agents/orchestrator/prompt.md             |  19 ++
 src/openhuman/tools/impl/computer/automate.rs | 223 ++++++++++++++++++
 .../tools/impl/computer/ax_interact.rs        |  12 +-
 src/openhuman/tools/impl/computer/mod.rs      |   2 +
 src/openhuman/tools/ops.rs                    |   6 +
 src/openhuman/tools/user_filter.rs            |   7 +
 9 files changed, 445 insertions(+), 16 deletions(-)
 create mode 100644 src/openhuman/tools/impl/computer/automate.rs

diff --git a/app/src/utils/toolDefinitions.ts b/app/src/utils/toolDefinitions.ts
index 01213cd1f2..4dfa74ab02 100644
--- a/app/src/utils/toolDefinitions.ts
+++ b/app/src/utils/toolDefinitions.ts
@@ -45,6 +45,15 @@ export const TOOL_CATALOG: ToolDefinition[] = [
     defaultEnabled: true,
     rustToolNames: ['ax_interact'],
   },
+  {
+    id: 'automate',
+    displayName: 'App Automation',
+    description:
+      'Accomplish a multi-step goal in an app in one go (e.g. "play a song in Music", "message someone in Slack") — the agent drives the UI step by step.',
+    category: 'System',
+    defaultEnabled: true,
+    rustToolNames: ['automate'],
+  },
   {
     id: 'git_operations',
     displayName: 'Git Operations',
diff --git a/docs/voice-system-actions.md b/docs/voice-system-actions.md
index 92133e37c1..4c10f9b6f4 100644
--- a/docs/voice-system-actions.md
+++ b/docs/voice-system-actions.md
@@ -268,6 +268,112 @@ test ... ok
 
 ---
 
+### Change 1.14 — `automate(app, goal)`: Rust-driven multi-step automation 🔨 In progress (M1–M5 landed)
+
+**Status:** 🔨 In progress — **M1 + M2 + M3 shipped and M3 proven live on macOS**; M4 (notch progress streaming) and M5 (`enabled` plumbing, informational only) have also landed — see below; M6 pending. See **Phase 1.5** below and [`voice-automate-plan.md`](voice-automate-plan.md).
+
+**Agent-in-the-loop fixes (2026-06-03, from two live chat sessions):**
+- **Mutations were off** — the agent correctly called `automate` but it (and `ax_interact`) refused because `computer_control.ax_interact_mutations=false`. Enabled it; also rewrote both refusal messages to point at **Settings → Agent Access** instead of a config key (the agent had relayed "controls are locked down").
+- **Query mis-parse** — orchestrator goal `…search for "Highway to Hell" by AC/DC, and play it` made the after-"play" parser extract `"it"`. `extract_play_query` now prefers a **quoted title + `by <artist>`** and rejects bare pronouns. (Unit-tested with the exact failing goal.)
+- **General loop spun** — pressed "Search" 11× to budget exhaustion. Added a **no-progress guard**: 3 identical actions in a row → abort.
+- **Search-results timing** — the fast-path's retry burned out before catalog results rendered (`settle` reports count-stable while the network fetch is pending). Added a real, mockable `wait` between attempts (6 × ~800ms).
+
+**M5 finding — AXEnabled is unreliable:** plumbed an `enabled` field end-to-end (Swift `axEnabled` → `AXElement.enabled` → Windows stub), but Apple Music reports its **pressable** search-result rows as `enabled=Some(false)`. Gating `pick_row` on it broke playback. So `enabled` is kept **informational only** (documented on the struct); matchers never skip on it. The better future actionability signal is AXPress-action support, not AXEnabled.
+
+**M4 — live progress in the notch (2026-06-03):** the notch indicator (originally PR #3166) was cherry-picked onto this branch (`feat(notch)` + fmt commits → `notch_window.rs` NSPanel + `notch/NotchApp.tsx`, auto-shown on startup, transparent when idle). The `automate` loop and Music fast-path now call `overlay::publish_attention(...)` at each step (`Opening …`, `Searching Music for …`, `Pressing …`, `Typing …`, `Playing …`, plus done/fail), which the existing Socket.IO bridge emits as `overlay:attention` and the notch renders as a pill — so the user sees the automation happening live. Verified: app boots with `[notch-window] panel shown at top-center`; Tauri shell + frontend compile; 31 automate unit tests green.
+
+**M3 live proof (2026-06-03):** `music_fastpath_live` drives real Apple Music end-to-end and **hard-asserts `player state == playing`** — confirmed: pre-state `paused` → post-state `playing`. Three bugs the live runs surfaced, all fixed + tested:
+1. **Perceive filter was the whole multi-word query** — a substring filter can't match a full title → now filters by the first strong token and `pick_row` does the token match.
+2. **Search results render late (§1.13 race)** — retry perceive across up to 4 settles; `AC/DC` now percent-encodes correctly.
+3. **False success: pressed the toolbar Play, not the song's** — the first run reported success but *nothing played*. AX probing showed the search screen has only the toolbar transport Play (empty queue → silence); pressing the song row navigates to a detail page where a **second** Play appears (23→24 controls). Fix: capture the baseline Play count, **wait for the detail Play to render**, press it, then **verify real playback** via `osascript … player state` and retry (≤3×). Added `verify_playing()` to `AutomateBackend` (macOS osascript; `None` elsewhere = best-effort). `automate` now only reports a play success when audio is actually playing — the false-success class (§1.11) is closed.
+
+**M3 — shipped (Music fast-path):**
+- `src/openhuman/accessibility/app_fastpaths/{mod.rs,music.rs}` (new) — deterministic accelerators consulted by `run()` **before** the general loop. Music encodes the §1.11 proven sequence: launch → open `music://…/search?term=…` → settle → press the song row (navigate) → settle → press the detail-page **Play**. Pure helpers `matches` / `extract_play_query` (handles "play X by Y", "launch Music and play …", "play X in Apple Music").
+- **Structurally different from the removed `play_music` tool (§1.13):** this is *internal* to `automate`, not a tool the LLM selects, and on any failure/`None` the loop **falls through** to the general model-driven path — so it can only help. Added `open_url` to the `AutomateBackend` trait (cross-platform opener; fast-path only).
+- **Tests:** 9 unit (parser cases, full scripted sequence, no-row fallthrough, dispatch) + 1 `#[ignore]` macOS live test. **Live proof on a Mac:** `cargo test --lib music_fastpath_live -- --ignored --nocapture` (needs Music + Accessibility permission).
+
+**M2 — shipped (real settle):**
+- `src/openhuman/accessibility/ax_interact.rs` — new `ax_wait_settled(app, stable_ms, timeout_ms)`: polls the app's interactive-element count and returns once it holds steady for `stable_ms` (or `timeout_ms` elapses). Portable — rides on `ax_list_elements`, which already cfg-dispatches (macOS AX / Windows UIA). Pure decision core `counts_settled(history, n)` extracted and unit-tested (5 non-OS-gated tests).
+- `automate.rs` — `RealBackend::settle` now calls `ax_wait_settled` (240ms stable / 2s cap) via `spawn_blocking` instead of the M1 blind 450ms wait. This is the piece that removes the timing-race failure class (§1.11/§1.13): the next perceive always sees a settled tree. An AXObserver-driven settle can later sit behind the same signature.
+
+**M1 — shipped:**
+- `src/openhuman/accessibility/automate.rs` (new) — the perceive→decide→act→settle loop, generic over an injectable `AutomateBackend` (so the model + AX + launcher are all mockable). Strict JSON action schema (`launch`/`list`/`press`/`set_value`/`done`/`fail`) with a one-shot repair retry on unparseable output (never acts on a hallucinated guess), a step budget (default 12), and a snapshot cap (40 elements) mirroring `ax_interact`'s anti-truncation guard. `RealBackend` calls the existing AX primitives + `launch_platform`, and routes decisions through the **fast tier** (`create_chat_provider("memory", …)` for now; a dedicated `automation_provider` knob is a follow-up). Settle is a short fixed wait in M1 (M2 replaces it with the poll-until-stable `ax_wait_settled`; an AXObserver-driven settle is a later option).
+- `src/openhuman/tools/impl/computer/automate.rs` (new) — `AutomateTool { app, goal }`. Always `Dangerous` + `external_effect` (routes through the ApprovalGate); reuses `ax_interact`'s mutations opt-in (`computer_control.ax_interact_mutations`) and the shared `is_sensitive_app` denylist.
+- Registered everywhere: `tools/ops.rs`, `tools/user_filter.rs` (`automate` family), `orchestrator/agent.toml` (`named`), `app/src/utils/toolDefinitions.ts` (Settings → "App Automation").
+- **Tests:** 18 passing — loop happy path, navigate-then-activate, app override, budget exhaustion, repair retry (1 ok / 2 fail), explicit fail, non-fatal press failure, JSON parse (plain/fenced/garbage), snapshot cap/empty-hint; tool gating (missing args, mutations-off, sensitive-app refusal, schema).
+
+**Problem (the real-time bar):** The user's target is *"whatever I say happens, live, in front of me"* — e.g. *"Launch Music and play Numb by Linkin Park"* or *"open Slack and message Steven 'hi'"*. Today every UI step (`list` → `set_value` → `list` → `press` …) is a **separate chat-LLM turn**. A Slack message is ~7 turns; at 1–3 s each that's 15–25 s, and each turn is a fresh chance to hit a timing race (1.13) or hallucinate. The heavy chat model is sitting *inside* the click loop — the wrong place for it.
+
+**Root causes (all four documented earlier in Phase 1):**
+1. **Timing races** — `list`/`press` do a single AX walk with no settle/wait; the UI hasn't rendered yet (1.11/1.13).
+2. **Navigate-then-activate is re-reasoned every call** — pressing a row selects; you must then press the action control. That logic lives in prose, so it's re-derived (often wrongly) each turn (1.10/1.11).
+3. **Round-trip explosion** — N full chat turns per task = latency + cost + N chances to fail.
+4. **Weak element model + no verification** — `list` returns flat `[role, label]`; `press` reports success on `AXAction == .success` even when nothing changed.
+
+**Design — take the chat model out of the click loop:**
+- **New tool `automate { app, goal }`** — one call from the orchestrator. Rust then runs a tight **perceive → act → verify** loop internally: read a *filtered* AX snapshot → pick the next action → act → **wait for the UI to settle (AXObserver, not fixed sleeps)** → verify it took effect → repeat until the goal is met or a step budget is hit.
+- **A fast model drives the inner loop** (Haiku-class) with a *tiny* context: just the goal, the current small AX snapshot, and the last result — not the whole conversation. Each inner step is ~0.5–1 s and self-corrects, instead of one 3 s chat turn that falsely reports success.
+- **Settle + verify in Rust** between steps — deterministic, kills the timing-race class in one place.
+- **Native fast-paths for high-value apps** (skip the UI entirely where possible):
+  - **Music** — `music://` search URL → AX play (already explored in 1.11), or AppleScript for library.
+  - **Spotify** — Web API search → `spotify:track:…` URI + AppleScript `play`. Fully deterministic, no UI poking.
+  - **Slack** — deep link `slack://channel?…` to open the DM, then AX to type + send.
+  The general AX loop is the fallback for everything else.
+- **Vision fallback for Electron/Chromium apps** (Slack, Discord, VS Code, Spotify-desktop) whose AX/UIA tree is partial (documented limitation). Slack needs accessibility enabled (`defaults write com.tinyspeck.slackmacgap AccessibilityEnabled -bool true`, relaunch). Where AX returns empty, fall back to **screenshot → vision-locate → guarded click**. This is the reverted CGEventPost path (1.8) — but it crashed only when events hit *OpenHuman's own focused CEF window*; a guarded click into a *different, foregrounded* app does not have that failure mode.
+- **Stream progress events** to the UI / notch pill (PR #3166) so the user sees each step happen live.
+
+**Why a generic `automate`, not per-app tools:** Change 1.13 already established that app-specific tools (`play_music`) are the wrong abstraction. The abstraction that *is* generic is the **navigate-then-activate sequence itself** — `automate(app, goal)` encapsulates it once, in Rust, for every app, instead of asking the chat model to re-orchestrate fragile primitives every time.
+
+---
+
+## Phase 1.5 — Reliable, real-time multi-step automation 🔨 In Progress
+
+> The bridge between today's `ax_interact` primitives and the always-on voice work. **Prerequisite for Phase 3** — fast voice routing into a slow/fragile action loop still feels slow. This is where "whatever I say happens, live" actually gets delivered.
+
+**Detailed implementation plan:** [`voice-automate-plan.md`](voice-automate-plan.md) — decided approach: **Rust inner loop + fast model**, first proof target **Music**.
+
+**Planned files:**
+- `src/openhuman/accessibility/automate.rs` (new) — the perceive→act→verify loop + settle/verify primitives, reusing `ax_interact` helpers.
+- `src/openhuman/accessibility/app_fastpaths/` (new) — per-app deterministic paths (`music.rs`, `spotify.rs`, `slack.rs`), behind a generic dispatch.
+- `src/openhuman/tools/impl/computer/automate.rs` (new) — `AutomateTool { app, goal }`, gated like `ax_interact` (mutations opt-in, sensitive-app denylist reused).
+- macOS helper (`accessibility/helper.rs`) — AXObserver-based settle (`ax_wait_settled`) + post-action verify; richer element model (enabled/onscreen/actions).
+- Vision fallback — screenshot via `accessibility/capture.rs` → locate → guarded click (only when AX tree is empty, target app foregrounded, never OpenHuman's own window).
+
+**Acceptance criteria:**
+- [ ] One `automate{app, goal}` call performs a multi-step flow end-to-end (no per-step chat turns)
+- [ ] Settle/verify removes the timing-race + false-success failure classes (1.11/1.13 do not recur)
+- [ ] Music flow ("play <song>") works end-to-end via the inner loop
+- [ ] Spotify + Slack fast-paths land their action deterministically
+- [ ] Electron/partial-AX apps fall back to vision+guarded-click without the CEF crash
+- [ ] Step-by-step progress streamed to the UI / notch indicator
+
+---
+
+### Change 1.15 — Full computer control (mouse/keyboard/screenshot) ✅ Crash fixed (main-thread dispatch)
+
+**Status:** ✅ Keyboard/mouse now run on the app main thread → no CEF crash. Screenshot downscales for inline view. Live: `[computer] registered main-thread synthetic-input executor` on boot.
+
+**The fix:** the crash was enigo's `TSMGetInputSourceProperty` running on a tokio worker (`_dispatch_assert_queue_fail`/SIGTRAP). macOS TSM must run on the main thread. New `tools/impl/computer/main_thread.rs` (`MainThreadInputOp` + `run_input_on_main`) dispatches each enigo op over the native registry to a handler the Tauri shell registers at startup, which runs it via `AppHandle::run_on_main_thread`. Keyboard + mouse tools no longer `spawn_blocking` enigo on a worker. Headless/CLI (no executor) returns a clear error instead of crashing. 66 keyboard/mouse tests green.
+
+**Goal:** make the agent fully autonomous — when the accessibility tree is empty (Electron apps: Slack/Discord/VS Code), fall back to vision + synthetic input. Enabled `computer_control.enabled`, added `mouse`/`keyboard`/`screenshot` to the orchestrator `named` list + `autonomy.auto_approve`, and taught `prompt.md` a keyboard-first ladder (foreground via `launch_app` → `keyboard type` + Enter; Slack `Cmd+K` recipe).
+
+**Foreground-first:** `automate::run` now `open -a`s the target app at the very start, always, so AX/input hit the right window.
+
+**Screenshot fix:** oversized Retina captures were returned as "too large to base64-encode inline" (the model was blind). Now downscaled to a viewable JPEG (`downscale_to_jpeg`) with reported dimensions.
+
+**THE BLOCKER — `OpenHuman-2026-06-03-170058.ips`:** `EXC_BREAKPOINT/SIGTRAP` on a **`tokio-rt-worker`** thread:
+```
+enigo::macos::get_layoutdependent_keycode → TSMGetInputSourceProperty
+→ dispatch_assert_queue → _dispatch_assert_queue_fail → SIGTRAP
+```
+enigo's keyboard-layout lookup (`TSMGetInputSourceProperty`) **must run on the app's main thread**; the keyboard tool runs on a tokio worker → macOS traps. **Not** a focus issue (same §1.8 root cause). A frontmost-app guard would NOT fix it.
+
+**Fix options considered (historical — the first is what shipped; see "The fix" above):** run enigo on the Tauri **main thread** (`AppHandle::run_on_main_thread`, bridged to the core via a native-registry handler), OR replace enigo's macOS keyboard path with TSM-free primitives (`CGEventKeyboardSetUnicodeString` for text + raw virtual keycodes for keys/hotkeys). Until the main-thread dispatch landed, keyboard/mouse had to stay disabled to avoid crashing the app.
+
+**Tests:** voice-actions + autonomy suite is exhaustive — 220 feature unit tests + a JSON-RPC E2E (`json_rpc_voice_server_settings_roundtrip_always_on_and_wake_word`). The E2E caught + fixed real gaps (`wake_word` missing from the get output and the update RPC path). Screenshot downscale unit-tested.
+
+---
+
 ## Windows port — app interaction 🪟 ✅ Implemented
 
 Phase 1's app-interaction layer is now ported to Windows. The macOS path uses the
@@ -416,19 +522,30 @@ Shipped on the Windows machine (2026-06-02):
 
 ---
 
-## Phase 2 — Always-On Listening ⏳ Not Started
+## Phase 2 — Always-On Listening ✅ Implemented
 
 > Continuous microphone listening without requiring a hotkey press.
 
-**Planned files:**
-- `src/openhuman/voice/always_on.rs` (new) — dedicated tokio task holding the mic open, running VAD, emitting utterances to the STT pipeline
-- `src/openhuman/config/schema/voice_server.rs` — add `always_on_enabled: bool` config flag
-- Privacy hook: pause always-on when screen is locked
+**Shipped:**
+- `src/openhuman/voice/always_on.rs` — pure `VadSegmenter` (onset / silence-hangover / min-speech / max-utterance, 7 unit tests) **plus** the continuous capture loop: a dedicated cpal thread streams 16 kHz mono frames → segmenter → each utterance is encoded (`encode_wav_16k`) → transcribed via `effective_stt_provider` + `create_stt_provider` (originally the local-only `voice_transcribe_bytes`) → `publish_transcription` (so it reaches the agent's auto-send and the notch, exactly like hotkey dictation). Started at boot in `credentials::ops`.
+- `src/openhuman/config/schema/voice_server.rs` — `always_on_enabled` flag + VAD tuning (`vad_onset_threshold`, `vad_hangover_ms`, `vad_min_speech_ms`, `vad_max_utterance_secs`), opt-in/off by default.
+- **Settings toggle** — "Always-on listening" in the **VoicePanel** (Settings → Advanced → AI → Voice; it first shipped in the hidden Voice debug panel), wired through `get/update_voice_server_settings` (RPC patch → apply → snapshot); i18n in en + all 13 locales.
+- **Privacy hook** — `spawn_lock_watcher` pauses capture + resets the segmenter while the screen is locked (macOS via `CGSessionCopyCurrentDictionary`, null/type-safe FFI; other platforms never pause yet).
+- Reused `audio_capture` helpers (`to_mono`/`resample`/`chunk_rms` made `pub(crate)` + new `encode_wav_16k`).
 
 **Acceptance criteria:**
-- [ ] User can speak without pressing any hotkey
-- [ ] VAD detects end of utterance and sends to agent
-- [ ] Toggle in Settings → Voice
+- [x] User can speak without pressing any hotkey
+- [x] VAD detects end of utterance and sends to agent
+- [x] Toggle in Settings → Voice
+
+**Wake word "Hey Tiny" (live-fix, 2026-06-03):** always-on now only delivers an utterance to the agent when its transcript contains the wake word (`config.voice_server.wake_word`, default "Hey Tiny"); the phrase is stripped and the remainder is sent. Tolerant match (case/punctuation/leading-filler), empty wake word = deliver everything. This is a **text-based** wake word (transcribe-then-gate) — a first cut of Phase 3's trigger phrase; it fixes the "sends every utterance" spam but still runs STT on all speech (an on-device audio wake-word model for efficiency is the Phase 3 follow-up).
+
+**Live-fixes found by running it:**
+- **Toggle did nothing** — `always_on_enabled` wasn't in the `update_voice_server_settings` RPC *param schema*, so validation rejected it before the handler. Added it; the config RPC now also calls `always_on::start_if_enabled` so the toggle starts/idles capture **live** (runtime `ENABLED` gate, no restart).
+- **`transcription failed: local ai is disabled`** — always-on used `voice_transcribe_bytes` (local whisper only). Now routes through `effective_stt_provider` + `create_stt_provider` (same factory dispatch as `voice.stt_dispatch`), honoring cloud STT.
+- Toggle surfaced in the reachable **VoicePanel** (Settings → Advanced → AI → Voice), not the hidden debug panel.
+
+**Pending live validation (mic-dependent, can't be CI-tested):** say "Hey Tiny, <command>" and confirm the command reaches the agent; tune `vad_onset_threshold`/`vad_hangover_ms` to the user's mic + room. Windows/Linux screen-lock pause is a follow-up (no signal wired).
 
 ---
 
@@ -457,6 +574,24 @@ Shipped on the Windows machine (2026-06-02):
 
 ---
 
+## Fine-tuning backlog ⏳ (deferred until all phases complete)
+
+From live agent-in-the-loop testing on 2026-06-03 (grounded in `~/.openhuman/logs/openhuman.2026-06-03.log`, `session_raw/*.jsonl`, and the dev run: **keyboard=69 / mouse=0 / screenshot=10** tool calls; **26 wake matches vs 93 misses**; emit=true utterances ranged 0.7s–28s). The feature works but needs tuning. **Do not implement until Phases 3–4 land.**
+
+### F1 — Listening window too short for long commands
+- **Observed:** `vad_hangover_ms = 800` closes an utterance on any pause > 0.8s, so multi-clause commands ("Hey Tiny, open Slack and message the team channel saying …") split across utterances — the tail lacks the wake word and is dropped. Compounded by the notch "Listening" pill TTL (2500ms) expiring mid-speech, so it *looks* like it stopped listening.
+- **Resolve:** (a) raise `vad_hangover_ms` to ~1500ms; (b) **two-stage capture** — once the wake word is detected, open a dedicated longer command window (until a longer silence / N-second cap) instead of relying on a single VAD utterance; (c) keep the "Listening" pill alive for the whole utterance (extend/re-emit on each voiced frame, clear on `SpeechEnd`) so the notch reflects real mic state.
+
+### F2 — Agent uses keyboard only, never the mouse
+- **Observed:** keyboard=69, mouse=0. Two causes: the orchestrator prompt is deliberately *keyboard-first*, **and** the downscaled screenshot's coordinates don't map to screen pixels — the capture is shrunk to ≤1568px while `mouse` expects absolute screen pixels (and Retina is 2× points), so any coordinate read from the image clicks the wrong spot. Vision-driven clicking is therefore currently unsafe and the agent (correctly) avoids it.
+- **Resolve:** (a) make `screenshot` emit a coordinate transform (shown WxH + real screen WxH + backing scale) **or** have `mouse` accept image-relative coordinates and convert internally; (b) once coordinates are trustworthy, soften the prompt so the agent uses screenshot→mouse to click specific elements, not just keyboard.
+
+### F3 — No periodic screenshot/verify + foreground re-check
+- **Observed:** the agent screenshots ad-hoc (0 in the last session); `automate` only foregrounds at the start.
+- **Resolve:** in the `automate` loop **and** the orchestrator prompt — screenshot + verify at **start, after every ~3 actions, and at the end**; before each action confirm the frontmost app is the target and re-`launch_app` (foreground) it if not, then proceed. Fold the actual-vs-expected check into the loop's `verify` step.
+
+---
+
 ## Summary
 
 | Phase | Item | Status |
@@ -472,9 +607,18 @@ Shipped on the Windows machine (2026-06-02):
 | 1 | AXUIElement app UI interaction (`ax_interact`) | ✅ Done |
 | 1 | Multi-step UI workflow guidance | ✅ Done |
 | 1 | Apple Music two-step play (navigate→play) | ✅ Done (playback best-effort) |
-| 2 | Always-on microphone loop | ⏳ Not started |
-| 2 | `always_on_enabled` config flag | ⏳ Not started |
-| 2 | Privacy hook (screen lock pause) | ⏳ Not started |
+| 1 | `automate(app, goal)` Rust-driven loop (Change 1.14) | 🔨 M1+M2+M3 done (37 tests; M3 proven live on macOS) |
+| 1.5 | M1: automate loop skeleton + tool | ✅ Done |
+| 1.5 | M2: poll-until-stable settle | ✅ Done |
+| 1.5 | M3: Music fast-path | ✅ Done (proven live on macOS) |
+| 1.5 | Robustness: quoted-query parse + no-progress guard | ✅ Done (from live agent failures) |
+| 1.5 | M4: progress streaming to notch | ✅ Done — notch cherry-picked in; automate streams live steps |
+| 1.5 | M5: richer element model (`enabled`) | ✅ Plumbed; AXEnabled found unreliable → informational only |
+| 1.5 | Native fast-paths (Spotify/Slack) | ⏳ Not started (Music shipped as M3) |
+| 1.5 | Vision fallback for Electron apps | ⏳ Not started |
+| 2 | Always-on microphone loop | ✅ Done (cpal → VAD → STT → agent) |
+| 2 | `always_on_enabled` config flag + Settings toggle | ✅ Done (RPC + UI + i18n) |
+| 2 | Privacy hook (screen lock pause) | ✅ Done (macOS; other OSes follow-up) |
 | 3 | Wake-word detection | ⏳ Not started |
 | 3 | Local command router | ⏳ Not started |
 | 4 | Voice confirmation loop | ⏳ Not started |
diff --git a/src/openhuman/agent_registry/agents/orchestrator/agent.toml b/src/openhuman/agent_registry/agents/orchestrator/agent.toml
index 2ee6e58ae3..80a7cb7f78 100644
--- a/src/openhuman/agent_registry/agents/orchestrator/agent.toml
+++ b/src/openhuman/agent_registry/agents/orchestrator/agent.toml
@@ -130,6 +130,23 @@ named = [
     # `computer_control.ax_interact_mutations`, and refuse a sensitive-app
     # denylist (password managers, Keychain, System Settings, terminals).
     "ax_interact",
+    # Multi-step UI automation in one call (e.g. "play <song> in Music",
+    # "message <person> in Slack"). Prefer over many individual ax_interact
+    # calls when the task needs several UI steps — a Rust perceive→act→verify
+    # loop runs the flow with a fast model. Same opt-in + sensitive-app denylist
+    # as ax_interact; `Dangerous`, gates through the ApprovalGate.
+    "automate",
+    # Full computer control (autonomy). Fallback for apps the accessibility API
+    # can't drive — notably Electron apps (Slack, Discord, VS Code) whose AX/UIA
+    # tree is empty. `screenshot` to see the screen, then `mouse` (move/click/
+    # drag/scroll) + `keyboard` (type/press/hotkey) to act by pixel coordinates.
+    # All `Dangerous` and gate through the ApprovalGate. mouse/keyboard require
+    # `computer_control.enabled = true`. Prefer `automate`/`ax_interact` first;
+    # use these when the AX tree comes back empty. NB: foreground the target app
+    # before typing/clicking (synthetic input goes to the focused window).
+    "screenshot",
+    "mouse",
+    "keyboard",
     # Time + scheduling — lets the orchestrator answer "what time is it",
     # "remind me in 10 minutes", "every morning at 8" directly rather than
     # delegating or telling the user it can't. `current_time` grounds
diff --git a/src/openhuman/agent_registry/agents/orchestrator/prompt.md b/src/openhuman/agent_registry/agents/orchestrator/prompt.md
index e1b0d21c92..70d9873a95 100644
--- a/src/openhuman/agent_registry/agents/orchestrator/prompt.md
+++ b/src/openhuman/agent_registry/agents/orchestrator/prompt.md
@@ -38,6 +38,25 @@ Follow this sequence for every user message:
 
 Default bias: **do not spawn a sub-agent when a direct response or direct tool call is sufficient** — but a live external-service request is *not* something to answer from memory, it requires the integration. Use `spawn_worker_thread` for long tasks that need their own thread.
 
+## Controlling desktop apps (full autonomy)
+
+You can open and operate native apps on this machine. **Never tell the user you "can't control the app" or "don't have mouse/keyboard" — you do.**
+
+**Rule 0 — foreground first, every time.** Before *any* keyboard/mouse action, call `launch_app "<App>"` for the target. `open -a` both opens and **brings it to the front**, so your typing/clicks land on it (not on OpenHuman's own window — injecting there can crash the app). Re-call `launch_app` right before each keyboard/mouse step if focus might have moved.
+
+**The reliable path is the keyboard, not the mouse.** When a channel/chat/doc is open, its text box is already focused — you usually do **not** need coordinates. Prefer this:
+
+1. `launch_app "<App>"` (foreground).
+2. `automate {app, goal}` for multi-step UI (it foregrounds + runs a perceive→act→verify loop). Good for native apps (Music, Mail, Notes).
+3. **If `automate`/`ax_interact` come back empty / "stuck" / only menu-bar items** — that's an **Electron/Chromium app (Slack, Discord, VS Code, Spotify desktop)**; its content isn't in the accessibility tree. Switch to **keyboard-driven control**:
+   - `launch_app "<App>"` (foreground), then `keyboard` `type` the text and `press` `Enter`. The focused input receives it. Use app **hotkeys** to navigate (no mouse needed).
+4. **Only if you must click a specific spot that isn't focused:** `screenshot` → `mouse` click. (Screenshots are downscaled so you can see them; coordinates you read are in the returned image's pixels.)
+
+**Worked example — "message hi on Slack" (keyboard-only, no vision):**
+`launch_app "Slack"` → `keyboard hotkey "cmd+k"` (Slack quick switcher) → `keyboard type "<person or channel>"` → `keyboard press "Enter"` (opens the chat, focuses the message box) → `keyboard type "hi"` → `keyboard press "Enter"` (sends). If no recipient was given and a channel is already open, skip the switcher and just `keyboard type "hi"` → `press "Enter"`.
+
+`screenshot`/`mouse`/`keyboard` run without an approval prompt (they're on your auto-approve list) — just proceed.
+
 ## Rules
 
 - **You are the chat tier.** You run on a fast UX-focused model (TTFT > deep reasoning). When a task needs sustained multi-step thinking — planning across many steps, comparing several non-obvious options, untangling ambiguous requirements — **delegate to the reasoning tier (`delegate_plan`)** rather than reasoning through it yourself. Your job at that point is to brief the planner well and synthesise its output back to the user.
diff --git a/src/openhuman/tools/impl/computer/automate.rs b/src/openhuman/tools/impl/computer/automate.rs
new file mode 100644
index 0000000000..638ab6567b
--- /dev/null
+++ b/src/openhuman/tools/impl/computer/automate.rs
@@ -0,0 +1,223 @@
+//! Tool: `automate` — accomplish a multi-step UI goal in one call.
+//!
+//! The orchestrator calls `automate{app, goal}` once; the Rust loop in
+//! `accessibility::automate` then perceives → decides (fast model) → acts →
+//! settles → verifies until the goal is met or a step budget is hit. This keeps
+//! the heavy chat model out of the click loop (latency + reliability — see
+//! `docs/voice-automate-plan.md`).
+//!
+//! Safety mirrors `ax_interact`: it actuates real controls, so it is a mutating
+//! tool — opt-in via `computer_control.ax_interact_mutations`, routed through the
+//! ApprovalGate, and it refuses the sensitive-app denylist (password managers,
+//! Keychain, System Settings, terminals) even on auto-approved turns.
+
+use super::ax_interact::is_sensitive_app;
+use crate::openhuman::accessibility::automate::{self, AutomateOptions, RealBackend};
+use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolCallOptions, ToolResult};
+use async_trait::async_trait;
+use serde_json::json;
+
+pub struct AutomateTool {
+    /// When false the tool refuses to run (it is inherently mutating). Mirrors
+    /// `AxInteractTool::allow_mutations` so one opt-in governs both.
+    allow_mutations: bool,
+}
+
+impl AutomateTool {
+    pub fn new(allow_mutations: bool) -> Self {
+        Self { allow_mutations }
+    }
+}
+
+impl Default for AutomateTool {
+    fn default() -> Self {
+        Self::new(false)
+    }
+}
+
+#[async_trait]
+impl Tool for AutomateTool {
+    fn name(&self) -> &str {
+        "automate"
+    }
+
+    fn description(&self) -> &str {
+        "Accomplish a MULTI-STEP goal inside a desktop app in a single call — e.g. \
+         'play <song> in Music', 'message <person> <text> in Slack'. Give the app \
+         name and a plain-English goal; the system drives the app's UI step by step \
+         (find elements → press/type → verify) using the platform accessibility API, \
+         no screen coordinates. Prefer this over issuing many individual \
+         `ax_interact` calls when the task needs several UI steps. The app should \
+         usually be launched first (or include 'launch' in the goal). Refuses \
+         password managers, Keychain, System Settings, and terminals."
+    }
+
+    fn parameters_schema(&self) -> serde_json::Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "app": {
+                    "type": "string",
+                    "description": "Display name of the target application (e.g. 'Music', 'Slack')."
+                },
+                "goal": {
+                    "type": "string",
+                    "description": "Plain-English description of the multi-step outcome to achieve."
+                }
+            },
+            "required": ["app", "goal"]
+        })
+    }
+
+    fn permission_level(&self) -> PermissionLevel {
+        // Always mutating — it actuates controls. Kept as the base level so the
+        // approval gate fires regardless of args.
+        PermissionLevel::Dangerous
+    }
+
+    fn external_effect(&self) -> bool {
+        true
+    }
+
+    async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
+        self.execute_with_options(args, ToolCallOptions::default())
+            .await
+    }
+
+    async fn execute_with_options(
+        &self,
+        args: serde_json::Value,
+        _options: ToolCallOptions,
+    ) -> anyhow::Result<ToolResult> {
+        let app = args
+            .get("app")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .trim()
+            .to_string();
+        let goal = args
+            .get("goal")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .trim()
+            .to_string();
+
+        log::info!("[automate] ▶ tool execute app={app:?} goal={goal:?}");
+
+        if app.is_empty() {
+            return Ok(ToolResult::error("app is required"));
+        }
+        if goal.is_empty() {
+            return Ok(ToolResult::error("goal is required"));
+        }
+
+        // Hard safety boundary — identical to ax_interact's denylist.
+        if is_sensitive_app(&app) {
+            log::warn!("[automate] refused: sensitive app '{app}'");
+            return Ok(ToolResult::error(format!(
+                "Refusing to automate '{app}': it is on the sensitive-app denylist \
+                 (password managers, Keychain, System Settings, terminals). This is a \
+                 hard safety boundary."
+            )));
+        }
+
+        if !self.allow_mutations {
+            log::warn!("[automate] refused: mutations disabled");
+            return Ok(ToolResult::error(
+                "App control isn't enabled yet. Turn on App Automation in \
+                 Settings → Agent Access (it grants permission to control apps), \
+                 then ask again. (Sets computer_control.ax_interact_mutations = true.)",
+            ));
+        }
+
+        let config = match crate::openhuman::config::rpc::load_config_with_timeout().await {
+            Ok(c) => c,
+            Err(e) => return Ok(ToolResult::error(format!("could not load config: {e}"))),
+        };
+
+        let backend = RealBackend::new(config);
+        let outcome = automate::run(&app, &goal, &backend, AutomateOptions::default()).await;
+
+        let mut body = format!("{}\n\nSteps:", outcome.summary);
+        if outcome.steps.is_empty() {
+            body.push_str("\n  (no steps executed)");
+        } else {
+            for s in &outcome.steps {
+                body.push_str(&format!("\n  - {s}"));
+            }
+        }
+
+        if outcome.success {
+            Ok(ToolResult::success(body))
+        } else {
+            Ok(ToolResult::error(body))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn name_and_permission() {
+        let t = AutomateTool::new(true);
+        assert_eq!(t.name(), "automate");
+        assert_eq!(t.permission_level(), PermissionLevel::Dangerous);
+        assert!(t.external_effect());
+    }
+
+    #[test]
+    fn schema_requires_app_and_goal() {
+        let schema = AutomateTool::new(true).parameters_schema();
+        let req = schema["required"].as_array().unwrap();
+        assert!(req.iter().any(|v| v == "app"));
+        assert!(req.iter().any(|v| v == "goal"));
+    }
+
+    #[tokio::test]
+    async fn rejects_missing_app_or_goal() {
+        let t = AutomateTool::new(true);
+        assert!(
+            t.execute(json!({"app": "", "goal": "x"}))
+                .await
+                .unwrap()
+                .is_error
+        );
+        assert!(
+            t.execute(json!({"app": "Music", "goal": ""}))
+                .await
+                .unwrap()
+                .is_error
+        );
+    }
+
+    #[tokio::test]
+    async fn refuses_when_mutations_disabled() {
+        let t = AutomateTool::new(false);
+        let r = t
+            .execute(json!({"app": "Music", "goal": "play a song"}))
+            .await
+            .unwrap();
+        assert!(r.is_error);
+        assert!(r.output().contains("ax_interact_mutations"));
+    }
+
+    #[tokio::test]
+    async fn refuses_sensitive_app() {
+        let t = AutomateTool::new(true);
+        for app in [
+            "Keychain Access",
+            "1Password",
+            "Terminal",
+            "System Settings",
+        ] {
+            let r = t
+                .execute(json!({"app": app, "goal": "do something"}))
+                .await
+                .unwrap();
+            assert!(r.is_error, "expected refusal for {app}");
+            assert!(r.output().to_lowercase().contains("denylist"));
+        }
+    }
+}
diff --git a/src/openhuman/tools/impl/computer/ax_interact.rs b/src/openhuman/tools/impl/computer/ax_interact.rs
index 358d9179c5..b5ffa90f32 100644
--- a/src/openhuman/tools/impl/computer/ax_interact.rs
+++ b/src/openhuman/tools/impl/computer/ax_interact.rs
@@ -48,7 +48,9 @@ const SENSITIVE_APPS: &[&str] = &[
     "rio",
 ];
 
-fn is_sensitive_app(app_name: &str) -> bool {
+/// True when `app_name` is on the never-actuate denylist. `pub(crate)` so the
+/// `automate` tool shares the exact same boundary as `ax_interact`.
+pub(crate) fn is_sensitive_app(app_name: &str) -> bool {
     let lower = app_name.to_lowercase();
     SENSITIVE_APPS.iter().any(|s| lower.contains(s))
 }
@@ -229,10 +231,10 @@ impl Tool for AxInteractTool {
         if mutating && !self.allow_mutations {
             log::warn!("[ax_interact] refused: mutations disabled (action={action})");
             return Ok(ToolResult::error(
-                "ax_interact mutations (press/set_value) are disabled. They actuate arbitrary \
-                 app controls and type into arbitrary fields, so they require explicit opt-in: \
-                 set `computer_control.ax_interact_mutations = true`. The read-only 'list' \
-                 action remains available.",
+                "App control isn't enabled yet, so I can't press buttons or type into \
+                 this app. Turn on App UI Control / App Automation in Settings → Agent \
+                 Access, then ask again. (Reading the UI still works without it; sets \
+                 computer_control.ax_interact_mutations = true.)",
             ));
         }
 
diff --git a/src/openhuman/tools/impl/computer/mod.rs b/src/openhuman/tools/impl/computer/mod.rs
index 6603105d9c..379c8833f4 100644
--- a/src/openhuman/tools/impl/computer/mod.rs
+++ b/src/openhuman/tools/impl/computer/mod.rs
@@ -1,9 +1,11 @@
+mod automate;
 mod ax_interact;
 mod human_path;
 mod keyboard;
 mod main_thread;
 mod mouse;
 
+pub use automate::AutomateTool;
 pub use ax_interact::AxInteractTool;
 pub use keyboard::KeyboardTool;
 pub use main_thread::{run_input_on_main, MainThreadInputOp, INPUT_ON_MAIN_THREAD_METHOD};
diff --git a/src/openhuman/tools/ops.rs b/src/openhuman/tools/ops.rs
index 8c9f2e810d..8ce5445a29 100644
--- a/src/openhuman/tools/ops.rs
+++ b/src/openhuman/tools/ops.rs
@@ -177,6 +177,12 @@ pub fn all_tools_with_runtime(
         Box::new(AxInteractTool::new(
             root_config.computer_control.ax_interact_mutations,
         )),
+        // Multi-step UI automation in one call. Shares the ax_interact opt-in
+        // (mutations) and sensitive-app denylist; runs a Rust perceive→act→verify
+        // loop with a fast model so the chat model stays out of the click loop.
+        Box::new(AutomateTool::new(
+            root_config.computer_control.ax_interact_mutations,
+        )),
         Box::new(CodegraphIndexTool::new(
             config.clone(),
             action_dir.to_path_buf(),
diff --git a/src/openhuman/tools/user_filter.rs b/src/openhuman/tools/user_filter.rs
index 43c0299326..6aa82e5cfd 100644
--- a/src/openhuman/tools/user_filter.rs
+++ b/src/openhuman/tools/user_filter.rs
@@ -41,6 +41,13 @@ const TOOL_FAMILIES: &[ToolFamily] = &[
         rust_names: &["ax_interact"],
         default_enabled: true,
     },
+    // Multi-step UI automation (one call → whole flow). Same opt-in as
+    // ax_interact; surfaced as its own catalog toggle.
+    ToolFamily {
+        id: "automate",
+        rust_names: &["automate"],
+        default_enabled: true,
+    },
     // Computer control — mouse and keyboard. Gated by computer_control.enabled
     // in config (tools only register when that flag is true). PermissionLevel::Dangerous
     // so the approval gate fires per-action; user opts in explicitly.

From 578846a061a1b30305cbd68df15d9f8a564877c7 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 14:20:06 +0530
Subject: [PATCH 4/9] feat(voice): Phase 2 always-on listening engine + config
 + RPC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continuous cpal mic → VAD segmenter → STT → agent with no hotkey, opt-in
via voice_server.always_on_enabled, 'Hey Tiny' wake word (English-forced
STT + fuzzy match), and screen-lock privacy pause. Adds the config schema,
live-apply on the settings RPC, start_if_enabled wiring, and a JSON-RPC
roundtrip E2E.

Slice 4/7 of #3307 (always-on core).
---
 src/openhuman/config/ops.rs                 |  10 +
 src/openhuman/config/ops_tests.rs           |   4 +
 src/openhuman/config/schema/voice_server.rs |  91 +++
 src/openhuman/config/schemas.rs             |  20 +-
 src/openhuman/credentials/ops.rs            |   4 +
 src/openhuman/voice/always_on.rs            | 832 ++++++++++++++++++++
 src/openhuman/voice/audio_capture.rs        |  35 +-
 src/openhuman/voice/mod.rs                  |   1 +
 tests/json_rpc_e2e.rs                       | 226 ++----
 9 files changed, 1046 insertions(+), 177 deletions(-)
 create mode 100644 src/openhuman/voice/always_on.rs

diff --git a/src/openhuman/config/ops.rs b/src/openhuman/config/ops.rs
index 972a916e86..930615bdaf 100644
--- a/src/openhuman/config/ops.rs
+++ b/src/openhuman/config/ops.rs
@@ -1974,6 +1974,8 @@ pub struct VoiceServerSettingsPatch {
     pub min_duration_secs: Option<f32>,
     pub silence_threshold: Option<f32>,
     pub custom_dictionary: Option<Vec<String>>,
+    pub always_on_enabled: Option<bool>,
+    pub wake_word: Option<String>,
 }
 
 /// Returns the current voice server settings as a JSON object.
@@ -1987,6 +1989,8 @@ pub async fn get_voice_server_settings() -> Result<RpcOutcome<serde_json::Value>
         "min_duration_secs": config.voice_server.min_duration_secs,
         "silence_threshold": config.voice_server.silence_threshold,
         "custom_dictionary": config.voice_server.custom_dictionary,
+        "always_on_enabled": config.voice_server.always_on_enabled,
+        "wake_word": config.voice_server.wake_word,
     });
     Ok(RpcOutcome::new(
         result,
@@ -2034,6 +2038,12 @@ pub async fn load_and_apply_voice_server_settings(
     if let Some(custom_dictionary) = update.custom_dictionary {
         config.voice_server.custom_dictionary = custom_dictionary;
     }
+    if let Some(always_on_enabled) = update.always_on_enabled {
+        config.voice_server.always_on_enabled = always_on_enabled;
+    }
+    if let Some(wake_word) = update.wake_word {
+        config.voice_server.wake_word = wake_word;
+    }
     config.save().await.map_err(|e| e.to_string())?;
     let snapshot = snapshot_config_json(&config)?;
     Ok(RpcOutcome::new(
diff --git a/src/openhuman/config/ops_tests.rs b/src/openhuman/config/ops_tests.rs
index 8196a88800..421be43cde 100644
--- a/src/openhuman/config/ops_tests.rs
+++ b/src/openhuman/config/ops_tests.rs
@@ -989,6 +989,8 @@ async fn load_and_apply_voice_server_settings_rejects_invalid_activation_mode()
         min_duration_secs: None,
         silence_threshold: None,
         custom_dictionary: None,
+        always_on_enabled: None,
+        wake_word: None,
     };
     let err = load_and_apply_voice_server_settings(patch)
         .await
@@ -1041,6 +1043,8 @@ async fn load_and_apply_voice_server_settings_accepts_valid_modes_and_clamps() {
         min_duration_secs: Some(-5.0),
         silence_threshold: Some(-1.0),
         custom_dictionary: Some(vec!["term".into()]),
+        always_on_enabled: Some(true),
+        wake_word: Some("Hey Tiny".to_string()),
     };
     let outcome = load_and_apply_voice_server_settings(patch)
         .await
diff --git a/src/openhuman/config/schema/voice_server.rs b/src/openhuman/config/schema/voice_server.rs
index 9452592d5e..1018e1a8de 100644
--- a/src/openhuman/config/schema/voice_server.rs
+++ b/src/openhuman/config/schema/voice_server.rs
@@ -52,6 +52,44 @@ pub struct VoiceServerConfig {
     /// technical terms, and domain-specific vocabulary.
     #[serde(default)]
     pub custom_dictionary: Vec<String>,
+
+    /// Phase 2 — always-on listening. When true, the voice server keeps the
+    /// microphone open continuously and segments utterances with
+    /// voice-activity detection (VAD) instead of requiring a hotkey press.
+    /// Off by default: always-on listening has obvious privacy weight, so it
+    /// is strictly opt-in.
+    #[serde(default)]
+    pub always_on_enabled: bool,
+
+    /// VAD speech-onset threshold (peak RMS energy). A frame whose RMS rises
+    /// above this is treated as the start of speech. Slightly higher than the
+    /// hotkey `silence_threshold` because an always-open mic must reject more
+    /// ambient noise before opening an utterance.
+    #[serde(default = "default_vad_onset_threshold")]
+    pub vad_onset_threshold: f32,
+
+    /// VAD hangover: how long (milliseconds) RMS must stay below the onset
+    /// threshold before the current utterance is considered finished. Prevents
+    /// chopping an utterance on natural mid-sentence pauses.
+    #[serde(default = "default_vad_hangover_ms")]
+    pub vad_hangover_ms: u32,
+
+    /// Minimum speech duration (milliseconds) for a segment to be emitted.
+    /// Shorter blips (a cough, a door) are discarded before transcription.
+    #[serde(default = "default_vad_min_speech_ms")]
+    pub vad_min_speech_ms: u32,
+
+    /// Hard ceiling (seconds) on a single always-on utterance. Forces a flush
+    /// so a continuous noise source can't grow an unbounded recording.
+    #[serde(default = "default_vad_max_utterance_secs")]
+    pub vad_max_utterance_secs: f32,
+
+    /// Wake word for always-on mode. An utterance is only delivered to the agent
+    /// when its transcript contains this phrase; the phrase is stripped and the
+    /// remainder is sent as the command. Empty = no wake word (deliver every
+    /// utterance). Default "Hey Tiny".
+    #[serde(default = "default_wake_word")]
+    pub wake_word: String,
 }
 
 fn default_hotkey() -> String {
@@ -66,6 +104,26 @@ fn default_silence_threshold() -> f32 {
     0.002
 }
 
+fn default_vad_onset_threshold() -> f32 {
+    0.01
+}
+
+fn default_vad_hangover_ms() -> u32 {
+    800
+}
+
+fn default_vad_min_speech_ms() -> u32 {
+    300
+}
+
+fn default_vad_max_utterance_secs() -> f32 {
+    30.0
+}
+
+fn default_wake_word() -> String {
+    "Hey Tiny".to_string()
+}
+
 impl Default for VoiceServerConfig {
     fn default() -> Self {
         Self {
@@ -76,6 +134,39 @@ impl Default for VoiceServerConfig {
             min_duration_secs: default_min_duration(),
             silence_threshold: default_silence_threshold(),
             custom_dictionary: Vec::new(),
+            always_on_enabled: false,
+            vad_onset_threshold: default_vad_onset_threshold(),
+            vad_hangover_ms: default_vad_hangover_ms(),
+            vad_min_speech_ms: default_vad_min_speech_ms(),
+            vad_max_utterance_secs: default_vad_max_utterance_secs(),
+            wake_word: default_wake_word(),
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn defaults_are_opt_in_and_sane() {
+        let c = VoiceServerConfig::default();
+        // Always-on is privacy-sensitive — must default off.
+        assert!(!c.always_on_enabled);
+        // Onset must sit above the hotkey silence floor so an open mic rejects
+        // ambient noise that the push-to-talk path would have tolerated.
+        assert!(c.vad_onset_threshold > c.silence_threshold);
+        assert!(c.vad_hangover_ms > 0);
+        assert!(c.vad_min_speech_ms > 0);
+        assert!(c.vad_max_utterance_secs > 0.0);
+    }
+
+    #[test]
+    fn deserializes_with_all_vad_fields_defaulted() {
+        // An older config file with none of the Phase 2 keys must still load.
+        let c: VoiceServerConfig = serde_json::from_str("{}").unwrap();
+        assert!(!c.always_on_enabled);
+        assert_eq!(c.vad_hangover_ms, default_vad_hangover_ms());
+        assert_eq!(c.vad_min_speech_ms, default_vad_min_speech_ms());
+    }
+}
diff --git a/src/openhuman/config/schemas.rs b/src/openhuman/config/schemas.rs
index 0876605b16..5185d80405 100644
--- a/src/openhuman/config/schemas.rs
+++ b/src/openhuman/config/schemas.rs
@@ -192,6 +192,8 @@ struct VoiceServerSettingsUpdate {
     min_duration_secs: Option<f32>,
     silence_threshold: Option<f32>,
     custom_dictionary: Option<Vec<String>>,
+    always_on_enabled: Option<bool>,
+    wake_word: Option<String>,
 }
 
 #[derive(Debug, Deserialize)]
@@ -1130,6 +1132,14 @@ pub fn schemas(function: &str) -> ControllerSchema {
                     comment: "Custom vocabulary words to bias whisper toward.",
                     required: false,
                 },
+                optional_bool(
+                    "always_on_enabled",
+                    "Continuous always-on listening (no hotkey). Opt-in.",
+                ),
+                optional_string(
+                    "wake_word",
+                    "Always-on wake word; utterances must contain it (default 'Hey Tiny').",
+                ),
             ],
             outputs: vec![json_output("snapshot", "Updated config snapshot.")],
         },
@@ -1715,8 +1725,16 @@ fn handle_update_voice_server_settings(params: Map<String, Value>) -> Controller
             min_duration_secs: update.min_duration_secs,
             silence_threshold: update.silence_threshold,
             custom_dictionary: update.custom_dictionary,
+            always_on_enabled: update.always_on_enabled,
+            wake_word: update.wake_word,
         };
-        to_json(config_rpc::load_and_apply_voice_server_settings(patch).await?)
+        let result = config_rpc::load_and_apply_voice_server_settings(patch).await?;
+        // Apply the always-on toggle live (start/idle the capture loop) so the
+        // Settings switch takes effect without a restart.
+        if let Ok(config) = config_rpc::load_config_with_timeout().await {
+            crate::openhuman::voice::always_on::start_if_enabled(&config).await;
+        }
+        to_json(result)
     })
 }
 
diff --git a/src/openhuman/credentials/ops.rs b/src/openhuman/credentials/ops.rs
index d95273d46a..4d34fee497 100644
--- a/src/openhuman/credentials/ops.rs
+++ b/src/openhuman/credentials/ops.rs
@@ -41,6 +41,10 @@ pub async fn start_login_gated_services(config: &Config) {
         crate::openhuman::voice::dictation_listener::start_if_enabled(config).await;
     }
 
+    // 3b. Always-on listening (Phase 2): continuous mic + VAD → STT → agent,
+    //     no hotkey. Opt-in via config.voice_server.always_on_enabled.
+    crate::openhuman::voice::always_on::start_if_enabled(config).await;
+
     // 4. Screen intelligence (capture + vision analysis)
     crate::openhuman::screen_intelligence::server::start_if_enabled(config).await;
 
diff --git a/src/openhuman/voice/always_on.rs b/src/openhuman/voice/always_on.rs
new file mode 100644
index 0000000000..19d2f915a4
--- /dev/null
+++ b/src/openhuman/voice/always_on.rs
@@ -0,0 +1,832 @@
+//! Phase 2 — always-on listening.
+//!
+//! Instead of a hotkey gating each recording, always-on mode keeps the mic
+//! open continuously and uses **voice-activity detection (VAD)** to carve the
+//! audio stream into utterances: an utterance opens when energy rises above an
+//! onset threshold and closes after a configurable run of silence (the
+//! "hangover"). Each completed utterance is transcribed and pushed onto the
+//! dictation bus, so it reaches the agent and the notch exactly like a hotkey
+//! dictation.
+//!
+//! Layers:
+//!   - [`VadSegmenter`] — a pure state machine over per-frame RMS energies,
+//!     unit-tested deterministically (no audio backend).
+//!   - [`start_if_enabled`] — opens a continuous cpal mic stream on a dedicated
+//!     thread, slices 16 kHz mono frames, drives the segmenter, transcribes each
+//!     utterance via the configured STT provider, then applies the wake-word
+//!     gate ([`extract_command`], default "Hey Tiny") before delivering the
+//!     command to the agent via `publish_transcription`.
+//!   - [`spawn_lock_watcher`] — privacy hook: pauses capture while the screen is
+//!     locked (macOS via the Quartz session dictionary).
+//!
+//! Privacy: always-on is **opt-in** (`config.voice_server.always_on_enabled`,
+//! default false) and pauses when the screen is locked.
+
+use crate::openhuman::config::VoiceServerConfig as CfgVoiceServer;
+
+const LOG_PREFIX: &str = "[voice::always_on]";
+
+/// Tuning for the VAD segmenter, distilled from [`CfgVoiceServer`].
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct VadConfig {
+    /// Peak-RMS energy at or above which a frame counts as speech.
+    pub onset_threshold: f32,
+    /// How long energy must stay below `onset_threshold` before the current
+    /// utterance is closed. Bridges natural mid-sentence pauses.
+    pub hangover_ms: u32,
+    /// Minimum voiced duration for a segment to be emitted; shorter blips
+    /// (cough, door) are dropped.
+    pub min_speech_ms: u32,
+    /// Hard ceiling on a single utterance's total (voiced + silent) time — forces
+    /// a flush so a continuous noise source can't grow an unbounded recording.
+    pub max_utterance_ms: u32,
+}
+
+impl VadConfig {
+    /// Build VAD tuning from the persisted voice-server config.
+    pub fn from_server_config(c: &CfgVoiceServer) -> Self {
+        Self {
+            onset_threshold: c.vad_onset_threshold,
+            hangover_ms: c.vad_hangover_ms,
+            min_speech_ms: c.vad_min_speech_ms,
+            max_utterance_ms: (c.vad_max_utterance_secs * 1000.0).round().max(1.0) as u32,
+        }
+    }
+}
+
+/// An event emitted by the segmenter as the audio stream is consumed.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum VadEvent {
+    /// A frame's energy reached the onset threshold — an utterance has begun.
+    SpeechStart,
+    /// An utterance closed. `voiced_ms` sums only the frames at or above onset
+    /// (mid-utterance pauses and trailing silence are excluded); `emit` is
+    /// false when it fell below `min_speech_ms` (drop it); `forced` is true
+    /// when the `max_utterance_ms` ceiling, not a silence hangover, closed it.
+    SpeechEnd {
+        voiced_ms: u32,
+        emit: bool,
+        forced: bool,
+    },
+}
+
+#[derive(Debug, Clone, Copy)]
+enum State {
+    /// No active utterance — waiting for energy to cross the onset threshold.
+    Silent,
+    /// Inside an utterance.
+    Speaking {
+        /// Total elapsed time since the utterance opened (voiced + silence).
+        total_ms: u32,
+        /// Accumulated voiced time (frames above onset).
+        voiced_ms: u32,
+        /// Consecutive below-onset time since the last voiced frame.
+        silence_run_ms: u32,
+    },
+}
+
+/// Pure VAD state machine. Drive it by calling [`push_frame`](Self::push_frame)
+/// with the RMS energy of each fixed-size audio frame; it returns at most one
+/// [`VadEvent`] per frame.
+#[derive(Debug)]
+pub struct VadSegmenter {
+    cfg: VadConfig,
+    state: State,
+}
+
+impl VadSegmenter {
+    pub fn new(cfg: VadConfig) -> Self {
+        Self {
+            cfg,
+            state: State::Silent,
+        }
+    }
+
+    /// True while inside an utterance (between `SpeechStart` and `SpeechEnd`).
+    pub fn is_speaking(&self) -> bool {
+        matches!(self.state, State::Speaking { .. })
+    }
+
+    /// Abort any in-flight utterance and return to the idle state without
+    /// emitting an event. Used by the privacy hook (screen lock) and on
+    /// stream teardown.
+    pub fn reset(&mut self) {
+        self.state = State::Silent;
+    }
+
+    /// Feed one frame's RMS energy and its duration (ms); yields at most one event.
+    pub fn push_frame(&mut self, rms: f32, frame_ms: u32) -> Option<VadEvent> {
+        let above = rms >= self.cfg.onset_threshold;
+        match self.state {
+            State::Silent => {
+                if above {
+                    self.state = State::Speaking {
+                        total_ms: frame_ms,
+                        voiced_ms: frame_ms,
+                        silence_run_ms: 0,
+                    };
+                    Some(VadEvent::SpeechStart)
+                } else {
+                    None
+                }
+            }
+            State::Speaking {
+                mut total_ms,
+                mut voiced_ms,
+                mut silence_run_ms,
+            } => {
+                total_ms = total_ms.saturating_add(frame_ms);
+                if above {
+                    voiced_ms = voiced_ms.saturating_add(frame_ms);
+                    silence_run_ms = 0;
+                } else {
+                    silence_run_ms = silence_run_ms.saturating_add(frame_ms);
+                }
+
+                // Close on a silence hangover — never on a voiced frame (hangover_ms == 0).
+                if !above && silence_run_ms >= self.cfg.hangover_ms {
+                    self.state = State::Silent;
+                    let emit = voiced_ms >= self.cfg.min_speech_ms;
+                    return Some(VadEvent::SpeechEnd {
+                        voiced_ms,
+                        emit,
+                        forced: false,
+                    });
+                }
+                // Close on the hard utterance ceiling.
+                if total_ms >= self.cfg.max_utterance_ms {
+                    self.state = State::Silent;
+                    let emit = voiced_ms >= self.cfg.min_speech_ms;
+                    return Some(VadEvent::SpeechEnd {
+                        voiced_ms,
+                        emit,
+                        forced: true,
+                    });
+                }
+
+                self.state = State::Speaking {
+                    total_ms,
+                    voiced_ms,
+                    silence_run_ms,
+                };
+                None
+            }
+        }
+    }
+}
+
+// ── Continuous capture loop ─────────────────────────────────────────────────
+
+use crate::openhuman::config::Config;
+use crate::openhuman::voice::audio_capture::{
+    chunk_rms, encode_wav_16k, resample, to_mono, TARGET_SAMPLE_RATE,
+};
+use std::sync::atomic::{AtomicBool, Ordering};
+
+/// The capture thread + processor have been spawned (once per process).
+static RUNNING: AtomicBool = AtomicBool::new(false);
+
+/// Runtime on/off, mirrors `config.voice_server.always_on_enabled`. Toggling it
+/// at runtime takes effect immediately: when false the processor drops all audio
+/// (nothing is transcribed or sent). Lets the Settings toggle work without a
+/// restart. (The mic stream itself stays open until the next launch.)
+static ENABLED: AtomicBool = AtomicBool::new(false);
+
+/// When true, the processor drops audio and resets the segmenter (privacy hook:
+/// screen locked). Driven by [`spawn_lock_watcher`] on macOS.
+static PAUSED: AtomicBool = AtomicBool::new(false);
+
+/// VAD frame size. 20 ms at 16 kHz = 320 samples — small enough for responsive
+/// onset/hangover detection, large enough for a stable RMS estimate.
+const FRAME_MS: u32 = 20;
+const FRAME_SAMPLES: usize = (TARGET_SAMPLE_RATE as usize / 1000) * FRAME_MS as usize;
+
+/// Hard cap on a buffered utterance (defensive — the segmenter's
+/// `max_utterance_ms` should flush first; this bounds memory if it doesn't).
+const MAX_UTTERANCE_SAMPLES: usize = TARGET_SAMPLE_RATE as usize * 60;
+
+/// Apply the always-on config: set the runtime `ENABLED` gate and, when enabled,
+/// open the continuous microphone stream (once per process). Safe to call at
+/// boot **and** at runtime (the Settings toggle calls it via the config RPC):
+/// toggling off makes the processor drop audio; toggling on starts capture live.
+///
+/// The stream is segmented by the [`VadSegmenter`]; each finished utterance goes
+/// through STT and the dictation bus (see `transcribe_and_deliver`).
+pub async fn start_if_enabled(app_config: &Config) {
+    let on = app_config.voice_server.always_on_enabled;
+    ENABLED.store(on, Ordering::SeqCst);
+    if !on {
+        log::info!("{LOG_PREFIX} disabled — capture idle (toggle off)");
+        return;
+    }
+    // `swap` returns the previous flag: `true` means capture was already started.
+    if RUNNING.swap(true, Ordering::SeqCst) {
+        log::info!("{LOG_PREFIX} re-enabled; capture already running");
+        return;
+    }
+
+    let vad = VadConfig::from_server_config(&app_config.voice_server);
+    let config = app_config.clone();
+    log::info!(
+        "{LOG_PREFIX} enabled — onset={:.4} hangover={}ms min_speech={}ms max_utt={}ms",
+        vad.onset_threshold,
+        vad.hangover_ms,
+        vad.min_speech_ms,
+        vad.max_utterance_ms
+    );
+
+    // The cpal stream is `!Send`, so it lives on a dedicated thread that pushes
+    // 16 kHz mono frames over a channel to the async processor below.
+    let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<Vec<f32>>();
+    if let Err(e) = spawn_capture_thread(tx) {
+        log::error!("{LOG_PREFIX} could not start microphone capture: {e}");
+        RUNNING.store(false, Ordering::SeqCst);
+        return;
+    }
+
+    // Privacy hook: pause capture while the screen is locked.
+    spawn_lock_watcher();
+
+    tokio::spawn(async move {
+        let mut seg = VadSegmenter::new(vad);
+        let mut pending: Vec<f32> = Vec::new();
+        let mut utterance: Vec<f32> = Vec::new();
+
+        while let Some(chunk) = rx.recv().await {
+            // Drop audio and abandon any in-flight utterance while paused
+            // (screen locked) or toggled off — nothing is captured or sent.
+            if PAUSED.load(Ordering::Relaxed) || !ENABLED.load(Ordering::Relaxed) {
+                if seg.is_speaking() {
+                    seg.reset();
+                }
+                pending.clear();
+                utterance.clear();
+                continue;
+            }
+            pending.extend_from_slice(&chunk);
+            while pending.len() >= FRAME_SAMPLES {
+                let frame: Vec<f32> = pending.drain(..FRAME_SAMPLES).collect();
+                let rms = chunk_rms(&frame);
+                match seg.push_frame(rms, FRAME_MS) {
+                    Some(VadEvent::SpeechStart) => {
+                        utterance.clear();
+                        utterance.extend_from_slice(&frame);
+                        notch_status("Listening", 2500); // pill: capturing speech
+                    }
+                    Some(VadEvent::SpeechEnd {
+                        emit, voiced_ms, ..
+                    }) => {
+                        let captured = std::mem::take(&mut utterance);
+                        log::info!(
+                            "{LOG_PREFIX} utterance end voiced_ms={voiced_ms} emit={emit} samples={}",
+                            captured.len()
+                        );
+                        if emit {
+                            let cfg = config.clone();
+                            tokio::spawn(async move {
+                                transcribe_and_deliver(&cfg, captured).await;
+                            });
+                        }
+                    }
+                    None => {
+                        // No event this frame: keep buffering an open utterance (capped).
+                        if seg.is_speaking() && utterance.len() < MAX_UTTERANCE_SAMPLES {
+                            utterance.extend_from_slice(&frame);
+                        }
+                    }
+                }
+            }
+        }
+        log::info!("{LOG_PREFIX} capture channel closed; processor exiting");
+        RUNNING.store(false, Ordering::SeqCst);
+    });
+}
+
+/// Show a transient listener status on the always-visible notch pill by publishing
+/// on the `overlay:attention` channel. The notch maps "Listening" / "Processing" to
+/// the right icon and falls back to "Ready" once the TTL expires. Fire-and-forget.
+fn notch_status(status: &str, ttl_ms: u32) {
+    use crate::openhuman::overlay::{publish_attention, OverlayAttentionEvent};
+    let event = OverlayAttentionEvent::new(status)
+        .with_source("voice")
+        .with_ttl_ms(ttl_ms);
+    let _ = publish_attention(event);
+}
+
+/// Turn one finished utterance into text and, if it passes the wake-word gate,
+/// publish the command on the dictation bus — the same path hotkey dictation
+/// uses to reach the agent (auto-send) and the notch.
+async fn transcribe_and_deliver(config: &Config, samples_16k: Vec<f32>) {
+    use crate::openhuman::voice;
+    use base64::Engine as _;
+    let wav = match encode_wav_16k(&samples_16k) {
+        Ok(bytes) => bytes,
+        Err(e) => {
+            log::warn!("{LOG_PREFIX} wav encode failed: {e}");
+            return;
+        }
+    };
+    // Use the *configured* STT provider (cloud / whisper / slug) — the same
+    // factory dispatch as the `voice.stt_dispatch` RPC — so always-on honors
+    // the user's choice instead of forcing local whisper.
+    let provider_name = voice::effective_stt_provider(config);
+    let model = voice::DEFAULT_WHISPER_MODEL.to_string();
+    let provider = match voice::create_stt_provider(&provider_name, &model, config) {
+        Ok(p) => p,
+        Err(e) => {
+            log::warn!("{LOG_PREFIX} STT provider '{provider_name}' unavailable: {e}");
+            return;
+        }
+    };
+    let audio_b64 = base64::engine::general_purpose::STANDARD.encode(&wav);
+    // Force English transcription: auto-detect rendered the English wake word
+    // "Hey Tiny" in Hindi/Bengali/etc. script ("हे टाइनी"), which can never match
+    // the Latin wake word. The wake word + commands here are English.
+    let outcome = match provider
+        .transcribe(
+            config,
+            &audio_b64,
+            Some("audio/wav"),
+            Some("utterance.wav"),
+            Some("en"),
+        )
+        .await
+    {
+        Ok(outcome) => outcome,
+        Err(e) => {
+            log::warn!("{LOG_PREFIX} transcription failed ({provider_name}): {e}");
+            return;
+        }
+    };
+    let text = outcome.value.text.trim().to_string();
+    if text.is_empty() {
+        log::debug!("{LOG_PREFIX} empty transcript dropped");
+        return;
+    }
+    // Wake-word gate: act only on utterances addressed to the agent
+    // ("Hey Tiny, …"); the wake phrase is stripped and the command delivered.
+    let wake_word = &config.voice_server.wake_word;
+    if let Some(cmd) = extract_command(&text, wake_word) {
+        log::info!("{LOG_PREFIX} wake word matched → command={cmd:?} → dictation bus");
+        notch_status("Processing", 12000); // pill: running the command
+        voice::dictation_listener::publish_transcription(cmd);
+    } else {
+        // Kept at info so the user can see WHAT was heard when the wake word
+        // didn't match (diagnoses "Hey Tiny not responding").
+        log::info!(
+            "{LOG_PREFIX} no wake word ({:?}) in transcript={text:?}; ignored",
+            wake_word
+        );
+    }
+}
+
+/// Wake-word gate for a transcript.
+///
+/// Yields the command for the agent — the words that follow the wake phrase — or
+/// `None` when the utterance was not addressed to the agent (no wake word near
+/// the start, or nothing after it). An empty `wake_word` switches the gate off,
+/// so every non-empty utterance passes. The returned text is rebuilt from
+/// normalized tokens, i.e. lowercased with punctuation removed.
+///
+/// Matching is tolerant: case- and punctuation-insensitive, and the phrase may
+/// come after leading filler ("um, hey tiny, play music").
+pub(crate) fn extract_command(transcript: &str, wake_word: &str) -> Option<String> {
+    // Lowercase, turn every non-alphanumeric char into a space, split on spaces.
+    fn tokenize(text: &str) -> Vec<String> {
+        let normalized: String = text
+            .to_lowercase()
+            .chars()
+            .map(|ch| if ch.is_alphanumeric() { ch } else { ' ' })
+            .collect();
+        normalized.split_whitespace().map(str::to_owned).collect()
+    }
+
+    let wake_tokens = tokenize(wake_word);
+    let heard = tokenize(transcript);
+
+    // Anchor on the most distinctive (longest) wake token, e.g. "tiny" — STT
+    // mangles the greeting ("hey"→"a"/"ok") and the exact spelling
+    // ("tiny"→"tony"/"tinny"), so the anchor is fuzzy-matched and everything
+    // after it is taken as the command.
+    let Some(anchor) = wake_tokens.iter().max_by_key(|tok| tok.len()) else {
+        // No wake word configured → deliver everything (non-empty).
+        return (!heard.is_empty()).then(|| heard.join(" "));
+    };
+    let max_dist = if anchor.chars().count() <= 4 { 1 } else { 2 };
+
+    // The anchor must sit within the first 3 tokens, to avoid mid-sentence false
+    // triggers; the earliest matching token decides the outcome.
+    let hit = heard
+        .iter()
+        .take(3)
+        .position(|tok| levenshtein(tok, anchor) <= max_dist)?;
+
+    let cmd = heard[hit + 1..].join(" ");
+    if cmd.trim().is_empty() {
+        return None; // wake word alone, no command
+    }
+    Some(cmd)
+}
+
+/// Levenshtein edit distance between `a` and `b`, counted in `char`s, using the
+/// two-row dynamic-programming form (inputs are short wake-word tokens).
+fn levenshtein(a: &str, b: &str) -> usize {
+    let target: Vec<char> = b.chars().collect();
+    let mut above: Vec<usize> = (0..=target.len()).collect();
+    let mut row = vec![0usize; target.len() + 1];
+    for (i, ca) in a.chars().enumerate() {
+        row[0] = i + 1;
+        for (j, &cb) in target.iter().enumerate() {
+            let substitute = above[j] + usize::from(ca != cb);
+            row[j + 1] = substitute.min(above[j + 1] + 1).min(row[j] + 1);
+        }
+        std::mem::swap(&mut above, &mut row);
+    }
+    above[target.len()]
+}
+
+/// Start the dedicated cpal capture thread and wait for its readiness report.
+/// Blocks until the stream is set up (or setup fails), mirroring the readiness
+/// handshake of `audio_capture::start_recording`.
+fn spawn_capture_thread(tx: tokio::sync::mpsc::UnboundedSender<Vec<f32>>) -> Result<(), String> {
+    let (ready_tx, ready_rx) = std::sync::mpsc::sync_channel::<Result<(), String>>(1);
+    let worker = move || {
+        if let Err(e) = capture_on_thread(tx, &ready_tx) {
+            log::error!("{LOG_PREFIX} capture thread error: {e}");
+            let _ = ready_tx.send(Err(e));
+        }
+    };
+    std::thread::Builder::new()
+        .name("voice-always-on".into())
+        .spawn(worker)
+        .map_err(|e| format!("failed to spawn always-on capture thread: {e}"))?;
+    // A sender dropped without reporting (thread died) surfaces as a recv error.
+    let died_early = "always-on capture thread exited before signalling readiness";
+    ready_rx.recv().unwrap_or_else(|_| Err(died_early.to_string()))
+}
+
+/// Owns the cpal stream for the process lifetime: each callback downmixes to mono,
+/// resamples to 16 kHz and forwards to the processor. Returns only on setup failure.
+fn capture_on_thread(
+    tx: tokio::sync::mpsc::UnboundedSender<Vec<f32>>,
+    setup_tx: &std::sync::mpsc::SyncSender<Result<(), String>>,
+) -> Result<(), String> {
+    use crate::openhuman::accessibility::{detect_microphone_permission, PermissionState};
+    use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+    use cpal::{SampleFormat, StreamConfig};
+
+    if matches!(detect_microphone_permission(), PermissionState::Denied) {
+        return Err("microphone permission denied".to_string());
+    }
+
+    let host = cpal::default_host();
+    let device = host
+        .default_input_device()
+        .ok_or_else(|| "no default audio input device".to_string())?;
+    let supported = device
+        .default_input_config()
+        .map_err(|e| format!("no default input config: {e}"))?;
+    let source_rate = supported.sample_rate().0;
+    let channels = supported.channels() as usize;
+    let sample_format = supported.sample_format();
+    let stream_config: StreamConfig = supported.into();
+    log::info!(
+        "{LOG_PREFIX} capture device ready rate={source_rate} channels={channels} format={sample_format:?}"
+    );
+
+    // Forward one resampled-to-16k mono chunk per callback.
+    let forward = move |mono_src: Vec<f32>| {
+        let mono16k = resample(&mono_src, source_rate);
+        // Ignore send errors — they mean the processor task is gone (shutdown).
+        let _ = tx.send(mono16k);
+    };
+
+    let err_fn = |e| log::warn!("{LOG_PREFIX} cpal stream error: {e}");
+    let stream = match sample_format {
+        SampleFormat::F32 => device.build_input_stream(
+            &stream_config,
+            move |data: &[f32], _| forward(to_mono(data, channels)),
+            err_fn,
+            None,
+        ),
+        SampleFormat::I16 => device.build_input_stream(
+            &stream_config,
+            move |data: &[i16], _| {
+                let floats: Vec<f32> = data.iter().map(|&s| s as f32 / 32768.0).collect();
+                forward(to_mono(&floats, channels));
+            },
+            err_fn,
+            None,
+        ),
+        SampleFormat::U16 => device.build_input_stream(
+            &stream_config,
+            move |data: &[u16], _| {
+                let floats: Vec<f32> = data.iter().map(|&s| s as f32 / 32768.0 - 1.0).collect();
+                forward(to_mono(&floats, channels));
+            },
+            err_fn,
+            None,
+        ),
+        other => return Err(format!("unsupported sample format: {other:?}")),
+    }
+    .map_err(|e| format!("failed to build input stream: {e}"))?;
+
+    stream
+        .play()
+        .map_err(|e| format!("failed to start stream: {e}"))?;
+    let _ = setup_tx.send(Ok(())); // readiness handshake: unblocks the spawner
+    log::info!("{LOG_PREFIX} microphone stream live");
+
+    // Keep the stream (and thus this thread) alive for the process lifetime.
+    loop {
+        std::thread::sleep(std::time::Duration::from_secs(3600));
+    }
+}
+
+/// Keep [`PAUSED`] in sync with the screen-lock state by polling it every two
+/// seconds, so always-on never captures what is spoken at the lock screen.
+/// macOS-only for now (Quartz session dictionary); elsewhere capture never pauses.
+fn spawn_lock_watcher() {
+    #[cfg(target_os = "macos")]
+    tokio::spawn(async move {
+        let poll_interval = std::time::Duration::from_secs(2);
+        let mut was_locked = false;
+        loop {
+            let locked = macos_lock::is_screen_locked();
+            if locked != was_locked {
+                // Log and publish transitions only, not every poll.
+                let state = if locked { "locked" } else { "unlocked" };
+                let action = if locked { "pausing" } else { "resuming" };
+                log::info!("{LOG_PREFIX} screen {} → {}", state, action);
+                PAUSED.store(locked, Ordering::Relaxed);
+                was_locked = locked;
+            }
+            tokio::time::sleep(poll_interval).await;
+        }
+    });
+    #[cfg(not(target_os = "macos"))]
+    {
+        log::debug!("{LOG_PREFIX} screen-lock watcher unavailable on this platform");
+    }
+}
+
+/// macOS screen-lock detection via the Quartz session dictionary.
+///
+/// `CGSessionCopyCurrentDictionary` exposes `CGSSessionScreenIsLocked`; we read
+/// it defensively (null dict ⇒ no session, treated as locked; missing/odd value
+/// ⇒ unlocked) and never assume the CF value's concrete type without checking.
+#[cfg(target_os = "macos")]
+mod macos_lock {
+    use std::ffi::{c_void, CString};
+
+    type CFTypeRef = *const c_void;
+
+    #[link(name = "CoreGraphics", kind = "framework")]
+    extern "C" {
+        fn CGSessionCopyCurrentDictionary() -> CFTypeRef;
+    }
+    #[link(name = "CoreFoundation", kind = "framework")]
+    extern "C" {
+        fn CFDictionaryGetValue(dict: CFTypeRef, key: CFTypeRef) -> CFTypeRef;
+        fn CFStringCreateWithCString(alloc: CFTypeRef, c: *const i8, enc: u32) -> CFTypeRef;
+        fn CFGetTypeID(v: CFTypeRef) -> usize;
+        fn CFBooleanGetTypeID() -> usize;
+        fn CFBooleanGetValue(b: CFTypeRef) -> u8;
+        fn CFNumberGetTypeID() -> usize;
+        fn CFNumberGetValue(n: CFTypeRef, the_type: i64, out: *mut c_void) -> u8;
+        fn CFRelease(v: CFTypeRef);
+    }
+    const KCF_STRING_ENCODING_UTF8: u32 = 0x0800_0100; // kCFStringEncodingUTF8
+    const KCF_NUMBER_SINT32: i64 = 3; // kCFNumberSInt32Type
+
+    /// True when the screen is locked (or there is no active GUI session).
+    pub fn is_screen_locked() -> bool {
+        // SAFETY: standard Quartz/CoreFoundation calls. Ownership: the session
+        // dict and the key string are +1 (Create/Copy) and released here; the
+        // dictionary value is borrowed and must not be released.
+        unsafe {
+            let dict = CGSessionCopyCurrentDictionary();
+            if dict.is_null() {
+                return true; // no session (loginwindow) — treat as locked
+            }
+            let Ok(key_c) = CString::new("CGSSessionScreenIsLocked") else {
+                CFRelease(dict);
+                return false;
+            };
+            let key = CFStringCreateWithCString(
+                std::ptr::null(),
+                key_c.as_ptr(),
+                KCF_STRING_ENCODING_UTF8,
+            );
+            if key.is_null() {
+                CFRelease(dict);
+                return false;
+            }
+            let value = CFDictionaryGetValue(dict, key); // borrowed
+            let locked = if value.is_null() {
+                false // key absent ⇒ unlocked
+            } else {
+                let tid = CFGetTypeID(value);
+                if tid == CFBooleanGetTypeID() {
+                    CFBooleanGetValue(value) != 0
+                } else if tid == CFNumberGetTypeID() {
+                    let mut n: i32 = 0;
+                    CFNumberGetValue(value, KCF_NUMBER_SINT32, &mut n as *mut i32 as *mut c_void);
+                    n != 0
+                } else {
+                    false // unexpected CF type ⇒ unlocked
+                }
+            };
+            CFRelease(key); // balances CFStringCreateWithCString
+            CFRelease(dict); // balances CGSessionCopyCurrentDictionary
+            locked
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn cfg() -> VadConfig {
+        VadConfig {
+            onset_threshold: 0.01,
+            hangover_ms: 100,
+            min_speech_ms: 60,
+            max_utterance_ms: 1000,
+        }
+    }
+
+    /// Drive `n` frames of constant `rms` at `frame_ms` each, collecting events.
+    fn drive(seg: &mut VadSegmenter, rms: f32, frame_ms: u32, n: u32) -> Vec<VadEvent> {
+        (0..n)
+            .filter_map(|_| seg.push_frame(rms, frame_ms))
+            .collect()
+    }
+
+    #[test]
+    fn silence_emits_nothing() {
+        let mut seg = VadSegmenter::new(cfg());
+        assert!(drive(&mut seg, 0.0, 20, 50).is_empty());
+        assert!(!seg.is_speaking());
+    }
+
+    #[test]
+    fn onset_then_hangover_emits_one_utterance() {
+        let mut seg = VadSegmenter::new(cfg());
+        // First loud frame opens the utterance.
+        assert_eq!(seg.push_frame(0.2, 20), Some(VadEvent::SpeechStart));
+        assert!(seg.is_speaking());
+        // More speech, no event yet.
+        assert!(drive(&mut seg, 0.2, 20, 5).is_empty());
+        // Silence shorter than hangover: still open.
+        assert!(seg.push_frame(0.0, 20).is_none()); // 20ms silence
+        assert!(seg.push_frame(0.0, 20).is_none()); // 40ms
+        assert!(seg.push_frame(0.0, 20).is_none()); // 60ms
+        assert!(seg.push_frame(0.0, 20).is_none()); // 80ms
+        // Crossing the 100ms hangover closes it.
+        let ev = seg.push_frame(0.0, 20).unwrap(); // 100ms
+        match ev {
+            VadEvent::SpeechEnd { emit, forced, .. } => {
+                assert!(emit, "120ms voiced should clear the 60ms min");
+                assert!(!forced);
+            }
+            other => panic!("expected SpeechEnd, got {other:?}"),
+        }
+        assert!(!seg.is_speaking());
+    }
+
+    #[test]
+    fn short_blip_is_dropped() {
+        let mut seg = VadSegmenter::new(cfg());
+        // One 20ms loud frame (below the 60ms min), then silence to close.
+        assert_eq!(seg.push_frame(0.2, 20), Some(VadEvent::SpeechStart));
+        let mut ev = None;
+        for _ in 0..5 {
+            if let Some(e) = seg.push_frame(0.0, 20) {
+                ev = Some(e);
+                break;
+            }
+        }
+        match ev.expect("utterance should close") {
+            VadEvent::SpeechEnd {
+                voiced_ms, emit, ..
+            } => {
+                assert_eq!(voiced_ms, 20);
+                assert!(!emit, "20ms < 60ms min_speech ⇒ dropped");
+            }
+            other => panic!("expected SpeechEnd, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn mid_utterance_pause_does_not_split() {
+        let mut seg = VadSegmenter::new(cfg());
+        seg.push_frame(0.2, 20);
+        // 80ms pause (< 100ms hangover) then speech resumes — one utterance.
+        for _ in 0..4 {
+            assert!(seg.push_frame(0.0, 20).is_none());
+        }
+        assert!(
+            seg.is_speaking(),
+            "pause under hangover keeps utterance open"
+        );
+        assert!(drive(&mut seg, 0.2, 20, 3).is_empty());
+        assert!(seg.is_speaking());
+    }
+
+    #[test]
+    fn max_utterance_forces_flush() {
+        let mut seg = VadSegmenter::new(cfg()); // max 1000ms
+        seg.push_frame(0.2, 20);
+        // Keep talking past the ceiling; silence never triggers the close.
+        let mut forced_seen = false;
+        for _ in 0..60 {
+            if let Some(VadEvent::SpeechEnd { forced, emit, .. }) = seg.push_frame(0.2, 20) {
+                assert!(forced, "loud-throughout close must be the ceiling");
+                assert!(emit);
+                forced_seen = true;
+                break;
+            }
+        }
+        assert!(forced_seen, "should force-flush at max_utterance_ms");
+        assert!(!seg.is_speaking());
+    }
+
+    #[test]
+    fn reset_aborts_without_event() {
+        let mut seg = VadSegmenter::new(cfg());
+        seg.push_frame(0.2, 20);
+        assert!(seg.is_speaking());
+        seg.reset();
+        assert!(!seg.is_speaking());
+        // After reset, a fresh onset starts a new utterance.
+        assert_eq!(seg.push_frame(0.2, 20), Some(VadEvent::SpeechStart));
+    }
+
+    #[test]
+    fn from_server_config_maps_seconds_to_ms() {
+        let mut c = CfgVoiceServer::default();
+        c.vad_max_utterance_secs = 2.5;
+        c.vad_hangover_ms = 750;
+        let v = VadConfig::from_server_config(&c);
+        assert_eq!(v.max_utterance_ms, 2500);
+        assert_eq!(v.hangover_ms, 750);
+        assert_eq!(v.onset_threshold, c.vad_onset_threshold);
+    }
+
+    #[test]
+    fn wake_word_extracts_command() {
+        // Case/punctuation tolerant; strips the phrase, keeps the command.
+        assert_eq!(
+            extract_command("Hey Tiny, play Numb by Linkin Park", "Hey Tiny").as_deref(),
+            Some("play numb by linkin park")
+        );
+        assert_eq!(
+            extract_command("hey tiny open slack", "Hey Tiny").as_deref(),
+            Some("open slack")
+        );
+        // Leading filler before the wake phrase is tolerated.
+        assert_eq!(
+            extract_command("um, hey tiny what time is it", "Hey Tiny").as_deref(),
+            Some("what time is it")
+        );
+    }
+
+    #[test]
+    fn wake_word_tolerates_stt_homophones() {
+        // STT often mangles "Hey Tiny" — accept close variants of the anchor.
+        assert_eq!(
+            extract_command("Hey Tony, play music", "Hey Tiny").as_deref(),
+            Some("play music")
+        );
+        assert_eq!(
+            extract_command("a tinny open slack", "Hey Tiny").as_deref(),
+            Some("open slack")
+        );
+        // The greeting is not required: the anchor alone, if early enough, triggers.
+        assert_eq!(
+            extract_command("the tiny details matter here a lot", "Hey Tiny").as_deref(),
+            // "tiny" at index 1 → command is the rest; documents the known
+            // trade-off that an early "tiny" can trigger.
+            Some("details matter here a lot")
+        );
+    }
+
+    #[test]
+    fn wake_word_absent_is_ignored() {
+        assert_eq!(extract_command("play some music", "Hey Tiny"), None);
+        // Wake word alone with no command → nothing to do.
+        assert_eq!(extract_command("Hey Tiny", "Hey Tiny"), None);
+        assert_eq!(extract_command("hey tiny!", "Hey Tiny"), None);
+    }
+
+    #[test]
+    fn empty_wake_word_passes_everything() {
+        assert_eq!(
+            extract_command("just say this", "").as_deref(),
+            Some("just say this")
+        );
+        assert_eq!(extract_command("   ", ""), None);
+    }
+}
diff --git a/src/openhuman/voice/audio_capture.rs b/src/openhuman/voice/audio_capture.rs
index bc3aa08c71..c80dfe2a26 100644
--- a/src/openhuman/voice/audio_capture.rs
+++ b/src/openhuman/voice/audio_capture.rs
@@ -16,7 +16,7 @@ use tokio::sync::oneshot;
 const LOG_PREFIX: &str = "[voice_capture]";
 
 /// Target sample rate for whisper (16 kHz mono).
-const TARGET_SAMPLE_RATE: u32 = 16_000;
+pub(crate) const TARGET_SAMPLE_RATE: u32 = 16_000;
 
 /// RMS threshold below which audio is considered silence.
 const SILENCE_RMS_THRESHOLD: f32 = 0.002;
@@ -102,8 +102,35 @@ impl SilenceGate {
     }
 }
 
+/// Serialize mono f32 samples that are already at 16 kHz into an in-memory WAV
+/// (16-bit signed PCM, one channel). Shared by the one-shot recorder's finalize
+/// path and the always-on loop (`voice::always_on`) so both emit identical WAV.
+pub(crate) fn encode_wav_16k(samples_16k: &[f32]) -> Result<Vec<u8>, String> {
+    let mut cursor = Cursor::new(Vec::new());
+    {
+        let spec = WavSpec {
+            channels: 1,
+            sample_rate: TARGET_SAMPLE_RATE,
+            bits_per_sample: 16,
+            sample_format: HoundFormat::Int,
+        };
+        let mut wav =
+            WavWriter::new(&mut cursor, spec).map_err(|e| format!("WAV writer error: {e}"))?;
+        // Clamp to [-1, 1] before scaling to i16 so out-of-range samples
+        // saturate at full scale.
+        let pcm = samples_16k.iter().map(|&s| (s.clamp(-1.0, 1.0) * 32767.0) as i16);
+        for value in pcm {
+            wav.write_sample(value)
+                .map_err(|e| format!("WAV write error: {e}"))?;
+        }
+        wav.finalize()
+            .map_err(|e| format!("WAV finalize error: {e}"))?;
+    }
+    Ok(cursor.into_inner())
+}
+
 /// Compute RMS energy for a chunk of mono samples.
-fn chunk_rms(samples: &[f32]) -> f32 {
+pub(crate) fn chunk_rms(samples: &[f32]) -> f32 {
     if samples.is_empty() {
         return 0.0;
     }
@@ -493,7 +520,7 @@ pub fn list_input_devices() -> Result<Vec<String>, String> {
 }
 
 /// Convert interleaved multi-channel samples to mono by averaging channels.
-fn to_mono(samples: &[f32], channels: usize) -> Vec<f32> {
+pub(crate) fn to_mono(samples: &[f32], channels: usize) -> Vec<f32> {
     if channels <= 1 {
         return samples.to_vec();
     }
@@ -506,7 +533,7 @@ fn to_mono(samples: &[f32], channels: usize) -> Vec<f32> {
 
 /// Resample mono f32 samples from `source_rate` to `TARGET_SAMPLE_RATE` using
 /// linear interpolation. Good enough for voice dictation quality.
-fn resample(samples: &[f32], source_rate: u32) -> Vec<f32> {
+pub(crate) fn resample(samples: &[f32], source_rate: u32) -> Vec<f32> {
     if source_rate == TARGET_SAMPLE_RATE {
         return samples.to_vec();
     }
diff --git a/src/openhuman/voice/mod.rs b/src/openhuman/voice/mod.rs
index 40344e9d16..ae16576cfe 100644
--- a/src/openhuman/voice/mod.rs
+++ b/src/openhuman/voice/mod.rs
@@ -9,6 +9,7 @@
 //! `crate::openhuman::inference::voice` so all inference concerns share a
 //! single domain root.
 
+pub mod always_on;
 pub mod audio_capture;
 pub(crate) mod cli;
 pub mod dictation_listener;
diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs
index af42faaaa4..92a41640fa 100644
--- a/tests/json_rpc_e2e.rs
+++ b/tests/json_rpc_e2e.rs
@@ -9853,204 +9853,86 @@ async fn json_rpc_workflows_lifecycle_round_trip() {
     rpc_join.abort();
 }
 
-// ── Model resolution + agent profile switching ──────────────────────────
-
-#[tokio::test]
-async fn json_rpc_inference_resolve_model_returns_tier_for_hints() {
-    let _env_lock = json_rpc_e2e_env_lock();
-    let tmp = tempdir().expect("tempdir");
-    let home = tmp.path();
-    let openhuman_dir = home.join(".openhuman");
-
-    let _home_guard = EnvVarGuard::set_to_path("HOME", home);
-    let _workspace_guard = EnvVarGuard::set_to_path("OPENHUMAN_WORKSPACE", &openhuman_dir);
-
-    let (api_addr, api_join) = serve_on_ephemeral(mock_upstream_router()).await;
-    let api_origin = format!("http://{api_addr}");
-    write_min_config(&openhuman_dir, &api_origin);
-
-    let (rpc_addr, rpc_join) = serve_on_ephemeral(build_core_http_router(false)).await;
-    let rpc_base = format!("http://{rpc_addr}");
-
-    let res = post_json_rpc(
-        &rpc_base,
-        9900_1,
-        "openhuman.inference_resolve_model",
-        json!({ "hint": "hint:reasoning" }),
-    )
-    .await;
-    let result = assert_no_jsonrpc_error(&res, "resolve_model hint:reasoning");
-    let model = result
-        .get("model")
-        .and_then(Value::as_str)
-        .expect("model field");
-    assert_eq!(model, "reasoning-v1");
-
-    let res = post_json_rpc(
-        &rpc_base,
-        9900_2,
-        "openhuman.inference_resolve_model",
-        json!({ "hint": "hint:chat" }),
-    )
-    .await;
-    let result = assert_no_jsonrpc_error(&res, "resolve_model hint:chat");
-    let model = result
-        .get("model")
-        .and_then(Value::as_str)
-        .expect("model field");
-    assert_eq!(model, "reasoning-quick-v1");
-
-    let res = post_json_rpc(
-        &rpc_base,
-        9900_3,
-        "openhuman.inference_resolve_model",
-        json!({ "hint": "hint:coding" }),
-    )
-    .await;
-    let result = assert_no_jsonrpc_error(&res, "resolve_model hint:coding");
-    let model = result
-        .get("model")
-        .and_then(Value::as_str)
-        .expect("model field");
-    assert_eq!(model, "coding-v1");
-
-    let res = post_json_rpc(
-        &rpc_base,
-        9900_4,
-        "openhuman.inference_resolve_model",
-        json!({ "hint": "reasoning-v1" }),
-    )
-    .await;
-    let result = assert_no_jsonrpc_error(&res, "resolve_model tier passthrough");
-    let model = result
-        .get("model")
-        .and_then(Value::as_str)
-        .expect("model field");
-    assert_eq!(model, "reasoning-v1");
-
-    api_join.abort();
-    rpc_join.abort();
-}
-
+/// E2E: voice-server settings round-trip over JSON-RPC — Phase 2 always-on
+/// toggle + "Hey Tiny" wake word. Regression guard for the bug where the
+/// Settings toggle silently did nothing because `always_on_enabled` was absent
+/// from the `update_voice_server_settings` controller param schema (rejected as
+/// "unknown param 'always_on_enabled'" before reaching the handler).
 #[tokio::test]
-async fn json_rpc_agent_profile_select_and_resolve_model_integration() {
+async fn json_rpc_voice_server_settings_roundtrip_always_on_and_wake_word() {
     let _env_lock = json_rpc_e2e_env_lock();
     let tmp = tempdir().expect("tempdir");
     let home = tmp.path();
-    let openhuman_dir = home.join(".openhuman");
+    let openhuman_home = home.join(".openhuman");
 
     let _home_guard = EnvVarGuard::set_to_path("HOME", home);
-    let _workspace_guard = EnvVarGuard::set_to_path("OPENHUMAN_WORKSPACE", &openhuman_dir);
+    let _workspace_guard = EnvVarGuard::unset("OPENHUMAN_WORKSPACE");
+    let _backend_url_guard = EnvVarGuard::unset("BACKEND_URL");
+    let _vite_backend_guard = EnvVarGuard::unset("VITE_BACKEND_URL");
 
-    let (api_addr, api_join) = serve_on_ephemeral(mock_upstream_router()).await;
-    let api_origin = format!("http://{api_addr}");
-    write_min_config(&openhuman_dir, &api_origin);
+    write_min_config(&openhuman_home, "http://127.0.0.1:9");
 
     let (rpc_addr, rpc_join) = serve_on_ephemeral(build_core_http_router(false)).await;
-    let rpc_base = format!("http://{rpc_addr}");
+    let rpc_base = format!("http://{}", rpc_addr);
+    tokio::time::sleep(Duration::from_millis(100)).await;
 
-    // List profiles — should include built-in 'default' and 'reasoning'
-    let res = post_json_rpc(
+    // GET defaults — wake_word "Hey Tiny", always-on off.
+    let initial = post_json_rpc(
         &rpc_base,
-        9901_1,
-        "openhuman.agent_profiles_list",
+        7401,
+        "openhuman.config_get_voice_server_settings",
         json!({}),
     )
     .await;
-    let result = assert_no_jsonrpc_error(&res, "agent_profiles_list");
-    let profiles = result
-        .get("profiles")
-        .and_then(Value::as_array)
-        .expect("profiles array");
-    let profile_ids: Vec<&str> = profiles
-        .iter()
-        .filter_map(|p| p.get("id").and_then(Value::as_str))
-        .collect();
-    assert!(
-        profile_ids.contains(&"default"),
-        "should contain default profile"
+    let initial_outer = assert_no_jsonrpc_error(&initial, "get_voice_server_settings initial");
+    assert_eq!(
+        initial_outer
+            .get("result")
+            .and_then(|r| r.get("always_on_enabled"))
+            .and_then(Value::as_bool),
+        Some(false),
+        "default always_on_enabled should be false, envelope: {initial_outer}"
     );
-    assert!(
-        profile_ids.contains(&"reasoning"),
-        "should contain reasoning profile"
+    assert_eq!(
+        initial_outer
+            .get("result")
+            .and_then(|r| r.get("wake_word"))
+            .and_then(Value::as_str),
+        Some("Hey Tiny"),
+        "default wake_word should be 'Hey Tiny', envelope: {initial_outer}"
     );
 
-    // Select reasoning profile
-    let res = post_json_rpc(
-        &rpc_base,
-        9901_2,
-        "openhuman.agent_profile_select",
-        json!({ "profile_id": "reasoning" }),
-    )
-    .await;
-    let result = assert_no_jsonrpc_error(&res, "agent_profile_select reasoning");
-    let active = result
-        .get("activeProfileId")
-        .and_then(Value::as_str)
-        .expect("activeProfileId");
-    assert_eq!(active, "reasoning");
-
-    // Verify the reasoning profile has hint:reasoning model override
-    let reasoning_profile = result
-        .get("profiles")
-        .and_then(Value::as_array)
-        .expect("profiles")
-        .iter()
-        .find(|p| p.get("id").and_then(Value::as_str) == Some("reasoning"))
-        .expect("reasoning profile in response");
-    let model_override = reasoning_profile
-        .get("modelOverride")
-        .and_then(Value::as_str)
-        .expect("modelOverride");
-    assert_eq!(model_override, "hint:reasoning");
-
-    // Resolve the model for this profile's override
-    let res = post_json_rpc(
+    // UPDATE — change the wake word and pass `always_on_enabled` (the param that
+    // used to be rejected). Kept false so the test never opens a real mic.
+    let update = post_json_rpc(
         &rpc_base,
-        9901_3,
-        "openhuman.inference_resolve_model",
-        json!({ "hint": model_override }),
+        7402,
+        "openhuman.config_update_voice_server_settings",
+        json!({ "always_on_enabled": false, "wake_word": "Computer" }),
     )
     .await;
-    let result = assert_no_jsonrpc_error(&res, "resolve_model for reasoning profile");
-    let resolved = result
-        .get("model")
-        .and_then(Value::as_str)
-        .expect("resolved model");
-    assert_eq!(resolved, "reasoning-v1");
+    assert_no_jsonrpc_error(
+        &update,
+        "update_voice_server_settings (always_on_enabled + wake_word)",
+    );
 
-    // Switch back to default and resolve
-    let res = post_json_rpc(
+    // GET again — wake word persisted, no error.
+    let after = post_json_rpc(
         &rpc_base,
-        9901_4,
-        "openhuman.agent_profile_select",
-        json!({ "profile_id": "default" }),
+        7403,
+        "openhuman.config_get_voice_server_settings",
+        json!({}),
     )
     .await;
-    let result = assert_no_jsonrpc_error(&res, "agent_profile_select default");
+    let after_outer = assert_no_jsonrpc_error(&after, "get_voice_server_settings after update");
     assert_eq!(
-        result
-            .get("activeProfileId")
-            .and_then(Value::as_str)
-            .unwrap(),
-        "default"
+        after_outer
+            .get("result")
+            .and_then(|r| r.get("wake_word"))
+            .and_then(Value::as_str),
+        Some("Computer"),
+        "wake_word should persist, envelope: {after_outer}"
     );
 
-    // Default profile has no model_override — resolve with hint:chat
-    let res = post_json_rpc(
-        &rpc_base,
-        9901_5,
-        "openhuman.inference_resolve_model",
-        json!({ "hint": "hint:chat" }),
-    )
-    .await;
-    let result = assert_no_jsonrpc_error(&res, "resolve_model for default profile");
-    let resolved = result
-        .get("model")
-        .and_then(Value::as_str)
-        .expect("resolved model");
-    assert_eq!(resolved, "reasoning-quick-v1");
-
-    api_join.abort();
     rpc_join.abort();
 }

From 77003ca16d3af81265ddc3ae33cba1fa23fdff5a Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 14:21:29 +0530
Subject: [PATCH 5/9] feat(voice): always-on Settings toggle + debug panel +
 i18n

Surfaces the always-on listening toggle in the reachable Voice panel,
mirrors it in the existing VoiceDebugPanel (and includes
always_on_enabled in that panel's save payload), and adds the voice
tauri-command wrapper and the RPC client method. Adds the
voice.debug.alwaysOn / voice.debug.alwaysOnDesc and notch.* i18n keys
across the 14 locales (notch keys land here as inert strings; the notch
UI that consumes them ships in slice 6).

Slice 5/7 of #3307 (always-on frontend).
---
 .../settings/panels/VoiceDebugPanel.tsx       | 32 ++++++++++++
 .../components/settings/panels/VoicePanel.tsx | 52 +++++++++++++++++++
 .../panels/__tests__/VoicePanel.test.tsx      |  1 +
 app/src/lib/i18n/ar.ts                        | 10 ++++
 app/src/lib/i18n/bn.ts                        | 10 ++++
 app/src/lib/i18n/de.ts                        | 10 ++++
 app/src/lib/i18n/en.ts                        | 10 ++++
 app/src/lib/i18n/es.ts                        | 10 ++++
 app/src/lib/i18n/fr.ts                        | 10 ++++
 app/src/lib/i18n/hi.ts                        | 10 ++++
 app/src/lib/i18n/id.ts                        | 10 ++++
 app/src/lib/i18n/it.ts                        | 10 ++++
 app/src/lib/i18n/ko.ts                        | 10 ++++
 app/src/lib/i18n/pl.ts                        | 10 ++++
 app/src/lib/i18n/pt.ts                        | 10 ++++
 app/src/lib/i18n/ru.ts                        | 10 ++++
 app/src/lib/i18n/zh-CN.ts                     | 10 ++++
 app/src/services/coreRpcClient.ts             | 12 +++++
 app/src/utils/tauriCommands/voice.ts          |  3 ++
 19 files changed, 240 insertions(+)

diff --git a/app/src/components/settings/panels/VoiceDebugPanel.tsx b/app/src/components/settings/panels/VoiceDebugPanel.tsx
index b3222c73e6..a7384467ce 100644
--- a/app/src/components/settings/panels/VoiceDebugPanel.tsx
+++ b/app/src/components/settings/panels/VoiceDebugPanel.tsx
@@ -102,6 +102,7 @@ const VoiceDebugPanel = () => {
         min_duration_secs: settings.min_duration_secs,
         silence_threshold: settings.silence_threshold,
         custom_dictionary: settings.custom_dictionary,
+        always_on_enabled: settings.always_on_enabled,
       });
       setNotice(t('voice.debug.settingsSaved'));
       await loadData(true);
@@ -203,6 +204,37 @@ const VoiceDebugPanel = () => {
 
             {settings && (
               <>
+                {/* Always-on listening (Phase 2) — opt-in, privacy-sensitive. */}
+                <div className="flex items-start justify-between gap-3 rounded-md border border-stone-200 dark:border-neutral-800 bg-white dark:bg-neutral-900 px-3 py-2.5">
+                  <div className="min-w-0">
+                    <span className="text-xs font-medium text-stone-700 dark:text-neutral-200">
+                      {t('voice.debug.alwaysOn')}
+                    </span>
+                    <p className="text-[11px] text-stone-400 dark:text-neutral-500 mt-0.5">
+                      {t('voice.debug.alwaysOnDesc')}
+                    </p>
+                  </div>
+                  <button
+                    type="button"
+                    role="switch"
+                    aria-checked={settings.always_on_enabled}
+                    aria-label={t('voice.debug.alwaysOn')}
+                    data-testid="voice-always-on-toggle"
+                    onClick={() => updateSetting('always_on_enabled', !settings.always_on_enabled)}
+                    className={`relative inline-flex h-4 w-7 shrink-0 items-center rounded-full transition-colors ${
+                      settings.always_on_enabled
+                        ? 'bg-primary-500'
+                        : 'bg-stone-300 dark:bg-neutral-600'
+                    }`}>
+                    <span
+                      aria-hidden
+                      className={`inline-block h-3 w-3 transform rounded-full bg-white shadow transition-transform ${
+                        settings.always_on_enabled ? 'translate-x-3.5' : 'translate-x-0.5'
+                      }`}
+                    />
+                  </button>
+                </div>
+
                 <label className="block space-y-1">
                   <span className="text-xs font-medium text-stone-600 dark:text-neutral-300">
                     {t('voice.debug.minimumRecordingSeconds')}
diff --git a/app/src/components/settings/panels/VoicePanel.tsx b/app/src/components/settings/panels/VoicePanel.tsx
index 88151611ee..684d5f2a9d 100644
--- a/app/src/components/settings/panels/VoicePanel.tsx
+++ b/app/src/components/settings/panels/VoicePanel.tsx
@@ -19,6 +19,7 @@ import {
 } from '../../../services/api/voiceSettingsApi';
 import {
   openhumanGetVoiceServerSettings,
+  openhumanUpdateVoiceServerSettings,
   openhumanVoiceSetProviders,
   openhumanVoiceStatus,
   type VoiceProvidersSnapshot,
@@ -485,6 +486,57 @@ const VoicePanel = ({ embedded = false }: VoicePanelProps = {}) => {
       )}
 
       <div className={embedded ? 'space-y-4' : 'p-4 space-y-4'}>
+        {/* ─── Always-on listening (Phase 2) ──────────────────────────── */}
+        {settings && (
+          <section className="space-y-3">
+            <div className="bg-stone-50 dark:bg-neutral-800/60 rounded-lg border border-stone-200 dark:border-neutral-800 p-4">
+              <div className="flex items-start justify-between gap-3">
+                <div className="min-w-0">
+                  <h3 className="text-sm font-semibold text-stone-900 dark:text-neutral-100">
+                    {t('voice.debug.alwaysOn')}
+                  </h3>
+                  <p className="text-xs text-stone-500 dark:text-neutral-400 mt-1">
+                    {t('voice.debug.alwaysOnDesc')}
+                  </p>
+                </div>
+                <button
+                  type="button"
+                  role="switch"
+                  aria-checked={settings.always_on_enabled}
+                  aria-label={t('voice.debug.alwaysOn')}
+                  data-testid="voice-always-on-toggle"
+                  onClick={async () => {
+                    const next = !settings.always_on_enabled;
+                    setSettings(current =>
+                      current ? { ...current, always_on_enabled: next } : current
+                    );
+                    try {
+                      await openhumanUpdateVoiceServerSettings({ always_on_enabled: next });
+                    } catch (err) {
+                      // Revert on failure so the UI reflects the persisted value.
+                      setSettings(current =>
+                        current ? { ...current, always_on_enabled: !next } : current
+                      );
+                      console.error('[VoicePanel] failed to toggle always-on', err);
+                    }
+                  }}
+                  className={`relative mt-0.5 inline-flex h-4 w-7 shrink-0 items-center rounded-full transition-colors ${
+                    settings.always_on_enabled
+                      ? 'bg-primary-500'
+                      : 'bg-stone-300 dark:bg-neutral-600'
+                  }`}>
+                  <span
+                    aria-hidden
+                    className={`inline-block h-3 w-3 transform rounded-full bg-white shadow transition-transform ${
+                      settings.always_on_enabled ? 'translate-x-3.5' : 'translate-x-0.5'
+                    }`}
+                  />
+                </button>
+              </div>
+            </div>
+          </section>
+        )}
+
         {/* ─── Section 1: Voice Provider Chips ─────────────────────────── */}
         <section className="space-y-3">
           <div
diff --git a/app/src/components/settings/panels/__tests__/VoicePanel.test.tsx b/app/src/components/settings/panels/__tests__/VoicePanel.test.tsx
index 5f0d3b337e..6d295d52bc 100644
--- a/app/src/components/settings/panels/__tests__/VoicePanel.test.tsx
+++ b/app/src/components/settings/panels/__tests__/VoicePanel.test.tsx
@@ -111,6 +111,7 @@ describe('VoicePanel', () => {
         min_duration_secs: 0.3,
         silence_threshold: 0.002,
         custom_dictionary: [],
+        always_on_enabled: false,
       },
       voiceStatus: {
         stt_available: true,
diff --git a/app/src/lib/i18n/ar.ts b/app/src/lib/i18n/ar.ts
index 8e8ed9feef..fb1df3b0fc 100644
--- a/app/src/lib/i18n/ar.ts
+++ b/app/src/lib/i18n/ar.ts
@@ -1389,6 +1389,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'عتبة الصمت (RMS)',
   'voice.debug.silenceThresholdDesc':
     'تُعامَل التسجيلات ذات الطاقة الأدنى من هذا الحد كصمت ويُتخطى فيها. كلما كانت القيمة أصغر، كان النظام أكثر حساسية.',
+  'voice.debug.alwaysOn': 'الاستماع الدائم',
+  'voice.debug.alwaysOnDesc':
+    'أبقِ الميكروفون مفتوحًا وأرسل ما تقوله إلى الوكيل تلقائيًا دون مفتاح اختصار. يتوقف مؤقتًا عند قفل الشاشة.',
   'voice.providers.saved': 'تم حفظ موفري الصوت.',
   'voice.providers.failedToSave': 'فشل في حفظ موفري الصوت',
   'voice.providers.ellipsis': '…',
@@ -4674,6 +4677,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'إضافة كسياق إضافي',
   'runQueue.status': '{total} في الانتظار',
   'runQueue.cleared': 'تم مسح قائمة الانتظار',
+  'notch.ready': 'جاهز',
+  'notch.processing': 'جارٍ المعالجة…',
+  'notch.listening': 'أستمع…',
+  'notch.thinking': 'أفكر…',
+  'notch.speaking': 'أتحدث…',
+  'notch.transcribing': 'أفسّر…',
+  'notch.executing': 'أنفّذ…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/bn.ts b/app/src/lib/i18n/bn.ts
index bfd8673a6f..1243015c11 100644
--- a/app/src/lib/i18n/bn.ts
+++ b/app/src/lib/i18n/bn.ts
@@ -1417,6 +1417,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'সাইলেন্স থ্রেশহোল্ড (RMS)',
   'voice.debug.silenceThresholdDesc':
     'এই মানের নিচে শক্তির রেকর্ডিংগুলি নীরবতা হিসেবে গণ্য হয় এবং এড়িয়ে যাওয়া হয়। কম মান = আরও সংবেদনশীল।',
+  'voice.debug.alwaysOn': 'সবসময় শোনা',
+  'voice.debug.alwaysOnDesc':
+    'মাইক্রোফোন খোলা রাখুন এবং আপনি যা বলেন তা হটকি ছাড়াই স্বয়ংক্রিয়ভাবে এজেন্টের কাছে পাঠান। স্ক্রিন লক হলে থেমে যায়।',
   'voice.providers.saved': 'ভয়েস প্রদানকারী সংরক্ষিত।',
   'voice.providers.failedToSave': 'ভয়েস প্রদানকারী সংরক্ষণ করতে ব্যর্থ',
   'voice.providers.ellipsis': '…',
@@ -4764,6 +4767,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'অতিরিক্ত প্রসঙ্গ হিসেবে যোগ করুন',
   'runQueue.status': '{total}টি সারিবদ্ধ',
   'runQueue.cleared': 'সারি পরিষ্কার করা হয়েছে',
+  'notch.ready': 'প্রস্তুত',
+  'notch.processing': 'প্রক্রিয়াকরণ চলছে…',
+  'notch.listening': 'শুনছি…',
+  'notch.thinking': 'ভাবছি…',
+  'notch.speaking': 'বলছি…',
+  'notch.transcribing': 'ট্রান্সক্রাইব করছি…',
+  'notch.executing': 'চালাচ্ছি…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/de.ts b/app/src/lib/i18n/de.ts
index b705b83586..c15769c990 100644
--- a/app/src/lib/i18n/de.ts
+++ b/app/src/lib/i18n/de.ts
@@ -1458,6 +1458,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Ruheschwelle (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Aufnahmen mit Energie unterhalb dieses Wertes werden als Stille behandelt und übersprungen. Niedriger = empfindlicher.',
+  'voice.debug.alwaysOn': 'Dauerhaftes Zuhören',
+  'voice.debug.alwaysOnDesc':
+    'Hält das Mikrofon offen und sendet das Gesagte automatisch an den Agenten, ohne Tastenkürzel. Pausiert, wenn der Bildschirm gesperrt ist.',
   'voice.providers.saved': 'Sprachanbieter gespeichert.',
   'voice.providers.failedToSave': 'Sprachanbieter konnten nicht gespeichert werden.',
   'voice.providers.ellipsis': '…',
@@ -4899,6 +4902,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Als zusätzlichen Kontext hinzufügen',
   'runQueue.status': '{total} in der Warteschlange',
   'runQueue.cleared': 'Warteschlange geleert',
+  'notch.ready': 'Bereit',
+  'notch.processing': 'Wird verarbeitet…',
+  'notch.listening': 'Höre zu…',
+  'notch.thinking': 'Denke nach…',
+  'notch.speaking': 'Spreche…',
+  'notch.transcribing': 'Transkribiere…',
+  'notch.executing': 'Führe aus…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/en.ts b/app/src/lib/i18n/en.ts
index ec033e1532..05f45bc2fc 100644
--- a/app/src/lib/i18n/en.ts
+++ b/app/src/lib/i18n/en.ts
@@ -1635,6 +1635,9 @@ const en: TranslationMap = {
   'voice.debug.silenceThreshold': 'Silence Threshold (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Recordings with energy below this are treated as silence and skipped. Lower = more sensitive.',
+  'voice.debug.alwaysOn': 'Always-on listening',
+  'voice.debug.alwaysOnDesc':
+    'Keep the microphone open and send what you say to the agent automatically, no hotkey. Pauses when the screen is locked.',
   'voice.providers.saved': 'Voice providers saved.',
   'voice.providers.failedToSave': 'Failed to save voice providers',
   'voice.providers.ellipsis': '…',
@@ -5010,6 +5013,13 @@ const en: TranslationMap = {
   'runQueue.collectHint': 'Add as extra context',
   'runQueue.status': '{total} queued',
   'runQueue.cleared': 'Queue cleared',
+  'notch.ready': 'Ready',
+  'notch.processing': 'Processing…',
+  'notch.listening': 'Listening…',
+  'notch.thinking': 'Thinking…',
+  'notch.speaking': 'Speaking…',
+  'notch.transcribing': 'Transcribing…',
+  'notch.executing': 'Executing…',
 };
 
 export default en;
diff --git a/app/src/lib/i18n/es.ts b/app/src/lib/i18n/es.ts
index 96ee3d78a9..f1b92de356 100644
--- a/app/src/lib/i18n/es.ts
+++ b/app/src/lib/i18n/es.ts
@@ -1452,6 +1452,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Umbral de silencio (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Las grabaciones con energía por debajo de este valor se tratan como silencio y se omiten. Menor = más sensible.',
+  'voice.debug.alwaysOn': 'Escucha continua',
+  'voice.debug.alwaysOnDesc':
+    'Mantén el micrófono abierto y envía lo que dices al agente automáticamente, sin atajo. Se pausa cuando la pantalla está bloqueada.',
   'voice.providers.saved': 'Proveedores de voz guardados.',
   'voice.providers.failedToSave': 'No se pudieron guardar los proveedores de voz',
   'voice.providers.ellipsis': '…',
@@ -4866,6 +4869,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Añadir como contexto adicional',
   'runQueue.status': '{total} en cola',
   'runQueue.cleared': 'Cola vaciada',
+  'notch.ready': 'Listo',
+  'notch.processing': 'Procesando…',
+  'notch.listening': 'Escuchando…',
+  'notch.thinking': 'Pensando…',
+  'notch.speaking': 'Hablando…',
+  'notch.transcribing': 'Transcribiendo…',
+  'notch.executing': 'Ejecutando…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/fr.ts b/app/src/lib/i18n/fr.ts
index 8ac8f1c518..0dc60813e9 100644
--- a/app/src/lib/i18n/fr.ts
+++ b/app/src/lib/i18n/fr.ts
@@ -1456,6 +1456,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Seuil de silence (RMS)',
   'voice.debug.silenceThresholdDesc':
     "Les enregistrements dont l'énergie est inférieure à ce seuil sont traités comme du silence et ignorés. Plus bas = plus sensible.",
+  'voice.debug.alwaysOn': 'Écoute permanente',
+  'voice.debug.alwaysOnDesc':
+    'Garde le microphone ouvert et envoie automatiquement ce que vous dites à l’agent, sans raccourci. Se met en pause lorsque l’écran est verrouillé.',
   'voice.providers.saved': 'Fournisseurs de voix enregistrés.',
   'voice.providers.failedToSave': 'Échec de la sauvegarde des fournisseurs vocaux',
   'voice.providers.ellipsis': '…',
@@ -4881,6 +4884,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Ajouter comme contexte supplémentaire',
   'runQueue.status': '{total} en attente',
   'runQueue.cleared': "File d'attente vidée",
+  'notch.ready': 'Prêt',
+  'notch.processing': 'Traitement…',
+  'notch.listening': "J'écoute…",
+  'notch.thinking': 'Je réfléchis…',
+  'notch.speaking': 'Je parle…',
+  'notch.transcribing': 'Transcription…',
+  'notch.executing': "J'exécute…",
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/hi.ts b/app/src/lib/i18n/hi.ts
index ca1eb50bbf..78dfdb3674 100644
--- a/app/src/lib/i18n/hi.ts
+++ b/app/src/lib/i18n/hi.ts
@@ -1416,6 +1416,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'साइलेंस थ्रेशोल्ड (आरएमएस)',
   'voice.debug.silenceThresholdDesc':
     'इससे कम ऊर्जा वाली रिकॉर्डिंग को साइलेंस माना जाता है और छोड़ दिया जाता है। कम = अधिक संवेदनशील।',
+  'voice.debug.alwaysOn': 'हमेशा-चालू सुनना',
+  'voice.debug.alwaysOnDesc':
+    'माइक्रोफ़ोन खुला रखें और आप जो कहते हैं वह बिना हॉटकी के स्वचालित रूप से एजेंट को भेजें। स्क्रीन लॉक होने पर रुक जाता है।',
   'voice.providers.saved': 'ध्वनि प्रदाता सहेजे गए.',
   'voice.providers.failedToSave': 'ध्वनि प्रदाताओं को सहेजने में विफल',
   'voice.providers.ellipsis': '…',
@@ -4771,6 +4774,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'अतिरिक्त संदर्भ के रूप में जोड़ें',
   'runQueue.status': '{total} कतार में',
   'runQueue.cleared': 'कतार साफ़ की गई',
+  'notch.ready': 'तैयार',
+  'notch.processing': 'प्रोसेस हो रहा है…',
+  'notch.listening': 'सुन रहा हूं…',
+  'notch.thinking': 'सोच रहा हूं…',
+  'notch.speaking': 'बोल रहा हूं…',
+  'notch.transcribing': 'ट्रांसक्राइब कर रहा हूं…',
+  'notch.executing': 'चला रहा हूं…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/id.ts b/app/src/lib/i18n/id.ts
index 5504c1c487..11f2d79b45 100644
--- a/app/src/lib/i18n/id.ts
+++ b/app/src/lib/i18n/id.ts
@@ -1423,6 +1423,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Ambang Batas Senyap (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Rekaman dengan energi di bawah nilai ini dianggap sebagai keheningan dan dilewati. Lebih rendah = lebih sensitif.',
+  'voice.debug.alwaysOn': 'Mendengarkan terus-menerus',
+  'voice.debug.alwaysOnDesc':
+    'Biarkan mikrofon tetap aktif dan kirim ucapan Anda ke agen secara otomatis, tanpa pintasan. Berhenti sementara saat layar terkunci.',
   'voice.providers.saved': 'Penyedia suara disimpan.',
   'voice.providers.failedToSave': 'Gagal menyimpan penyedia suara',
   'voice.providers.ellipsis': '…',
@@ -4783,6 +4786,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Tambahkan sebagai konteks tambahan',
   'runQueue.status': '{total} dalam antrean',
   'runQueue.cleared': 'Antrean dikosongkan',
+  'notch.ready': 'Siap',
+  'notch.processing': 'Memproses…',
+  'notch.listening': 'Mendengar…',
+  'notch.thinking': 'Berpikir…',
+  'notch.speaking': 'Berbicara…',
+  'notch.transcribing': 'Mentranskrip…',
+  'notch.executing': 'Mengeksekusi…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/it.ts b/app/src/lib/i18n/it.ts
index 0fd113f594..89c0ee0e95 100644
--- a/app/src/lib/i18n/it.ts
+++ b/app/src/lib/i18n/it.ts
@@ -1445,6 +1445,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Soglia di silenzio (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Le registrazioni con energia inferiore a questa soglia vengono trattate come silenzio e saltate. Più basso = più sensibile.',
+  'voice.debug.alwaysOn': 'Ascolto sempre attivo',
+  'voice.debug.alwaysOnDesc':
+    'Mantieni il microfono aperto e invia automaticamente ciò che dici all’agente, senza scorciatoia. Si mette in pausa quando lo schermo è bloccato.',
   'voice.providers.saved': 'Fornitori di servizi vocali salvati.',
   'voice.providers.failedToSave': 'Impossibile salvare i provider vocali',
   'voice.providers.ellipsis': '…',
@@ -4856,6 +4859,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Aggiungi come contesto extra',
   'runQueue.status': '{total} in coda',
   'runQueue.cleared': 'Coda svuotata',
+  'notch.ready': 'Pronto',
+  'notch.processing': 'Elaborazione…',
+  'notch.listening': 'Ascolto…',
+  'notch.thinking': 'Penso…',
+  'notch.speaking': 'Parlo…',
+  'notch.transcribing': 'Trascrizione…',
+  'notch.executing': 'Eseguendo…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/ko.ts b/app/src/lib/i18n/ko.ts
index 059104d37e..9cdde1dfb4 100644
--- a/app/src/lib/i18n/ko.ts
+++ b/app/src/lib/i18n/ko.ts
@@ -1402,6 +1402,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': '무음 임계값(RMS)',
   'voice.debug.silenceThresholdDesc':
     '이 값보다 에너지가 낮은 녹음은 무음으로 처리되어 건너뜁니다. 낮을수록 더 민감합니다.',
+  'voice.debug.alwaysOn': '상시 청취',
+  'voice.debug.alwaysOnDesc':
+    '단축키 없이 마이크를 계속 열어 두고 말한 내용을 자동으로 에이전트에 보냅니다. 화면이 잠기면 일시 중지됩니다.',
   'voice.providers.saved': '음성 제공업체가 저장되었습니다.',
   'voice.providers.failedToSave': '음성 제공자를 저장하지 못했습니다.',
   'voice.providers.ellipsis': '…',
@@ -4719,6 +4722,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': '추가 컨텍스트로 추가',
   'runQueue.status': '{total}개 대기 중',
   'runQueue.cleared': '대기열이 비워졌습니다',
+  'notch.ready': '준비됨',
+  'notch.processing': '처리 중…',
+  'notch.listening': '듣는 중…',
+  'notch.thinking': '생각 중…',
+  'notch.speaking': '말하는 중…',
+  'notch.transcribing': '변환 중…',
+  'notch.executing': '실행 중…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/pl.ts b/app/src/lib/i18n/pl.ts
index c613cc7a50..01442b029d 100644
--- a/app/src/lib/i18n/pl.ts
+++ b/app/src/lib/i18n/pl.ts
@@ -1437,6 +1437,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Próg ciszy (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Nagrania z energią poniżej tego progu są traktowane jako cisza i pomijane. Niżej = bardziej czułe.',
+  'voice.debug.alwaysOn': 'Ciągłe nasłuchiwanie',
+  'voice.debug.alwaysOnDesc':
+    'Utrzymuje mikrofon włączony i automatycznie wysyła to, co mówisz, do agenta, bez skrótu. Wstrzymuje się, gdy ekran jest zablokowany.',
   'voice.providers.saved': 'Zapisano dostawców głosu.',
   'voice.providers.failedToSave': 'Nie udało się zapisać dostawców głosu',
   'voice.providers.ellipsis': '…',
@@ -4847,6 +4850,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Dodaj jako dodatkowy kontekst',
   'runQueue.status': '{total} w kolejce',
   'runQueue.cleared': 'Kolejka wyczyszczona',
+  'notch.ready': 'Gotowe',
+  'notch.processing': 'Przetwarzanie…',
+  'notch.listening': 'Słucham…',
+  'notch.thinking': 'Myślę…',
+  'notch.speaking': 'Mówię…',
+  'notch.transcribing': 'Transkrybuję…',
+  'notch.executing': 'Wykonuję…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/pt.ts b/app/src/lib/i18n/pt.ts
index 214e3adabf..9a8b7b7a6c 100644
--- a/app/src/lib/i18n/pt.ts
+++ b/app/src/lib/i18n/pt.ts
@@ -1452,6 +1452,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Limite de Silêncio (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Gravações com energia abaixo deste valor são tratadas como silêncio e ignoradas. Menor = mais sensível.',
+  'voice.debug.alwaysOn': 'Escuta contínua',
+  'voice.debug.alwaysOnDesc':
+    'Mantém o microfone aberto e envia o que você diz ao agente automaticamente, sem atalho. Pausa quando a tela está bloqueada.',
   'voice.providers.saved': 'Provedores de voz salvos.',
   'voice.providers.failedToSave': 'Falha ao salvar provedores de voz',
   'voice.providers.ellipsis': '…',
@@ -4852,6 +4855,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Adicionar como contexto extra',
   'runQueue.status': '{total} na fila',
   'runQueue.cleared': 'Fila limpa',
+  'notch.ready': 'Pronto',
+  'notch.processing': 'Processando…',
+  'notch.listening': 'Ouvindo…',
+  'notch.thinking': 'Pensando…',
+  'notch.speaking': 'Falando…',
+  'notch.transcribing': 'Transcrevendo…',
+  'notch.executing': 'Executando…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/ru.ts b/app/src/lib/i18n/ru.ts
index 9795ee47e5..10f1ace076 100644
--- a/app/src/lib/i18n/ru.ts
+++ b/app/src/lib/i18n/ru.ts
@@ -1430,6 +1430,9 @@ const messages: TranslationMap = {
   'voice.debug.silenceThreshold': 'Порог тишины (RMS)',
   'voice.debug.silenceThresholdDesc':
     'Записи с энергией ниже этого значения считаются тишиной и пропускаются. Меньше = чувствительнее.',
+  'voice.debug.alwaysOn': 'Постоянное прослушивание',
+  'voice.debug.alwaysOnDesc':
+    'Держит микрофон включённым и автоматически отправляет сказанное агенту без горячей клавиши. Приостанавливается при блокировке экрана.',
   'voice.providers.saved': 'Поставщики голосовой связи сохранены.',
   'voice.providers.failedToSave': 'Не удалось сохранить поставщиков голосовой связи.',
   'voice.providers.ellipsis': '…',
@@ -4812,6 +4815,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': 'Добавить как дополнительный контекст',
   'runQueue.status': '{total} в очереди',
   'runQueue.cleared': 'Очередь очищена',
+  'notch.ready': 'Готово',
+  'notch.processing': 'Обработка…',
+  'notch.listening': 'Слушаю…',
+  'notch.thinking': 'Думаю…',
+  'notch.speaking': 'Говорю…',
+  'notch.transcribing': 'Транскрибирую…',
+  'notch.executing': 'Выполняю…',
 };
 
 export default messages;
diff --git a/app/src/lib/i18n/zh-CN.ts b/app/src/lib/i18n/zh-CN.ts
index 272f7cecaa..88438e20fb 100644
--- a/app/src/lib/i18n/zh-CN.ts
+++ b/app/src/lib/i18n/zh-CN.ts
@@ -1339,6 +1339,9 @@ const messages: TranslationMap = {
   'voice.debug.minimumRecordingSeconds': '最短录音秒数',
   'voice.debug.silenceThreshold': '静音阈值 (RMS)',
   'voice.debug.silenceThresholdDesc': '能量低于此值的录音将被视为静音并跳过。值越低，灵敏度越高。',
+  'voice.debug.alwaysOn': '常驻聆听',
+  'voice.debug.alwaysOnDesc':
+    '保持麦克风开启，无需快捷键即可自动将你说的话发送给智能体。屏幕锁定时暂停。',
   'voice.providers.saved': '语音提供商已保存。',
   'voice.providers.failedToSave': '无法保存语音提供商',
   'voice.providers.ellipsis': '…',
@@ -4533,6 +4536,13 @@ const messages: TranslationMap = {
   'runQueue.collectHint': '作为额外上下文添加',
   'runQueue.status': '已排队 {total} 条',
   'runQueue.cleared': '队列已清空',
+  'notch.ready': '就绪',
+  'notch.processing': '处理中…',
+  'notch.listening': '聆听中…',
+  'notch.thinking': '思考中…',
+  'notch.speaking': '说话中…',
+  'notch.transcribing': '转录中…',
+  'notch.executing': '执行中…',
 };
 
 export default messages;
diff --git a/app/src/services/coreRpcClient.ts b/app/src/services/coreRpcClient.ts
index a1eb0e0b3c..bb10608dee 100644
--- a/app/src/services/coreRpcClient.ts
+++ b/app/src/services/coreRpcClient.ts
@@ -391,6 +391,18 @@ export async function getCoreRpcUrl(): Promise<string> {
  *      stored token is set so existing tests remain unaffected.
  */
 export async function getCoreRpcToken(): Promise<string | null> {
+  // Non-Tauri first-party webviews (the notch / overlay NSPanel WKWebViews have
+  // no Tauri IPC) receive the per-process bearer injected as a global by the
+  // Rust host. Honour it first — and not behind the resolution cache, so a late
+  // injection (the host injects on a timer once the core URL is ready) still wins.
+  const injected = (globalThis as { __OPENHUMAN_NOTCH_CORE_TOKEN__?: string })
+    .__OPENHUMAN_NOTCH_CORE_TOKEN__;
+  if (typeof injected === 'string' && injected) {
+    resolvedCoreRpcToken = injected;
+    didResolveCoreRpcToken = true;
+    return injected;
+  }
+
   if (didResolveCoreRpcToken) return resolvedCoreRpcToken;
 
   const storedToken = getStoredCoreToken();
diff --git a/app/src/utils/tauriCommands/voice.ts b/app/src/utils/tauriCommands/voice.ts
index 295c90fe62..d244cba93c 100644
--- a/app/src/utils/tauriCommands/voice.ts
+++ b/app/src/utils/tauriCommands/voice.ts
@@ -58,6 +58,8 @@ export interface VoiceServerSettings {
   silence_threshold: number;
   /** Custom vocabulary words to bias whisper toward (names, technical terms). */
   custom_dictionary: string[];
+  /** Phase 2: continuous always-on listening (no hotkey). Opt-in. */
+  always_on_enabled: boolean;
 }
 
 export async function openhumanVoiceStatus(): Promise<VoiceStatus> {
@@ -106,6 +108,7 @@ export async function openhumanUpdateVoiceServerSettings(update: {
   min_duration_secs?: number;
   silence_threshold?: number;
   custom_dictionary?: string[];
+  always_on_enabled?: boolean;
 }): Promise<CommandResponse<ConfigSnapshot>> {
   return await callCoreRpc<CommandResponse<ConfigSnapshot>>({
     method: 'openhuman.config_update_voice_server_settings',

From 325e0c3df0dd94cec10ab6f6d97fcb9389a0eba8 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 14:23:51 +0530
Subject: [PATCH 6/9] feat(notch): always-visible macOS notch status pill

Transparent NSPanel + WKWebView anchored at the top-centre of the primary
screen showing live Ready/Listening/Processing state; automate streams
step progress to it via the overlay:attention socket bridge. macOS only;
no-op elsewhere.

Slice 6/7 of #3307 (notch status pill).
---
 app/src-tauri/src/lib.rs          |  39 +++
 app/src-tauri/src/notch_window.rs | 383 ++++++++++++++++++++++++++++++
 app/src/index.css                 |  41 +++-
 app/src/main.tsx                  |  35 ++-
 app/src/notch/NotchApp.tsx        | 324 +++++++++++++++++++++++++
 5 files changed, 810 insertions(+), 12 deletions(-)
 create mode 100644 app/src-tauri/src/notch_window.rs
 create mode 100644 app/src/notch/NotchApp.tsx

diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs
index 7df029b44b..06bcf6305b 100644
--- a/app/src-tauri/src/lib.rs
+++ b/app/src-tauri/src/lib.rs
@@ -46,6 +46,8 @@ mod meet_call;
 mod meet_scanner;
 mod meet_video;
 mod native_notifications;
+#[cfg(target_os = "macos")]
+mod notch_window;
 mod notification_settings;
 mod process_kill;
 mod process_recovery;
@@ -927,6 +929,33 @@ fn mascot_native_window_is_open() -> bool {
     false
 }
 
+/// Tauri command: show the notch activity indicator — a transparent NSPanel +
+/// WKWebView anchored to the top-centre of the primary screen that displays live
+/// voice / agent status. On macOS the result of `notch_window::show` is returned
+/// as-is (an `Err(String)` on failure); elsewhere the command is an `Ok` no-op.
+#[tauri::command]
+fn notch_window_show(app: AppHandle<AppRuntime>) -> Result<(), String> {
+    log::info!("[notch-window] show requested");
+    #[cfg(target_os = "macos")]
+    {
+        return notch_window::show(&app);
+    }
+    #[cfg(not(target_os = "macos"))]
+    {
+        let _ = app; // the handle is only needed by the macOS branch
+        Ok(()) // No-op on non-macOS
+    }
+}
+
+/// Tauri command: hide the notch activity indicator. Always `Ok`; no-op off macOS.
+#[tauri::command]
+fn notch_window_hide(_app: AppHandle<AppRuntime>) -> Result<(), String> {
+    log::info!("[notch-window] hide requested");
+    #[cfg(target_os = "macos")]
+    notch_window::hide();
+    Ok(())
+}
+
 /// Hide or show the OS top-level main-window frame on Windows by enumerating
 /// this process's top-level windows and matching the visible
 /// `Chrome_WidgetWin_1` host. `WebviewWindow::hwnd()` from the vendored CEF
@@ -2800,6 +2829,14 @@ pub fn run() {
             //       let _ = window.show();
             //   }
 
+            // Notch activity indicator: transparent pill at the top-centre of
+            // the primary screen. Shows live voice / agent state. macOS only
+            // (physical notch or menu-bar HUD on older hardware).
+            #[cfg(target_os = "macos")]
+            if let Err(err) = notch_window::show(&app.handle()) {
+                log::warn!("[notch-window] auto-show on startup failed: {err}");
+            }
+
             // Synthetic-input main-thread executor. enigo's macOS keyboard-layout
             // lookup (TSMGetInputSourceProperty) MUST run on the app main thread
             // or it traps (`_dispatch_assert_queue_fail`/EXC_BREAKPOINT) and
@@ -3186,6 +3223,8 @@ pub fn run() {
             native_notifications::show_native_notification,
             mascot_window_show,
             mascot_window_hide,
+            notch_window_show,
+            notch_window_hide,
             file_logging::reveal_logs_folder,
             file_logging::logs_folder_path,
             workspace_paths::open_workspace_path,
diff --git a/app/src-tauri/src/notch_window.rs b/app/src-tauri/src/notch_window.rs
new file mode 100644
index 0000000000..25b2d9516e
--- /dev/null
+++ b/app/src-tauri/src/notch_window.rs
@@ -0,0 +1,383 @@
+//! Native macOS NSPanel + WKWebView host for the notch activity indicator.
+//!
+//! A transparent, click-through floating panel anchored to the top-centre of
+//! the primary screen. On MacBook Pros with a physical notch the pill visually
+//! emerges from the notch; on older Macs it acts as a top-centre floating HUD.
+//!
+//! Architecture mirrors `mascot_native_window` — a native NSPanel avoids the
+//! CEF transparency limitation (vendored tauri-cef cannot render transparent
+//! windowed-mode browsers; only off-screen rendering supports transparency,
+//! which the runtime does not enable). The WKWebView loads the same Vite entry
+//! point at `?window=notch` so the React tree can branch in `main.tsx`.
+//!
+//! IPC strategy: no Tauri IPC bridge. A 1 Hz timer polls `OPENHUMAN_CORE_RPC_URL`
+//! (set by `CoreProcessHandle` once the embedded server is ready) and injects
+//! the base URL plus the per-process bearer token via `evaluateJavaScript` so
+//! the React app can open an authenticated Socket.IO connection for live events.
+
+use std::cell::{Cell, RefCell};
+use std::path::PathBuf;
+use std::ptr::NonNull;
+use std::rc::Rc;
+
+use block2::RcBlock;
+use objc2::rc::Retained;
+use objc2::{msg_send, MainThreadMarker, MainThreadOnly};
+use objc2_app_kit::{
+    NSBackingStoreType, NSColor, NSPanel, NSScreen, NSWindowCollectionBehavior, NSWindowStyleMask,
+};
+use objc2_foundation::{NSNumber, NSPoint, NSRect, NSSize, NSString, NSTimer, NSURLRequest, NSURL};
+use objc2_web_kit::{WKWebView, WKWebViewConfiguration};
+use tauri::{AppHandle, Manager};
+
+use crate::AppRuntime;
+
+/// Logical width of the notch panel. Wide enough to display voice/action text.
+const PANEL_WIDTH: f64 = 380.0;
+/// Logical height — covers the menu-bar / notch depth with headroom for the pill.
+const PANEL_HEIGHT: f64 = 54.0;
+/// URL-inject timer interval in seconds.
+const INJECT_POLL_SECONDS: f64 = 1.0;
+/// 1-based tick of the first inject attempt; earlier ticks are skipped (page-load delay).
+const PAGE_LOAD_TICKS: u32 = 2;
+
+struct NotchPanel {
+    panel: Retained<NSPanel>,
+    #[allow(dead_code)] // never read in this module; held so this entry retains the webview
+    webview: Retained<WKWebView>,
+    inject_timer: Retained<NSTimer>,
+}
+
+impl NotchPanel {
+    fn order_out(&self) {
+        self.inject_timer.invalidate(); // stop the repeating core-URL inject timer
+        self.panel.orderOut(None); // take the panel off screen
+    }
+}
+
+thread_local! {
+    /// The live panel, if any. NSPanel/WKWebView are not Send/Sync, so a thread-local
+    /// is the simplest safe storage; `show` only ever fills it on the main thread.
+    static NOTCH: RefCell<Option<NotchPanel>> = const { RefCell::new(None) };
+}
+
+pub(crate) fn is_open() -> bool {
+    NOTCH.with_borrow(Option::is_some)
+}
+
+pub(crate) fn hide() {
+    // Move the panel out of the slot first so no `RefCell` borrow is held below.
+    let Some(panel) = NOTCH.take() else {
+        return; // nothing is showing on this thread
+    };
+    log::info!("[notch-window] dropping panel");
+    panel.order_out();
+}
+
+/// Create and show the notch panel; returns `Ok(())` at once when one is already open.
+/// Fails when called off the main thread or when no page source can be resolved.
+pub(crate) fn show(app: &AppHandle<AppRuntime>) -> Result<(), String> {
+    if is_open() {
+        log::debug!("[notch-window] already open");
+        return Ok(());
+    }
+
+    let mtm = MainThreadMarker::new()
+        .ok_or_else(|| "notch_window::show called off the main thread".to_string())?;
+
+    let source = resolve_page_source(app)?;
+    log::info!("[notch-window] loading source={source:?}");
+
+    let frame = top_center_frame(mtm);
+    log::debug!(
+        "[notch-window] frame origin=({:.0},{:.0}) size=({:.0},{:.0})",
+        frame.origin.x,
+        frame.origin.y,
+        frame.size.width,
+        frame.size.height
+    );
+
+    // SAFETY: `mtm` proves we are on the main thread; the helpers below only
+    // create and configure AppKit/WebKit objects and schedule a timer from here.
+    let panel = unsafe { build_panel(mtm, frame) };
+    let webview = unsafe { build_webview(mtm, &panel, &source) };
+    panel.orderFrontRegardless();
+    let inject_timer = unsafe { spawn_inject_timer(webview.clone()) };
+
+    NOTCH.set(Some(NotchPanel {
+        panel,
+        webview,
+        inject_timer,
+    }));
+    log::info!("[notch-window] panel shown at top-center");
+    Ok(())
+}
+
+// ── Page source ───────────────────────────────────────────────────────────────
+
+#[derive(Debug)]
+enum PageSource {
+    Dev { url: String }, // dev-server URL, already carrying `window=notch`
+    Bundled { index_html: PathBuf, root: PathBuf }, // packaged page + its read-access dir
+}
+
+/// Pick the page for the notch WKWebView: a configured `dev_url` (with `window=notch`
+/// appended to its query) wins, else the first existing `index.html` / `dist/index.html`
+/// under the resource dir. Errors if that dir is unresolvable or neither file exists.
+fn resolve_page_source(app: &AppHandle<AppRuntime>) -> Result<PageSource, String> {
+    if let Some(mut url) = app.config().build.dev_url.clone() {
+        let query = match url.query() {
+            Some(existing) => format!("{existing}&window=notch"),
+            None => "window=notch".into(),
+        };
+        url.set_query(Some(&query));
+        return Ok(PageSource::Dev {
+            url: url.to_string(),
+        });
+    }
+
+    let resource_dir = app
+        .path()
+        .resource_dir()
+        .map_err(|e| format!("resolve resource_dir: {e}"))?;
+    for index_html in [
+        resource_dir.join("index.html"),
+        resource_dir.join("dist").join("index.html"),
+    ] {
+        if !index_html.is_file() {
+            continue;
+        }
+        let root = index_html
+            .parent()
+            .map_or_else(|| resource_dir.clone(), PathBuf::from);
+        return Ok(PageSource::Bundled { index_html, root });
+    }
+    Err(format!(
+        "notch bundled index.html not found under resource_dir={}",
+        resource_dir.display()
+    ))
+}
+
+// ── Frame geometry ────────────────────────────────────────────────────────────
+
+/// Frame of the first entry in `NSScreen::screens`, or a 1440×900 placeholder if empty.
+fn primary_screen_frame(mtm: MainThreadMarker) -> NSRect {
+    let Some(primary) = NSScreen::screens(mtm).firstObject() else {
+        log::warn!("[notch-window] NSScreen::screens returned empty — falling back to 1440×900");
+        return NSRect::new(NSPoint::new(0.0, 0.0), NSSize::new(1440.0, 900.0));
+    };
+    primary.frame()
+}
+
+/// Frame that pins the panel to the top edge of the primary screen, centred.
+///
+/// AppKit's origin is bottom-left, so the panel's bottom edge sits at
+/// `screen top − PANEL_HEIGHT`, and its left edge at half the leftover width.
+fn top_center_frame(mtm: MainThreadMarker) -> NSRect {
+    let NSRect { origin, size } = primary_screen_frame(mtm);
+    let left = origin.x + (size.width - PANEL_WIDTH) / 2.0;
+    let bottom = origin.y + size.height - PANEL_HEIGHT;
+    let panel_size = NSSize::new(PANEL_WIDTH, PANEL_HEIGHT);
+    NSRect::new(NSPoint::new(left, bottom), panel_size)
+}
+
+// ── NSPanel construction ──────────────────────────────────────────────────────
+
+/// Build the borderless, non-activating panel that hosts the notch webview.
+///
+/// The panel is transparent and shadowless, floats above the menu bar on every
+/// Space, and ignores mouse events so clicks reach whatever is underneath.
+unsafe fn build_panel(mtm: MainThreadMarker, frame: NSRect) -> Retained<NSPanel> {
+    let mask = NSWindowStyleMask::Borderless | NSWindowStyleMask::NonactivatingPanel;
+    let behavior = NSWindowCollectionBehavior::CanJoinAllSpaces
+        | NSWindowCollectionBehavior::Transient
+        | NSWindowCollectionBehavior::FullScreenAuxiliary
+        | NSWindowCollectionBehavior::IgnoresCycle;
+
+    unsafe {
+        let panel: Retained<NSPanel> = msg_send![
+            NSPanel::alloc(mtm),
+            initWithContentRect: frame,
+            styleMask: mask,
+            backing: NSBackingStoreType::Buffered,
+            defer: false,
+        ];
+
+        // See-through window: no opaque backing, clear background, no shadow.
+        panel.setOpaque(false);
+        panel.setBackgroundColor(Some(&NSColor::clearColor()));
+        panel.setHasShadow(false);
+
+        // Float above the menu bar. NSStatusWindowLevel = 25, which sits above
+        // NSMainMenuWindowLevel = 24. Same recipe used by the mascot panel and
+        // the `configure_overlay_window_macos` helper.
+        panel.setLevel(25);
+        panel.setCollectionBehavior(behavior);
+        panel.setFloatingPanel(true);
+        panel.setHidesOnDeactivate(false);
+        panel.setBecomesKeyOnlyIfNeeded(true);
+        panel.setWorksWhenModal(true);
+
+        // Fully click-through: the panel never steals mouse events. Menu-bar
+        // items remain clickable through the transparent regions.
+        panel.setIgnoresMouseEvents(true);
+
+        let _: () = msg_send![&*panel, setExcludedFromWindowsMenu: true];
+        panel
+    }
+}
+
+// ── WKWebView construction ────────────────────────────────────────────────────
+/// Create the WKWebView, make it `panel`'s content view and start loading `source`.
+unsafe fn build_webview(
+    mtm: MainThreadMarker,
+    panel: &NSPanel,
+    source: &PageSource,
+) -> Retained<WKWebView> {
+    let config: Retained<WKWebViewConfiguration> = unsafe {
+        let alloc = WKWebViewConfiguration::alloc(mtm);
+        msg_send![alloc, init]
+    };
+
+    let frame = NSRect::new(
+        NSPoint::new(0.0, 0.0),
+        NSSize::new(PANEL_WIDTH, PANEL_HEIGHT),
+    );
+    let webview: Retained<WKWebView> =
+        unsafe { WKWebView::initWithFrame_configuration(WKWebView::alloc(mtm), frame, &config) };
+
+    unsafe {
+        // Disable WKWebView's own background so CSS `background: transparent` works.
+        // There is no public API for this on macOS — KVC against the private
+        // `drawsBackground` property is the canonical approach (used by wry, Electron).
+        let no = NSNumber::numberWithBool(false);
+        let key = NSString::from_str("drawsBackground");
+        let _: () = msg_send![&*webview, setValue: &*no, forKey: &*key];
+
+        // Auto-resize to fill the panel content view: 18 = width-sizable (2) | height-sizable (16).
+        let _: () = msg_send![&*webview, setAutoresizingMask: 18u64]; // width|height
+
+        let webview_ref: &objc2::runtime::AnyObject = &*webview;
+        let webview_view = webview_ref as *const _ as *mut objc2::runtime::AnyObject;
+        let _: () = msg_send![panel, setContentView: webview_view]; // panel now hosts the webview
+
+        match source {
+            PageSource::Dev { url } => {
+                let ns_url_str = NSString::from_str(url);
+                let ns_url = NSURL::URLWithString(&ns_url_str);
+                if let Some(ns_url) = ns_url {
+                    let request = NSURLRequest::requestWithURL(&ns_url);
+                    let _ = webview.loadRequest(&request);
+                } else {
+                    log::warn!("[notch-window] could not parse dev url={url}");
+                }
+            }
+            PageSource::Bundled { index_html, root } => {
+                let Ok(mut file_url) = url::Url::from_file_path(index_html) else {
+                    log::warn!(
+                        "[notch-window] index_html not absolute: {}",
+                        index_html.display()
+                    );
+                    return webview; // give up on loading; the view is returned without a page
+                };
+                file_url.set_query(Some("window=notch")); // same selector the dev URL carries
+                let Ok(read_access_url) = url::Url::from_file_path(root) else {
+                    log::warn!(
+                        "[notch-window] resource root not absolute: {}",
+                        root.display()
+                    );
+                    return webview; // give up on loading; the view is returned without a page
+                };
+                let ns_url_str = NSString::from_str(file_url.as_str());
+                let read_access_str = NSString::from_str(read_access_url.as_str());
+                match (
+                    NSURL::URLWithString(&ns_url_str),
+                    NSURL::URLWithString(&read_access_str),
+                ) {
+                    (Some(ns_url), Some(read_access_ns)) => {
+                        let _ =
+                            webview.loadFileURL_allowingReadAccessToURL(&ns_url, &read_access_ns);
+                        log::info!(
+                            "[notch-window] loaded bundled index={} root={}",
+                            index_html.display(),
+                            root.display()
+                        );
+                    }
+                    _ => log::warn!(
+                        "[notch-window] could not parse bundled file URLs index={} root={}",
+                        file_url,
+                        read_access_url
+                    ),
+                }
+            }
+        }
+    }
+
+    webview
+}
+
+// ── Core-URL injection timer ──────────────────────────────────────────────────
+
+/// Spawn a 1 Hz repeating timer that waits for the embedded core to become ready
+/// (`OPENHUMAN_CORE_RPC_URL` set in the process env and a bearer token published),
+/// then injects the base URL and token into the WKWebView.
+/// After the first inject the timer keeps firing as a no-op until it is
+/// invalidated by `NotchPanel::order_out()` when the panel is hidden.
+unsafe fn spawn_inject_timer(webview: Retained<WKWebView>) -> Retained<NSTimer> {
+    // Escape `\` and `'` so `raw` can sit inside a single-quoted JS string literal.
+    fn js_quoted(raw: &str) -> String {
+        raw.replace('\\', "\\\\").replace('\'', "\\'")
+    }
+
+    let tick_count: Rc<Cell<u32>> = Rc::new(Cell::new(0));
+    let injected: Rc<Cell<bool>> = Rc::new(Cell::new(false));
+
+    let block = RcBlock::new(move |_timer: NonNull<NSTimer>| {
+        tick_count.set(tick_count.get() + 1);
+        if injected.get() || tick_count.get() < PAGE_LOAD_TICKS {
+            return;
+        }
+
+        let Ok(rpc_url) = std::env::var("OPENHUMAN_CORE_RPC_URL") else {
+            return; // Core not ready yet — try again next tick.
+        };
+        // Socket.IO connects to the base host: drop one trailing `/rpc`, nothing more.
+        let base_url = rpc_url.strip_suffix("/rpc").unwrap_or(&rpc_url);
+
+        // The core Socket.IO handshake rejects unauthenticated clients, and this
+        // WKWebView has no Tauri IPC, so hand the per-process bearer in via a global
+        // too. It is published *after* the URL env is set, so wait for it.
+        let token = match crate::core_process::current_rpc_token() {
+            Some(t) if !t.is_empty() => t,
+            _ => return, // bearer not published yet — retry next tick
+        };
+        log::info!(
+            "[notch-window] injecting core url + bearer (token_len={})",
+            token.len()
+        );
+
+        // Set globals AND dispatch an event so React gets the URL whether it mounted
+        // before or after this fires. Values are escaped: they are spliced into JS.
+        let (js_token, js_url) = (js_quoted(&token), js_quoted(base_url));
+        let js = format!(
+            "window.__OPENHUMAN_NOTCH_CORE_TOKEN__='{js_token}';\
+             window.__OPENHUMAN_NOTCH_CORE_URL__='{js_url}';\
+             window.dispatchEvent(new CustomEvent('notch:core-url',{{detail:{{url:'{js_url}'}}}}));"
+        );
+        let js_str = NSString::from_str(&js);
+        unsafe {
+            let _: () = msg_send![
+                &*webview,
+                evaluateJavaScript: &*js_str,
+                completionHandler: std::ptr::null::<objc2::runtime::AnyObject>()
+            ];
+        }
+
+        injected.set(true);
+        log::debug!("[notch-window] injected core URL base={base_url}");
+    });
+
+    unsafe {
+        NSTimer::scheduledTimerWithTimeInterval_repeats_block(INJECT_POLL_SECONDS, true, &block)
+    }
+}
diff --git a/app/src/index.css b/app/src/index.css
index 888dbd678f..c7f2a89e43 100644
--- a/app/src/index.css
+++ b/app/src/index.css
@@ -45,12 +45,51 @@
   html[data-window='overlay'] #root,
   html[data-window='mascot'],
   html[data-window='mascot'] body,
-  html[data-window='mascot'] #root {
+  html[data-window='mascot'] #root,
+  html[data-window='notch'],
+  html[data-window='notch'] body,
+  html[data-window='notch'] #root {
     background: transparent;
     overflow: hidden;
     user-select: none;
   }
 
+  @keyframes notch-pill-in {
+    from {
+      opacity: 0;
+      transform: scaleX(0.4) scaleY(0.7);
+    }
+    to {
+      opacity: 1;
+      transform: scaleX(1) scaleY(1);
+    }
+  }
+
+  @keyframes notch-bar {
+    0%,
+    100% {
+      transform: scaleY(0.5);
+      opacity: 0.6;
+    }
+    50% {
+      transform: scaleY(1.2);
+      opacity: 1;
+    }
+  }
+
+  @keyframes notch-dot {
+    0%,
+    80%,
+    100% {
+      transform: scale(0.6);
+      opacity: 0.4;
+    }
+    40% {
+      transform: scale(1);
+      opacity: 1;
+    }
+  }
+
   @keyframes overlay-bubble-in {
     from {
       opacity: 0;
diff --git a/app/src/main.tsx b/app/src/main.tsx
index ac2b21be1b..45bd27736a 100644
--- a/app/src/main.tsx
+++ b/app/src/main.tsx
@@ -8,6 +8,7 @@ import App from './App';
 import './index.css';
 import { getCoreStateSnapshot } from './lib/coreState/store';
 import MascotWindowApp from './mascot/MascotWindowApp';
+import NotchApp from './notch/NotchApp';
 import OverlayApp from './overlay/OverlayApp';
 import './polyfills';
 import { initGA, initSentry, startUiInteractionTracking, trackEvent } from './services/analytics';
@@ -37,13 +38,16 @@ const urlWindowParam = (() => {
   }
 })();
 const isMascotWindow = urlWindowParam === 'mascot';
+const isNotchWindow = urlWindowParam === 'notch';
 const currentWindowLabel = isMascotWindow
   ? 'mascot'
-  : tauriRuntimeAvailable()
-    ? getCurrentWindow().label
-    : 'main';
+  : isNotchWindow
+    ? 'notch'
+    : tauriRuntimeAvailable()
+      ? getCurrentWindow().label
+      : 'main';
 const isOverlayWindow = currentWindowLabel === 'overlay';
-const isStandaloneWindow = isOverlayWindow || isMascotWindow;
+const isStandaloneWindow = isOverlayWindow || isMascotWindow || isNotchWindow;
 
 const ensureDefaultHashRoute = () => {
   const hash = window.location.hash;
@@ -83,17 +87,26 @@ if (!isStandaloneWindow) {
 // namespace from the first storage call. (#900)
 function bootRender() {
   const root = ReactDOM.createRoot(document.getElementById('root') as HTMLElement);
-  const tree = isMascotWindow ? <MascotWindowApp /> : isOverlayWindow ? <OverlayApp /> : <App />;
+  const tree = isMascotWindow ? (
+    <MascotWindowApp />
+  ) : isNotchWindow ? (
+    <NotchApp />
+  ) : isOverlayWindow ? (
+    <OverlayApp />
+  ) : (
+    <App />
+  );
   root.render(<React.StrictMode>{tree}</React.StrictMode>);
 }
 
-// The mascot lives in a native WKWebView (no Tauri IPC), so
+// The mascot and notch windows live in native WKWebViews (no Tauri IPC), so
 // `getActiveUserIdFromCore()` would just reject after a roundtrip and
-// delay first paint for nothing. Skip the bootstrap entirely in that
-// path — the mascot UI doesn't read user-scoped storage anyway.
-const activeUserBootstrap = isMascotWindow
-  ? Promise.resolve<string | null>(null)
-  : getActiveUserIdFromCore();
+// delay first paint for nothing. Skip the bootstrap entirely in those
+// paths — neither UI reads user-scoped storage.
+const activeUserBootstrap =
+  isMascotWindow || isNotchWindow
+    ? Promise.resolve<string | null>(null)
+    : getActiveUserIdFromCore();
 
 activeUserBootstrap
   .then(id => primeActiveUserId(id))
diff --git a/app/src/notch/NotchApp.tsx b/app/src/notch/NotchApp.tsx
new file mode 100644
index 0000000000..d8f5352dae
--- /dev/null
+++ b/app/src/notch/NotchApp.tsx
@@ -0,0 +1,324 @@
+/**
+ * NotchApp
+ *
+ * Standalone React root rendered inside the native macOS NSPanel that floats
+ * at the top-centre of the primary screen (see `app/src-tauri/src/notch_window.rs`).
+ *
+ * The panel has no Tauri IPC bridge (WKWebView outside the CEF runtime). The
+ * Rust host injects the core base URL via `evaluateJavaScript` once
+ * `OPENHUMAN_CORE_RPC_URL` is set by `CoreProcessHandle`, publishing it as:
+ *   `window.__OPENHUMAN_NOTCH_CORE_URL__`  (global; read once on mount)
+ *   `notch:core-url` CustomEvent            (`detail.url`; URL injected after mount)
+ *
+ * This component connects to the core over Socket.IO — identical to
+ * `OverlayApp` — and renders an always-visible pill: an idle "Ready" baseline
+ * that switches mode when voice is active, the agent acts, or core broadcasts.
+ *
+ * Events handled:
+ *   dictation:toggle          voice recording started / stopped
+ *   dictation:transcription   final transcript text
+ *   companion:state_changed   agent lifecycle (thinking, speaking, …)
+ *   overlay:attention         core broadcast message
+ */
+import { useCallback, useEffect, useRef, useState } from 'react';
+import type { Socket } from 'socket.io-client';
+
+import { useT } from '../lib/i18n/I18nContext';
+import { connectCoreSocket } from '../services/coreSocket';
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+// 'ready' is the always-visible idle baseline (shows "Ready"); the pill never
+// fully disappears so the user always knows the listener's status.
+type NotchMode = 'ready' | 'listening' | 'transcribing' | 'thinking' | 'speaking' | 'attention';
+
+interface NotchState {
+  mode: NotchMode;
+  text: string;
+}
+
+interface DictationTogglePayload {
+  type?: string;
+}
+interface DictationTranscriptionPayload {
+  text?: string;
+}
+interface CompanionStatePayload {
+  state?: string;
+  message?: string;
+}
+interface AttentionPayload {
+  message?: string;
+  ttl_ms?: number;
+}
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+const LINGER_MS = 1800;
+const DEFAULT_TTL_MS = 6000;
+
+// ── Waveform bars (voice activity animation) ──────────────────────────────────
+
+function WaveformBars() {
+  return (
+    <div className="flex items-center gap-[3px]" aria-hidden="true">
+      {[0, 1, 2, 3, 4].map(i => (
+        <span
+          key={i}
+          className="w-[3px] rounded-full bg-white/90"
+          style={{
+            height: `${10 + (i % 3) * 4}px`,
+            animation: `notch-bar 0.9s ease-in-out infinite`,
+            animationDelay: `${i * 0.12}s`,
+          }}
+        />
+      ))}
+    </div>
+  );
+}
+
+// ── Spinner dots ──────────────────────────────────────────────────────────────
+
+function SpinnerDots() {
+  return (
+    <div className="flex items-center gap-[4px]" aria-hidden="true">
+      {[0, 1, 2].map(i => (
+        <span
+          key={i}
+          className="h-[5px] w-[5px] rounded-full bg-white/80"
+          style={{
+            animation: `notch-dot 1.2s ease-in-out infinite`,
+            animationDelay: `${i * 0.2}s`,
+          }}
+        />
+      ))}
+    </div>
+  );
+}
+
+// ── Icon glyph ────────────────────────────────────────────────────────────────
+
+function ModeIcon({ mode }: { mode: NotchMode }) {
+  // Steady green dot when idle/ready — calm "I'm listening for the wake word".
+  if (mode === 'ready') return <span className="h-2 w-2 rounded-full bg-emerald-400/90" />;
+  if (mode === 'listening') return <WaveformBars />;
+  if (mode === 'transcribing' || mode === 'thinking') return <SpinnerDots />;
+  if (mode === 'speaking') {
+    return (
+      <svg width="16" height="16" viewBox="0 0 16 16" fill="none" aria-hidden="true">
+        <path
+          d="M8 1.5a3 3 0 0 1 3 3v4a3 3 0 0 1-6 0v-4a3 3 0 0 1 3-3z"
+          fill="rgba(255,255,255,0.9)"
+        />
+        <path
+          d="M3.5 7.5A4.5 4.5 0 0 0 8 12a4.5 4.5 0 0 0 4.5-4.5"
+          stroke="rgba(255,255,255,0.9)"
+          strokeWidth="1.2"
+          strokeLinecap="round"
+        />
+        <line
+          x1="8"
+          y1="12"
+          x2="8"
+          y2="14.5"
+          stroke="rgba(255,255,255,0.9)"
+          strokeWidth="1.2"
+          strokeLinecap="round"
+        />
+      </svg>
+    );
+  }
+  // attention / fallback
+  return <span className="h-2 w-2 rounded-full bg-blue-400" />;
+}
+
+// ── Main component ────────────────────────────────────────────────────────────
+
+export default function NotchApp() {
+  const { t } = useT();
+  const [state, setState] = useState<NotchState>({ mode: 'ready', text: '' });
+  const dismissRef = useRef<number | null>(null);
+  const socketRef = useRef<Socket | null>(null);
+
+  const clearDismiss = useCallback(() => {
+    if (dismissRef.current !== null) {
+      window.clearTimeout(dismissRef.current);
+      dismissRef.current = null;
+    }
+  }, []);
+
+  const scheduleDismiss = useCallback(
+    (ms: number) => {
+      clearDismiss();
+      dismissRef.current = window.setTimeout(() => {
+        // Fall back to the always-visible "Ready" baseline, never invisible.
+        setState({ mode: 'ready', text: '' });
+        dismissRef.current = null;
+      }, ms);
+    },
+    [clearDismiss]
+  );
+
+  // ── Socket.IO connection ────────────────────────────────────────────────────
+
+  const connectSocket = useCallback(
+    (baseUrl: string) => {
+      if (socketRef.current?.connected) return;
+      if (socketRef.current) {
+        socketRef.current.disconnect();
+      }
+
+      let disposed = false;
+      void (async () => {
+        try {
+          const socket = await connectCoreSocket({
+            getBaseUrl: async () => baseUrl,
+            isDisposed: () => disposed,
+          });
+          if (!socket || disposed) return;
+          socketRef.current = socket;
+
+          socket.on('dictation:toggle', (payload: DictationTogglePayload) => {
+            const type = payload?.type ?? 'pressed';
+            console.debug(`[notch] dictation:toggle type=${type}`);
+            if (type === 'pressed') {
+              clearDismiss();
+              setState({ mode: 'listening', text: t('notch.listening', 'Listening…') });
+            } else if (type === 'released') {
+              scheduleDismiss(LINGER_MS);
+            }
+          });
+
+          socket.on('dictation:transcription', (payload: DictationTranscriptionPayload) => {
+            const text = payload?.text?.trim();
+            if (!text) return;
+            console.debug(`[notch] dictation:transcription chars=${text.length}`);
+            clearDismiss();
+            setState({
+              mode: 'transcribing',
+              text: text.length > 60 ? `${text.slice(0, 57)}…` : text,
+            });
+            scheduleDismiss(LINGER_MS);
+          });
+
+          socket.on('companion:state_changed', (payload: CompanionStatePayload) => {
+            const agentState = payload?.state ?? 'idle';
+            console.debug(`[notch] companion:state_changed state=${agentState}`);
+
+            if (agentState === 'idle') {
+              scheduleDismiss(0);
+              return;
+            }
+            clearDismiss();
+
+            const modeMap: Partial<Record<string, NotchMode>> = {
+              listening: 'listening',
+              thinking: 'thinking',
+              speaking: 'speaking',
+            };
+            const textMap: Partial<Record<string, string>> = {
+              listening: t('notch.listening', 'Listening…'),
+              thinking: t('notch.processing', 'Processing…'),
+              speaking: t('notch.speaking', 'Speaking…'),
+            };
+
+            setState({
+              mode: modeMap[agentState] ?? 'thinking',
+              text: textMap[agentState] ?? agentState,
+            });
+          });
+
+          socket.on('overlay:attention', (payload: AttentionPayload) => {
+            const message = payload?.message?.trim();
+            if (!message) return;
+            console.debug(`[notch] overlay:attention chars=${message.length}`);
+            clearDismiss();
+            // The voice listener uses two reserved status words to drive the
+            // pill: "Listening" (capturing speech) and "Processing" (running a
+            // command). Map them to the matching icon; everything else is a
+            // generic attention message.
+            const lower = message.toLowerCase();
+            const mode: NotchMode =
+              lower === 'listening'
+                ? 'listening'
+                : lower === 'processing'
+                  ? 'thinking'
+                  : 'attention';
+            setState({ mode, text: message.length > 60 ? `${message.slice(0, 57)}…` : message });
+            scheduleDismiss(payload?.ttl_ms ?? DEFAULT_TTL_MS);
+          });
+
+          socket.connect();
+          console.debug('[notch] socket connected', socket.id);
+        } catch (err) {
+          console.warn('[notch] failed to connect socket', err);
+        }
+      })();
+
+      return () => {
+        disposed = true;
+      };
+    },
+    [t, clearDismiss, scheduleDismiss]
+  );
+
+  // ── Core URL bootstrap ──────────────────────────────────────────────────────
+
+  useEffect(() => {
+    // Check if Rust already injected the URL before this component mounted.
+    const preloaded = (window as { __OPENHUMAN_NOTCH_CORE_URL__?: string })
+      .__OPENHUMAN_NOTCH_CORE_URL__;
+    if (preloaded) {
+      connectSocket(preloaded);
+    }
+
+    // Also listen for the event (fires when core becomes ready after mount).
+    const handler = (e: CustomEvent<{ url: string }>) => {
+      if (e.detail?.url) {
+        connectSocket(e.detail.url);
+      }
+    };
+    window.addEventListener('notch:core-url', handler as EventListener);
+
+    return () => {
+      window.removeEventListener('notch:core-url', handler as EventListener);
+      socketRef.current?.disconnect();
+      socketRef.current = null;
+      clearDismiss();
+    };
+  }, [connectSocket, clearDismiss]);
+
+  // ── Render ──────────────────────────────────────────────────────────────────
+
+  const { mode, text } = state;
+
+  // The pill is ALWAYS visible so the user can always see the listener status:
+  // Ready (idle) · Listening (capturing speech) · Processing (running a command).
+  const label = text || (mode === 'ready' ? t('notch.ready', 'Ready') : '');
+
+  const pillBg =
+    mode === 'speaking'
+      ? 'bg-[rgba(10,40,10,0.92)]'
+      : mode === 'ready'
+        ? 'bg-[rgba(10,10,10,0.72)]' // dimmer when idle
+        : 'bg-[rgba(10,10,10,0.92)]';
+
+  return (
+    <div className="flex h-screen w-screen items-start justify-center bg-transparent pt-[10px]">
+      <div
+        className={`flex select-none items-center gap-2 rounded-full px-4 py-[7px] shadow-lg ${pillBg}`}
+        style={{
+          animation: 'notch-pill-in 220ms cubic-bezier(0.34, 1.56, 0.64, 1)',
+          backdropFilter: 'blur(12px)',
+          WebkitBackdropFilter: 'blur(12px)',
+        }}>
+        <ModeIcon mode={mode} />
+        {label && (
+          <span className="max-w-[260px] truncate text-[13px] font-medium leading-none tracking-[-0.01em] text-white/95">
+            {label}
+          </span>
+        )}
+      </div>
+    </div>
+  );
+}

From 90a7eacb3a35977c9443783db474d7044498e587 Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 21:25:42 +0530
Subject: [PATCH 7/9] chore(notch): drop stale orchestrator desktop-control
 prompt from notch slice
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drops the same stale 'runs without an approval prompt' section as in #3344 — it
is not on main and contradicts the #3342 ApprovalGate fix. A corrected version
is tracked as a follow-up.
---
 .../agents/orchestrator/prompt.md             | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/openhuman/agent_registry/agents/orchestrator/prompt.md b/src/openhuman/agent_registry/agents/orchestrator/prompt.md
index 3a5997d438..b99504800c 100644
--- a/src/openhuman/agent_registry/agents/orchestrator/prompt.md
+++ b/src/openhuman/agent_registry/agents/orchestrator/prompt.md
@@ -42,25 +42,6 @@ Follow this sequence for every user message:
 
 Default bias: **do not spawn a sub-agent when a direct response or direct tool call is sufficient** — but live external-service, scheduling, desktop-control, presentation, product-docs, code-repo, market, and crypto requests belong to their specialists.
 
-## Controlling desktop apps (full autonomy)
-
-You can open and operate native apps on this machine. **Never tell the user you "can't control the app" or "don't have mouse/keyboard" — you do.**
-
-**Rule 0 — foreground first, every time.** Before *any* keyboard/mouse action, call `launch_app "<App>"` for the target. `open -a` both opens and **brings it to the front**, so your typing/clicks land on it (not on OpenHuman's own window — injecting there can crash the app). Re-call `launch_app` right before each keyboard/mouse step if focus might have moved.
-
-**The reliable path is the keyboard, not the mouse.** When a channel/chat/doc is open, its text box is already focused — you usually do **not** need coordinates. Prefer this:
-
-1. `launch_app "<App>"` (foreground).
-2. `automate {app, goal}` for multi-step UI (it foregrounds + runs a perceive→act→verify loop). Good for native apps (Music, Mail, Notes).
-3. **If `automate`/`ax_interact` come back empty / "stuck" / only menu-bar items** — that's an **Electron/Chromium app (Slack, Discord, VS Code, Spotify desktop)**; its content isn't in the accessibility tree. Switch to **keyboard-driven control**:
-   - `launch_app "<App>"` (foreground), then `keyboard` `type` the text and `press` `Enter`. The focused input receives it. Use app **hotkeys** to navigate (no mouse needed).
-4. **Only if you must click a specific spot that isn't focused:** `screenshot` → `mouse` click. (Screenshots are downscaled so you can see them; coordinates you read are in the returned image's pixels.)
-
-**Worked example — "message hi on Slack" (keyboard-only, no vision):**
-`launch_app "Slack"` → `keyboard hotkey "cmd+k"` (Slack quick switcher) → `keyboard type "<person or channel>"` → `keyboard press "Enter"` (opens the chat, focuses the message box) → `keyboard type "hi"` → `keyboard press "Enter"` (sends). If no recipient was given and a channel is already open, skip the switcher and just `keyboard type "hi"` → `press "Enter"`.
-
-`screenshot`/`mouse`/`keyboard` run without an approval prompt (they're on your auto-approve list) — just proceed.
-
 ## Rules
 
 - **You are the chat tier.** You run on a fast UX-focused model (TTFT > deep reasoning). When a task needs sustained multi-step thinking — planning across many steps, comparing several non-obvious options, untangling ambiguous requirements — **delegate to the reasoning tier (`delegate_plan`)** rather than reasoning through it yourself. Your job at that point is to brief the planner well and synthesise its output back to the user.

From 226b60ce439739195d9f63d97c9aee05d626254e Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 21:46:45 +0530
Subject: [PATCH 8/9] fix(notch): redact PII paths from logs + dispose pending
 socket on unmount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CodeRabbit #3345:
- notch_window.rs: log only the page source *kind* and file basenames, never
  absolute bundle paths (they contain /Users/<login>/… PII).
- NotchApp.tsx: capture connectSocket()'s disposer and call it on effect
  cleanup / on a new core-url, so a still-resolving connectCoreSocket can't
  attach listeners or setState after teardown.
---
 app/src-tauri/src/notch_window.rs | 31 ++++++++++++++++++++-----------
 app/src/notch/NotchApp.tsx        | 11 +++++++++--
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/app/src-tauri/src/notch_window.rs b/app/src-tauri/src/notch_window.rs
index 25b2d9516e..78653688cb 100644
--- a/app/src-tauri/src/notch_window.rs
+++ b/app/src-tauri/src/notch_window.rs
@@ -84,7 +84,15 @@ pub(crate) fn show(app: &AppHandle<AppRuntime>) -> Result<(), String> {
         .ok_or_else(|| "notch_window::show called off the main thread".to_string())?;
 
     let source = resolve_page_source(app)?;
-    log::info!("[notch-window] loading source={source:?}");
+    // Log only the source *kind* — bundled paths contain `/Users/<login>/…`
+    // (PII), so never log the absolute resource paths.
+    log::info!(
+        "[notch-window] loading source_kind={}",
+        match &source {
+            PageSource::Dev { .. } => "dev",
+            PageSource::Bundled { .. } => "bundled",
+        }
+    );
 
     let frame = top_center_frame(mtm);
     log::debug!(
@@ -152,10 +160,7 @@ fn resolve_page_source(app: &AppHandle<AppRuntime>) -> Result<PageSource, String
             });
         }
     }
-    Err(format!(
-        "notch bundled index.html not found under resource_dir={}",
-        resource_dir.display()
-    ))
+    Err("notch bundled index.html not found under the app resource dir".to_string())
 }
 
 // ── Frame geometry ────────────────────────────────────────────────────────────
@@ -298,15 +303,19 @@ unsafe fn build_webview(
                         let _ =
                             webview.loadFileURL_allowingReadAccessToURL(&ns_url, &read_access_ns);
                         log::info!(
-                            "[notch-window] loaded bundled index={} root={}",
-                            index_html.display(),
-                            root.display()
+                            "[notch-window] loaded bundled page index={}",
+                            index_html
+                                .file_name()
+                                .and_then(|n| n.to_str())
+                                .unwrap_or("index.html")
                         );
                     }
                     _ => log::warn!(
-                        "[notch-window] could not parse bundled file URLs index={} root={}",
-                        file_url,
-                        read_access_url
+                        "[notch-window] could not parse bundled file URLs (index={})",
+                        index_html
+                            .file_name()
+                            .and_then(|n| n.to_str())
+                            .unwrap_or("index.html")
                     ),
                 }
             }
diff --git a/app/src/notch/NotchApp.tsx b/app/src/notch/NotchApp.tsx
index d8f5352dae..3cfcd8935c 100644
--- a/app/src/notch/NotchApp.tsx
+++ b/app/src/notch/NotchApp.tsx
@@ -265,23 +265,30 @@ export default function NotchApp() {
   // ── Core URL bootstrap ──────────────────────────────────────────────────────
 
   useEffect(() => {
+    // Track the in-flight connect's disposer so an unmount (or a new core-url)
+    // cancels a still-resolving connectCoreSocket — otherwise the async branch
+    // could attach listeners / setState after teardown.
+    let disposePendingConnect: (() => void) | undefined;
+
     // Check if Rust already injected the URL before this component mounted.
     const preloaded = (window as { __OPENHUMAN_NOTCH_CORE_URL__?: string })
       .__OPENHUMAN_NOTCH_CORE_URL__;
     if (preloaded) {
-      connectSocket(preloaded);
+      disposePendingConnect = connectSocket(preloaded);
     }
 
     // Also listen for the event (fires when core becomes ready after mount).
     const handler = (e: CustomEvent<{ url: string }>) => {
       if (e.detail?.url) {
-        connectSocket(e.detail.url);
+        disposePendingConnect?.();
+        disposePendingConnect = connectSocket(e.detail.url);
       }
     };
     window.addEventListener('notch:core-url', handler as EventListener);
 
     return () => {
       window.removeEventListener('notch:core-url', handler as EventListener);
+      disposePendingConnect?.();
       socketRef.current?.disconnect();
       socketRef.current = null;
       clearDismiss();

From 9954618267d099ad8f38bb6cab1a164cb1ac46dc Mon Sep 17 00:00:00 2001
From: M3gA-Mind <megamind@mahadao.com>
Date: Thu, 4 Jun 2026 22:17:53 +0530
Subject: [PATCH 9/9] test(notch): cover NotchApp socket flow + injected notch
 core token

Clears the diff-cover gate for #3345:
- NotchApp.test.tsx: drives a mock Socket.IO through all handlers
  (dictation toggle/transcription, companion state, overlay attention),
  both bootstrap paths (preloaded global + notch:core-url event), and the
  ready/listening/transcribing/thinking/speaking/attention render modes.
- coreRpcClient: cover the host-injected __OPENHUMAN_NOTCH_CORE_TOKEN__
  fast-path in getCoreRpcToken.
---
 app/src/notch/NotchApp.test.tsx               | 124 ++++++++++++++++++
 .../services/__tests__/coreRpcClient.test.ts  |  14 ++
 2 files changed, 138 insertions(+)
 create mode 100644 app/src/notch/NotchApp.test.tsx

diff --git a/app/src/notch/NotchApp.test.tsx b/app/src/notch/NotchApp.test.tsx
new file mode 100644
index 0000000000..82b28c6249
--- /dev/null
+++ b/app/src/notch/NotchApp.test.tsx
@@ -0,0 +1,124 @@
+import { render, screen, waitFor } from '@testing-library/react';
+import { act } from 'react';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+import { connectCoreSocket } from '../services/coreSocket';
+import NotchApp from './NotchApp';
+
+// Key-passthrough i18n that honours the inline fallback the component passes
+// (`t('notch.listening', 'Listening…')`).
+vi.mock('../lib/i18n/I18nContext', () => ({
+  useT: () => ({ t: (key: string, fallback?: string) => fallback ?? key }),
+}));
+
+vi.mock('../services/coreSocket', () => ({ connectCoreSocket: vi.fn() }));
+
+// Minimal Socket.IO stand-in: records handlers so a test can replay events.
+class MockSocket {
+  static last: MockSocket | null = null;
+  handlers = new Map<string, (payload: unknown) => void>();
+  connected = false;
+  id = 'notch-test-socket';
+  connect = vi.fn(() => {
+    this.connected = true;
+    return this;
+  });
+  disconnect = vi.fn(() => {
+    this.connected = false;
+    return this;
+  });
+  on = vi.fn((event: string, handler: (payload: unknown) => void) => {
+    this.handlers.set(event, handler);
+    return this;
+  });
+  fire(event: string, payload: unknown) {
+    const handler = this.handlers.get(event);
+    if (!handler) throw new Error(`no handler registered for ${event}`);
+    act(() => handler(payload));
+  }
+  constructor() {
+    MockSocket.last = this;
+  }
+}
+
+describe('NotchApp', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+    MockSocket.last = null;
+    (window as { __OPENHUMAN_NOTCH_CORE_URL__?: string }).__OPENHUMAN_NOTCH_CORE_URL__ =
+      'http://127.0.0.1:9999';
+    vi.mocked(connectCoreSocket).mockImplementation(
+      async () => new MockSocket() as unknown as Awaited<ReturnType<typeof connectCoreSocket>>
+    );
+  });
+
+  afterEach(() => {
+    delete (window as { __OPENHUMAN_NOTCH_CORE_URL__?: string }).__OPENHUMAN_NOTCH_CORE_URL__;
+  });
+
+  const renderAndConnect = async () => {
+    render(<NotchApp />);
+    // Idle baseline pill.
+    expect(screen.getByText('Ready')).toBeInTheDocument();
+    // Wait for the async connect to register its socket handlers.
+    await waitFor(() => expect(MockSocket.last).not.toBeNull());
+    await waitFor(() =>
+      expect(MockSocket.last?.on).toHaveBeenCalledWith('overlay:attention', expect.any(Function))
+    );
+    return MockSocket.last as MockSocket;
+  };
+
+  it('connects using the preloaded core URL and shows the idle pill', async () => {
+    const socket = await renderAndConnect();
+    expect(connectCoreSocket).toHaveBeenCalledTimes(1);
+    expect(socket.connect).toHaveBeenCalled();
+  });
+
+  it('renders Listening on a dictation press', async () => {
+    const socket = await renderAndConnect();
+    socket.fire('dictation:toggle', { type: 'pressed' });
+    expect(await screen.findByText('Listening…')).toBeInTheDocument();
+  });
+
+  it('renders the transcript text on dictation:transcription', async () => {
+    const socket = await renderAndConnect();
+    socket.fire('dictation:transcription', { text: 'play some music' });
+    expect(await screen.findByText('play some music')).toBeInTheDocument();
+  });
+
+  it('maps companion:state_changed to a mode', async () => {
+    const socket = await renderAndConnect();
+    socket.fire('companion:state_changed', { state: 'thinking' });
+    expect(await screen.findByText('Processing…')).toBeInTheDocument();
+  });
+
+  it('renders an overlay:attention message', async () => {
+    const socket = await renderAndConnect();
+    socket.fire('overlay:attention', { message: 'Opening Music', ttl_ms: 5000 });
+    expect(await screen.findByText('Opening Music')).toBeInTheDocument();
+  });
+
+  it('handles speaking, released and idle transitions without throwing', async () => {
+    const socket = await renderAndConnect();
+    socket.fire('companion:state_changed', { state: 'speaking' });
+    expect(await screen.findByText('Speaking…')).toBeInTheDocument();
+    // Released schedules a LINGER_MS dismiss; idle schedules a zero-delay one —
+    // both exercise scheduleDismiss, but nothing is asserted after these fires.
+    socket.fire('dictation:toggle', { type: 'released' });
+    socket.fire('companion:state_changed', { state: 'idle' });
+  });
+
+  it('connects via the notch:core-url event when no URL was preloaded', async () => {
+    delete (window as { __OPENHUMAN_NOTCH_CORE_URL__?: string }).__OPENHUMAN_NOTCH_CORE_URL__;
+    render(<NotchApp />);
+    expect(screen.getByText('Ready')).toBeInTheDocument();
+    expect(connectCoreSocket).not.toHaveBeenCalled();
+
+    act(() =>
+      window.dispatchEvent(
+        new CustomEvent('notch:core-url', { detail: { url: 'http://127.0.0.1:8888' } })
+      )
+    );
+    await waitFor(() => expect(connectCoreSocket).toHaveBeenCalledTimes(1));
+  });
+});
diff --git a/app/src/services/__tests__/coreRpcClient.test.ts b/app/src/services/__tests__/coreRpcClient.test.ts
index af2c7ce88e..e82d577cfe 100644
--- a/app/src/services/__tests__/coreRpcClient.test.ts
+++ b/app/src/services/__tests__/coreRpcClient.test.ts
@@ -1105,6 +1105,20 @@ describe('getCoreRpcToken (cloud-mode persistence)', () => {
     expect(headers.Authorization).toBe('Bearer cloud-token-abc');
   });
 
+  test('honours the host-injected notch core token before the cache/store', async () => {
+    // The notch / overlay WKWebViews have no Tauri IPC; the Rust host injects
+    // the bearer as a global, which must win ahead of the resolution cache.
+    (globalThis as { __OPENHUMAN_NOTCH_CORE_TOKEN__?: string }).__OPENHUMAN_NOTCH_CORE_TOKEN__ =
+      'notch-bearer-xyz';
+    try {
+      const { getCoreRpcToken } = await import('../coreRpcClient');
+      await expect(getCoreRpcToken()).resolves.toBe('notch-bearer-xyz');
+    } finally {
+      delete (globalThis as { __OPENHUMAN_NOTCH_CORE_TOKEN__?: string })
+        .__OPENHUMAN_NOTCH_CORE_TOKEN__;
+    }
+  });
+
   test('clearCoreRpcTokenCache forces a re-resolve on the next call', async () => {
     let storedToken: string | null = 'first-token';
     vi.doMock('../../utils/configPersistence', () => ({