diff --git a/.gitignore b/.gitignore
index 6c99549..1956dfd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,5 +5,7 @@ dist/
.DS_Store
Thumbs.db
debug/
+.clawdcursor-config.json
.clawd-config.json
qa-tests/
+.claude/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8da58c..a8428bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,7 @@ All notable changes to Clawd Cursor will be documented in this file.
### Fixed
- **Checkpoint system overhaul** — removed auto-termination (completionRatio ≥ 0.90 early exit and isComplete() mid-loop kill), strict detection: content_pasted requires Ctrl+V, content_copied requires Ctrl+C, second_app_opened detects any window switch universally
- **Pipeline context passing** — `priorContext[]` accumulator flows from pre-processing through to Computer Use (no more amnesia between layers)
-- **Credential resolution order** — .clawd-config → auth-profiles.json → openclaw.json (with template expansion) → env vars
+- **Credential resolution order** — .clawdcursor-config → auth-profiles.json → openclaw.json (with template expansion) → env vars
- **`loadPipelineConfig()` path resolution** — checks package dir first, then cwd (fixes global npm installs)
- **Smart Interaction model lookup** — uses `PROVIDERS` registry instead of hardcoded model/baseUrl maps; fixes stale `claude-haiku-3-5-20241022` fallback
- **Scroll behavior** — system prompts instruct PageDown/Space instead of tiny mouse scrolls; default scroll delta 3 → 15
@@ -130,7 +130,7 @@ All notable changes to Clawd Cursor will be documented in this file.
- **Case-preserving action router** — all regex matches against raw (unmodified) task text. Typed text and URLs no longer get lowercased.
- **Flexible click matching** — `click Blank document` works without quotes (was requiring `click "Blank document"`). Single unified regex for quoted and unquoted element names.
- **PowerShell encoding** — replaced emoji (🐾) and em dash (—) in task console title that broke on Windows PowerShell due to encoding.
-- **Stale config** — `.clawd-config.json` now correctly reflects Ollama when doctor detects it (was stuck on Anthropic).
+- **Stale config** — `.clawdcursor-config.json` now correctly reflects Ollama when doctor detects it (was stuck on Anthropic).
- **Brain provider mismatch** — decomposition no longer calls Anthropic API when only Ollama is available.
### Changed
@@ -235,7 +235,7 @@ Layer 3: Screenshot + Vision — full screenshot, Computer Use API
## [0.5.0] - 2026-02-23 — Smart Pipeline + Doctor + Batch Execution
### Added
-- **`clawd-cursor doctor`** — auto-diagnoses setup, tests models, configures optimal pipeline
+- **`clawdcursor doctor`** — auto-diagnoses setup, tests models, configures optimal pipeline
- **3-layer pipeline** — Action Router → Accessibility Reasoner → Screenshot fallback
- **Layer 2: Accessibility Reasoner** (`src/a11y-reasoner.ts`) — text-only LLM reads the UI tree, no screenshots needed. Uses cheap models (Haiku, Qwen, GPT-4o-mini).
- **Batch action execution** — Claude returns multiple actions per response (3.6 avg), skipping screenshots between batched actions. Drawing tasks execute 10+ actions in a single API call.
diff --git a/README.md b/README.md
index b7cf129..364a012 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,8 @@
Clawd Cursor
- AI Desktop Agent — Universal Smart Pipeline
- Works with any AI provider · Runs free with local models · Self-healing doctor
+ OS-level desktop automation server. Gives any AI model eyes, hands, and ears on a real computer.
+ Model-agnostic · Works with Claude, GPT, Gemini, Llama, or any tool-calling model · Free with local models
@@ -14,83 +14,132 @@
- Website · Discord · Quick Start · How It Works · API · Changelog
+ Website · Discord · Quick Start · Connect · How It Works · API · Changelog
---
-## What's New in v0.6.3
+## What's New in v0.7.0
+
+**Architecture overhaul. Universal tool server. True independence.**
+
+- **6-layer smart pipeline** — L0 (Browser) -> L1 (Action Router) -> L1.5 (Deterministic Flows) -> L2 (A11y Reasoner + CDP) -> L2.5 (Vision Hints) -> L3 (Computer Use). Most tasks never reach L3.
+- **40 universal tools** — served via REST (`GET /tools`, `POST /execute/:name`) and MCP stdio from a single definition. Any model that can call functions can control your desktop.
+- **3 transport modes** — `start` (full agent + tools), `serve` (tools only, bring your own brain), `mcp` (MCP stdio for Claude Code, Cursor, Windsurf, Zed)
+- **CDP browser integration** — Chrome DevTools Protocol for DOM interaction, text extraction, click-by-selector. Auto-connects to Edge/Chrome.
+- **Action verifier** — ground-truth checking after every action. Blocks false success reports.
+- **A11y click resolver** — bounds-based coordinate resolution, zero LLM cost
+- **Deterministic flows** — hardcoded keyboard sequences for common tasks (email compose, app switch). Zero LLM calls, instant.
+- **No-progress loop detector** — blocks same action repeated 3+ times. Forces the LLM to try something different.
+- **Premature-done blocker** — evidence-based completion checking. Won't report success unless verified.
+- **Structured task logging** — JSONL per-task logs with `verified_success` vs `unverified_success` distinction
+- **First-run onboarding** — consent flow explains what desktop control means before tools activate
+- **Standalone data directory** — all data in `~/.clawdcursor/` (migrates from legacy paths automatically)
+- **Error reporting** (opt-in) — `clawdcursor report` lets users send redacted task logs to help improve the agent
+
+### v0.6.3 vs v0.7.0
+
+| | v0.6.3 | v0.7.0 |
+|---|---|---|
+| **Architecture** | 5-layer pipeline (L0, L1, L1.5, L2, L3) | 6-layer pipeline (L0, L1, L1.5, L2, L2.5, L3) |
+| **Transport** | REST API only | REST + MCP stdio + tools-only server |
+| **Tools** | Monolithic agent, no tool exposure | 40 discrete tools, OpenAI function-calling format |
+| **Browser** | Playwright-only, no DOM access | CDP integration — click by selector, read text, type by label |
+| **Verification** | LLM self-reports success (often wrong) | Ground-truth action verifier — reads actual content back |
+| **False positives** | Common — agent says "done" prematurely | Premature-done blocker + evidence-based completion |
+| **Loops** | Agent can repeat same failed action forever | No-progress detector blocks after 3 repeats in 8 steps |
+| **Click resolution** | Vision model guesses coordinates | A11y bounds-based resolver (zero LLM cost), vision as fallback |
+| **Common tasks** | Every task goes through LLM | Deterministic flows for email, app-switch — zero LLM calls |
+| **Task logging** | Console output only | Structured JSONL per-task, verified vs unverified success |
+| **Data directory** | `~/.openclaw/clawdcursor/` (coupled) | `~/.clawdcursor/` (standalone, auto-migrates) |
+| **Dependencies** | Tied to OpenClaw platform | Fully standalone — works with any AI, any client |
+| **Onboarding** | None — starts immediately | First-run consent flow for desktop control |
+| **MCP support** | None | Native MCP stdio for Claude Code, Cursor, Windsurf, Zed |
+| **Error reporting** | None | Opt-in redacted task log submission |
+| **Model coupling** | Anthropic-favored defaults | Truly model-agnostic — Claude, GPT, Gemini, Llama, Ollama, anything |
-**Universal Pipeline, Multi-App Workflows, Provider-Agnostic.**
+---
-- **🧠 LLM-based task pre-processor** — one cheap text LLM call decomposes any command into structured intent. No more brittle regex parsing.
-- **📋 Multi-app workflows** — copy from Wikipedia, paste in Notepad? Works. 6-checkpoint tracking ensures every step completes (select → copy → switch app → click → paste → verify).
-- **⌨️ Site-specific shortcuts** — Reddit (j/k/a/c), Twitter/X, YouTube, Gmail, GitHub, Slack + generic hints. Vision LLM uses keyboard instead of slow mouse clicks.
-- **🌐 OS-level browser detection** — reads Windows registry or macOS LaunchServices for actual default browser. No hardcoded Edge/Safari.
-- **🔄 3 smart verification retries** — on failure, builds step log digest + checkpoint status so the vision LLM fixes the exact missed step.
-- **🔌 Mixed-provider pipelines** — kimi for text + anthropic for Computer Use, with per-layer API key resolution from OpenClaw auth-profiles.
-- **🔧 Global install fix** — config discovery now checks package dir first, then cwd.
-- **🏗️ Provider-agnostic internals** — no hardcoded model names, no hardcoded app lists, universal checkpoint detection.
+## The Glove for Any AI Hand
-## What's New in v0.6.1
+Think of your AI as the **hand** and Clawd Cursor as the **glove**.
-**Keyboard Shortcuts, Pipeline Fixes, Better URL Handling.**
+The hand has the intelligence — it reasons, plans, and decides what to do. The glove gives it grip on the physical world. Clawd Cursor wraps your entire desktop — every window, every button, every text field, every pixel — and exposes it as simple tool calls that any AI model can use.
-- **⌨️ Keyboard shortcuts registry** — common actions (scroll, copy, reddit upvote) execute as direct keystrokes. Zero LLM calls, instant.
-- **🔧 Pipeline gate fix** — Action Router now always runs, even for browser-context tasks. Shortcuts work everywhere.
-- **🌐 Smarter URL extraction** — "open gmail and send email to foo@bar.com" correctly navigates to Gmail instead of bar.com.
-- **🔄 CDP→UIDriver fallback** — Smart Interaction falls back to accessibility tree when browser CDP fails.
-- **🛑 Reliable force-stop** — `clawdcursor stop` kills lingering processes.
-- **📊 Provider label inference** — startup logs show text/vision providers clearly.
+**Your AI is the brain. Clawd Cursor is the body.**
-## What's New in v0.6.0
+If it's visible on your screen, Clawd Cursor can interact with it. Native apps, web apps, legacy software, internal tools, desktop games — anything with a GUI. No app-specific integrations needed. No APIs to configure per-service. One universal interface that turns any AI into a desktop operator.
-**Universal Provider Support, OpenClaw Integration, Security Hardening.**
+This is what makes v0.7.0 different from every other automation tool: **it doesn't care which AI drives it.** Claude, GPT, Gemini, Llama running locally, a custom model you trained yourself, or a simple Python script making function calls. If it can call tools, it can control your computer.
-- **🔗 OpenClaw integration** — auto-discovers all configured providers from OpenClaw's config. No separate API key needed when running as a skill.
-- **🌐 Universal provider support** — Anthropic, OpenAI, Groq, Together AI, DeepSeek, Kimi, Ollama, or any OpenAI-compatible endpoint. Provider auto-detected from API key format.
-- **🧠 Mixed provider pipelines** — use Ollama for text (free) + cloud for vision (best quality). Doctor picks the optimal split automatically.
-- **🔒 Security hardened** — sensitive app policy (agents must ask before email/banking/messaging), safety tiers enforced, no credentials stored in skill files.
-- **🔧 Auto-detection as default** — no hardcoded models or providers. Doctor dynamically picks the best available setup.
+```
+Your AI (any model) Clawd Cursor (the glove)
+ "Click the Send button" -> find_element + mouse_click
+ "What's on screen?" -> desktop_screenshot + read_screen
+ "Type my email" -> type_text
+ "Open Chrome to gmail" -> open_app + navigate_browser
+ "Read that table" -> cdp_read_text
+```
-### v0.5.6 — Fluid Decomposition, Interactive Doctor, Smart Vision Fallback
+---
-- **🧠 Fluid task decomposition** — LLM reasons about what ANY app needs instead of matching hardcoded patterns.
-- **🩺 Interactive doctor** — scans all providers, detects GPU/VRAM, lets you pick TEXT and VISION LLMs.
-- **🖥️ Smart vision fallback** — remaining subtasks bundled and handed to vision when cheap layers fail midway.
+## Three Ways to Connect
-### v0.5.2 — Web Dashboard + Browser Foreground Focus
+Clawd Cursor is a **tool server**. It doesn't care which AI model drives it.
-- **🖥️ Web Dashboard** — real-time logs, approve/reject safety confirmations, kill switch. Dark theme, zero dependencies.
-- **🪟 Browser foreground focus** — Playwright activates Chrome at OS level. No more invisible background tabs.
-- **Multi-provider** — 7+ providers supported out of the box
-- **95% cheaper** — simple tasks run for $0 with local models
-- **Self-healing** — if a model fails, the pipeline adapts automatically
+### 1. Built-in Agent (`start`)
-### Performance
+Full autonomous agent with built-in LLM pipeline. Send a task, get a result.
-| Task | v0.4 (single provider) | v0.5+ (local, $0) | v0.5+ (cloud) |
-|------|-----------------------|---------------------|-------------------|
-| Calculator (255*38=) | 43s | **2.6s** | **20.1s** |
-| Notepad (type hello) | 73s | **2.0s** | **54.2s** |
-| File Explorer | 53s | **1.9s** | **22.1s** |
-| Gmail compose | 162s (18 LLM calls) | — | **21.7s** (1 LLM call) |
+```bash
+clawdcursor start
+curl http://localhost:3847/task -H "Content-Type: application/json" \
+ -d '{"task": "Open Notepad and write a haiku about the ocean"}'
+```
----
+### 2. Tools-Only Server (`serve`)
-## OpenClaw Integration
+Exposes 40 desktop tools via REST API. **You** bring the brain — Claude, GPT, Gemini, Llama, a script, anything.
-Clawd Cursor ships as an [OpenClaw](https://openclaw.ai) skill. Install it and any OpenClaw agent — yours or community-built — can control your desktop through natural language.
+```bash
+clawdcursor serve
-The [`SKILL.md`](SKILL.md) teaches agents **when and how** to use Clawd Cursor: REST API for full desktop control, CDP direct for fast browser reads. Agents learn to be independent — no more asking you to screenshot or copy-paste things they can do themselves.
+# Discover available tools (OpenAI function-calling format)
+curl http://localhost:3847/tools
-For orchestration best practices (how to avoid overlap and keep OpenClaw + Clawd Cursor efficient), see [docs/OPENCLAW-INTEGRATION-RECOMMENDATIONS.md](docs/OPENCLAW-INTEGRATION-RECOMMENDATIONS.md).
+# Execute any tool
+curl -X POST http://localhost:3847/execute/desktop_screenshot
+curl http://localhost:3847/execute/mouse_click -H "Content-Type: application/json" -d '{"x": 500, "y": 300}'
+curl http://localhost:3847/execute/type_text -H "Content-Type: application/json" -d '{"text": "Hello world"}'
+```
-```bash
-# Install as OpenClaw skill
-openclaw skills install clawd-cursor
+### 3. MCP Mode (`mcp`)
+
+Runs as an MCP tool server over stdio. Works with Claude Code, Cursor, Windsurf, Zed, or any MCP-compatible client.
+
+```jsonc
+// Claude Code: ~/.claude/settings.json
+{
+ "mcpServers": {
+ "clawdcursor": {
+ "command": "node",
+ "args": ["/path/to/clawdcursor/dist/index.js", "mcp"]
+ }
+ }
+}
```
+### Tool Categories (40 tools)
+
+| Category | Tools | Examples |
+|----------|-------|---------|
+| Perception | 9 | `desktop_screenshot`, `read_screen`, `get_active_window`, `get_focused_element`, `smart_read`, `ocr_read_screen` |
+| Mouse | 6 | `mouse_click`, `mouse_double_click`, `mouse_drag`, `mouse_scroll` |
+| Keyboard | 5 | `key_press`, `type_text`, `smart_type`, `shortcuts_list`, `shortcuts_execute` |
+| Window/App | 6 | `focus_window`, `open_app`, `get_windows`, `invoke_element` |
+| Browser CDP | 10 | `cdp_connect`, `cdp_click`, `cdp_type`, `cdp_read_text` |
+| Orchestration | 4 | `delegate_to_agent`, `smart_click`, `navigate_browser`, `wait` |
+
---
## Quick Start
@@ -98,278 +147,143 @@ openclaw skills install clawd-cursor
### Windows
```powershell
-git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor
+git clone https://github.com/AmrDab/clawdcursor.git
+cd clawdcursor
npm install
npm run setup # builds + registers 'clawdcursor' command globally
-# Just install and start — auto-configures from OpenClaw or env vars
+# Start the full agent
clawdcursor start
-# Or specify any provider
-clawdcursor start --base-url https://api.example.com/v1 --api-key KEY
+# Or start tools-only (bring your own AI)
+clawdcursor serve
-# Fine-tune setup interactively (optional)
-clawdcursor doctor
+# Or run as MCP server
+clawdcursor mcp
```
### macOS
```bash
-git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor && npm install && npm run setup
+git clone https://github.com/AmrDab/clawdcursor.git
+cd clawdcursor && npm install && npm run setup
# Grant Accessibility permissions to your terminal first!
-# System Settings → Privacy & Security → Accessibility → Add Terminal/iTerm
+# System Settings -> Privacy & Security -> Accessibility -> Add Terminal/iTerm
-# Make macOS scripts executable
chmod +x scripts/mac/*.sh scripts/mac/*.jxa
-
-# Just start — auto-detects available providers
clawdcursor start
-
-# Or specify any provider
-clawdcursor start --base-url https://api.example.com/v1 --api-key KEY
```
### Linux
```bash
-git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor && npm install && npm run setup
+git clone https://github.com/AmrDab/clawdcursor.git
+cd clawdcursor && npm install && npm run setup
-# Linux: browser control via CDP only (no native desktop automation)
-# Just start — auto-detects available providers
+# Linux: browser control via CDP only (no native desktop automation yet)
clawdcursor start
-
-# Or specify any provider
-clawdcursor start --base-url https://api.example.com/v1 --api-key KEY
```
-> 📖 See [docs/MACOS-SETUP.md](docs/MACOS-SETUP.md) for the full macOS onboarding guide.
-
-First run auto-configuration will:
-1. Scan for AI providers from OpenClaw config, environment variables, and CLI flags
-2. Quick-test discovered providers (5s timeout per provider)
-3. Build the optimal pipeline automatically
-4. Save config and start immediately
-
-The optional `doctor` command provides interactive configuration:
-1. Tests your screen capture and accessibility bridge
-2. Scans all AI providers (Anthropic, OpenAI, Groq, Together, DeepSeek, Kimi, Ollama) and detects GPU/VRAM
-3. Tests each model and shows you what works with latency
-4. Lets you pick your TEXT LLM and VISION LLM (or accept the recommended defaults)
-5. Shows setup instructions for any unconfigured cloud providers
-6. Builds your optimal pipeline and saves it
-
-Send a task:
-```bash
-clawdcursor task "Open Notepad and type hello world"
-
-# Or via API:
-curl http://localhost:3847/task -H "Content-Type: application/json" \
- -d '{"task": "Open Notepad and type hello world"}'
-```
+> See [docs/MACOS-SETUP.md](docs/MACOS-SETUP.md) for the full macOS onboarding guide.
-> **Note:** `npm run setup` runs `npm run build && npm link`, which registers `clawdcursor` as a global command. If you prefer not to link globally, run `npm run build` instead and use `npx clawdcursor` or `node dist/index.js` to run commands.
+First run will:
+1. Show a desktop control consent warning (one-time)
+2. Scan for AI providers from environment variables and CLI flags
+3. Auto-configure the optimal pipeline
+4. Start the server on `http://localhost:3847`
-### Provider Quick Setup
+### Provider Setup
**Free (no API key needed):**
```bash
-# Just need Ollama running with any model
-ollama pull # e.g. qwen2.5:7b, llama3.2, gemma2
-clawdcursor doctor
+ollama pull qwen2.5:7b # or any model
clawdcursor start
```
**Any cloud provider:**
```bash
echo "AI_API_KEY=your-key-here" > .env
-clawdcursor doctor
+clawdcursor doctor # optional — auto-detects from key format
clawdcursor start
```
-Doctor auto-detects your provider from the key format. Supported out of the box:
-
-| Provider | Key prefix | Vision | Computer Use |
-|----------|-----------|--------|-------------|
-| Anthropic | `sk-ant-` | ✅ | ✅ |
-| OpenAI | `sk-` | ✅ | ❌ |
-| Groq | `gsk_` | ✅ | ❌ |
-| Together AI | — | ✅ | ❌ |
-| DeepSeek | — | ✅ | ❌ |
-| Kimi/Moonshot | `sk-` (long) | ❌ | ❌ |
-| Any OpenAI-compatible | — | varies | ❌ |
-
-For providers without key prefix detection, specify explicitly:
-```bash
-clawdcursor doctor --provider together --api-key YOUR_KEY
-```
-
-**OpenClaw users:** No setup needed — Clawd Cursor auto-discovers all your configured providers.
-
----
-
-## Compatibility (v0.6.0 Audit)
-
-Cross-platform checks are now automated in GitHub Actions on **Windows, macOS, and Linux** for both **Node 20** and **Node 22** (build + test).
-
-| OS | Status | Notes |
-|----|--------|-------|
-| Windows 10/11 | ✅ Full support | Native desktop automation via PowerShell + UI Automation scripts. |
-| macOS 13+ | ✅ Full support | Native desktop automation via JXA/System Events scripts. |
-| Linux | ⚠️ Partial support | Browser/CDP flows work. Native desktop automation requires X11 native libs (for `@nut-tree-fork/nut-js`) and may still vary by distro/desktop environment. |
-
-**Linux prerequisites for native automation** (Debian/Ubuntu example):
-
+**Explicit provider:**
```bash
-sudo apt-get update
-sudo apt-get install -y libxtst6 libx11-xcb1 libxcomposite1 libxdamage1 libxfixes3 libxi6 libxrandr2 libxtst-dev
+clawdcursor start --provider anthropic --api-key sk-ant-...
+clawdcursor start --base-url https://api.example.com/v1 --api-key KEY
```
-If these libraries are missing, `clawdcursor doctor` can fail on startup with errors like `libXtst.so.6: cannot open shared object file`.
+| Provider | Key prefix | Vision | Computer Use |
+|----------|-----------|--------|-------------|
+| Anthropic | `sk-ant-` | Yes | Yes |
+| OpenAI | `sk-` | Yes | No |
+| Groq | `gsk_` | Yes | No |
+| Together AI | - | Yes | No |
+| DeepSeek | - | Yes | No |
+| Kimi/Moonshot | `sk-` (long) | No | No |
+| Ollama (local) | - | Auto-detected | No |
+| Any OpenAI-compatible | - | Varies | No |
---
## How It Works
-### The Smart Pipeline
+### The 6-Layer Pipeline
-Every task is pre-processed by a cheap text LLM, then flows through up to 5 layers. Each layer is cheaper and faster than the next. Most tasks never reach Layer 3.
+Every task flows through layers cheapest-first. Most tasks complete at Layer 1 or 2 — Layer 3 is the expensive fallback.
```
-┌─────────────────────────────────────────────────────┐
-│ Pre-processor: LLM Task Decomposition (1 text call) │
-│ Parses any natural language → {app, navigate, task, │
-│ contextHints}. Opens app + navigates URL before │
-│ pipeline starts. Detects multi-app workflows. │
-├─────────────────────────────────────────────────────┤
-│ Layer 0: Browser (Playwright — free, instant) │
-│ Direct browser control via CDP. page.goto(), │
-│ brings Chrome to foreground. Zero vision tokens. │
-├─────────────────────────────────────────────────────┤
-│ Layer 1: Action Router + Shortcuts (instant, free) │
-│ Regex + UI Automation. "Open X", "type Y", "click Z"│
-│ Includes keyboard shortcuts registry — common │
-│ actions like scroll, copy, undo, reddit upvote │
-│ execute as direct keystrokes. Zero LLM calls. │
-├─────────────────────────────────────────────────────┤
-│ Layer 1.5: Smart Interaction (1 LLM call) │
-│ CDPDriver (browser) or UIDriver (desktop apps). │
-│ LLM plans steps → executes via selectors/a11y. │
-├─────────────────────────────────────────────────────┤
-│ Layer 2: Accessibility Reasoner (fast, cheap/free) │
-│ Reads the accessibility tree, sends to cheap LLM │
-│ (Haiku, Qwen, GPT-4o-mini). No screenshots needed │
-├─────────────────────────────────────────────────────┤
-│ Layer 3: Computer Use / Vision (powerful, expensive) │
-│ Full screenshot → vision LLM with site-specific │
-│ shortcuts + scroll guidance + multi-app workflows. │
-│ 3 smart verification retries with step log analysis. │
-└─────────────────────────────────────────────────────┘
+User Task
+ |
+ v
+Pre-processor (1 cheap LLM call)
+ Decomposes "open gmail and send email to bob" into
+ structured intent: {app, url, action, contextHints}
+ |
+ v
+Layer 0: Browser (free, instant)
+ Direct CDP: page.goto(), DOM reads, click by selector
+ |
+ v
+Layer 1: Action Router + Shortcuts (free, instant)
+ Regex matching + keyboard shortcuts registry
+ "scroll down" -> Page Down, "copy" -> Ctrl+C
+ |
+ v
+Layer 1.5: Deterministic Flows (free, instant)
+ Hardcoded sequences for known tasks (email compose, app switch)
+ |
+ v
+Layer 2: A11y Reasoner + CDP (cheap, 1 LLM call)
+ Reads accessibility tree or CDP DOM -> sends to cheap LLM
+ LLM decides: click, type, key_press, cdp_click, done
+ Action verifier confirms each step worked
+ |
+ v
+Layer 2.5: Vision Hints (1 screenshot)
+ Screenshot -> vision LLM for spatial hints when A11y is blind
+ |
+ v
+Layer 3: Computer Use / Vision (expensive, full)
+ Screenshot -> vision LLM with site-specific shortcuts
+ 3 smart retries with step log analysis
```
-**The doctor decides which layers are available** based on your setup. No API key? Layers 0-2 with Ollama. Anthropic key? All layers with Computer Use.
-
-### Keyboard Shortcuts (Layer 1)
-
-Clawd Cursor ships with a keyboard shortcuts registry. Common actions execute as direct keystrokes — no LLM calls, no screenshots, instant.
-
-| Category | Examples |
-|----------|----------|
-| Navigation | scroll up/down, page up/down, go back/forward |
-| Editing | copy, paste, undo, redo, select all |
-| Browser | new tab, close tab, refresh, find |
-| Social | reddit upvote/downvote, next/prev post |
-| System | minimize, maximize, switch window |
-
-Custom shortcuts can be added to `src/shortcuts.ts`. The action router uses fuzzy matching — "scroll the page down" maps to the scroll-down shortcut automatically.
-
-### Provider-Specific Behavior
-
-| Provider | Layer 1 | Layer 2 (text) | Layer 3 (vision) | Computer Use |
-|----------|---------|----------------|-------------------|-------------|
-| Anthropic | ✅ | Haiku | Sonnet | ✅ Native |
-| OpenAI | ✅ | GPT-4o-mini | GPT-4o | ❌ |
-| Groq | ✅ | Llama 3.3 70B | Llama 3.2 90B Vision | ❌ |
-| Together AI | ✅ | Llama 3.1 70B | Llama 3.2 90B Vision | ❌ |
-| DeepSeek | ✅ | DeepSeek Chat | DeepSeek Chat | ❌ |
-| Kimi | ✅ | Moonshot-8k | Moonshot-8k | ❌ |
-| Ollama | ✅ | Auto-detected | Auto-detected | ❌ |
-| No key | ✅ | ❌ | ❌ | ❌ |
-
-**Mixed providers:** Doctor can configure Ollama for text (free) + a cloud provider for vision (best quality). The pipeline picks the cheapest option for each layer automatically.
-
-### Self-Healing
-
-The pipeline adapts at runtime:
-- **Model fails?** → Circuit breaker trips, falls to next layer
-- **API rate limited?** → Exponential backoff + automatic retry
-- **Doctor detects issues?** → Falls back to available alternatives (e.g., cloud model unavailable → local Ollama)
+### Action Verification
----
+Every action is verified after execution:
+- **Type actions**: Reads back the focused element's text content
+- **Click actions**: Checks if window/focus changed as expected
+- **Key presses**: Verifies the expected state change occurred
+- **CDP actions**: Re-reads DOM to confirm changes
+- **Task completion**: Ground-truth check reads actual content (Notepad text, email window state, etc.)
-## Doctor
+If verification fails, the agent retries with a different approach instead of reporting false success.
-```bash
-npm run doctor
-```
+### No-Progress Detection
-```
-🩺 Clawd Cursor Doctor - diagnosing your setup...
-
-📸 Screen capture...
- ✅ 2560x1440, 110ms
-♿ Accessibility bridge...
- ✅ 20 windows detected, 822ms
-
-🔍 Scanning providers...
- Anthropic: ✅ key found (sk-ant-a...)
- OpenAI: ❌ no key
- Groq: ❌ no key
- Together AI: ❌ no key
- DeepSeek: ❌ no key
- Kimi (Moonshot): ❌ no key
- Ollama (Local): ✅ running (qwen2.5:7b, llama3.2)
-
- 💡 Cloud providers not configured (add API keys to unlock):
- OpenAI: set OPENAI_API_KEY — https://platform.openai.com
- Groq: set GROQ_API_KEY — https://console.groq.com
- Together AI: set TOGETHER_API_KEY — https://api.together.xyz
-
- Testing models...
- Text: claude-haiku-4-5 (Anthropic) ✅ 498ms
- Vision: claude-sonnet-4 (Anthropic) ✅ 1217ms
- Text: qwen2.5:7b (Ollama) ✅ 4117ms
-
-🎮 GPU detected: NVIDIA GeForce RTX 3080 (10240 MB VRAM)
-
-🧩 Choose your pipeline models (press Enter for recommended).
- TEXT LLM (Layer 2):
- 1. claude-haiku-4-5 (Anthropic, 498ms)
- 2. qwen2.5:7b (Ollama, 4117ms) ★ recommended
- Pick 1-2 (Enter=2):
-
- VISION LLM (Layer 3):
- 1. claude-sonnet-4 (Anthropic, 1217ms) ★ recommended
- Pick 1 (Enter=1):
-
-🧠 Selected pipeline:
- Layer 1: Action Router (offline) ✅
- Layer 2: qwen2.5:7b via Ollama ✅
- Layer 3: claude-sonnet-4 via Anthropic ✅
- 🖥️ Computer Use API: enabled
-
-💾 Config saved to .clawd-config.json
-```
-
-Options:
-```
---provider Force a provider (anthropic|openai|ollama|kimi)
---api-key Override API key
---no-save Don't save config to disk
-```
+If the LLM repeats the same action 3+ times in an 8-step window, it's blocked and forced to try something different. Combined with the premature-done blocker (requires evidence of completion for write tasks), this prevents the two most common failure modes: infinite loops and premature success.
---
@@ -380,11 +294,19 @@ Options:
| Endpoint | Method | Description |
|----------|--------|-------------|
| `/` | GET | Web dashboard UI |
-| `/task` | POST | Execute a task: `{"task": "Open Chrome"}` |
+| `/tools` | GET | List all 40 tools (OpenAI function-calling format) |
+| `/execute/:name` | POST | Execute a tool by name |
+| `/task` | POST | Submit a task: `{"task": "Open Chrome"}` |
| `/status` | GET | Agent state and current task |
-| `/logs` | GET | Last 200 log entries (JSON array) |
-| `/confirm` | POST | Approve/reject pending action |
+| `/task-logs` | GET | Recent task summaries (structured JSONL) |
+| `/task-logs/current` | GET | Current task's step-by-step log |
+| `/report` | POST | Submit an error report (opt-in) |
+| `/logs` | GET | Last 200 console log entries |
+| `/screenshot` | GET | Current screen as PNG |
+| `/action` | POST | Direct action execution (LLM-space coords) |
+| `/confirm` | POST | Approve/reject pending safety action |
| `/abort` | POST | Stop the current task |
+| `/favorites` | GET/POST/DELETE | Saved command favorites |
| `/stop` | POST | Graceful server shutdown |
| `/health` | GET | Server health + version |
@@ -393,95 +315,87 @@ Options:
## Architecture
```
-┌───────────────────────────────────────────────────┐
-│ Your Desktop (Native Control) │
-│ @nut-tree-fork/nut-js · Playwright · OS-level │
-└──────────────────────┬────────────────────────────┘
- │
-┌──────────────────────┴────────────────────────────┐
-│ Clawd Cursor Agent │
-│ │
-│ ┌────────┐ ┌────────┐ ┌───────┐ ┌─────┐ ┌─────┐│
-│ │Layer 0 │ │Layer 1 │ │L 1.5 │ │ L2 │ │ L3 ││
-│ │Browser │→│Action │→│Smart │→│A11y │→│Vision││
-│ │Playwrt │ │Router+ │ │Interac│ │Tree │ │+CU ││
-│ │(free) │ │Shortct │ │(1 LLM)│ │(cheap│ │(full)││
-│ └────────┘ └────────┘ └───────┘ └─────┘ └─────┘│
-│ ↑ │
-│ ┌──────────┐ ┌────────────────┐ │
-│ │ Doctor │ │ Web Dashboard │ │
-│ │ Auto-cfg │ │ localhost:3847 │ │
-│ └──────────┘ └────────────────┘ │
-│ │
-│ Safety Layer · REST API · Circuit Breaker │
-└────────────────────────────────────────────────────┘
+ Any AI Model
+ (Claude, GPT, Gemini, Llama, scripts, etc.)
+ |
+ +-------------+-------------+
+ | | |
+ REST API MCP stdio Built-in Agent
+ (serve) (mcp) (start)
+ | | |
+ +-------------+-------------+
+ |
+ Clawd Cursor Tool Server
+ 40 tools, single definition
+ |
+ +---------+-------+-------+---------+
+ | | | | |
+ Perception Mouse Keyboard Window Browser
+ screenshot click key_press focus cdp_click
+ read_screen drag type_text open cdp_type
+ a11y_tree scroll switch cdp_read
+ |
+ Native Desktop Layer
+ nut-js + PowerShell/JXA + Playwright
+ |
+ Your Desktop
```
---
-## Safety Tiers
+## Safety
| Tier | Actions | Behavior |
|------|---------|----------|
-| 🟢 Auto | Navigation, reading, opening apps | Runs immediately |
-| 🟡 Preview | Typing, form filling | Logs before executing |
-| 🔴 Confirm | Sending messages, deleting, purchases | Pauses for approval |
+| Auto | Navigation, reading, opening apps | Runs immediately |
+| Preview | Typing, form filling | Logs before executing |
+| Confirm | Sending messages, deleting, purchases | Pauses for approval |
-## CLI Options
+First run shows a desktop control consent warning. Dangerous key combos (Alt+F4, Ctrl+Alt+Del) are blocked. Server binds to localhost only.
+
+## CLI Commands
```
-clawdcursor start Start the agent
+clawdcursor start Start the full agent (built-in LLM pipeline)
+clawdcursor serve Start tools-only server (no built-in LLM)
+clawdcursor mcp Run as MCP tool server over stdio
clawdcursor doctor Diagnose and auto-configure
clawdcursor task Send a task to running agent
-clawdcursor dashboard Open the web dashboard in your browser
-clawdcursor kill Stop the running server
+clawdcursor report Send an error report (opt-in, redacted)
+clawdcursor dashboard Open the web dashboard
+clawdcursor install Set up API key and configure pipeline
+clawdcursor uninstall Remove all config and data
clawdcursor stop Stop the running server
+clawdcursor kill Force stop
Options:
--port API port (default: 3847)
- --provider Auto-detected, or: anthropic|openai|ollama|groq|together|deepseek|kimi|...
+ --provider anthropic|openai|ollama|groq|together|deepseek|kimi|...
--model Override vision model
--api-key AI provider API key
+ --base-url Custom API endpoint
--debug Save screenshots to debug/ folder
```
## Platform Support
-| Platform | UI Automation | Browser (CDP) | Status |
-|----------|---------------|---------------|--------|
-| **Windows** | PowerShell + .NET UI Automation | ✅ Chrome/Edge | ✅ Full support |
-| **macOS** | JXA + System Events (Accessibility API) | ✅ Chrome/Edge | ✅ Full support |
-| **Linux** | — | ✅ Chrome/Edge (CDP only) | 🔶 Browser only |
-
-### Platform Notes
-
-- **Windows**: Uses `powershell.exe` + `.NET UIAutomationClient` for native app interaction. Shell chaining: `cd dir; npm start`
-- **macOS**: Uses `osascript` + JXA (JavaScript for Automation) + System Events. Requires Accessibility permissions. Shell chaining: `cd dir && npm start`. See [docs/MACOS-SETUP.md](docs/MACOS-SETUP.md).
-- **Both**: CDPDriver (browser automation) works identically — connects via WebSocket to `localhost:9222`.
-
-### Browser CDP Setup
-
-```bash
-# Windows (PowerShell)
-Start-Process chrome --ArgumentList "--remote-debugging-port=9222"
-
-# macOS (Bash)
-open -a "Google Chrome" --args --remote-debugging-port=9222
-
-# Edge on macOS
-open -a "Microsoft Edge" --args --remote-debugging-port=9222
-```
+| Platform | UI Automation | OCR | Browser (CDP) | Status |
+|----------|---------------|-----|---------------|--------|
+| **Windows** (x64/ARM64) | PowerShell + .NET UI Automation | Windows.Media.Ocr | Chrome/Edge | Full support |
+| **macOS** (Intel/Apple Silicon) | JXA + System Events | Apple Vision framework | Chrome/Edge | Full support |
+| **Linux** (x64/ARM64) | AT-SPI (planned) | Tesseract OCR | Chrome/Edge | Browser + OCR |
## Prerequisites
-- **Node.js 18+** (20+ recommended)
-- **Windows**: PowerShell (included with Windows)
-- **macOS 13+**: osascript (included), Accessibility permissions granted
-- **AI API Key** - optional. Works offline with Ollama or Action Router only.
+- **Node.js 20+** (x64 or ARM64)
+- **Windows**: PowerShell (included)
+- **macOS 10.15+**: Accessibility permissions granted, Xcode CLI tools (`xcode-select --install`)
+- **Linux**: `tesseract-ocr` and `python3` for OCR (`sudo apt install tesseract-ocr`)
+- **AI API Key** — optional. Works offline with Ollama or tools-only mode.
## Tech Stack
-TypeScript · Node.js · @nut-tree-fork/nut-js · sharp · Express · Any OpenAI-compatible API · Anthropic Computer Use · Windows UI Automation · macOS Accessibility (JXA) · Ollama
+TypeScript - Node.js - @nut-tree-fork/nut-js - Playwright - sharp - Express - MCP SDK - Zod - Any OpenAI-compatible API - Anthropic Computer Use - Windows UI Automation - macOS Accessibility (JXA) - Chrome DevTools Protocol
## License
diff --git a/SKILL.md b/SKILL.md
index 63298a3..5ee32ae 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -1,407 +1,606 @@
---
name: clawdcursor
-version: 0.6.3
+version: 0.7.0
description: >
- AI desktop agent — control any app on Windows/macOS from your OpenClaw agent.
- Send natural language tasks to the Clawd Cursor API and it handles everything:
- opening apps, clicking buttons, typing text, navigating browsers, filling forms.
- If you can click it, your agent can too.
+ OS-level desktop automation server. 40 tools for controlling any application
+ on Windows/macOS/Linux. Model-agnostic — works with any AI that can do
+  function calling (Claude, GPT, Gemini, Llama, Mistral) — or drive it directly with plain cURL.
+ No API keys needed. No config. Just install and start.
homepage: https://clawdcursor.com
-source: https://github.com/AmrDab/clawd-cursor
+source: https://github.com/AmrDab/clawdcursor
privacy: >
- Screenshots and UI data stay on the user's machine. If using Ollama (local), zero data leaves the host.
- If using a cloud provider (Anthropic, OpenAI, Kimi), screenshots/text are sent to that provider's API
- only — never to third-party servers or skill authors. The user controls which provider is used.
- The REST API binds to 127.0.0.1 only and is not network-accessible.
+ All processing runs locally on the user's machine. The tool server binds to
+ 127.0.0.1 only — not network accessible. No telemetry, no analytics, no
+ phone-home. Screenshots stay in memory. When using the Pro autonomous agent,
+ screenshots/text are sent to the user's configured AI provider only.
metadata:
openclaw:
- # Note: bins check removed — OpenClaw's hasBinary() doesn't handle Windows
- # PATHEXT (.exe/.cmd), causing false "missing" on Windows even when node is
- # installed. Since install requires npm (which implies node), the check is
- # redundant. See: https://github.com/openclaw/openclaw/issues/TBD
requires: {}
install:
- - git clone https://github.com/AmrDab/clawd-cursor.git
- - cd clawd-cursor && npm install && npm run setup
- - cd clawd-cursor && clawdcursor doctor
- - cd clawd-cursor && clawdcursor start
- privacy:
- - Screenshots processed by user's own configured AI provider only
- - With Ollama, fully offline — no external API calls
- - With cloud providers, data goes only to user's chosen API endpoint
- - REST API binds to 127.0.0.1 only — not network accessible
-notes:
- - In OpenClaw, this skill inherits the active agent's AI provider + API key.
- - Standalone CLI mode can still use AI_API_KEY / provider-specific env vars or Ollama.
+ - npm install -g clawdcursor
+ - clawdcursor consent --accept
+ - clawdcursor serve
+ skill_dir: ~/.openclaw/workspace/skills/clawdcursor
---
-# Clawd Cursor
-
**If a human can do it on a screen, you can too.** No API? No integration? No problem.
-Every app already has a UI — Clawd Cursor gives you eyes and hands to use all of them.
-
-No task is impossible. If it's on the screen, it's within reach.
+Every app already has a UI — clawdcursor gives you eyes and hands to use all of them.
> **IMPORTANT — READ THIS BEFORE ANYTHING ELSE**
>
-> You have a desktop AI agent over GUI running on the user's machine. This means:
-> 1. You **CAN** open apps, click buttons, type text, fill forms
-> 2. You **CAN** see what's on screen (via Clawd Cursor's screenshot pipeline)
-> 3. You **CAN** interact with applications — browser, desktop apps, settings
-> 4. You **MUST** respect safety tiers — 🟢 Auto actions run freely, 🔴 Confirm actions require user approval
-> 5. You **MUST** ask before accessing sensitive apps (email, banking, private messages, password managers)
-> 6. You **SHOULD** prefer direct tools (file read, web_fetch, CLI) over desktop automation when available
+> You have access to a desktop automation server running on the user's machine.
+> This means:
>
-> Think of Clawd Cursor as your hands and eyes on the user's computer — used with their permission.
+> 1. You **CAN** open apps, click buttons, type text, fill forms, read screen content
+> 2. You **CAN** see what's on screen via accessibility tree, OCR, CDP, or screenshots
+> 3. You **CAN** interact with any application — browsers, desktop apps, system dialogs
+> 4. You **MUST** ask the user before accessing sensitive apps (see list below)
+> 5. You **MUST** never self-approve Confirm-tier safety actions
+> 6. You **SHOULD** prefer direct tools (API calls, CLI, file read) over desktop automation when available
+>
+> Think of clawdcursor as **hands and eyes** on the user's computer —
+> used only with their permission.
---
-## When to Use This (AGENTS: READ THIS)
+## Section 1: When to Use clawdcursor
-You have a desktop UI agent available. Use it when the user asks you to interact with their desktop, or when no other tool can accomplish the task.
+Route tasks in this order — cheapest and most reliable first:
-### Tool vs Skill (OpenClaw terminology)
-- **Tool** = direct capability (API call, filesystem, shell, web fetch, browser command).
-- **Skill** = packaged workflow/domain logic that may call one or more tools.
-- **This skill (Clawd Cursor)** = GUI execution skill. Use it after OpenClaw tools/skills that can complete the same work without GUI.
+1. **Native tools first** — API call, CLI command, filesystem read/write, or web fetch. Faster, cheaper, more reliable.
+2. **Browser-native next** — if the task is browser-only and you have direct browser tools (Playwright, Puppeteer), use those.
+3. **clawdcursor last** — when no API, CLI, or browser tool can reach the target. Desktop apps, system dialogs, cross-app workflows, canvas UIs, or any GUI-only interaction.
-### Use Clawd Cursor for (examples, not limits):
-Clawd Cursor can perform **any action that is visible and interactable in the GUI** (subject to safety policy).
+### Use clawdcursor for
-- **User-requested desktop tasks** — "open Gmail and send an email", "check my calendar"
-- **Read a webpage** — when web_fetch or browser tools aren't available
-- **Interact with desktop apps** — click buttons, fill forms, read results
-- **Browser tasks** — search, navigate, fill forms (when browser tool unavailable)
-- **Visual verification** — did the page load? what does the UI show?
-- **Cross-app workflows** — copy from one app, paste in another
-- **Settings changes** — when the user explicitly asks
+- Desktop app interaction (Notepad, Word, Excel, Outlook, VS Code, Spotify, etc.)
+- Browser tasks when no other browser tool is available
+- Cross-app workflows (copy from one app, paste in another)
+- System dialogs, file pickers, OS-level popups
+- Canvas UIs where DOM access fails (Google Docs, Figma, Notion)
+- Visual verification ("did the page load?", "what does the UI show?")
+- Any GUI element visible on screen that no API can reach
-### ⚠️ Sensitive App Policy
-**Always ask the user before** accessing:
-- Email clients (Gmail, Outlook)
-- Banking or financial apps
-- Private messaging (WhatsApp, Signal, Telegram)
-- Password managers
-- Admin panels or cloud consoles
+### Do NOT use clawdcursor when
-### Don't use Clawd Cursor when:
-- You can do it with a direct API call or CLI command (faster)
+- A direct API call or CLI command can do it (faster, more reliable)
- The task is purely computational (math, text generation, code writing)
-- You can already read/write the file directly
-- The browser tool or web_fetch can handle it
+- You can read/write the file directly
+- Another browser tool already handles it
-## OpenClaw + Clawd Cursor Routing Contract (Avoid Overlap)
+### Sensitive App Policy
-Clawd Cursor should be treated as **OpenClaw's GUI execution layer**, not a competing planner.
+**Always ask the user before accessing:**
-### Route tasks in this order:
-1. **OpenClaw native tools first** (filesystem, API, shell, provider-native skills)
-2. **Browser-native automation next** (Playwright/CDP direct) for browser-only reads/clicks
-3. **Clawd Cursor API task (`POST /task`)** only when desktop/UI-level interaction is required
+- Email clients (Gmail, Outlook, Thunderbird)
+- Banking or financial apps
+- Private messaging (WhatsApp, Signal, Telegram, Slack DMs)
+- Password managers (1Password, Bitwarden, LastPass)
+- Admin panels, cloud consoles, or anything with credentials
-### Practical rule
-- If OpenClaw already has a reliable skill/tool for the domain, use it.
-- Use Clawd Cursor to bridge gaps where no API/tool exists or when the user explicitly asks for GUI interaction.
+Never access these silently. Always confirm intent first.
-This keeps behavior predictable, lowers latency/cost, and avoids duplicated logic between the main OpenClaw agent and this skill.
+---
-### Universal task pattern
-For broad "get it done" requests, split into three phases:
-1. **Plan in OpenClaw**: break work into API/CLI/browser/GUI subtasks.
-2. **Execute cheap paths first**: API + CLI + browser direct.
-3. **Escalate only residual UI steps** to Clawd Cursor.
+## Section 2: Connecting
-Think: **"OpenClaw decides, Clawd Cursor acts on GUI when needed."**
+### REST mode (`clawdcursor serve`)
-### Direct Browser Access (Fast Path)
-For quick page reads without a full task, connect to Chrome via Playwright CDP:
-```js
-const pw = require('playwright');
-const browser = await pw.chromium.connectOverCDP('http://127.0.0.1:9222');
-const pages = browser.contexts()[0].pages();
-const text = await pages[0].innerText('body');
+```bash
+clawdcursor serve # starts on http://localhost:3847
```
-Use this when you just need page content — faster than sending a task.
+Endpoints:
-| Scenario | Use | Why |
-|----------|-----|-----|
-| Read page content/text | CDP Direct | Instant, free |
-| Fill a web form | API task (`POST /task`) | Clawd handles multi-step planning |
-| Check if a page loaded | CDP Direct | Just read the title/URL |
-| Click through a complex UI flow | API task (`POST /task`) | Clawd handles planning |
-| Get a list of elements on page | CDP Direct | Fast DOM query |
-| Interact with a desktop app | API task (`POST /task`) | CDP is browser-only |
+```
+GET /tools Tool schemas (OpenAI function-calling format)
+POST /execute/{name} Execute a tool by name
+GET /health Server status check
+GET /docs Full documentation
+```
----
+Example:
-## REST API Reference
+```python
+import requests
+tools = requests.get("http://localhost:3847/tools").json()
+result = requests.post("http://localhost:3847/execute/read_screen").json()
+```
-Base URL: `http://127.0.0.1:3847`
+### MCP mode (`clawdcursor mcp`)
-> **Note:** On Windows PowerShell, use `curl.exe` (with .exe) or `Invoke-RestMethod`. Bare `curl` is aliased to `Invoke-WebRequest` which behaves differently.
+```bash
+clawdcursor mcp # stdio transport
+```
-### Pre-flight Check
+Register in your MCP client settings:
+
+```json
+{
+ "mcpServers": {
+ "clawdcursor": {
+ "command": "clawdcursor",
+ "args": ["mcp"]
+ }
+ }
+}
+```
-Before your first task, verify Clawd Cursor is running:
+Works with Claude Code, Cursor, Windsurf, Zed, or any MCP-compatible client.
-```bash
-curl.exe -s http://127.0.0.1:3847/health
-```
+### Built-in agent (`clawdcursor start`)
-Expected: `{"status":"ok","version":"0.6.0"}`
+Full autonomous agent with its own LLM pipeline. Send a task, get a result:
-If connection refused — **start it yourself** (don't ask the user):
-```powershell
-# Find the skill directory and start the server
-Start-Process -FilePath "node" -ArgumentList "dist/index.js","start" -WorkingDirectory "" -WindowStyle Hidden
-Start-Sleep 3
-# Verify it's running
-curl.exe -s http://127.0.0.1:3847/health
+```bash
+clawdcursor start
+curl http://localhost:3847/task -H "Content-Type: application/json" \
+ -d '{"task": "Open Notepad and write a haiku"}'
```
-The skill directory is wherever SKILL.md lives (the parent of this file). Use that path as the working directory.
-### Sending a Task (Async — Returns Immediately)
+### Auto-start guidance
-`POST /task` accepts the task and returns immediately. The task runs in the background. **You must poll `/status` to know when it's done.**
+If the server is not running when you need it, **start it yourself — do not ask the user:**
```bash
-curl.exe -s -X POST http://127.0.0.1:3847/task -H "Content-Type: application/json" -d "{\"task\": \"YOUR_TASK_HERE\"}"
+# Any platform
+clawdcursor serve
```
-PowerShell:
```powershell
-Invoke-RestMethod -Uri http://127.0.0.1:3847/task -Method POST -ContentType "application/json" -Body '{"task": "YOUR_TASK_HERE"}'
+# Windows (background, no window)
+Start-Process -FilePath "clawdcursor" -ArgumentList "serve" -WindowStyle Hidden
```
-### Polling Pattern (Follow This)
+Wait 2 seconds, then verify:
-```
-1. POST /task → get accepted
-2. Wait 2 seconds
-3. GET /status
-4. If status is "idle" → done
-5. If status is "waiting_confirm" → ASK THE USER, then POST /confirm based on their answer
-6. If still running → wait 2 more seconds, go to step 3
-7. If 60+ seconds → POST /abort and retry with clearer instructions
+```bash
+curl http://localhost:3847/health
```
-### Checking Status
+Expected: `{"status":"ok","version":"0.7.0"}`
-```bash
-curl.exe -s http://127.0.0.1:3847/status
+If Chrome CDP is not available on port 9222, launch Chrome yourself:
+
+```powershell
+Start-Process chrome -ArgumentList "--remote-debugging-port=9222"
```
-### Confirming Safety-Gated Actions
+---
+
+## Section 3: Tool Decision Guide
+
+This is the most important section. Follow these decision trees exactly.
+
+### Perception — always start here
+
+Before doing anything, read what is on screen:
-Some actions (sending messages, deleting) require approval. **🔴 NEVER self-approve these.** Always ask the user for confirmation before POST /confirm. These exist to protect the user — do not bypass them.
-```bash
-curl.exe -s -X POST http://127.0.0.1:3847/confirm -H "Content-Type: application/json" -d "{\"approved\": true}"
```
+1. smart_read Best first call. Combines OCR + accessibility tree.
+ Returns structured text of everything visible.
-### Aborting a Task
+2. read_screen Accessibility tree only. Fast, structured, no OCR cost.
+ Use when smart_read is unavailable or you want raw a11y.
-```bash
-curl.exe -s -X POST http://127.0.0.1:3847/abort
+3. ocr_read_screen Raw OCR text extraction (Windows OCR engine).
+ Use when a11y tree is empty (canvas apps, image-based UIs).
+
+4. desktop_screenshot Full screenshot as image. LAST RESORT.
+ Only use when you need pixel-level detail (colors, layout,
+ images) that text-based tools cannot provide.
```
-### Reading Logs (Debugging)
+### Clicking — choose the right tool
-```bash
-curl.exe -s http://127.0.0.1:3847/logs
```
+1. smart_click("Save") FIRST CHOICE. Finds element by label/text using
+ OCR + a11y, then clicks it. Handles fallbacks
+ internally. Pass the visible text of the element.
-Returns last 200 log entries. Check for `error` or `warn` entries when tasks fail.
+2. cdp_click(text="Submit") Use for browser DOM elements specifically.
+ Requires cdp_connect() first. Works by visible
+ text or CSS selector.
-### Response States
+3. invoke_element(name="Save") Use when you know the exact automation ID or
+ element name from read_screen output.
-| State | Response | What to do |
-|-------|----------|------------|
-| **Accepted** | `{"accepted": true, "task": "..."}` | Start polling |
-| **Running** | `{"status": "acting", "currentTask": "...", "stepsCompleted": 2}` | Keep polling |
-| **Waiting confirm** | `{"status": "waiting_confirm", "currentStep": "..."}` | POST /confirm |
-| **Done** | `{"status": "idle"}` | Task complete |
-| **Busy** | `{"error": "Agent is busy", "state": {...}}` | Wait or POST /abort first |
+4. mouse_click(x, y) LAST RESORT. Raw coordinates. Only use when all
+ text-based methods fail. Get coordinates from
+ desktop_screenshot (1280px-wide image space).
+```
----
+### Typing — choose the right tool
-## CDP Direct Reference
+```
+1. smart_type(text, target) FIRST CHOICE. Finds the input field by label or
+ nearby text, focuses it, then types. One call
+ does find + focus + type.
-Chrome must be running with `--remote-debugging-port=9222`.
+2. cdp_type(label, text) Use for browser input fields. Finds by label
+ text or CSS selector. Requires cdp_connect().
-### Quick check:
-```bash
-curl.exe -s http://127.0.0.1:9222/json/version
+3. type_text(text) Raw clipboard paste into whatever is currently
+ focused. Use after you have manually focused the
+ right element with smart_click or focus_window.
```
-If this returns JSON, Chrome is ready.
+### Browser workflow — follow this exact sequence
-### Connecting via Playwright:
+```
+1. navigate_browser(url) Opens URL, auto-launches browser with CDP enabled
+2. wait(3) Let the page load
+3. cdp_connect() Connect to the browser's CDP
+4. cdp_page_context() Get interactive elements on the page
+
+ IMPORTANT: Check the connected URL. If CDP connected to the wrong tab:
+5. cdp_list_tabs() List all browser tabs
+6. cdp_switch_tab(target) Switch to the correct tab
+
+Then interact:
+ cdp_click(text="...") Click by visible text
+ cdp_type(label="...", text) Type into input by label
+ cdp_read_text() Extract page text
+ cdp_evaluate(script) Run JavaScript
+```
+
+### CDP fast path (quick page reads)
+
+For reading page content without a full task, skip `navigate_browser` and connect directly if Chrome is already open:
```javascript
+// Chrome must have --remote-debugging-port=9222
const { chromium } = require('playwright');
const browser = await chromium.connectOverCDP('http://127.0.0.1:9222');
-const context = browser.contexts()[0];
-const page = context.pages()[0];
-
-// Read page content
-const title = await page.title();
-const url = page.url();
+const page = browser.contexts()[0].pages()[0];
const text = await page.textContent('body');
+```
+
+| Scenario | Use | Why |
+|----------|-----|-----|
+| Read page content | CDP direct | Instant, no LLM cost |
+| Fill a form | `cdp_type` + `cdp_click` | clawd handles the interaction |
+| Check if a page loaded | `cdp_read_text()` | Fast DOM query |
+| Desktop app interaction | Individual tools | CDP is browser-only |
+| Complex multi-step task | `delegate_to_agent` | Built-in agent handles planning |
+
+### Window focus rule (CRITICAL)
+
+**Always call focus_window before key_press.**
+
+`key_press` sends keystrokes to whatever window currently has focus. If your
+agent runs in a terminal, key presses go to the terminal — not the app you
+intended. Always focus the target window first:
+
+```
+focus_window("Notepad") Focus the window
+read_screen() Confirm it is focused
+key_press("ctrl+s") Now the keystroke goes to Notepad
+```
+
+### Shortcuts — use before reaching for mouse clicks
+
+`shortcuts_list` returns keyboard shortcuts for the current app context.
+`shortcuts_execute` runs a named shortcut with fuzzy matching.
+
+For known actions (save, copy, paste, undo, new tab, close tab, find, etc.),
+use shortcuts first — they are instant and never miss:
+
+```
+shortcuts_execute("save") Instead of clicking File > Save
+shortcuts_execute("copy") Instead of right-click > Copy
+shortcuts_execute("new tab") Instead of clicking the + button
+```
-// Click by role
-await page.getByRole('button', { name: 'Submit' }).click();
+### Canvas app handling (Google Docs, Figma, Notion)
-// Fill a field
-await page.getByLabel('Email').fill('user@example.com');
+These apps use canvas rendering. The DOM has no readable text. Pattern:
-// Read specific elements
-const buttons = await page.$$eval('button', els => els.map(e => e.textContent));
+```
+1. cdp_read_text() Try first — will return empty or garbage
+2. ocr_read_screen() Fall back to OCR for actual content
+3. smart_read() Also works — OCR component will pick it up
+
+To type in canvas apps:
+1. mouse_click(x, y) Click the canvas area where you want to type
+2. type_text("your text") Clipboard paste works even on canvas
+```
+
+### Delegate complex tasks to the built-in agent
+
+For multi-step tasks (5+ actions, uncertain path, or "just get it done"):
+
+```
+delegate_to_agent("Open Gmail, find the latest email from Stripe, and forward it to billing@example.com")
+```
+
+Then poll for completion:
+
+```
+1. delegate_to_agent(task) Submit the task
+2. wait(2) Let it start
+3. GET /status Check: acting | waiting_confirm | idle
+4. If waiting_confirm → ASK the user, then POST /confirm
+5. If idle → task complete
+6. If acting after 60s → POST /abort and retry with simpler phrasing
+```
+
+**Response states:**
+
+| State | What it means | What to do |
+|-------|--------------|------------|
+| `acting` | Task in progress | Keep polling every 2s |
+| `waiting_confirm` | Safety-gated action pending | Ask the user → POST /confirm |
+| `idle` | Task complete | Read the result |
+| `error` | Task failed | Check /logs, retry or rephrase |
+
+**Never self-approve `waiting_confirm`.** Always ask the user first.
+
+### Verifying actions succeeded
+
+After every action, verify it worked. Do not assume success:
+
+```
+type_text("Hello") Type something
+read_screen() Read back — is "Hello" in the focused element?
+
+smart_click("Send") Click a button
+read_screen() Did the UI change? Is the button gone?
+
+navigate_browser(url) Go to a page
+cdp_read_text() Did the page actually load?
```
---
-## Task Writing Guidelines
+## Section 4: Task Examples
+
+| Goal | How to do it |
+|------|-------------|
+| **Open app and type** | `open_app("notepad")` → `wait(2)` → `type_text("Hello world")` |
+| **Read a webpage** | `navigate_browser(url)` → `cdp_connect()` → `cdp_read_text()` |
+| **Fill a web form** | `cdp_connect()` → `cdp_type(label, text)` × N → `cdp_click("Submit")` |
+| **Cross-app copy/paste** | `focus_window("Chrome")` → `key_press("ctrl+a")` → `key_press("ctrl+c")` → `focus_window("Notepad")` → `type_text(clipboard)` |
+| **Interact with desktop app** | `open_app("Spotify")` → `smart_click("Discover Weekly")` |
+| **Canvas editor (Google Docs)** | `navigate_browser(url)` → `cdp_connect()` → `ocr_read_screen()` → `mouse_click(500,400)` → `type_text("content")` |
+| **Send email (with confirm)** | `delegate_to_agent("Open Gmail, compose to john@example.com, subject: Meeting, body: Confirming 2pm")` → poll → user approves confirm |
+| **Check deployment status** | `navigate_browser("https://vercel.com/dashboard")` → `cdp_connect()` → `cdp_read_text()` |
+| **Take a screenshot** | `desktop_screenshot()` |
+| **Play music** | `open_app("Spotify")` → `smart_read()` → `smart_click("Play")` |
+| **System settings** | `delegate_to_agent("Open Windows Settings and turn on Dark Mode")` |
+| **Complex browser flow** | `delegate_to_agent("Open YouTube, search for Adele Hello, play the first result")` |
+
+### Task writing guidelines (for delegate_to_agent)
1. **Be specific** — include app names, URLs, exact text to type, button names
2. **One task at a time** — wait for completion before sending the next
-3. **Describe the goal, not the clicks** — say "Send an email to john@example.com about the meeting" not "click compose, click to field..."
-4. **Check status** if a task seems to hang
-5. **Don't include credentials in task text** — tasks are logged
+3. **Describe the goal, not the clicks** — "Send an email to john@example.com" not "click compose, click to field..."
+4. **Don't include credentials in task text** — tasks are logged
+5. **If it fails once, rephrase** — break into smaller steps, be more explicit about app name / button label
-## Task Examples
+---
-| Goal | Task to send |
-|------|-------------|
-| **Simple navigation** | `Open Chrome and go to github.com` |
-| **Read screen content** | `What text is currently displayed in Notepad?` |
-| **Cross-app workflow** | `Copy the email address from the Chrome tab and paste it into the To field in Outlook` |
-| **Form filling** | `In the open Chrome tab, fill the contact form: name "John Doe", email "john@example.com"` |
-| **App interaction** | `Open Spotify and play the Discover Weekly playlist` |
-| **Settings change** | `Open Windows Settings and turn on Dark Mode` |
-| **Data extraction** | `Read the stock price shown in the Bloomberg tab in Chrome` |
-| **Complex browser** | `Open YouTube, search for "Adele Hello", and play the first video result` |
-| **Verification** | `Check if the deployment succeeded — look at the Vercel dashboard in Chrome` |
-| **Send email** | `Open Gmail, compose email to john@example.com, subject: Meeting Tomorrow, body: Confirming 2pm. Best regards.` |
-| **Take screenshot** | `Take a screenshot` |
-
-## Error Recovery
-
-| Problem | Solution |
-|---------|----------|
-| Connection refused on :3847 | Start Clawd Cursor: `cd clawd-cursor && npm start` |
-| Connection refused on :9222 | Start Chrome with CDP: `Start-Process chrome -ArgumentList "--remote-debugging-port=9222"` |
-| Agent returns "busy" | Poll `/status` — wait for idle, or POST `/abort` |
-| Task fails with no details | Check `/logs` for error entries |
-| Task completes but wrong result | Rephrase with more specifics: exact app name, button text, field labels |
-| Same task fails repeatedly | Break into smaller tasks (one action per task) |
-| Safety confirmation pending | POST `/confirm` with `{"approved": true}` or `{"approved": false}` |
-| Task hangs > 60 seconds | POST `/abort`, then retry with simpler phrasing |
+## Section 5: Tool Reference (40 tools)
+
+Speed/cost tier: ⚡ Free+instant · 🔵 Cheap · 🟡 Moderate · 🔴 Expensive (vision LLM)
+
+### Perception (6 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `smart_read` | OCR + accessibility tree combined | 🔵 | **Best first call** for reading anything on screen |
+| `read_screen` | Accessibility tree (windows, buttons, inputs, text) | ⚡ | Fast structured read when you want raw a11y |
+| `ocr_read_screen` | Raw OCR text extraction | 🔵 | Canvas apps or image-based UIs where a11y fails |
+| `desktop_screenshot` | Full screen capture (1280px wide) | ⚡ | **Last resort** — when you need pixel-level visual detail |
+| `desktop_screenshot_region` | Zoomed crop of a specific area | ⚡ | When you need detail in one part of the screen |
+| `get_screen_size` | Screen dimensions and DPI | ⚡ | When you need to calculate coordinates |
+
+### Mouse (7 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `smart_click` | Find element by label/text via OCR + a11y, click it | 🔵 | **First choice** for clicking — handles fallbacks internally |
+| `mouse_click` | Left click at (x, y) | ⚡ | Last resort — when text-based click methods fail |
+| `mouse_double_click` | Double click at (x, y) | ⚡ | Open files, select words |
+| `mouse_right_click` | Right click at (x, y) | ⚡ | Open context menus |
+| `mouse_hover` | Move cursor without clicking | ⚡ | Trigger hover menus or tooltips |
+| `mouse_scroll` | Scroll up/down at position | ⚡ | Scroll content not responding to Page Down |
+| `mouse_drag` | Drag from (x1,y1) to (x2,y2) | ⚡ | Resize windows, move objects, select text ranges |
+
+### Keyboard (5 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `smart_type` | Find input by label, focus it, type — all in one | 🔵 | **First choice** for typing into a specific field |
+| `type_text` | Type via clipboard paste | ⚡ | After you have focused the correct input |
+| `key_press` | Send key combo (ctrl+s, Return, alt+tab) | ⚡ | After focus_window — never without focusing first |
+| `shortcuts_list` | List keyboard shortcuts for current app | ⚡ | Before reaching for mouse clicks on known actions |
+| `shortcuts_execute` | Execute a named shortcut (fuzzy match) | ⚡ | Save, copy, paste, undo, new tab, etc. |
+
+### Window Management (4 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `get_windows` | List all open windows | ⚡ | Find which apps are running |
+| `get_active_window` | Current foreground window | ⚡ | Check what has focus right now |
+| `get_focused_element` | What has keyboard focus | ⚡ | Debug typing going to wrong element |
+| `focus_window` | Bring window to front | ⚡ | **ALWAYS** before key_press or type_text |
+
+### UI Elements (2 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `find_element` | Search UI elements by name/type | ⚡ | When you need the automation ID before invoke |
+| `invoke_element` | Invoke a UI element by automation ID or name | ⚡ | When you know the exact element from read_screen |
+
+### Clipboard (2 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `read_clipboard` | Read clipboard text | ⚡ | After a copy operation to get the content |
+| `write_clipboard` | Write text to clipboard | ⚡ | Before a paste operation |
+
+### Browser CDP (10 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `cdp_connect` | Connect to browser's Chrome DevTools Protocol | ⚡ | First step for any browser interaction |
+| `cdp_page_context` | List interactive elements on page | ⚡ | After connect — see what you can click/type |
+| `cdp_read_text` | Extract text from DOM | ⚡ | Read page content (fails on canvas apps) |
+| `cdp_click` | Click by CSS selector or visible text | ⚡ | Browser clicks — more reliable than mouse coordinates |
+| `cdp_type` | Type into input by label or selector | ⚡ | Browser form filling |
+| `cdp_select_option` | Select dropdown option | ⚡ | Dropdowns and select elements |
+| `cdp_evaluate` | Run JavaScript in page context | ⚡ | Custom DOM queries or page manipulation |
+| `cdp_wait_for_selector` | Wait for element to appear | ⚡ | After navigation or AJAX loads |
+| `cdp_list_tabs` | List all browser tabs | ⚡ | When CDP connected to wrong tab |
+| `cdp_switch_tab` | Switch to a different tab | ⚡ | After cdp_list_tabs identifies the right one |
+
+### Orchestration (4 tools)
+
+| Tool | What it does | Tier | When to use |
+|------|-------------|------|-------------|
+| `open_app` | Launch an application by name | ⚡ | First step for desktop app tasks |
+| `navigate_browser` | Open URL with CDP auto-enabled | ⚡ | First step for browser tasks |
+| `wait` | Pause for N seconds | ⚡ | After opening apps or navigating — let UI render |
+| `delegate_to_agent` | Send task to built-in autonomous agent | 🟡 | Complex multi-step tasks — agent handles all planning |
---
-## How It Works — 5-Layer Pipeline
+## Section 6: Common Patterns
+
+### Open an app and type
+
+```
+open_app("notepad")
+wait(2)
+smart_read() Confirm Notepad is open and focused
+type_text("Hello world")
+smart_read() Verify text was typed
+```
+
+### Browser task (navigate, read, interact)
+
+```
+navigate_browser("https://example.com")
+wait(3)
+cdp_connect()
+cdp_page_context() See interactive elements
+cdp_read_text() Read page content
+cdp_click(text="Sign In")
+```
-| Layer | What | Speed | Cost |
-|-------|------|-------|------|
-| **0: Browser Layer** | URL detection → direct navigation | Instant | Free |
-| **1: Action Router + Shortcuts** | Regex + UI Automation + keyboard shortcuts | Instant | Free |
-| **1.5: Smart Interaction** | 1 LLM plan → CDP/UIDriver executes | ~2-5s | 1 LLM call |
-| **2: Accessibility Reasoner** | UI tree → text LLM decides | ~1s | Cheap |
-| **3: Computer Use** | Screenshot → vision LLM | ~5-8s | Expensive |
+### Fill a web form
-Layer 1 includes keyboard shortcuts — common actions execute as direct keystrokes (0 LLM calls).
+```
+cdp_connect()
+cdp_page_context()
+cdp_type(label="Email", text="user@example.com")
+cdp_type(label="Password", text="...")
+cdp_click(text="Submit")
+wait(2)
+cdp_read_text() Verify submission result
+```
-80%+ of tasks handled by Layer 0-1 (free, instant). Vision model is last resort only.
+### Cross-app copy/paste
-## Safety Tiers
+```
+focus_window("Chrome")
+key_press("ctrl+a")
+key_press("ctrl+c")
+read_clipboard() Get the copied text
+focus_window("Notepad")
+type_text(clipboard_content)
+```
+
+### Canvas editor (Google Docs, Figma)
+
+```
+navigate_browser("https://docs.google.com/document/create")
+wait(3)
+cdp_connect()
+ocr_read_screen() OCR — DOM text extraction fails on canvas
+mouse_click(500, 400) Click into the document body
+type_text("Your text here") Clipboard paste works on canvas
+```
+
+### Verify an action succeeded
+
+```
+smart_click("Send")
+wait(1)
+smart_read() Check — did "Message sent" appear?
+ Did the Send button disappear?
+ Did the UI transition to the next state?
+```
+
+---
+
+## Section 7: Safety
+
+### Safety tiers
| Tier | Actions | Behavior |
|------|---------|----------|
| 🟢 Auto | Navigation, reading, opening apps | Runs immediately |
-| 🟡 Preview | Typing, form filling | Logs before executing |
-| 🔴 Confirm | Sending messages, deleting | Pauses — **ask the user** before POST `/confirm`. Never self-approve. |
+| 🟡 Preview | Typing, form filling | Logged before executing |
+| 🔴 Confirm | Sending messages, deleting, purchases | Pauses for user approval |
-## Security & Privacy
+### Rules
-### Network Isolation
-- API binds to `127.0.0.1` only — **not network accessible**. Verify: `netstat -an | findstr 3847` should show `127.0.0.1:3847`
-- Screenshots stay in memory, never saved to disk (unless `--debug`)
-- No telemetry, no analytics, no phone-home calls
+- **Never self-approve Confirm actions.** Always ask the user first.
+- `Alt+F4` and `Ctrl+Alt+Delete` are **blocked** and will not execute.
+- Server binds to **127.0.0.1 only** — not accessible from the network.
+- First run requires **explicit user consent** for desktop control.
+- All actions are logged.
+- No telemetry, no analytics, no phone-home.
-### Data Flow
-- **With Ollama (local)**: 100% offline — zero external network calls. No data leaves the machine.
-- **With cloud providers**: screenshots/text are sent to the user's chosen provider API **only**. No data goes to skill authors, ClawHub, or third parties.
-- **OpenClaw users**: credentials auto-discovered from local config files — no keys stored in skill directory.
-- The user controls data flow by choosing their provider. Ollama = fully private.
+---
-### Agent Autonomy Controls
-- **🟢 Auto** actions (navigation, reading, opening apps) run without prompting
-- **🟡 Preview** actions (typing, form filling) are logged before executing
-- **🔴 Confirm** actions (sending messages, deleting, purchases) **always pause for user approval**
-- Agents **must ask the user** before accessing sensitive apps (email, banking, messaging, passwords)
-- Agents **must never self-approve** 🔴 Confirm actions
+## Section 8: Error Recovery
+
+| Problem | What to do |
+|---------|-----------|
+| Server not running (connection refused on :3847) | Run `clawdcursor serve` and wait 2 seconds |
+| Chrome CDP not available (:9222) | `Start-Process chrome -ArgumentList "--remote-debugging-port=9222"` |
+| CDP connects to wrong tab | Call `cdp_list_tabs()` then `cdp_switch_tab(target)` |
+| `focus_window` fails | Try `mouse_click` on the window's title bar area, then `read_screen` to confirm |
+| `smart_click` fails to find element | Fall back: `read_screen` to get coordinates, then `mouse_click(x, y)` |
+| `smart_type` fails to find input | Fall back: `smart_click` on the input field, then `type_text(text)` |
+| `cdp_read_text` returns empty (canvas app) | Use `ocr_read_screen()` instead |
+| `key_press` goes to wrong window | You forgot `focus_window` — always focus first, then press keys |
+| Agent returns "busy" | Wait for it to finish, or call `abort` and retry |
+| Task completes but wrong result | Verify with `smart_read` or `read_screen`, then retry with more specific instructions |
+| Same action fails 3+ times | Try a completely different approach — different tool, different target |
---
-## Setup (User Reference)
+## Section 9: Coordinate System
-Setup is handled by the user. If Clawd Cursor isn't running, **start it yourself** using the exec tool:
-```powershell
-Start-Process -FilePath "node" -ArgumentList "dist/index.js","start" -WorkingDirectory "" -WindowStyle Hidden
-```
-Only ask the user if you cannot start it (e.g., node not installed, build missing).
+All mouse tools use **image-space coordinates** based on a 1280px-wide viewport.
+This matches the screenshots from `desktop_screenshot`. DPI scaling is handled
+automatically. You do not need to worry about logical vs physical pixels.
-```bash
-git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor
-npm install && npm run build
-npx clawd-cursor doctor # auto-detects and configures everything
-npm start # starts on port 3847
-```
+---
-**macOS:** Grant Accessibility permission to terminal: System Settings → Privacy & Security → Accessibility
+## Section 10: Platform Support
-| Provider | Setup | Cost |
-|----------|-------|------|
-| **Ollama (free)** | `ollama pull ` | $0 (fully offline) |
-| **Any cloud provider** | Set `AI_API_KEY=your-key` | Varies by provider |
-| **OpenClaw users** | Automatic — no setup needed | Uses configured provider |
+| Platform | UI Automation | OCR | Browser (CDP) | Status |
+|----------|---------------|-----|---------------|--------|
+| **Windows** (x64/ARM64) | PowerShell + .NET UI Automation | Windows.Media.Ocr | Chrome/Edge | Full support |
+| **macOS** (Intel/Apple Silicon) | JXA + System Events | Apple Vision framework | Chrome/Edge | Full support |
+| **Linux** (x64/ARM64) | AT-SPI (planned) | Tesseract OCR | Chrome/Edge | Browser + OCR |
+
+**macOS:** Grant Accessibility permission: System Settings > Privacy > Accessibility.
+Install Xcode CLI tools if not present: `xcode-select --install`
+
+**Linux:** Install Tesseract for OCR: `sudo apt install tesseract-ocr`
---
-## Performance Optimization
-
-Proven optimizations applied to reduce task execution latency and LLM API costs. Reference files in `perf/references/patches/`.
-
-### Applied Optimizations
-
-| # | Name | Impact |
-|---|------|--------|
-| 1 | Screenshot hash cache | 90% fewer LLM calls on static screens |
-| 2 | Parallel screenshot+a11y | 30-40% per-step latency cut |
-| 3 | A11y context cache (2s TTL) | Eliminates redundant PS spawns |
-| 4 | Screenshot compression | 52% smaller payload (58KB vs 120KB) |
-| 5 | Async debug writes | 94% less event loop blocking |
-| 6 | Streaming LLM responses | 1-3s faster per LLM call |
-| 7 | Trimmed system prompts | ~60% fewer prompt tokens |
-| 8 | A11y tree filtering | Interactive elements only, 3000 char cap |
-| 9 | Combined PS script | 1 spawn instead of 3 |
-| 10 | Taskbar cache (30s TTL) | Skip expensive taskbar query |
-| 11 | Delay reduction | 50-150ms vs 200-1500ms |
-
-### Benchmarks (2560x1440)
-
-| Metric | v0.3 (VNC) | v0.4 (Native) | v0.4.1+ (Optimized) |
-|--------|------------|---------------|----------------------|
-| Screenshot capture | ~850ms | ~50ms | ~57ms |
-| Screenshot size | ~200KB | ~120KB | ~58KB |
-| A11y context (uncached) | N/A | ~600ms | ~462ms |
-| A11y context (cached) | N/A | 0ms | 0ms (2s TTL) |
-| Delays (per step) | N/A | 200-1500ms | 50-600ms |
-| System prompt tokens | N/A | ~800 | ~300 |
-
-### Perf Tools
-
-- `perf/apply-optimizations.ps1` — apply all patches
-- `perf/perf-test.ts` — benchmark harness (`npx ts-node perf/perf-test.ts`)
+## Modes Summary
+
+| Mode | Command | What it does | Who is the brain? | Cost |
+|------|---------|-------------|-------------------|------|
+| `serve` | `clawdcursor serve` | 40 tools via REST API, no LLM | Your AI model | Your calls only |
+| `mcp` | `clawdcursor mcp` | 40 tools via MCP stdio, no LLM | Your AI model | Your calls only |
+| `start` | `clawdcursor start` | Full autonomous agent + 40 tools | Built-in LLM pipeline | Varies by provider |
diff --git a/docs/ACCESSIBILITY-RESEARCH.md b/docs/ACCESSIBILITY-RESEARCH.md
index 97d5536..4d9d430 100644
--- a/docs/ACCESSIBILITY-RESEARCH.md
+++ b/docs/ACCESSIBILITY-RESEARCH.md
@@ -1,7 +1,7 @@
# Windows UI Automation from Node.js — Research Report
**Date:** 2026-02-19
-**Context:** clawd-cursor desktop AI agent currently uses VNC + screenshot + vision for every action.
+**Context:** clawdcursor desktop AI agent currently uses VNC + screenshot + vision for every action.
**Goal:** Add a Windows accessibility layer to enumerate UI elements, read properties, and interact by reference (not pixel coordinates).
---
diff --git a/AI-SHORTCUTS.md b/docs/AI-SHORTCUTS.md
similarity index 100%
rename from AI-SHORTCUTS.md
rename to docs/AI-SHORTCUTS.md
diff --git a/docs/MACOS-SETUP.md b/docs/MACOS-SETUP.md
index ffe6b76..2725768 100644
--- a/docs/MACOS-SETUP.md
+++ b/docs/MACOS-SETUP.md
@@ -81,8 +81,8 @@ If it works, you'll see something like: `{"name":"Terminal","pid":12345}`
## 2. Install & Build
```bash
-git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor && npm install && npm run build
+git clone https://github.com/AmrDab/clawdcursor.git
+cd clawdcursor && npm install && npm run build
```
### Make macOS scripts executable
diff --git a/docs/agent-guide.md b/docs/agent-guide.md
new file mode 100644
index 0000000..65aec29
--- /dev/null
+++ b/docs/agent-guide.md
@@ -0,0 +1,205 @@
+# clawdcursor Agent Guide
+
+> This document teaches AI models how to use clawdcursor tools effectively.
+> Include this in your system prompt or reference it when connecting to the tool server.
+
+## What is clawdcursor?
+
+clawdcursor is an OS-level desktop automation server. It gives you (the AI model)
+eyes, hands, and ears on a real computer desktop. You can see the screen, click,
+type, read UI elements, interact with browsers, and control any application.
+
+**You are the brain. clawdcursor is the body.**
+
+## Quick Start
+
+```
+1. read_screen → See what's on screen (text, fast, structured)
+2. Decide what to do → Your reasoning
+3. Execute an action → mouse_click, key_press, type_text, cdp_click, etc.
+4. read_screen again → Verify the action worked
+5. Repeat until done
+```
+
+## Core Principles
+
+### 1. Text First, Vision Second
+Always call `read_screen` before `desktop_screenshot`. The accessibility tree is:
+- **Fast**: ~100ms vs ~500ms for screenshot
+- **Structured**: Named buttons, input fields, text values
+- **Small**: A few KB of text vs a large image
+
+Only use `desktop_screenshot` when:
+- You need to see visual layout (charts, images, colors)
+- The accessibility tree is empty or unhelpful (canvas apps, games)
+- You need to verify visual state
+
+### 2. CDP for Browsers, A11y for Native Apps
+When working with a browser (Edge, Chrome):
+- Call `navigate_browser` to open a URL with CDP enabled
+- Call `cdp_connect` to establish the connection
+- Use `cdp_click`, `cdp_type`, `cdp_read_text` for all interactions
+- CDP is faster and more reliable than mouse clicks for web pages
+
+When working with native apps (Notepad, Excel, File Explorer):
+- Use `read_screen` to see the UI tree
+- Use `mouse_click` at coordinates from the accessibility tree
+- Use `key_press` for keyboard shortcuts
+- Use `type_text` for entering text
+
+### 3. Verify Every Action
+After every action, read the screen again to confirm it worked.
+Don't assume success — verify it.
+
+```
+Bad: click "Save" → assume saved → done
+Good: click "Save" → read_screen → confirm save dialog closed → done
+```
+
+### 4. Use Keyboard Shortcuts
+Keyboard shortcuts are faster and more reliable than clicking:
+- `ctrl+s` to save
+- `ctrl+a` to select all
+- `ctrl+c` / `ctrl+v` for copy/paste
+- `ctrl+n` for new document
+- `alt+tab` to switch windows
+- `ctrl+w` to close tab
+- `Return` to confirm dialogs
+
+## Tool Categories
+
+### Perception (see the screen)
+| Tool | When to use |
+|------|-------------|
+| `read_screen` | Always start here. Returns accessibility tree. |
+| `desktop_screenshot` | When you need visual confirmation or a11y tree is empty. |
+| `desktop_screenshot_region` | Zoom into a specific area for detail. |
+| `get_screen_size` | Get screen dimensions and DPI info. |
+| `get_windows` | List all open windows. |
+| `get_active_window` | Check which window has focus. |
+| `get_focused_element` | Check which UI element has keyboard focus. |
+
+### Actions (control the computer)
+| Tool | When to use |
+|------|-------------|
+| `mouse_click` | Click a UI element at image-space coordinates. |
+| `mouse_double_click` | Open files, select words. |
+| `mouse_right_click` | Open context menus. |
+| `mouse_scroll` | Scroll pages, lists, documents. |
+| `mouse_drag` | Select text, move objects, resize. |
+| `mouse_hover` | Reveal tooltips or hover menus. |
+| `key_press` | Keyboard shortcuts and special keys. |
+| `type_text` | Enter text into focused input. |
+
+### Window Management
+| Tool | When to use |
+|------|-------------|
+| `focus_window` | Bring a window to front (by name, PID, or title). |
+| `find_element` | Search for a specific UI element by name or type. |
+| `open_app` | Launch an application. |
+
+### Browser (CDP)
+| Tool | When to use |
+|------|-------------|
+| `navigate_browser` | Open a URL (launches browser with CDP). |
+| `cdp_connect` | Connect to browser's DevTools Protocol. |
+| `cdp_page_context` | List interactive elements (buttons, inputs, links). |
+| `cdp_read_text` | Extract text from a page or element. |
+| `cdp_click` | Click by CSS selector or visible text. |
+| `cdp_type` | Type into input by selector or label. |
+| `cdp_select_option` | Select dropdown option. |
+| `cdp_evaluate` | Run arbitrary JavaScript. |
+| `cdp_wait_for_selector` | Wait for element to appear. |
+| `cdp_list_tabs` | List open browser tabs. |
+| `cdp_switch_tab` | Switch to a different tab. |
+
+### Clipboard
+| Tool | When to use |
+|------|-------------|
+| `read_clipboard` | Read clipboard contents. |
+| `write_clipboard` | Write text to clipboard. |
+
+### Orchestration
+| Tool | When to use |
+|------|-------------|
+| `delegate_to_agent` | Hand off complex task to autonomous pipeline. |
+| `wait` | Pause after animations, page loads, transitions. |
+
+## Common Patterns
+
+### Open an app and type something
+```
+1. open_app("notepad")
+2. wait(2)
+3. type_text("Hello, world!")
+```
+
+### Search the web
+```
+1. navigate_browser("https://google.com")
+2. cdp_connect()
+3. cdp_type(selector: "textarea[name='q']", text: "clawdcursor")
+4. key_press("Return")
+5. wait(2)
+6. cdp_read_text() → extract search results
+```
+
+### Copy text between apps
+```
+1. focus_window(processName: "msedge")
+2. key_press("ctrl+a") → select all
+3. key_press("ctrl+c") → copy
+4. read_clipboard() → verify content
+5. focus_window(processName: "notepad")
+6. key_press("ctrl+v") → paste
+```
+
+### Fill out a web form
+```
+1. navigate_browser("https://example.com/form")
+2. cdp_connect()
+3. cdp_page_context() → see all inputs
+4. cdp_type(label: "Name", text: "John Doe")
+5. cdp_type(label: "Email", text: "john@example.com")
+6. cdp_select_option(selector: "#country", value: "US")
+7. cdp_click(text: "Submit")
+```
+
+### Multi-app workflow (web research → document)
+```
+1. navigate_browser("https://en.wikipedia.org/wiki/Tokyo")
+2. cdp_connect()
+3. cdp_read_text(selector: "#mw-content-text") → extract info
+4. open_app("notepad")
+5. wait(2)
+6. type_text("Tokyo Research Notes\n\n" + extracted_info)
+7. key_press("ctrl+s")
+```
+
+## Coordinate System
+
+All mouse tools use **image-space coordinates** — these match the 1280px-wide
+screenshots from `desktop_screenshot`. The server automatically converts to
+the correct OS coordinates (handling DPI scaling).
+
+You do NOT need to worry about DPI, physical pixels, or logical pixels.
+Just use the coordinates you see in screenshots.
+
+## Safety
+
+- `alt+f4`, `ctrl+alt+delete` are blocked
+- The server only binds to localhost (127.0.0.1)
+- `type_text` uses clipboard paste (reliable, no dropped characters)
+- All actions are logged
+
+## Error Handling
+
+If a tool returns `isError: true`:
+1. Read the error message
+2. Try an alternative approach
+3. Don't repeat the same failing action more than twice
+
+Common errors:
+- "Not connected to CDP" → call `cdp_connect` first
+- "No window found" → check `get_windows` for the correct process name
+- "Click failed" → verify coordinates with `read_screen` or `desktop_screenshot`
diff --git a/docs/app-knowledge.md b/docs/app-knowledge.md
new file mode 100644
index 0000000..c42df75
--- /dev/null
+++ b/docs/app-knowledge.md
@@ -0,0 +1,517 @@
+# App Knowledge Base for Clawd Cursor
+
+This file is loaded into the LLM's context when interacting with desktop apps.
+It tells the AI what to expect from each app and how to operate it reliably.
+
+---
+
+## Startup Context — Verify Focus First
+
+**Before doing anything else, check FOCUSED ELEMENT processName.**
+
+The preprocessor opens and navigates the browser, but the terminal window may still hold keyboard focus when you receive control.
+
+| FOCUSED WINDOW processName | Meaning | Action |
+|---------------------------|---------|--------|
+| `msedge` or `chrome` | Browser focused ✓ | Proceed with task |
+| `olk` | Outlook focused ✓ | Proceed with task |
+| `notepad`, `mspaint`, etc. | Correct app ✓ | Proceed with task |
+| `windowsterminal`, `cmd`, `powershell` | **Wrong window** | Return needs_human with reason "wrong_window" |
+| `explorer` | **Wrong window** | Return needs_human with reason "wrong_window" |
+
+**Wrong-window response format:**
+```json
+{"action":"needs_human","reason":"wrong_window","description":"Focused window is windowsterminal. Edge has the target page loaded but does not have keyboard focus. Pipeline must re-focus msedge before I can act."}
+```
+
+The pipeline will re-focus the correct window and retry. Do NOT try to switch windows yourself.
+
+---
+
+## Startup Flow Rules
+
+**The agent handles startup for you.** By the time you (the LLM) receive control, the app is already open, focused, and maximized. You do NOT need to:
+- Press the Windows/Super key
+- Type an app name in the Start menu
+- Press Enter to launch
+- Press Win+Up to maximize
+- Press Alt+Tab to switch apps
+- Call focus-window
+
+**Your job starts AFTER the app is ready.** Read the IMPORTANT CONTEXT section — it tells you exactly what has already been done. For example:
+- `Opened "Outlook" — it is ALREADY the active, focused, maximized window` means Outlook is ready
+- `Compose window is OPEN. Cursor is in the To field` means you can start typing the email address immediately
+
+### What to do on step 0
+1. Read the IMPORTANT CONTEXT to understand what's already done
+2. Read the FOCUSED ELEMENT to know where the cursor is right now
+3. Read the UI TREE to see available elements (may be sparse for WebView2 apps)
+4. Start executing from the FIRST action that hasn't been done yet
+
+### What NEVER to do
+- Do NOT press Alt+Tab, Super/Windows key, or other window-switching keys
+- Do NOT try to reopen an app that's already open
+- Do NOT press Ctrl+N multiple times — each press toggles compose open/closed
+- Do NOT click on window titles, taskbar items, or Pane elements
+- Do NOT repeat an action that's already in the ACTIONS TAKEN SO FAR list
+
+---
+
+## General Rules
+
+1. **You are a screen reader operator.** Keyboard shortcuts and accessibility actions come FIRST. need_visual (vision) is the LAST resort — only use it if keyboard methods truly cannot reach the target.
+
+2. **WebView2/Electron apps have minimal a11y trees.** You will see mostly Pane, Group, and Text elements. Interactive controls (buttons, inputs) are inside the web content and often invisible to UIAutomation. DO NOT a11y_click elements that don't appear in the tree — use keyboard shortcuts instead.
+
+3. **When the a11y tree is sparse, use keyboard shortcuts.** Tab navigates between fields. Enter/Ctrl+Enter submits. Escape cancels. Alt/F10 opens menus. These work universally across all Windows apps.
+
+4. **Trust the keyboard.** After typing text, it IS in the field even if the a11y tree doesn't show it. After pressing Tab, focus HAS moved to the next field. Do not repeat actions because the tree didn't update.
+
+5. **Never click window titles or taskbar items.** These are not interactive UI elements. Use keyboard shortcuts instead.
+
+6. **The FOCUSED ELEMENT section tells you exactly where the cursor is.** Always check this before typing to confirm you are in the right field.
+
+7. **Check the ACTIONS TAKEN SO FAR list.** If an action says SUCCEEDED or ALREADY TYPED, it worked. Move to the NEXT step. Never repeat a succeeded action.
+
+8. **One action per response.** Return exactly one JSON action. After execution, you'll get the updated UI state to decide the next action.
+
+9. **Use the simplest action available.** Prefer key_press and type over a11y_click or need_visual. Keyboard shortcuts are faster and more reliable than clicking.
+
+10. **Before using need_visual**, ask yourself: is there a keyboard shortcut for this? Can I Tab to it? Can I Alt+key open a menu? Only if ALL keyboard approaches fail, use need_visual.
+
+---
+
+## App-Specific Interaction Patterns
+
+### When to use keyboard vs clicking
+| Scenario | Use | Why |
+|----------|-----|-----|
+| WebView2 app (Outlook, Teams) | Keyboard shortcuts | a11y tree is empty, clicking fails |
+| Native app (Notepad, Paint) | a11y_click on visible elements | Elements are in the tree with valid bounds |
+| Form navigation | Tab between fields | Universal, reliable |
+| Submit/Send | Ctrl+Enter or Enter | No need to find Send button |
+| Cancel/Close | Escape (Alt+F4 may be blocked by the safety layer) | Universal |
+
+### How to navigate fields
+The pattern for filling out any form:
+```
+1. Check FOCUSED ELEMENT — confirms which field has focus
+2. type "content" — fills the current field
+3. key_press "Tab" — moves to next field
+4. type "content" — fills that field
+5. Repeat until done
+6. key_press "ctrl+Return" or "Return" — submit
+```
+
+Do NOT try to click individual fields. Tab navigation is reliable across all apps.
+
+---
+
+## Outlook (New) -- process: `olk`
+
+### What it is
+Outlook (new) is a WebView2 wrapper. The a11y tree shows almost nothing -- just Panes and a TitleBar. You CANNOT see buttons, input fields, or email content in the tree.
+
+### Process identity
+- Outlook runs as process `olk` but its WebView2 content runs under `msedge`
+- The FOCUSED ELEMENT may show pid for either process -- both are correct
+- If Edge also has Outlook Web open, shortcuts may go to the wrong window
+- The agent handles focusing the correct process before you get control
+
+### How to operate it
+**USE KEYBOARD SHORTCUTS ONLY. Do not try to click UI elements via a11y.**
+
+### Compose Email Flow (step by step)
+```
+BEFORE STARTING: Check FOCUSED ELEMENT.
+ - If it shows ControlType.Group name="To" className="EditorClass" -> compose IS open, go to step 2
+ - If it shows anything else -> press Ctrl+N to open compose, wait 2s, then go to step 2
+ - If FOCUSED ELEMENT shows "To" with existing content -> press Ctrl+A then Delete to clear, then type
+
+1. key_press "ctrl+n" -> Opens compose window (SKIP if compose is already open)
+ Wait for FOCUSED ELEMENT to show "To" group before continuing.
+ Do NOT press Ctrl+N again — each press toggles compose open/closed.
+
+2. type "user@example.com" -> Type the recipient's email address into the To field.
+ Even if the a11y tree doesn't show the text, it IS there.
+
+3. key_press "Tab" -> Moves focus To -> Cc (or directly to Subject in some configs)
+ Check FOCUSED ELEMENT: if it shows "Cc", press Tab again to skip to Subject.
+
+4. type "subject text" -> Type the subject line.
+
+5. key_press "Tab" -> Moves focus Subject -> Body.
+
+6. type "body text" -> Type the full email body. Use \n for newlines.
+ Write a REAL message — not a placeholder.
+
+7. key_press "ctrl+Return" -> SENDS the email immediately.
+ After this, return {"action":"done","evidence":"Sent email to [address] via Ctrl+Enter keyboard shortcut in Outlook"}
+ DO NOT second-guess this. The email IS sent. The tree will not update — that is normal.
+```
+
+### Critical: When to return "done"
+After pressing `ctrl+Return` to send, **immediately return done**. Do NOT:
+- Wait for the tree to update (it won't)
+- Try to verify by checking the window title (may or may not change)
+- Press Ctrl+Return again (would open a new compose)
+
+The evidence string should be: `"Sent email to [address] with subject '[subject]' via Ctrl+Enter"`
+
+### Important Notes
+- Ctrl+N only works when the Outlook (olk) window has focus. The agent ensures this before handing off.
+- Do NOT press Ctrl+N if compose is already open — check FOCUSED ELEMENT first.
+- After sending, the a11y tree will still show Panes — this is NORMAL. Trust the keyboard sequence.
+
+### Keyboard Shortcuts
+| Action | Shortcut |
+|--------|----------|
+| New email | Ctrl+N |
+| Send | Ctrl+Enter |
+| Reply | Ctrl+R |
+| Reply All | Ctrl+Shift+R |
+| Forward | Ctrl+F |
+| Search | F3 or Ctrl+E |
+| Delete | Delete |
+| Mark read/unread | Ctrl+Q / Ctrl+U |
+| Flag message | Insert |
+
+### What the a11y tree looks like
+```
+FOCUSED WINDOW UI TREE:
+ [ControlType.Window] "Mail - amr dabbas - Outlook"
+ [ControlType.Pane] <-- many nested empty panes (WebView2 structure)
+ [ControlType.Pane]
+ [ControlType.Pane] "Mail - amr dabbas - Outlook - Web content"
+ [ControlType.Pane]
+ [ControlType.Text] "Untitled"
+ [ControlType.Button] "Minimize"
+ [ControlType.Button] "Maximize"
+ [ControlType.Button] "Close"
+ [ControlType.TitleBar] "Mail - amr dabbas - Outlook"
+```
+This is ALL you get. No mail list, no compose fields, no buttons. Keyboard only.
+
+---
+
+## Microsoft Edge -- process: `msedge`
+
+### What it is
+Chromium-based browser. Has a rich a11y tree for browser chrome but web page content varies.
+
+### Important
+- Ctrl+N opens a NEW BROWSER WINDOW (not related to Outlook)
+- Ctrl+L focuses the address bar
+- Ctrl+T opens a new tab
+- If Outlook Web is open in Edge, keyboard shortcuts may conflict. Always ensure the correct process (olk vs msedge) has focus.
+
+### Interacting with web pages — CDP FIRST
+
+When **CDP PAGE CONTEXT** is shown in your UI STATE, the page DOM is directly accessible.
+**Use cdp_click/cdp_type — they are faster and more reliable than Tab navigation for React/SPA pages.**
+
+CDP action examples:
+```
+{"action":"cdp_click","by_text":"Compose","description":"open compose window"}
+{"action":"cdp_type","by_label":"To","text":"user@example.com","description":"type recipient"}
+{"action":"cdp_type","selector":"[aria-label='Subject']","text":"Meeting","description":"type subject"}
+{"action":"checkpoint","description":"verify we navigated to results page"}
+```
+
+When CDP PAGE CONTEXT is NOT shown (CDP unavailable), use a11y tree or Tab navigation:
+- `[ControlType.Edit]` or `[ControlType.Document]` → a11y_set_value or a11y_focus then type
+- `[ControlType.Button]` with a name → a11y_click to activate
+- `[ControlType.Hyperlink]` → a11y_click to follow
+
+When filling a search form (no CDP):
+1. a11y_focus the "From" or origin field
+2. type the city/airport name
+3. Tab to the next field and type
+4. Tab to the date field, type or use arrow keys
+5. Tab to the Submit/Search button, press Enter or Space
+
+### Keyboard Shortcuts
+| Action | Shortcut |
+|--------|----------|
+| New window | Ctrl+N |
+| New tab | Ctrl+T |
+| Close tab | Ctrl+W |
+| Address bar | Ctrl+L or F6 |
+| Find in page | Ctrl+F |
+| Refresh | F5 or Ctrl+R |
+| Back | Alt+Left |
+| Forward | Alt+Right |
+| Next interactive element | Tab |
+| Previous interactive element | Shift+Tab |
+
+---
+
+## Google Docs -- process: `msedge`
+
+### What it is
+Google Docs (docs.google.com) is a web app running in Edge/Chrome. It uses a custom canvas renderer — most elements are NOT in the a11y tree. NEVER use a11y_click or a11y_focus — they will hang or crash.
+
+### Key URLs
+- `docs.google.com` — homepage (list of documents, "Blank" template at top)
+- `docs.google.com/document/create` — INSTANTLY creates a new blank document (USE THIS)
+- `docs.google.com/document/d/{id}/edit` — editing an existing document
+
+### Creating a new document
+The preprocessor navigates to `docs.google.com/document/create` which opens a blank doc directly.
+- You will see the doc title "Untitled document" and a blinking cursor in the body
+- The document body is ready for typing — just use `type` action
+- Do NOT press Ctrl+N (that opens a new browser tab, NOT a new Google Doc)
+- Do NOT try to click "Blank" or "File > New" — the create URL handles this
+
+### Writing/composing content
+When the task says "write a sentence about X" or "write about X":
+1. You are a language model — COMPOSE the text yourself
+2. Use `{"action":"type","text":"Your composed sentence here.","description":"typing composed content"}`
+3. The text should be original, relevant, and well-written
+4. After typing, verify the text appears in CDP PAGE CONTEXT
+5. THEN declare done with the actual text as evidence
+
+### With CDP (preferred)
+- `cdp_type selector ".kix-appview-editor" text="content"` — type in document body
+- OR just use `{"action":"type","text":"content"}` — keyboard input goes to focused doc
+- `cdp_click by_text="File"` — open File menu
+- `cdp_click by_text="Share"` — open share dialog
+
+### Common mistakes (NEVER do these)
+- Ctrl+N → opens browser tab, NOT Google Doc
+- a11y_click anything → hangs/crashes (canvas rendering)
+- Declaring done without typing content → BLOCKED by pipeline
+- Typing the task instruction literally (e.g. "a sentence on dogs") instead of composing actual content
+
+---
+
+## Notepad -- process: `notepad`
+
+### a11y tree
+Rich and reliable. Edit field shows full content via ValuePattern.
+
+### How to operate
+Notepad has a full a11y tree. You CAN use a11y_click, a11y_set_value, and a11y_focus on its elements. But keyboard shortcuts are still faster for common operations.
+
+### Keyboard Shortcuts
+| Action | Shortcut |
+|--------|----------|
+| New file | Ctrl+N |
+| Open | Ctrl+O |
+| Save | Ctrl+S |
+| Save As | Ctrl+Shift+S |
+| Find | Ctrl+F |
+| Replace | Ctrl+H |
+| Select All | Ctrl+A |
+
+---
+
+## Paint -- process: `mspaint`
+
+### a11y tree
+Has toolbar buttons visible. Canvas is a single Pane element.
+
+### How to draw
+1. Select tool via a11y_click on toolbar button (e.g., "Pencil", "Brush")
+2. Use mouse actions (click, drag) on the canvas coordinates
+3. Color selection via a11y_click on color palette buttons
+
+---
+
+## File Explorer -- process: `explorer`
+
+### How to operate
+- Ctrl+L focuses the address bar (type a path and press Enter to navigate)
+- Tab cycles between navigation pane, file list, and address bar
+- F2 renames the selected file
+- Delete moves selected file to Recycle Bin
+- Enter opens the selected file/folder
+
+---
+
+## General Windows Shortcuts (work in all apps)
+
+| Action | Shortcut |
+|--------|----------|
+| Copy | Ctrl+C |
+| Cut | Ctrl+X |
+| Paste | Ctrl+V |
+| Undo | Ctrl+Z |
+| Redo | Ctrl+Y |
+| Select All | Ctrl+A |
+| Save | Ctrl+S |
+| Print | Ctrl+P |
+| Close window | Alt+F4 (may be blocked by the safety layer) |
+| Switch app | Alt+Tab |
+| Task Manager | Ctrl+Shift+Escape |
+| Screenshot | Win+Shift+S |
+
+---
+
+## TripAdvisor -- process: `msedge`
+
+### What it is
+TripAdvisor (tripadvisor.com) is a React SPA. Its Flights tab redirects to Google Flights — EXPECTED behavior. Hotels, restaurants, and attractions are handled on TripAdvisor itself.
+
+### Task: "Book cheapest flight from [city]"
+TripAdvisor Flights opens Google Flights. Follow the Google Flights section below.
+1. If already on tripadvisor.com — cdp_click by_text "Flights" or navigate directly to google.com/travel/flights
+2. If redirected to google.com/flights — continue with the Google Flights flow
+3. For "cheapest flight" with no destination → use Explore view (see Google Flights section)
+
+### Task: "Find hotel / restaurant / attraction"
+1. Use cdp_type by_label "Search" to enter the search query
+2. Use cdp_click to select from autocomplete
+3. Read CDP PAGE CONTEXT for results (ratings, prices, addresses)
+4. done — report the top result with relevant details
+
+### NEVER DO
+- a11y_click or a11y_focus (React SPA, will hang)
+- Return needs_human because destination is missing — use Explore on Google Flights
+
+---
+
+## Google Flights -- process: `msedge`
+
+### What it is
+Google Flights (google.com/flights or google.com/travel/explore) is a React SPA. The a11y tree will HANG if UIA calls are made on it. NEVER use a11y_click or a11y_focus — they time out (45 seconds). Use CDP or keyboard-only.
+
+### IMPORTANT
+- TripAdvisor Flights redirects to Google Flights — EXPECTED. Continue on Google Flights.
+- The page loads in ~3 seconds. Give it time before interacting.
+- NEVER call a11y_click or a11y_focus on any Google Flights element.
+- NEVER call checkpoint when CDP is unavailable — it causes a loop.
+
+### NO DESTINATION in the task?
+If the task says "find flights from X" with NO destination specified:
+- Use the **Explore view**: `cdp_click selector "[aria-label='Explore destinations']"` OR `cdp_click by_text "Explore"`
+- Explore shows cheapest flights from your origin to ALL destinations — no destination required
+- Set origin, read CDP PAGE CONTEXT for destination cards with prices and earliest dates
+- Report the earliest departure in your `done` evidence
+- Do NOT return needs_human and do NOT pick an arbitrary destination — Explore solves this
+
+### WITH CDP PAGE CONTEXT (preferred)
+
+**Origin + destination search:**
+```
+1. key_press "Escape" — dismiss any popup
+2. cdp_type by_label "Where from?" "Los Angeles" — set origin (city name)
+3. key_press "Down" then "Return" — confirm autocomplete
+4. cdp_type by_label "Where to?" "New York" — set destination
+5. key_press "Down" then "Return" — confirm autocomplete
+6. cdp_click by_text "Search" — submit
+7. Read CDP PAGE CONTEXT for flight results
+8. done — report earliest/cheapest flight with price, airline, date
+```
+
+**No-destination (Explore) search:**
+```
+1. key_press "Escape"
+2. cdp_click selector "[aria-label='Explore destinations']" (OR cdp_click by_text "Explore")
+3. cdp_type by_label "Where from?" "Los Angeles"
+4. key_press "Down" then "Return"
+5. Read CDP PAGE CONTEXT for destination cards
+6. done — report top 3 results: destination, date, price
+```
+
+**Reading results:** CDP PAGE CONTEXT has flight cards with price, date, airline.
+- "soonest flight" → report earliest departure date
+- "cheapest flight" → report lowest price
+- Report both if ambiguous
+
+### WITHOUT CDP (keyboard fallback)
+
+Only use if CDP PAGE CONTEXT is NOT shown.
+
+```
+1. key_press "Escape"
+2. key_press "Tab" (×2) — reach origin field
+3. type "Los Angeles"
+4. key_press "Down" then "Return"
+5. key_press "Tab" — move to destination
+6. type "New York" — REQUIRED — cannot leave blank on keyboard path
+7. key_press "Down" then "Return"
+8. key_press "Tab" (×4) — reach Search button
+9. key_press "Return" — submit
+10. key_press "Tab" (repeat) — read FOCUSED ELEMENT for prices/times
+```
+
+Tab order: Round trip → Origin → Destination → Depart → Return → Passengers → Class → Search
+
+### NEVER DO
+- a11y_click / a11y_focus on any element (timeouts + UIA hangs)
+- checkpoint when CDP is unavailable (loop)
+- Leave destination blank on keyboard path (form errors)
+- Pick an arbitrary destination — use Explore instead
+
+---
+
+## Unknown / Unlisted Apps
+
+When you encounter an app not listed in this knowledge base, use this universal exploration strategy:
+
+### Step 1 — Read the a11y tree
+Look for:
+- Named **Buttons** → a11y_click them
+- Named **Edit** or **Document** fields → a11y_focus then type
+- Named **MenuItem** or **Menu** → a11y_click to open
+- Named **TabItem** → a11y_click to switch tab
+
+### Step 2 — Open the menu bar
+Press **Alt** or **F10** to open the app's menu bar. Then:
+- Arrow keys navigate menu items
+- Enter opens a submenu or activates an item
+- Escape closes the menu
+
+### Step 3 — Use Tab navigation
+- **Tab** moves focus to the next interactive control
+- **Shift+Tab** moves backwards
+- **Enter** or **Space** activates the focused control
+- Check FOCUSED ELEMENT after each Tab to know where you are
+
+### Step 4 — Common universal shortcuts
+| Action | Shortcut |
+|--------|----------|
+| Open menu | Alt or F10 |
+| Find/Search | Ctrl+F |
+| Save | Ctrl+S |
+| Open | Ctrl+O |
+| New | Ctrl+N |
+| Close | Ctrl+W (Alt+F4 may be blocked by the safety layer) |
+| Undo | Ctrl+Z |
+| Select all | Ctrl+A |
+| Copy | Ctrl+C |
+| Paste | Ctrl+V |
+| Help | F1 |
+| Close dialog | Escape |
+
+### Step 5 — Only then, need_visual
+If you've tried a11y interaction, keyboard shortcuts, and Tab navigation and still cannot proceed, use need_visual with a concise target description.
+
+---
+
+## Error Recovery
+
+### "Element not found" errors
+The element doesn't exist in the a11y tree. This is NORMAL for WebView2 apps. Use keyboard shortcuts instead.
+
+### Compose window won't open
+1. The agent already tried to open it -- check IMPORTANT CONTEXT
+2. If it says compose is open, trust it and start typing
+3. If you must try Ctrl+N: press it ONCE only, wait 2 seconds, check FOCUSED ELEMENT
+
+### Actions seem to go to the wrong app
+Check the FOCUSED ELEMENT section -- the pid tells you which process has focus. If it's not the target app, report "unsure" so the agent can re-focus.
+
+### Text typed but not visible in a11y tree
+This is normal for WebView2 apps. The text IS in the field. Trust the keyboard and move to the next step.
+
+### Action failed -- what to try next
+1. If a11y_click failed -> try key_press with a keyboard shortcut instead
+2. If key_press didn't work -> try a different shortcut (e.g., Ctrl+Enter vs Alt+S for send)
+3. If type didn't work -> check FOCUSED ELEMENT to see if you're in the right field
+4. After 2 failures on the same target -> report "unsure" to let the agent try a different approach
+
+### Stuck in a loop
+If you see the same UI state 3+ times, you are stuck. Do NOT repeat the same action.
+Try: different keyboard shortcut, report "unsure", or report "done" if the task is actually complete.
diff --git a/docs/index.html b/docs/index.html
deleted file mode 100644
index 62c5c29..0000000
--- a/docs/index.html
+++ /dev/null
@@ -1,1048 +0,0 @@
-
-
-
-
-
- Clawd Cursor · Desktop Skill for OpenClaw
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
v0.5.6 - Global CLI + Multi-Provider Doctor
-
One skill replacesa dozen APIs
-
Give your agent a screen. Gmail, Slack, Jira, Figma - if you can click it, your agent can too. Native desktop control. No servers. No API keys per service.
-
-
-
-
-
-
-
-
- Why
- Stop integrating APIs. Start using the screen.
- Every app your agent needs already has a UI. One skill gives your agent eyes and hands on all of them.
-
-
-
12+
-
Separate API integrations
-
Each service needs its own API key, auth flow, rate limit handling, and breaking change maintenance.
-
-
-
1
-
Skill that handles all of them
-
Your agent sees the screen and interacts with any app the same way you do. Zero per-service setup.
-
-
-
0
-
API keys to manage
-
No OAuth flows, no token rotation, no webhook endpoints. If you are logged in, your agent is too.
-
-
-
-
-
-
- Three Layers
- Instant, smart, or bulletproof - pick the right layer
- Three pipeline layers. Each task falls to the cheapest layer that can handle it.
-
-
-
Layer 1 - Action Router
-
- Regex + pattern matching on task text
- Query native accessibility tree
- Find element by name - click directly
- Zero LLM calls needed
- Instant execution, completely free
-
-
-
-
-
Layer 2 - Accessibility Reasoner
-
- Reads the UI Automation tree (text-only)
- Sends tree to a text LLM (Ollama or cloud)
- LLM reasons about which element to act on
- No screenshots needed - cheap or free
- Handles moderate complexity tasks
-
-
-
-
-
Layer 3 - Screenshot + Vision
-
- Full task sent to vision LLM with screenshot
- Claude Computer Use or vision model fallback
- Plans and executes step by step
- Verifies each action before continuing
- Handles multi-app, unfamiliar UIs
-
-
-
-
-
-
-
-
- Performance
- Real benchmarks, real tasks
- No cherry-picking. v0.5.1 smart pipeline benchmarks.
-
-
-
Action Router
-
~2s
-
Simple tasks - open, click, type
-
$0 cost Zero LLM
-
-
-
Screenshot
-
~57ms
-
Native capture — HD 1280px, optimized for LLMs
-
Native 1280px HD
-
-
-
Success Rate
-
100%
-
On tested workflows
-
Self-healing Auto-retry
-
-
-
-
-
Task
Time
API Calls
Result
-
-
-
Open Calculator, type 255*38=
2.6s
$0
✅
-
-
-
Chrome → Google Docs → write sentence
101.7s
10
✅
-
-
-
GitHub → read repos → Notepad → save
134.1s
18
✅
-
-
-
Open Notepad (Action Router)
~2s
$0
✅
-
-
-
-
-
-
- Features
- Speed, safety, reliability
-
-
-
🔍
-
UI Automation Tree
-
Queries native accessibility APIs to find buttons, menus, inputs by name - not pixel coordinates.
-
-
-
⚡
-
Zero-LLM Fast Path
-
Common tasks execute in milliseconds with zero AI calls via UI Automation.
-
-
-
🧠
-
Native Computer Use
-
Claude gets native desktop tools - screenshot, click, type, scroll, drag. Fully autonomous.
-
-
-
📸
-
Adaptive Screenshots
-
Scaled to 1280×720 for the API. ~57ms native capture, HD quality for reliable icon identification. Adaptive delays per action type.
-
-
-
🛡
-
Safety Tiers
-
Auto, Preview, Confirm. Three tiers from safe reads to destructive actions.
-
-
-
🖥
-
Native Desktop Control
-
Direct OS-level screen capture and input via @nut-tree-fork/nut-js. No server needed. 17× faster screenshots.
-
-
-
🩺
-
Self-Healing Doctor
-
Auto-detects your AI provider, tests models, configures the optimal pipeline. Falls back gracefully if a model is unavailable.
-
-
-
-
-
Auto
-
Open apps, navigate, read screen. Executes immediately.
-
-
-
Preview
-
Type text, fill forms. Logged before executing.
-
-
-
Confirm
-
Send messages, delete files. Waits for approval.
-
-
-
-
-
-
- Get Started
- Up and running in 2 minutes
- Just point your AI assistant to clawdcursor.com - or run it manually:
-
-
- Windows
- macOS
- Linux
-
-
-
-
-
git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor
-npm install ; npm run setup
-clawdcursor doctor
-
-
-
-
-
git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor && npm install && npm run setup
-
-
-
-clawdcursor doctor
-
-
-
-
-
git clone https://github.com/AmrDab/clawd-cursor.git
-cd clawd-cursor && npm install && npm run setup
-
-
-clawdcursor doctor
-
-
- Requires: Node.js 20+. npm run setup builds and registers clawdcursor globally. Use npm run build if you prefer npx clawdcursor instead.
-
-
-
-
-
-
-
Ready to teach your agent a new skill?
-
Open source. One install. Your agent sees your screen and gets to work.
-
-
- Star on GitHub
-
-
-
-
-
-
-
-
Version History
-
Every release, from day one.
-
-
-
-
-
-
-
v0.5.1 Feb 23, 2026
-
HD screenshots (1280px), JPEG quality 65, window focus stability, Paint drawing 78% faster, stop command.
-
-
-
-
-
-
-
v0.5.0 Feb 23, 2026
-
Smart 3-layer pipeline, doctor CLI, multi-provider (Anthropic/OpenAI/Ollama/Kimi), batch execution (3.6 actions/call), self-healing diagnostics.
-
-
-
-
-
-
-
v0.4.0 Feb 22, 2026
-
Native desktop control via nut-js. Removed VNC dependency. 53% faster benchmarks. Compressed screenshots, streaming LLM, trimmed prompts.
-
-
-
-
-
-
-
v0.3.x Mar 2025
-
VNC-based desktop control, accessibility bridge, safety tiers, action router with pattern matching.
-
-
-
-
-
-
-
v0.1.0 – v0.2.0 Jan–Feb 2025
-
Initial release. VNC connection, basic mouse/keyboard, screenshot capture, Anthropic Computer Use integration.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/shortcut-demo.md b/docs/shortcut-demo.md
similarity index 100%
rename from shortcut-demo.md
rename to docs/shortcut-demo.md
diff --git a/docs/v0.7.0/index.html b/docs/v0.7.0/index.html
new file mode 100644
index 0000000..7919c0d
--- /dev/null
+++ b/docs/v0.7.0/index.html
@@ -0,0 +1,615 @@
+
+
+
+
+
+ Clawd Cursor v0.7.0 · The Glove for Any AI Hand
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
v0.7.0 — Architecture Overhaul
+
Give your AIeyes and hands
+
Connect any AI to your desktop. It sees your screen, moves the mouse, types, clicks — anything you can do, it can do. Works with Claude, GPT, Gemini, Llama, or any AI.
+
+
+
+
+
+
+
+
+ Use it your way
+ No app integrations. No API keys per service.
+ If it's on your screen, your AI can use it. Gmail, Slack, Figma, Jira, native apps, legacy software — anything with a UI.
+
+
+
🧑💻 Tell it what to do
+
Bring an API key and just describe what you want. clawdcursor figures out the steps, executes them, and verifies they worked. You stay in plain English.
+
+ clawdcursor doctor
+ clawdcursor start
+
+
+
+
🤖 Connect your AI directly
+
Already using Claude Code, Cursor, or Windsurf? Add clawdcursor and your AI gets full desktop control as a native tool — no extra setup, no extra API calls.
+
+ Claude Code
+ Cursor
+ Windsurf
+ Zed
+
+
+
+
+
+
+
+ Setup
+ Three ways to connect
+ One server. Three modes. They all give your AI the same desktop access.
+
+
+
Claude Code / Cursor / Windsurf
+
+ Run clawdcursor consent once
+ Add to your AI client's config (one JSON block)
+ Desktop tools appear natively in your AI
+ No extra API key, no extra setup
+ Your AI decides what to do — clawdcursor does it
+
+
+
+
+
Give it a task directly
+
+ Run clawdcursor doctor to set up your AI
+ Run clawdcursor start
+ Type what you want in plain English
+ clawdcursor figures out the steps and executes
+ Works with Anthropic, OpenAI, Groq, Ollama, and more
+
+
+
+
+
Build with it
+
+ Run clawdcursor start
+ 40 tools available over HTTP on localhost (smart tools included)
+ OpenAI function-calling format
+ Call individual tools or send full tasks
+ Browse all schemas at GET /tools
+
+
+
+
+
+
+
+
+ Six Layers
+ OCR first, vision last
+ Each task falls to the cheapest layer that can handle it. OCR reads the screen as text. Vision is the absolute last resort. Only active in CLI agent mode.
+
+
+
⚡
+
L1 · Action Router — $0
+
Pattern-matched tasks: open apps, type text, press shortcuts, navigate URLs. Zero LLM calls. Instant execution.
+
+
+
🧠
+
L1.5 · Smart Interaction — text model
+
CDPDriver + UIDriver combo. One cheap text LLM call plans a multi-step action sequence. No screenshots needed.
+
+
+
🗂️
+
L2 · Skill Cache — $0
+
Remembers known action patterns (ctrl+t = new tab). Skips perception entirely for previously-solved interactions.
+
+
+
👁️
+
L2.5 · OCR Reasoner — text model
+
OCR reads screen text + bounding boxes. A11y tree supplements in parallel. Text model reasons and clicks by coordinate. Zero screenshots.
+
+
+
🌐
+
L0 · Browser / Playwright
+
Handles browser-specific tasks first via Playwright automation before escalating to the full pipeline.
+
+
+
🎯
+
L3 · Computer Use — vision model
+
Full vision model with screenshot + action loop. Last resort for anything the text layers can't handle. Supports any visible UI.
+
+
+
+
+
+
+ What's New
+ OCR-first. Blind agent ready.
+ v0.7.0 makes clawdcursor usable by low-cost agents that can't afford screenshots. OCR reads the screen as text — vision is the last resort.
+
+
+
👁️
+
OCR-First Pipeline
+
Windows.Media.Ocr reads every pixel on screen as text with bounding boxes. A11y tree runs in parallel as a supplement. Zero screenshots for most tasks.
+
+
+
🖱️
+
Smart Tools (Blind Agent)
+
4 new MCP tools: smart_read, smart_click, smart_type, invoke_element. Click buttons by name, not coordinates. No vision needed.
+
+
+
⌨️
+
Shortcuts Engine
+
Built-in keyboard shortcut database. shortcuts_list discovers available shortcuts. shortcuts_execute fires them instantly.
+
+
+
✅
+
Ground-Truth Verifier
+
Reads actual screen state after every action. Blocks false "done" reports — the agent can't lie about success anymore.
+
+
+
🔌
+
Native MCP Support
+
Plug directly into Claude Code, Cursor, Windsurf, or Zed as an MCP server. No REST plumbing — tools show up natively in your AI's registry.
+
+
+
🗂️
+
Skill Cache
+
Learns action patterns over time. Once it knows "new tab = ctrl+t", it skips all perception layers and executes instantly. Zero cost.
+
+
+
+
+
+
+ Get Started
+ Up and running in 2 minutes
+ Install, consent once, connect your AI.
+
+
+ Claude Code / Cursor
+ Give it tasks
+ Build with it
+
+
+
+
Terminal + claude_desktop_config.json
+
+git clone https://github.com/AmrDab/clawdcursor.git --branch v0.7.0
+cd clawdcursor && npm install && npm run setup
+
+
+clawdcursor consent
+
+
+{
+ "mcpServers": {
+ "clawdcursor": {
+ "command": "clawdcursor",
+ "args": ["mcp"]
+ }
+ }
+}
+
+
+
+
+
+
+
+git clone https://github.com/AmrDab/clawdcursor.git --branch v0.7.0
+cd clawdcursor && npm install && npm run setup
+
+
+clawdcursor doctor
+
+
+clawdcursor start
+
+
+clawdcursor task "Open Chrome and go to github.com"
+
+
+clawdcursor task
+
+
+
+
+
+git clone https://github.com/AmrDab/clawdcursor.git --branch v0.7.0
+cd clawdcursor && npm install && npm run setup
+clawdcursor start
+
+
+GET http://localhost:3847/tools
+
+
+POST http://localhost:3847/execute/desktop_screenshot
+POST http://localhost:3847/execute/mouse_click
+ { "x": 450, "y": 300 }
+
+
+POST http://localhost:3847/task
+ { "task": "Open Gmail and read latest email" }
+
+
+clawdcursor consent --accept
+
+
+ Requires Node.js 20+. Consent is one-time and stored in ~/.clawdcursor/consent. Server binds to localhost only.
+
+
+
+
+
+
+
+
Give your AI a body.
+
Open source. Any model. Any client. Your desktop, controlled.
+
+
+ Star on GitHub
+
+
+
+
+
+
+
+
+
diff --git a/package-lock.json b/package-lock.json
index 9545f3e..3a997cf 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,14 +1,15 @@
{
- "name": "clawd-cursor",
- "version": "0.5.5",
+ "name": "clawdcursor",
+ "version": "0.7.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
- "name": "clawd-cursor",
- "version": "0.5.5",
+ "name": "clawdcursor",
+ "version": "0.7.0",
"license": "MIT",
"dependencies": {
+ "@modelcontextprotocol/sdk": "^1.27.1",
"@nut-tree-fork/nut-js": "^4.2.0",
"commander": "^12.0.0",
"dotenv": "^16.4.0",
@@ -19,7 +20,6 @@
"zod": "^3.25.76"
},
"bin": {
- "clawd-cursor": "dist/index.js",
"clawdcursor": "dist/index.js"
},
"devDependencies": {
@@ -745,6 +745,18 @@
"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
}
},
+ "node_modules/@hono/node-server": {
+ "version": "1.19.11",
+ "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.11.tgz",
+ "integrity": "sha512-dr8/3zEaB+p0D2n/IUrlPF1HZm586qgJNXK1a9fhg/PzdtkK7Ksd5l312tJX2yBuALqDYBlG20QEbayqPyxn+g==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=18.14.1"
+ },
+ "peerDependencies": {
+ "hono": "^4"
+ }
+ },
"node_modules/@humanfs/core": {
"version": "0.19.1",
"resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz",
@@ -1584,6 +1596,368 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/@modelcontextprotocol/sdk": {
+ "version": "1.27.1",
+ "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.27.1.tgz",
+ "integrity": "sha512-sr6GbP+4edBwFndLbM60gf07z0FQ79gaExpnsjMGePXqFcSSb7t6iscpjk9DhFhwd+mTEQrzNafGP8/iGGFYaA==",
+ "license": "MIT",
+ "dependencies": {
+ "@hono/node-server": "^1.19.9",
+ "ajv": "^8.17.1",
+ "ajv-formats": "^3.0.1",
+ "content-type": "^1.0.5",
+ "cors": "^2.8.5",
+ "cross-spawn": "^7.0.5",
+ "eventsource": "^3.0.2",
+ "eventsource-parser": "^3.0.0",
+ "express": "^5.2.1",
+ "express-rate-limit": "^8.2.1",
+ "hono": "^4.11.4",
+ "jose": "^6.1.3",
+ "json-schema-typed": "^8.0.2",
+ "pkce-challenge": "^5.0.0",
+ "raw-body": "^3.0.0",
+ "zod": "^3.25 || ^4.0",
+ "zod-to-json-schema": "^3.25.1"
+ },
+ "engines": {
+ "node": ">=18"
+ },
+ "peerDependencies": {
+ "@cfworker/json-schema": "^4.1.1",
+ "zod": "^3.25 || ^4.0"
+ },
+ "peerDependenciesMeta": {
+ "@cfworker/json-schema": {
+ "optional": true
+ },
+ "zod": {
+ "optional": false
+ }
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/accepts": {
+ "version": "2.0.0",
+ "resolved": "https://registry.npmjs.org/accepts/-/accepts-2.0.0.tgz",
+ "integrity": "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==",
+ "license": "MIT",
+ "dependencies": {
+ "mime-types": "^3.0.0",
+ "negotiator": "^1.0.0"
+ },
+ "engines": {
+ "node": ">= 0.6"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/ajv": {
+ "version": "8.18.0",
+ "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz",
+ "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==",
+ "license": "MIT",
+ "dependencies": {
+ "fast-deep-equal": "^3.1.3",
+ "fast-uri": "^3.0.1",
+ "json-schema-traverse": "^1.0.0",
+ "require-from-string": "^2.0.2"
+ },
+ "funding": {
+ "type": "github",
+ "url": "https://github.com/sponsors/epoberezkin"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/body-parser": {
+ "version": "2.2.2",
+ "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz",
+ "integrity": "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==",
+ "license": "MIT",
+ "dependencies": {
+ "bytes": "^3.1.2",
+ "content-type": "^1.0.5",
+ "debug": "^4.4.3",
+ "http-errors": "^2.0.0",
+ "iconv-lite": "^0.7.0",
+ "on-finished": "^2.4.1",
+ "qs": "^6.14.1",
+ "raw-body": "^3.0.1",
+ "type-is": "^2.0.1"
+ },
+ "engines": {
+ "node": ">=18"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/content-disposition": {
+ "version": "1.0.1",
+ "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.1.tgz",
+ "integrity": "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=18"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/cookie-signature": {
+ "version": "1.2.2",
+ "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz",
+ "integrity": "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=6.6.0"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/debug": {
+ "version": "4.4.3",
+ "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
+ "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
+ "license": "MIT",
+ "dependencies": {
+ "ms": "^2.1.3"
+ },
+ "engines": {
+ "node": ">=6.0"
+ },
+ "peerDependenciesMeta": {
+ "supports-color": {
+ "optional": true
+ }
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/express": {
+ "version": "5.2.1",
+ "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz",
+ "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==",
+ "license": "MIT",
+ "dependencies": {
+ "accepts": "^2.0.0",
+ "body-parser": "^2.2.1",
+ "content-disposition": "^1.0.0",
+ "content-type": "^1.0.5",
+ "cookie": "^0.7.1",
+ "cookie-signature": "^1.2.1",
+ "debug": "^4.4.0",
+ "depd": "^2.0.0",
+ "encodeurl": "^2.0.0",
+ "escape-html": "^1.0.3",
+ "etag": "^1.8.1",
+ "finalhandler": "^2.1.0",
+ "fresh": "^2.0.0",
+ "http-errors": "^2.0.0",
+ "merge-descriptors": "^2.0.0",
+ "mime-types": "^3.0.0",
+ "on-finished": "^2.4.1",
+ "once": "^1.4.0",
+ "parseurl": "^1.3.3",
+ "proxy-addr": "^2.0.7",
+ "qs": "^6.14.0",
+ "range-parser": "^1.2.1",
+ "router": "^2.2.0",
+ "send": "^1.1.0",
+ "serve-static": "^2.2.0",
+ "statuses": "^2.0.1",
+ "type-is": "^2.0.1",
+ "vary": "^1.1.2"
+ },
+ "engines": {
+ "node": ">= 18"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/finalhandler": {
+ "version": "2.1.1",
+ "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-2.1.1.tgz",
+ "integrity": "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==",
+ "license": "MIT",
+ "dependencies": {
+ "debug": "^4.4.0",
+ "encodeurl": "^2.0.0",
+ "escape-html": "^1.0.3",
+ "on-finished": "^2.4.1",
+ "parseurl": "^1.3.3",
+ "statuses": "^2.0.1"
+ },
+ "engines": {
+ "node": ">= 18.0.0"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/fresh": {
+ "version": "2.0.0",
+ "resolved": "https://registry.npmjs.org/fresh/-/fresh-2.0.0.tgz",
+ "integrity": "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==",
+ "license": "MIT",
+ "engines": {
+ "node": ">= 0.8"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/iconv-lite": {
+ "version": "0.7.2",
+ "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz",
+ "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==",
+ "license": "MIT",
+ "dependencies": {
+ "safer-buffer": ">= 2.1.2 < 3.0.0"
+ },
+ "engines": {
+ "node": ">=0.10.0"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/json-schema-traverse": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
+ "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
+ "license": "MIT"
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/media-typer": {
+ "version": "1.1.0",
+ "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-1.1.0.tgz",
+ "integrity": "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==",
+ "license": "MIT",
+ "engines": {
+ "node": ">= 0.8"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/merge-descriptors": {
+ "version": "2.0.0",
+ "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz",
+ "integrity": "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=18"
+ },
+ "funding": {
+ "url": "https://github.com/sponsors/sindresorhus"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/mime-db": {
+ "version": "1.54.0",
+ "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz",
+ "integrity": "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==",
+ "license": "MIT",
+ "engines": {
+ "node": ">= 0.6"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/mime-types": {
+ "version": "3.0.2",
+ "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-3.0.2.tgz",
+ "integrity": "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==",
+ "license": "MIT",
+ "dependencies": {
+ "mime-db": "^1.54.0"
+ },
+ "engines": {
+ "node": ">=18"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/ms": {
+ "version": "2.1.3",
+ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
+ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
+ "license": "MIT"
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/negotiator": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-1.0.0.tgz",
+ "integrity": "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==",
+ "license": "MIT",
+ "engines": {
+ "node": ">= 0.6"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/raw-body": {
+ "version": "3.0.2",
+ "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-3.0.2.tgz",
+ "integrity": "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==",
+ "license": "MIT",
+ "dependencies": {
+ "bytes": "~3.1.2",
+ "http-errors": "~2.0.1",
+ "iconv-lite": "~0.7.0",
+ "unpipe": "~1.0.0"
+ },
+ "engines": {
+ "node": ">= 0.10"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/send": {
+ "version": "1.2.1",
+ "resolved": "https://registry.npmjs.org/send/-/send-1.2.1.tgz",
+ "integrity": "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==",
+ "license": "MIT",
+ "dependencies": {
+ "debug": "^4.4.3",
+ "encodeurl": "^2.0.0",
+ "escape-html": "^1.0.3",
+ "etag": "^1.8.1",
+ "fresh": "^2.0.0",
+ "http-errors": "^2.0.1",
+ "mime-types": "^3.0.2",
+ "ms": "^2.1.3",
+ "on-finished": "^2.4.1",
+ "range-parser": "^1.2.1",
+ "statuses": "^2.0.2"
+ },
+ "engines": {
+ "node": ">= 18"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/serve-static": {
+ "version": "2.2.1",
+ "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-2.2.1.tgz",
+ "integrity": "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==",
+ "license": "MIT",
+ "dependencies": {
+ "encodeurl": "^2.0.0",
+ "escape-html": "^1.0.3",
+ "parseurl": "^1.3.3",
+ "send": "^1.2.0"
+ },
+ "engines": {
+ "node": ">= 18"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
+ "node_modules/@modelcontextprotocol/sdk/node_modules/type-is": {
+ "version": "2.0.1",
+ "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz",
+ "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==",
+ "license": "MIT",
+ "dependencies": {
+ "content-type": "^1.0.5",
+ "media-typer": "^1.1.0",
+ "mime-types": "^3.0.0"
+ },
+ "engines": {
+ "node": ">= 0.6"
+ }
+ },
"node_modules/@noble/hashes": {
"version": "1.8.0",
"resolved": "https://registry.npmjs.org/@noble/hashes/-/hashes-1.8.0.tgz",
@@ -2831,6 +3205,45 @@
"url": "https://github.com/sponsors/epoberezkin"
}
},
+ "node_modules/ajv-formats": {
+ "version": "3.0.1",
+ "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-3.0.1.tgz",
+ "integrity": "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==",
+ "license": "MIT",
+ "dependencies": {
+ "ajv": "^8.0.0"
+ },
+ "peerDependencies": {
+ "ajv": "^8.0.0"
+ },
+ "peerDependenciesMeta": {
+ "ajv": {
+ "optional": true
+ }
+ }
+ },
+ "node_modules/ajv-formats/node_modules/ajv": {
+ "version": "8.18.0",
+ "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz",
+ "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==",
+ "license": "MIT",
+ "dependencies": {
+ "fast-deep-equal": "^3.1.3",
+ "fast-uri": "^3.0.1",
+ "json-schema-traverse": "^1.0.0",
+ "require-from-string": "^2.0.2"
+ },
+ "funding": {
+ "type": "github",
+ "url": "https://github.com/sponsors/epoberezkin"
+ }
+ },
+ "node_modules/ajv-formats/node_modules/json-schema-traverse": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
+ "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
+ "license": "MIT"
+ },
"node_modules/ansi-styles": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
@@ -3370,11 +3783,27 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/cors": {
+ "version": "2.8.6",
+ "resolved": "https://registry.npmjs.org/cors/-/cors-2.8.6.tgz",
+ "integrity": "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==",
+ "license": "MIT",
+ "dependencies": {
+ "object-assign": "^4",
+ "vary": "^1"
+ },
+ "engines": {
+ "node": ">= 0.10"
+ },
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
"node_modules/cross-spawn": {
"version": "7.0.6",
"resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
"integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
- "dev": true,
"license": "MIT",
"dependencies": {
"path-key": "^3.1.0",
@@ -3902,6 +4331,27 @@
"node": ">=0.8.x"
}
},
+ "node_modules/eventsource": {
+ "version": "3.0.7",
+ "resolved": "https://registry.npmjs.org/eventsource/-/eventsource-3.0.7.tgz",
+ "integrity": "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==",
+ "license": "MIT",
+ "dependencies": {
+ "eventsource-parser": "^3.0.1"
+ },
+ "engines": {
+ "node": ">=18.0.0"
+ }
+ },
+ "node_modules/eventsource-parser": {
+ "version": "3.0.6",
+ "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz",
+ "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=18.0.0"
+ }
+ },
"node_modules/exif-parser": {
"version": "0.1.12",
"resolved": "https://registry.npmjs.org/exif-parser/-/exif-parser-0.1.12.tgz",
@@ -3963,11 +4413,28 @@
"url": "https://opencollective.com/express"
}
},
+ "node_modules/express-rate-limit": {
+ "version": "8.3.1",
+ "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.3.1.tgz",
+ "integrity": "sha512-D1dKN+cmyPWuvB+G2SREQDzPY1agpBIcTa9sJxOPMCNeH3gwzhqJRDWCXW3gg0y//+LQ/8j52JbMROWyrKdMdw==",
+ "license": "MIT",
+ "dependencies": {
+ "ip-address": "10.1.0"
+ },
+ "engines": {
+ "node": ">= 16"
+ },
+ "funding": {
+ "url": "https://github.com/sponsors/express-rate-limit"
+ },
+ "peerDependencies": {
+ "express": ">= 4.11"
+ }
+ },
"node_modules/fast-deep-equal": {
"version": "3.1.3",
"resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
"integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
- "dev": true,
"license": "MIT"
},
"node_modules/fast-json-stable-stringify": {
@@ -3991,6 +4458,22 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/fast-uri": {
+ "version": "3.1.0",
+ "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
+ "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
+ "funding": [
+ {
+ "type": "github",
+ "url": "https://github.com/sponsors/fastify"
+ },
+ {
+ "type": "opencollective",
+ "url": "https://opencollective.com/fastify"
+ }
+ ],
+ "license": "BSD-3-Clause"
+ },
"node_modules/fdir": {
"version": "6.5.0",
"resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz",
@@ -4356,6 +4839,15 @@
"node": ">= 0.4"
}
},
+ "node_modules/hono": {
+ "version": "4.12.5",
+ "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.5.tgz",
+ "integrity": "sha512-3qq+FUBtlTHhtYxbxheZgY8NIFnkkC/MR8u5TTsr7YZ3wixryQ3cCwn3iZbg8p8B88iDBBAYSfZDS75t8MN7Vg==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=16.9.0"
+ }
+ },
"node_modules/http-errors": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz",
@@ -4466,6 +4958,15 @@
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
"license": "ISC"
},
+ "node_modules/ip-address": {
+ "version": "10.1.0",
+ "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
+ "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
+ "license": "MIT",
+ "engines": {
+ "node": ">= 12"
+ }
+ },
"node_modules/ipaddr.js": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
@@ -4525,6 +5026,12 @@
"node": ">=0.10.0"
}
},
+ "node_modules/is-promise": {
+ "version": "4.0.0",
+ "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz",
+ "integrity": "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==",
+ "license": "MIT"
+ },
"node_modules/is-wsl": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-2.2.0.tgz",
@@ -4565,6 +5072,15 @@
"regenerator-runtime": "^0.13.3"
}
},
+ "node_modules/jose": {
+ "version": "6.2.1",
+ "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.1.tgz",
+ "integrity": "sha512-jUaKr1yrbfaImV7R2TN/b3IcZzsw38/chqMpo2XJ7i2F8AfM/lA4G1goC3JVEwg0H7UldTmSt3P68nt31W7/mw==",
+ "license": "MIT",
+ "funding": {
+ "url": "https://github.com/sponsors/panva"
+ }
+ },
"node_modules/jpeg-js": {
"version": "0.4.4",
"resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.4.4.tgz",
@@ -4598,6 +5114,12 @@
"dev": true,
"license": "MIT"
},
+ "node_modules/json-schema-typed": {
+ "version": "8.0.2",
+ "resolved": "https://registry.npmjs.org/json-schema-typed/-/json-schema-typed-8.0.2.tgz",
+ "integrity": "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA==",
+ "license": "BSD-2-Clause"
+ },
"node_modules/json-stable-stringify-without-jsonify": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz",
@@ -4852,6 +5374,15 @@
}
}
},
+ "node_modules/object-assign": {
+ "version": "4.1.1",
+ "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
+ "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=0.10.0"
+ }
+ },
"node_modules/object-inspect": {
"version": "1.13.4",
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz",
@@ -5031,7 +5562,6 @@
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
"integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
- "dev": true,
"license": "MIT",
"engines": {
"node": ">=8"
@@ -5117,6 +5647,15 @@
"node": ">=4.0.0"
}
},
+ "node_modules/pkce-challenge": {
+ "version": "5.0.1",
+ "resolved": "https://registry.npmjs.org/pkce-challenge/-/pkce-challenge-5.0.1.tgz",
+ "integrity": "sha512-wQ0b/W4Fr01qtpHlqSqspcj3EhBvimsdh0KlHhH8HRZnMsEa0ea2fTULOXOS9ccQr3om+GcGRk4e+isrZWV8qQ==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=16.20.0"
+ }
+ },
"node_modules/playwright": {
"version": "1.58.2",
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.2.tgz",
@@ -5352,6 +5891,15 @@
"integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==",
"license": "MIT"
},
+ "node_modules/require-from-string": {
+ "version": "2.0.2",
+ "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz",
+ "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=0.10.0"
+ }
+ },
"node_modules/resolve-from": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz",
@@ -5417,6 +5965,55 @@
"fsevents": "~2.3.2"
}
},
+ "node_modules/router": {
+ "version": "2.2.0",
+ "resolved": "https://registry.npmjs.org/router/-/router-2.2.0.tgz",
+ "integrity": "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==",
+ "license": "MIT",
+ "dependencies": {
+ "debug": "^4.4.0",
+ "depd": "^2.0.0",
+ "is-promise": "^4.0.0",
+ "parseurl": "^1.3.3",
+ "path-to-regexp": "^8.0.0"
+ },
+ "engines": {
+ "node": ">= 18"
+ }
+ },
+ "node_modules/router/node_modules/debug": {
+ "version": "4.4.3",
+ "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
+ "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
+ "license": "MIT",
+ "dependencies": {
+ "ms": "^2.1.3"
+ },
+ "engines": {
+ "node": ">=6.0"
+ },
+ "peerDependenciesMeta": {
+ "supports-color": {
+ "optional": true
+ }
+ }
+ },
+ "node_modules/router/node_modules/ms": {
+ "version": "2.1.3",
+ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
+ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
+ "license": "MIT"
+ },
+ "node_modules/router/node_modules/path-to-regexp": {
+ "version": "8.3.0",
+ "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-8.3.0.tgz",
+ "integrity": "sha512-7jdwVIRtsP8MYpdXSwOS0YdD0Du+qOoF/AEPIt88PcCFrZCzx41oxku1jD88hZBwbNUIEfpqvuhjFaMAqMTWnA==",
+ "license": "MIT",
+ "funding": {
+ "type": "opencollective",
+ "url": "https://opencollective.com/express"
+ }
+ },
"node_modules/safe-buffer": {
"version": "5.2.1",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@@ -5558,7 +6155,6 @@
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
"integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
- "dev": true,
"license": "MIT",
"dependencies": {
"shebang-regex": "^3.0.0"
@@ -5571,7 +6167,6 @@
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
"integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
- "dev": true,
"license": "MIT",
"engines": {
"node": ">=8"
@@ -6243,7 +6838,6 @@
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
"integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
- "dev": true,
"license": "ISC",
"dependencies": {
"isexe": "^2.0.0"
@@ -6379,6 +6973,15 @@
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
+ },
+ "node_modules/zod-to-json-schema": {
+ "version": "3.25.1",
+ "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.1.tgz",
+ "integrity": "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA==",
+ "license": "ISC",
+ "peerDependencies": {
+ "zod": "^3.25 || ^4"
+ }
}
}
}
diff --git a/package.json b/package.json
index 84f8c0a..d54e28f 100644
--- a/package.json
+++ b/package.json
@@ -1,10 +1,9 @@
{
- "name": "clawd-cursor",
- "version": "0.6.3",
- "description": "AI Desktop Agent — Native screen control via @nut-tree-fork/nut-js. Your AI sees the screen, moves the mouse, types, and completes tasks autonomously.",
+ "name": "clawdcursor",
+ "version": "0.7.0",
+ "description": "OS-level desktop automation server. Gives any AI model eyes, hands, and ears on a real computer. Model-agnostic — works with Claude, GPT, Gemini, Llama, or any tool-calling model.",
"main": "dist/index.js",
"bin": {
- "clawd-cursor": "dist/index.js",
"clawdcursor": "dist/index.js"
},
"scripts": {
@@ -19,6 +18,7 @@
"test": "vitest"
},
"dependencies": {
+ "@modelcontextprotocol/sdk": "^1.27.1",
"@nut-tree-fork/nut-js": "^4.2.0",
"commander": "^12.0.0",
"dotenv": "^16.4.0",
@@ -47,4 +47,3 @@
},
"license": "MIT"
}
-
diff --git a/perf/apply-optimizations.ps1 b/perf/apply-optimizations.ps1
index e0fd0be..5119f2c 100644
--- a/perf/apply-optimizations.ps1
+++ b/perf/apply-optimizations.ps1
@@ -1,5 +1,5 @@
# Apply Clawd Cursor performance optimizations
-# Usage: .\apply-optimizations.ps1 -ProjectRoot
+# Usage: .\apply-optimizations.ps1 -ProjectRoot
#
# Creates .orig backups before modifying any file.
# Run `npx tsc --noEmit` after to verify.
diff --git a/scripts/get-screen-context.ps1 b/scripts/get-screen-context.ps1
index 8d21207..c000060 100644
--- a/scripts/get-screen-context.ps1
+++ b/scripts/get-screen-context.ps1
@@ -9,7 +9,7 @@
#>
param(
[int]$FocusedProcessId = 0,
- [int]$MaxDepth = 2
+ [int]$MaxDepth = 4
)
try {
@@ -34,7 +34,8 @@ $interactiveTypes = @(
function ConvertTo-UINode {
param(
[System.Windows.Automation.AutomationElement]$Element,
- [int]$Depth = 0
+ [int]$Depth = 0,
+ [int]$TreeMaxDepth = 4
)
if ($null -eq $Element) { return $null }
@@ -47,23 +48,19 @@ function ConvertTo-UINode {
# Skip non-interactive unnamed elements
if (-not $isInteractive -and -not $hasName -and $Depth -gt 0) {
- # Still recurse into children — interactive elements may be nested
+ if ($Depth -ge $TreeMaxDepth) { return $null }
$childNodes = @()
- if ($Depth -lt $MaxDepth) {
- try {
- $kids = $Element.FindAll(
- [System.Windows.Automation.TreeScope]::Children,
- [System.Windows.Automation.Condition]::TrueCondition
- )
- foreach ($kid in $kids) {
- $childNode = ConvertTo-UINode -Element $kid -Depth ($Depth + 1)
- if ($null -ne $childNode) { $childNodes += $childNode }
- }
- } catch {}
- }
- # If this node has no interesting children, skip it entirely
+ try {
+ $kids = $Element.FindAll(
+ [System.Windows.Automation.TreeScope]::Children,
+ [System.Windows.Automation.Condition]::TrueCondition
+ )
+ foreach ($kid in $kids) {
+ $childNode = ConvertTo-UINode -Element $kid -Depth ($Depth + 1) -TreeMaxDepth $TreeMaxDepth
+ if ($null -ne $childNode) { $childNodes += $childNode }
+ }
+ } catch {}
if ($childNodes.Count -eq 0) { return $null }
- # Return children directly (flatten)
return $childNodes
}
@@ -87,20 +84,17 @@ function ConvertTo-UINode {
children = @()
}
- if ($Depth -lt $MaxDepth) {
+ if ($Depth -lt $TreeMaxDepth) {
try {
$kids = $Element.FindAll(
[System.Windows.Automation.TreeScope]::Children,
[System.Windows.Automation.Condition]::TrueCondition
)
foreach ($kid in $kids) {
- $childNode = ConvertTo-UINode -Element $kid -Depth ($Depth + 1)
+ $childNode = ConvertTo-UINode -Element $kid -Depth ($Depth + 1) -TreeMaxDepth $TreeMaxDepth
if ($null -ne $childNode) {
- if ($childNode -is [array]) {
- $node.children += $childNode
- } else {
- $node.children += $childNode
- }
+ if ($childNode -is [array]) { $node.children += $childNode }
+ else { $node.children += $childNode }
}
}
} catch {}
@@ -172,7 +166,7 @@ try {
$condition
)
if ($null -ne $targetWindow) {
- $uiTree = ConvertTo-UINode -Element $targetWindow -Depth 0
+ $uiTree = ConvertTo-UINode -Element $targetWindow -Depth 0 -TreeMaxDepth $MaxDepth
}
}
diff --git a/scripts/invoke-element.ps1 b/scripts/invoke-element.ps1
index 537e54e..a79d103 100644
--- a/scripts/invoke-element.ps1
+++ b/scripts/invoke-element.ps1
@@ -85,47 +85,57 @@ try {
exit 0
}
- # Build the search condition for the element
+ # Build condition (without name — fuzzy name matching done below)
$conditions = @()
-
if ($AutomationId -ne "") {
$conditions += New-Object System.Windows.Automation.PropertyCondition(
- [System.Windows.Automation.AutomationElement]::AutomationIdProperty,
- $AutomationId
- )
- }
-
- if ($Name -ne "") {
- $conditions += New-Object System.Windows.Automation.PropertyCondition(
- [System.Windows.Automation.AutomationElement]::NameProperty,
- $Name
+ [System.Windows.Automation.AutomationElement]::AutomationIdProperty, $AutomationId
)
}
-
if ($ControlType -ne "" -and $ctMap.ContainsKey($ControlType)) {
$conditions += New-Object System.Windows.Automation.PropertyCondition(
- [System.Windows.Automation.AutomationElement]::ControlTypeProperty,
- $ctMap[$ControlType]
+ [System.Windows.Automation.AutomationElement]::ControlTypeProperty, $ctMap[$ControlType]
)
}
-
- if ($conditions.Count -eq 0) {
- [Console]::Out.Write((@{ success = $false; error = "Must specify at least -AutomationId or -Name to identify the element" } | ConvertTo-Json -Compress))
+ if ($conditions.Count -eq 0 -and $Name -eq "") {
+ [Console]::Out.Write((@{ success = $false; error = "Must specify at least -AutomationId or -Name" } | ConvertTo-Json -Compress))
exit 0
}
- if ($conditions.Count -eq 1) {
- $searchCondition = $conditions[0]
- } else {
- $searchCondition = New-Object System.Windows.Automation.AndCondition(
- [System.Windows.Automation.Condition[]]$conditions
- )
+ $searchCondition = if ($conditions.Count -eq 0) { [System.Windows.Automation.Condition]::TrueCondition }
+ elseif ($conditions.Count -eq 1) { $conditions[0] }
+ else { New-Object System.Windows.Automation.AndCondition([System.Windows.Automation.Condition[]]$conditions) }
+
+ $element = $null
+
+ # Fast path: exact automationId match
+ if ($AutomationId -ne "" -and $conditions.Count -gt 0) {
+ $element = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $searchCondition)
}
- $element = $window.FindFirst(
- [System.Windows.Automation.TreeScope]::Descendants,
- $searchCondition
- )
+ # Fuzzy name match: strip keyboard shortcut suffix ("Save\tCtrl+S" → "save")
+ if ($null -eq $element -and $Name -ne "") {
+ $nameLower = $Name.ToLower()
+ $candidates = $window.FindAll([System.Windows.Automation.TreeScope]::Descendants, $searchCondition)
+ # First pass: exact stripped match
+ foreach ($el in $candidates) {
+ try {
+ $elName = ($el.Current.Name -replace '\t.*$', '').Trim().ToLower()
+ if ($elName -eq $nameLower -and $elName.Length -gt 0) { $element = $el; break }
+ } catch {}
+ }
+ # Second pass: contains match
+ if ($null -eq $element) {
+ foreach ($el in $candidates) {
+ try {
+ $elName = ($el.Current.Name -replace '\t.*$', '').Trim().ToLower()
+ if ($elName.Length -gt 0 -and ($elName.Contains($nameLower) -or $nameLower.Contains($elName))) {
+ $element = $el; break
+ }
+ } catch {}
+ }
+ }
+ }
if ($null -eq $element) {
$searchDesc = ""
diff --git a/scripts/linux/ocr-recognize.py b/scripts/linux/ocr-recognize.py
new file mode 100644
index 0000000..139001d
--- /dev/null
+++ b/scripts/linux/ocr-recognize.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+Linux OCR via Tesseract (pytesseract) or tesseract CLI.
+Takes an image path, outputs JSON result to stdout.
+Matches the same JSON format as ocr-recognize.ps1 (Windows).
+
+Usage: python3 ocr-recognize.py /path/to/image.png
+
+Requires: tesseract-ocr package
+ Ubuntu/Debian: sudo apt install tesseract-ocr
+ Fedora: sudo dnf install tesseract
+ Arch: sudo pacman -S tesseract
+
+Optional: pip install pytesseract (for bounding boxes)
+"""
+
+import json
+import subprocess
+import sys
+import os
+import shutil
+
+def ocr_with_tesseract_cli(image_path):
+ """Use tesseract CLI with TSV output for bounding boxes."""
+ try:
+ result = subprocess.run(
+ ['tesseract', image_path, '-', 'tsv'],
+ capture_output=True, text=True, timeout=30
+ )
+ if result.returncode != 0:
+ return {"error": f"tesseract failed: {result.stderr.strip()}"}
+
+ elements = []
+ lines_text = []
+ current_line = -1
+
+ for line in result.stdout.strip().split('\n')[1:]: # skip header
+ parts = line.split('\t')
+ if len(parts) < 12:
+ continue
+
+ level, page, block, par, line_num, word_num = parts[:6]
+ left, top, width, height = parts[6:10]
+ conf = parts[10]
+ text = parts[11].strip() if len(parts) > 11 else ''
+
+ if not text or conf == '-1':
+ continue
+
+ line_idx = int(line_num)
+ if line_idx != current_line:
+ current_line = line_idx
+ if text:
+ lines_text.append(text)
+ else:
+ lines_text.append('')
+ else:
+ if lines_text:
+ lines_text[-1] += ' ' + text
+
+ elements.append({
+ "text": text,
+ "x": int(left),
+ "y": int(top),
+ "width": int(width),
+ "height": int(height),
+ "confidence": round(max(0, int(conf)) / 100, 2),
+ "line": line_idx
+ })
+
+ return {
+ "elements": elements,
+ "fullText": '\n'.join(lines_text)
+ }
+ except FileNotFoundError:
+ return {"error": "tesseract not found. Install: sudo apt install tesseract-ocr"}
+ except subprocess.TimeoutExpired:
+ return {"error": "tesseract timed out after 30s"}
+ except Exception as e:
+ return {"error": f"tesseract error: {str(e)}"}
+
+
+def ocr_with_pytesseract(image_path):
+ """Use pytesseract for bounding boxes (if installed)."""
+ try:
+ import pytesseract
+ from PIL import Image
+
+ img = Image.open(image_path)
+ data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
+
+ elements = []
+ lines_text = []
+ current_line = -1
+
+ for i in range(len(data['text'])):
+ text = data['text'][i].strip()
+ conf = int(data['conf'][i])
+
+ if not text or conf < 0:
+ continue
+
+ line_idx = data['line_num'][i]
+ if line_idx != current_line:
+ current_line = line_idx
+ lines_text.append(text)
+ else:
+ if lines_text:
+ lines_text[-1] += ' ' + text
+
+ elements.append({
+ "text": text,
+ "x": data['left'][i],
+ "y": data['top'][i],
+ "width": data['width'][i],
+ "height": data['height'][i],
+ "confidence": round(conf / 100, 2),
+ "line": line_idx
+ })
+
+ return {
+ "elements": elements,
+ "fullText": '\n'.join(lines_text)
+ }
+ except ImportError:
+ return None # Fall back to CLI
+ except Exception as e:
+ return {"error": f"pytesseract error: {str(e)}"}
+
+
+def main():
+ if len(sys.argv) < 2:
+ print(json.dumps({"error": "Usage: ocr-recognize.py <image_path>"}))
+ return
+
+ image_path = sys.argv[1]
+ if not os.path.isfile(image_path):
+ print(json.dumps({"error": f"Image not found: {image_path}"}))
+ return
+
+ # Try pytesseract first (better bounding boxes), fall back to CLI
+ result = ocr_with_pytesseract(image_path)
+ if result is None:
+ # pytesseract not installed, use CLI
+ if shutil.which('tesseract'):
+ result = ocr_with_tesseract_cli(image_path)
+ else:
+ result = {"error": "No OCR available. Install: sudo apt install tesseract-ocr"}
+
+ print(json.dumps(result))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/mac/get-focused-element.jxa b/scripts/mac/get-focused-element.jxa
new file mode 100644
index 0000000..5371f9d
--- /dev/null
+++ b/scripts/mac/get-focused-element.jxa
@@ -0,0 +1,102 @@
+#!/usr/bin/env osascript -l JavaScript
+/**
+ * get-focused-element.jxa — returns the currently focused UI element on macOS.
+ *
+ * Output: JSON with name, role, value, bounds, processId.
+ * Matches the FocusedElementInfo shape from accessibility.ts.
+ */
+
+function run() {
+ const se = Application('System Events');
+ se.includeStandardAdditions = true;
+
+ try {
+ const procs = se.applicationProcesses.whose({ frontmost: true });
+ if (procs.length === 0) {
+ return JSON.stringify(null);
+ }
+
+ const proc = procs[0];
+ const pid = proc.unixId();
+ const procName = proc.name();
+
+ // Get the focused UI element
+ let focused;
+ try {
+ focused = proc.focusedUIElement();
+ } catch (e) {
+ // Some apps don't expose focused element — return window info instead
+ try {
+ const wins = proc.windows();
+ if (wins.length > 0) {
+ const win = wins[0];
+ const pos = win.position();
+ const sz = win.size();
+ return JSON.stringify({
+ name: win.name() || procName,
+ automationId: '',
+ controlType: 'Window',
+ className: procName,
+ processId: pid,
+ isEnabled: true,
+ bounds: { x: pos[0], y: pos[1], width: sz[0], height: sz[1] },
+ value: ''
+ });
+ }
+ } catch (_) {}
+ return JSON.stringify(null);
+ }
+
+ // Extract properties safely
+ let name = '', role = '', value = '';
+ let bounds = { x: 0, y: 0, width: 0, height: 0 };
+
+ try { name = focused.name() || ''; } catch (_) {}
+ try { role = focused.role() || ''; } catch (_) {}
+ try { value = String(focused.value() || ''); } catch (_) {}
+
+ try {
+ const pos = focused.position();
+ const sz = focused.size();
+ bounds = { x: pos[0], y: pos[1], width: sz[0], height: sz[1] };
+ } catch (_) {}
+
+ // Map AX roles to Windows ControlType names for compatibility
+ const roleMap = {
+ 'AXTextField': 'Edit',
+ 'AXTextArea': 'Edit',
+ 'AXButton': 'Button',
+ 'AXCheckBox': 'CheckBox',
+ 'AXRadioButton': 'RadioButton',
+ 'AXComboBox': 'ComboBox',
+ 'AXList': 'List',
+ 'AXTable': 'Table',
+ 'AXMenu': 'Menu',
+ 'AXMenuItem': 'MenuItem',
+ 'AXStaticText': 'Text',
+ 'AXImage': 'Image',
+ 'AXLink': 'Hyperlink',
+ 'AXGroup': 'Group',
+ 'AXWindow': 'Window',
+ 'AXWebArea': 'Document',
+ 'AXScrollArea': 'ScrollBar',
+ 'AXTab': 'Tab',
+ 'AXSlider': 'Slider',
+ };
+
+ const controlType = roleMap[role] || role.replace('AX', '') || 'Unknown';
+
+ return JSON.stringify({
+ name: name,
+ automationId: '',
+ controlType: controlType,
+ className: procName,
+ processId: pid,
+ isEnabled: true,
+ bounds: bounds,
+ value: value
+ });
+ } catch (e) {
+ return JSON.stringify(null);
+ }
+}
diff --git a/scripts/mac/ocr-recognize.swift b/scripts/mac/ocr-recognize.swift
new file mode 100644
index 0000000..71c616a
--- /dev/null
+++ b/scripts/mac/ocr-recognize.swift
@@ -0,0 +1,124 @@
+#!/usr/bin/env swift
+// macOS OCR via Vision framework (VNRecognizeTextRequest)
+// Takes an image path, outputs JSON result to stdout.
+// Matches the same JSON format as ocr-recognize.ps1 (Windows).
+//
+// Usage: swift ocr-recognize.swift /path/to/image.png
+
+import Foundation
+import Vision
+import AppKit
+
+guard CommandLine.arguments.count > 1 else {
+ let err: [String: Any] = ["error": "Usage: ocr-recognize.swift <image_path>"]
+ if let data = try? JSONSerialization.data(withJSONObject: err),
+ let str = String(data: data, encoding: .utf8) {
+ print(str)
+ }
+ exit(0)
+}
+
+let imagePath = CommandLine.arguments[1]
+let imageURL = URL(fileURLWithPath: imagePath)
+
+guard let image = NSImage(contentsOf: imageURL),
+ let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
+ let err: [String: Any] = ["error": "Failed to load image: \(imagePath)"]
+ if let data = try? JSONSerialization.data(withJSONObject: err),
+ let str = String(data: data, encoding: .utf8) {
+ print(str)
+ }
+ exit(0)
+}
+
+let imageWidth = CGFloat(cgImage.width)
+let imageHeight = CGFloat(cgImage.height)
+
+let semaphore = DispatchSemaphore(value: 0)
+var elements: [[String: Any]] = []
+var fullText = ""
+
+let request = VNRecognizeTextRequest { request, error in
+ defer { semaphore.signal() }
+
+ if let error = error {
+ let err: [String: Any] = ["error": "OCR failed: \(error.localizedDescription)"]
+ if let data = try? JSONSerialization.data(withJSONObject: err),
+ let str = String(data: data, encoding: .utf8) {
+ print(str)
+ }
+ return
+ }
+
+ guard let observations = request.results as? [VNRecognizedTextObservation] else { return }
+
+ var lineIdx = 0
+ var lines: [String] = []
+
+ for observation in observations {
+ guard let candidate = observation.topCandidates(1).first else { continue }
+ let text = candidate.string
+ let confidence = candidate.confidence
+ let box = observation.boundingBox
+
+ // Vision coordinates: origin bottom-left, normalized 0-1
+ // Convert to screen pixels: origin top-left
+ let x = box.origin.x * imageWidth
+ let y = (1.0 - box.origin.y - box.height) * imageHeight
+ let w = box.width * imageWidth
+ let h = box.height * imageHeight
+
+ // Split into words for per-word bounding boxes (approximate)
+ let words = text.components(separatedBy: " ")
+ let wordWidth = w / CGFloat(max(words.count, 1))
+
+ for (i, word) in words.enumerated() {
+ guard !word.isEmpty else { continue }
+ let element: [String: Any] = [
+ "text": word,
+ "x": Int(round(x + wordWidth * CGFloat(i))),
+ "y": Int(round(y)),
+ "width": Int(round(wordWidth)),
+ "height": Int(round(h)),
+ "confidence": round(Double(confidence) * 100) / 100,
+ "line": lineIdx
+ ]
+ elements.append(element)
+ }
+
+ lines.append(text)
+ lineIdx += 1
+ }
+
+ fullText = lines.joined(separator: "\n")
+}
+
+// Configure for accuracy (fast is also available)
+request.recognitionLevel = .accurate
+request.usesLanguageCorrection = true
+
+let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+do {
+ try handler.perform([request])
+} catch {
+ let err: [String: Any] = ["error": "VNImageRequestHandler failed: \(error.localizedDescription)"]
+ if let data = try? JSONSerialization.data(withJSONObject: err),
+ let str = String(data: data, encoding: .utf8) {
+ print(str)
+ }
+ exit(0)
+}
+
+// Wait for async completion
+semaphore.wait()
+
+// Output JSON matching Windows format
+let result: [String: Any] = [
+ "elements": elements,
+ "fullText": fullText
+]
+
+if let data = try? JSONSerialization.data(withJSONObject: result),
+ let str = String(data: data, encoding: .utf8) {
+ print(str)
+}
diff --git a/scripts/ocr-recognize.ps1 b/scripts/ocr-recognize.ps1
new file mode 100644
index 0000000..f8a6d92
--- /dev/null
+++ b/scripts/ocr-recognize.ps1
@@ -0,0 +1,96 @@
+# OCR Recognition via Windows.Media.Ocr WinRT API
+# Takes an image path, runs OCR, outputs JSON result to stdout.
+# Called one-shot per OCR request (results cached in TypeScript layer).
+
+param([string]$ImagePath)
+
+try {
+ # Resolve to absolute path (required by WinRT StorageFile API)
+ $ImagePath = (Resolve-Path $ImagePath -ErrorAction Stop).Path
+
+ # Load WinRT interop assembly
+ Add-Type -AssemblyName System.Runtime.WindowsRuntime
+
+ # Load WinRT types
+ $null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
+ $null = [Windows.Graphics.Imaging.SoftwareBitmap, Windows.Foundation, ContentType = WindowsRuntime]
+ $null = [Windows.Graphics.Imaging.BitmapDecoder, Windows.Foundation, ContentType = WindowsRuntime]
+ $null = [Windows.Storage.StorageFile, Windows.Foundation, ContentType = WindowsRuntime]
+ $null = [Windows.Storage.Streams.RandomAccessStream, Windows.Foundation, ContentType = WindowsRuntime]
+
+ # Find the AsTask(IAsyncOperation) extension method
+ $asTaskOp = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
+ $_.Name -eq 'AsTask' -and
+ $_.IsGenericMethod -and
+ $_.GetParameters().Count -eq 1 -and
+ $_.GetParameters()[0].ParameterType.IsGenericType -and
+ $_.GetParameters()[0].ParameterType.GetGenericTypeDefinition().Name -eq 'IAsyncOperation`1'
+ } | Select-Object -First 1
+
+ if (-not $asTaskOp) {
+ [Console]::Out.WriteLine('{"error":"Cannot find AsTask method for WinRT async"}')
+ [Console]::Out.Flush()
+ exit 0
+ }
+
+ function Invoke-Async {
+ param([object]$AsyncOp, [Type]$ResultType)
+ $genericMethod = $script:asTaskOp.MakeGenericMethod($ResultType)
+ $task = $genericMethod.Invoke($null, @($AsyncOp))
+ $task.Wait() | Out-Null
+ return $task.Result
+ }
+
+ # Create OCR engine from user profile languages
+ $ocr = [Windows.Media.Ocr.OcrEngine]::TryCreateFromUserProfileLanguages()
+ if (-not $ocr) {
+ [Console]::Out.WriteLine('{"error":"Windows OCR engine not available - no recognized languages installed"}')
+ [Console]::Out.Flush()
+ exit 0
+ }
+
+ # Open image via WinRT StorageFile → stream → BitmapDecoder → SoftwareBitmap
+ $storageFile = Invoke-Async ([Windows.Storage.StorageFile]::GetFileFromPathAsync($ImagePath)) ([Windows.Storage.StorageFile])
+ $stream = Invoke-Async ($storageFile.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
+ $decoder = Invoke-Async ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
+ $bitmap = Invoke-Async ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])
+
+ # Run OCR recognition
+ $ocrResult = Invoke-Async ($ocr.RecognizeAsync($bitmap)) ([Windows.Media.Ocr.OcrResult])
+
+ # Build structured result — one entry per word with bounding box
+ $elements = @()
+ $lineIdx = 0
+ foreach ($line in $ocrResult.Lines) {
+ foreach ($word in $line.Words) {
+ $r = $word.BoundingRect
+ $elements += [PSCustomObject]@{
+ text = ($word.Text -replace '[\x00-\x08\x0B\x0C\x0E-\x1F]', '')
+ x = [math]::Round($r.X)
+ y = [math]::Round($r.Y)
+ width = [math]::Round($r.Width)
+ height = [math]::Round($r.Height)
+ confidence = 1.0 # Windows.Media.Ocr does not expose per-word confidence
+ line = $lineIdx
+ }
+ }
+ $lineIdx++
+ }
+
+ # Clean up WinRT resources
+ try { $stream.Dispose() } catch {}
+ try { $bitmap.Dispose() } catch {}
+
+ # Output single-line JSON
+ $output = [PSCustomObject]@{
+ elements = @($elements) # force array even if 0 or 1 element
+ fullText = if ($ocrResult.Text) { $ocrResult.Text } else { "" }
+ }
+ [Console]::Out.WriteLine(($output | ConvertTo-Json -Depth 3 -Compress))
+ [Console]::Out.Flush()
+
+} catch {
+ [Console]::Out.WriteLine((@{ error = $_.Exception.Message } | ConvertTo-Json -Compress))
+ [Console]::Out.Flush()
+ exit 0
+}
diff --git a/scripts/ps-bridge.ps1 b/scripts/ps-bridge.ps1
new file mode 100644
index 0000000..9d2ccca
--- /dev/null
+++ b/scripts/ps-bridge.ps1
@@ -0,0 +1,503 @@
+# Persistent PowerShell UIA Bridge
+# Reads newline-delimited JSON commands from stdin, writes results to stdout.
+# Keeps UI Automation assemblies and Win32 types loaded between calls —
+# eliminates 200-500ms PowerShell startup overhead on every a11y operation.
+
+try {
+ Add-Type -AssemblyName UIAutomationClient
+ Add-Type -AssemblyName UIAutomationTypes
+} catch {
+ [Console]::Out.WriteLine((@{ error = "Assembly load failed: $($_.Exception.Message)" } | ConvertTo-Json -Compress))
+ [Console]::Out.Flush()
+ exit 1
+}
+
+try {
+ Add-Type @"
+ using System;
+ using System.Runtime.InteropServices;
+ public static class Win32UIA {
+ [DllImport("user32.dll")]
+ public static extern IntPtr GetForegroundWindow();
+ [DllImport("user32.dll", SetLastError = true)]
+ public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint lpdwProcessId);
+ [DllImport("user32.dll")]
+ public static extern bool SetForegroundWindow(IntPtr hWnd);
+ [DllImport("user32.dll")]
+ public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow);
+ }
+"@
+} catch { } # May already be defined in a long-running session
+
+$ErrorActionPreference = 'Continue'
+
+# Control type map
+$ctMap = @{
+ "Button" = [System.Windows.Automation.ControlType]::Button
+ "CheckBox" = [System.Windows.Automation.ControlType]::CheckBox
+ "ComboBox" = [System.Windows.Automation.ControlType]::ComboBox
+ "Custom" = [System.Windows.Automation.ControlType]::Custom
+ "DataGrid" = [System.Windows.Automation.ControlType]::DataGrid
+ "DataItem" = [System.Windows.Automation.ControlType]::DataItem
+ "Document" = [System.Windows.Automation.ControlType]::Document
+ "Edit" = [System.Windows.Automation.ControlType]::Edit
+ "Group" = [System.Windows.Automation.ControlType]::Group
+ "Hyperlink" = [System.Windows.Automation.ControlType]::Hyperlink
+ "Image" = [System.Windows.Automation.ControlType]::Image
+ "List" = [System.Windows.Automation.ControlType]::List
+ "ListItem" = [System.Windows.Automation.ControlType]::ListItem
+ "Menu" = [System.Windows.Automation.ControlType]::Menu
+ "MenuBar" = [System.Windows.Automation.ControlType]::MenuBar
+ "MenuItem" = [System.Windows.Automation.ControlType]::MenuItem
+ "Pane" = [System.Windows.Automation.ControlType]::Pane
+ "RadioButton" = [System.Windows.Automation.ControlType]::RadioButton
+ "ScrollBar" = [System.Windows.Automation.ControlType]::ScrollBar
+ "Slider" = [System.Windows.Automation.ControlType]::Slider
+ "Spinner" = [System.Windows.Automation.ControlType]::Spinner
+ "SplitButton" = [System.Windows.Automation.ControlType]::SplitButton
+ "Tab" = [System.Windows.Automation.ControlType]::Tab
+ "TabItem" = [System.Windows.Automation.ControlType]::TabItem
+ "Text" = [System.Windows.Automation.ControlType]::Text
+ "ToolBar" = [System.Windows.Automation.ControlType]::ToolBar
+ "Tree" = [System.Windows.Automation.ControlType]::Tree
+ "TreeItem" = [System.Windows.Automation.ControlType]::TreeItem
+ "Window" = [System.Windows.Automation.ControlType]::Window
+}
+
+$interactiveTypes = @(
+ 'ControlType.Button', 'ControlType.Edit', 'ControlType.ComboBox',
+ 'ControlType.CheckBox', 'ControlType.RadioButton', 'ControlType.Hyperlink',
+ 'ControlType.MenuItem', 'ControlType.Menu', 'ControlType.Tab',
+ 'ControlType.TabItem', 'ControlType.ListItem', 'ControlType.TreeItem',
+ 'ControlType.Slider', 'ControlType.Document', 'ControlType.DataItem',
+ 'ControlType.Pane', 'ControlType.Custom', 'ControlType.ToolBar',
+ 'ControlType.Text', 'ControlType.Group'
+)
+
+# ── UI tree builder ───────────────────────────────────────────────────────────
+function ConvertTo-UINode {
+ param(
+ [System.Windows.Automation.AutomationElement]$Element,
+ [int]$Depth = 0,
+ [int]$MaxDepth = 8
+ )
+ if ($null -eq $Element) { return $null }
+ try { $cur = $Element.Current } catch { return $null }
+
+ $typeName = $cur.ControlType.ProgrammaticName
+ $hasName = $cur.Name -and $cur.Name.Trim().Length -gt 0
+ $isInteractive = $interactiveTypes -contains $typeName
+
+ if (-not $isInteractive -and -not $hasName -and $Depth -gt 0) {
+ # Unnamed non-interactive element — only skip if it's a LEAF (no children)
+ # or we've hit max depth. Electron/WebView2 apps nest: Window > Pane > Pane > Pane > Button
+ if ($Depth -ge $MaxDepth) { return $null }
+ $childNodes = @()
+ try {
+ $kids = $Element.FindAll([System.Windows.Automation.TreeScope]::Children, [System.Windows.Automation.Condition]::TrueCondition)
+ foreach ($kid in $kids) {
+ $cn = ConvertTo-UINode -Element $kid -Depth ($Depth + 1) -MaxDepth $MaxDepth
+ if ($null -ne $cn) { $childNodes += $cn }
+ }
+ } catch {}
+ # Skip unnamed leaves — but recurse into unnamed containers that have children
+ if ($childNodes.Count -eq 0) { return $null }
+ return $childNodes
+ }
+
+ $rect = $cur.BoundingRectangle
+ $bounds = if ([double]::IsInfinity($rect.X) -or [double]::IsInfinity($rect.Y) -or $rect.X -lt -100 -or $rect.Y -lt -100) {
+ @{ x = 0; y = 0; width = 0; height = 0 }
+ } else {
+ @{ x = [Math]::Round($rect.X); y = [Math]::Round($rect.Y); width = [Math]::Round($rect.Width); height = [Math]::Round($rect.Height) }
+ }
+
+ $node = [ordered]@{
+ name = if ($cur.Name) { $cur.Name } else { "" }
+ automationId = if ($cur.AutomationId) { $cur.AutomationId } else { "" }
+ controlType = $typeName
+ className = if ($cur.ClassName) { $cur.ClassName } else { "" }
+ isEnabled = $cur.IsEnabled
+ bounds = $bounds
+ children = @()
+ }
+
+ if ($Depth -lt $MaxDepth) {
+ try {
+ $kids = $Element.FindAll([System.Windows.Automation.TreeScope]::Children, [System.Windows.Automation.Condition]::TrueCondition)
+ foreach ($kid in $kids) {
+ $cn = ConvertTo-UINode -Element $kid -Depth ($Depth + 1) -MaxDepth $MaxDepth
+ if ($null -ne $cn) {
+ if ($cn -is [array]) { $node.children += $cn } else { $node.children += $cn }
+ }
+ }
+ } catch {}
+ }
+ return $node
+}
+
+# ── Command: get-screen-context ───────────────────────────────────────────────
+function Cmd-GetScreenContext {
+ param($cmd)
+ $focusedPid = if ($cmd.focusedProcessId) { [int]$cmd.focusedProcessId } else { 0 }
+ $maxDepth = if ($cmd.maxDepth) { [int]$cmd.maxDepth } else { 8 }
+
+ $root = [System.Windows.Automation.AutomationElement]::RootElement
+ $winCond = New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::ControlTypeProperty,
+ [System.Windows.Automation.ControlType]::Window
+ )
+ $allWins = $root.FindAll([System.Windows.Automation.TreeScope]::Children, $winCond)
+
+ $windowList = @()
+ foreach ($win in $allWins) {
+ try {
+ $c = $win.Current
+ if (-not $c.Name -or $c.Name.Trim().Length -eq 0) { continue }
+ $pName = "unknown"
+ try { $pName = [System.Diagnostics.Process]::GetProcessById($c.ProcessId).ProcessName } catch {}
+ $rect = $c.BoundingRectangle
+ $bounds = if ([double]::IsInfinity($rect.X)) { @{ x=0;y=0;width=0;height=0 } }
+ else { @{ x=[Math]::Round($rect.X); y=[Math]::Round($rect.Y); width=[Math]::Round($rect.Width); height=[Math]::Round($rect.Height) } }
+ $isMin = $false
+ try {
+ $wp = $win.GetCurrentPattern([System.Windows.Automation.WindowPattern]::Pattern)
+ if ($wp.Current.WindowVisualState -eq [System.Windows.Automation.WindowVisualState]::Minimized) { $isMin = $true }
+ } catch {}
+ $windowList += [ordered]@{
+ handle = $c.NativeWindowHandle; title = $c.Name; processName = $pName
+ processId = $c.ProcessId; bounds = $bounds; isMinimized = $isMin
+ }
+ } catch {}
+ }
+
+ $uiTree = $null
+ if ($focusedPid -gt 0) {
+ $pidCond = New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::ProcessIdProperty, $focusedPid
+ )
+ $targetWin = $root.FindFirst([System.Windows.Automation.TreeScope]::Children, $pidCond)
+ if ($null -ne $targetWin) {
+ $uiTree = ConvertTo-UINode -Element $targetWin -Depth 0 -MaxDepth $maxDepth
+ }
+ }
+
+ return [ordered]@{ windows = $windowList; uiTree = $uiTree }
+}
+
+# ── Command: get-foreground-window ────────────────────────────────────────────
+function Cmd-GetForegroundWindow {
+ $fgWin = [Win32UIA]::GetForegroundWindow()
+ if ($fgWin -eq [IntPtr]::Zero) { return @{ error = "No foreground window" } }
+ $wpid = 0
+ [void][Win32UIA]::GetWindowThreadProcessId($fgWin, [ref]$wpid)
+ $pName = "unknown"
+ try { $pName = [System.Diagnostics.Process]::GetProcessById($wpid).ProcessName } catch {}
+ $title = ""
+ try {
+ $el = [System.Windows.Automation.AutomationElement]::FromHandle($fgWin)
+ if ($el) { $title = $el.Current.Name }
+ } catch {}
+ return [ordered]@{ handle=[int]$fgWin; processId=$wpid; processName=$pName; title=$title; success=$true }
+}
+
+# ── Command: focus-window ─────────────────────────────────────────────────────
+function Cmd-FocusWindow {
+ param($cmd)
+ $title = if ($cmd.title) { $cmd.title } else { "" }
+ $wpid = if ($cmd.processId) { [int]$cmd.processId } else { 0 }
+ $restore = if ($cmd.restore) { $true } else { $false }
+
+ $root = [System.Windows.Automation.AutomationElement]::RootElement
+ $winCond = New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::ControlTypeProperty,
+ [System.Windows.Automation.ControlType]::Window
+ )
+ $allWins = $root.FindAll([System.Windows.Automation.TreeScope]::Children, $winCond)
+
+ $target = $null
+ if ($wpid -gt 0) {
+ foreach ($w in $allWins) {
+ try { if ($w.Current.ProcessId -eq $wpid) { $target = $w; break } } catch {}
+ }
+ } elseif ($title -ne "") {
+ $tl = $title.ToLower()
+ foreach ($w in $allWins) {
+ try { if ($w.Current.Name -and $w.Current.Name.ToLower().Contains($tl)) { $target = $w; break } } catch {}
+ }
+ }
+
+ if ($null -eq $target) { return @{ success=$false; error="Window not found: title='$title' pid=$wpid" } }
+
+ if ($restore) {
+ try {
+ $wp = $target.GetCurrentPattern([System.Windows.Automation.WindowPattern]::Pattern)
+ if ($wp.Current.WindowVisualState -eq [System.Windows.Automation.WindowVisualState]::Minimized) {
+ $wp.SetWindowVisualState([System.Windows.Automation.WindowVisualState]::Normal)
+ Start-Sleep -Milliseconds 120
+ }
+ } catch {}
+ }
+
+ try { $target.SetFocus() } catch {
+ try {
+ $hwnd = [IntPtr]$target.Current.NativeWindowHandle
+ [Win32UIA]::ShowWindow($hwnd, 9) | Out-Null
+ Start-Sleep -Milliseconds 60
+ [Win32UIA]::SetForegroundWindow($hwnd) | Out-Null
+ } catch {}
+ }
+
+ $c = $target.Current
+ return [ordered]@{ success=$true; title=$c.Name; processId=$c.ProcessId; handle=$c.NativeWindowHandle }
+}
+
+# ── Command: find-element (fuzzy name match) ──────────────────────────────────
+function Cmd-FindElement {
+ param($cmd)
+ $name = if ($cmd.name) { $cmd.name } else { "" }
+ $automationId= if ($cmd.automationId){ $cmd.automationId } else { "" }
+ $controlType = if ($cmd.controlType) { $cmd.controlType } else { "" }
+ $wpid = if ($cmd.processId) { [int]$cmd.processId } else { 0 }
+ $maxResults = if ($cmd.maxResults) { [int]$cmd.maxResults } else { 20 }
+
+ $root = [System.Windows.Automation.AutomationElement]::RootElement
+ $searchRoot = $root
+ if ($wpid -gt 0) {
+ $pc = New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::ProcessIdProperty, $wpid
+ )
+ $searchRoot = $root.FindFirst([System.Windows.Automation.TreeScope]::Children, $pc)
+ if ($null -eq $searchRoot) { return ,(New-Object System.Object[] 0) }
+ }
+
+ $conditions = @()
+ if ($automationId -ne "") {
+ $conditions += New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::AutomationIdProperty, $automationId
+ )
+ }
+ if ($controlType -ne "" -and $ctMap.ContainsKey($controlType)) {
+ $conditions += New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::ControlTypeProperty, $ctMap[$controlType]
+ )
+ }
+
+ $searchCond = if ($conditions.Count -eq 0) { [System.Windows.Automation.Condition]::TrueCondition }
+ elseif ($conditions.Count -eq 1) { $conditions[0] }
+ else { New-Object System.Windows.Automation.AndCondition([System.Windows.Automation.Condition[]]$conditions) }
+
+ $elements = $searchRoot.FindAll([System.Windows.Automation.TreeScope]::Descendants, $searchCond)
+ $results = @()
+ $nameLower = $name.ToLower()
+
+ foreach ($el in $elements) {
+ if ($results.Count -ge $maxResults) { break }
+ try {
+ $c = $el.Current
+ if ($name -ne "") {
+ # Fuzzy: strip keyboard shortcut suffix ("Save\tCtrl+S" → "save"), then contains-match
+ $elName = ($c.Name -replace '\t.*$', '').Trim().ToLower()
+ if (-not $elName.Contains($nameLower) -and -not $nameLower.Contains($elName)) { continue }
+ if ($elName.Length -eq 0) { continue }
+ }
+ $rect = $c.BoundingRectangle
+ $bounds = if ([double]::IsInfinity($rect.X)) { @{x=0;y=0;width=0;height=0} }
+ else { @{x=[int]$rect.X;y=[int]$rect.Y;width=[int]$rect.Width;height=[int]$rect.Height} }
+ $results += [ordered]@{
+ name=$c.Name; automationId=$c.AutomationId; controlType=$c.ControlType.ProgrammaticName
+ className=$c.ClassName; processId=$c.ProcessId; isEnabled=$c.IsEnabled; bounds=$bounds
+ }
+ } catch {}
+ }
+ return ,$results
+}
+
+# ── Command: invoke-element (fuzzy name match) ────────────────────────────────
+function Cmd-InvokeElement {
+ param($cmd)
+ $name = if ($cmd.name) { $cmd.name } else { "" }
+ $automationId= if ($cmd.automationId){ $cmd.automationId } else { "" }
+ $controlType = if ($cmd.controlType) { $cmd.controlType } else { "" }
+ $wpid = [int]$cmd.processId
+ $action = $cmd.action
+ $value = if ($cmd.value) { $cmd.value } else { "" }
+
+ $root = [System.Windows.Automation.AutomationElement]::RootElement
+ $pc = New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::ProcessIdProperty, $wpid
+ )
+ $window = $root.FindFirst([System.Windows.Automation.TreeScope]::Children, $pc)
+ if ($null -eq $window) { return @{ success=$false; error="No window for pid $wpid" } }
+
+ # Find element: prefer automationId (exact), then fuzzy name walk
+ $element = $null
+ if ($automationId -ne "") {
+ $aidCond = New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::AutomationIdProperty, $automationId
+ )
+ $element = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $aidCond)
+ }
+
+ if ($null -eq $element -and $name -ne "") {
+ $nameLower = $name.ToLower()
+ $ctCond = if ($controlType -ne "" -and $ctMap.ContainsKey($controlType)) {
+ New-Object System.Windows.Automation.PropertyCondition(
+ [System.Windows.Automation.AutomationElement]::ControlTypeProperty, $ctMap[$controlType]
+ )
+ } else { [System.Windows.Automation.Condition]::TrueCondition }
+
+ $candidates = $window.FindAll([System.Windows.Automation.TreeScope]::Descendants, $ctCond)
+ # First pass: exact match after stripping shortcut suffix
+ foreach ($el in $candidates) {
+ try {
+ $elName = ($el.Current.Name -replace '\t.*$', '').Trim().ToLower()
+ if ($elName -eq $nameLower -and $elName.Length -gt 0) { $element = $el; break }
+ } catch {}
+ }
+ # Second pass: contains match
+ if ($null -eq $element) {
+ foreach ($el in $candidates) {
+ try {
+ $elName = ($el.Current.Name -replace '\t.*$', '').Trim().ToLower()
+ if ($elName.Length -gt 0 -and ($elName.Contains($nameLower) -or $nameLower.Contains($elName))) {
+ $element = $el; break
+ }
+ } catch {}
+ }
+ }
+ }
+
+ if ($null -eq $element) {
+ return @{ success=$false; error="Element not found: name='$name' id='$automationId' ct='$controlType'" }
+ }
+
+ switch ($action) {
+ "click" {
+ try {
+ $p = $element.GetCurrentPattern([System.Windows.Automation.InvokePattern]::Pattern)
+ $p.Invoke()
+ return @{ success=$true; action="click"; method="InvokePattern" }
+ } catch {
+ try {
+ $p = $element.GetCurrentPattern([System.Windows.Automation.TogglePattern]::Pattern)
+ $p.Toggle()
+ return @{ success=$true; action="click"; method="TogglePattern" }
+ } catch {
+ $rect = $element.Current.BoundingRectangle
+ return @{ success=$false; action="click"; error="No invoke/toggle pattern";
+ clickPoint=@{x=[int]($rect.X+$rect.Width/2);y=[int]($rect.Y+$rect.Height/2)} }
+ }
+ }
+ }
+ "set-value" {
+ if ($value -eq "") { return @{ success=$false; error="value required for set-value" } }
+ try {
+ $p = $element.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
+ $p.SetValue($value)
+ return @{ success=$true; action="set-value"; value=$value }
+ } catch {
+ return @{ success=$false; error="ValuePattern not supported: $($_.Exception.Message)" }
+ }
+ }
+ "get-value" {
+ try {
+ $p = $element.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
+ return @{ success=$true; action="get-value"; value=$p.Current.Value }
+ } catch {
+ try {
+ $p = $element.GetCurrentPattern([System.Windows.Automation.TextPattern]::Pattern)
+ return @{ success=$true; action="get-value"; value=$p.DocumentRange.GetText(-1); method="TextPattern" }
+ } catch {
+ return @{ success=$true; action="get-value"; value=$element.Current.Name; method="Name" }
+ }
+ }
+ }
+ "focus" {
+ try { $element.SetFocus(); return @{ success=$true; action="focus" } }
+ catch { return @{ success=$false; error="SetFocus failed: $($_.Exception.Message)" } }
+ }
+ "expand" {
+ try {
+ $p = $element.GetCurrentPattern([System.Windows.Automation.ExpandCollapsePattern]::Pattern)
+ $p.Expand(); return @{ success=$true; action="expand" }
+ } catch { return @{ success=$false; error="ExpandCollapsePattern not supported" } }
+ }
+ "select" {
+ try {
+ $p = $element.GetCurrentPattern([System.Windows.Automation.SelectionItemPattern]::Pattern)
+ $p.Select(); return @{ success=$true; action="select" }
+ } catch { return @{ success=$false; error="SelectionItemPattern not supported" } }
+ }
+ default { return @{ success=$false; error="Unknown action: $action" } }
+ }
+}
+
+# ── Command: get-focused-element ──────────────────────────────────────────────
+function Cmd-GetFocusedElement {
+ try {
+ $focused = [System.Windows.Automation.AutomationElement]::FocusedElement
+ if ($null -eq $focused) { return @{ success=$false; error="No focused element" } }
+ $cur = $focused.Current
+ $rect = $cur.BoundingRectangle
+ $bounds = if ([double]::IsInfinity($rect.X) -or [double]::IsInfinity($rect.Y)) {
+ @{ x=0; y=0; width=0; height=0 }
+ } else {
+ @{ x=[Math]::Round($rect.X); y=[Math]::Round($rect.Y); width=[Math]::Round($rect.Width); height=[Math]::Round($rect.Height) }
+ }
+ $typeName = if ($cur.ControlType) { $cur.ControlType.ProgrammaticName } else { "" }
+ # Try to read current value if it's an editable element
+ $value = ""
+ try {
+ $vp = $focused.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
+ $value = $vp.Current.Value
+ } catch {
+ try {
+ $tp = $focused.GetCurrentPattern([System.Windows.Automation.TextPattern]::Pattern)
+ $value = $tp.DocumentRange.GetText(1000)
+ } catch {}
+ }
+ return [ordered]@{
+ success = $true
+ name = if ($cur.Name) { $cur.Name } else { "" }
+ automationId = if ($cur.AutomationId) { $cur.AutomationId } else { "" }
+ controlType = $typeName
+ className = if ($cur.ClassName) { $cur.ClassName } else { "" }
+ processId = $cur.ProcessId
+ isEnabled = $cur.IsEnabled
+ bounds = $bounds
+ value = $value
+ }
+ } catch {
+ return @{ success=$false; error=$_.Exception.Message }
+ }
+}
+
+# ── Main: signal ready, then read commands ────────────────────────────────────
+[Console]::Out.WriteLine('{"ready":true}')
+[Console]::Out.Flush()
+
+while ($true) {
+ $line = [Console]::In.ReadLine()
+ if ($null -eq $line -or $line.Trim() -eq "EXIT") { break }
+ $line = $line.Trim()
+ if ($line -eq "") { continue }
+
+ try {
+ $cmd = $line | ConvertFrom-Json
+ $result = switch ($cmd.cmd) {
+ "get-screen-context" { Cmd-GetScreenContext $cmd }
+ "get-foreground-window" { Cmd-GetForegroundWindow }
+ "focus-window" { Cmd-FocusWindow $cmd }
+ "find-element" { Cmd-FindElement $cmd }
+ "invoke-element" { Cmd-InvokeElement $cmd }
+ "get-focused-element" { Cmd-GetFocusedElement }
+ "ping" { @{ pong=$true } }
+ default { @{ error="Unknown command: $($cmd.cmd)" } }
+ }
+ [Console]::Out.WriteLine(($result | ConvertTo-Json -Depth 50 -Compress))
+ } catch {
+ [Console]::Out.WriteLine((@{ error=$_.Exception.Message } | ConvertTo-Json -Compress))
+ }
+ [Console]::Out.Flush()
+}
diff --git a/src/__tests__/action-router.test.ts b/src/__tests__/action-router.test.ts
new file mode 100644
index 0000000..1c88684
--- /dev/null
+++ b/src/__tests__/action-router.test.ts
@@ -0,0 +1,186 @@
+/**
+ * Action Router tests.
+ *
+ * We test the routing logic (pattern matching, multi-step rejection, URL detection)
+ * without actually launching apps or moving the mouse.
+ *
+ * Strategy: mock the desktop and a11y dependencies so route() short-circuits
+ * or returns without side-effects.
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+// ── Mock heavy native deps before any import ──────────────────────────────────
+vi.mock('@nut-tree-fork/nut-js', () => ({
+ mouse: { config: {}, move: vi.fn(), click: vi.fn(), scrollDown: vi.fn(), scrollUp: vi.fn(), drag: vi.fn() },
+ keyboard: { config: {}, type: vi.fn(), pressKey: vi.fn(), releaseKey: vi.fn() },
+ screen: { grab: vi.fn(), grabRegion: vi.fn(), width: vi.fn().mockResolvedValue(1920), height: vi.fn().mockResolvedValue(1080) },
+ Button: { LEFT: 0, RIGHT: 1 },
+ Key: new Proxy({}, { get: (_t, p) => p }),
+ Point: class { constructor(public x: number, public y: number) {} },
+ Region: class { constructor(public left: number, public top: number, public width: number, public height: number) {} },
+}));
+
+vi.mock('sharp', () => ({
+ default: vi.fn(() => ({
+ resize: vi.fn().mockReturnThis(),
+ png: vi.fn().mockReturnThis(),
+ jpeg: vi.fn().mockReturnThis(),
+ toBuffer: vi.fn().mockResolvedValue(Buffer.from('')),
+ })),
+}));
+
+// Mock child_process so opening apps doesn't do anything
+vi.mock('child_process', () => ({
+ exec: vi.fn((_cmd: string, cb: Function) => cb(null, '', '')),
+ execFile: vi.fn((_cmd: string, _args: string[], cb: Function) => cb(null, '', '')),
+ spawn: vi.fn(() => ({ on: vi.fn(), stdout: { on: vi.fn() }, stderr: { on: vi.fn() } })),
+}));
+
+import { ActionRouter } from '../action-router';
+
+// ── Minimal fake desktop + a11y ───────────────────────────────────────────────
+function makeDesktop() {
+ return {
+ keyPress: vi.fn().mockResolvedValue(undefined),
+ typeText: vi.fn().mockResolvedValue(undefined),
+ executeMouseAction: vi.fn().mockResolvedValue(undefined),
+ executeKeyboardAction: vi.fn().mockResolvedValue(undefined),
+ captureForLLM: vi.fn().mockResolvedValue({ buffer: Buffer.from(''), scaleFactor: 1, llmWidth: 1280, llmHeight: 720 }),
+ getScreenSize: vi.fn().mockReturnValue({ width: 1920, height: 1080 }),
+ getScaleFactor: vi.fn().mockReturnValue(1.5),
+ connect: vi.fn().mockResolvedValue(undefined),
+ disconnect: vi.fn(),
+ } as any;
+}
+
+function makeA11y() {
+ return {
+ isShellAvailable: vi.fn().mockResolvedValue(true),
+ getWindows: vi.fn().mockResolvedValue([]),
+ getActiveWindow: vi.fn().mockResolvedValue({ title: 'Test', processName: 'test', pid: 1 }),
+ getFocusedElement: vi.fn().mockResolvedValue(null),
+ getAccessibilityTree: vi.fn().mockResolvedValue(''),
+ readClipboard: vi.fn().mockResolvedValue(''),
+ warmup: vi.fn().mockResolvedValue(undefined),
+ } as any;
+}
+
+describe('ActionRouter — multi-step rejection', () => {
+ let router: ActionRouter;
+ beforeEach(() => { router = new ActionRouter(makeA11y(), makeDesktop()); });
+
+ it('rejects "Open Notepad, type hello"', async () => {
+ const result = await router.route('Open Notepad, type hello');
+ expect(result.handled).toBe(false);
+ });
+
+ it('rejects "open chrome and search google"', async () => {
+ const result = await router.route('open chrome and search google');
+ expect(result.handled).toBe(false);
+ });
+
+ it('rejects "click OK, then close the window"', async () => {
+ const result = await router.route('click OK, then close the window');
+ expect(result.handled).toBe(false);
+ });
+
+ it('rejects "find the file and delete it"', async () => {
+ const result = await router.route('find the file and delete it');
+ expect(result.handled).toBe(false);
+ });
+
+ it('rejects "open notepad and then type a message"', async () => {
+ const result = await router.route('open notepad and then type a message');
+ expect(result.handled).toBe(false);
+ });
+});
+
+describe('ActionRouter — type routing', () => {
+ let router: ActionRouter;
+ let desktop: ReturnType<typeof makeDesktop>;
+
+ beforeEach(() => {
+ desktop = makeDesktop();
+ router = new ActionRouter(makeA11y(), desktop);
+ });
+
+ it('routes "type hello world"', async () => {
+ const result = await router.route('type hello world');
+ expect(result.handled).toBe(true);
+ // handleType calls desktop.typeText(text)
+ expect(desktop.typeText).toHaveBeenCalledWith('hello world');
+ });
+
+ it('routes "type \'quoted text\'"', async () => {
+ const result = await router.route("type 'quoted text'");
+ expect(result.handled).toBe(true);
+ expect(desktop.typeText).toHaveBeenCalledWith('quoted text');
+ });
+
+ it('routes "enter some text"', async () => {
+ const result = await router.route('enter some text');
+ expect(result.handled).toBe(true);
+ });
+
+ it('does NOT route "write an essay about dogs" (write = creative, not raw type)', async () => {
+ const result = await router.route('write an essay about dogs');
+ // "write" is excluded from the type pattern — should fall through
+ expect(result.handled).toBe(false);
+ });
+});
+
+describe('ActionRouter — URL navigation', () => {
+ let router: ActionRouter;
+
+ beforeEach(() => { router = new ActionRouter(makeA11y(), makeDesktop()); });
+
+ it('routes "go to https://github.com"', async () => {
+ const result = await router.route('go to https://github.com');
+ expect(result.handled).toBe(true);
+ expect(result.description).toMatch(/github\.com/i);
+ });
+
+ it('routes "navigate to www.google.com"', async () => {
+ const result = await router.route('navigate to www.google.com');
+ expect(result.handled).toBe(true);
+ });
+
+ it('routes "visit https://docs.anthropic.com"', async () => {
+ const result = await router.route('visit https://docs.anthropic.com');
+ expect(result.handled).toBe(true);
+ });
+
+ it('does NOT route bare non-URL text as URL', async () => {
+ const result = await router.route('go to the store');
+ // "the store" has no TLD → should not match url pattern
+ // It might still hit the open-app path or fall through
+ // We just verify it doesn't crash
+ expect(typeof result.handled).toBe('boolean');
+ });
+});
+
+describe('ActionRouter — telemetry', () => {
+ it('counts LLM fallbacks for compound tasks', async () => {
+ const router = new ActionRouter(makeA11y(), makeDesktop());
+ await router.route('open chrome and search for cats');
+ await router.route('type hello, then press enter');
+ const t = router.getTelemetry();
+ expect(t.llmFallbacks).toBe(2);
+ expect(t.totalRequests).toBe(2);
+ });
+
+ it('counts nonShortcutHandled for type tasks', async () => {
+ const router = new ActionRouter(makeA11y(), makeDesktop());
+ await router.route('type hello');
+ const t = router.getTelemetry();
+ expect(t.nonShortcutHandled).toBe(1);
+ });
+
+ it('resets telemetry', async () => {
+ const router = new ActionRouter(makeA11y(), makeDesktop());
+ await router.route('type hello');
+ router.resetTelemetry();
+ expect(router.getTelemetry().totalRequests).toBe(0);
+ });
+});
diff --git a/src/__tests__/coordinate-scaling.test.ts b/src/__tests__/coordinate-scaling.test.ts
new file mode 100644
index 0000000..12a6875
--- /dev/null
+++ b/src/__tests__/coordinate-scaling.test.ts
@@ -0,0 +1,96 @@
+/**
+ * Coordinate Scaling — pure math tests, zero native dependencies.
+ *
+ * Replicates the logic in native-desktop.ts without importing it
+ * (that file imports nut-js which requires native bindings).
+ */
+
+import { describe, it, expect } from 'vitest';
+
+const LLM_TARGET_WIDTH = 1280;
+
+function computeScaleFactor(screenWidth: number): number {
+ return screenWidth > LLM_TARGET_WIDTH ? screenWidth / LLM_TARGET_WIDTH : 1;
+}
+
+function llmToReal(llmX: number, llmY: number, scale: number): { x: number; y: number } {
+ return { x: Math.round(llmX * scale), y: Math.round(llmY * scale) };
+}
+
+function llmToRealWithMonitorOffset(
+ llmX: number, llmY: number, scale: number,
+ monitorOffsetX: number, monitorOffsetY: number,
+): { x: number; y: number } {
+ return {
+ x: Math.round(llmX * scale) + monitorOffsetX,
+ y: Math.round(llmY * scale) + monitorOffsetY,
+ };
+}
+
+describe('Scale factor computation', () => {
+ it('2560×1440 → scale 2.0', () => {
+ expect(computeScaleFactor(2560)).toBe(2.0);
+ });
+
+ it('1920×1080 → scale 1.5', () => {
+ expect(computeScaleFactor(1920)).toBe(1.5);
+ });
+
+ it('1280×720 → scale 1.0 (no scaling)', () => {
+ expect(computeScaleFactor(1280)).toBe(1.0);
+ });
+
+ it('800×600 → scale 1.0 (no upscaling)', () => {
+ expect(computeScaleFactor(800)).toBe(1.0);
+ });
+
+ it('3840×2160 (4K) → scale 3.0', () => {
+ expect(computeScaleFactor(3840)).toBe(3.0);
+ });
+});
+
+describe('LLM → real coordinate mapping', () => {
+ it('(640, 360) at scale 2.0 → (1280, 720)', () => {
+ expect(llmToReal(640, 360, 2.0)).toEqual({ x: 1280, y: 720 });
+ });
+
+ it('(0, 0) at any scale → (0, 0)', () => {
+ expect(llmToReal(0, 0, 2.0)).toEqual({ x: 0, y: 0 });
+ expect(llmToReal(0, 0, 1.5)).toEqual({ x: 0, y: 0 });
+ expect(llmToReal(0, 0, 1.0)).toEqual({ x: 0, y: 0 });
+ });
+
+ it('(100, 200) at scale 1.5 → (150, 300)', () => {
+ expect(llmToReal(100, 200, 1.5)).toEqual({ x: 150, y: 300 });
+ });
+
+ it('(640, 360) at scale 1.0 → (640, 360) unchanged', () => {
+ expect(llmToReal(640, 360, 1.0)).toEqual({ x: 640, y: 360 });
+ });
+
+ it('fractional result rounds correctly', () => {
+ // 100 * 1.5 = 150.0, 33 * 1.5 = 49.5 → rounds to 50
+ expect(llmToReal(100, 33, 1.5)).toEqual({ x: 150, y: 50 });
+ });
+});
+
+describe('Multi-monitor coordinate mapping', () => {
+ it('monitor at offset (1920, 0): LLM (100, 50) scale 1.0 → real (2020, 50)', () => {
+ expect(llmToRealWithMonitorOffset(100, 50, 1.0, 1920, 0)).toEqual({ x: 2020, y: 50 });
+ });
+
+ it('monitor at offset (0, 1080): LLM (0, 0) → real (0, 1080)', () => {
+ expect(llmToRealWithMonitorOffset(0, 0, 1.0, 0, 1080)).toEqual({ x: 0, y: 1080 });
+ });
+
+ it('monitor at offset (1920, 0) scale 2.0: LLM (640, 360) → real (4200, 720)', () => {
+ // x: 640 * 2 = 1280, plus monitor offset 1920 → 3200; y: 360 * 2 = 720
+ expect(llmToRealWithMonitorOffset(640, 360, 2.0, 1920, 0)).toEqual({ x: 3200, y: 720 });
+ });
+
+ it('primary monitor (offset 0,0): same as without offset', () => {
+ const withOffset = llmToRealWithMonitorOffset(200, 300, 1.5, 0, 0);
+ const without = llmToReal(200, 300, 1.5);
+ expect(withOffset).toEqual(without);
+ });
+});
diff --git a/src/__tests__/ocr-engine.test.ts b/src/__tests__/ocr-engine.test.ts
new file mode 100644
index 0000000..16f0b54
--- /dev/null
+++ b/src/__tests__/ocr-engine.test.ts
@@ -0,0 +1,376 @@
+/**
+ * OcrEngine tests.
+ *
+ * Tests the OCR bridge logic: availability detection, caching, coordinate
+ * offsetting for regions, JSON parsing from PowerShell output, and graceful
+ * degradation on errors.
+ *
+ * Strategy: mock nut-js (screen.grab), sharp, child_process, and fs so
+ * no actual screenshots or PowerShell processes are needed.
+ */
+
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+
+// ── Hoist mock functions so they're available inside vi.mock factories ────────
+const mockExecFile = vi.hoisted(() => vi.fn());
+const mockWriteFileSync = vi.hoisted(() => vi.fn());
+const mockUnlinkSync = vi.hoisted(() => vi.fn());
+
+// ── Mock heavy native deps before any import ──────────────────────────────────
+vi.mock('@nut-tree-fork/nut-js', () => ({
+ mouse: { config: {}, move: vi.fn(), click: vi.fn() },
+ keyboard: { config: {}, type: vi.fn() },
+ screen: {
+ grab: vi.fn().mockResolvedValue({
+ data: Buffer.alloc(4 * 100 * 100), // 100×100 RGBA
+ width: 100,
+ height: 100,
+ }),
+ },
+ Button: { LEFT: 0 },
+ Key: new Proxy({}, { get: (_t, p) => p }),
+ Point: class { constructor(public x: number, public y: number) {} },
+ Region: class { constructor(public left: number, public top: number, public width: number, public height: number) {} },
+}));
+
+vi.mock('sharp', () => ({
+ default: vi.fn(() => ({
+ resize: vi.fn().mockReturnThis(),
+ extract: vi.fn().mockReturnThis(),
+ png: vi.fn().mockReturnThis(),
+ jpeg: vi.fn().mockReturnThis(),
+ toBuffer: vi.fn().mockResolvedValue(Buffer.from('fake-png')),
+ })),
+}));
+
+// Mock fs — track writeFileSync / unlinkSync calls without touching disk
+vi.mock('fs', async () => {
+ const actual = await vi.importActual('fs');
+ return {
+ ...actual,
+ writeFileSync: (...args: unknown[]) => mockWriteFileSync(...args),
+ unlinkSync: (...args: unknown[]) => mockUnlinkSync(...args),
+ };
+});
+
+// Mock child_process.execFile — must support util.promisify returning { stdout, stderr }
+vi.mock('child_process', async () => {
+ const { promisify } = await import('util');
+
+ const execFileFn: any = (...args: unknown[]) => {
+ const cb = args[args.length - 1];
+ try {
+ const result = mockExecFile();
+ if (typeof cb === 'function') {
+ (cb as Function)(null, result?.stdout ?? '', result?.stderr ?? '');
+ }
+ } catch (err) {
+ if (typeof cb === 'function') {
+ (cb as Function)(err);
+ }
+ }
+ };
+
+ // Set custom promisify so that promisify(execFile) returns { stdout, stderr }
+ execFileFn[promisify.custom] = async (..._args: unknown[]) => {
+ const result = mockExecFile();
+ return { stdout: result?.stdout ?? '', stderr: result?.stderr ?? '' };
+ };
+
+ return {
+ execFile: execFileFn,
+ exec: vi.fn(),
+ spawn: vi.fn(),
+ };
+});
+
+// ── Import the module under test ──────────────────────────────────────────────
+import { OcrEngine } from '../ocr-engine';
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+const REAL_PLATFORM = process.platform;
+
+function setPlatform(p: string) {
+ Object.defineProperty(process, 'platform', { value: p, writable: true, configurable: true });
+}
+
+function restorePlatform() {
+ Object.defineProperty(process, 'platform', { value: REAL_PLATFORM, writable: true, configurable: true });
+}
+
+/** Sample OCR JSON output mimicking the PowerShell script's format */
+function sampleOcrJson(elements: object[] = [], fullText = 'Hello World') {
+ return JSON.stringify({ elements, fullText });
+}
+
+const SAMPLE_ELEMENTS = [
+ { text: 'Hello', x: 10, y: 20, width: 50, height: 15, confidence: 1.0, line: 0 },
+ { text: 'World', x: 70, y: 20, width: 55, height: 15, confidence: 1.0, line: 0 },
+ { text: 'Test', x: 10, y: 50, width: 40, height: 15, confidence: 1.0, line: 1 },
+];
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe('OcrEngine', () => {
+ beforeEach(() => {
+ mockExecFile.mockReset();
+ mockWriteFileSync.mockReset();
+ mockUnlinkSync.mockReset();
+ });
+
+ afterEach(() => {
+ restorePlatform();
+ });
+
+ // ── isAvailable ───────────────────────────────────────────────────────────
+
+ describe('isAvailable()', () => {
+ it('returns true on Windows', () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ expect(eng.isAvailable()).toBe(true);
+ });
+
+ it('returns false on macOS (stub)', () => {
+ setPlatform('darwin');
+ const eng = new OcrEngine();
+ expect(eng.isAvailable()).toBe(false);
+ });
+
+ it('returns false on Linux', () => {
+ setPlatform('linux');
+ const eng = new OcrEngine();
+ expect(eng.isAvailable()).toBe(false);
+ });
+
+ it('never throws on any platform', () => {
+ for (const p of ['win32', 'darwin', 'linux', 'freebsd']) {
+ setPlatform(p);
+ expect(() => new OcrEngine().isAvailable()).not.toThrow();
+ }
+ });
+
+ it('caches the result on subsequent calls', () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ expect(eng.isAvailable()).toBe(true);
+ // Changing platform shouldn't change cached result for same instance
+ setPlatform('linux');
+ expect(eng.isAvailable()).toBe(true);
+ });
+ });
+
+ // ── recognizeScreen ───────────────────────────────────────────────────────
+
+ describe('recognizeScreen()', () => {
+ it('returns EMPTY_RESULT when unavailable', async () => {
+ setPlatform('darwin');
+ const eng = new OcrEngine();
+ const result = await eng.recognizeScreen();
+ expect(result.elements).toEqual([]);
+ expect(result.fullText).toBe('');
+ });
+
+ it('parses OCR JSON output correctly on Windows', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: sampleOcrJson(SAMPLE_ELEMENTS, 'Hello World Test') });
+
+ const result = await eng.recognizeScreen();
+
+ expect(result.elements).toHaveLength(3);
+ expect(result.elements[0]).toEqual(expect.objectContaining({ text: 'Hello', x: 10, y: 20 }));
+ expect(result.elements[1]).toEqual(expect.objectContaining({ text: 'World', x: 70, y: 20 }));
+ expect(result.elements[2]).toEqual(expect.objectContaining({ text: 'Test', line: 1 }));
+ expect(result.fullText).toBe('Hello World Test');
+ expect(result.durationMs).toBeGreaterThanOrEqual(0);
+ });
+
+ it('saves screenshot to temp file and cleans up', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: sampleOcrJson([]) });
+
+ await eng.recognizeScreen();
+
+ expect(mockWriteFileSync).toHaveBeenCalledTimes(1);
+ expect(mockUnlinkSync).toHaveBeenCalledTimes(1);
+ // Temp path should contain "clawdcursor-ocr"
+ const writtenPath = mockWriteFileSync.mock.calls[0][0] as string;
+ expect(writtenPath).toContain('clawdcursor-ocr');
+ });
+
+ it('cleans up temp file even on OCR error', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: '{"error":"OCR failed"}' });
+
+ const result = await eng.recognizeScreen();
+
+ // Should degrade gracefully
+ expect(result.elements).toEqual([]);
+ expect(mockUnlinkSync).toHaveBeenCalledTimes(1);
+ });
+
+ it('handles empty elements array', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: sampleOcrJson([], '') });
+
+ const result = await eng.recognizeScreen();
+
+ expect(result.elements).toEqual([]);
+ expect(result.fullText).toBe('');
+ });
+
+ it('marks unavailable if first call fails', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockImplementation(() => { throw new Error('PowerShell not found'); });
+
+ const result = await eng.recognizeScreen();
+
+ expect(result.elements).toEqual([]);
+ expect(eng.isAvailable()).toBe(false);
+ });
+ });
+
+ // ── Cache behavior ────────────────────────────────────────────────────────
+
+ describe('caching', () => {
+ it('returns cached result within 300ms', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: sampleOcrJson(SAMPLE_ELEMENTS) });
+
+ const first = await eng.recognizeScreen();
+
+ // Change mock — but cached result should be returned
+ mockExecFile.mockReturnValue({
+ stdout: sampleOcrJson([{ text: 'Different', x: 0, y: 0, width: 10, height: 10, confidence: 1, line: 0 }]),
+ });
+ const second = await eng.recognizeScreen();
+
+ // Should be the cached result, not the new one
+ expect(second.elements).toEqual(first.elements);
+ // execFile should have been called only once (first call)
+ expect(mockExecFile).toHaveBeenCalledTimes(1);
+ });
+
+ it('invalidateCache() forces a fresh OCR call', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: sampleOcrJson(SAMPLE_ELEMENTS) });
+
+ await eng.recognizeScreen();
+ eng.invalidateCache();
+
+ mockExecFile.mockReturnValue({
+ stdout: sampleOcrJson([{ text: 'New', x: 0, y: 0, width: 10, height: 10, confidence: 1, line: 0 }]),
+ });
+ const result = await eng.recognizeScreen();
+
+ expect(result.elements[0].text).toBe('New');
+ expect(mockExecFile).toHaveBeenCalledTimes(2);
+ });
+ });
+
+ // ── recognizeRegion ───────────────────────────────────────────────────────
+
+ describe('recognizeRegion()', () => {
+ it('offsets coordinates to screen space', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ // OCR returns coordinates relative to the cropped region
+ const regionElements = [
+ { text: 'Button', x: 5, y: 10, width: 40, height: 12, confidence: 1.0, line: 0 },
+ ];
+ mockExecFile.mockReturnValue({ stdout: sampleOcrJson(regionElements) });
+
+ // Region starts at (20, 30) — within 100×100 mock screen
+ const result = await eng.recognizeRegion(20, 30, 50, 50);
+
+ // Coordinates should be offset: (5+20, 10+30)
+ expect(result.elements[0].x).toBe(25);
+ expect(result.elements[0].y).toBe(40);
+ });
+
+ it('clamps region to screen bounds', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: sampleOcrJson([]) });
+
+ // Screen is 100×100 (from mock). Request region beyond bounds.
+ const result = await eng.recognizeRegion(90, 90, 200, 200);
+
+ // Should not throw — clamped internally
+ expect(result.elements).toEqual([]);
+ });
+
+ it('returns EMPTY_RESULT when unavailable', async () => {
+ setPlatform('darwin');
+ const eng = new OcrEngine();
+ const result = await eng.recognizeRegion(0, 0, 100, 100);
+ expect(result.elements).toEqual([]);
+ });
+ });
+
+ // ── JSON parsing edge cases ───────────────────────────────────────────────
+
+ describe('JSON parsing', () => {
+ it('handles missing fields in elements gracefully', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({
+ stdout: JSON.stringify({
+ elements: [{ text: 'Hi' }], // missing x, y, width, height, confidence, line
+ fullText: 'Hi',
+ }),
+ });
+
+ const result = await eng.recognizeScreen();
+
+ expect(result.elements[0]).toEqual({
+ text: 'Hi',
+ x: 0,
+ y: 0,
+ width: 0,
+ height: 0,
+ confidence: 0,
+ line: 0,
+ });
+ });
+
+ it('handles null elements array', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({
+ stdout: JSON.stringify({ elements: null, fullText: '' }),
+ });
+
+ const result = await eng.recognizeScreen();
+ expect(result.elements).toEqual([]);
+ });
+
+ it('handles malformed JSON gracefully', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({ stdout: 'not-json{{{' });
+
+ const result = await eng.recognizeScreen();
+ expect(result.elements).toEqual([]);
+ });
+
+ it('handles error response from PowerShell', async () => {
+ setPlatform('win32');
+ const eng = new OcrEngine();
+ mockExecFile.mockReturnValue({
+ stdout: JSON.stringify({ error: 'No OCR languages installed' }),
+ });
+
+ const result = await eng.recognizeScreen();
+ expect(result.elements).toEqual([]);
+ });
+ });
+});
diff --git a/src/__tests__/safety.test.ts b/src/__tests__/safety.test.ts
new file mode 100644
index 0000000..ee743d0
--- /dev/null
+++ b/src/__tests__/safety.test.ts
@@ -0,0 +1,135 @@
+/**
+ * Safety Layer tests.
+ * Mocks native-desktop so nut-js is never loaded.
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+// Mock nut-js before any module that imports it
+vi.mock('@nut-tree-fork/nut-js', () => ({
+ mouse: { config: {}, move: vi.fn(), click: vi.fn(), scrollDown: vi.fn(), scrollUp: vi.fn(), drag: vi.fn() },
+ keyboard: { config: {}, type: vi.fn(), pressKey: vi.fn(), releaseKey: vi.fn() },
+ screen: { grab: vi.fn(), grabRegion: vi.fn(), width: vi.fn(), height: vi.fn() },
+ Button: { LEFT: 0, RIGHT: 1 },
+ Key: new Proxy({}, { get: (_t, p) => p }),
+ Point: class { constructor(public x: number, public y: number) {} },
+ Region: class { constructor(public left: number, public top: number, public width: number, public height: number) {} },
+}));
+
+vi.mock('sharp', () => ({ default: vi.fn(() => ({ resize: vi.fn().mockReturnThis(), png: vi.fn().mockReturnThis(), jpeg: vi.fn().mockReturnThis(), toBuffer: vi.fn().mockResolvedValue(Buffer.from('')) })) }));
+
+import { SafetyLayer } from '../safety';
+import { SafetyTier } from '../types';
+
+function makeConfig(overrides?: { blockedPatterns?: string[]; confirmPatterns?: string[] }) {
+ return {
+ safety: {
+ blockedPatterns: overrides?.blockedPatterns ?? [
+ 'format disk', 'rm -rf', 'shutdown', 'reboot', 'mkfs', 'dd if=', 'diskpart', ':(){:|:&};:',
+ 'reg delete', 'net user', 'Remove-Item -Recurse -Force C:',
+ ],
+ confirmPatterns: overrides?.confirmPatterns ?? ['delete all', 'wipe'],
+ requireConfirm: false,
+ },
+ ai: { model: 'test', provider: 'test' },
+ server: { port: 3847, host: '127.0.0.1' },
+ capture: { format: 'png', quality: 80 },
+ debug: false,
+ } as any;
+}
+
+function typeAction() {
+ return { kind: 'type' as const, text: 'hello world' };
+}
+function clickAction() {
+ return { kind: 'click' as const, x: 100, y: 200 };
+}
+
+describe('SafetyLayer — terminal type actions', () => {
+ let safety: SafetyLayer;
+ beforeEach(() => { safety = new SafetyLayer(makeConfig()); });
+
+ it('type in powershell description → Confirm', () => {
+ expect(safety.classify(typeAction(), 'type command in powershell')).toBe(SafetyTier.Confirm);
+ });
+
+ it('type in cmd description → Confirm', () => {
+ expect(safety.classify(typeAction(), 'enter text in cmd window')).toBe(SafetyTier.Confirm);
+ });
+
+ it('type in bash description → Confirm', () => {
+ expect(safety.classify(typeAction(), 'type ls -la in bash terminal')).toBe(SafetyTier.Confirm);
+ });
+
+ it('type in Windows Terminal (wt) → Confirm', () => {
+ expect(safety.classify(typeAction(), 'type command in wt')).toBe(SafetyTier.Confirm);
+ });
+
+ it('type in Notepad (non-terminal) → Preview', () => {
+ expect(safety.classify(typeAction(), 'type text in Notepad')).toBe(SafetyTier.Preview);
+ });
+
+ it('type in Word (non-terminal) → Preview', () => {
+ expect(safety.classify(typeAction(), 'type document content in Word')).toBe(SafetyTier.Preview);
+ });
+
+ it('type in browser address bar (non-terminal) → Preview', () => {
+ expect(safety.classify(typeAction(), 'type URL in Chrome address bar')).toBe(SafetyTier.Preview);
+ });
+});
+
+describe('SafetyLayer — blocked patterns', () => {
+ let safety: SafetyLayer;
+ beforeEach(() => { safety = new SafetyLayer(makeConfig()); });
+
+ it('format disk → Confirm', () => {
+ expect(safety.classify(clickAction(), 'format disk C:')).toBe(SafetyTier.Confirm);
+ });
+
+ it('rm -rf → Confirm', () => {
+ expect(safety.classify(typeAction(), 'rm -rf /')).toBe(SafetyTier.Confirm);
+ });
+
+ it('shutdown → Confirm', () => {
+ expect(safety.classify(clickAction(), 'shutdown now')).toBe(SafetyTier.Confirm);
+ });
+
+ it('reboot → Confirm', () => {
+ expect(safety.classify(clickAction(), 'reboot the system')).toBe(SafetyTier.Confirm);
+ });
+});
+
+describe('SafetyLayer — confirm patterns', () => {
+ let safety: SafetyLayer;
+ beforeEach(() => { safety = new SafetyLayer(makeConfig()); });
+
+ it('delete all → Confirm', () => {
+ expect(safety.classify(clickAction(), 'delete all files')).toBe(SafetyTier.Confirm);
+ });
+});
+
+describe('SafetyLayer — auto tier', () => {
+ let safety: SafetyLayer;
+ beforeEach(() => { safety = new SafetyLayer(makeConfig()); });
+
+ it('normal click → Auto', () => {
+ expect(safety.classify(clickAction(), 'click OK button')).toBe(SafetyTier.Auto);
+ });
+
+ it('mouse move → Auto', () => {
+ expect(safety.classify({ kind: 'move' as any, x: 0, y: 0 }, 'move mouse to top-left')).toBe(SafetyTier.Auto);
+ });
+});
+
+describe('SafetyLayer — isBlocked()', () => {
+ let safety: SafetyLayer;
+ beforeEach(() => { safety = new SafetyLayer(makeConfig()); });
+
+ it('rm -rf is blocked', () => {
+ expect(safety.isBlocked('rm -rf /')).toBe(true);
+ });
+
+ it('ordinary task is not blocked', () => {
+ expect(safety.isBlocked('open Chrome and go to github.com')).toBe(false);
+ });
+});
diff --git a/src/__tests__/shortcuts-tools.test.ts b/src/__tests__/shortcuts-tools.test.ts
new file mode 100644
index 0000000..e01ff67
--- /dev/null
+++ b/src/__tests__/shortcuts-tools.test.ts
@@ -0,0 +1,259 @@
+/**
+ * Shortcut tools tests.
+ *
+ * Tests the MCP-exposed shortcuts_list and shortcuts_execute tools.
+ * Verifies filtering by category/context, fuzzy matching, execution,
+ * auto-context detection, and error handling.
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+// ── Mock heavy native deps ────────────────────────────────────────────────────
+
+vi.mock('@nut-tree-fork/nut-js', () => ({
+ mouse: { config: {}, move: vi.fn(), click: vi.fn() },
+ keyboard: { config: {}, type: vi.fn() },
+ screen: { grab: vi.fn() },
+ Button: { LEFT: 0 },
+ Key: new Proxy({}, { get: (_t, p) => p }),
+ Point: class { constructor(public x: number, public y: number) {} },
+ Region: class { constructor(public left: number, public top: number, public width: number, public height: number) {} },
+}));
+
+vi.mock('sharp', () => ({
+ default: vi.fn(() => ({
+ resize: vi.fn().mockReturnThis(),
+ png: vi.fn().mockReturnThis(),
+ jpeg: vi.fn().mockReturnThis(),
+ toBuffer: vi.fn().mockResolvedValue(Buffer.from('fake')),
+ })),
+}));
+
+// ── Import after mocks ────────────────────────────────────────────────────────
+
+import { getShortcutTools } from '../tools/shortcuts';
+import type { ToolContext } from '../tools/types';
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+const mockKeyPress = vi.fn();
+const mockInvalidateCache = vi.fn();
+const mockGetActiveWindow = vi.fn();
+
+function createMockContext(): ToolContext {
+ return {
+ desktop: { keyPress: mockKeyPress },
+ a11y: {
+ invalidateCache: mockInvalidateCache,
+ getActiveWindow: mockGetActiveWindow,
+ },
+ cdp: {},
+ getMouseScaleFactor: () => 1,
+ getScreenshotScaleFactor: () => 1,
+ ensureInitialized: vi.fn(),
+ } as unknown as ToolContext;
+}
+
+function getListTool() {
+ return getShortcutTools().find(t => t.name === 'shortcuts_list')!;
+}
+
+function getExecuteTool() {
+ return getShortcutTools().find(t => t.name === 'shortcuts_execute')!;
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe('shortcuts_list', () => {
+ it('lists all universal shortcuts when no filters given', async () => {
+ const tool = getListTool();
+ const result = await tool.handler({}, createMockContext());
+
+ const data = JSON.parse(result.text);
+ expect(data.count).toBeGreaterThan(10);
+ expect(data.platform).toBeDefined();
+ // Should NOT include context-specific shortcuts (e.g. reddit upvote)
+ const ids = data.shortcuts.map((s: any) => s.id);
+ expect(ids).not.toContain('reddit-upvote');
+ expect(ids).not.toContain('x-like');
+ // Should include universal ones
+ expect(ids).toContain('scroll-down');
+ expect(ids).toContain('copy');
+ expect(ids).toContain('new-tab');
+ });
+
+ it('filters by category', async () => {
+ const tool = getListTool();
+ const result = await tool.handler({ category: 'browser' }, createMockContext());
+
+ const data = JSON.parse(result.text);
+ expect(data.count).toBeGreaterThan(0);
+ for (const s of data.shortcuts) {
+ expect(s.category).toBe('browser');
+ }
+ // Should contain browser shortcuts
+ const ids = data.shortcuts.map((s: any) => s.id);
+ expect(ids).toContain('new-tab');
+ expect(ids).toContain('close-tab');
+ expect(ids).toContain('refresh');
+ });
+
+ it('filters by context — includes universal + context-specific', async () => {
+ const tool = getListTool();
+ const result = await tool.handler({ context: 'reddit' }, createMockContext());
+
+ const data = JSON.parse(result.text);
+ const ids = data.shortcuts.map((s: any) => s.id);
+ // Should include reddit-specific shortcuts
+ expect(ids).toContain('reddit-upvote');
+ expect(ids).toContain('reddit-next');
+ // Should also include universal shortcuts
+ expect(ids).toContain('scroll-down');
+ expect(ids).toContain('copy');
+ // Should NOT include other context-specific shortcuts (outlook, x)
+ expect(ids).not.toContain('x-like');
+ expect(ids).not.toContain('outlook-new-message');
+ });
+
+ it('filters by both category and context', async () => {
+ const tool = getListTool();
+ const result = await tool.handler({ category: 'social', context: 'reddit' }, createMockContext());
+
+ const data = JSON.parse(result.text);
+ for (const s of data.shortcuts) {
+ expect(s.category).toBe('social');
+ }
+ const ids = data.shortcuts.map((s: any) => s.id);
+ expect(ids).toContain('reddit-upvote');
+ // x-like is social but wrong context
+ expect(ids).not.toContain('x-like');
+ });
+
+ it('returns helpful message when no shortcuts match', async () => {
+ const tool = getListTool();
+ const result = await tool.handler({ category: 'social' }, createMockContext());
+
+ // No context provided, so context-specific social shortcuts are excluded
+ // and there are no universal social shortcuts → empty
+ expect(result.text).toContain('No shortcuts found');
+ expect(result.text).toContain('Available categories');
+ });
+
+ it('includes key combos resolved for current platform', async () => {
+ const tool = getListTool();
+ const result = await tool.handler({ category: 'navigation' }, createMockContext());
+
+ const data = JSON.parse(result.text);
+ const scrollDown = data.shortcuts.find((s: any) => s.id === 'scroll-down');
+ expect(scrollDown).toBeDefined();
+ expect(scrollDown.key).toBe('PageDown');
+ });
+
+ it('includes usage hint in response', async () => {
+ const tool = getListTool();
+ const result = await tool.handler({}, createMockContext());
+
+ const data = JSON.parse(result.text);
+ expect(data.hint).toContain('shortcuts_execute');
+ });
+});
+
+describe('shortcuts_execute', () => {
+ beforeEach(() => {
+ mockKeyPress.mockReset();
+ mockInvalidateCache.mockReset();
+ mockGetActiveWindow.mockReset();
+ mockGetActiveWindow.mockResolvedValue({ processName: 'msedge', title: 'Reddit - Home' });
+ });
+
+ it('executes an exact match shortcut', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+ const result = await tool.handler({ intent: 'scroll down' }, ctx);
+
+ expect(mockKeyPress).toHaveBeenCalledWith('PageDown');
+ expect(mockInvalidateCache).toHaveBeenCalled();
+
+ const data = JSON.parse(result.text);
+ expect(data.executed).toBe('PageDown');
+ expect(data.matchType).toBe('exact');
+ expect(data.intent).toBe('scroll down');
+ });
+
+ it('executes a fuzzy match shortcut', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+    // "page down" is not an exact intent name but is close enough to fuzzy-match the scroll-down shortcut
+ const result = await tool.handler({ intent: 'page down' }, ctx);
+
+ expect(mockKeyPress).toHaveBeenCalledWith('PageDown');
+ const data = JSON.parse(result.text);
+ expect(data.executed).toBe('PageDown');
+ });
+
+ it('uses provided context for context-specific shortcuts', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+ const result = await tool.handler({ intent: 'upvote', context: 'reddit' }, ctx);
+
+ expect(mockKeyPress).toHaveBeenCalledWith('a');
+ const data = JSON.parse(result.text);
+ expect(data.executed).toBe('a');
+ expect(data.intent).toBe('upvote');
+ });
+
+ it('auto-detects context from active window', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+ mockGetActiveWindow.mockResolvedValue({ processName: 'msedge', title: 'Reddit - Popular' });
+
+ const result = await tool.handler({ intent: 'next post' }, ctx);
+
+ expect(mockKeyPress).toHaveBeenCalledWith('j');
+ const data = JSON.parse(result.text);
+ expect(data.executed).toBe('j');
+ });
+
+ it('returns error with suggestions when no match found', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+ const result = await tool.handler({ intent: 'fly to the moon' }, ctx);
+
+ expect(result.isError).toBe(true);
+ expect(result.text).toContain('No shortcut matched');
+ expect(result.text).toContain('Try one of these');
+ expect(mockKeyPress).not.toHaveBeenCalled();
+ });
+
+ it('reports active window in response', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+ mockGetActiveWindow.mockResolvedValue({ processName: 'notepad', title: 'Untitled' });
+
+ const result = await tool.handler({ intent: 'copy' }, ctx);
+
+ const data = JSON.parse(result.text);
+ expect(data.window).toContain('notepad');
+ expect(data.window).toContain('Untitled');
+ });
+
+ it('handles getActiveWindow failure gracefully', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+ mockGetActiveWindow.mockRejectedValue(new Error('A11y unavailable'));
+
+ // Should still execute the shortcut
+ const result = await tool.handler({ intent: 'paste' }, ctx);
+
+ expect(mockKeyPress).toHaveBeenCalled();
+ expect(result.isError).toBeUndefined();
+ });
+
+ it('calls ensureInitialized before executing', async () => {
+ const tool = getExecuteTool();
+ const ctx = createMockContext();
+ await tool.handler({ intent: 'undo' }, ctx);
+
+ expect(ctx.ensureInitialized).toHaveBeenCalled();
+ });
+});
diff --git a/src/__tests__/smart-tools.test.ts b/src/__tests__/smart-tools.test.ts
new file mode 100644
index 0000000..2adcd71
--- /dev/null
+++ b/src/__tests__/smart-tools.test.ts
@@ -0,0 +1,347 @@
+/**
+ * Smart tools tests.
+ *
+ * Tests the smart_click, smart_read, smart_type, and invoke_element MCP tools.
+ * Verifies a11y → CDP → OCR fallback chain, coordinate handling, and error cases.
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+// ── Mock heavy native deps ────────────────────────────────────────────────────
+
+vi.mock('@nut-tree-fork/nut-js', () => ({
+ mouse: { config: {}, move: vi.fn(), click: vi.fn(), setPosition: vi.fn() },
+ keyboard: { config: {}, type: vi.fn() },
+ screen: { grab: vi.fn() },
+ Button: { LEFT: 0 },
+ Key: new Proxy({}, { get: (_t, p) => p }),
+ Point: class { constructor(public x: number, public y: number) {} },
+ Region: class { constructor(public left: number, public top: number, public width: number, public height: number) {} },
+}));
+
+vi.mock('sharp', () => ({
+ default: vi.fn(() => ({
+ resize: vi.fn().mockReturnThis(),
+ png: vi.fn().mockReturnThis(),
+ jpeg: vi.fn().mockReturnThis(),
+ toBuffer: vi.fn().mockResolvedValue(Buffer.from('fake')),
+ })),
+}));
+
+// Mock OCR engine for OCR fallback tests
+vi.mock('../ocr-engine', () => {
+ return {
+ OcrEngine: class MockOcrEngine {
+ isAvailable() { return true; }
+ async recognizeScreen() {
+ return {
+ elements: [
+ { text: 'Submit', x: 100, y: 200, width: 80, height: 30, line: 1, confidence: 0.95 },
+ { text: 'Cancel', x: 200, y: 200, width: 80, height: 30, line: 1, confidence: 0.92 },
+ { text: 'File', x: 10, y: 10, width: 40, height: 20, line: 0, confidence: 0.98 },
+ ],
+ fullText: 'File Submit Cancel',
+ durationMs: 300,
+ };
+ }
+ invalidateCache() {}
+ },
+ };
+});
+
+// ── Import after mocks ────────────────────────────────────────────────────────
+
+import { getSmartTools } from '../tools/smart';
+import type { ToolContext } from '../tools/types';
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+const mockMouseClick = vi.fn();
+const mockKeyPress = vi.fn();
+const mockInvalidateCache = vi.fn();
+const mockGetActiveWindow = vi.fn().mockResolvedValue({
+ title: 'Test App',
+ processName: 'testapp',
+ processId: 1234,
+ bounds: { x: 0, y: 0, width: 1920, height: 1080 },
+});
+const mockInvokeElement = vi.fn();
+const mockFindElement = vi.fn();
+const mockGetFocusedElement = vi.fn();
+const mockGetScreenContext = vi.fn();
+const mockWriteClipboard = vi.fn();
+
+function createCtx(overrides?: Partial<ToolContext>): ToolContext {
+ return {
+ desktop: {
+ mouseClick: mockMouseClick,
+ keyPress: mockKeyPress,
+ },
+ a11y: {
+ getActiveWindow: mockGetActiveWindow,
+ invokeElement: mockInvokeElement,
+ findElement: mockFindElement,
+ getFocusedElement: mockGetFocusedElement,
+ getScreenContext: mockGetScreenContext,
+ writeClipboard: mockWriteClipboard,
+ invalidateCache: mockInvalidateCache,
+ },
+ cdp: {
+ isConnected: vi.fn().mockResolvedValue(false),
+ getPage: vi.fn().mockReturnValue(null),
+ },
+ getMouseScaleFactor: () => 1.5,
+ getScreenshotScaleFactor: () => 3.0,
+ ensureInitialized: vi.fn(),
+ ...overrides,
+ };
+}
+
+beforeEach(() => {
+ vi.clearAllMocks();
+ mockGetActiveWindow.mockResolvedValue({
+ title: 'Test App',
+ processName: 'testapp',
+ processId: 1234,
+ bounds: { x: 0, y: 0, width: 1920, height: 1080 },
+ });
+});
+
+// ── Tests ──────────────────────────────────────────────────────────────────────
+
+describe('Smart Tools', () => {
+ const tools = getSmartTools();
+ const smartClick = tools.find(t => t.name === 'smart_click')!;
+ const smartRead = tools.find(t => t.name === 'smart_read')!;
+ const smartType = tools.find(t => t.name === 'smart_type')!;
+ const invokeEl = tools.find(t => t.name === 'invoke_element')!;
+
+ it('registers all 4 smart tools', () => {
+ expect(tools).toHaveLength(4);
+ expect(tools.map(t => t.name)).toEqual(['smart_read', 'smart_click', 'smart_type', 'invoke_element']);
+ });
+
+ // ── smart_click ──
+
+ describe('smart_click', () => {
+ it('clicks via UIA invoke when available', async () => {
+ mockInvokeElement.mockResolvedValue({ success: true });
+ const ctx = createCtx();
+ const result = await smartClick.handler({ target: 'Submit' }, ctx);
+ expect(result.text).toContain('UI Automation');
+ expect(result.text).toContain('invoke_element');
+ expect(mockInvalidateCache).toHaveBeenCalled();
+ });
+
+ it('uses a11y coordinate fallback when OCR has no match but a11y has bounds', async () => {
+ // Use a target that OCR won't find — only a11y can locate it
+ mockInvokeElement.mockResolvedValue({ success: false, clickPoint: { x: 500, y: 300 } });
+ const ctx = createCtx();
+ const result = await smartClick.handler({ target: 'UniqueA11yButton' }, ctx);
+ expect(result.text).toContain('a11y bounds');
+ expect(result.text).toContain('coordinate fallback');
+ // Coordinates should be passed directly — no a11yToMouse conversion
+ expect(mockMouseClick).toHaveBeenCalledWith(500, 300);
+ });
+
+ it('falls through to OCR when UIA fails entirely', async () => {
+ mockInvokeElement.mockResolvedValue({ success: false });
+ const ctx = createCtx();
+ const result = await smartClick.handler({ target: 'Submit' }, ctx);
+ // Should find "Submit" in OCR elements and click at center
+ expect(result.text).toContain('OCR');
+ expect(result.text).toContain('Submit');
+ // OCR element: x=100, y=200, width=80, height=30 → center at (140, 215)
+ expect(mockMouseClick).toHaveBeenCalledWith(140, 215);
+ });
+
+ it('skips UIA for known empty-a11y apps', async () => {
+ mockGetActiveWindow.mockResolvedValue({
+ title: 'Terminal',
+ processName: 'windowsterminal',
+ processId: 5678,
+ bounds: { x: 0, y: 0, width: 1920, height: 1080 },
+ });
+ const ctx = createCtx();
+ const result = await smartClick.handler({ target: 'Submit' }, ctx);
+ // Should skip UIA and go to OCR
+ expect(mockInvokeElement).not.toHaveBeenCalled();
+ expect(result.text).toContain('OCR');
+ });
+
+ it('reports all attempted methods on total failure', async () => {
+ mockInvokeElement.mockResolvedValue({ success: false });
+ const ctx = createCtx();
+ // OCR won't match "NonexistentButton"
+ const result = await smartClick.handler({ target: 'NonexistentButton' }, ctx);
+ expect(result.isError).toBe(true);
+ expect(result.text).toContain('smart_click failed');
+ expect(result.text).toContain('Attempted');
+ });
+
+ it('matches OCR elements with partial text match', async () => {
+ mockInvokeElement.mockResolvedValue({ success: false });
+ const ctx = createCtx();
+ const result = await smartClick.handler({ target: 'Sub' }, ctx);
+ // "Sub" partially matches "Submit"
+ expect(result.text).toContain('OCR');
+ expect(result.text).toContain('Submit');
+ });
+ });
+
+ // ── smart_read ──
+
+ describe('smart_read', () => {
+ it('reads via OCR primary with a11y supplement for window scope', async () => {
+ mockGetScreenContext.mockResolvedValue('Full a11y tree here...\nWith multiple lines\nAnd buttons');
+ const ctx = createCtx();
+ const result = await smartRead.handler({ scope: 'window' }, ctx);
+ // OCR is primary — should appear first
+ expect(result.text).toContain('[via OCR');
+ // a11y tree should be appended as supplement
+ expect(result.text).toContain('=== A11Y TREE (supplement) ===');
+ expect(result.text).toContain('Full a11y tree here...');
+ });
+
+ it('reads focused element for focused scope', async () => {
+ mockGetFocusedElement.mockResolvedValue({
+ name: 'Search',
+ controlType: 'ControlType.Edit',
+ bounds: { x: 100, y: 200, width: 300, height: 30 },
+ });
+ const ctx = createCtx();
+ const result = await smartRead.handler({ scope: 'focused' }, ctx);
+ expect(result.text).toContain('[via UI Automation focused element]');
+ expect(result.text).toContain('Search');
+ });
+
+ it('falls through to OCR when a11y returns empty', async () => {
+ mockGetScreenContext.mockResolvedValue('');
+ const ctx = createCtx();
+ const result = await smartRead.handler({ scope: 'window' }, ctx);
+ expect(result.text).toContain('[via OCR');
+ });
+
+ it('skips a11y for known empty-a11y apps', async () => {
+ mockGetActiveWindow.mockResolvedValue({
+ title: 'Terminal',
+ processName: 'windowsterminal',
+ processId: 5678,
+ bounds: { x: 0, y: 0, width: 1920, height: 1080 },
+ });
+ mockGetScreenContext.mockResolvedValue('');
+ const ctx = createCtx();
+ const result = await smartRead.handler({ scope: 'window' }, ctx);
+ // Should skip to OCR without trying a11y
+ expect(mockGetScreenContext).not.toHaveBeenCalled();
+ expect(result.text).toContain('[via OCR');
+ });
+
+ it('searches for specific target element via a11y', async () => {
+ mockFindElement.mockResolvedValue([{
+ name: 'Submit',
+ controlType: 'ControlType.Button',
+ automationId: 'btn-submit',
+ bounds: { x: 100, y: 200, width: 80, height: 30 },
+ isEnabled: true,
+ }]);
+ const ctx = createCtx();
+ const result = await smartRead.handler({ target: 'Submit' }, ctx);
+ expect(result.text).toContain('[via UI Automation search]');
+ expect(result.text).toContain('Submit');
+ });
+ });
+
+ // ── smart_type ──
+
+ describe('smart_type', () => {
+ it('types into currently focused element when no target specified', async () => {
+ const ctx = createCtx();
+ const result = await smartType.handler({ text: 'Hello world' }, ctx);
+ expect(mockWriteClipboard).toHaveBeenCalledWith('Hello world');
+ expect(mockKeyPress).toHaveBeenCalledWith('ctrl+v');
+ expect(result.text).toContain('11 chars');
+ });
+
+ it('focuses target element via UIA before typing', async () => {
+ mockInvokeElement.mockResolvedValue({ success: true });
+ const ctx = createCtx();
+ const result = await smartType.handler({ text: 'test', target: 'Search box' }, ctx);
+ expect(mockInvokeElement).toHaveBeenCalledWith(
+ expect.objectContaining({ name: 'Search box', action: 'focus' })
+ );
+ expect(mockWriteClipboard).toHaveBeenCalledWith('test');
+ expect(result.text).toContain('into "Search box"');
+ });
+
+ it('clicks to focus when UIA focus fails but bounds available', async () => {
+ mockInvokeElement.mockResolvedValue({ success: false, clickPoint: { x: 400, y: 250 } });
+ const ctx = createCtx();
+ const result = await smartType.handler({ text: 'test', target: 'Input field' }, ctx);
+ // Should click at coordinates directly (no a11yToMouse conversion)
+ expect(mockMouseClick).toHaveBeenCalledWith(400, 250);
+ expect(mockWriteClipboard).toHaveBeenCalledWith('test');
+ });
+
+ it('returns error when target element cannot be found', async () => {
+ mockInvokeElement.mockResolvedValue({ success: false });
+ const ctx = createCtx();
+ const result = await smartType.handler({ text: 'test', target: 'Nonexistent' }, ctx);
+ expect(result.isError).toBe(true);
+ expect(result.text).toContain('Could not find element');
+ });
+ });
+
+ // ── invoke_element ──
+
+ describe('invoke_element', () => {
+ it('invokes element by name', async () => {
+ mockInvokeElement.mockResolvedValue({ success: true });
+ const ctx = createCtx();
+ const result = await invokeEl.handler({ name: 'Save', action: 'click' }, ctx);
+ expect(result.text).toContain('Invoked "Save"');
+ expect(result.text).toContain('click');
+ });
+
+ it('invokes element by automationId', async () => {
+ mockInvokeElement.mockResolvedValue({ success: true, value: 'Hello' });
+ const ctx = createCtx();
+ const result = await invokeEl.handler({ automationId: 'txtSearch', action: 'get-value' }, ctx);
+ expect(result.text).toContain('txtSearch');
+ expect(result.text).toContain('Hello');
+ });
+
+ it('requires either name or automationId', async () => {
+ const ctx = createCtx();
+ const result = await invokeEl.handler({ action: 'click' }, ctx);
+ expect(result.isError).toBe(true);
+ expect(result.text).toContain('required');
+ });
+
+ it('uses coordinate fallback for click when invoke fails', async () => {
+ mockInvokeElement.mockResolvedValue({ success: false, clickPoint: { x: 300, y: 150 } });
+ const ctx = createCtx();
+ const result = await invokeEl.handler({ name: 'Button', action: 'click' }, ctx);
+ expect(result.text).toContain('coordinate fallback');
+ // Coordinates passed directly — no conversion
+ expect(mockMouseClick).toHaveBeenCalledWith(300, 150);
+ });
+
+ it('returns error when element not found and no coordinates', async () => {
+ mockInvokeElement.mockResolvedValue({ success: false, error: 'Element not found' });
+ const ctx = createCtx();
+ const result = await invokeEl.handler({ name: 'Ghost', action: 'click' }, ctx);
+ expect(result.isError).toBe(true);
+ expect(result.text).toContain('Element not found');
+ });
+
+ it('defaults action to click when not specified', async () => {
+ mockInvokeElement.mockResolvedValue({ success: true });
+ const ctx = createCtx();
+ await invokeEl.handler({ name: 'Button' }, ctx);
+ expect(mockInvokeElement).toHaveBeenCalledWith(
+ expect.objectContaining({ action: 'click' })
+ );
+ });
+ });
+});
diff --git a/src/__tests__/verifiers.test.ts b/src/__tests__/verifiers.test.ts
new file mode 100644
index 0000000..a2cf2e3
--- /dev/null
+++ b/src/__tests__/verifiers.test.ts
@@ -0,0 +1,167 @@
+/**
+ * Verifier tests.
+ * All a11y and LLM calls are mocked — no native desktop access.
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+// ── Mock native deps ──────────────────────────────────────────────────────────
+vi.mock('@nut-tree-fork/nut-js', () => ({
+ mouse: { config: {} }, keyboard: { config: {} },
+ screen: { grab: vi.fn(), grabRegion: vi.fn() },
+ Button: {}, Key: new Proxy({}, { get: (_t, p) => p }),
+ Point: class { constructor(public x: number, public y: number) {} },
+ Region: class { constructor(public left: number, public top: number, public width: number, public height: number) {} },
+}));
+
+vi.mock('sharp', () => ({
+ default: vi.fn(() => ({ resize: vi.fn().mockReturnThis(), png: vi.fn().mockReturnThis(), toBuffer: vi.fn().mockResolvedValue(Buffer.from('')) })),
+}));
+
+import { TaskVerifier, type VerifyResult } from '../verifiers';
+import type { AccessibilityBridge } from '../accessibility';
+import type { PipelineConfig } from '../providers';
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+function makeA11y(overrides?: Partial<AccessibilityBridge>): AccessibilityBridge {
+ return {
+ getActiveWindow: vi.fn().mockResolvedValue({ title: 'Notepad', processName: 'notepad', pid: 1 }),
+ getFocusedElement: vi.fn().mockResolvedValue({ value: '', name: '', role: 'edit' }),
+ getAccessibilityTree: vi.fn().mockResolvedValue(''),
+ readClipboard: vi.fn().mockResolvedValue(''),
+ getWindows: vi.fn().mockResolvedValue([]),
+ isShellAvailable: vi.fn().mockResolvedValue(true),
+ warmup: vi.fn().mockResolvedValue(undefined),
+ ...overrides,
+ } as any;
+}
+
+function makePipelineConfig(textModelOverride?: any): PipelineConfig {
+ return {
+ provider: 'openai',
+ providerKey: 'sk-test',
+ apiKey: 'sk-test',
+ layer1: { enabled: true },
+ layer2: { enabled: true, model: 'gpt-4o-mini', baseUrl: 'https://api.openai.com/v1', apiKey: 'sk-test', provider: 'openai' },
+ layer3: { enabled: false, model: 'gpt-4o', baseUrl: '', apiKey: '', provider: 'openai' },
+ ...(textModelOverride ?? {}),
+ } as any;
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe('TaskVerifier — always returns attemptLog', () => {
+ it('attemptLog is an array even when no checks run', async () => {
+ const verifier = new TaskVerifier(makeA11y());
+ // No pipelineConfig → LLM verifier is skipped; no fast-path patterns match
+ const result = await verifier.verify('do something completely unrecognized XYZ');
+ expect(Array.isArray(result.attemptLog)).toBe(true);
+ });
+
+ it('attemptLog is populated when fast-path runs', async () => {
+ const a11y = makeA11y({
+ getActiveWindow: vi.fn().mockResolvedValue({ title: 'Notepad', processName: 'notepad', pid: 1 }),
+ });
+ const verifier = new TaskVerifier(a11y);
+ const result = await verifier.verify('open notepad');
+ expect(result.attemptLog.length).toBeGreaterThan(0);
+ expect(result.attemptLog[0]).toHaveProperty('checkName');
+ expect(result.attemptLog[0]).toHaveProperty('durationMs');
+ });
+});
+
+describe('TaskVerifier — error passthrough is FAIL not PASS', () => {
+ it('a11y error does not silently pass', async () => {
+ const a11y = makeA11y({
+ getActiveWindow: vi.fn().mockRejectedValue(new Error('UIA bridge crashed')),
+ });
+ const verifier = new TaskVerifier(a11y);
+ // "open notepad" triggers the app_open_check fast-path which calls getActiveWindow
+ const result = await verifier.verify('open notepad');
+ // Result should not be a confident PASS when the a11y bridge throws
+ // Either it fails, or it's low-confidence
+ if (result.pass) {
+ expect(result.confidence).toBeLessThan(0.7);
+ } else {
+ expect(result.pass).toBe(false);
+ }
+ });
+});
+
+describe('TaskVerifier — verifyAppOpen fast-path', () => {
+ it('passes when the right process name is in the active window', async () => {
+ const a11y = makeA11y({
+ getActiveWindow: vi.fn().mockResolvedValue({ title: 'Untitled - Notepad', processName: 'notepad', pid: 42 }),
+ });
+ const verifier = new TaskVerifier(a11y);
+ const result = await verifier.verify('open notepad');
+ // Should find "notepad" in active window processName and pass
+ const appCheck = result.attemptLog.find(a => a.checkName === 'app_open_check');
+ expect(appCheck).toBeDefined();
+ if (appCheck) {
+ expect(appCheck.pass).toBe(true);
+ }
+ });
+
+ it('fails when a different process is active', async () => {
+ const a11y = makeA11y({
+ getActiveWindow: vi.fn().mockResolvedValue({ title: 'Chrome - Google', processName: 'chrome', pid: 99 }),
+ });
+ const verifier = new TaskVerifier(a11y);
+ const result = await verifier.verify('open notepad');
+ const appCheck = result.attemptLog.find(a => a.checkName === 'app_open_check');
+ // Chrome is active but we wanted Notepad — should not be a high-confidence pass
+ if (appCheck) {
+ if (appCheck.pass) {
+ // Chrome active but Notepad expected — should not be a confident pass
+ expect(appCheck.confidence).toBeLessThanOrEqual(0.85);
+ } else {
+ expect(appCheck.pass).toBe(false);
+ }
+ }
+ });
+});
+
+describe('TaskVerifier — verifyClipboardHasContent fast-path', () => {
+ it('passes when clipboard has content', async () => {
+ const a11y = makeA11y();
+ const verifier = new TaskVerifier(a11y);
+ const readClip = vi.fn().mockResolvedValue('some copied text');
+ const result = await verifier.verify('copy the selected text', readClip);
+ const clipCheck = result.attemptLog.find(a => a.checkName === 'clipboard_check');
+ expect(clipCheck).toBeDefined();
+ if (clipCheck) {
+ expect(clipCheck.pass).toBe(true);
+ }
+ });
+
+ it('fails when clipboard is empty', async () => {
+ const a11y = makeA11y();
+ const verifier = new TaskVerifier(a11y);
+ const readClip = vi.fn().mockResolvedValue('');
+ const result = await verifier.verify('copy the selected text', readClip);
+ const clipCheck = result.attemptLog.find(a => a.checkName === 'clipboard_check');
+ if (clipCheck) {
+ expect(clipCheck.pass).toBe(false);
+ }
+ });
+});
+
+describe('TaskVerifier — no pipelineConfig falls back gracefully', () => {
+ it('unknown task without pipelineConfig returns a result without crashing', async () => {
+ const verifier = new TaskVerifier(makeA11y()); // no config
+ const result = await verifier.verify('schedule a meeting for tomorrow at 3pm');
+ expect(result).toHaveProperty('pass');
+ expect(result).toHaveProperty('attemptLog');
+ expect(typeof result.pass).toBe('boolean');
+ });
+
+ it('unknown task default is not blindly PASS with high confidence', async () => {
+ const verifier = new TaskVerifier(makeA11y());
+ const result = await verifier.verify('xyzzy completely unknown nonexistent task zzz');
+ // Without an LLM to verify, confidence should be low
+ if (result.pass) {
+ expect(result.confidence).toBeLessThan(0.6);
+ }
+ });
+});
diff --git a/src/a11y-click-resolver.ts b/src/a11y-click-resolver.ts
new file mode 100644
index 0000000..5c614a9
--- /dev/null
+++ b/src/a11y-click-resolver.ts
@@ -0,0 +1,66 @@
+/**
+ * A11y Click Resolver — resolves element names to screen coordinates
+ * using the accessibility tree's bounding rectangles.
+ *
+ * Zero LLM cost. Zero vision model calls. Just math.
+ */
+
+import { AccessibilityBridge } from './accessibility';
+
+export class A11yClickResolver {
+ constructor(private a11y: AccessibilityBridge) {}
+
+ /**
+ * Resolve an element to its center coordinates using the a11y tree.
+ * Returns null if the element isn't found or has no valid bounds.
+ */
+ async resolve(
+ name: string,
+ controlType?: string,
+ processId?: number,
+ ): Promise<{ x: number; y: number } | null> {
+ const elements = await this.a11y.findElement({
+ name,
+ ...(controlType && { controlType }),
+ ...(processId && { processId }),
+ });
+ if (!elements?.length) return null;
+
+ const b = elements[0].bounds;
+ if (!this.isValidBounds(b)) return null;
+
+ return {
+ x: b.x + Math.floor(b.width / 2),
+ y: b.y + Math.floor(b.height / 2),
+ };
+ }
+
+ /**
+ * Resolve by automationId instead of name.
+ */
+ async resolveById(
+ automationId: string,
+ processId?: number,
+ ): Promise<{ x: number; y: number } | null> {
+ const elements = await this.a11y.findElement({
+ automationId,
+ ...(processId && { processId }),
+ });
+ if (!elements?.length) return null;
+
+ const b = elements[0].bounds;
+ if (!this.isValidBounds(b)) return null;
+
+ return {
+ x: b.x + Math.floor(b.width / 2),
+ y: b.y + Math.floor(b.height / 2),
+ };
+ }
+
+ private isValidBounds(b: { x: number; y: number; width: number; height: number } | undefined): boolean {
+ if (!b || b.width <= 0 || b.height <= 0) return false;
+ // Reject off-screen or absurd coordinates (e.g. y: -29503)
+ if (b.x < -100 || b.y < -100 || b.x > 10000 || b.y > 10000) return false;
+ return true;
+ }
+}
diff --git a/src/a11y-reasoner.ts b/src/a11y-reasoner.ts
index 21742c9..9b4378b 100644
--- a/src/a11y-reasoner.ts
+++ b/src/a11y-reasoner.ts
@@ -1,181 +1,1111 @@
/**
- * Accessibility Reasoner — Layer 2 in the pipeline.
+ * Accessibility Reasoner — Layer 2.
*
- * Takes the accessibility tree (text) + a subtask description,
- * sends it to a cheap/fast text-only LLM, and gets back a
- * structured action. No screenshots, no vision model.
+ * "Blind man" navigation: reads the a11y tree, decides an action, executes it,
+ * re-reads the tree, verifies progress, repeats — until done or unsure.
*
- * This handles most desktop app interactions:
- * - Click buttons by name
- * - Fill text fields
- * - Navigate menus
- * - Select list/tree items
- * - Read element values
- *
- * Falls through to Layer 3 (screenshot) when:
- * - No matching element in the accessibility tree
- * - Task requires visual understanding (layout, colors, images)
- * - LLM returns "unsure" or invalid response
+ * Key changes vs v0.6.3:
+ * - Real verify loop (read → act → read again → confirm) instead of one-shot
+ * - Cache is invalidated before every read so the LLM never sees stale state
+ * - "done" response requires explicit evidence from the UI tree
+ * - NativeDesktop injected so the reasoner can execute actions itself
+ * - Falls through to vision only when genuinely stuck
*/
+import * as fs from 'fs';
+import * as path from 'path';
import { AccessibilityBridge } from './accessibility';
+import { A11yClickResolver } from './a11y-click-resolver';
+import { NativeDesktop } from './native-desktop';
import type { PipelineConfig } from './providers';
import type { InputAction, A11yAction } from './types';
+import { CDPDriver } from './cdp-driver';
+
+const MAX_LOOP_STEPS = 50; // text LLM stays in control — vision is its coordinate tool
+const SETTLE_MS = 300; // wait after action before re-reading tree
+const BROWSER_PROCESS_RE = /msedge|chrome|chromium/i; // shared regex — avoids duplicating in 4+ places
+const MAX_ACTION_HISTORY = 40; // cap action history to prevent unbounded growth
+
+const SYSTEM_PROMPT = `You control a Windows desktop via the accessibility tree and keyboard. You are an AUTONOMOUS REASONING AGENT. At EVERY step you must:
+1. READ the current UI state carefully
+2. THINK about what you see vs what the task requires
+3. DECIDE what action brings you closer to completion
+4. ACT — then loop back to step 1
+
+You are NOT a command executor. You REASON about the task and COMPOSE original content when needed.
+
+══════════════════════════════════════════
+DECISION TREE — follow in strict order
+══════════════════════════════════════════
+
+① WRONG APP? (check first, every step)
+ If the process is completely wrong (e.g. you're in Notepad but need Edge):
+ → {"action":"needs_human","reason":"wrong_window","description":"Focused is [wrong process], need [target]"}
+ If you're in the RIGHT app but on the WRONG page/tab/document:
+ → FIX IT YOURSELF. Navigate, click, use File > New, etc.
+ → NEVER return needs_human when you can navigate or interact to get to the right place.
+ → You are a capable agent — solve navigation problems, don't ask for help.
+
+② TASK COMPLETE?
+ Does UI state NOW confirm the task is FULLY done? ALL parts of the task must be verified.
+ → {"action":"done","evidence":"specific text or element that proves completion"}
+ Evidence must be CONCRETE PROOF visible on screen. Examples:
+ - "write a sentence on dogs" → evidence must quote THE ACTUAL SENTENCE you typed, visible in the document
+ - "search for flights" → evidence: flight results are displayed on the page
+ - "send email" → evidence: sent confirmation or compose window closed
+ CRITICAL RULES for done:
+ - NEVER declare done just because you navigated somewhere. The task actions must be COMPLETED.
+ - NEVER declare done if you haven't performed the core action (writing, clicking, typing, etc.)
+ - If the task says "write" — you must have COMPOSED and TYPED actual text. Just opening a doc is NOT done.
+ - If the task says "in a new document" — you must have CREATED a new document first.
+
+② B. CONTENT GENERATION
+ When a task asks you to "write", "compose", "draft", or "create" text content:
+ → You must INVENT and TYPE original content yourself. You are a language model — generate the text!
+ → Use {"action":"type","text":"Your composed sentence or paragraph here","description":"typing composed content"}
+ → Example: task "write a sentence on dogs" → {"action":"type","text":"Dogs are loyal companions that have been by humanity's side for thousands of years.","description":"composing and typing a sentence about dogs"}
+ → Do NOT type the task instruction literally (e.g. do NOT type "a sentence on dogs")
+ → Do NOT declare done without having typed the content
+
+③ MISSING INFO?
+ Missing a password / 2FA / payment card → {"action":"needs_human","reason":"...","description":"..."}
+ Missing a travel parameter or destination → use the app's discovery feature (e.g. Explore), do NOT ask human.
+
+④ SELECT ACTION APPROACH (first match wins):
+ CDP PAGE CONTEXT shown (browser) → MANDATORY: use cdp_click / cdp_type ONLY.
+ NEVER use a11y_click/a11y_set_value/a11y_focus on browser pages — they CRASH.
+ NEVER spam key_press (F5/Escape/Tab/Return) — use cdp_click on visible elements.
+ Click buttons/links by their text with cdp_click by_text.
+ Type in fields with cdp_type by_label or selector.
+ Need to READ page text (info retrieval) → cdp_read_text selector="body" (or a specific selector).
+ If a search is already in the URL params, the results are ALREADY SHOWING → declare done.
+ WebView2/Electron (empty Panes, ControlType.Pane only) → keyboard shortcuts ONLY, no a11y_click
+ Native app with element in UI TREE → a11y_click / a11y_set_value / a11y_focus
+ Element not in tree → Tab/Shift+Tab/Enter/Arrow or need_visual for coordinates
+ Nothing works after 3 different approaches → need_visual once, then continue
+
+══════════════════════════════════════════
+ACTIONS — return exactly ONE as JSON
+══════════════════════════════════════════
+{"action":"a11y_click","name":"Button Name","controlType":"Button","description":"why"}
+{"action":"a11y_set_value","name":"Field Name","controlType":"Edit","value":"text","description":"why"}
+{"action":"a11y_focus","name":"Element","controlType":"Edit","description":"why"}
+{"action":"key_press","key":"Tab|Return|Escape|ctrl+s|ctrl+n|ctrl+Return|alt|F10|...","description":"why"}
+{"action":"type","text":"text including \\n for newlines","description":"why"}
+{"action":"need_visual","target":"exact element name","description":"why keyboard cannot reach it"}
+{"action":"cdp_click","selector":"[aria-label='X']","description":"why"}
+{"action":"cdp_click","by_text":"Button Label","description":"why"}
+{"action":"cdp_type","by_label":"Field Label","text":"value","description":"why"}
+{"action":"cdp_type","selector":"[aria-label='X']","text":"value","description":"why"}
+{"action":"cdp_read_text","selector":"body","description":"read visible text from page or element (for info retrieval)"}
+{"action":"cdp_scroll","direction":"down","amount":600,"description":"scroll page down 600px — use to reveal more content or posts"}
+{"action":"cdp_scroll","direction":"up","amount":400,"selector":"#feed","description":"scroll a specific element"}
+{"action":"checkpoint","description":"verify current page state"}
+{"action":"switch_app","app":"notepad|excel|edge|outlook|...","description":"why you need to switch"}
+{"action":"done","evidence":"concrete proof task is complete"}
+{"action":"needs_human","reason":"payment|captcha|password|2FA|wrong_window","description":"exactly what human must do"}
-const SYSTEM_PROMPT = `You analyze accessibility trees to execute desktop tasks. Given a task and the current UI state (windows + elements), return ONE JSON action.
-
-ACTIONS:
-{"action":"a11y_click","name":"Button Name","controlType":"Button","description":"..."}
-{"action":"a11y_set_value","name":"Field Name","controlType":"Edit","value":"text","description":"..."}
-{"action":"a11y_focus","name":"Element","controlType":"Edit","description":"..."}
-{"action":"key_press","key":"Return|Tab|Escape|ctrl+s|...","description":"..."}
-{"action":"type","text":"hello","description":"..."}
-{"action":"done","description":"Task already completed based on UI state"}
-{"action":"unsure","description":"Cannot determine action from accessibility tree alone"}
-
-RULES:
-1. Match elements by name (case-insensitive, partial match OK)
-2. Prefer a11y_click over coordinate clicks — more reliable
-3. If the target element isn't in the tree, return {"action":"unsure"}
-4. For typing into fields: a11y_focus first, then type
-5. Return ONLY valid JSON, no other text
-6. PREFER keyboard shortcuts — faster and more reliable than clicking:
- - Save: key_press "ctrl+s" instead of clicking Save button
- - Close: key_press "alt+F4" instead of clicking X
- - New tab: key_press "ctrl+t" | Address bar: key_press "ctrl+l"
- - Calculator: type "1337*42=" instead of clicking individual buttons
- - Select all: "ctrl+a" | Copy: "ctrl+c" | Paste: "ctrl+v"
- - If app is already open, focus it instead of re-launching
-7. VERIFY outcomes — check the UI state after actions to confirm success`;
+══════════════════════════════════════════
+HARD CONSTRAINTS
+══════════════════════════════════════════
+• Never repeat a failed action — different approach every time
+• Never click taskbar buttons, window title bars, or unnamed Panes (throws RPC errors)
+• Never press Alt+Tab or Win key — use {"action":"switch_app","app":"notepad"} to switch apps deterministically
+• After action history shows SUCCEEDED / ALREADY DONE → move to next step, do NOT repeat
+• checkpoint: use only once to orient after a page load. If CDP is unavailable, do NOT call checkpoint again.
+• need_visual is a coordinate lookup, NOT page exploration. UI STATE already describes the page.
+• Return ONLY valid JSON — no markdown, no text outside the JSON object
+• If APP KNOWLEDGE BASE is present for this app, follow its patterns exactly`;
+
+
+
+interface ActionRecord {
+ action: string;
+ description: string;
+}
interface ReasonerResult {
handled: boolean;
- action?: InputAction;
description: string;
- /** true if the reasoner says "I can't handle this" → fall through to Layer 3 */
unsure?: boolean;
+ needsHuman?: boolean; // task requires human intervention (payment, captcha, 2FA)
+ steps?: number; // how many a11y steps were taken
+ actionHistory?: ActionRecord[]; // actions attempted, for Layer 3 context
}
export class A11yReasoner {
private a11y: AccessibilityBridge;
+ private clickResolver: A11yClickResolver;
+ private desktop: NativeDesktop;
private pipelineConfig: PipelineConfig;
- private consecutiveFailures = 0;
- private readonly MAX_FAILURES = 3;
- private disabled = false;
+ private failuresByApp: Map<string, number> = new Map();
+ private readonly MAX_FAILURES = 5;
+ private disabledApps: Set<string> = new Set();
+ private visionOnlySubtaskCount = 0;
+ private readonly VISION_RECOVERY_THRESHOLD = 3;
+ private appKnowledge: string = '';
+ private cdpDriver: CDPDriver | null = null;
+ private cdpAvailable: boolean | null = null; // null=unknown, false=unavailable, true=connected
+ private uiaDisabled = false; // set true after RPC_E_SERVERFAULT — skip all UIA reads
- constructor(a11y: AccessibilityBridge, pipelineConfig: PipelineConfig) {
+ constructor(a11y: AccessibilityBridge, desktop: NativeDesktop, pipelineConfig: PipelineConfig) {
this.a11y = a11y;
+ this.clickResolver = new A11yClickResolver(a11y);
+ this.desktop = desktop;
this.pipelineConfig = pipelineConfig;
+ this.loadAppKnowledge();
}
- /** Check if reasoner is available and not circuit-broken */
- isAvailable(): boolean {
- return this.pipelineConfig.layer2.enabled && !this.disabled;
+ private loadAppKnowledge(): void {
+ try {
+ const kbPath = path.join(__dirname, '..', 'docs', 'app-knowledge.md');
+ this.appKnowledge = fs.readFileSync(kbPath, 'utf-8');
+ } catch (err) {
+ console.debug(`[A11yReasoner] App knowledge not loaded: ${err}`);
+ this.appKnowledge = '';
+ }
}
- /**
- * Try to handle a subtask using only the accessibility tree.
- * Returns { handled: false } if it can't → caller should use Layer 3.
- */
- async reason(subtask: string): Promise<ReasonerResult> {
- if (!this.isAvailable()) {
- return { handled: false, description: 'Layer 2 disabled' };
- }
+ /** Extract relevant sections from the knowledge base for the active app/task */
+ private getRelevantKnowledge(processName?: string, task?: string, currentUrl?: string): string {
+ if (!this.appKnowledge) return '';
- try {
- // Get accessibility context
- const activeWindow = await this.a11y.getActiveWindow();
- const context = await this.a11y.getScreenContext(activeWindow?.processId);
+ const sections: string[] = [];
+ const lines = this.appKnowledge.split('\n');
- if (!context || context.includes('unavailable')) {
- return { handled: false, description: 'Accessibility tree unavailable' };
- }
+ // Always include General Rules
+ let inSection = false;
+ let currentSection = '';
+ let includeSection = false;
+
+ for (const line of lines) {
+ const h2Match = line.match(/^## (.+)/);
+ if (h2Match) {
+ // Save previous section if included
+ if (includeSection && currentSection.trim()) {
+ sections.push(currentSection.trim());
+ }
+ currentSection = line + '\n';
+ const title = h2Match[1].toLowerCase();
- // Build prompt
- const userMessage = `TASK: ${subtask}\n\nCURRENT UI STATE:\n${context}`;
+ // Always include General Rules and Troubleshooting
+ includeSection = title.includes('general rules') || title.includes('troubleshooting');
- // Call cheap text model
- const start = performance.now();
- const response = await this.callTextModel(userMessage);
- const latency = Math.round(performance.now() - start);
+ // Include app-specific sections
+ if (processName) {
+ const pn = processName.toLowerCase();
+ if (pn === 'olk' && title.includes('outlook')) includeSection = true;
+ if (pn === 'msedge' && (title.includes('edge') || title.includes('outlook'))) includeSection = true;
+ if (pn === 'notepad' && title.includes('notepad')) includeSection = true;
+ if (pn === 'mspaint' && title.includes('paint')) includeSection = true;
+ }
- console.log(` 🧠 Layer 2 (${this.pipelineConfig.layer2.model}): ${latency}ms`);
+ // Include based on task keywords
+ if (task) {
+ const tl = task.toLowerCase();
+ if (/email|mail|outlook/i.test(tl) && title.includes('outlook')) includeSection = true;
+ if (/browser|edge|chrome|web/i.test(tl) && title.includes('edge')) includeSection = true;
+ if (/notepad|text.*file/i.test(tl) && title.includes('notepad')) includeSection = true;
+ if (/paint|draw/i.test(tl) && title.includes('paint')) includeSection = true;
+ if (/flight|flights|fly|airline|google flights/i.test(tl) && title.includes('google flights')) includeSection = true;
+ if (/tripadvisor/i.test(tl) && (title.includes('tripadvisor') || title.includes('google flights'))) includeSection = true;
+ if (/google docs|docs\.google|document|write.*sentence|compose/i.test(tl) && title.includes('google docs')) includeSection = true;
+ }
- // Parse response
- const result = this.parseResponse(response);
+ // URL-based section selection — most accurate for browser tabs
+ if (currentUrl) {
+ const url = currentUrl.toLowerCase();
+ if (url.includes('google.com/travel/flights') || url.includes('flights.google.com')) {
+ if (title.includes('google flights')) includeSection = true;
+ }
+ if (url.includes('tripadvisor.com')) {
+ if (title.includes('tripadvisor') || title.includes('google flights')) includeSection = true;
+ }
+ if (url.includes('docs.google.com')) {
+ if (title.includes('google docs')) includeSection = true;
+ }
+ } else if (processName === 'msedge') {
+ // Fallback when URL unknown: include both for msedge
+ if (title.includes('google flights')) includeSection = true;
+ }
- if (result.unsure) {
- console.log(` 🤷 Layer 2 unsure: ${result.description} → falling through to Layer 3`);
- return { handled: false, description: result.description, unsure: true };
+ inSection = true;
+ continue;
}
- if (result.handled) {
- this.consecutiveFailures = 0; // Reset circuit breaker
- console.log(` ✅ Layer 2 handled: ${result.description}`);
+ if (inSection) {
+ currentSection += line + '\n';
}
+ }
+ // Don't forget the last section
+ if (includeSection && currentSection.trim()) {
+ sections.push(currentSection.trim());
+ }
- return result;
- } catch (err) {
- this.consecutiveFailures++;
- console.log(` ⚠️ Layer 2 error (${this.consecutiveFailures}/${this.MAX_FAILURES}): ${err}`);
+ return sections.length > 0 ? '\n\nAPP KNOWLEDGE BASE:\n' + sections.join('\n\n') : '';
+ }
- // Circuit breaker
- if (this.consecutiveFailures >= this.MAX_FAILURES) {
- this.disabled = true;
- console.log(` 🔴 Layer 2 circuit breaker tripped — disabled for this session`);
- }
+ isAvailable(processName?: string): boolean {
+ if (!this.pipelineConfig.layer2.enabled) return false;
+ if (processName && this.disabledApps.has(processName.toLowerCase())) return false;
+ return true;
+ }
- return { handled: false, description: `Layer 2 error: ${err}` };
+ reset(processName?: string): void {
+ if (processName) {
+ const key = processName.toLowerCase();
+ this.disabledApps.delete(key);
+ this.failuresByApp.delete(key);
+ } else {
+ this.disabledApps.clear();
+ this.failuresByApp.clear();
}
+ this.visionOnlySubtaskCount = 0;
+ // Reset CDP connection state between tasks
+ this.cdpDriver = null;
+ this.cdpAvailable = null;
+ this.uiaDisabled = false;
}
- /** Reset circuit breaker (e.g., after doctor re-diagnoses) */
- reset(): void {
- this.disabled = false;
- this.consecutiveFailures = 0;
+ /** Call after a subtask is handled by vision — tracks for auto-recovery */
+ recordVisionFallback(): void {
+ this.visionOnlySubtaskCount++;
+ if (this.visionOnlySubtaskCount >= this.VISION_RECOVERY_THRESHOLD) {
+ console.log(` 🔄 Layer 2 auto-recovery: re-enabling after ${this.visionOnlySubtaskCount} vision-only subtasks`);
+ this.disabledApps.clear();
+ this.failuresByApp.clear();
+ this.visionOnlySubtaskCount = 0;
+ }
}
- private parseResponse(response: string): ReasonerResult {
- try {
- const jsonMatch = response.match(/\{[\s\S]*\}/);
- if (!jsonMatch) {
- return { handled: false, description: 'No JSON in response', unsure: true };
+ /**
+ * Try to complete a subtask using only the accessibility tree.
+ * Loops: read → act → verify → repeat.
+ * Returns { handled: false } when it cannot proceed → caller uses vision.
+ */
+ async reason(subtask: string, processName?: string, priorContext?: string[], logger?: import('./task-logger').TaskLogger, verifier?: import('./verifiers').TaskVerifier): Promise<ReasonerResult> {
+ if (!this.isAvailable(processName)) {
+ return { handled: false, description: 'Layer 2 disabled' };
+ }
+
+ const actionHistory: ActionRecord[] = [];
+ // Seed action history with prior context so the LLM sees it
+ if (priorContext && priorContext.length > 0) {
+ for (const ctx of priorContext) {
+ actionHistory.push({ action: 'context', description: ctx });
}
+ }
+ let stepsTotal = 0;
- const parsed = JSON.parse(jsonMatch[0]);
+ let isLikelyBrowser = BROWSER_PROCESS_RE.test(processName || '');
- if (parsed.action === 'unsure') {
- return { handled: false, description: parsed.description || 'Unsure', unsure: true };
- }
+ for (let step = 0; step < MAX_LOOP_STEPS; step++) {
+ try {
+ // Always read fresh — cache is invalidated by caller after each action
+ // and here before each read to guarantee freshness
+ this.a11y.invalidateCache();
+ let context: string | null = null;
- if (parsed.action === 'done') {
- return { handled: true, description: parsed.description || 'Already done' };
- }
+ // Wrong-window detection: if terminal/explorer has focus but we need a browser, auto-focus the target
+ if (step === 0 && processName && actionHistory.filter(a => a.action !== 'context').length === 0) {
+ try {
+ const focusedEl = await this.a11y.getFocusedElement().catch(() => null);
+ if (focusedEl && focusedEl.processId) {
+ const windows = await this.a11y.getWindows().catch(() => []);
+ const focusedWin = windows.find(w => w.processId === focusedEl.processId);
+ const focusedProc = (focusedWin?.processName || '').toLowerCase();
+ const targetProc = processName.toLowerCase();
+ const NON_TARGET_PROCS = ['windowsterminal', 'cmd', 'powershell'];
+ const isWrongWindow = focusedProc.length > 0 &&
+ NON_TARGET_PROCS.some(p => focusedProc.includes(p)) &&
+ !focusedProc.includes(targetProc);
+ if (isWrongWindow) {
+ // Auto-fix: find and focus the target window instead of returning needs_human
+ const targetWin = windows.find(w => w.processName.toLowerCase().includes(targetProc) && !w.isMinimized);
+ if (targetWin) {
+ console.log(` 🔄 Wrong window: focused=${focusedProc}, auto-focusing ${targetProc} (pid ${targetWin.processId})`);
+ await this.a11y.focusWindow(undefined, targetWin.processId).catch(() => null);
+ await this.delay(800); // let focus settle
+ actionHistory.push({ action: 'context', description: `Auto-focused ${targetProc} window. It is now the active window.` });
+ } else {
+ console.log(` ⚠️ Wrong window: focused=${focusedProc}, target=${targetProc} not found — returning needs_human`);
+ return {
+ handled: false,
+ description: `Wrong window: focused on ${focusedProc} but target ${targetProc} not found in window list.`,
+ needsHuman: true,
+ steps: 0,
+ actionHistory,
+ };
+ }
+ }
+ }
+ } catch { /* non-critical — continue */ }
+ }
+
+ // Pre-flight: On step 0, connect CDP and find the right tab
+ if (step === 0 && processName && BROWSER_PROCESS_RE.test(processName)) {
+ try {
+ const cdp = this.cdpDriver ?? new CDPDriver(9222);
+ if (!this.cdpDriver) this.cdpDriver = cdp;
+ const connected = await cdp.isConnected().catch(() => false) || await cdp.connect().catch(() => false);
+ if (connected) {
+ this.cdpAvailable = true;
+
+ // Check if we're on the right tab — priorContext may say "Navigated to X"
+ const navigatedUrl = actionHistory
+ .filter(a => a.action === 'context')
+ .map(a => a.description.match(/[Nn]avigated to (\S+)/)?.[1])
+ .find(u => u);
+ if (navigatedUrl) {
+ const currentUrl = await cdp.getUrl().catch(() => '') || '';
+ // Use full path for matching (not just domain) to avoid google.com/mail matching google.com/travel
+ const navPath = navigatedUrl.replace(/^https?:\/\//, '');
+ // Also try a shorter domain+path match to handle URL redirects
+ // e.g., en.wikipedia.org/wiki/Mars_(planet) → en.wikipedia.org/wiki/Mars
+ const navDomain = navPath.split('/')[0]; // e.g., "en.wikipedia.org"
+ const urlMatchesExact = currentUrl.includes(navPath);
+ const urlMatchesDomain = currentUrl.includes(navDomain) && !currentUrl.includes('google.com/search');
+ if (!urlMatchesExact && !urlMatchesDomain) {
+ // CDP is on wrong tab — try to find the exact URL tab first
+ const switched = await cdp.switchToTabByUrl(navPath);
+ if (switched) {
+ // Found and switched — wait for page to be fully loaded
+ try {
+ const pg = cdp.getPage();
+ if (pg) await pg.waitForLoadState('domcontentloaded', { timeout: 5000 });
+ } catch { /* non-critical */ }
+ } else {
+ // Tab not found — try disconnecting and reconnecting CDP to pick up new tabs
+ console.log(` 🔄 CDP on wrong tab (${currentUrl.substring(0, 50)}), reconnecting CDP to find new tabs...`);
+ try {
+ await cdp.disconnect();
+ const reconnected = await cdp.connect();
+ if (reconnected) {
+ // Check again after reconnect — the new tab might now be the active page
+ const newUrl = await cdp.getUrl().catch(() => '') || '';
+ if (!newUrl.includes(navPath)) {
+ // Still wrong — try switchToTabByUrl again with fresh connection
+ const switched2 = await cdp.switchToTabByUrl(navPath);
+ if (!switched2) {
+ // Last resort: open a fresh tab and navigate there
+ // Don't navigate the current page — it may be a system widget with JS disabled
+ console.log(` 🔄 Still wrong tab after reconnect, opening new tab to https://${navPath}`);
+ try {
+ const pg = cdp.getPage();
+ if (pg) {
+ const ctx = pg.context();
+ const newTab = await ctx.newPage();
+ await newTab.goto(`https://${navPath}`, { timeout: 15000, waitUntil: 'domcontentloaded' }).catch(() => {});
+ await newTab.bringToFront().catch(() => {});
+ cdp.attachToPage(newTab);
+ }
+ } catch { /* non-critical */ }
+ }
+ }
+ }
+ } catch { /* non-critical */ }
+ await this.delay(1000); // extra settle
+ }
+ } else {
+ // URL matches — ensure page is fully loaded before reading
+ try {
+ const pg = cdp.getPage();
+ if (pg) await pg.waitForLoadState('domcontentloaded', { timeout: 5000 });
+ } catch { /* non-critical */ }
+ }
+ }
+
+ const [startUrl, startTitle] = await Promise.all([
+ cdp.getUrl().catch(() => ''),
+ cdp.getTitle().catch(() => ''),
+ ]);
+ if (startUrl) {
+ actionHistory.push({ action: 'context', description: `STARTING STATE: URL="${startUrl}" Title="${startTitle}". Do NOT call checkpoint — URL already known.` });
+ }
+ }
+ } catch { /* non-critical */ }
+ }
+
+ // For browser windows, UIA calls can HANG indefinitely on React SPAs.
+ // Try CDP first — if it works, skip UIA entirely to avoid deadlock.
+ if (isLikelyBrowser && this.cdpAvailable !== false) {
+ const cdpCtx = await this.getCdpContext();
+ if (cdpCtx) {
+ // CDP has the page — no need to touch the hanging UIA tree
+ context = `[Browser window — using CDP DOM context instead of UIA tree]${cdpCtx}`;
+ }
+ }
+
+ if (!context) {
+ // Only hard-skip UIA if explicitly disabled (confirmed RPC crash on browser)
+ if (this.uiaDisabled) {
+ context = null;
+ } else {
+ // Non-browser OR CDP unknown — read UIA tree with timeout to avoid hang
+ try {
+ const uiaPromise = (async () => {
+ const activeWindow = await this.a11y.getActiveWindow();
+ return this.a11y.getScreenContext(activeWindow?.processId);
+ })();
+ let timeoutHandle: ReturnType<typeof setTimeout>;
+ const timeoutPromise = new Promise((_, reject) => {
+ timeoutHandle = setTimeout(() => reject(new Error('UIA read timeout')), isLikelyBrowser ? 8000 : 30000);
+ });
+ try {
+ context = await Promise.race([uiaPromise, timeoutPromise]) as string | null;
+ } finally {
+ clearTimeout(timeoutHandle!);
+ }
+ } catch (uiaErr) {
+ // Window may have closed (e.g. compose window after Send) — check if we already completed the task
+ const sentEmail = actionHistory.some(a =>
+ (a.action === 'need_visual' && /send/i.test(a.description) && a.description.includes('SUCCEEDED')) ||
+ (a.action === 'key_press' && /ctrl\+Return|ctrl\+Enter/i.test(a.description) && a.description.includes('SUCCEEDED')));
+ if (sentEmail) {
+ console.log(` ✅ Layer 2 done (${stepsTotal} steps): window closed after Send`);
+ if (processName) this.failuresByApp.delete(processName.toLowerCase());
+ this.visionOnlySubtaskCount = 0;
+ return { handled: true, description: `Done (a11y ${stepsTotal} steps): Send clicked, compose window closed`, steps: stepsTotal };
+ }
+ if (String(uiaErr).includes('timeout')) {
+ console.log(` ⚠️ UIA read timed out — browser window likely React SPA`);
+ }
+ context = null;
+ }
+ } // end else (UIA attempt)
+ }
+
+ if (!context || context.includes('unavailable')) {
+ // If we have significant action history, provide it as context instead of giving up
+ if (actionHistory.length >= 3) {
+ context = `[A11y tree unavailable — window may have changed. Action history: ${actionHistory.map(a => a.description).join('; ')}]`;
+ } else {
+ return { handled: false, description: 'Accessibility tree unavailable' };
+ }
+ }
+
+ // Separate context entries from real action history
+ const contextEntries = actionHistory.filter(a => a.action === 'context');
+ const realHistory = actionHistory.filter(a => a.action !== 'context');
+
+ const historyNote = realHistory.length > 0
+ ? `\nACTIONS TAKEN SO FAR:\n${realHistory.map((a, i) => `${i + 1}. ${a.description}`).join('\n')}\n`
+ : '';
+
+ // Put context at the END of the message (after UI state) for maximum LLM attention
+ const contextNote = contextEntries.length > 0
+ ? `\n\nIMPORTANT CONTEXT (follow these instructions):\n${contextEntries.map(c => `- ${c.description}`).join('\n')}`
+ : '';
+
+ // Get the active process name and current URL for knowledge lookup
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ const activeProc = activeWin?.processName || processName;
+ const currentUrl = (this.cdpDriver && this.cdpAvailable === true)
+ ? await this.cdpDriver.getUrl().catch(() => undefined)
+ : undefined;
+ const knowledge = this.getRelevantKnowledge(activeProc, subtask, currentUrl ?? undefined);
+
+ const userMessage = `TASK: ${subtask}${historyNote}\n\nCURRENT UI STATE:\n${context}${knowledge}${contextNote}`;
+
+ // Log context source on step 0 for diagnostics
+ if (step === 0) {
+ const ctxType = context?.includes('CDP PAGE CONTEXT') ? 'CDP' : context?.includes('ControlType') ? 'UIA' : 'OTHER';
+ console.log(` 📋 Context: ${ctxType} (${(context || '').length} chars)`);
+ }
+
+ const response = await this.callTextModel(userMessage);
+
+ const parsed = this.parseResponse(response, step + 1);
+ logger?.logStep({
+ layer: 2,
+ actionType: parsed.action,
+ result: 'success',
+ actionParams: { name: parsed.name, key: parsed.key, text: parsed.text?.substring(0, 80), selector: parsed.selector, by_text: parsed.by_text },
+ llmReasoning: parsed.description || parsed.evidence,
+ });
+ logger?.recordLlmCall();
+
+ // ── No-progress loop detector ──
+ // Build a semantic signature for this action (action + target)
+ const loopSig = `${parsed.action}|${parsed.app || parsed.name || parsed.by_text || parsed.selector || parsed.key || ''}`.toLowerCase();
+ const recentActions = actionHistory.slice(-8);
+
+ // Special handling for switch_app: count by action type alone (LLM uses varying app names)
+ let sameActionCount: number;
+ if (parsed.action === 'switch_app') {
+ sameActionCount = recentActions.filter(a => a.action === 'switch_app').length;
+ } else {
+ const recentSigs = recentActions.map(a => {
+ const parts = a.description.match(/^(\S+)\s+"([^"]*)"/);
+ return parts ? `${parts[1]}|${parts[2]}`.toLowerCase() : a.description.substring(0, 40).toLowerCase();
+ });
+ sameActionCount = recentSigs.filter(s => s === loopSig).length;
+ }
+ if (sameActionCount >= 3 && parsed.action !== 'done' && parsed.action !== 'needs_human' && parsed.action !== 'checkpoint') {
+ const msg = parsed.action === 'switch_app'
+ ? `LOOP DETECTED — you called switch_app ${sameActionCount + 1} times in the last 8 actions. The window IS focused but the UI tree shows another app because of how Windows renders focus. STOP calling switch_app. Instead, use cdp_read_text to read the browser page (CDP is connected), or use cdp_click/cdp_type to interact with it. If you already have the information you need from a previous cdp_read_text, switch to Notepad and type the answer.`
+ : `LOOP DETECTED — you repeated "${loopSig}" ${sameActionCount + 1} times in the last 8 actions with NO progress. Try a COMPLETELY DIFFERENT approach. If clicking a link doesn't work, try navigating via address bar. If a button doesn't respond, try a keyboard shortcut. Think step by step about what is actually wrong.`;
+ actionHistory.push({ action: 'blocked', description: msg });
+ console.log(` 🔄 Loop detected: "${parsed.action}" repeated ${sameActionCount + 1}x — forcing different approach`);
+ continue;
+ }
+
+ if (parsed.action === 'needs_human') {
+ const reason = parsed.reason || parsed.description || 'Human intervention required';
+
+ // If LLM says "wrong_window" but we're on the correct browser process,
+ // it's actually a wrong-page/wrong-tab issue — push back and let it navigate
+ if (/wrong_window/i.test(reason) && isLikelyBrowser && this.cdpAvailable === true) {
+ const wrongWindowRetries = actionHistory.filter(a => a.action === 'blocked' && a.description.includes('NOT wrong_window')).length;
+ if (wrongWindowRetries < 3) {
+ actionHistory.push({ action: 'blocked', description: `NOT wrong_window — you ARE in the browser. Navigate to the correct page yourself. Use key_press ctrl+l to open address bar, type the URL, press Return. Or use File > New to create a new document. DO NOT return needs_human again.` });
+ console.log(` 🚫 Blocked needs_human (wrong_window on correct browser) — pushing back to LLM`);
+ continue;
+ }
+ }
+
+ console.log(` 🙋 Needs human: ${reason}`);
+ return {
+ handled: false,
+ description: `Needs human: ${reason}\n${parsed.description || ''}`.trim(),
+ needsHuman: true,
+ steps: stepsTotal,
+ actionHistory,
+ };
+ }
+
+ if (parsed.action === 'need_visual') {
+ const visualHintsUsed = actionHistory.filter(a => a.action === 'need_visual').length;
+ if (visualHintsUsed >= 15) {
+ console.log(` 🛑 Layer 2: max visual hints (15) — stopping`);
+ return { handled: false, description: `Max visual hints used: ${parsed.description}`, unsure: true, steps: stepsTotal, actionHistory };
+ }
+
+ const targetKey = (parsed.target || '').toLowerCase().trim();
+
+ // Loop detection: same target already clicked successfully
+ const alreadyClicked = actionHistory.some(
+ a => a.action === 'need_visual' && a.description.includes('SUCCEEDED') &&
+ a.description.toLowerCase().includes(targetKey.substring(0, 12))
+ );
+ if (alreadyClicked) {
+ actionHistory.push({ action: 'need_visual', description: `ALREADY CLICKED "${parsed.target}" — proceed to NEXT step.` });
+ continue;
+ }
+
+ // Failure loop: same target failed 2+ times
+ const failCount = actionHistory.filter(
+ a => a.action === 'need_visual' && a.description.includes('not found') &&
+ a.description.toLowerCase().includes(targetKey.substring(0, 10))
+ ).length;
+ if (failCount >= 2) {
+ actionHistory.push({ action: 'need_visual', description: `"${parsed.target}" NOT FOUND after ${failCount} tries. Use key_press or type instead.` });
+ continue;
+ }
+
+ // Try a11y bounds first (0ms, 0 LLM cost)
+ const boundsCoord = await this.clickResolver.resolve(parsed.target);
+ if (boundsCoord) {
+ console.log(` 📐 Bounds click: "${parsed.target}" at (${boundsCoord.x}, ${boundsCoord.y})`);
+ await this.desktop.mouseClick(boundsCoord.x, boundsCoord.y);
+ stepsTotal++;
+ actionHistory.push({ action: 'need_visual', description: `Clicked "${parsed.target}" at (${boundsCoord.x}, ${boundsCoord.y}) via bounds — SUCCEEDED. Move to next step.` });
+ this.a11y.invalidateCache();
+ const opensUI = /new mail|compose|reply|forward|new message/i.test(parsed.target);
+ await this.delay(opensUI ? 1500 : 500);
+ continue;
+ }
+
+ // Fall back to vision model
+ console.log(` 👁️ Visual hint: "${parsed.target}"`);
+ const hint = await this.getCoordinateHint(parsed.target);
+ if (hint) {
+ console.log(` 👁️ Clicked "${parsed.target}" at (${hint.x}, ${hint.y})`);
+ await this.desktop.mouseClick(hint.x, hint.y);
+ stepsTotal++;
+ actionHistory.push({ action: 'need_visual', description: `Clicked "${parsed.target}" at (${hint.x}, ${hint.y}) — SUCCEEDED. Move to next step.` });
+ this.a11y.invalidateCache();
+ const opensUI = /new mail|compose|reply|forward|new message/i.test(parsed.target);
+ await this.delay(opensUI ? 1500 : 500);
+ continue;
+ } else {
+ actionHistory.push({ action: 'need_visual', description: `"${parsed.target}" not found in screenshot` });
+ continue;
+ }
+ }
+
+ if (parsed.action === 'unsure') {
+ // Allow up to 3 unsure responses before giving up — push a hint and continue
+ const unsureCount = actionHistory.filter(a => a.action === 'unsure').length;
+ if (unsureCount < 3) {
+ console.log(` 🤷 Layer 2 unsure (${unsureCount + 1}/3): ${parsed.description.substring(0, 80)} — pushing hint and continuing`);
+ actionHistory.push({
+ action: 'unsure',
+ description: `Unsure: ${parsed.description} → Try need_visual to see what's on screen, or use Tab to explore, or try a different keyboard approach.`,
+ });
+ continue;
+ }
+ console.log(` 🤷 Layer 2 unsure (3/3): ${parsed.description.substring(0, 80)} → falling through`);
+ return { handled: false, description: parsed.description, unsure: true, steps: stepsTotal, actionHistory };
+ }
+
+ if (parsed.action === 'done') {
+ const evidence = parsed.evidence || parsed.description || '(no evidence given)';
+
+ // Block premature done — evidence contradicts completion
+ const isContradiction = /however|but I need|let me|I should|still need|not yet|I will|next I|I haven't|need to/i.test(evidence);
+ if (isContradiction && step < MAX_LOOP_STEPS - 2) {
+ actionHistory.push({ action: 'blocked', description: `BLOCKED premature done — your evidence says "${evidence.substring(0, 100)}" which indicates the task is NOT complete. You said you still need to do something. DO IT, then declare done with proof.` });
+ console.log(` 🚫 Blocked premature done — evidence contradicts completion: ${evidence.substring(0, 80)}`);
+ continue;
+ }
+
+ // Block done if task requires writing/composing but no type action was performed
+ const taskLower = (subtask || '').toLowerCase();
+ const requiresWriting = /\b(write|compose|draft|create.*text|type.*sentence|type.*paragraph)\b/i.test(taskLower);
+ const hasTypedContent = actionHistory.some(a => a.action === 'type' && a.description && !a.description.includes('FAILED'));
+ if (requiresWriting && !hasTypedContent && step < MAX_LOOP_STEPS - 2) {
+ actionHistory.push({ action: 'blocked', description: `BLOCKED done — task requires writing/composing text but you never typed any content. Use {"action":"type","text":"your composed content here"} to type original text, THEN declare done.` });
+ console.log(` 🚫 Blocked done — writing task but no type action performed`);
+ continue;
+ }
+
+ // Ground truth check — LLM-backed semantic verification
+ let groundTruthPass = true;
+ let groundTruthDetail = 'no specific check';
+ let groundTruthMethod = 'none';
+ let attemptLog: import('./verifiers').VerifyAttempt[] = [];
+ try {
+ if (verifier) {
+ const readClip = () => this.a11y.readClipboard();
+ const vResult = await verifier.verify(subtask, readClip);
+ groundTruthPass = vResult.pass;
+ groundTruthDetail = vResult.detail;
+ groundTruthMethod = vResult.method;
+ attemptLog = vResult.attemptLog ?? [];
+
+ // Log every individual check attempt for full traceability
+ if (attemptLog.length > 0) {
+ const attemptSummary = attemptLog
+ .map(a => `[${a.checkName}] ${a.pass ? 'PASS' : 'FAIL'} conf=${a.confidence.toFixed(2)} (${a.durationMs}ms): ${a.detail.substring(0, 120)}`)
+ .join('\n ');
+ console.log(` 🔍 Verifier attempts:\n ${attemptSummary}`);
+ }
+
+ if (vResult.evidence) {
+ console.log(` 🔍 Verifier evidence: "${vResult.evidence.substring(0, 120)}"`);
+ }
+ } else {
+ // Minimal inline fallback when no verifier injected
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ const pn = (activeWin?.processName || '').toLowerCase();
+ if (/paste.*notepad|notepad.*paste|copy.*notepad/i.test(subtask) || pn === 'notepad') {
+ const focused = await this.a11y.getFocusedElement().catch(() => null);
+ if (!focused?.value || focused.value.trim().length < 10) {
+ groundTruthPass = false;
+ groundTruthDetail = `notepad empty — value: "${focused?.value?.substring(0, 50) || '(none)'}"`;
+ } else {
+ groundTruthDetail = `notepad has ${focused.value.length} chars`;
+ }
+ }
+ }
+ } catch (verifyErr) {
+ // Verification errors are NOT silent passes — they block completion
+ groundTruthPass = false;
+ groundTruthDetail = `Verifier threw unexpected error: ${String(verifyErr).substring(0, 150)}`;
+ groundTruthMethod = 'error';
+ console.warn(` ⚠️ Verifier error (blocking as fail): ${groundTruthDetail}`);
+ }
+
+ if (!groundTruthPass && step < MAX_LOOP_STEPS - 2) {
+ actionHistory.push({ action: 'blocked', description: `BLOCKED done — ground truth check FAILED: ${groundTruthDetail}. The task is NOT complete. Fix it.` });
+ console.log(` 🚫 Blocked done — ground truth failed [${groundTruthMethod}]: ${groundTruthDetail}`);
+ continue;
+ }
+
+ console.log(` ✅ Layer 2 done (${stepsTotal} steps): ${evidence.substring(0, 80)}`);
+ logger?.logStep({
+ layer: 2,
+ actionType: 'done',
+ result: 'success',
+ llmReasoning: evidence.substring(0, 300),
+ uiStateSummary: attemptLog.length > 0
+ ? attemptLog.map(a => `${a.checkName}:${a.pass ? 'pass' : 'fail'}(${a.confidence.toFixed(2)})`).join(' | ')
+ : undefined,
+ verification: {
+ method: groundTruthPass ? (groundTruthMethod as any) || 'a11y_readback' : 'none',
+ verified: groundTruthPass,
+ detail: [
+ `ground_truth: ${groundTruthDetail}`,
+ `contradiction=${isContradiction}`,
+ `requiresWriting=${requiresWriting}`,
+ `hasTypedContent=${hasTypedContent}`,
+ attemptLog.length > 0 ? `checks_run=${attemptLog.map(a => a.checkName).join(',')}` : '',
+ ].filter(Boolean).join(' | '),
+ },
+ });
+ if (processName) this.failuresByApp.delete(processName.toLowerCase());
+ this.visionOnlySubtaskCount = 0;
+ return {
+ handled: true,
+ description: `Done (a11y ${stepsTotal} steps): ${evidence}`,
+ steps: stepsTotal,
+ };
+ }
+
+ // CDP direct DOM actions (Edge/Chrome only)
+ if (parsed.action === 'cdp_click' || parsed.action === 'cdp_type') {
+ try {
+ if (parsed.action === 'cdp_click') {
+ await this.executeCdpClick(parsed);
+ stepsTotal++;
+ const target = parsed.by_text || parsed.selector || parsed.target || '?';
+ console.log(` ✅ CDP click "${target}" succeeded`);
+ actionHistory.push({ action: 'cdp_click', description: `CDP click "${target}" — SUCCEEDED` });
+ } else {
+ await this.executeCdpType(parsed);
+ stepsTotal++;
+ const field = parsed.by_label || parsed.selector || '?';
+ const typedText = (parsed.text || '').substring(0, 40);
+ // Text-verify: read back the input value to confirm typing landed
+ let verifyNote = '';
+ try {
+ if (this.cdpDriver) {
+ const selector = parsed.selector || (parsed.by_label ? `[aria-label="${parsed.by_label}"]` : null);
+ if (selector) {
+ // Use parameterized readFieldValue() — avoids CSS selector injection
+ const fieldValue = await this.cdpDriver.readFieldValue(selector).catch(() => '');
+ if (fieldValue && fieldValue.length > 0) {
+ verifyNote = ` (field now shows: "${fieldValue}")`;
+ }
+ }
+ }
+ } catch { /* non-critical */ }
+ console.log(` ✅ CDP type "${typedText}" into "${field}" succeeded${verifyNote}`);
+ actionHistory.push({ action: 'cdp_type', description: `CDP type "${typedText}" into "${field}" — SUCCEEDED${verifyNote}` });
+ }
+ this.a11y.invalidateCache();
+ await this.delay(SETTLE_MS);
+ continue;
+ } catch (cdpErr) {
+ console.log(` ❌ CDP ${parsed.action} failed: ${String(cdpErr).substring(0, 150)}`);
+ actionHistory.push({ action: 'error', description: `CDP action failed: ${cdpErr} — try keyboard instead` });
+ await this.delay(SETTLE_MS);
+ continue;
+ }
+ }
+
+ // CDP scroll — scroll the page or a specific element
+ if (parsed.action === 'cdp_scroll') {
+ try {
+ await this.ensureCdp();
+ const pg = this.cdpDriver!.getPage();
+ if (pg) {
+ const direction = (parsed.direction || 'down').toLowerCase();
+ const amount = Math.min(Math.max(parsed.amount ?? 400, 50), 2000);
+ const deltaY = (direction === 'up') ? -amount : (direction === 'down') ? amount : 0;
+ const deltaX = (direction === 'left') ? -amount : (direction === 'right') ? amount : 0;
+ const selector = parsed.selector || null;
+ if (selector) {
+ await pg.evaluate(
+ ({ sel, dy, dx }: { sel: string; dy: number; dx: number }) => {
+ const el = document.querySelector(sel);
+ if (el) { el.scrollBy(dx, dy); }
+ },
+ { sel: selector, dy: deltaY, dx: deltaX },
+ );
+ } else {
+ await pg.evaluate(
+ ({ dy, dx }: { dy: number; dx: number }) => window.scrollBy(dx, dy),
+ { dy: deltaY, dx: deltaX },
+ );
+ }
+ stepsTotal++;
+ const desc = `Scrolled ${direction} ${amount}px${selector ? ` on "${selector}"` : ''}`;
+ console.log(` ✅ CDP scroll: ${desc}`);
+ actionHistory.push({ action: 'cdp_scroll', description: desc });
+ await this.delay(SETTLE_MS);
+ }
+ continue;
+ } catch (scrollErr) {
+ actionHistory.push({ action: 'error', description: `cdp_scroll failed: ${scrollErr} — use key_press ArrowDown or Page_Down instead` });
+ continue;
+ }
+ }
+
+ // CDP read text — extract visible text from page for info retrieval
+ if (parsed.action === 'cdp_read_text') {
+ try {
+ await this.ensureCdp();
+ const selector = parsed.selector || 'body';
+ // Use parameterized readText() — avoids CSS selector injection
+ const text = await this.cdpDriver!.readText(selector, 3000);
+ // Truncate in action history to prevent unbounded growth (LLM sees full text this step only)
+ const historyText = text.length > 500 ? text.substring(0, 500) + `... [${text.length} chars total]` : text;
+ actionHistory.push({ action: 'cdp_read_text', description: `PAGE TEXT (${selector}):\n${historyText}` });
+ console.log(` 📖 cdp_read_text "${selector}" — ${text.length} chars extracted`);
+ continue;
+ } catch (err) {
+ actionHistory.push({ action: 'error', description: `cdp_read_text failed: ${err} — try checkpoint or keyboard instead` });
+ continue;
+ }
+ }
+
+ // ── Deterministic app switching ──
+ if (parsed.action === 'switch_app') {
+ const targetApp = (parsed.app || parsed.name || '').toLowerCase();
+ if (!targetApp) {
+ actionHistory.push({ action: 'blocked', description: 'switch_app requires "app" parameter — e.g. {"action":"switch_app","app":"notepad"}' });
+ continue;
+ }
+ try {
+ const windows = await this.a11y.getWindows().catch(() => []);
+ const targetWin = windows.find(w =>
+ !w.isMinimized && w.processName.toLowerCase().includes(targetApp)
+ );
+ if (targetWin) {
+ // Attempt focus with retry — sometimes first attempt doesn't take
+ await this.a11y.focusWindow(undefined, targetWin.processId).catch(() => null);
+ await this.delay(600);
+ // Verify focus actually changed by checking foreground window
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ const focusVerified = activeWin?.processName.toLowerCase().includes(targetApp);
+ if (!focusVerified) {
+ // Retry once — focus didn't take
+ console.log(` 🔄 switch_app retry: focus on ${activeWin?.processName || 'unknown'} not ${targetApp}, retrying...`);
+ await this.a11y.focusWindow(undefined, targetWin.processId).catch(() => null);
+ await this.delay(800);
+ }
+ actionHistory.push({ action: 'switch_app', description: `Switched to ${targetWin.processName} "${targetWin.title}" (pid ${targetWin.processId}) — SUCCEEDED` });
+ console.log(` 🔀 Switched to ${targetWin.processName} (pid ${targetWin.processId})`);
+ stepsTotal++;
+ // Update browser detection based on the target we switched to
+ isLikelyBrowser = BROWSER_PROCESS_RE.test(targetWin.processName);
+ if (!isLikelyBrowser) {
+ // Switching away from browser — mark CDP unavailable, re-enable UIA
+ this.cdpAvailable = false;
+ this.uiaDisabled = false; // UIA works fine for non-browser apps
+ } else if (this.cdpAvailable === false) {
+ // Switching back TO browser — allow CDP reconnection attempt
+ this.cdpAvailable = null;
+ }
+ } else {
+ // App not open — try to launch it
+ actionHistory.push({ action: 'switch_app', description: `${targetApp} not found in window list. Use key_press to open it (e.g. Super key, type app name, Enter).` });
+ console.log(` ⚠️ switch_app: ${targetApp} not found — prompting LLM to open it`);
+ }
+ } catch (err) {
+ actionHistory.push({ action: 'switch_app', description: `switch_app failed: ${String(err).substring(0, 80)}` });
+ }
+ continue;
+ }
+
+ if (parsed.action === 'checkpoint') {
+ const priorCheckpoints = actionHistory.filter(a => a.action === 'checkpoint');
- // Map to InputAction
- const action = this.mapAction(parsed);
- if (!action) {
- return { handled: false, description: 'Could not map action', unsure: true };
+ // Loop guard: if already called checkpoint with CDP unavailable, block further calls
+ const cdpUnavailableCheckpoints = priorCheckpoints.filter(a => a.description.includes('CDP not connected') || a.description.includes('CDP unavailable'));
+ if (cdpUnavailableCheckpoints.length >= 1) {
+ actionHistory.push({ action: 'checkpoint', description: 'CHECKPOINT BLOCKED — CDP is NOT available this session. Do NOT call checkpoint again. Use keyboard/Tab navigation to proceed with the task.' });
+ console.log(` 📍 Checkpoint suppressed (CDP unavailable, loop guard)`);
+ continue;
+ }
+
+ // Deduplicate: if same URL was already returned, block with "URL unchanged"
+ if (priorCheckpoints.length >= 1) {
+ try {
+ const currentUrl = this.cdpDriver ? await this.cdpDriver.getUrl() : null;
+ const lastCheckpoint = priorCheckpoints[priorCheckpoints.length - 1].description;
+ if (currentUrl && lastCheckpoint.includes(currentUrl)) {
+ actionHistory.push({ action: 'checkpoint', description: `CHECKPOINT BLOCKED — URL unchanged (${currentUrl}). Page already known. Proceed with task actions (cdp_click, cdp_type, key_press).` });
+ console.log(` 📍 Checkpoint suppressed (same URL: ${currentUrl})`);
+ continue;
+ }
+ } catch { /* fall through to normal checkpoint */ }
+ }
+
+ try {
+ const url = this.cdpDriver ? await this.cdpDriver.getUrl() : null;
+ const title = this.cdpDriver ? await this.cdpDriver.getTitle() : null;
+ const info = url ? `URL: ${url} | Title: "${title}"` : 'CDP not connected this session — proceed with Tab/keyboard navigation. Do NOT call checkpoint again.';
+ actionHistory.push({ action: 'checkpoint', description: `CHECKPOINT — ${info}` });
+ console.log(` 📍 Checkpoint: ${info}`);
+ } catch (err) {
+ console.debug(`[A11yReasoner] Checkpoint CDP error: ${err}`);
+ actionHistory.push({ action: 'checkpoint', description: 'CHECKPOINT — CDP unavailable this session. Proceed with Tab/keyboard navigation.' });
+ }
+ continue;
+ }
+
+ // BLOCK a11y actions on browser windows when CDP is connected — UIA hangs on React SPAs
+ const isA11yAction = ['a11y_click', 'a11y_set_value', 'a11y_focus'].includes(parsed.action);
+ if (isA11yAction && isLikelyBrowser && this.cdpAvailable === true) {
+ const a11yBlockCount = actionHistory.filter(a => a.action === 'blocked' && a.description.includes('CDP is connected')).length;
+ if (a11yBlockCount < 3) {
+ actionHistory.push({ action: 'blocked', description: `BLOCKED ${parsed.action} "${parsed.name || ''}" — CDP is connected. Use cdp_click or cdp_type instead. UIA calls HANG on this page.` });
+ console.log(` 🚫 Blocked ${parsed.action} on browser (CDP available) — redirecting to CDP`);
+ }
+ // After 3 blocks, stop wasting LLM calls — force done with failure
+ if (a11yBlockCount >= 3) {
+ return { handled: false, description: 'LLM keeps trying a11y actions on browser despite CDP being available', unsure: true, steps: stepsTotal, actionHistory };
+ }
+ continue;
+ }
+
+ // Skip a11y_click on taskbar items (they fail with RPC_E_SERVERFAULT)
+ // Also catch descriptive window-title strings that are taskbar buttons
+ const isTaskbarClick = parsed.action === 'a11y_click' && parsed.name && (
+ /running window|pinned|taskbar/i.test(parsed.name) ||
+ /Microsoft Edge|Google Chrome|msedge|Firefox/i.test(parsed.name) ||
+ // Anything that looks like a full browser window title in the taskbar
+ (parsed.controlType === 'Button' && parsed.name.length > 40)
+ );
+ if (isTaskbarClick) {
+ const taskbarSkips = actionHistory.filter(a => a.action === 'skipped' && a.description.includes('taskbar')).length;
+ if (taskbarSkips >= 2) {
+ // Already skipped twice — don't waste more LLM calls, just reuse last message
+ continue;
+ }
+ actionHistory.push({ action: 'skipped', description: `Skipped taskbar item — not clickable. Use need_visual to click UI elements instead. STOP trying a11y_click on taskbar items.` });
+ continue;
+ }
+
+ // Key-press loop detection: if LLM spams key_press on browser with CDP available, force CDP usage
+ if (parsed.action === 'key_press' && isLikelyBrowser && this.cdpAvailable === true) {
+ const recentKeyPresses = actionHistory.slice(-8).filter(a => a.action === 'key_press').length;
+ if (recentKeyPresses >= 5) {
+ actionHistory.push({ action: 'blocked', description: `STOP spamming key_press — you have done ${recentKeyPresses} key presses with no progress. CDP is connected. Use cdp_click by_text="Button Label" to click elements. Use cdp_type to fill fields. If the task URL shows search params are already set, the results ARE showing — use {"action":"done","evidence":"URL contains search params, flights displayed"}.` });
+ console.log(` 🚫 Blocked key_press spam (${recentKeyPresses} in last 8 actions) — forcing CDP usage`);
+ continue;
+ }
+ }
+
+ // Execute the action
+ const inputAction = this.mapAction(parsed);
+ if (!inputAction) {
+ console.log(` ⚠️ Layer 2: could not map action "${parsed.action}" → unsure`);
+ return { handled: false, description: `Unmappable action: ${parsed.action}`, unsure: true, steps: stepsTotal, actionHistory };
+ }
+
+ // Duplicate detection: if the exact same type action was just done, skip it
+ const lastAction = actionHistory.length > 0 ? actionHistory[actionHistory.length - 1] : null;
+ if (lastAction && parsed.action === 'type' && lastAction.action === 'type' &&
+ parsed.text && lastAction.description.includes(parsed.text.substring(0, 20))) {
+ actionHistory.push({ action: 'skipped', description: `ALREADY TYPED "${parsed.text.substring(0, 30)}" — it worked. The text is in the field. Move to the NEXT step.` });
+ continue;
+ }
+
+ console.log(` ⚡ [${parsed.action}] ${parsed.name || parsed.key || (parsed.text ? parsed.text.substring(0, 40) : '') || ''}`);
+ await this.executeAction(inputAction);
+ stepsTotal++;
+ actionHistory.push({ action: parsed.action, description: `${parsed.action} "${parsed.text || parsed.name || parsed.key || ''}" — ${parsed.description || 'done'}` });
+
+ // Settle: let the UI react before next read
+ await this.delay(SETTLE_MS);
+
+ // Cap action history to prevent unbounded growth — keep context entries + most recent actions
+ if (actionHistory.length > MAX_ACTION_HISTORY) {
+ const contextEntries = actionHistory.filter(a => a.action === 'context');
+ const nonContext = actionHistory.filter(a => a.action !== 'context');
+ const trimmed = nonContext.slice(-MAX_ACTION_HISTORY + contextEntries.length);
+ actionHistory.length = 0;
+ actionHistory.push(...contextEntries, ...trimmed);
+ }
+
+ } catch (err) {
+ const errStr = String(err);
+ // API-level errors (credits, rate limit, auth) are not UIA/app failures —
+ // don't charge them against the circuit breaker or they'll wrongly disable apps
+ const isApiError = /credit balance|rate limit|authentication|invalid.*key|overloaded/i.test(errStr);
+ if (isApiError) {
+ console.log(` ❌ Layer 2 API error (non-recoverable): ${errStr.substring(0, 120)}`);
+ return { handled: false, description: `API error: ${errStr}`, steps: stepsTotal, actionHistory };
+ }
+
+ // RPC_E_SERVERFAULT = UIA is hanging (React SPA) — switch to CDP-only immediately
+ const isRpcFault = /RPC_E_SERVERFAULT|0x80010105|SERVERFAULT/i.test(errStr);
+ if (isRpcFault && isLikelyBrowser && this.cdpAvailable === true) {
+ console.log(` 🔴 RPC_E_SERVERFAULT on browser with CDP available — disabling UIA, CDP-only mode`);
+ this.uiaDisabled = true; // prevent all future UIA reads in this session
+ actionHistory.push({ action: 'error', description: `UIA CRASHED (RPC_E_SERVERFAULT). ALL UIA/a11y actions permanently disabled. Use ONLY cdp_click, cdp_type, key_press, or type.` });
+ await this.delay(SETTLE_MS);
+ continue;
+ }
+
+ const appKey = (processName || 'global').toLowerCase();
+ const failures = (this.failuresByApp.get(appKey) || 0) + 1;
+ this.failuresByApp.set(appKey, failures);
+ console.log(` ❌ Layer 2 error (${appKey} ${failures}/${this.MAX_FAILURES}): ${errStr}`);
+
+ // For browser apps, RPC_E_SERVERFAULT means UIA is broken on SPAs — trip immediately
+ // For non-browser apps (Notepad, etc.), it's likely transient — just count as normal failure
+ if ((isRpcFault && isLikelyBrowser) || failures >= this.MAX_FAILURES) {
+ this.disabledApps.add(appKey);
+ console.log(` 🔴 Layer 2 circuit breaker tripped for "${appKey}" — subtasks for this app will use vision fallback.`);
+ return { handled: false, description: `Layer 2 error: ${errStr}`, steps: stepsTotal, actionHistory };
+ }
+ if (isRpcFault && !isLikelyBrowser) {
+ console.log(` ⚠️ RPC_E_SERVERFAULT on non-browser app "${appKey}" — retrying (${failures}/${this.MAX_FAILURES})`);
+ await this.delay(2000); // extra settle time for UIA recovery
+ }
+
+ // Don't give up on single errors — record and continue the loop
+ // The LLM will see the error in action history and try a different approach
+ actionHistory.push({ action: 'error', description: `Action failed: ${err}` });
+ await this.delay(SETTLE_MS);
+ continue;
}
+ }
- return {
- handled: true,
- action,
- description: parsed.description || parsed.action,
- };
- } catch {
- return { handled: false, description: 'Failed to parse response', unsure: true };
+ // Exhausted steps without completing — hand off
+ console.log(` ⚠️ Layer 2: max steps (${MAX_LOOP_STEPS}) reached — task may need human review`);
+ return {
+ handled: false,
+ description: `Max a11y steps reached after: ${actionHistory.map(a => a.description).join(', ')}`,
+ unsure: true,
+ steps: stepsTotal,
+ actionHistory,
+ };
+ }
+
+ private parseResponse(response: string, step?: number): any {
+ // Strip markdown code fences (haiku often wraps in ```json ... ```)
+ const stripped = response.replace(/```(?:json)?\s*/g, '').replace(/```\s*/g, '').trim();
+
+ const start = stripped.indexOf('{');
+ if (start === -1) {
+ return { action: 'unsure', description: 'No JSON in LLM response' };
}
+
+ // Balance brackets to find the end of the first JSON object
+ let depth = 0;
+ let inString = false;
+ let escape = false;
+ for (let i = start; i < stripped.length; i++) {
+ const ch = stripped[i];
+ if (escape) { escape = false; continue; }
+ if (ch === '\\' && inString) { escape = true; continue; }
+ if (ch === '"') { inString = !inString; continue; }
+ if (inString) continue;
+ if (ch === '{') depth++;
+ else if (ch === '}') {
+ depth--;
+ if (depth === 0) {
+ try {
+ return JSON.parse(stripped.slice(start, i + 1));
+ } catch (e) {
+ return { action: 'unsure', description: 'Failed to parse LLM JSON' };
+ }
+ }
+ }
+ }
+ return { action: 'unsure', description: 'Failed to parse LLM JSON' };
}
private mapAction(parsed: any): InputAction | null {
switch (parsed.action) {
case 'a11y_click':
- return {
- kind: 'a11y_click',
- name: parsed.name,
- controlType: parsed.controlType,
- } as A11yAction;
+ return { kind: 'a11y_click', name: parsed.name, controlType: parsed.controlType } as A11yAction;
case 'a11y_set_value':
return {
@@ -186,11 +1116,7 @@ export class A11yReasoner {
} as A11yAction;
case 'a11y_focus':
- return {
- kind: 'a11y_focus',
- name: parsed.name,
- controlType: parsed.controlType,
- } as A11yAction;
+ return { kind: 'a11y_focus', name: parsed.name, controlType: parsed.controlType } as A11yAction;
case 'key_press':
return { kind: 'key_press', key: parsed.key } as InputAction;
@@ -203,54 +1129,250 @@ export class A11yReasoner {
}
}
+ private async executeAction(action: InputAction): Promise {
+ if (action.kind.startsWith('a11y_')) {
+ await this.executeA11yAction(action as A11yAction);
+ } else if (action.kind === 'key_press') {
+ await this.desktop.keyPress((action as any).key);
+ } else if (action.kind === 'type') {
+ await this.desktop.typeText((action as any).text);
+ }
+ // Invalidate cache after every action — next loop reads fresh state
+ this.a11y.invalidateCache();
+ }
+
+ private async executeA11yAction(action: A11yAction): Promise {
+ const actionMap: Record = {
+ a11y_click: 'click',
+ a11y_set_value: 'set-value',
+ a11y_get_value: 'get-value',
+ a11y_focus: 'focus',
+ };
+ const a11yAction = actionMap[action.kind];
+ if (!a11yAction) throw new Error(`Unknown a11y action: ${action.kind}`);
+
+ const result = await this.a11y.invokeElement({
+ name: action.name,
+ automationId: action.automationId,
+ controlType: action.controlType,
+ action: a11yAction,
+ value: action.value,
+ });
+
+ if (!result.success && !result.clickPoint) {
+ throw new Error(result.error ?? 'A11y action failed');
+ }
+
+ if (result.clickPoint) {
+ await this.desktop.mouseClick(result.clickPoint.x, result.clickPoint.y);
+ }
+ }
+
+ /**
+ * Vision-as-Coordinate-Spotter (Layer 2.5)
+ * Takes a screenshot, asks a cheap vision model "where is [target]?",
+ * returns {x, y} coordinates or null.
+ */
+ private async getCoordinateHint(target: string): Promise<{ x: number; y: number } | null> {
+ try {
+ // Truncate overly verbose targets from the LLM
+ const shortTarget = target.length > 50 ? target.substring(0, 50).trim() : target;
+
+ const frame = await this.desktop.captureForLLM();
+ const base64 = frame.buffer.toString('base64');
+ const mediaType = frame.format === 'jpeg' ? 'image/jpeg' : 'image/png';
+
+ const prompt = `Look at this screenshot. Find the UI element: "${shortTarget}".\nReturn ONLY JSON: {"x": , "y": }\nCoordinates are in image pixels (${frame.llmWidth}x${frame.llmHeight}).\nClick the CENTER of the element.\nIf not visible: {"x": -1, "y": -1}`;
+
+ const { model, baseUrl } = this.pipelineConfig.layer3;
+ const apiKey = this.pipelineConfig.layer3.apiKey || this.pipelineConfig.apiKey;
+ const provider = this.pipelineConfig.provider;
+ let responseText: string;
+
+ if (provider.openaiCompat || baseUrl.includes('localhost') || baseUrl.includes('11434')) {
+ const res = await fetch(`${baseUrl}/chat/completions`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json', ...provider.authHeader(apiKey) },
+ body: JSON.stringify({
+ model,
+ max_tokens: 60,
+ temperature: 0,
+ messages: [
+ { role: 'system', content: 'You find UI elements in screenshots. Return ONLY JSON coordinates, nothing else.' },
+ { role: 'user', content: [
+ { type: 'image_url', image_url: { url: `data:${mediaType};base64,${base64}` } },
+ { type: 'text', text: prompt },
+ ]},
+ ],
+ }),
+ signal: AbortSignal.timeout(15000),
+ });
+ const data = await res.json() as any;
+ if (data.error) throw new Error(data.error.message ?? JSON.stringify(data.error));
+ responseText = data.choices?.[0]?.message?.content ?? '';
+ } else {
+ // Anthropic vision API — use prefill to force JSON output
+ const res = await fetch(`${baseUrl}/messages`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json', ...provider.authHeader(apiKey) },
+ body: JSON.stringify({
+ model,
+ max_tokens: 60,
+ messages: [
+ { role: 'user', content: [
+ { type: 'image', source: { type: 'base64', media_type: mediaType, data: base64 } },
+ { type: 'text', text: prompt },
+ ]},
+ { role: 'assistant', content: '{"x":' },
+ ],
+ }),
+ signal: AbortSignal.timeout(15000),
+ });
+ const data = await res.json() as any;
+ if (data.error) throw new Error(data.error.message ?? JSON.stringify(data.error));
+ const raw = data.content?.[0]?.text ?? '';
+ responseText = '{"x":' + raw;
+ }
+
+ const match = responseText.match(/\{\s*"x"\s*:\s*(-?\d+)\s*,\s*"y"\s*:\s*(-?\d+)\s*\}/);
+ if (!match) return null;
+
+ const x = parseInt(match[1], 10);
+ const y = parseInt(match[2], 10);
+ if (x < 0 || y < 0) return null;
+
+ // Scale from LLM image space to real screen coordinates
+ const realX = Math.round(x * frame.scaleFactor);
+ const realY = Math.round(y * frame.scaleFactor);
+
+ return { x: realX, y: realY };
+ } catch (err) {
+ console.log(` ⚠️ Layer 2.5 vision hint failed: ${err}`);
+ return null;
+ }
+ }
+
+ /** Get CDP page context for browser windows. Returns null if CDP unavailable. Caches failure state. */
+ private async getCdpContext(): Promise {
+ if (this.cdpAvailable === false) return null;
+ try {
+ if (!this.cdpDriver) {
+ this.cdpDriver = new CDPDriver(9222);
+ }
+ const connected = await this.cdpDriver.isConnected();
+ if (!connected) {
+ const ok = await this.cdpDriver.connect();
+ if (!ok) {
+ this.cdpAvailable = false;
+ this.cdpDriver = null;
+ return null;
+ }
+ }
+ this.cdpAvailable = true;
+ const ctx = await this.cdpDriver.getPageContext();
+ return '\n\n⚠️ CDP PAGE CONTEXT — you MUST use cdp_click/cdp_type/cdp_read_text actions (NOT key_press) to interact with this page:\n' + ctx;
+ } catch (err) {
+ this.cdpAvailable = false;
+ this.cdpDriver = null;
+ return null;
+ }
+ }
+
+ /** Execute a cdp_click action — click web element by selector or text */
+ private async executeCdpClick(parsed: any): Promise {
+ const cdp = await this.ensureCdp();
+ let result;
+ if (parsed.by_text) {
+ result = await cdp.clickByText(parsed.by_text);
+ } else if (parsed.selector) {
+ result = await cdp.click(parsed.selector);
+ } else if (parsed.target) {
+ result = await cdp.clickByText(parsed.target);
+ } else {
+ throw new Error('cdp_click: requires selector, by_text, or target');
+ }
+ if (!result.success) throw new Error(result.error || 'cdp_click failed');
+ }
+
+ /** Execute a cdp_type action — type into web element by selector or label */
+ private async executeCdpType(parsed: any): Promise {
+ const cdp = await this.ensureCdp();
+ if (!parsed.text) throw new Error('cdp_type: text is required');
+ let result;
+ if (parsed.by_label) {
+ result = await cdp.typeByLabel(parsed.by_label, parsed.text);
+ } else if (parsed.selector) {
+ result = await cdp.typeInField(parsed.selector, parsed.text);
+ } else {
+ throw new Error('cdp_type: requires selector or by_label');
+ }
+ if (!result.success) throw new Error(result.error || 'cdp_type failed');
+ }
+
+ /** Ensure CDPDriver is connected; throws if unavailable */
+ private async ensureCdp(): Promise {
+ if (!this.cdpDriver || !(await this.cdpDriver.isConnected())) {
+ this.cdpDriver = new CDPDriver(9222);
+ const ok = await this.cdpDriver.connect();
+ if (!ok) {
+ this.cdpAvailable = false;
+ this.cdpDriver = null;
+ throw new Error('CDPDriver: cannot connect to Edge/Chrome on port 9222');
+ }
+ this.cdpAvailable = true;
+ }
+ return this.cdpDriver;
+ }
+
 private async callTextModel(userMessage: string): Promise<string> {
const { model, baseUrl } = this.pipelineConfig.layer2;
- const apiKey = this.pipelineConfig.apiKey;
+ const apiKey = this.pipelineConfig.apiKey;
const provider = this.pipelineConfig.provider;
if (provider.openaiCompat || baseUrl.includes('localhost') || baseUrl.includes('11434')) {
- // OpenAI-compatible (Ollama, OpenAI, Kimi)
- const response = await fetch(`${baseUrl}/chat/completions`, {
+ const res = await fetch(`${baseUrl}/chat/completions`, {
method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- ...provider.authHeader(apiKey),
- },
+ headers: { 'Content-Type': 'application/json', ...provider.authHeader(apiKey) },
body: JSON.stringify({
model,
- max_tokens: 200,
+ max_tokens: 500,
temperature: 0,
+ response_format: { type: 'json_object' },
messages: [
{ role: 'system', content: SYSTEM_PROMPT },
- { role: 'user', content: userMessage },
+ { role: 'user', content: userMessage },
],
}),
- signal: AbortSignal.timeout(10000),
+ signal: AbortSignal.timeout(12000),
});
-
- const data = await response.json() as any;
- if (data.error) throw new Error(data.error.message || JSON.stringify(data.error));
- return data.choices?.[0]?.message?.content || '';
+ const data = await res.json() as any;
+ if (data.error) throw new Error(data.error.message ?? JSON.stringify(data.error));
+ return data.choices?.[0]?.message?.content ?? '';
} else {
- // Anthropic
- const response = await fetch(`${baseUrl}/messages`, {
+ // Anthropic — prefill '{' forces the model to continue with valid JSON
+ const res = await fetch(`${baseUrl}/messages`, {
method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- ...provider.authHeader(apiKey),
- },
+ headers: { 'Content-Type': 'application/json', ...provider.authHeader(apiKey) },
body: JSON.stringify({
model,
- max_tokens: 200,
+ max_tokens: 500,
system: SYSTEM_PROMPT,
- messages: [{ role: 'user', content: userMessage }],
+ messages: [
+ { role: 'user', content: userMessage },
+ { role: 'assistant', content: '{' },
+ ],
}),
- signal: AbortSignal.timeout(10000),
+ signal: AbortSignal.timeout(12000),
});
-
- const data = await response.json() as any;
- if (data.error) throw new Error(data.error.message || JSON.stringify(data.error));
- return data.content?.[0]?.text || '';
+ const data = await res.json() as any;
+ if (data.error) throw new Error(data.error.message ?? JSON.stringify(data.error));
+ // Prepend the prefilled '{' back since the API only returns the continuation
+ const text = data.content?.[0]?.text ?? '';
+ return text.startsWith('{') ? text : '{' + text;
}
}
+
+ private delay(ms: number): Promise<void> {
+ return new Promise(r => setTimeout(r, ms));
+ }
}
diff --git a/src/accessibility.ts b/src/accessibility.ts
index ad55658..e52d42a 100644
--- a/src/accessibility.ts
+++ b/src/accessibility.ts
@@ -1,61 +1,54 @@
/**
- * Accessibility Bridge — calls platform-specific scripts to query
- * the native accessibility tree. No vision needed for most actions.
- *
- * Windows: Node.js → spawn powershell → .NET UI Automation → JSON
- * macOS: Node.js → spawn osascript → JXA (Accessibility API) → JSON
- *
- * v2: Added window management helpers (focusWindow, launchApp, getActiveWindow)
- * v2.1: Fixed hardcoded process IDs, added PowerShell check, proper foreground window detection
- * v3: Cross-platform support (Windows + macOS)
+ * Accessibility Bridge — queries the native accessibility tree.
+ *
+ * Windows: uses PSRunner (persistent powershell process via ps-bridge.ps1).
+ * One-time assembly load cost ~800ms, then each call is <50ms.
+ * macOS: spawns osascript per call (unchanged).
+ *
+ * v4: PSRunner replaces per-call powershell.exe spawning on Windows.
+ * MaxDepth raised to 8 so nested elements are visible to the LLM.
*/
import { execFile } from 'child_process';
import * as os from 'os';
import * as path from 'path';
import { promisify } from 'util';
+import { psRunner } from './ps-runner';
const execFileAsync = promisify(execFile);
-const PLATFORM = os.platform(); // 'win32' | 'darwin' | 'linux'
+const PLATFORM = os.platform();
const SCRIPTS_DIR = path.join(__dirname, '..', 'scripts');
const MAC_SCRIPTS_DIR = path.join(SCRIPTS_DIR, 'mac');
-// macOS JXA scripts enumerate System Events which can be slow on some versions.
-// 30s gives enough headroom; scripts are cached after first call so this only
-// applies to the first invocation per session.
-const SCRIPT_TIMEOUT = PLATFORM === 'darwin' ? 30000 : 10000;
-
-/** Platform script file mapping: Windows (.ps1) → macOS (.jxa) */
-const SCRIPT_MAP: Record<string, Record<string, string>> = {
- win32: {
- 'get-windows': 'get-windows.ps1',
- 'find-element': 'find-element.ps1',
- 'invoke-element': 'invoke-element.ps1',
- 'focus-window': 'focus-window.ps1',
- 'get-foreground-window': 'get-foreground-window.ps1',
- 'get-screen-context': 'get-screen-context.ps1',
- },
- darwin: {
- 'get-windows': 'get-windows.jxa',
- 'get-screen-context': 'get-screen-context.jxa',
- 'find-element': 'find-element.jxa',
- 'invoke-element': 'invoke-element.jxa',
- 'focus-window': 'focus-window.jxa',
- 'get-foreground-window': 'get-foreground-window.jxa',
- },
-};
-
-/** Cached shell availability */
-let shellAvailable: boolean | null = null;
+
+// macOS JXA can be slow on first call; 30s gives headroom.
+const MAC_SCRIPT_TIMEOUT = 30000;
+
+const MAX_DEPTH = 8; // raised to 8 — Electron/WebView2 apps (Outlook olk) nest deeply: Window > Pane > Pane > Pane > Button
+
+/** Cached shell availability (macOS only — Windows uses psRunner) */
+let macShellAvailable: boolean | null = null;
export interface UIElement {
name: string;
automationId: string;
controlType: string;
className: string;
+ isEnabled?: boolean;
bounds: { x: number; y: number; width: number; height: number };
children?: UIElement[];
}
+export interface FocusedElementInfo {
+ name: string;
+ automationId: string;
+ controlType: string;
+ className: string;
+ processId: number;
+ isEnabled: boolean;
+ bounds: { x: number; y: number; width: number; height: number };
+ value: string;
+}
+
export interface WindowInfo {
handle: number;
title: string;
@@ -65,13 +58,11 @@ export interface WindowInfo {
isMinimized: boolean;
}
-/** Cached window list with TTL */
interface WindowCache {
windows: WindowInfo[];
timestamp: number;
}
-/** Cached screen context with TTL */
interface ScreenContextCache {
context: string;
timestamp: number;
@@ -79,96 +70,98 @@ interface ScreenContextCache {
export class AccessibilityBridge {
private windowCache: WindowCache | null = null;
- private readonly WINDOW_CACHE_TTL = 2000; // 2s cache for window list
- private explorerProcessId: number | null = null; // Cached Explorer PID for taskbar detection
+ private readonly WINDOW_CACHE_TTL = 2000;
- /** Cached taskbar buttons — rarely change, queried once */
- private taskbarCache: { buttons: UIElement[]; timestamp: number } | null = null;
- private readonly TASKBAR_CACHE_TTL = 30000; // 30s — taskbar barely changes
-
- // ── Perf Opt #3: Screen context cache (2s TTL — UI rarely changes mid-LLM-call) ──
private screenContextCache: ScreenContextCache | null = null;
private readonly SCREEN_CONTEXT_CACHE_TTL = 2000;
+ private taskbarCache: { buttons: UIElement[]; timestamp: number } | null = null;
+ private readonly TASKBAR_CACHE_TTL = 30000;
+ private explorerProcessId: number | null = null;
+
/**
- * Check if the platform's script shell is available.
- * Windows: PowerShell, macOS: osascript
+ * Check if the platform's shell is available.
+ * Windows: always true (PSRunner starts lazily).
+ * macOS: checks osascript + Accessibility permissions.
*/
 async isShellAvailable(): Promise<boolean> {
- if (shellAvailable !== null) return shellAvailable;
-
+ if (PLATFORM === 'win32') return true; // PSRunner handles availability
+
+ if (macShellAvailable !== null) return macShellAvailable;
+
try {
- if (PLATFORM === 'win32') {
- await execFileAsync('powershell.exe', ['-Command', 'exit 0'], { timeout: 5000 });
- } else if (PLATFORM === 'darwin') {
- // Probe System Events directly — a bare osascript -e '""' succeeds even without
- // Accessibility permissions, giving a false positive. Touching processes.length
- // forces macOS to check the permission and fail fast with a clear error if not granted.
- await execFileAsync(
- 'osascript',
- ['-l', 'JavaScript', '-e', 'Application("System Events").processes.length; true'],
- { timeout: 5000 },
- );
- } else {
- console.error(`❌ Unsupported platform: ${PLATFORM}. Accessibility requires Windows or macOS.`);
- shellAvailable = false;
- return false;
- }
- shellAvailable = true;
- console.log(`✅ Accessibility bridge ready (${PLATFORM === 'win32' ? 'PowerShell' : 'osascript'})`);
+ await execFileAsync(
+ 'osascript',
+ ['-l', 'JavaScript', '-e', 'Application("System Events").processes.length; true'],
+ { timeout: 5000 },
+ );
+ macShellAvailable = true;
+ console.log('✅ Accessibility bridge ready (osascript)');
} catch (err: any) {
- shellAvailable = false;
- if (PLATFORM === 'darwin') {
- const isAuthError = err.stderr?.includes('not authorized') || err.message?.includes('not authorized');
- if (isAuthError) {
- console.error(
- `❌ Accessibility: not authorized to control System Events.\n` +
- ` → System Settings → Privacy & Security → Accessibility\n` +
- ` → Add your terminal app (Terminal, iTerm2, wezterm, etc.) or Node.js and try again.`
- );
- } else {
- console.error(`❌ osascript not available. Accessibility bridge will not function.`);
- }
+ macShellAvailable = false;
+ const isAuthError = err.stderr?.includes('not authorized') || err.message?.includes('not authorized');
+ if (isAuthError) {
+ console.error(
+ '❌ Accessibility: not authorized to control System Events.\n' +
+ ' → System Settings → Privacy & Security → Accessibility\n' +
+ ' → Add your terminal app and try again.',
+ );
} else {
- console.error(`❌ PowerShell not available. Accessibility bridge will not function.`);
+ console.error('❌ osascript not available. Accessibility bridge disabled.');
}
}
- return shellAvailable;
+ return macShellAvailable!;
}
- /**
- * Get the Explorer/Finder process ID (for taskbar/dock detection).
- * Caches result to avoid repeated lookups.
- */
- private async getExplorerProcessId(): Promise<number | null> {
- if (this.explorerProcessId !== null) return this.explorerProcessId;
-
- const targetProcess = PLATFORM === 'darwin' ? 'finder' : 'explorer';
- try {
- const windows = await this.getWindows(true);
- const match = windows.find(w => w.processName.toLowerCase() === targetProcess);
- if (match) {
- this.explorerProcessId = match.processId;
- return match.processId;
- }
- } catch {
- // Fall through to null
+ /** Start the PSRunner bridge early so the 800ms assembly load happens in background. */
+ async warmup(): Promise<void> {
+ if (PLATFORM === 'win32') {
+ psRunner.start().catch(() => {}); // fire-and-forget — errors surface on first actual call
}
- return null;
}
/**
- * List all visible top-level windows (cached for 2s)
+ * Invalidate caches — call after every action so the next read sees fresh UI state.
*/
+ invalidateCache(): void {
+ this.windowCache = null;
+ this.screenContextCache = null;
+ }
+
+ // ── Windows bridge helper ──────────────────────────────────────────────────
+
+ private async winCmd(command: Record<string, any>): Promise<any> {
+ return psRunner.run(command);
+ }
+
+ // ── macOS script helper ────────────────────────────────────────────────────
+
+ private runMacScript(scriptName: string, args: string[] = []): Promise<any> {
+ return new Promise((resolve, reject) => {
+ const scriptPath = path.join(MAC_SCRIPTS_DIR, scriptName);
+ execFile('osascript', ['-l', 'JavaScript', scriptPath, ...args], {
+ timeout: MAC_SCRIPT_TIMEOUT,
+ maxBuffer: 1024 * 1024 * 5,
+ }, (error, stdout, stderr) => {
+ if (error) {
+ const detail = stderr?.trim() ? ` — ${stderr.trim()}` : '';
+ reject(new Error(error.message + detail));
+ return;
+ }
+ try {
+ const result = JSON.parse(stdout.trim());
+ if (result.error) reject(new Error(result.error));
+ else resolve(result);
+ } catch (pe) {
+ reject(pe);
+ }
+ });
+ });
+ }
+
+ // ── Public API ─────────────────────────────────────────────────────────────
+
 async getWindows(forceRefresh = false): Promise<WindowInfo[]> {
- // Check shell availability on first call
- if (shellAvailable === null) {
- const available = await this.isShellAvailable();
- if (!available) {
- throw new Error(`Accessibility shell not available on ${PLATFORM}. Features disabled.`);
- }
- }
-
if (
!forceRefresh &&
this.windowCache &&
@@ -177,41 +170,44 @@ export class AccessibilityBridge {
return this.windowCache.windows;
}
- const windows = await this.runScript('get-windows.ps1');
- this.windowCache = { windows, timestamp: Date.now() };
+ let windows: WindowInfo[];
+ if (PLATFORM === 'win32') {
+ const result = await this.winCmd({ cmd: 'get-screen-context', maxDepth: 0 }) as any;
+ windows = result.windows ?? [];
+ // Cache the fresh window list so repeat calls within the TTL skip the query
+ this.windowCache = { windows, timestamp: Date.now() };
+ } else {
+ windows = await this.runMacScript('get-windows.jxa');
+ this.windowCache = { windows, timestamp: Date.now() };
+ }
return windows;
}
- /**
- * Invalidate the window cache (call after actions that change window state)
- */
- invalidateCache(): void {
- this.windowCache = null;
- this.screenContextCache = null;
- }
-
- /**
- * Find elements matching criteria
- */
async findElement(opts: {
name?: string;
automationId?: string;
controlType?: string;
processId?: number;
 }): Promise<UIElement[]> {
- const args: string[] = [];
- if (opts.name) args.push('-Name', opts.name);
- if (opts.automationId) args.push('-AutomationId', opts.automationId);
- if (opts.controlType) args.push('-ControlType', opts.controlType);
- if (opts.processId) args.push('-ProcessId', String(opts.processId));
- return this.runScript('find-element.ps1', args);
+ if (PLATFORM === 'win32') {
+ const result = await this.winCmd({
+ cmd: 'find-element',
+ ...(opts.name && { name: opts.name }),
+ ...(opts.automationId && { automationId: opts.automationId }),
+ ...(opts.controlType && { controlType: opts.controlType }),
+ ...(opts.processId && { processId: opts.processId }),
+ }) as any;
+ return Array.isArray(result) ? result : [];
+ } else {
+ const args: string[] = [];
+ if (opts.name) args.push('-Name', opts.name);
+ if (opts.automationId) args.push('-AutomationId', opts.automationId);
+ if (opts.controlType) args.push('-ControlType', opts.controlType);
+ if (opts.processId) args.push('-ProcessId', String(opts.processId));
+ return this.runMacScript('find-element.jxa', args);
+ }
}
- /**
- * Invoke an action on an element (click, set value, etc.)
- * Auto-discovers processId by finding the element first.
- * Falls back to coordinate click if element has bounds but no processId.
- */
async invokeElement(opts: {
name?: string;
automationId?: string;
@@ -221,138 +217,224 @@ export class AccessibilityBridge {
processId?: number;
}): Promise<{ success: boolean; value?: string; error?: string; clickPoint?: { x: number; y: number } }> {
let processId = opts.processId;
- let elementBounds: { x: number; y: number; width: number; height: number } | null = null;
- // Auto-discover processId if not provided
if (!processId) {
- const searchOpts: any = {};
- if (opts.automationId) {
- searchOpts.automationId = opts.automationId;
- } else if (opts.controlType) {
- searchOpts.controlType = opts.controlType;
- }
- if (Object.keys(searchOpts).length === 0 && opts.name) {
- searchOpts.automationId = opts.name;
- }
- const elements = await this.findElement(searchOpts);
- if (!elements || elements.length === 0) {
- return { success: false, error: `Element not found: ${opts.name || opts.automationId}` };
+ const elements = await this.findElement({
+ name: opts.name,
+ automationId: opts.automationId,
+ controlType: opts.controlType,
+ });
+ if (!elements?.length) {
+ return { success: false, error: `Element not found: ${opts.name ?? opts.automationId}` };
}
- const element = elements[0];
- processId = (element as any).processId;
- elementBounds = element.bounds;
-
- // Fallback to coordinate click if we have bounds but no processId
- if (!processId && elementBounds && elementBounds.width > 0 && opts.action === 'click') {
- const centerX = elementBounds.x + Math.floor(elementBounds.width / 2);
- const centerY = elementBounds.y + Math.floor(elementBounds.height / 2);
- console.log(` ♿ No processId for "${opts.name}", falling back to coordinate click at (${centerX}, ${centerY})`);
- return {
- success: true,
- clickPoint: { x: centerX, y: centerY },
- error: `Coordinate click fallback — caller should execute mouse click at (${centerX}, ${centerY})`
- };
+ const el = elements[0];
+ processId = (el as any).processId;
+
+ if (!processId && el.bounds?.width > 0 && opts.action === 'click') {
+ const cx = el.bounds.x + Math.floor(el.bounds.width / 2);
+ const cy = el.bounds.y + Math.floor(el.bounds.height / 2);
+ return { success: true, clickPoint: { x: cx, y: cy } };
}
-
if (!processId) {
- return { success: false, error: `No processId for element: ${opts.name || opts.automationId}` };
+ return { success: false, error: `No processId for: ${opts.name ?? opts.automationId}` };
}
}
- const args: string[] = ['-Action', opts.action, '-ProcessId', String(processId)];
- if (opts.name) args.push('-Name', opts.name);
- if (opts.automationId) args.push('-AutomationId', opts.automationId);
- if (opts.controlType) args.push('-ControlType', opts.controlType);
- if (opts.value) args.push('-Value', opts.value);
- return this.runScript('invoke-element.ps1', args);
+ if (PLATFORM === 'win32') {
+ const result = await this.winCmd({
+ cmd: 'invoke-element',
+ processId,
+ action: opts.action,
+ ...(opts.name && { name: opts.name }),
+ ...(opts.automationId && { automationId: opts.automationId }),
+ ...(opts.controlType && { controlType: opts.controlType }),
+ ...(opts.value && { value: opts.value }),
+ }) as any;
+ return result;
+ } else {
+ const args: string[] = ['-Action', opts.action, '-ProcessId', String(processId)];
+ if (opts.name) args.push('-Name', opts.name);
+ if (opts.automationId) args.push('-AutomationId', opts.automationId);
+ if (opts.controlType) args.push('-ControlType', opts.controlType);
+ if (opts.value) args.push('-Value', opts.value);
+ return this.runMacScript('invoke-element.jxa', args);
+ }
}
- // ─── Window Management Helpers (deterministic, no LLM) ────────────
-
- /**
- * Focus (bring to front) a window by title substring or processId.
- * Reliable — uses UIA WindowPattern + Win32 SetForegroundWindow fallback.
- */
- async focusWindow(title?: string, processId?: number): Promise<{ success: boolean; title?: string; processId?: number; error?: string }> {
- const args: string[] = [];
- if (title) args.push('-Title', title);
- if (processId) args.push('-ProcessId', String(processId));
- args.push('-Restore'); // Always restore from minimized
-
+ async focusWindow(
+ title?: string,
+ processId?: number,
+ ): Promise<{ success: boolean; title?: string; processId?: number; error?: string }> {
try {
- const result = await this.runScript('focus-window.ps1', args);
- this.invalidateCache(); // Window state changed
+ let result: any;
+ if (PLATFORM === 'win32') {
+ result = await this.winCmd({
+ cmd: 'focus-window',
+ restore: true,
+ ...(title && { title }),
+ ...(processId && { processId }),
+ });
+ } else {
+ const args: string[] = [];
+ if (title) args.push('-Title', title);
+ if (processId) args.push('-ProcessId', String(processId));
+ args.push('-Restore');
+ result = await this.runMacScript('focus-window.jxa', args);
+ }
+ this.invalidateCache();
return result;
} catch (err) {
return { success: false, error: String(err) };
}
}
- /**
- * Get the currently active/focused window using Win32 GetForegroundWindow.
- * Returns the window info for the actual foreground window, not a heuristic guess.
- */
 async getActiveWindow(): Promise<WindowInfo | null> {
try {
- // Use Win32 API to get actual foreground window
- const fgResult = await this.runScript('get-foreground-window.ps1');
- if (!fgResult.success) return null;
+ let fg: any;
+ if (PLATFORM === 'win32') {
+ fg = await this.winCmd({ cmd: 'get-foreground-window' });
+ } else {
+ fg = await this.runMacScript('get-foreground-window.jxa');
+ }
+ if (!fg?.success) return null;
- // Get full window list to find matching window with full info
const windows = await this.getWindows(true);
- const match = windows.find(w => w.processId === fgResult.processId);
-
+ const match = windows.find(w => w.processId === fg.processId);
if (match) return match;
-
- // Window might be new — construct minimal info from foreground result
+
return {
- handle: fgResult.handle,
- title: fgResult.title,
- processName: fgResult.processName,
- processId: fgResult.processId,
- bounds: { x: 0, y: 0, width: 0, height: 0 }, // Unknown without full query
- isMinimized: false, // Foreground window can't be minimized
+ handle: fg.handle,
+ title: fg.title,
+ processName: fg.processName,
+ processId: fg.processId,
+ bounds: { x: 0, y: 0, width: 0, height: 0 },
+ isMinimized: false,
};
} catch {
- // Fallback: return first non-minimized window (better than nothing)
try {
const windows = await this.getWindows(true);
- return windows.find(w => !w.isMinimized) || null;
+ return windows.find(w => !w.isMinimized) ?? null;
} catch {
return null;
}
}
}
- /**
- * Find a window by app name/title (fuzzy match).
- */
 async findWindow(appNameOrTitle: string): Promise<WindowInfo | null> {
const lower = appNameOrTitle.toLowerCase();
const windows = await this.getWindows();
+ return (
+ windows.find(w => w.processName.toLowerCase() === lower) ??
+ windows.find(w => w.title.toLowerCase().includes(lower)) ??
+ windows.find(w => w.processName.toLowerCase().includes(lower)) ??
+ null
+ );
+ }
- // Exact process name match
- let match = windows.find(w => w.processName.toLowerCase() === lower);
- if (match) return match;
+ async getFocusedElement(): Promise<FocusedElementInfo | null> {
+ if (PLATFORM === 'win32') {
+ try {
+ const result = await this.winCmd({ cmd: 'get-focused-element' }) as any;
+ if (!result?.success) return null;
+ return {
+ name: result.name ?? '',
+ automationId: result.automationId ?? '',
+ controlType: result.controlType ?? '',
+ className: result.className ?? '',
+ processId: result.processId ?? 0,
+ isEnabled: result.isEnabled ?? true,
+ bounds: result.bounds ?? { x: 0, y: 0, width: 0, height: 0 },
+ value: result.value ?? '',
+ };
+ } catch {
+ return null;
+ }
+ }
+ if (PLATFORM === 'darwin') {
+ try {
+ const script = path.join(MAC_SCRIPTS_DIR, 'get-focused-element.jxa');
+ const { stdout } = await execFileAsync('osascript', ['-l', 'JavaScript', script], {
+ timeout: MAC_SCRIPT_TIMEOUT,
+ });
+ const result = JSON.parse(stdout.trim());
+ if (!result) return null;
+ return {
+ name: result.name ?? '',
+ automationId: result.automationId ?? '',
+ controlType: result.controlType ?? '',
+ className: result.className ?? '',
+ processId: result.processId ?? 0,
+ isEnabled: result.isEnabled ?? true,
+ bounds: result.bounds ?? { x: 0, y: 0, width: 0, height: 0 },
+ value: result.value ?? '',
+ };
+ } catch {
+ return null;
+ }
+ }
+ // Linux: not yet implemented (AT-SPI planned)
+ return null;
+ }
- // Title contains
- match = windows.find(w => w.title.toLowerCase().includes(lower));
- if (match) return match;
+ // ── Clipboard ─────────────────────────────────────────────────────────────
- // Process name contains
- match = windows.find(w => w.processName.toLowerCase().includes(lower));
- if (match) return match;
+ /**
+ * Read text from the OS clipboard.
+ * Returns empty string on error, timeout, or non-text content.
+ */
+ async readClipboard(): Promise<string> {
+ try {
+ if (PLATFORM === 'win32') {
+ const { stdout } = await execFileAsync('powershell.exe', [
+ '-NoProfile', '-Command', 'Get-Clipboard',
+ ], { timeout: 2000 });
+ return stdout?.trim() ?? '';
+ } else {
+ // macOS: pbpaste
+ const { stdout } = await execFileAsync('pbpaste', [], { timeout: 2000 });
+ return stdout?.trim() ?? '';
+ }
+ } catch {
+ return '';
+ }
+ }
- return null;
+ /**
+ * Write text to the OS clipboard.
+ * Silently fails on error or timeout.
+ */
+ async writeClipboard(text: string): Promise<void> {
+ try {
+ if (PLATFORM === 'win32') {
+ // -EncodedCommand with Base64-encoded UTF-16LE keeps the command line itself
+ // free of quoting problems; single quotes in the text are doubled for the PS literal.
+ const utf16 = Buffer.from(
+ `Set-Clipboard -Value '${text.replace(/'/g, "''")}'`,
+ 'utf16le',
+ );
+ await execFileAsync('powershell.exe', [
+ '-NoProfile', '-EncodedCommand', utf16.toString('base64'),
+ ], { timeout: 2000 });
+ } else {
+ // macOS: pipe to pbcopy via shell
+ await new Promise<void>((resolve, reject) => {
+ const proc = execFile('pbcopy', [], { timeout: 2000 }, (err) => {
+ if (err) reject(err); else resolve();
+ });
+ proc.stdin?.write(text);
+ proc.stdin?.end();
+ });
+ }
+ } catch {
+ // Silently fail — clipboard write is best-effort
+ }
}
/**
- * Get a text summary of the UI for the AI.
- * Uses combined script (1 PowerShell spawn instead of 3).
- * Includes windows list, taskbar buttons, and focused window UI tree.
+ * Get a text summary of the UI for the LLM.
+ * Always reads fresh on Windows (PSRunner is cheap); respects 2s cache otherwise.
*/
 async getScreenContext(focusedProcessId?: number): Promise<string> {
- // ── Perf Opt #3: Return cached context if fresh ──
if (
this.screenContextCache &&
Date.now() - this.screenContextCache.timestamp < this.SCREEN_CONTEXT_CACHE_TTL
@@ -360,88 +442,82 @@ export class AccessibilityBridge {
return this.screenContextCache.context;
}
- try {
- // Use combined script for single PowerShell spawn
- const args: string[] = [];
- if (focusedProcessId) args.push('-FocusedProcessId', String(focusedProcessId));
- args.push('-MaxDepth', '2');
-
- let context = '';
+ let context = '';
+ let treeError = false;
- try {
- const combined = await this.runScript('get-screen-context.ps1', args);
+ try {
+ if (PLATFORM === 'win32') {
+ const combined = await this.winCmd({
+ cmd: 'get-screen-context',
+ maxDepth: MAX_DEPTH,
+ ...(focusedProcessId && { focusedProcessId }),
+ }) as any;
- // Format windows
- if (combined.windows && Array.isArray(combined.windows)) {
- context += `WINDOWS:\n`;
+ if (combined.windows?.length) {
+ this.windowCache = { windows: combined.windows, timestamp: Date.now() };
+ context += 'WINDOWS:\n';
for (const w of combined.windows) {
context += ` ${w.isMinimized ? '🔽' : '🟢'} [${w.processName}] "${w.title}" pid:${w.processId}`;
if (!w.isMinimized) context += ` at (${w.bounds.x},${w.bounds.y}) ${w.bounds.width}x${w.bounds.height}`;
- context += `\n`;
+ context += '\n';
}
- // Update window cache from combined result
- this.windowCache = { windows: combined.windows, timestamp: Date.now() };
}
- // Format UI tree (already filtered to interactive elements by the script)
if (combined.uiTree) {
- context += `\nFOCUSED WINDOW UI TREE:\n`;
- context += this.formatTree(Array.isArray(combined.uiTree) ? combined.uiTree : [combined.uiTree], ' ');
+ context += '\nFOCUSED WINDOW UI TREE:\n';
+ context += this.formatTree(
+ Array.isArray(combined.uiTree) ? combined.uiTree : [combined.uiTree],
+ ' ',
+ );
}
- } catch {
- // Fallback to separate calls if combined script fails
+ } else {
+ // macOS — separate script calls
const windows = await this.getWindows();
- context += `WINDOWS:\n`;
+ context += 'WINDOWS:\n';
for (const w of windows) {
context += ` ${w.isMinimized ? '🔽' : '🟢'} [${w.processName}] "${w.title}" pid:${w.processId}`;
if (!w.isMinimized) context += ` at (${w.bounds.x},${w.bounds.y}) ${w.bounds.width}x${w.bounds.height}`;
- context += `\n`;
+ context += '\n';
}
-
if (focusedProcessId) {
try {
- const args = ['-FocusedProcessId', String(focusedProcessId), '-MaxDepth', '2'];
- const result = await this.runScript('get-screen-context.ps1', args);
+ const result = await this.runMacScript('get-screen-context.jxa', [
+ '-FocusedProcessId', String(focusedProcessId),
+ '-MaxDepth', String(MAX_DEPTH),
+ ]);
const tree = result?.uiTree ? [result.uiTree] : [];
context += `\nFOCUSED WINDOW UI TREE (pid:${focusedProcessId}):\n`;
- context += this.formatTree(Array.isArray(tree) ? tree : [tree], ' ');
- } catch { /* tree query failed, skip */ }
+ context += this.formatTree(tree, ' ');
+ } catch { /* skip */ }
}
}
+ } catch (err) {
+ context += `\n[A11y tree unavailable: ${err}]\n`;
+ treeError = true;
+ }
- // Include cached taskbar buttons (refreshed every 30s)
+ // Always append focused element — even when the tree query failed, focus info is critical
+ if (PLATFORM === 'win32') {
try {
- let tbButtons: UIElement[] = [];
- if (this.taskbarCache && Date.now() - this.taskbarCache.timestamp < this.TASKBAR_CACHE_TTL) {
- tbButtons = this.taskbarCache.buttons;
- } else {
- const explorerPid = await this.getExplorerProcessId();
- if (explorerPid) {
- const taskbarButtons = await this.findElement({ controlType: 'Button' });
- tbButtons = taskbarButtons.filter((b: any) =>
- b.processId === explorerPid &&
- (b.className?.includes('Taskbar') || b.className?.includes('MSTaskList'))
- );
- this.taskbarCache = { buttons: tbButtons, timestamp: Date.now() };
- }
- }
- if (tbButtons.length > 0) {
- context += `\nTASKBAR APPS:\n`;
- for (const b of tbButtons) {
- context += ` 📌 "${b.name}" at (${b.bounds.x},${b.bounds.y})\n`;
- }
+ const focused = await this.getFocusedElement();
+ if (focused) {
+ context += '\nFOCUSED ELEMENT:\n';
+ context += ` [${focused.controlType}] "${focused.name}" id:${focused.automationId} @${focused.bounds.x},${focused.bounds.y}`;
+ if (!focused.isEnabled) context += ' DISABLED';
+ if (focused.value) context += ` value="${focused.value.substring(0, 100)}"`;
+ context += ` pid:${focused.processId}\n`;
}
- } catch { /* taskbar query failed, skip */ }
+ } catch { /* non-critical */ }
+ }
- // Cache the result
- this.screenContextCache = { context, timestamp: Date.now() };
- return context;
- } catch (err) {
- return `(Accessibility unavailable: ${err})`;
+ if (!context.trim()) {
+ return '(Accessibility unavailable)';
}
+
+ this.screenContextCache = { context, timestamp: Date.now() };
+ return context;
}
- /** Interactive control types worth sending to the LLM */
private static readonly INTERACTIVE_TYPES = new Set([
'ControlType.Button', 'ControlType.Edit', 'ControlType.ComboBox',
'ControlType.CheckBox', 'ControlType.RadioButton', 'ControlType.Hyperlink',
@@ -449,97 +525,40 @@ export class AccessibilityBridge {
'ControlType.TabItem', 'ControlType.ListItem', 'ControlType.TreeItem',
'ControlType.Slider', 'ControlType.ScrollBar', 'ControlType.ToolBar',
'ControlType.Document', 'ControlType.DataItem',
+ 'ControlType.Pane', 'ControlType.Custom', 'ControlType.Group',
+ 'ControlType.Text',
]);
- /** Max chars for accessibility context sent to LLM */
- private static readonly MAX_CONTEXT_CHARS = 3000;
+ private static readonly MAX_CONTEXT_CHARS = 6000; // raised for deeper Electron/WebView2 trees
private formatTree(elements: UIElement[], indent: string): string {
let result = '';
for (const el of elements) {
- // Only include interactive elements or those with useful names
const isInteractive = AccessibilityBridge.INTERACTIVE_TYPES.has(el.controlType);
- const hasName = !!(el.name && el.name.trim());
+ const hasName = !!(el.name?.trim());
+ const hasChildren = el.children && el.children.length > 0;
+ // Show element if interactive or named; skip unnamed non-interactive LEAVES only
if (isInteractive || hasName) {
- const name = el.name ? `"${el.name}"` : '';
- const id = el.automationId ? `id:${el.automationId}` : '';
+ const name = el.name ? `"${el.name}"` : '';
+ const id = el.automationId ? `id:${el.automationId}` : '';
const bounds = `@${el.bounds.x},${el.bounds.y}`;
- result += `${indent}[${el.controlType}] ${name} ${id} ${bounds}\n`;
+ const disabled = el.isEnabled === false ? ' DISABLED' : '';
+ result += `${indent}[${el.controlType}] ${name} ${id} ${bounds}${disabled}\n`;
- // Stop adding if we're over the limit
if (result.length > AccessibilityBridge.MAX_CONTEXT_CHARS) {
result += `${indent}... (truncated)\n`;
return result;
}
}
- if (el.children) {
- result += this.formatTree(el.children, indent + ' ');
+ // Always recurse into children — unnamed containers (Pane/Group) in Electron apps
+ // often wrap the actual interactive elements several levels deep
+ if (hasChildren) {
+ result += this.formatTree(el.children!, indent + ' ');
if (result.length > AccessibilityBridge.MAX_CONTEXT_CHARS) return result;
}
}
return result;
}
-
- /**
- * Run a platform-specific accessibility script.
- * Accepts either a direct filename (e.g. 'get-windows.ps1') or
- * a logical name (e.g. 'get-windows') that gets mapped per platform.
- */
- private runScript(scriptName: string, args: string[] = []): Promise {
- return new Promise((resolve, reject) => {
- let command: string;
- let commandArgs: string[];
-
- // Resolve script name — accept both logical names and direct filenames
- const logicalName = scriptName.replace(/\.(ps1|jxa)$/, '');
- const platformScripts = SCRIPT_MAP[PLATFORM] || SCRIPT_MAP['win32'];
- const resolvedScript = platformScripts[logicalName] || scriptName;
-
- if (PLATFORM === 'darwin') {
- const scriptPath = path.join(MAC_SCRIPTS_DIR, resolvedScript);
- command = 'osascript';
- commandArgs = ['-l', 'JavaScript', scriptPath, ...args];
- } else {
- // Windows (default)
- const scriptPath = path.join(SCRIPTS_DIR, resolvedScript);
- command = 'powershell.exe';
- commandArgs = [
- '-NoProfile',
- '-NonInteractive',
- '-ExecutionPolicy', 'Bypass',
- '-File', scriptPath,
- ...args,
- ];
- }
-
- execFile(command, commandArgs, {
- timeout: SCRIPT_TIMEOUT,
- maxBuffer: 1024 * 1024 * 5, // 5MB buffer
- }, (error, stdout, stderr) => {
- if (error) {
- // Include stderr so the real reason (e.g. "not authorized to send Apple events") is visible
- const stderrDetail = typeof stderr === 'string' && stderr.trim() ? ` — ${stderr.trim()}` : '';
- const fullMessage = error.message + stderrDetail;
- console.error(`Accessibility script error (${resolvedScript}): ${fullMessage}`);
- reject(new Error(fullMessage));
- return;
- }
-
- try {
- const result = JSON.parse(stdout.trim());
- if (result.error) {
- reject(new Error(result.error));
- } else {
- resolve(result);
- }
- } catch (parseErr) {
- const stderrMsg = stderr ? stderr.trim().substring(0, 300) : '';
- console.error(`Failed to parse ${resolvedScript} output: stdout=${stdout.substring(0, 200)}${stderrMsg ? ' stderr=' + stderrMsg : ''}`);
- reject(parseErr);
- }
- });
- });
- }
}
diff --git a/src/action-router.ts b/src/action-router.ts
index 1c2e90c..dabf883 100644
--- a/src/action-router.ts
+++ b/src/action-router.ts
@@ -7,7 +7,7 @@
*/
import * as os from 'os';
-import { execFile } from 'child_process';
+import { execFile, spawn } from 'child_process';
import { promisify } from 'util';
import { AccessibilityBridge } from './accessibility';
import { NativeDesktop } from './native-desktop';
@@ -48,6 +48,9 @@ const APP_ALIASES: Record" → Ctrl/Cmd+F then type
+ // Only match explicit in-page find requests (with quotes or "in page" qualifier)
+ // NOT intent-based tasks like "find the cheapest flight" or "find a restaurant"
const mod = PLATFORM === 'darwin' ? 'Super' : 'Control'; // Cmd on macOS, Ctrl on Windows/Linux
- const findMatch = rawTask.match(/^(?:find|search in page|search within|search)\s+(.+)$/i);
+ const findMatch = rawTask.match(/^(?:search in page|search within|find in page|find on page)\s+(.+)$/i)
+ || rawTask.match(/^(?:find|search)\s+['"](.+)['"]$/i); // only quoted strings
if (findMatch) {
await this.desktop.keyPress(`${mod}+f`);
await this.delay(200);
@@ -420,6 +427,34 @@ export class ActionRouter {
return null;
}
+ // ─── Helper: Launch Edge with CDP port ─────────────────────────────
+
+ /**
+ * Try to launch Edge with --remote-debugging-port=9222 so CDPDriver can connect.
+ * Tries common installation paths. Returns true if launched successfully.
+ */
+ private async launchEdgeWithCDP(url: string): Promise<boolean> {
+ const edgePaths = [
+ 'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
+ 'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
+ ];
+ for (const edgePath of edgePaths) {
+ try {
+ spawn(edgePath, [
+ '--remote-debugging-port=9222',
+ '--no-first-run',
+ '--no-default-browser-check',
+ url,
+ ], { detached: true, stdio: 'ignore' }).unref();
+ console.log(` 🔌 Launched Edge with CDP port 9222 — ${url}`);
+ return true;
+ } catch {
+ continue;
+ }
+ }
+ return false;
+ }
+
// ─── Handler: Type Text ────────────────────────────────────────────
private async handleType(text: string): Promise {
@@ -469,7 +504,16 @@ export class ActionRouter {
await this.a11y.focusWindow(undefined, browser.processId);
await this.delay(300);
} else {
- // No browser running — launch default browser via OS default handler
+ // No browser running — launch Edge with CDP debugging port for DOM access
+ const launched = await this.launchEdgeWithCDP(fullUrl);
+ if (launched) {
+ await this.delay(2500);
+ return {
+ handled: true,
+ description: `Opened ${fullUrl} in Edge (CDP port 9222 enabled for DOM interaction)`,
+ };
+ }
+ // Fall back to OS default browser
if (PLATFORM === 'darwin') {
await execFileAsync('open', [fullUrl]);
} else {
diff --git a/src/action-verifier.ts b/src/action-verifier.ts
new file mode 100644
index 0000000..e4baad1
--- /dev/null
+++ b/src/action-verifier.ts
@@ -0,0 +1,199 @@
+/**
+ * Action Verifier — wraps desktop actions with post-action verification.
+ *
+ * Read -> Act -> Verify -> Log. Every action checks that it actually worked
+ * using the accessibility bridge (getFocusedElement, get-value).
+ */
+
+import { AccessibilityBridge, FocusedElementInfo } from './accessibility';
+import { NativeDesktop } from './native-desktop';
+
+export interface VerifyResult {
+ success: boolean;
+ error?: string;
+ details?: Record<string, unknown>;
+}
+
+export class ActionVerifier {
+ constructor(
+ private a11y: AccessibilityBridge,
+ private desktop: NativeDesktop,
+ ) {}
+
+ /**
+ * Poll until a condition is met or timeout expires.
+ * Returns true if condition was met, false on timeout.
+ */
+ async pollForCondition(
+ check: () => Promise<boolean>,
+ timeoutMs = 5000,
+ intervalMs = 200,
+ ): Promise<boolean> {
+ const start = Date.now();
+ while (Date.now() - start < timeoutMs) {
+ if (await check()) return true;
+ await new Promise(r => setTimeout(r, intervalMs));
+ }
+ return false;
+ }
+
+ /**
+ * Type text, then verify by reading the focused element's value.
+ */
+ async verifiedType(text: string): Promise<VerifyResult> {
+ const beforeFocus = await this.a11y.getFocusedElement();
+ await this.desktop.typeText(text);
+
+ // Give the UI a moment to update
+ await new Promise(r => setTimeout(r, 100));
+
+ const afterFocus = await this.a11y.getFocusedElement();
+ if (!afterFocus) {
+ return { success: true, details: { warning: 'Could not read focused element after typing' } };
+ }
+
+ // Check if the value contains the typed text
+ if (afterFocus.value && afterFocus.value.includes(text)) {
+ return { success: true, details: { readBack: afterFocus.value } };
+ }
+
+ // WebView2/Electron apps may not expose value through UIA — still report success
+ // but note the verification was inconclusive
+ return {
+ success: true,
+ details: {
+ warning: 'Value readback inconclusive (common in WebView2 apps)',
+ focusedElement: afterFocus.name || afterFocus.controlType,
+ readBack: afterFocus.value || '(empty)',
+ },
+ };
+ }
+
+ /**
+ * Press a key combo, then verify focus changed (or window changed for Ctrl+Enter etc).
+ */
+ async verifiedKeyPress(
+ keyCombo: string,
+ expectation?: {
+ focusShouldChange?: boolean;
+ windowShouldClose?: boolean;
+ expectedControlType?: string;
+ },
+ ): Promise<VerifyResult> {
+ const beforeFocus = await this.a11y.getFocusedElement();
+
+ // Capture window title before action — compose windows close within the same process
+ // (e.g. Outlook compose → main Outlook), so processId alone is not sufficient
+ let beforeWindowTitle = '';
+ if (expectation?.windowShouldClose) {
+ const beforeWindow = await this.a11y.getActiveWindow();
+ beforeWindowTitle = beforeWindow?.title ?? '';
+ }
+
+ await this.desktop.keyPress(keyCombo);
+
+ // Give the UI a moment to settle
+ await new Promise(r => setTimeout(r, 150));
+ this.a11y.invalidateCache();
+
+ if (expectation?.windowShouldClose) {
+ const closed = await this.pollForCondition(async () => {
+ const active = await this.a11y.getActiveWindow();
+ if (!active) return true;
+ if (beforeFocus?.processId && active.processId !== beforeFocus.processId) return true;
+ // Title changed = a different window is now active (e.g. compose closed, inbox visible)
+ if (beforeWindowTitle && active.title !== beforeWindowTitle) return true;
+ return false;
+ }, 3000, 200);
+ return {
+ success: closed,
+ error: closed ? undefined : 'Window did not close after key press',
+ details: { keyCombo, windowClosed: closed },
+ };
+ }
+
+ if (expectation?.focusShouldChange) {
+ const afterFocus = await this.a11y.getFocusedElement();
+ const focusMoved = !beforeFocus || !afterFocus ||
+ beforeFocus.name !== afterFocus.name ||
+ beforeFocus.automationId !== afterFocus.automationId ||
+ beforeFocus.controlType !== afterFocus.controlType;
+
+ if (expectation.expectedControlType && afterFocus) {
+ const typeMatch = afterFocus.controlType.includes(expectation.expectedControlType);
+ return {
+ success: typeMatch,
+ error: typeMatch ? undefined : `Expected ${expectation.expectedControlType}, got ${afterFocus.controlType}`,
+ details: {
+ keyCombo,
+ focusMoved,
+ before: beforeFocus?.name ?? '(none)',
+ after: afterFocus.name ?? '(none)',
+ afterType: afterFocus.controlType,
+ },
+ };
+ }
+
+ return {
+ success: focusMoved,
+ error: focusMoved ? undefined : 'Focus did not change after key press',
+ details: {
+ keyCombo,
+ focusMoved,
+ before: beforeFocus?.name ?? '(none)',
+ after: afterFocus?.name ?? '(none)',
+ },
+ };
+ }
+
+ // No specific expectation — just confirm the key press happened
+ return { success: true, details: { keyCombo } };
+ }
+
+ /**
+ * Click an element by resolving its bounds from the a11y tree, then verify.
+ */
+ async verifiedClick(opts: {
+ name?: string;
+ automationId?: string;
+ controlType?: string;
+ processId?: number;
+ }): Promise<VerifyResult> {
+ const elements = await this.a11y.findElement(opts);
+ if (!elements?.length) {
+ return { success: false, error: `Element not found: ${opts.name ?? opts.automationId}` };
+ }
+
+ const el = elements[0];
+ const b = el.bounds;
+ if (!b || b.width <= 0 || b.height <= 0) {
+ return { success: false, error: `Element has no valid bounds: ${opts.name ?? opts.automationId}` };
+ }
+
+ const cx = b.x + Math.floor(b.width / 2);
+ const cy = b.y + Math.floor(b.height / 2);
+
+ await this.desktop.mouseClick(cx, cy);
+ this.a11y.invalidateCache();
+
+ // Brief settle, then check if focus moved to the clicked element
+ await new Promise(r => setTimeout(r, 150));
+ const afterFocus = await this.a11y.getFocusedElement();
+
+ return {
+ success: true,
+ details: {
+ clicked: { x: cx, y: cy },
+ elementName: el.name,
+ focusedAfter: afterFocus?.name ?? '(unknown)',
+ },
+ };
+ }
+
+ /**
+ * Get the current focused element — convenience wrapper.
+ */
+ async getFocused(): Promise<FocusedElementInfo | null> {
+ return this.a11y.getFocusedElement();
+ }
+}
diff --git a/src/agent.ts b/src/agent.ts
index 5977fd8..2fc485d 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -36,9 +36,16 @@ import { AccessibilityBridge } from './accessibility';
import { ActionRouter } from './action-router';
import { SafetyTier } from './types';
import { ComputerUseBrain } from './computer-use';
+import { GenericComputerUse, isGenericComputerUseSupported } from './generic-computer-use';
import { A11yReasoner } from './a11y-reasoner';
+import { OcrEngine } from './ocr-engine';
+import { OcrReasoner } from './ocr-reasoner';
+import { SkillCache } from './skill-cache';
+import { TaskLogger, CompletionStatus } from './task-logger';
+import { WorkspaceState } from './workspace-state';
+import { TaskVerifier } from './verifiers';
+import { DeterministicFlows } from './deterministic-flows';
import { BrowserLayer } from './browser-layer';
-import { SmartInteractionLayer } from './smart-interaction';
import { loadPipelineConfig } from './doctor';
import { detectProvider, type PipelineConfig } from './providers';
import type { ClawdConfig, AgentState, TaskResult, StepResult, InputAction, A11yAction } from './types';
@@ -55,9 +62,16 @@ export class Agent {
private a11y: AccessibilityBridge;
private router: ActionRouter;
private computerUse: ComputerUseBrain | null = null;
+ private genericComputerUse: GenericComputerUse | null = null;
private reasoner: A11yReasoner | null = null;
+ private ocrEngine: OcrEngine;
+ private ocrReasoner: OcrReasoner | null = null;
+ private skillCache: SkillCache;
+ private deterministicFlows: DeterministicFlows;
private browserLayer: BrowserLayer | null = null;
- private smartInteraction: SmartInteractionLayer | null = null;
+ private logger: TaskLogger;
+ private workspace: WorkspaceState;
+ private verifier: TaskVerifier;
private config: ClawdConfig;
private hasApiKey: boolean;
private state: AgentState = {
@@ -75,14 +89,32 @@ export class Agent {
this.safety = new SafetyLayer(config);
this.a11y = new AccessibilityBridge();
this.router = new ActionRouter(this.a11y, this.desktop);
+ this.deterministicFlows = new DeterministicFlows(this.a11y, this.desktop);
+ this.logger = new TaskLogger();
+ this.workspace = new WorkspaceState();
// Load pipeline config from doctor (if available)
const pipelineConfig = loadPipelineConfig();
+ this.verifier = new TaskVerifier(this.a11y, pipelineConfig ?? undefined);
if (pipelineConfig && pipelineConfig.layer2.enabled) {
- this.reasoner = new A11yReasoner(this.a11y, pipelineConfig);
+ this.reasoner = new A11yReasoner(this.a11y, this.desktop, pipelineConfig);
console.log(`🧠 Layer 2 (Accessibility Reasoner): ${pipelineConfig.layer2.model}`);
}
+ // OCR-first pipeline with skill cache
+ this.ocrEngine = new OcrEngine();
+ this.skillCache = new SkillCache();
+ this.skillCache.load();
+
+ if (this.ocrEngine.isAvailable() && pipelineConfig && pipelineConfig.layer2.enabled) {
+ this.ocrReasoner = new OcrReasoner(this.ocrEngine, this.desktop, this.a11y, pipelineConfig);
+ console.log(`👁️ Layer 2.5 (OCR Reasoner): enabled — OCR-first pipeline active`);
+ }
+ const skillStats = this.skillCache.getStats();
+ if (skillStats.total > 0) {
+ console.log(`📚 Layer 2 (Skill Cache): ${skillStats.total} cached skills`);
+ }
+
// hasApiKey gates LLM decomposition — true if cloud key OR local LLM (Ollama) is available
const hasCloudKey = !!(config.ai.apiKey && config.ai.apiKey.length > 0);
const hasVisionKey = !!(config.ai.visionApiKey && config.ai.visionApiKey.length > 0);
@@ -112,7 +144,7 @@ export class Agent {
if (!this.hasApiKey) {
console.log(`⚡ Running in offline mode (no API key or local LLM). Local parser + action router only.`);
- console.log(` To unlock AI fallback, configure your OpenClaw agent provider (or set AI_API_KEY in standalone mode) and run: clawdcursor doctor`);
+ console.log(` To unlock AI fallback, set AI_API_KEY (or run: clawdcursor doctor)`);
}
}
@@ -173,36 +205,48 @@ export class Agent {
async connect(): Promise<void> {
await this.desktop.connect();
+ // Minimize the terminal/console window running this agent so it never
+ // appears in screenshots and the vision LLM can't accidentally close it.
+ if (!IS_MAC) {
+ try {
+ await execFileAsync('powershell.exe', ['-Command',
+ `Add-Type -TypeDefinition @"
+using System;
+using System.Runtime.InteropServices;
+public class WinAPI {
+ [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow);
+ [DllImport("kernel32.dll")] public static extern IntPtr GetConsoleWindow();
+}
+"@
+[WinAPI]::ShowWindow([WinAPI]::GetConsoleWindow(), 2)` // 2 = SW_SHOWMINIMIZED (note: SW_MINIMIZE is 6)
+ ]);
+ } catch { /* non-fatal — just cosmetic */ }
+ }
+
// Initialize Browser Layer (Layer 0) — Playwright for browser tasks
const pipelineConfig = loadPipelineConfig();
- const textModel = this.config.ai.model || pipelineConfig?.layer2?.model || 'unavailable';
- const visionModel = this.config.ai.visionModel || pipelineConfig?.layer3?.model || 'unavailable';
+ // Pipeline config (from .clawdcursor-config.json) takes priority for actual model selection
+ const textModel = pipelineConfig?.layer2?.model || this.config.ai.model || 'unavailable';
+ const visionModel = pipelineConfig?.layer3?.model || this.config.ai.visionModel || 'unavailable';
const textProvider = this.inferProviderLabel(
this.config.ai.textApiKey || this.config.ai.apiKey,
- this.config.ai.textBaseUrl || this.config.ai.baseUrl || pipelineConfig?.layer2?.baseUrl,
- this.config.ai.provider,
+ pipelineConfig?.layer2?.baseUrl || this.config.ai.textBaseUrl || this.config.ai.baseUrl,
+ pipelineConfig?.providerKey || this.config.ai.provider,
);
const visionProvider = this.inferProviderLabel(
this.config.ai.visionApiKey || this.config.ai.apiKey,
- this.config.ai.visionBaseUrl || this.config.ai.baseUrl || pipelineConfig?.layer3?.baseUrl,
- this.config.ai.provider,
+ pipelineConfig?.layer3?.baseUrl || this.config.ai.visionBaseUrl || this.config.ai.baseUrl,
+ pipelineConfig?.providerKey || this.config.ai.provider,
);
console.log(`🤖 Active models: text=${textModel} (${textProvider}) | vision=${visionModel} (${visionProvider})`);
this.browserLayer = new BrowserLayer(this.config, pipelineConfig || {} as PipelineConfig);
- console.log(`🌐 Layer 0 (Browser): Playwright — CDP or managed Chromium`);
+ // Browser layer initialized
- // Initialize Smart Interaction Layer (Layer 1.5) — CDPDriver + UIDriver
- this.smartInteraction = new SmartInteractionLayer(
- this.a11y,
- this.config,
- pipelineConfig || null,
- );
- if (this.smartInteraction.isAvailable()) {
- console.log(`🧩 Layer 1.5 (Smart Interaction): CDPDriver + UIDriver — 1 LLM call planning`);
- }
+ // Warm up the PSRunner bridge so assembly loading happens in background
+ this.a11y.warmup().catch(() => {});
// Initialize Computer Use for Anthropic or mixed-provider pipeline overrides
const computerUseOverrides = pipelineConfig?.layer3?.computerUse
@@ -216,13 +260,23 @@ export class Agent {
if (ComputerUseBrain.isSupported(this.config, computerUseOverrides)) {
this.computerUse = new ComputerUseBrain(this.config, this.desktop, this.a11y, this.safety, computerUseOverrides);
+ this.computerUse.setVerifier(this.verifier);
console.log(`🖥️ Computer Use API enabled (Anthropic native tool + accessibility)`);
+ } else if (isGenericComputerUseSupported(this.config, pipelineConfig)) {
+ // Non-Anthropic provider with a vision model — use the universal OpenAI-compat loop
+ this.genericComputerUse = new GenericComputerUse(this.config, this.desktop, this.a11y, this.safety, pipelineConfig);
+ this.genericComputerUse.setVerifier(this.verifier);
+ const visionModel = pipelineConfig?.layer3?.model || this.config.ai.visionModel || 'unknown';
+ console.log(`🌐 Generic Computer Use enabled (${visionModel})`);
}
const size = this.desktop.getScreenSize();
this.brain.setScreenSize(size.width, size.height);
}
+ /** Maximum wall-clock time for a single task (10 minutes) */
+ private static readonly TASK_TIMEOUT_MS = 10 * 60 * 1000;
+
async executeTask(task: string): Promise<TaskResult> {
// Atomic concurrency guard — prevent TOCTOU race on simultaneous /task requests
if (this.state.status !== 'idle') {
@@ -236,7 +290,41 @@ export class Agent {
this.aborted = false;
const startTime = Date.now();
+ // Wrap the entire task pipeline with a global wall-clock timeout.
+ // Individual layers have their own iteration limits, but a deadlocked
+ // LLM call or runaway Computer Use loop could still exceed 10 min.
+ let timeoutHandle: ReturnType<typeof setTimeout> | null = null;
+ const timeoutPromise = new Promise<TaskResult>((resolve) => {
+ timeoutHandle = setTimeout(() => {
+ this.aborted = true;
+ console.warn(`\n⏱ Task timed out after ${Agent.TASK_TIMEOUT_MS / 60000} minutes`);
+ resolve({
+ success: false,
+ steps: [{ action: 'error', description: `Task timed out after ${Agent.TASK_TIMEOUT_MS / 60000} minutes`, success: false, timestamp: Date.now() }],
+ duration: Date.now() - startTime,
+ });
+ }, Agent.TASK_TIMEOUT_MS);
+ });
+
+ try {
+ return await Promise.race([this._executeTaskInternal(task, startTime), timeoutPromise]);
+ } finally {
+ // Always clear the 10-minute timer so it doesn't keep the process alive
+ // and hold a closure reference to this Agent instance after the task ends.
+ if (timeoutHandle !== null) clearTimeout(timeoutHandle);
+ }
+ }
+
+ private async _executeTaskInternal(task: string, startTime: number): Promise<TaskResult> {
+
console.log(`\n🐾 Starting task: ${task}`);
+ this.logger.startTask(task);
+ this.workspace.reset();
+ // Reset Layer 2 state between tasks — clears circuit breaker, disabledApps, CDP cache
+ if (this.reasoner) this.reasoner.reset();
+
+ // Create isolated virtual desktop for this task
+ await this.createIsolatedDesktop();
// Setup debug directory (only when --debug flag is set)
const debugDir = this.config.debug ? path.join(process.cwd(), 'debug') : null;
@@ -268,26 +356,26 @@ export class Agent {
if (preprocessed) {
// Open app/browser if LLM identified one
if (preprocessed.app) {
- console.log(`\n🔀 Pre-processing: opening "${preprocessed.app}" first`);
+ console.log(` Opening "${preprocessed.app}"...`);
try {
const openResult = await this.router.route(`open ${preprocessed.app}`);
if (openResult.handled) {
- console.log(` ✅ "${preprocessed.app}" opened via Action Router`);
- priorContext.push(`Opened "${preprocessed.app}" — it is now the active, focused window`);
- await new Promise(r => setTimeout(r, 2000));
-
- // Maximize the window
+ // app opened
+ priorContext.push(`Opened "${preprocessed.app}" — it is ALREADY the active, focused, maximized window. Do NOT reopen it. Do NOT press Windows key. Start interacting with it IMMEDIATELY.`);
+ // Wait for app to render its UI tree
+ const heavyApps = /outlook|word|excel|teams|powerpoint/i;
+ const settleMs = heavyApps.test(preprocessed.app!) ? 2000 : 500;
+ await new Promise(r => setTimeout(r, settleMs));
+
+ // Bring the app window to focus — the text LLM handles all further interaction
try {
- await this.router.route('maximize window');
- await new Promise(r => setTimeout(r, 500));
- try {
- await execFileAsync('powershell.exe', ['-Command',
- 'Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait("{ESC}")'
- ]);
- } catch { /* non-critical */ }
- await new Promise(r => setTimeout(r, 300));
- priorContext.push('Window maximized to full screen');
- } catch { /* not critical */ }
+ const appWin = await this.a11y.findWindow(preprocessed.app!);
+ if (appWin) {
+ await this.a11y.focusWindow(undefined, appWin.processId);
+ await new Promise(r => setTimeout(r, 300));
+ console.log(` ✅ ${preprocessed.app} focused (pid ${appWin.processId})`);
+ }
+ } catch { /* non-critical — app may self-focus */ }
}
} catch (err) {
console.log(` ⚠️ Pre-open failed: ${err} — proceeding with full task`);
@@ -295,6 +383,7 @@ export class Agent {
}
// Navigate to URL if identified — do it now via keyboard shortcut
+ // The preprocessor LLM already outputs smart URLs (e.g. docs.google.com/document/create)
if (preprocessed.navigate) {
// If no app specified but navigation requested, open default browser first
if (!preprocessed.app) {
@@ -304,21 +393,14 @@ export class Agent {
const openResult = await this.router.route(`open ${defaultBrowser}`);
if (openResult.handled) {
console.log(` ✅ "${defaultBrowser}" opened via Action Router`);
- priorContext.push(`Opened "${defaultBrowser}" — it is now the active, focused window`);
- await new Promise(r => setTimeout(r, 2000));
-
- // Maximize the window
+ priorContext.push(`Opened "${defaultBrowser}" — it is now the active, focused window, maximized to full screen`);
+ // Dismiss Snap Assist if it appeared (Win11 quirk with Super+Up)
try {
- await this.router.route('maximize window');
- await new Promise(r => setTimeout(r, 500));
- try {
- await execFileAsync('powershell.exe', ['-Command',
- 'Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait("{ESC}")'
- ]);
- } catch { /* non-critical */ }
- await new Promise(r => setTimeout(r, 300));
- priorContext.push('Window maximized to full screen');
- } catch { /* not critical */ }
+ await execFileAsync('powershell.exe', ['-Command',
+ 'Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait("{ESC}")'
+ ]);
+ } catch { /* non-critical */ }
+ await new Promise(r => setTimeout(r, 300));
}
} catch (err) {
console.log(` ⚠️ Default browser open failed: ${err} — proceeding with navigation attempt`);
@@ -327,14 +409,28 @@ export class Agent {
console.log(` 🌐 Navigating to ${preprocessed.navigate}...`);
try {
- await this.desktop.keyPress('Control+l');
- await new Promise(r => setTimeout(r, 300));
+ // Ensure browser window has focus before typing URL
+ const windows = await this.a11y.getWindows().catch(() => []);
+ const browserWin = windows.find(w => /msedge|chrome/i.test(w.processName) && !w.isMinimized);
+ if (browserWin) {
+ await this.a11y.focusWindow(undefined, browserWin.processId).catch(() => null);
+ await new Promise(r => setTimeout(r, 400));
+ }
+ // Open a NEW tab to avoid conflicts with existing tab content/CDP state
+ await this.desktop.keyPress('Control+t');
+ await new Promise(r => setTimeout(r, 500));
+ // Address bar is already focused in a new tab — type URL directly
await this.desktop.typeText(preprocessed.navigate);
await new Promise(r => setTimeout(r, 200));
await this.desktop.keyPress('Return');
- await new Promise(r => setTimeout(r, 2000)); // wait for page load
- priorContext.push(`Navigated to ${preprocessed.navigate} — page is loading`);
- console.log(` ✅ Navigated to ${preprocessed.navigate}`);
+ await new Promise(r => setTimeout(r, 3500)); // wait for page load + possible redirects
+ // Re-focus browser after navigation (terminal may have stolen focus)
+ if (browserWin) {
+ await this.a11y.focusWindow(undefined, browserWin.processId).catch(() => null);
+ await new Promise(r => setTimeout(r, 400));
+ }
+ priorContext.push(`Navigated to ${preprocessed.navigate} — page is loading in new tab. Browser is focused.`);
+ console.log(` ✅ Navigated to ${preprocessed.navigate} (new tab)`);
} catch (err) {
console.log(` ⚠️ Navigation failed: ${err} — Computer Use will handle it`);
priorContext.push(`Navigate to: ${preprocessed.navigate} (attempted but may need retry)`);
@@ -381,6 +477,7 @@ export class Agent {
};
console.log(`\n⏱️ Task took ${(result.duration / 1000).toFixed(1)}s with ${result.steps.length} steps (0 LLM calls — Playwright)`);
this.state = { status: 'idle', stepsCompleted: result.steps.length, stepsTotal: result.steps.length };
+ await this.closeIsolatedDesktop();
return result;
}
// Browser layer couldn't handle it — fall through
@@ -398,7 +495,7 @@ export class Agent {
console.log(`\n⚡ Action Router: attempting "${task}"`);
const routeResult = await this.router.route(task);
const telemetry = this.router.getTelemetry();
- console.log(` 📊 Telemetry: ${JSON.stringify(telemetry)}`);
+ // Telemetry logged silently
if (routeResult.handled) {
const step: StepResult = {
action: 'action-router',
@@ -413,41 +510,17 @@ export class Agent {
};
console.log(`\n⏱️ Task took ${(result.duration / 1000).toFixed(1)}s — Action Router (0 LLM calls, $0)`);
this.state = { status: 'idle', stepsCompleted: 1, stepsTotal: 1 };
+ await this.closeIsolatedDesktop();
return result;
}
console.log(` ⚡ Action Router: not matched — falling through`);
}
- // ── Layer 1.5: Smart Interaction (CDPDriver + UIDriver) ──
- // Uses 1 cheap LLM call to read context + plan, then executes all steps free.
- // For browser tasks: CDPDriver via CDP port 9222
- // For native tasks: UIDriver via Windows UI Automation
- if (this.smartInteraction?.isAvailable()) {
- this.state.status = 'acting';
- console.log(`\n🧩 Smart Interaction Layer: attempting "${task}"`);
- const smartResult = await this.smartInteraction.tryHandle(task, isBrowserTask);
- if (smartResult.handled && smartResult.success) {
- const result: TaskResult = {
- success: true,
- steps: smartResult.steps,
- duration: Date.now() - startTime,
- };
- console.log(`\n⏱️ Task took ${(result.duration / 1000).toFixed(1)}s with ${result.steps.length} steps (${smartResult.llmCalls} LLM call — Smart Interaction)`);
- this.state = { status: 'idle', stepsCompleted: result.steps.length, stepsTotal: result.steps.length };
- return result;
- }
- // Smart Interaction couldn't handle it — fall through to Computer Use
- if (!smartResult.handled) {
- console.log(` 🧩 Smart Interaction: falling through to Computer Use — ${smartResult.description || 'not handled'}`);
- }
- }
-
- // ── Layer 2: Computer Use / Decompose+Route (expensive fallback) ──
- if (this.computerUse) {
- return this.executeWithComputerUse(task, debugDir, startTime, priorContext);
- } else {
- return this.executeWithDecomposeAndRoute(task, debugDir, startTime);
- }
+ // ── Layer 2+: Decompose → A11y Reasoner → vision fallback per subtask ──
+ // Always decompose first so the a11y reasoner gets single-step subtasks.
+ // Computer Use is used as a per-subtask fallback inside executeWithDecomposeAndRoute,
+ // not as a first-class handler for the whole task.
+ return this.executeWithDecomposeAndRoute(task, debugDir, startTime, priorContext);
}
/**
@@ -534,9 +607,24 @@ export class Agent {
// Need a text model to pre-process
if (!this.hasApiKey && !this.reasoner) return null;
- // Skip pre-processing for very simple tasks (single action)
- const simplePatterns = /^(scroll|click|type|press|copy|paste|undo|redo|save|close|minimize|maximize)\b/i;
- if (simplePatterns.test(task)) return null;
+ // Skip pre-processing only for genuinely simple, non-compound tasks.
+ // A compound task ("open X and send email", "open X then type Y") MUST go through
+ // pre-processing so it gets decomposed properly.
+ const hasCompound = /(?:,|\b(?:and|then)\b)/i.test(task.trim());
+ if (!hasCompound) {
+ const routerHandled = [
+ /^(?:open|launch|start|run)\s+\S/i,
+ /^(?:type|enter|write|input)\s+/i,
+ /^(?:go to|navigate to|visit|browse to)\s+/i,
+ /^(?:press|hit)\s+/i,
+ /^(?:click|tap)\s+/i,
+ /^(?:focus|switch to|bring up|activate)\s+/i,
+ /^(?:close|minimize|maximize)\s+/i,
+ /^(?:find|search in page)\s+/i,
+ /^(?:scroll|copy|paste|undo|redo|save|refresh|back|forward)\b/i,
+ ];
+ if (routerHandled.some(p => p.test(task.trim()))) return null;
+ }
const systemPrompt = `You are a task pre-processor for an AI desktop agent. Parse the user's command into structured JSON.
@@ -553,6 +641,30 @@ RULES:
- CRITICAL: If the command involves multiple apps (e.g. "copy from X then paste in Y"), the task field MUST include the full chain of remaining actions including switching to other apps
- If the whole task is just "open X", task should be empty string
+SMART URL RULE — VERY IMPORTANT:
+When the task involves creating, searching, or navigating directly to content on a website, use the DIRECT ACTION URL that skips the homepage. The agent navigates to this URL immediately, so it must land on the right page.
+
+Creation URLs:
+- "write in a new google doc" → navigate: "docs.google.com/document/create" (NOT docs.google.com)
+- "create a new spreadsheet" → navigate: "docs.google.com/spreadsheets/create"
+- "create a new presentation" → navigate: "docs.google.com/presentation/create"
+- "create a github repo" → navigate: "github.com/new"
+- "create a new notion page" → navigate: "notion.so/new"
+- "compose an email in gmail" → navigate: "mail.google.com/mail/u/0/#inbox?compose=new"
+- "create a new codepen" → navigate: "codepen.io/pen/"
+- "post on twitter" → navigate: "twitter.com/compose/tweet"
+
+Search URLs (use query parameters to skip manual search):
+- "google search for cats" → navigate: "google.com/search?q=cats"
+- "search google for speed of light" → navigate: "google.com/search?q=speed+of+light"
+- "search youtube for music" → navigate: "youtube.com/results?search_query=music"
+- "search amazon for laptops" → navigate: "amazon.com/s?k=laptops"
+- "search wikipedia for Python" → navigate: "en.wikipedia.org/wiki/Python"
+- "search github for react" → navigate: "github.com/search?q=react"
+For search queries, URL-encode spaces as + and special chars as %XX.
+
+Apply this pattern to ANY website you know has a direct create/search/action URL. If unsure, use the base URL.
+
VALIDATION RULE: The task field combined with app+navigate must account for EVERY action in the original command. If you drop any part, the agent will fail.
NEVER RULES:
@@ -580,7 +692,7 @@ Examples:
- "open reddit on edge and scroll down through posts and interact with one" → {"app": "Microsoft Edge", "navigate": "reddit.com", "task": "scroll down through posts and interact with one", "contextHints": ["reddit"]}
- "open wikipedia on edge, copy a sentence, then paste it in google docs" → {"app": "Microsoft Edge", "navigate": "wikipedia.org", "task": "scroll through an article, copy an interesting sentence, then open Google Docs and paste it there", "contextHints": ["wikipedia", "google docs"]}
- "open wikipedia, copy a sentence, then open notepad and paste it" → {"app": null, "navigate": "wikipedia.org", "task": "copy a sentence from wikipedia, then open notepad and paste the sentence", "contextHints": ["wikipedia", "notepad"]}
-- "search for cats on google, copy the first result link, then open email and paste it" → {"app": null, "navigate": "google.com", "task": "search for cats, copy the first result link, then open email application and paste the link", "contextHints": ["google", "email"]}
+- "search for cats on google, copy the first result link, then open email and paste it" → {"app": null, "navigate": "google.com/search?q=cats", "task": "copy the first result link, then open email application and paste the link", "contextHints": ["google", "email"]}
- "open amazon and find a book, then save the title to a text file" → {"app": null, "navigate": "amazon.com", "task": "find a book, copy or note the title, then open text editor and save the title to a file", "contextHints": ["amazon", "text file"]}
- "compare prices between amazon and ebay for laptops" → {"app": null, "navigate": "amazon.com", "task": "search for laptops and note prices, then open ebay in new tab and compare laptop prices", "contextHints": ["amazon", "ebay"]}
- "drag an image from browser to desktop" → {"app": null, "navigate": null, "task": "drag an image from browser window to desktop", "contextHints": ["browser", "desktop"]}`;
@@ -591,13 +703,7 @@ Examples:
let response: string;
- if (this.smartInteraction?.isAvailable()) {
- // Use SmartInteraction's callTextModel (it handles all providers)
- response = await (this.smartInteraction as any).callTextModel(
- `Parse this command: "${task}"`,
- systemPrompt,
- );
- } else if (this.reasoner) {
+ if (this.reasoner) {
// Use reasoner's provider config via fetch
const pipelineConfig = loadPipelineConfig();
if (!pipelineConfig) return null;
@@ -628,6 +734,8 @@ Examples:
const elapsed = Date.now() - startTime;
console.log(` ⚡ Pre-processed in ${elapsed}ms`);
+ this.logger.logStep({ layer: 'preprocess', actionType: 'llm_preprocess', result: 'success', durationMs: elapsed, llmReasoning: response.substring(0, 200) });
+ this.logger.recordLlmCall();
// Parse JSON from response
const jsonMatch = response.match(/\{[\s\S]*\}/);
@@ -668,7 +776,7 @@ Examples:
this.state.status = 'acting';
try {
- const cuResult = await this.computerUse!.executeSubtask(task, debugDir, 0, priorContext);
+ const cuResult = await this.computerUse!.executeSubtask(task, debugDir, 0, priorContext, this.logger);
const result: TaskResult = {
success: cuResult.success,
@@ -686,58 +794,68 @@ Examples:
duration: Date.now() - startTime,
};
} finally {
+ await this.closeIsolatedDesktop();
this.state.status = 'idle';
this.state.currentTask = undefined;
}
}
/**
- * PATH B: Decompose + Route + LLM Fallback
- * For non-Anthropic providers or offline mode.
+ * PATH B: Decompose → A11y Reasoner → Computer Use fallback per subtask.
+ * Always used now — Computer Use runs per-subtask, not on the whole task.
*/
private async executeWithDecomposeAndRoute(
task: string,
debugDir: string | null,
startTime: number,
+ priorContext?: string[],
  ): Promise<TaskResult> {
const steps: StepResult[] = [];
let llmCallCount = 0;
- console.log(` Using decompose → route → LLM fallback pipeline\n`);
+ // decompose → a11y → vision pipeline
try {
// ─── Decompose ───────────────────────────────────────────────
- console.log(`📋 Decomposing task...`);
+ // decomposing task
const decompositionStart = Date.now();
let subtasks: string[];
- if (this.hasApiKey) {
+ // If pre-processing already ran (priorContext exists), the task has been refined
+ // by the LLM. Skip the local parser — it misinterprets creative/contextual tasks
+ // as literal commands (e.g., "write a sentence on dogs" → "type a sentence on dogs").
+ // The task goes straight to Layer 2 (A11y Reasoner) which can see the screen and reason.
+ if (priorContext && priorContext.length > 0) {
+ subtasks = [task];
+ console.log(` ⚡ Pre-processed task — straight to Layer 2 (${Date.now() - decompositionStart}ms)`);
+ } else {
+ // No pre-processing context — try local parser first (instant, no API call)
+ const localResult = this.parser.decomposeTask(task);
+ if (localResult) {
+ subtasks = localResult;
+ console.log(` ⚡ Local parser handled in ${Date.now() - decompositionStart}ms (offline)`);
+ } else if (this.hasApiKey) {
console.log(` 🧠 Using LLM to decompose task...`);
subtasks = await this.brain.decomposeTask(task);
llmCallCount = 1;
console.log(` Decomposed via LLM in ${Date.now() - decompositionStart}ms`);
} else {
- const localResult = this.parser.decomposeTask(task);
- if (localResult) {
- subtasks = localResult;
- console.log(` ⚡ Local parser handled in ${Date.now() - decompositionStart}ms (offline)`);
- } else {
- console.log(` ❌ Task too complex for offline mode.`);
- return {
- success: false,
- steps: [{ action: 'error', description: 'Task too complex for offline mode. Configure OpenClaw agent provider (or set AI_API_KEY in standalone mode) to unlock AI fallback.', success: false, timestamp: Date.now() }],
- duration: Date.now() - startTime,
- };
- }
+ console.log(` ❌ Task too complex for offline mode.`);
+ return {
+ success: false,
+ steps: [{ action: 'error', description: 'Task too complex for offline mode. Set AI_API_KEY or run clawdcursor doctor to unlock AI fallback.', success: false, timestamp: Date.now() }],
+ duration: Date.now() - startTime,
+ };
}
+ } // close the priorContext else block
console.log(` ${subtasks.length} subtask(s):`);
subtasks.forEach((st, i) => console.log(` ${i + 1}. "${st}"`));
this.state.stepsTotal = subtasks.length;
// ─── Execute each subtask ────────────────────────────────────
- console.log(`\n⚡ Executing subtasks...`);
+ // executing subtasks
for (let i = 0; i < subtasks.length; i++) {
if (this.aborted) {
@@ -750,9 +868,15 @@ Examples:
this.state.currentStep = subtask;
this.state.stepsCompleted = i;
- // Try router first
+ // Try router first — but ONLY for mechanical subtasks.
+ // If the task came from LLM pre-processing (priorContext exists), it likely needs
+ // reasoning (e.g., "write a sentence on dogs" needs the LLM to compose content,
+ // see the screen, click Blank in Google Docs, etc.). Skip the router for those.
+ const skipRouter = !!(priorContext && priorContext.length > 0);
this.state.status = 'acting';
- const routeResult = await this.router.route(subtask);
+ const routeResult = skipRouter
+ ? { handled: false, description: 'Skipped — pre-processed task needs LLM reasoning' }
+ : await this.router.route(subtask);
if (routeResult.handled) {
console.log(` ✅ Router: ${routeResult.description}`);
@@ -774,72 +898,213 @@ Examples:
continue;
}
- console.log(` ⚠️ Router can't handle: ${routeResult.description}`);
-
- // Layer 2: Accessibility Reasoner (text-only LLM, no screenshot)
- if (this.reasoner?.isAvailable()) {
- const reasonResult = await this.reasoner.reason(subtask);
- if (reasonResult.handled) {
- if (reasonResult.action) {
- try {
- await this.executeAction(reasonResult.action as InputAction & { description?: string });
- steps.push({ action: reasonResult.action.kind, description: reasonResult.description, success: true, timestamp: Date.now() });
- await this.delay(100);
- continue;
- } catch (err) {
- console.log(` ⚠️ Layer 2 action failed: ${err} → falling through to Layer 3`);
- // Layer 2 failed — hand remaining subtasks (including this one) to Computer Use
- if (this.computerUse) {
- const remainingTask = subtasks.slice(i).join(', then ');
- console.log(` 🖥️ Handing off to Computer Use: "${remainingTask}"`);
- const fallbackResult = await this.executeLLMFallback(remainingTask, steps, debugDir, i);
- llmCallCount += fallbackResult.llmCalls;
- i = subtasks.length; // skip remaining — Computer Use handled them
- break;
- }
+ // If this is a browser task, ensure Edge has focus before Layer 2 reads the active window.
+ // The preprocessor navigates but may leave the terminal with focus.
+ const isBrowserTask = priorContext?.some(c => /navigated to|opened.*edge|opened.*chrome/i.test(c));
+ let browserProcessName: string | undefined;
+ if (isBrowserTask) {
+ try {
+ const windows = await this.a11y?.getWindows().catch(() => []) ?? [];
+ const edgeWin = windows.find(w => /msedge|chrome/i.test(w.processName) && !w.isMinimized);
+ if (edgeWin) {
+ browserProcessName = edgeWin.processName; // remember target process
+ // Try focus up to 3 times with increasing delay
+ for (let attempt = 0; attempt < 3; attempt++) {
+ await this.a11y?.focusWindow(undefined, edgeWin.processId).catch(() => null);
+ await this.delay(500 + attempt * 300);
+ const checkWin = await this.a11y?.getActiveWindow().catch(() => null);
+ if (checkWin && /msedge|chrome/i.test(checkWin.processName)) break;
}
- } else {
- // Task done per reasoner
- steps.push({ action: 'done', description: reasonResult.description, success: true, timestamp: Date.now() });
- continue;
}
+ } catch { /* non-critical */ }
+ }
+
+ // ── Layer 2: Skill Cache — replay learned paths ─────────────
+ let activeWin = await this.a11y?.getActiveWindow().catch(() => null);
+ if (!activeWin) {
+ await this.delay(400);
+ activeWin = await this.a11y?.getActiveWindow().catch(() => null);
+ }
+ const activeProcessForSkill = browserProcessName || activeWin?.processName || '';
+
+ const cachedSkill = this.skillCache.findSkill(subtask, activeProcessForSkill);
+ if (cachedSkill) {
+ const skillResult = await this.skillCache.executeSkill(cachedSkill, this.desktop, this.a11y);
+ if (skillResult === 'success') {
+ steps.push({ action: 'done', description: `Skill cache: "${cachedSkill.taskPattern}" replayed`, success: true, timestamp: Date.now() });
+ continue;
}
- // If unsure or failed, fall through to Layer 3
+ // miss → fall through to OCR or A11y
+ console.log(` 🔄 Skill cache miss — falling through`);
}
- // Layer 3: LLM vision fallback — hand off ALL remaining subtasks, not just current one
- if (this.hasApiKey) {
- await this.delay(150);
+ // ── Layer 2.5: OCR Reasoner — primary universal read layer ──
+ if (this.ocrReasoner) {
+ console.log(`\n👁️ Layer 2.5 (OCR Reasoner): "${subtask}"`);
+ const ocrStart = Date.now();
+ const ocrResult = await this.ocrReasoner.run(subtask, priorContext);
+ const ocrDuration = Date.now() - ocrStart;
+
+ if (ocrResult.handled && ocrResult.success) {
+ steps.push({
+ action: 'done',
+ description: ocrResult.description,
+ success: true,
+ timestamp: Date.now(),
+ });
+ // Record for skill promotion
+ const ocrSteps = ocrResult.actionLog
+ .filter(a => a.action !== 'done' && a.action !== 'parse_error' && a.action !== 'error')
+ .map(a => ({ type: a.action as any, description: a.description }));
+ this.skillCache.recordSuccess(subtask, activeProcessForSkill, ocrSteps);
+ console.log(` ✅ OCR Reasoner done (${ocrResult.steps} steps, ${(ocrDuration / 1000).toFixed(1)}s)`);
+ continue;
+ }
+
+ if (ocrResult.fallbackReason === 'cannot_read') {
+ console.log(` 🤷 OCR cannot read UI — skipping A11y, falling to Layer 3 (vision LLM)`);
+ } else if (!ocrResult.success) {
+ console.log(` 🤷 OCR Reasoner did not complete (${ocrResult.steps} steps, ${(ocrDuration / 1000).toFixed(1)}s) — skipping A11y, falling to vision`);
+ }
+ // OCR already includes a11y tree in its snapshot — if OCR+A11y combined
+ // couldn't handle it, A11y alone won't either. Skip straight to vision.
+ }
+
+ // When OCR Reasoner is NOT available, fall back to A11y Reasoner (v0.7.0 path)
+ // Re-read active window (may have changed during skill/OCR steps)
+ activeWin = await this.a11y?.getActiveWindow().catch(() => null);
+ const activeProcessName = browserProcessName || activeWin?.processName;
+ let a11yActionHistory: { action: string; description: string }[] | undefined;
+
+ if (!this.ocrReasoner && this.reasoner?.isAvailable(activeProcessName)) {
+ // A11y Reasoner only runs when OCR is unavailable (v0.7.0 compat path)
+ console.log(`\n🧠 Layer 2 (A11y Reasoner — OCR unavailable): "${subtask}"`);
+ const reasonStart = Date.now();
+ const reasonResult = await this.reasoner.reason(subtask, activeProcessName, priorContext, this.logger, this.verifier);
+ const reasonDuration = Date.now() - reasonStart;
+ if (reasonResult.handled) {
+ steps.push({
+ action: 'done',
+ description: reasonResult.description,
+ success: true,
+ timestamp: Date.now(),
+ });
+ console.log(` ✅ Layer 2 done (${reasonResult.steps ?? 0} steps, ${(reasonDuration / 1000).toFixed(1)}s)`);
+ continue;
+ }
+ // Check if needs human intervention (payment, captcha, 2FA, etc.)
+ if (reasonResult.needsHuman) {
+ console.log(`\n🙋 NEEDS HUMAN INTERVENTION: ${reasonResult.description}`);
+ steps.push({
+ action: 'needs-human',
+ description: reasonResult.description,
+ success: false,
+ timestamp: Date.now(),
+ });
+ break; // Stop processing — do NOT fall through to Layer 3
+ }
+
+ a11yActionHistory = reasonResult.actionHistory;
+ const stepCount = reasonResult.steps ?? 0;
+ const duration = (reasonDuration / 1000).toFixed(1);
+ console.log(` 🤷 Layer 2 → Layer 3 (${stepCount} steps, ${duration}s): ${reasonResult.description.substring(0, 100)}`);
+ this.reasoner.recordVisionFallback();
+ } else if (!this.ocrReasoner && this.reasoner) {
+ console.log(` ⚠️ Layer 2 circuit breaker (${activeProcessName ?? 'unknown'}) — falling to Layer 3`);
+ this.reasoner.recordVisionFallback();
+ }
+
+ // Layer 3: Vision fallback — Computer Use takes over when text LLM cannot proceed
+ const enrichedContext = [...(priorContext ?? [])];
+ if (a11yActionHistory && a11yActionHistory.length > 0) {
+ enrichedContext.push(
+ `A11y Reasoner already tried these actions (do NOT repeat them):\n` +
+ a11yActionHistory.map((a, idx) => ` ${idx + 1}. ${a.action} — ${a.description}`).join('\n')
+ );
+ }
+
+ if (this.computerUse || this.genericComputerUse || this.hasApiKey) {
const remainingTask = subtasks.slice(i).join(', then ');
- console.log(` 🧠 LLM vision fallback for remaining: "${remainingTask}"`);
- const fallbackResult = await this.executeLLMFallback(remainingTask, steps, debugDir, i);
- llmCallCount += fallbackResult.llmCalls;
- if (!fallbackResult.success) {
- console.log(` ❌ LLM fallback failed for: "${subtask}"`);
+ if (this.computerUse) {
+ // Anthropic native Computer Use
+ console.log(` 🖥️ Layer 3 (Anthropic): "${remainingTask}"`);
+ try {
+ const cuResult = await this.computerUse.executeSubtask(remainingTask, debugDir, i, enrichedContext, this.logger);
+ steps.push(...cuResult.steps);
+ llmCallCount += cuResult.llmCalls;
+ } catch (err) {
+ steps.push({ action: 'error', description: `Computer Use failed: ${err}`, success: false, timestamp: Date.now() });
+ }
+ } else if (this.genericComputerUse) {
+ // Generic OpenAI-compat vision loop (GPT-4o, Gemini, Groq, Llama-vision, etc.)
+ console.log(` 🌐 Layer 3 (Generic): "${remainingTask}"`);
+ try {
+ const cuResult = await this.genericComputerUse.executeSubtask(remainingTask, debugDir, i, enrichedContext, this.logger);
+ steps.push(...cuResult.steps);
+ llmCallCount += cuResult.llmCalls;
+ } catch (err) {
+ steps.push({ action: 'error', description: `Generic Computer Use failed: ${err}`, success: false, timestamp: Date.now() });
+ }
+ } else {
+ // Legacy fallback — vision LLM without structured tool schema
+ await this.delay(150);
+ console.log(` 🧠 Layer 3 (legacy fallback): "${remainingTask}"`);
+ const fallbackResult = await this.executeLLMFallback(remainingTask, steps, debugDir, i);
+ llmCallCount += fallbackResult.llmCalls;
+ if (!fallbackResult.success) {
+ console.log(` ❌ Legacy fallback failed: "${subtask}"`);
+ }
}
- break; // Computer Use handled the rest
+ break;
} else {
- steps.push({ action: 'skipped', description: `Skipped "${subtask}" — no API key`, success: false, timestamp: Date.now() });
+ steps.push({ action: 'skipped', description: `Skipped "${subtask}" — no API key or vision model configured`, success: false, timestamp: Date.now() });
}
}
+ // Update workspace state after all subtasks
+ try {
+ const windows = await this.a11y.getWindows().catch(() => []);
+ this.workspace.updateWindows(windows);
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ if (activeWin?.processId) this.workspace.setActiveWindow(activeWin.processId);
+ const clip = await this.a11y.readClipboard().catch(() => '');
+ if (clip) this.workspace.updateClipboard(clip, 'post-task');
+ } catch { /* non-critical */ }
+
+ // Only report success when an explicit 'done' step was recorded by a layer
+ const hasDoneStep = steps.some(s => s.action === 'done' && s.success);
+ // Distinguish verified vs unverified success
+ const hasVerifiedDone = steps.some(s => s.action === 'done' && s.success && s.description?.includes('verified'));
+ const hasNeedsHuman = steps.some(s => s.action === 'needs-human' || s.description?.includes('needs_human'));
+
+ let finalStatus: CompletionStatus;
+ if (hasNeedsHuman) finalStatus = 'needs_human';
+ else if (hasVerifiedDone) finalStatus = 'verified_success';
+ else if (hasDoneStep) finalStatus = 'unverified_success';
+ else finalStatus = 'failed';
+
const result: TaskResult = {
- success: steps.length > 0 && steps.some(s => s.success),
+ success: hasDoneStep,
steps,
duration: Date.now() - startTime,
};
- console.log(`\n⏱️ Task took ${(result.duration / 1000).toFixed(1)}s with ${steps.length} steps (${llmCallCount} LLM call(s))`);
+ const statusIcon = finalStatus === 'verified_success' ? '✅' : finalStatus === 'unverified_success' ? '⚠️' : '❌';
+ console.log(`\n${statusIcon} Task ${finalStatus.toUpperCase()} | ${(result.duration / 1000).toFixed(1)}s | ${steps.length} steps | ${llmCallCount} LLM calls`);
+ console.log(` Workspace: ${this.workspace.getSummary()}`);
+ this.logger.endTask(finalStatus, { refinedTask: task });
return result;
} catch (err) {
console.error(`\n❌ Decompose+Route crashed:`, err);
+ this.logger.endTask('failed');
return {
success: false,
steps: [...steps, { action: 'error', description: `Pipeline crashed: ${err}`, success: false, timestamp: Date.now() }],
duration: Date.now() - startTime,
};
} finally {
+ await this.closeIsolatedDesktop();
this.state.status = 'idle';
this.state.currentTask = undefined;
this.brain.resetConversation();
@@ -864,7 +1129,6 @@ Examples:
if (this.aborted) break;
// ── Perf Opt #2: Parallelize screenshot + a11y fetch ──
- console.log(` 📸 LLM step ${j + 1}: Capturing screen + a11y context...`);
if (j > 0) await this.delay(500); // pause between LLM retries to let UI settle
const [screenshot, a11yContext] = await Promise.all([
@@ -879,7 +1143,6 @@ Examples:
path.join(debugDir, `subtask-${subtaskIndex}-step-${j}.${ext}`),
screenshot.buffer,
).catch(() => {});
- console.log(` 💾 Debug screenshot saved (${(screenshot.buffer.length / 1024).toFixed(0)}KB, ${screenshot.llmWidth}x${screenshot.llmHeight})`);
}
// Ask AI what to do
@@ -899,19 +1162,19 @@ Examples:
const isParseError = decision.error.startsWith('Parse error:') || decision.error.startsWith('Failed to parse');
if (isParseError) {
// Parse errors are retryable — LLM returned prose or bad JSON, take a fresh screenshot and try again
- console.log(` ⚠️ LLM returned bad JSON, retrying... (${decision.error.substring(0, 80)})`);
+ // retrying after parse error
steps.push({ action: 'retry', description: `Retryable: ${decision.error.substring(0, 100)}`, success: false, timestamp: Date.now() });
this.brain.resetConversation(); // clear bad history so next attempt starts fresh
continue;
}
- console.log(` ❌ LLM error: ${decision.error}`);
+ console.log(` ❌ ${decision.error}`);
steps.push({ action: 'error', description: decision.error, success: false, timestamp: Date.now() });
return { success: false, llmCalls };
}
// Wait?
if (decision.waitMs) {
- console.log(` ⏳ Waiting ${decision.waitMs}ms: ${decision.description}`);
+ // waiting
await this.delay(decision.waitMs);
stepDescriptions.push(decision.description);
continue;
@@ -919,13 +1182,13 @@ Examples:
// Handle SEQUENCE
if (decision.sequence) {
- console.log(` 📋 Sequence: ${decision.sequence.description} (${decision.sequence.steps.length} steps)`);
+ // executing sequence
for (const seqStep of decision.sequence.steps) {
if (this.aborted) break;
const tier = this.safety.classify(seqStep, seqStep.description);
- console.log(` ${tierEmoji(tier)} ${seqStep.description}`);
+ // seq step
if (tier === SafetyTier.Confirm) {
this.state.status = 'waiting_confirm';
@@ -956,17 +1219,17 @@ Examples:
recentActions.push(actionKey);
const lastN = recentActions.slice(-MAX_SIMILAR_ACTION);
if (lastN.length >= MAX_SIMILAR_ACTION && lastN.every(a => a === lastN[0])) {
- console.log(` 🔄 Same action repeated ${MAX_SIMILAR_ACTION} times — giving up on this subtask`);
+ console.log(` ❌ Stuck: repeated "${actionKey}"`);
steps.push({ action: 'stuck', description: `Stuck: repeated "${actionKey}"`, success: false, timestamp: Date.now() });
return { success: false, llmCalls };
}
// Safety check
const tier = this.safety.classify(decision.action, decision.description);
- console.log(` ${tierEmoji(tier)} Action: ${decision.description}`);
+ // action classified
if (this.safety.isBlocked(decision.description)) {
- console.log(` 🚫 BLOCKED: ${decision.description}`);
+ console.log(` ❌ BLOCKED: ${decision.description}`);
steps.push({ action: 'blocked', description: `BLOCKED: ${decision.description}`, success: false, timestamp: Date.now() });
return { success: false, llmCalls };
}
@@ -1015,6 +1278,8 @@ Examples:
abort(): void {
this.aborted = true;
+ this.logger.endTask('aborted');
+ this.state = { status: 'idle', stepsCompleted: 0, stepsTotal: 0 };
}
getState(): AgentState {
@@ -1025,9 +1290,16 @@ Examples:
return this.safety;
}
+ getDesktop(): NativeDesktop {
+ return this.desktop;
+ }
+
+ getA11y(): AccessibilityBridge {
+ return this.a11y;
+ }
+
disconnect(): void {
this.desktop.disconnect();
- this.smartInteraction?.disconnect().catch(() => {});
}
 private async executeA11yAction(action: A11yAction): Promise<void> {
@@ -1040,8 +1312,6 @@ Examples:
const a11yAction = actionMap[action.kind];
if (!a11yAction) throw new Error(`Unknown a11y action: ${action.kind}`);
- console.log(` ♿ A11y ${a11yAction}: ${action.name || action.automationId} [${action.controlType || 'any'}]`);
-
const result = await this.a11y.invokeElement({
name: action.name,
automationId: action.automationId,
@@ -1050,9 +1320,78 @@ Examples:
value: action.value,
});
- if (!result.success) {
+ this.a11y.invalidateCache();
+
+ if (!result.success && !result.clickPoint) {
throw new Error(result.error || 'A11y action failed');
}
+
+ // Coordinate fallback: bridge couldn't invoke but gave us bounds
+ if (result.clickPoint) {
+ await this.desktop.mouseClick(result.clickPoint.x, result.clickPoint.y);
+ this.a11y.invalidateCache();
+ }
+ }
+
+ /**
+ * Minimize ALL windows on the current desktop (called before desktop switch).
+ * Uses Shell.Application COM object for a clean slate.
+ */
+ private async minimizeAllWindows(): Promise<void> {
+ if (IS_MAC) return;
+ try {
+ await execFileAsync('powershell.exe', ['-Command',
+ `$shell = New-Object -ComObject Shell.Application; $shell.MinimizeAll()`
+ ]);
+ await new Promise(r => setTimeout(r, 400));
+ } catch { /* non-fatal */ }
+ }
+
+ /**
+ * Minimize all windows EXCEPT those matching processName (called after app opens
+ * on the isolated desktop to hide anything that leaked through).
+ */
+ private async minimizeAllExcept(processName: string): Promise<void> {
+ if (IS_MAC) return;
+ try {
+ await execFileAsync('powershell.exe', ['-Command',
+ `Add-Type @"
+using System;
+using System.Runtime.InteropServices;
+public class Win32 {
+ [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
+ [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr h);
+ [DllImport("user32.dll")] public static extern bool EnumWindows(EnumWindowsProc lp, IntPtr p);
+ [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr h, out uint pid);
+ public delegate bool EnumWindowsProc(IntPtr h, IntPtr p);
+}
+"@
+$target = "${processName}".ToLower()
+$procs = Get-Process | Where-Object { $_.MainWindowHandle -ne 0 -and $_.Name.ToLower() -notlike "*$target*" -and $_.Name.ToLower() -notlike "*clawdcursor*" -and $_.Name.ToLower() -notlike "*powershell*" }
+foreach ($p in $procs) { [Win32]::ShowWindow($p.MainWindowHandle, 2) | Out-Null }`
+ ]);
+ await new Promise(r => setTimeout(r, 400));
+ } catch { /* non-fatal */ }
+ }
+
+ /**
+ * Create an isolated Windows virtual desktop so the agent works in a clean
+ * environment away from the user's open windows.
+ * 1. Minimize all windows first (so they don't follow to the new desktop)
+ * 2. Win+Ctrl+D creates a new desktop and switches to it
+ */
+ private async createIsolatedDesktop(): Promise<void> {
+ // Disabled: isolated virtual desktops hide the app that pre-processing just opened,
+ // causing vision/screenshots to see an empty desktop and waste time re-opening apps.
+ // The agent now works on the user's current desktop directly.
+ return;
+ }
+
+ /**
+ * Close the isolated virtual desktop — no-op since we no longer create one.
+ */
+ private async closeIsolatedDesktop(): Promise<void> {
+ return;
}
 private delay(ms: number): Promise<void> {
diff --git a/src/ai-brain.ts b/src/ai-brain.ts
index 67d8d97..b82dba9 100644
--- a/src/ai-brain.ts
+++ b/src/ai-brain.ts
@@ -11,6 +11,7 @@
import * as crypto from 'crypto';
import type { ClawdConfig, InputAction, ActionSequence, ScreenFrame } from './types';
+import { extractJsonObject, extractJsonArray } from './safe-json';
const SYSTEM_PROMPT = `You are Clawd Cursor, an AI desktop agent on {OS_NAME}.
Screen: {REAL_WIDTH}x{REAL_HEIGHT}. Screenshot: {LLM_WIDTH}x{LLM_HEIGHT} (scale {SCALE}x).
@@ -113,12 +114,9 @@ export class AIBrain {
async decomposeTask(task: string): Promise {
try {
const response = await this.callLLMText(DECOMPOSE_SYSTEM_PROMPT, `Task: "${task}"`);
- const match = response.match(/\[[\s\S]*\]/);
- if (match) {
- const parsed = JSON.parse(match[0]);
- if (Array.isArray(parsed) && parsed.length > 0 && parsed.every((s: any) => typeof s === 'string')) {
- return parsed;
- }
+ const parsed = extractJsonArray(response);
+ if (parsed && parsed.length > 0 && parsed.every((s: any) => typeof s === 'string')) {
+ return parsed as string[];
}
// If parsing failed, return the whole task as a single subtask
console.warn(`⚠️ Failed to parse decomposition, using task as-is`);
@@ -247,13 +245,11 @@ export class AIBrain {
waitMs?: number;
} {
try {
- const jsonMatch = response.match(/\{[\s\S]*\}/);
- if (!jsonMatch) {
+ const parsed = extractJsonObject(response) as any;
+ if (!parsed) {
return { action: null, sequence: null, description: 'Failed to parse AI response', done: false, error: response };
}
- const parsed = JSON.parse(jsonMatch[0]);
-
if (parsed.kind === 'done') {
return { action: null, sequence: null, description: parsed.description || 'Task complete', done: true };
}
@@ -458,9 +454,8 @@ export class AIBrain {
// Early return: if we have a complete JSON object, stop waiting
if (result.includes('}') && !result.includes('"steps"')) {
try {
- const match = result.match(/\{[\s\S]*\}/);
- if (match) {
- JSON.parse(match[0]); // validates it's complete JSON
+ const earlyParsed = extractJsonObject(result);
+ if (earlyParsed) {
reader.cancel();
return result;
}
diff --git a/src/browser-layer.ts b/src/browser-layer.ts
index 3ebb6d5..e69568d 100644
--- a/src/browser-layer.ts
+++ b/src/browser-layer.ts
@@ -502,7 +502,9 @@ export class BrowserLayer {
if (this.browser && !(this.browser as any)._initializer?.wsEndpoint?.includes('127.0.0.1')) {
await this.browser.close();
}
- } catch {}
+ } catch (err) {
+ console.debug(`[BrowserLayer] cleanup error (non-critical): ${err}`);
+ }
this.browser = null;
this.context = null;
this.page = null;
diff --git a/src/cdp-driver.ts b/src/cdp-driver.ts
index 1e8bcae..fed6457 100644
--- a/src/cdp-driver.ts
+++ b/src/cdp-driver.ts
@@ -129,16 +129,34 @@ export class CDPDriver {
try {
this.browser = await chromium.connectOverCDP(
`http://127.0.0.1:${this.cdpPort}`,
- { timeout: 5000 },
+ { timeout: 15000 },
);
this.ownsBrowser = true;
- // Get the most recent tab
+ // Get the most relevant tab — search ALL browser contexts (not just the first)
+ // Priority: user-navigated pages > pinned/system widgets > browser internal pages
+ const BLOCKED_URL_PATTERNS = [
+ 'edge://', 'chrome://', 'about:',
+ // Known OEM/vendor widget pages that embed browsers as system components
+ 'vantage.csw.lenovo.com', 'lenovo.com/widget',
+ 'msn.com/spartan', 'bing.com/widget',
+ 'ntp.msn.com',
+ ];
+ const isUserPage = (url: string) =>
+ url.startsWith('http') &&
+ !BLOCKED_URL_PATTERNS.some(blocked => url.includes(blocked));
+
const contexts = this.browser.contexts();
- if (contexts.length > 0) {
- const pages = contexts[0].pages();
- this.activePage = pages.length > 0 ? pages[pages.length - 1] : null;
- }
+ const allPages: Page[] = contexts.flatMap(ctx => ctx.pages());
+ const userPages = allPages.filter(p => isUserPage(p.url()));
+ const fallbackPage = allPages.find(p =>
+ !p.url().startsWith('edge://') && !p.url().startsWith('chrome://') && !p.url().startsWith('about:')
+ ) ?? allPages[0] ?? null;
+
+ // Among user pages, prefer the last one (most recently opened/navigated)
+ this.activePage = userPages.length > 0
+ ? userPages[userPages.length - 1]
+ : fallbackPage;
if (!this.activePage) {
console.warn(' ⚠️ CDPDriver: connected but no pages found');
@@ -166,6 +184,34 @@ export class CDPDriver {
console.log(` 🔌 CDPDriver: attached to existing page`);
}
+ /**
+ * Switch to a tab whose URL contains the given substring.
+ * Useful after navigation when the agent needs to find a specific page.
+ */
+ async switchToTabByUrl(urlSubstring: string): Promise<boolean> {
+ if (!this.browser) return false;
+ try {
+ const contexts = this.browser.contexts();
+ for (const ctx of contexts) {
+ for (const page of ctx.pages()) {
+ const pageUrl = page.url().toLowerCase();
+ if (pageUrl.includes(urlSubstring.toLowerCase())) {
+ this.activePage = page;
+ this.cursorInjected = false; // Reset — new page doesn't have our overlay
+ await page.bringToFront().catch(() => {});
+ const title = await page.title().catch(() => '(unknown)');
+ console.log(` 🔌 CDPDriver: switched to tab "${title}" at ${page.url()}`);
+ return true;
+ }
+ }
+ }
+ return false;
+ } catch (err) {
+ console.debug(`[CDPDriver] switchToTabByUrl failed: ${err}`);
+ return false;
+ }
+ }
+
/** Check if we're connected and the page is still alive */
async isConnected(): Promise<boolean> {
if (!this.connected || !this.activePage) return false;
@@ -193,11 +239,13 @@ export class CDPDriver {
const title = (await page.title()).toLowerCase();
if (url.includes(lower) || title.includes(lower)) {
this.activePage = page;
+ this.cursorInjected = false; // Reset — new page doesn't have our overlay
await page.bringToFront();
console.log(` 🔌 CDPDriver: switched to tab "${await page.title()}"`);
return true;
}
- } catch {
+ } catch (err) {
+ console.debug(`[CDPDriver] Tab switch attempt failed: ${err}`);
continue;
}
}
@@ -217,7 +265,7 @@ export class CDPDriver {
async querySelectorAll(selector: string, maxResults = 20): Promise {
const pg = this.requirePage();
- const results = await pg.evaluate(
+ return pg.evaluate(
(args: { sel: string; max: number }) => {
const elements = document.querySelectorAll(args.sel);
const infos: any[] = [];
@@ -226,11 +274,38 @@ export class CDPDriver {
const el = elements[i] as HTMLElement;
const rect = el.getBoundingClientRect();
+ // Generate a unique CSS selector inside evaluate where we have DOM access
+ let selector = '';
+ if (el.id) {
+ selector = `#${el.id}`;
+ } else {
+ const testId = el.getAttribute('data-testid');
+ if (testId) {
+ selector = `[data-testid="${testId}"]`;
+ } else {
+ // Calculate correct nth-of-type among siblings
+ const parent = el.parentElement;
+ if (parent) {
+ let typeIndex = 0;
+ for (const sibling of parent.children) {
+ if (sibling.tagName === el.tagName) {
+ typeIndex++;
+ if (sibling === el) break;
+ }
+ }
+ selector = `${el.tagName.toLowerCase()}:nth-of-type(${typeIndex})`;
+ } else {
+ selector = el.tagName.toLowerCase();
+ }
+ }
+ }
+
infos.push({
+ selector,
tagName: el.tagName.toLowerCase(),
text: (el.textContent || '').trim().substring(0, 100),
id: el.id || '',
- className: el.className || '',
+ className: typeof el.className === 'string' ? el.className : '',
role: el.getAttribute('role') || '',
ariaLabel: el.getAttribute('aria-label') || '',
type: (el as HTMLInputElement).type || '',
@@ -251,12 +326,6 @@ export class CDPDriver {
},
{ sel: selector, max: maxResults },
);
-
- // Add unique CSS selectors
- return results.map((info: any, i: number) => ({
- ...info,
- selector: info.id ? `#${info.id}` : `${selector}:nth-of-type(${i + 1})`,
- }));
}
/**
@@ -302,7 +371,7 @@ export class CDPDriver {
tagName: htmlEl.tagName.toLowerCase(),
text: (htmlEl.textContent || '').trim().substring(0, 100),
id: htmlEl.id || '',
- className: htmlEl.className || '',
+ className: typeof htmlEl.className === 'string' ? htmlEl.className : '',
role: htmlEl.getAttribute('role') || '',
ariaLabel: htmlEl.getAttribute('aria-label') || '',
type: (htmlEl as HTMLInputElement).type || '',
@@ -355,7 +424,7 @@ export class CDPDriver {
tagName: input.tagName.toLowerCase(),
text: '',
id: input.id || '',
- className: input.className || '',
+ className: typeof input.className === 'string' ? input.className : '',
role: input.getAttribute('role') || '',
ariaLabel: input.getAttribute('aria-label') || '',
type: (input as HTMLInputElement).type || '',
@@ -377,7 +446,7 @@ export class CDPDriver {
tagName: input.tagName.toLowerCase(),
text: '',
id: htmlInput.id || '',
- className: htmlInput.className || '',
+ className: typeof htmlInput.className === 'string' ? htmlInput.className : '',
role: input.getAttribute('role') || '',
ariaLabel: input.getAttribute('aria-label') || '',
type: (input as HTMLInputElement).type || '',
@@ -407,7 +476,7 @@ export class CDPDriver {
tagName: input.tagName.toLowerCase(),
text: '',
id: htmlInput.id || '',
- className: htmlInput.className || '',
+ className: typeof htmlInput.className === 'string' ? htmlInput.className : '',
role: input.getAttribute('role') || '',
ariaLabel: input.getAttribute('aria-label') || '',
type: (input as HTMLInputElement).type || '',
@@ -435,18 +504,23 @@ export class CDPDriver {
/**
* Click an element by CSS selector.
- *
- * Uses Playwright's smart click which:
- * - Scrolls element into view
- * - Waits for it to be stable (not moving)
- * - Waits for it to be actionable (visible, enabled)
- * - Clicks the center of the element
+ * Uses JS dispatchEvent (reliable on background CDP tabs with 0x0 viewport).
+ * Falls back to Playwright click if JS dispatch doesn't work.
*/
async click(selector: string): Promise {
const pg = this.requirePage();
try {
+ // Primary: JS-based click (works on background tabs)
+ const clicked = await pg.evaluate((sel: string) => {
+ const el = document.querySelector(sel) as HTMLElement;
+ if (!el) return false;
+ el.click();
+ return true;
+ }, selector);
+ if (clicked) return { success: true, method: 'js.click' };
+ // Fallback: Playwright click with force
await this.moveCursorToSelector(selector);
- await pg.click(selector, { timeout: 5000 });
+ await pg.click(selector, { timeout: 5000, force: true });
return { success: true, method: 'playwright.click' };
} catch (err) {
return {
@@ -468,8 +542,36 @@ export class CDPDriver {
const pg = this.requirePage();
try {
- // Playwright has built-in text selectors
- // Try role-based first (buttons, links), then fall back to text
+ // Primary: JS-based click by text content (works on background tabs)
+ const jsClicked = await pg.evaluate((searchText: string) => {
+ const lower = searchText.toLowerCase();
+ // Deduplicate: elements matching multiple selectors should only appear once
+ const seen = new Set<Element>();
+ const candidates: HTMLElement[] = [];
+ for (const sel of ['button, [role="button"]', 'a[href]', '[role="link"], [role="menuitem"], [role="tab"]']) {
+ for (const el of document.querySelectorAll(sel)) {
+ if (!seen.has(el)) {
+ seen.add(el);
+ candidates.push(el as HTMLElement);
+ }
+ }
+ }
+ for (const htmlEl of candidates) {
+ const elText = (htmlEl.textContent || '').trim().toLowerCase();
+ const ariaLabel = (htmlEl.getAttribute('aria-label') || '').toLowerCase();
+ if (elText.includes(lower) || ariaLabel.includes(lower)) {
+ // Visibility check — skip hidden elements
+ const style = getComputedStyle(htmlEl);
+ if (style.display === 'none' || style.visibility === 'hidden') continue;
+ htmlEl.click();
+ return true;
+ }
+ }
+ return false;
+ }, text);
+ if (jsClicked) return { success: true, method: 'js.clickByText' };
+
+ // Fallback: Playwright locators with force
const locators = [
pg.getByRole('button', { name: text }),
pg.getByRole('link', { name: text }),
@@ -481,7 +583,7 @@ export class CDPDriver {
try {
const count = await locator.count();
if (count > 0) {
- await locator.first().click({ timeout: 3000 });
+ await locator.first().click({ timeout: 3000, force: true });
return { success: true, method: 'playwright.getByText' };
}
} catch {
@@ -510,12 +612,12 @@ export class CDPDriver {
try {
await this.moveCursorToSelector(selector);
// Try fill() first — works for inputs, textareas, and [contenteditable]
- await pg.fill(selector, text, { timeout: 5000 });
+ await pg.fill(selector, text, { timeout: 5000, force: true });
return { success: true, method: 'playwright.fill' };
} catch {
// Fall back to click + clear + type for stubborn elements
try {
- await pg.click(selector, { timeout: 3000 });
+ await pg.click(selector, { timeout: 3000, force: true });
await pg.keyboard.press('Control+a');
await pg.keyboard.type(text, { delay: 20 });
return { success: true, method: 'playwright.type' };
@@ -551,7 +653,7 @@ export class CDPDriver {
return { success: false, error: `No field found with label "${label}"` };
}
- await locator.first().fill(text, { timeout: 5000 });
+ await locator.first().fill(text, { timeout: 5000, force: true });
return { success: true, method: 'playwright.getByLabel' };
} catch (err) {
return {
@@ -703,18 +805,22 @@ export class CDPDriver {
const elements = document.querySelectorAll(sel);
const results: any[] = [];
+ // When viewport is 0x0 (background tab via CDP), skip bounds filtering
+ const hasViewport = window.innerWidth > 0 && window.innerHeight > 0;
+
for (const el of elements) {
const htmlEl = el as HTMLElement;
const rect = htmlEl.getBoundingClientRect();
- // Skip invisible elements
+ // Always skip zero-size elements (display:none, collapsed, etc.)
if (rect.width <= 0 || rect.height <= 0) continue;
- if (getComputedStyle(htmlEl).visibility === 'hidden') continue;
- if (getComputedStyle(htmlEl).display === 'none') continue;
-
- // Skip elements outside viewport
- if (rect.bottom < 0 || rect.top > window.innerHeight) continue;
- if (rect.right < 0 || rect.left > window.innerWidth) continue;
+ // Only check viewport bounds when viewport is valid (not background tab)
+ if (hasViewport) {
+ if (rect.bottom < 0 || rect.top > window.innerHeight) continue;
+ if (rect.right < 0 || rect.left > window.innerWidth) continue;
+ }
+ const style = getComputedStyle(htmlEl);
+ if (style.visibility === 'hidden' || style.display === 'none') continue;
results.push({
selector: htmlEl.id ? `#${htmlEl.id}` : '',
@@ -755,9 +861,20 @@ export class CDPDriver {
const elements = await this.getInteractiveElements();
let context = `PAGE: "${title}" at ${url}\n\n`;
- context += `INTERACTIVE ELEMENTS (${elements.length}):\n`;
- for (const el of elements) {
+ // Prioritize inputs/buttons/selects over links to keep context manageable
+ const inputs = elements.filter(e => ['input', 'textarea', 'select'].includes(e.tagName));
+ const buttons = elements.filter(e => e.tagName === 'button' || e.role === 'button');
+ const links = elements.filter(e => e.tagName === 'a' && e.role !== 'button');
+ const other = elements.filter(e => !inputs.includes(e) && !buttons.includes(e) && !links.includes(e));
+
+ // Always show all inputs, buttons, and other elements; limit links to 40
+ const shown = [...inputs, ...buttons, ...other, ...links.slice(0, 40)];
+ const hiddenLinks = links.length > 40 ? links.length - 40 : 0;
+
+ context += `INTERACTIVE ELEMENTS (${shown.length}${hiddenLinks > 0 ? `, +${hiddenLinks} more links` : ''}):\n`;
+
+ for (const el of shown) {
const label = el.ariaLabel || el.text || el.placeholder || el.name || el.id || '(unnamed)';
const typeInfo = el.type ? ` type="${el.type}"` : '';
const roleInfo = el.role ? ` role="${el.role}"` : '';
@@ -776,6 +893,40 @@ export class CDPDriver {
// SCRIPT EVALUATION
// ════════════════════════════════════════════════════════════════════
+ /**
+ * Read text content from a DOM element (safe, parameterized — no CSS injection risk).
+ * @param selector CSS selector for the target element (default: 'body')
+ * @param maxLength Maximum characters to return (default: 3000)
+ */
+ async readText(selector = 'body', maxLength = 3000): Promise<string> {
+ const pg = this.requirePage();
+ return pg.evaluate(
+ (args: { sel: string; max: number }) => {
+ const el = document.querySelector(args.sel);
+ if (!el) return `[element not found: ${args.sel}]`;
+ const raw = (el as HTMLElement).innerText || el.textContent || '';
+ return raw.replace(/\n{3,}/g, '\n\n').trim().substring(0, args.max);
+ },
+ { sel: selector, max: maxLength },
+ );
+ }
+
+ /**
+ * Read the current value of a form field (safe, parameterized — no CSS injection risk).
+ * @param selector CSS selector for the input/textarea element
+ */
+ async readFieldValue(selector: string): Promise<string> {
+ const pg = this.requirePage();
+ return pg.evaluate(
+ (sel: string) => {
+ const el = document.querySelector(sel);
+ if (!el) return '';
+ return ((el as HTMLInputElement).value ?? el.textContent ?? '').substring(0, 60);
+ },
+ selector,
+ );
+ }
+
/**
* Evaluate arbitrary JavaScript in the page context.
* Use this for custom interactions not covered by the standard methods.
@@ -843,6 +994,7 @@ export class CDPDriver {
this.browser = null;
this.activePage = null;
this.connected = false;
+ this.cursorInjected = false;
}
/** Get the underlying Playwright Page (for advanced usage) */
@@ -854,7 +1006,6 @@ export class CDPDriver {
// PRIVATE
// ════════════════════════════════════════════════════════════════════
- /**
/** Ensure a virtual cursor overlay exists in the page */
private async ensureCursorOverlay(): Promise<void> {
const pg = this.requirePage();
@@ -891,8 +1042,8 @@ export class CDPDriver {
document.body.appendChild(label);
});
this.cursorInjected = true;
- } catch {
- // Ignore overlay failures
+ } catch (err) {
+ console.debug(`[CDPDriver] Cursor overlay injection failed: ${err}`);
}
}
@@ -912,8 +1063,8 @@ export class CDPDriver {
label.style.top = `${y}px`;
}
}, { x, y });
- } catch {
- // ignore
+ } catch (err) {
+ console.debug(`[CDPDriver] moveVirtualCursor failed: ${err}`);
}
}
@@ -924,8 +1075,8 @@ export class CDPDriver {
if (box) {
await this.moveVirtualCursor(Math.round(box.x + box.width / 2), Math.round(box.y + box.height / 2));
}
- } catch {
- // ignore
+ } catch (err) {
+ console.debug(`[CDPDriver] moveCursorToSelector failed: ${err}`);
}
}
diff --git a/src/computer-use.ts b/src/computer-use.ts
index 9ec948b..4fd39de 100644
--- a/src/computer-use.ts
+++ b/src/computer-use.ts
@@ -101,6 +101,9 @@ Win11: taskbar BOTTOM centered, system tray bottom-right, high-DPI.
ACCESSIBILITY: Each tool_result has WINDOWS list, FOCUSED WINDOW UI TREE (elements+coords), TASKBAR APPS.
Use accessibility data to find exact element positions and verify state.
+CRITICAL — CONTEXT AWARENESS:
+When you receive a task with CONTEXT (prior steps listed), ALWAYS take a screenshot FIRST to assess the current state before acting. Do not assume state from the context alone — verify visually.
+
CRITICAL — SPEED RULES:
1. BATCH ACTIONS. Return multiple computer tool calls in ONE response whenever possible. This is the #1 speed optimization.
2. CHECKPOINT STRATEGY: Take a screenshot after critical state changes. Then batch all predictable actions without screenshots.
@@ -118,6 +121,17 @@ PATTERNS:
- Save file: key "ctrl+s", wait 1s, type absolute path, key "Return" — all in one response
- Recovery: popup → Escape, wrong page → ctrl+l + correct URL, app frozen → alt+F4 + reopen
- Draw in Paint/canvas: Select brush tool first (click it in toolbar). Use drag operations for lines. A stick figure needs: circle/square for head (~60px), vertical line for body (~150px), diagonal lines for arms and legs (~80px each). Use LARGE coordinates — small drags produce dots. Minimum drag distance: 50 pixels.
+- After send/submit (Ctrl+Enter, clicking Send button): WAIT 3 seconds before taking a screenshot. The UI needs time to process. Do NOT immediately retry — wait first, then verify.
+- After closing a dialog (Escape, clicking X): WAIT 1 second before the next action.
+- NEVER assume an action failed just because the UI looks the same immediately after. Always wait before judging.
+
+KEYBOARD-OVER-MOUSE (critical on high-DPI displays):
+- ALWAYS prefer keyboard shortcuts over mouse clicks when both work
+- Email composition: Ctrl+N → Tab → type → Tab → type → Tab → type → Ctrl+Enter
+- Switching to an app: Alt+Tab (cycle) NOT clicking taskbar corners
+- Closing dialogs: Escape NOT clicking X button
+- Form fields: Tab to navigate, type directly — do NOT try to click field labels
+- Only use mouse clicks when there is NO keyboard alternative
SCROLLING: NEVER use mouse scroll with small amounts. For scrolling web pages use keyboard: PageDown (full page), Space (half page), or arrow keys. Mouse scroll is unreliable on modern infinite-scroll sites.
SITE SHORTCUTS (use these instead of clicking — much faster and more reliable):
@@ -138,6 +152,8 @@ For tasks involving multiple apps (copy from X, paste in Y):
5. The task is NOT done until the pasted content is VISIBLE in the target app
6. Common multi-app pattern: select text → Ctrl+C → open new app (Super + type + Return) → wait 2s → click in text area → Ctrl+V → verify
+CRITICAL — NEVER CLOSE TERMINAL/POWERSHELL WINDOWS: The agent runs inside a PowerShell or terminal window. If you close it, the agent dies and the task fails permanently. NEVER click the X on any window titled "PowerShell", "Windows PowerShell", "Command Prompt", "cmd", "Terminal", "clawdcursor", or any terminal/console window. If a terminal window is in the way, click on the TARGET app in the taskbar to bring it to front — do NOT close the terminal.
+
Do NOT: take screenshots after every action, go one action at a time when you can batch, use search engines for known URLs, retry same failed coords, declare a task complete before ALL steps are done — if the task says copy AND paste, you must do BOTH.`;
const SYSTEM_PROMPT = IS_MAC ? SYSTEM_PROMPT_MAC : SYSTEM_PROMPT_WIN;
@@ -416,6 +432,8 @@ export class ComputerUseBrain {
private lastMouseX = 0;
private lastMouseY = 0;
private computerUseOverrides?: ComputerUseOverrides;
+ private targetProcessName: string | null = null;
+ private verifier: import('./verifiers').TaskVerifier | null = null;
// A11y context cache — avoids hammering JXA after every single action
private a11yCache: { context: string; ts: number; pid?: number } | null = null;
@@ -443,7 +461,11 @@ export class ComputerUseBrain {
this.llmWidth = Math.min(screen.width, LLM_WIDTH);
this.llmHeight = Math.round(screen.height / this.scaleFactor);
- console.log(` 🖥️ Computer Use: declaring ${this.llmWidth}x${this.llmHeight} display (scale ${this.scaleFactor}x from ${this.screenWidth}x${this.screenHeight})`);
+ // Display config logged at debug level only
+ }
+
+ setVerifier(v: import('./verifiers').TaskVerifier): void {
+ this.verifier = v;
}
/**
@@ -464,6 +486,7 @@ export class ComputerUseBrain {
debugDir: string | null,
subtaskIndex: number,
priorSteps?: string[],
+ logger?: import('./task-logger').TaskLogger,
): Promise {
const steps: StepResult[] = [];
let llmCalls = 0;
@@ -473,7 +496,7 @@ export class ComputerUseBrain {
// they're purely vision-driven. Skip all a11y fetches to cut 3-10s per iteration.
const skipA11yCompletely = this.isVisualLoopSubtask(subtask);
- console.log(` 🖥️ Computer Use: "${subtask}"`);
+ console.log(` 🖥️ Layer 3: "${subtask.substring(0, 80)}${subtask.length > 80 ? '...' : ''}"`);
// Initialize checkpoint tracker
const taskType = detectTaskType(subtask);
@@ -488,7 +511,7 @@ export class ComputerUseBrain {
updateCheckpoints(this, action, description, claudeText);
},
};
- console.log(` 📋 Task type: ${taskType} — tracking ${checkpointNames.length} checkpoints`);
+ // checkpoint tracking is internal
// Build context from prior completed steps so the vision LLM doesn't redo work
let taskMessage = subtask;
@@ -502,6 +525,82 @@ export class ComputerUseBrain {
content: taskMessage,
});
+ // Fix 3: Window focus helper — verify target app is focused before starting
+ try {
+ const activeWindow = await this.a11y.getActiveWindow();
+ if (activeWindow) {
+ const activeProc = activeWindow.processName.toLowerCase();
+ const taskLower = subtask.toLowerCase();
+ // Detect expected target app from task text
+ const appHints: Record<string, string[]> = {
+ chrome: ['chrome', 'browser', 'web', 'google', 'gmail', 'youtube'],
+ msedge: ['edge', 'browser', 'web', 'bing'],
+ firefox: ['firefox', 'browser', 'web'],
+ outlook: ['outlook', 'email', 'mail'],
+ thunderbird: ['thunderbird', 'email', 'mail'],
+ notepad: ['notepad', 'text editor', 'note'],
+ code: ['vscode', 'vs code', 'visual studio code', 'code editor'],
+ excel: ['excel', 'spreadsheet'],
+ word: ['word', 'document', 'doc'],
+ explorer: ['file explorer', 'files', 'folder'],
+ slack: ['slack'],
+ teams: ['teams'],
+ discord: ['discord'],
+ paint: ['paint', 'draw', 'sketch'],
+ };
+ let expectedApp: string | null = null;
+ for (const [proc, keywords] of Object.entries(appHints)) {
+ if (keywords.some(kw => taskLower.includes(kw))) {
+ expectedApp = proc;
+ break;
+ }
+ }
+ // Store for continuous focus verification during the action loop
+ this.targetProcessName = expectedApp;
+ this.targetProcessId = null; // will be detected on first focus check
+ // Handle known process name aliases (e.g., new Outlook = "olk", not "outlook")
+ const procAliases: Record<string, string[]> = {
+ outlook: ['outlook', 'olk'],
+ chrome: ['chrome'],
+ firefox: ['firefox'],
+ notepad: ['notepad'],
+ word: ['word', 'winword'],
+ excel: ['excel'],
+ };
+ const matchesExpected = procAliases[expectedApp ?? '']
+ ? procAliases[expectedApp!].some(alias => activeProc.includes(alias))
+ : activeProc.includes(expectedApp ?? '');
+ if (expectedApp && !matchesExpected) {
+ // refocusing to expected app
+ // Try to find and focus the target window directly
+ const targetWin = await this.a11y.findWindow(expectedApp);
+ if (targetWin) {
+ this.targetProcessId = targetWin.processId;
+ await this.a11y.focusWindow(undefined, targetWin.processId);
+ await this.delay(500);
+ } else {
+ for (let attempt = 0; attempt < 3; attempt++) {
+ await this.desktop.keyPress('alt+tab');
+ await this.delay(500);
+ const newWindow = await this.a11y.getActiveWindow();
+ const newProc = (newWindow?.processName || '').toLowerCase();
+ const foundTarget = procAliases[expectedApp]
+ ? procAliases[expectedApp].some(alias => newProc.includes(alias))
+ : newProc.includes(expectedApp);
+ if (newWindow && foundTarget) {
+ this.targetProcessId = newWindow.processId;
+ break;
+ }
+ }
+ }
+ } else {
+ this.targetProcessId = activeWindow.processId;
+ }
+ }
+ } catch {
+ // focus check non-fatal
+ }
+
let consecutiveErrors = 0;
const MAX_CONSECUTIVE_ERRORS = 5;
let lastActionSignature = '';
@@ -512,12 +611,13 @@ export class ComputerUseBrain {
for (let i = 0; i < MAX_ITERATIONS; i++) {
llmCalls++;
- console.log(` 📡 Computer Use call ${i + 1}...`);
-
+ logger?.recordLlmCall();
+ const cuCallStart = performance.now();
const response = await this.callAPI(messages);
+ const cuCallMs = Math.round(performance.now() - cuCallStart);
if (response.error) {
- console.log(` ❌ API error: ${response.error}`);
+ console.log(` ❌ Layer 3 API error: ${response.error}`);
steps.push({
action: 'error',
description: `Computer Use API error: ${response.error}`,
@@ -533,15 +633,7 @@ export class ComputerUseBrain {
content: response.content,
});
- // Log any text blocks
- for (const block of response.content) {
- if ((block as TextBlock).type === 'text') {
- const text = (block as TextBlock).text;
- if (text.trim()) {
- console.log(` 💬 Vision LLM: ${text.substring(0, 120)}${text.length > 120 ? '...' : ''}`);
- }
- }
- }
+ // text blocks logged at debug level only
// If end_turn → vision LLM thinks it's done. Verify with checkpoints.
if (response.stop_reason === 'end_turn') {
@@ -550,21 +642,34 @@ export class ComputerUseBrain {
const skipVerify = skipA11yCompletely || /\b(draw|paint|sketch|doodle|color|design)\b/i.test(subtask);
if (skipVerify) {
- console.log(` ✅ Computer Use: subtask complete (skipping verification)`);
+ console.log(` ⚠️ Computer Use: LLM declared done (verification SKIPPED — visual/draw task)`);
+ logger?.logStep({ layer: 3, actionType: 'done', result: 'success', verification: { method: 'none', verified: false, detail: 'visual/draw task — verification skipped' } });
steps.push({
action: 'done',
- description: `Computer Use completed: "${subtask}"`,
+ description: `Computer Use completed (unverified): "${subtask}"`,
success: true,
timestamp: Date.now(),
});
return { success: true, steps, llmCalls };
}
- // Only verify when completion is uncertain
- const completedRatio = tracker.checkpoints.filter(c => c.detected).length / tracker.checkpoints.length;
- if (completedRatio < 0.80) {
- // For non-visual tasks: take a verification screenshot and ask the vision LLM to confirm
- console.log(` 🔍 Verifying outcome...`);
+ // Email-specific shortcut: if task was email-related and compose window closed, trust it
+ const isEmailTask = /\b(email|mail|send|compose|outlook|gmail)\b/i.test(subtask);
+ if (isEmailTask) {
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ const winTitle = (activeWin?.title || '').toLowerCase();
+ const isCompose = winTitle.includes('new message') ||
+ winTitle.includes('untitled') ||
+ winTitle.includes('compose');
+ const isInbox = winTitle.includes('inbox') || winTitle.includes('mail') || winTitle.includes('outlook');
+ if (!isCompose && isInbox) {
+ steps.push({ action: 'done', description: `Task complete — compose window closed, now at inbox (${activeWin?.title})`, success: true, timestamp: Date.now() });
+ return { success: true, steps, llmCalls };
+ }
+ }
+
+ // ALWAYS verify with vision when LLM declares done — take a screenshot and confirm
+ {
llmCalls++;
const [verifyScreenshot, a11yContext] = await Promise.all([
@@ -585,15 +690,16 @@ export class ComputerUseBrain {
const verifyResponse = await this.callAPI(messages);
if (verifyResponse.error) {
- // If verification call fails, trust the original result
- console.log(` ⚠️ Verification call failed, trusting original result`);
+ // Verification call failed — report as unverified failure
+ console.log(` ⚠️ Layer 3 verification API call failed — marking unverified`);
+ logger?.logStep({ layer: 3, actionType: 'done', result: 'fail', verification: { method: 'vision', verified: false, detail: 'verification API call failed' } });
steps.push({
action: 'done',
- description: `Computer Use completed: "${subtask}" (unverified)`,
- success: true,
+ description: `Computer Use completed (UNVERIFIED — verify call failed): "${subtask}"`,
+ success: false,
timestamp: Date.now(),
});
- return { success: true, steps, llmCalls };
+ return { success: false, steps, llmCalls };
}
// Parse verification response
@@ -602,27 +708,63 @@ export class ComputerUseBrain {
.map((b: ContentBlock) => (b as TextBlock).text)
.join('');
- console.log(` 🔍 Verification: ${verifyText.substring(0, 120)}${verifyText.length > 120 ? '...' : ''}`);
-
- // Check if verified
const verifiedMatch = verifyText.match(/"verified"\s*:\s*(true|false)/);
const isVerified = verifiedMatch ? verifiedMatch[1] === 'true' : !verifyText.toLowerCase().includes('"verified": false');
if (isVerified) {
- console.log(` ✅ Computer Use: subtask VERIFIED complete`);
- steps.push({
- action: 'done',
- description: `Computer Use completed (verified): "${subtask}"`,
- success: true,
- timestamp: Date.now(),
- });
- return { success: true, steps, llmCalls };
+ // Ground truth post-verification — don't trust vision alone
+ const groundTruth = await this.groundTruthCheck(subtask, logger);
+ if (groundTruth.pass) {
+ console.log(` ✅ Layer 3 verified (ground truth: ${groundTruth.detail})`);
+ logger?.logStep({
+ layer: 3,
+ actionType: 'done',
+ result: 'success',
+ verification: { method: 'vision', verified: true, detail: `vision: ${verifyText.substring(0, 100)} | ground_truth: ${groundTruth.detail}` },
+ });
+ steps.push({
+ action: 'done',
+ description: `Computer Use completed (verified): "${subtask}"`,
+ success: true,
+ timestamp: Date.now(),
+ });
+ return { success: true, steps, llmCalls };
+ } else {
+ // Vision said verified but ground truth disagrees
+ console.log(` ⚠️ Layer 3 vision said verified but ground truth FAILED: ${groundTruth.detail}`);
+ logger?.logStep({
+ layer: 3,
+ actionType: 'done_rejected',
+ result: 'fail',
+ verification: { method: 'a11y_readback', verified: false, detail: `vision_lied: ${groundTruth.detail}` },
+ });
+ // Push back — don't accept, let the loop continue
+ verificationFailures++;
+ if (verificationFailures >= MAX_VERIFICATION_RETRIES) {
+ steps.push({
+ action: 'done',
+ description: `Computer Use completed (UNVERIFIED — ground truth failed): "${subtask}"`,
+ success: false,
+ timestamp: Date.now(),
+ });
+ return { success: false, steps, llmCalls };
+ }
+ messages.push({
+ role: 'assistant',
+ content: verifyResponse.content,
+ });
+ messages.push({
+ role: 'user',
+ content: `GROUND TRUTH CHECK FAILED: ${groundTruth.detail}. The task is NOT actually done. Take a screenshot and fix the issue.`,
+ });
+ continue;
+ }
}
// Not verified — vision LLM should continue with recovery
verificationFailures++;
if (verificationFailures >= MAX_VERIFICATION_RETRIES) {
- console.log(` ⚠️ Verification failed ${verificationFailures} times — accepting result to avoid infinite loop`);
+ // max verification retries reached
steps.push({
action: 'done',
description: `Computer Use completed (unverified after ${verificationFailures} retries): "${subtask}"`,
@@ -632,7 +774,7 @@ export class ComputerUseBrain {
return { success: false, steps, llmCalls };
}
- console.log(` ❌ Verification FAILED (${verificationFailures}/${MAX_VERIFICATION_RETRIES}) — analyzing logs and retrying`);
+ // verification failed — retrying
messages.push({
role: 'assistant',
content: verifyResponse.content,
@@ -663,15 +805,6 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
// Continue the loop — vision LLM will take corrective action
continue;
- } else {
- console.log(` ✅ Skipping verification — ${Math.round(completedRatio * 100)}% checkpoints already confirmed`);
- steps.push({
- action: 'done',
- description: `Computer Use completed (checkpoints): "${subtask}"`,
- success: true,
- timestamp: Date.now(),
- });
- return { success: true, steps, llmCalls };
}
}
@@ -701,7 +834,7 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
if (action === 'screenshot') {
// Always provide screenshot for explicit screenshot requests
- console.log(` 📸 Screenshot requested`);
+ // screenshot requested
// Run screenshot + a11y in parallel when a11y is needed
const [screenshot, a11yContext] = await Promise.all([
this.desktop.captureForLLM(),
@@ -734,7 +867,7 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
}
this.heldKeys = [];
}
- console.log(` ${result.error ? '❌' : '✅'} ${result.description}`);
+ if (result.error) console.log(` ❌ ${result.description}`);
steps.push({
action: action,
@@ -744,13 +877,21 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
timestamp: Date.now(),
});
+ logger?.logStep({
+ layer: 3,
+ actionType: action,
+ result: result.error ? 'fail' : 'success',
+ actionParams: { coordinate: toolUse.input.coordinate, text: toolUse.input.text?.substring(0, 80) },
+ error: result.error,
+ });
+
// Track consecutive errors for bail-out
if (result.error) {
consecutiveErrors++;
lastActionSignature = '';
repeatedActionStreak = 0;
if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
- console.log(` ❌ ${MAX_CONSECUTIVE_ERRORS} consecutive errors — aborting task`);
+ console.log(` ❌ Layer 3: too many errors — aborting`);
return { success: false, steps, llmCalls };
}
} else {
@@ -786,7 +927,7 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
],
});
if (loopDetected) {
- console.log(` ♻️ Loop guard: repeated action detected — forcing recovery context`);
+ // loop guard triggered
repeatedActionStreak = 0;
lastActionSignature = '';
}
@@ -799,25 +940,39 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
const delayMs = isAppLaunch ? 600 : isNavigation ? 300 : isTyping ? 30 : isDrag ? 30 : 80;
await this.delay(delayMs);
- // Skip a11y after simple clicks/types, and always when in visual-loop mode.
- // Run screenshot + a11y in parallel when a11y is needed.
- const skipA11y = skipA11yCompletely || isTyping || (action === 'left_click' && !isNavigation);
- const [screenshot, a11yContext] = await Promise.all([
- this.desktop.captureForLLM(),
- this.getA11yContext(isAppLaunch, skipA11y),
- ]);
- if (debugDir) this.saveDebugScreenshot(screenshot.buffer, debugDir, subtaskIndex, i, action);
- const verifyHint = this.getVerificationHint(action, toolUse.input);
- const focusHint = this.getFocusHint(action, toolUse.input);
-
- toolResults.push({
- type: 'tool_result',
- tool_use_id: toolUse.id,
- content: [
- this.screenshotToContent(screenshot),
- { type: 'text', text: `${focusHint}${verifyHint}${a11yContext}` },
- ],
- });
+ // Fix 1: Wait for UI to settle after critical key actions before screenshot
+ await this.waitForUISettle(action, toolUse.input.text || '');
+
+ // Fix 2: Skip expensive screenshot for type/key actions — use a11y verification instead
+ const skipScreenshot = action === 'type' || (action === 'key' && !isNavigation && !isAppLaunch);
+ if (skipScreenshot) {
+ const a11yContext = await this.getA11yContext(false, false);
+ toolResults.push({
+ type: 'tool_result',
+ tool_use_id: toolUse.id,
+ content: [{ type: 'text', text: `Action executed. Current accessibility state:\n${a11yContext}` }],
+ });
+ } else {
+ // Skip a11y after simple clicks/types, and always when in visual-loop mode.
+ // Run screenshot + a11y in parallel when a11y is needed.
+ const skipA11y = skipA11yCompletely || (action === 'left_click' && !isNavigation);
+ const [screenshot, a11yContext] = await Promise.all([
+ this.desktop.captureForLLM(),
+ this.getA11yContext(isAppLaunch, skipA11y),
+ ]);
+ if (debugDir) this.saveDebugScreenshot(screenshot.buffer, debugDir, subtaskIndex, i, action);
+ const verifyHint = this.getVerificationHint(action, toolUse.input);
+ const focusHint = this.getFocusHint(action, toolUse.input);
+
+ toolResults.push({
+ type: 'tool_result',
+ tool_use_id: toolUse.id,
+ content: [
+ this.screenshotToContent(screenshot),
+ { type: 'text', text: `${focusHint}${verifyHint}${a11yContext}` },
+ ],
+ });
+ }
} else {
// Not last in batch: lightweight response, skip screenshot
const isAppLaunch = action === 'key' && toolUse.input.text?.toLowerCase().includes('super');
@@ -827,7 +982,7 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
const delayMs = isAppLaunch ? 600 : isDrag ? 20 : isClick ? 30 : 80;
await this.delay(delayMs);
- console.log(` ⏭️ Skipping screenshot (batch ${ti+1}/${toolUseBlocks.length})`);
+ // batch action — skip screenshot
toolResults.push({
type: 'tool_result',
tool_use_id: toolUse.id,
@@ -858,7 +1013,7 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
});
}
- console.log(` ⚠️ Max iterations (${MAX_ITERATIONS}) reached`);
+ console.log(` ⚠️ Layer 3: max iterations reached`);
return { success: false, steps, llmCalls };
}
@@ -933,6 +1088,42 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
return { content: [], stop_reason: 'end_turn', error: 'Max retries exceeded' };
}
+ // ─── Ground Truth Post-Verification ──────────────────────────
+
+ /**
+ * Programmatic verification after vision model claims done.
+ * Uses UIA/clipboard/window state — NOT another LLM call.
+ * Returns { pass: true/false, detail: string }
+ */
+ private async groundTruthCheck(
+ subtask: string,
+ _logger?: import('./task-logger').TaskLogger,
+ ): Promise<{ pass: boolean; detail: string }> {
+ try {
+ if (this.verifier) {
+ const readClip = () => this.a11y.readClipboard();
+ const result = await this.verifier.verify(subtask, readClip);
+ return { pass: result.pass, detail: `[${result.method}] ${result.detail}` };
+ }
+
+ // Minimal inline fallback when no verifier is set
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ const processName = (activeWin?.processName || '').toLowerCase();
+
+ if (/notepad/i.test(subtask) || processName === 'notepad') {
+ const focused = await this.a11y.getFocusedElement().catch(() => null);
+ if (focused?.value && focused.value.trim().length > 10) {
+ return { pass: true, detail: `notepad has ${focused.value.length} chars` };
+ }
+ return { pass: false, detail: `notepad appears empty` };
+ }
+
+ return { pass: true, detail: `no verifier available — trusting vision` };
+ } catch (err) {
+ return { pass: true, detail: `ground truth error: ${String(err).substring(0, 80)}` };
+ }
+ }
+
// ─── Action Execution ──────────────────────────────────────────
private async executeAction(toolUse: ToolUseBlock): Promise<{ description: string; error?: string }> {
@@ -952,6 +1143,11 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
}
try {
+ // Verify target app is still focused before executing (prevents typing in wrong window)
+ if (action !== 'screenshot') {
+ await this.verifyAndRefocus();
+ }
+
switch (action) {
case 'left_click': {
const [x, y] = this.scale(coordinate!);
@@ -1026,6 +1222,15 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
case 'key': {
if (!text) return { description: 'Key press: empty', error: 'No key provided' };
+ // Block Alt+Tab — it breaks window focus and the agent loses context
+ const keyNorm = text.toLowerCase().replace(/\s/g, '');
+ if (keyNorm.includes('alt+tab') || keyNorm.includes('alt+shift+tab')) {
+ return { description: `Key press: ${text} — BLOCKED (Alt+Tab disabled, use focusWindow instead)` };
+ }
+ // Block Win/Super key — opens Start menu chaos
+ if (keyNorm === 'super' || keyNorm === 'win' || keyNorm === 'meta') {
+ return { description: `Key press: ${text} — BLOCKED (Win key disabled)` };
+ }
// Map Anthropic key names to nut-js key names
const mappedKey = this.mapKeyName(text);
await this.desktop.keyPress(mappedKey);
@@ -1087,6 +1292,64 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
}
+ /**
+ * Verify the target app is still focused. If not, refocus it.
+ * Prevents actions landing in the wrong window (e.g., typing in Edge URL bar).
+ */
+ private targetProcessId: number | null = null;
+
+ private async verifyAndRefocus(): Promise<void> {
+ if (!this.targetProcessName) return;
+ try {
+ const activeWin = await this.a11y.getActiveWindow();
+ if (!activeWin) return;
+ const activeProc = activeWin.processName.toLowerCase();
+ const targetProc = this.targetProcessName.toLowerCase();
+
+ // Handle aliases (e.g., new Outlook = "olk")
+ const procAliases: Record<string, string[]> = {
+ outlook: ['outlook', 'olk'],
+ chrome: ['chrome'],
+ msedge: ['msedge', 'edge'],
+ firefox: ['firefox'],
+ notepad: ['notepad'],
+ word: ['word', 'winword'],
+ excel: ['excel'],
+ };
+
+ const aliases = procAliases[targetProc] || [targetProc];
+ const isFocused = aliases.some(alias => activeProc.includes(alias));
+
+ if (!isFocused) {
+ // refocusing — focus lost
+ // First try by stored processId (fastest, most reliable)
+ if (this.targetProcessId) {
+ await this.a11y.focusWindow(undefined, this.targetProcessId);
+ await this.delay(400);
+ return;
+ }
+ // Try by process name through window search
+ const targetWin = await this.a11y.findWindow(this.targetProcessName);
+ if (targetWin) {
+ this.targetProcessId = targetWin.processId;
+ await this.a11y.focusWindow(undefined, targetWin.processId);
+ await this.delay(400);
+ } else {
+ // Fallback: Alt+Tab to cycle
+ await this.desktop.keyPress('alt+tab');
+ await this.delay(500);
+ }
+ } else {
+ // Store the processId for faster refocus next time
+ if (!this.targetProcessId) {
+ this.targetProcessId = activeWin.processId;
+ }
+ }
+ } catch {
+ // Non-fatal — best effort
+ }
+ }
+
/** Build a compact signature used to detect repeated no-progress action loops. */
private actionSignature(toolUse: ToolUseBlock): string {
const { action, coordinate, text, key, scroll_direction, scroll_amount } = toolUse.input;
@@ -1242,6 +1505,35 @@ Fix the specific missed step. Do NOT repeat steps that already succeeded.`,
private delay(ms: number): Promise<void> {
return new Promise<void>(resolve => setTimeout(resolve, ms));
}
+
+ /**
+ * For critical actions (send/submit/confirm/close), wait up to maxMs for
+ * the a11y tree to reflect the expected change, polling every 400ms.
+ * Returns true if a change was detected, false if timed out.
+ */
+ private async waitForUISettle(action: string, keyText: string, maxMs = 4000): Promise<boolean> {
+ const criticalKeys = ['ctrl+return', 'ctrl+enter', 'return', 'enter', 'escape', 'ctrl+s', 'ctrl+w', 'alt+f4'];
+ const keyLower = (keyText || '').toLowerCase().replace(/\s/g, '');
+ if (action !== 'key' || !criticalKeys.some(k => keyLower.includes(k))) return false;
+
+ // settling after key press
+ const before = await this.a11y.getActiveWindow().catch(() => null);
+ const beforeTitle = before?.title || '';
+
+ const interval = 400;
+ const attempts = Math.ceil(maxMs / interval);
+ for (let i = 0; i < attempts; i++) {
+ await this.delay(interval);
+ const after = await this.a11y.getActiveWindow().catch(() => null);
+ const afterTitle = after?.title || '';
+ if (afterTitle !== beforeTitle) {
+ // UI settled
+ return true; // window changed — action took effect
+ }
+ }
+ // settle timeout
+ return false;
+ }
}
diff --git a/src/openclaw-credentials.ts b/src/credentials.ts
similarity index 94%
rename from src/openclaw-credentials.ts
rename to src/credentials.ts
index ced0ecf..1a423e4 100644
--- a/src/openclaw-credentials.ts
+++ b/src/credentials.ts
@@ -3,10 +3,16 @@ import * as os from 'os';
import * as path from 'path';
/**
- * OpenClaw-aware credential resolution.
+ * Credential Resolution — multi-source API key + endpoint detection.
*
- * In skill mode, Clawd Cursor should reuse OpenClaw's configured providers/models
- * instead of inferring provider from key prefixes.
+ * Precedence:
+ * 1. Explicit CLI flags (--api-key, --provider, --base-url)
+ * 2. External config files (e.g. OpenClaw auth-profiles, if installed)
+ * 3. Environment variables (AI_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY, etc.)
+ * 4. Local .clawdcursor-config.json
+ *
+ * External integrations (OpenClaw, etc.) are optional — Clawd Cursor works
+ * fully standalone with just env vars or CLI flags.
*/
export interface ResolvedApiConfig {
@@ -19,7 +25,7 @@ export interface ResolvedApiConfig {
textBaseUrl?: string;
visionApiKey?: string;
visionBaseUrl?: string;
- source: 'openclaw' | 'local';
+ source: 'external' | 'local';
}
interface ModelInfo {
@@ -184,8 +190,10 @@ function getOpenClawRoots(): string[] {
}
function readConfiguredProvider(): string | undefined {
- const configPath = path.join(process.cwd(), '.clawd-config.json');
- const cfg = safeReadJson(configPath);
+ // Check both the package directory (where the code lives) and cwd
+ const pkgConfigPath = path.join(__dirname, '..', '.clawdcursor-config.json');
+ const cwdConfigPath = path.join(process.cwd(), '.clawdcursor-config.json');
+ const cfg = safeReadJson(pkgConfigPath) || safeReadJson(cwdConfigPath);
if (!cfg || !isObject(cfg)) return undefined;
const provider = pick(cfg.provider, cfg?.pipeline?.provider, cfg?.pipeline?.providerKey);
@@ -375,12 +383,12 @@ function resolveFromOpenClawFiles(): ResolvedApiConfig | null {
visionApiKey: resolvedVisionApiKey,
visionBaseUrl: resolvedVisionBaseUrl,
provider: normalizeProvider(selectedProvider.key) || inferProviderFromBaseUrl(selectedProvider.baseUrl),
- source: 'openclaw',
+ source: 'external',
};
}
/**
- * Resolve key + endpoint + models with OpenClaw-first precedence.
+ * Resolve key + endpoint + models from all available sources.
*/
export function resolveApiConfig(opts?: {
apiKey?: string;
@@ -414,7 +422,7 @@ export function resolveApiConfig(opts?: {
return fromFiles;
}
- // Transitional fallback if OpenClaw explicitly injects runtime env vars.
+ // Check for externally-injected runtime env vars (e.g. from orchestration platforms).
const openClawKey = pick(
process.env.OPENCLAW_AI_API_KEY,
process.env.OPENCLAW_API_KEY,
@@ -456,7 +464,7 @@ export function resolveApiConfig(opts?: {
textBaseUrl: openClawBaseUrl,
visionApiKey: openClawKey,
visionBaseUrl: openClawBaseUrl,
- source: 'openclaw',
+ source: 'external',
};
}
diff --git a/src/dashboard.ts b/src/dashboard.ts
index dba0aaf..ea4a83d 100644
--- a/src/dashboard.ts
+++ b/src/dashboard.ts
@@ -800,7 +800,7 @@ const DASHBOARD_HTML = `
// When starring, check for credentials
if (!isFav && looksLikeCredential(item.task)) {
var msg = '🔒 This task may contain sensitive info (API key, password, or token).\\n\\n' +
- 'Starred commands are saved locally in .clawd-favorites.json on your machine — ' +
+ 'Starred commands are saved locally in .clawdcursor-favorites.json on your machine — ' +
'never sent over the network. Your credentials stay secure on your device.\\n\\n' +
'Star this command anyway?';
if (!confirm(msg)) return;
diff --git a/src/deterministic-flows.ts b/src/deterministic-flows.ts
new file mode 100644
index 0000000..dd373f1
--- /dev/null
+++ b/src/deterministic-flows.ts
@@ -0,0 +1,177 @@
+/**
+ * Deterministic Flows — zero-LLM verified workflows for known app patterns.
+ *
+ * Each step uses the action verifier to guarantee actions worked.
+ * If any step fails, returns { handled: false } so the caller can
+ * fall back to Layer 2 (LLM reasoner).
+ */
+
+import { AccessibilityBridge } from './accessibility';
+import { NativeDesktop } from './native-desktop';
+import { ActionVerifier } from './action-verifier';
+
+export interface FlowResult {
+ handled: boolean;
+ description: string;
+ failedAtStep?: number;
+ stepsCompleted?: number;
+}
+
+export class DeterministicFlows {
+ private a11y: AccessibilityBridge;
+ private desktop: NativeDesktop;
+ private verifier: ActionVerifier;
+
+ constructor(a11y: AccessibilityBridge, desktop: NativeDesktop) {
+ this.a11y = a11y;
+ this.desktop = desktop;
+ this.verifier = new ActionVerifier(a11y, desktop);
+ }
+
+ /**
+ * Try to match and execute a deterministic flow.
+ * Returns null if no flow matches the task.
+ */
+ async tryFlow(task: string, app: string): Promise<FlowResult | null> {
+ const appLower = app.toLowerCase();
+ const taskLower = task.toLowerCase();
+
+ // Outlook email flow (process may be msedge but window title contains "Outlook")
+ if (/outlook|olk/i.test(appLower) && /send.*email|email.*to|mail.*to|introduce/i.test(taskLower)) {
+ const parsed = this.parseEmailTask(taskLower);
+ if (parsed) {
+ return this.outlookEmailFlow(parsed.to, parsed.subject, parsed.body);
+ }
+ }
+
+ return null; // No matching flow
+ }
+
+ private parseEmailTask(task: string): { to: string; subject: string; body: string } | null {
+ // Extract email address
+ const emailMatch = task.match(/[\w.-]+@[\w.-]+\.\w+/);
+ if (!emailMatch) return null;
+
+ const to = emailMatch[0];
+
+ // Extract "saying X" or "with subject X" or "about X"
+ let subject = 'Hello';
+ let body = '';
+
+ const sayingMatch = task.match(/saying\s+["']?(.+?)["']?$/i);
+ const subjectMatch = task.match(/(?:subject|about)\s+["']?(.+?)["']?(?:\s+(?:saying|body)|$)/i);
+ const bodyMatch = task.match(/body\s+["']?(.+?)["']?$/i);
+
+ if (subjectMatch) {
+ subject = subjectMatch[1].trim();
+ } else if (sayingMatch) {
+ subject = sayingMatch[1].trim();
+ }
+
+ if (bodyMatch) {
+ body = bodyMatch[1].trim();
+ } else if (sayingMatch) {
+ body = sayingMatch[1].trim();
+ } else {
+ body = subject;
+ }
+
+ return { to, subject, body };
+ }
+
+ /**
+ * Outlook email: deterministic Tab-based navigation.
+ * Ctrl+N → type To → Tab → type Subject → Tab → type Body → Ctrl+Enter
+ */
+ private async outlookEmailFlow(to: string, subject: string, body: string): Promise<FlowResult> {
+ console.log(` 📧 Deterministic email flow: to=${to} subject="${subject}"`);
+ let step = 0;
+
+ try {
+ // Step 1: Open compose via UIAutomation invoke on "New mail" button
+ step = 1;
+ const activeWin = await this.a11y.getActiveWindow();
+ let composeOpen = false;
+
+ // Try UIAutomation invoke first — bypasses keyboard focus issues
+ try {
+ const invokeResult = await this.a11y.invokeElement({
+ name: 'New mail',
+ controlType: 'ControlType.Button',
+ action: 'click',
+ processId: activeWin?.processId,
+ });
+ if (invokeResult.success || invokeResult.clickPoint) {
+ if (invokeResult.clickPoint) {
+ await this.desktop.mouseClick(invokeResult.clickPoint.x, invokeResult.clickPoint.y);
+ }
+ console.log(` 📧 Step 1: Invoked "New mail" via UIAutomation`);
+ await new Promise(r => setTimeout(r, 2000));
+ composeOpen = true;
+ }
+ } catch { /* fall through to Ctrl+N */ }
+
+ // Fallback: click center + Ctrl+N
+ if (!composeOpen) {
+ const b = activeWin?.bounds;
+ if (b && b.x > -100 && b.y > -100 && b.width > 100 && b.height > 100) {
+ await this.desktop.mouseClick(b.x + Math.floor(b.width / 2), b.y + Math.floor(b.height / 2));
+ await new Promise(r => setTimeout(r, 300));
+ }
+ await this.desktop.keyPress('Control+n');
+ console.log(` 📧 Step 1: Fallback Ctrl+N, waiting for compose...`);
+ await new Promise(r => setTimeout(r, 2000));
+ composeOpen = true; // trust it — verification below will catch failures
+ }
+
+ // Step 2: Type recipient in To field
+ step = 2;
+ const typeToResult = await this.verifier.verifiedType(to);
+ console.log(` 📧 Step 2: Typed To "${to}" — ${typeToResult.success ? 'OK' : typeToResult.error}`);
+
+ // Step 3: Tab to Subject
+ step = 3;
+ const tabToSubject = await this.verifier.verifiedKeyPress('Tab', { focusShouldChange: true });
+ console.log(` 📧 Step 3: Tab to Subject — ${tabToSubject.success ? 'OK' : tabToSubject.error}`);
+
+ // Step 4: Type subject
+ step = 4;
+ const typeSubjectResult = await this.verifier.verifiedType(subject);
+ console.log(` 📧 Step 4: Typed Subject "${subject}" — ${typeSubjectResult.success ? 'OK' : typeSubjectResult.error}`);
+
+ // Step 5: Tab to Body
+ step = 5;
+ const tabToBody = await this.verifier.verifiedKeyPress('Tab', { focusShouldChange: true });
+ console.log(` 📧 Step 5: Tab to Body — ${tabToBody.success ? 'OK' : tabToBody.error}`);
+
+ // Step 6: Type body
+ step = 6;
+ const typeBodyResult = await this.verifier.verifiedType(body);
+ console.log(` 📧 Step 6: Typed Body — ${typeBodyResult.success ? 'OK' : typeBodyResult.error}`);
+
+ // Step 7: Send with Ctrl+Enter
+ step = 7;
+ const sendResult = await this.verifier.verifiedKeyPress('Control+Return', { windowShouldClose: true });
+ if (sendResult.success) {
+ console.log(` 📧 Step 7: Ctrl+Enter — email sent!`);
+ return { handled: true, description: `Email sent to ${to} with subject "${subject}"`, stepsCompleted: 7 };
+ }
+
+ // Ctrl+Enter didn't close window — try Alt+S as fallback
+ step = 8;
+ console.log(` 📧 Step 7 fallback: Ctrl+Enter didn't close compose, trying Alt+S`);
+ const altSResult = await this.verifier.verifiedKeyPress('Alt+s', { windowShouldClose: true });
+ if (altSResult.success) {
+ console.log(` 📧 Step 8: Alt+S — email sent!`);
+ return { handled: true, description: `Email sent to ${to} (via Alt+S)`, stepsCompleted: 8 };
+ }
+
+ console.log(` ❌ Deterministic flow: send failed (both Ctrl+Enter and Alt+S)`);
+ return { handled: false, description: 'Send shortcut did not work', failedAtStep: step, stepsCompleted: step };
+
+ } catch (err) {
+ console.log(` ❌ Deterministic flow error at step ${step}: ${err}`);
+ return { handled: false, description: `Error at step ${step}: ${err}`, failedAtStep: step, stepsCompleted: step - 1 };
+ }
+ }
+}
diff --git a/src/doctor.ts b/src/doctor.ts
index 30a7894..0f95e17 100644
--- a/src/doctor.ts
+++ b/src/doctor.ts
@@ -1,12 +1,14 @@
/**
* 🩺 Clawd Cursor Doctor - diagnoses setup and auto-configures the pipeline.
*
- * Tests:
- * 1. Screen capture (nut-js)
- * 2. Accessibility bridge (PowerShell / osascript)
- * 3. Input control (keyboard/mouse)
- * 4. AI provider connectivity + model availability (ALL providers in parallel)
- * 5. Builds optimal mixed 3-layer pipeline config
+ * Phases:
+ * 1. Screen capture test (nut-js)
+ * 2. Accessibility bridge test (PowerShell / osascript)
+ * 3. AI provider scan — all providers in parallel
+ * 4. Model verification — text: instruction-following, vision: real image input
+ * 5. Smoke test — a11y→LLM round-trip (reads active window, confirms via model)
+ * 6. Interactive pipeline selection
+ * 7. Save config
*/
import * as fs from 'fs';
@@ -30,9 +32,9 @@ import type {
ModelTestResult,
} from './providers';
import { DEFAULT_CONFIG } from './types';
-import { resolveApiConfig } from './openclaw-credentials';
+import { resolveApiConfig } from './credentials';
-const CONFIG_FILE = '.clawd-config.json';
+const CONFIG_FILE = '.clawdcursor-config.json';
const execFileAsync = promisify(execFile);
interface DiagResult {
export async function quickSetup(): Promise<PipelineConfig> {
// 3. Build best pipeline automatically
const pipeline = buildMixedPipeline(scanResults, modelTests);
- // 4. Save to .clawd-config.json
+ // 4. Save to .clawdcursor-config.json
savePipelineConfig(pipeline, scanResults);
// 5. Return pipeline
@@ -156,83 +158,18 @@ async function quickTestModelAsync(
}
/**
- * Quick model test with 5s timeout.
+ * Quick model test with 5s timeout — uses the same real tests as full doctor.
+ * For quick setup, we accept text-only pings for vision models to save time.
*/
async function quickTestModel(
provider: ProviderProfile,
apiKey: string,
model: string,
- _isVision: boolean,
+ isVision: boolean,
): Promise<{ ok: boolean; latencyMs?: number; error?: string }> {
- const start = performance.now();
-
- try {
- if (provider.openaiCompat) {
- const response = await fetch(`${provider.baseUrl}/chat/completions`, {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- ...provider.authHeader(apiKey),
- },
- body: JSON.stringify({
- model,
- max_tokens: 5,
- messages: [{ role: 'user', content: 'OK' }],
- }),
- signal: AbortSignal.timeout(5000), // 5s timeout for quick setup
- });
-
- const data = await response.json() as any;
- if (data.error) {
- const msg = typeof data.error === 'object' && data.error !== null
- ? (data.error.message || JSON.stringify(data.error))
- : String(data.error);
- return { ok: false, error: msg };
- }
- const text = data.choices?.[0]?.message?.content || '';
- if (!text) return { ok: false, error: 'Empty response' };
-
- return { ok: true, latencyMs: Math.round(performance.now() - start) };
- } else {
- // Anthropic API
- const response = await fetch(`${provider.baseUrl}/messages`, {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- ...provider.authHeader(apiKey),
- ...provider.extraHeaders,
- },
- body: JSON.stringify({
- model,
- max_tokens: 5,
- messages: [{ role: 'user', content: 'OK' }],
- }),
- signal: AbortSignal.timeout(5000), // 5s timeout for quick setup
- });
-
- const data = await response.json() as any;
- if (data.type === 'error' && data.error) {
- const err = data.error;
- const msg = typeof err === 'object' && err !== null
- ? (err.message || JSON.stringify(err))
- : String(err);
- return { ok: false, error: msg };
- }
- if (data.error) {
- const msg = typeof data.error === 'object' && data.error !== null
- ? (data.error.message || JSON.stringify(data.error))
- : String(data.error);
- return { ok: false, error: msg };
- }
-
- return { ok: true, latencyMs: Math.round(performance.now() - start) };
- }
- } catch (err: any) {
- if (err.name === 'TimeoutError' || err.name === 'AbortError') {
- return { ok: false, error: 'Timeout (5s)' };
- }
- return { ok: false, error: err.message || String(err) };
- }
+ // Quick setup: text-only ping for both roles (speed over thoroughness)
+ // Full doctor will do the real vision test later
+ return testTextModel(provider, apiKey, model);
}
export async function runDoctor(opts: {
@@ -243,6 +180,18 @@ export async function runDoctor(opts: {
visionModel?: string;
save?: boolean;
}): Promise<void> {
+ // Doctor is interactive-only. If stdin is not a TTY (e.g. run in background,
+ // piped, or via a script), exit immediately instead of hanging forever waiting
+ // for user input that will never come.
+ if (!process.stdin.isTTY || !process.stdout.isTTY) {
+ console.error(
+ '\n❌ clawdcursor doctor requires an interactive terminal.\n' +
+ ' Open a terminal window and run: clawdcursor doctor\n' +
+ ' Do NOT run it in the background, piped, or from a script.\n'
+ );
+ process.exit(1);
+ }
+
const results: DiagResult[] = [];
console.log(`\n🩺 Clawd Cursor Doctor - diagnosing your setup...\n`);
@@ -275,6 +224,64 @@ export async function runDoctor(opts: {
desktop.disconnect();
}
+ // ─── 1b. macOS Permissions (Screen Recording + Accessibility) ───
+ if (process.platform === 'darwin') {
+ console.log('🍎 macOS permissions...');
+ // Screen Recording: try to capture — if it fails with "not permitted" TCC denied it
+ try {
+ const { execFileAsync } = await import('child_process').then(m => ({
+ execFileAsync: require('util').promisify(m.execFile) as (cmd: string, args: string[]) => Promise<{ stdout: string; stderr: string }>,
+ }));
+ // osascript can query Accessibility permission
+ const { stdout: axOut } = await execFileAsync('osascript', [
+ '-e', 'tell application "System Events" to return (UI elements enabled as string)',
+ ]).catch(() => ({ stdout: 'false', stderr: '' }));
+ const axOk = axOut.trim() === 'true';
+ results.push({
+ name: 'macOS Accessibility permission',
+ ok: axOk,
+ detail: axOk
+ ? 'Granted — clawdcursor can read UI elements'
+ : 'DENIED — open System Settings → Privacy & Security → Accessibility → enable Terminal/Node',
+ });
+ if (axOk) {
+ console.log(' ✅ Accessibility permission granted');
+ } else {
+ console.log(' ❌ Accessibility permission DENIED');
+ console.log(' → System Settings → Privacy & Security → Accessibility → enable your terminal');
+ }
+ } catch {
+ results.push({ name: 'macOS Accessibility permission', ok: false, detail: 'Could not query — run manually in a terminal' });
+ }
+
+ // Screen Recording: attempt a screencapture dry run
+ try {
+ const { execFileAsync } = await import('child_process').then(m => ({
+ execFileAsync: require('util').promisify(m.execFile) as (cmd: string, args: string[]) => Promise<{ stdout: string; stderr: string }>,
+ }));
+ const tmpFile = `/tmp/.clawdcursor-scrtest-${Date.now()}.png`;
+ const { stderr } = await execFileAsync('screencapture', ['-x', '-t', 'png', tmpFile])
+ .catch(e => ({ stdout: '', stderr: String(e) }));
+ const denied = stderr.toLowerCase().includes('not permitted') || stderr.toLowerCase().includes('permission');
+ try { require('fs').unlinkSync(tmpFile); } catch { /* cleanup */ }
+ results.push({
+ name: 'macOS Screen Recording permission',
+ ok: !denied,
+ detail: denied
+ ? 'DENIED — open System Settings → Privacy & Security → Screen Recording → enable Terminal/Node'
+ : 'Granted — clawdcursor can capture the screen',
+ });
+ if (denied) {
+ console.log(' ❌ Screen Recording permission DENIED');
+ console.log(' → System Settings → Privacy & Security → Screen Recording → enable your terminal');
+ } else {
+ console.log(' ✅ Screen Recording permission granted');
+ }
+ } catch {
+ results.push({ name: 'macOS Screen Recording permission', ok: false, detail: 'Could not verify — check manually' });
+ }
+ }
+
// ─── 2. Accessibility Bridge ─────────────────────────────────────
console.log('♿ Accessibility bridge...');
const a11y = new AccessibilityBridge();
@@ -432,7 +439,60 @@ export async function runDoctor(opts: {
results.push({ name: 'Vision model', ok: false, detail: 'No working vision model found' });
}
- // ─── 5. Interactive provider/model selection ───────────────────
+ // ─── 5. Smoke Test — end-to-end pipeline sanity ─────────────
+ if (workingText.length > 0) {
+ console.log(`\n🧪 Smoke test...`);
+ const bestText = workingText[0];
+ const smokeProvider = PROVIDERS[bestText.providerKey];
+ const smokeScan = scanResults.find(s => s.key === bestText.providerKey);
+ const smokeKey = smokeScan?.apiKey || '';
+
+ // Quick round-trip: read active window title via a11y, ask LLM to echo it
+ let smokeOk = false;
+ try {
+ const smokeA11y = new AccessibilityBridge();
+ const activeWin = await smokeA11y.getActiveWindow();
+ const windowTitle = activeWin?.title || 'Terminal';
+
+ // Send window title to text model, ask it to confirm
+ const smokeInstruction = `The active window is titled "${windowTitle}". Reply with exactly: SMOKE_PASS`;
+
+ let smokeText = '';
+ if (smokeProvider.openaiCompat) {
+ const res = await fetch(`${smokeProvider.baseUrl}/chat/completions`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json', ...smokeProvider.authHeader(smokeKey) },
+ body: JSON.stringify({ model: bestText.model, max_tokens: 15, temperature: 0, messages: [{ role: 'user', content: smokeInstruction }] }),
+ signal: AbortSignal.timeout(8000),
+ });
+ const data = await res.json() as any;
+ smokeText = data.choices?.[0]?.message?.content || '';
+ } else {
+ const res = await fetch(`${smokeProvider.baseUrl}/messages`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json', ...smokeProvider.authHeader(smokeKey), ...smokeProvider.extraHeaders },
+ body: JSON.stringify({ model: bestText.model, max_tokens: 15, messages: [{ role: 'user', content: smokeInstruction }] }),
+ signal: AbortSignal.timeout(8000),
+ });
+ const data = await res.json() as any;
+ smokeText = data.content?.[0]?.text || '';
+ }
+
+ smokeOk = smokeText.includes('SMOKE_PASS');
+ if (smokeOk) {
+ console.log(` ✅ A11y → LLM round-trip passed (window: "${windowTitle}")`);
+ results.push({ name: 'Smoke test (a11y→LLM)', ok: true, detail: `Window "${windowTitle}" — model confirmed` });
+ } else {
+ console.log(` ⚠️ LLM responded but didn't confirm (got: "${smokeText.substring(0, 40)}")`);
+ results.push({ name: 'Smoke test (a11y→LLM)', ok: false, detail: `Model didn't follow instruction: "${smokeText.substring(0, 40)}"` });
+ }
+ } catch (err) {
+ console.log(` ⚠️ Smoke test skipped: ${err}`);
+ results.push({ name: 'Smoke test (a11y→LLM)', ok: false, detail: `Error: ${err}` });
+ }
+ }
+
+ // ─── 6. Interactive provider/model selection ───────────────────
const recommendedPipeline = buildMixedPipeline(scanResults, modelTests);
const gpuInfo = await detectGpuInfo();
if (gpuInfo) {
@@ -454,13 +514,13 @@ export async function runDoctor(opts: {
console.log(` 🖥️ Computer Use API: enabled (Anthropic native)`);
}
- // ─── 6. Save Config ─────────────────────────────────────────────
+ // ─── 7. Save Config ─────────────────────────────────────────────
if (opts.save !== false) {
savePipelineConfig(pipeline, scanResults);
}
- // ─── 7. OpenClaw Skill Registration ──────────────────────────────
- await registerOpenClawSkill(results);
+ // ─── 8. External Skill Registration (optional) ─────────────────
+ await registerExternalSkills(results);
// ─── Summary ────────────────────────────────────────────────────
printSummary(results, pipeline);
@@ -545,10 +605,10 @@ async function runSingleProviderFlow(
}
}
- // Test vision model (Layer 3)
+ // Test vision model (Layer 3) — with actual image
if (apiKey) {
console.log(` Testing ${visionModel} (vision)...`);
- const visionResult = await testModel(provider, apiKey, visionModel, false);
+ const visionResult = await testModel(provider, apiKey, visionModel, true);
if (visionResult.ok) {
visionModelWorks = true;
results.push({
@@ -600,7 +660,7 @@ async function runSingleProviderFlow(
// Save Config
if (opts.save !== false) {
- const configPath = path.join(process.cwd(), CONFIG_FILE);
+ const configPath = path.join(path.resolve(__dirname, '..'), CONFIG_FILE);
const configData = {
provider: providerKey,
pipeline: {
@@ -621,8 +681,8 @@ async function runSingleProviderFlow(
console.log(`\n💾 Config saved to ${CONFIG_FILE}`);
}
- // OpenClaw Skill Registration
- await registerOpenClawSkill(results);
+ // External Skill Registration (optional)
+ await registerExternalSkills(results);
// Summary
printSummary(results, pipeline);
@@ -955,7 +1015,8 @@ async function testModelAsync(
* Save pipeline config to disk, including multi-provider info.
*/
function savePipelineConfig(pipeline: PipelineConfig, scanResults: ProviderScanResult[]): void {
- const configPath = path.join(process.cwd(), CONFIG_FILE);
+ // Always save to the package directory so loadPipelineConfig finds it reliably
+ const configPath = path.join(path.resolve(__dirname, '..'), CONFIG_FILE);
// Determine which providers are actually used
const layer2ProviderKey = providerKeyForUrl(pipeline.layer2.baseUrl) || pipeline.providerKey;
@@ -1077,92 +1138,54 @@ function printSummary(results: DiagResult[], pipeline: PipelineConfig): void {
}
/**
- * Register Clawd Cursor as an OpenClaw skill by symlinking into the workspace skills folder.
+ * Register Clawd Cursor as a skill in detected external platforms (OpenClaw, Codex, etc.).
+ * Purely optional — skips silently if no platforms are installed.
*/
-async function registerOpenClawSkill(results: DiagResult[]): Promise<void> {
- console.log('🔗 OpenClaw skill registration...');
+async function registerExternalSkills(results: DiagResult[]): Promise<void> {
+ const homeDir = process.env.HOME || process.env.USERPROFILE || '';
+ if (!homeDir) return;
- try {
- const homeDir = process.env.HOME || process.env.USERPROFILE || '';
- if (!homeDir) {
- console.log(' ⚠️ Could not determine home directory — skipping');
- return;
- }
+ const clawdCursorRoot = path.resolve(__dirname, '..');
- // Check common OpenClaw workspace locations
- const candidates = [
- path.join(homeDir, '.openclaw', 'workspace', 'skills'),
- path.join(homeDir, '.openclaw-dev', 'workspace', 'skills'),
- ];
-
- let skillsDir: string | null = null;
- for (const candidate of candidates) {
- if (fs.existsSync(candidate)) {
- skillsDir = candidate;
- break;
- }
- }
+ // Each entry: [platform name, skills directory path, target folder name]
+ const platforms: [string, string, string][] = [
+ ['OpenClaw', path.join(homeDir, '.openclaw', 'workspace', 'skills'), 'clawdcursor'],
+ ['OpenClaw (dev)', path.join(homeDir, '.openclaw-dev', 'workspace', 'skills'), 'clawdcursor'],
+ ['OpenClaw (flat)', path.join(homeDir, '.openclaw', 'skills'), 'clawdcursor'],
+ ['Codex', path.join(homeDir, '.codex', 'skills'), 'clawdcursor'],
+ ];
- if (!skillsDir) {
- console.log(' ℹ️ OpenClaw not detected — skipping skill registration');
- console.log(' 💡 Install OpenClaw (https://openclaw.ai) to use Clawd Cursor as an AI skill');
- return;
- }
+ let registered = 0;
+ for (const [name, skillsDir, folderName] of platforms) {
+ if (!fs.existsSync(skillsDir)) continue;
- const skillTarget = path.join(skillsDir, 'clawdcursor');
- const clawdCursorRoot = path.resolve(__dirname, '..');
+ const skillTarget = path.join(skillsDir, folderName);
- // Check if already registered
if (fs.existsSync(skillTarget)) {
- // Verify it points to the right place
- try {
- const stat = fs.lstatSync(skillTarget);
- if (stat.isSymbolicLink()) {
- const linkTarget = fs.readlinkSync(skillTarget);
- if (path.resolve(linkTarget) === clawdCursorRoot) {
- console.log(' ✅ Already registered as OpenClaw skill');
- results.push({ name: 'OpenClaw skill', ok: true, detail: 'Registered (symlink)' });
- return;
- }
- // Wrong symlink — remove and recreate
- fs.unlinkSync(skillTarget);
- } else {
- // It's a real directory — check if SKILL.md exists and is current
- const existingSkill = path.join(skillTarget, 'SKILL.md');
- if (fs.existsSync(existingSkill)) {
- console.log(' ✅ Already registered as OpenClaw skill');
- results.push({ name: 'OpenClaw skill', ok: true, detail: 'Registered (directory)' });
- return;
- }
- }
- } catch {
- // Can't read — try to recreate
- }
+ registered++;
+ continue; // Already registered
}
- // Create symlink (or copy on Windows if symlink fails)
try {
fs.symlinkSync(clawdCursorRoot, skillTarget, process.platform === 'win32' ? 'junction' : 'dir');
- console.log(' ✅ Registered as OpenClaw skill');
- console.log(` 📂 ${skillTarget} → ${clawdCursorRoot}`);
- results.push({ name: 'OpenClaw skill', ok: true, detail: 'Registered (symlink created)' });
- } catch (symlinkErr) {
- // Symlink failed (permissions) — copy SKILL.md instead
+ // Silent in v0.7.0 — standalone, external skill link is optional
+ results.push({ name: `${name} skill`, ok: true, detail: 'Registered' });
+ registered++;
+ } catch {
try {
fs.mkdirSync(skillTarget, { recursive: true });
fs.copyFileSync(
path.join(clawdCursorRoot, 'SKILL.md'),
path.join(skillTarget, 'SKILL.md')
);
- console.log(' ✅ Registered as OpenClaw skill (copied SKILL.md)');
- results.push({ name: 'OpenClaw skill', ok: true, detail: 'Registered (SKILL.md copied)' });
- } catch (copyErr) {
- console.log(` ❌ Failed to register: ${copyErr}`);
- results.push({ name: 'OpenClaw skill', ok: false, detail: String(copyErr) });
- }
+ results.push({ name: `${name} skill`, ok: true, detail: 'Registered (SKILL.md copied)' });
+ registered++;
+ } catch { /* non-critical */ }
}
- } catch (err) {
- console.log(` ⚠️ ${err}`);
+ }
+
+ if (registered === 0) {
+ // No external platforms found — that's fine, clawdcursor works standalone
}
}
@@ -1180,9 +1203,9 @@ async function checkForUpdates(results: DiagResult[]): Promise<void> {
const timeout = setTimeout(() => controller.abort(), 5000);
const res = await fetch(
- 'https://api.github.com/repos/AmrDab/clawd-cursor/releases/latest',
+ 'https://api.github.com/repos/AmrDab/clawdcursor/releases/latest',
{
- headers: { 'Accept': 'application/vnd.github.v3+json', 'User-Agent': 'clawd-cursor-doctor' },
+ headers: { 'Accept': 'application/vnd.github.v3+json', 'User-Agent': 'clawdcursor-doctor' },
signal: controller.signal,
},
);
@@ -1244,19 +1267,36 @@ function compareVersions(a: string, b: string): number {
}
/**
- * Test if a model is responding.
+ * Test if a model is responding AND can follow instructions.
+ * Text models: "Reply with exactly: CLAWD_OK" → verify response contains CLAWD_OK.
+ * Vision models: send a tiny solid-green test image → verify non-empty meaningful response.
*/
async function testModel(
provider: ProviderProfile,
apiKey: string,
model: string,
- _isVision: boolean,
+ isVision: boolean,
+): Promise<{ ok: boolean; latencyMs?: number; error?: string }> {
+ if (isVision) {
+ return testVisionModel(provider, apiKey, model);
+ }
+ return testTextModel(provider, apiKey, model);
+}
+
+/** Text model: verify instruction-following, not just connectivity */
+async function testTextModel(
+ provider: ProviderProfile,
+ apiKey: string,
+ model: string,
): Promise<{ ok: boolean; latencyMs?: number; error?: string }> {
const start = performance.now();
+ const TIMEOUT = 8000;
+ const INSTRUCTION = 'Reply with exactly one word: CLAWD_OK — nothing else.';
try {
+ let text = '';
+
if (provider.openaiCompat) {
- // OpenAI-compatible API (OpenAI, Ollama, Kimi)
const response = await fetch(`${provider.baseUrl}/chat/completions`, {
method: 'POST',
headers: {
@@ -1266,22 +1306,16 @@ async function testModel(
body: JSON.stringify({
model,
max_tokens: 10,
- messages: [{ role: 'user', content: 'Reply OK' }],
+ temperature: 0,
+ messages: [{ role: 'user', content: INSTRUCTION }],
}),
- signal: AbortSignal.timeout(15000),
+ signal: AbortSignal.timeout(TIMEOUT),
});
-
const data = await response.json() as any;
if (data.error) {
- const msg = typeof data.error === 'object' && data.error !== null
- ? (data.error.message || JSON.stringify(data.error))
- : String(data.error);
- return { ok: false, error: msg };
+ return { ok: false, error: extractErrorMessage(data.error) };
}
- const text = data.choices?.[0]?.message?.content || '';
- if (!text) return { ok: false, error: 'Empty response' };
-
- return { ok: true, latencyMs: Math.round(performance.now() - start) };
+ text = data.choices?.[0]?.message?.content || '';
} else {
// Anthropic API
const response = await fetch(`${provider.baseUrl}/messages`, {
@@ -1294,39 +1328,144 @@ async function testModel(
body: JSON.stringify({
model,
max_tokens: 10,
- messages: [{ role: 'user', content: 'Reply OK' }],
+ messages: [{ role: 'user', content: INSTRUCTION }],
}),
- signal: AbortSignal.timeout(15000),
+ signal: AbortSignal.timeout(TIMEOUT),
});
-
const data = await response.json() as any;
if (data.type === 'error' && data.error) {
- const err = data.error;
- const msg = typeof err === 'object' && err !== null
- ? (err.message || JSON.stringify(err))
- : String(err);
- const hint = (err.type === 'not_found_error' || err.type === 'invalid_request_error')
+ const hint = (data.error.type === 'not_found_error' || data.error.type === 'invalid_request_error')
? ' — check model id matches your provider'
: '';
- return { ok: false, error: msg + hint };
+ return { ok: false, error: extractErrorMessage(data.error) + hint };
}
if (data.error) {
- const msg = typeof data.error === 'object' && data.error !== null
- ? (data.error.message || JSON.stringify(data.error))
- : String(data.error);
- return { ok: false, error: msg };
+ return { ok: false, error: extractErrorMessage(data.error) };
}
+ text = data.content?.[0]?.text || '';
+ }
+
+ if (!text) return { ok: false, error: 'Empty response' };
- return { ok: true, latencyMs: Math.round(performance.now() - start) };
+ // Verify instruction-following
+ if (!text.includes('CLAWD_OK')) {
+ return { ok: false, error: `Model responded but didn't follow instructions (got: "${text.substring(0, 50)}")` };
}
+
+ return { ok: true, latencyMs: Math.round(performance.now() - start) };
} catch (err: any) {
if (err.name === 'TimeoutError' || err.name === 'AbortError') {
- return { ok: false, error: 'Timeout (15s)' };
+ return { ok: false, error: `Timeout (${TIMEOUT / 1000}s)` };
}
return { ok: false, error: err.message || String(err) };
}
}
+/** Vision model: send a real image and verify the model can process it */
+async function testVisionModel(
+ provider: ProviderProfile,
+ apiKey: string,
+ model: string,
+): Promise<{ ok: boolean; latencyMs?: number; error?: string }> {
+ const start = performance.now();
+ const TIMEOUT = 10000; // vision needs slightly more time
+
+ // 8x8 solid green PNG (tiny base64 payload) — an unambiguously green test image
+ const GREEN_PIXEL_PNG = 'iVBORw0KGgoAAAANSUhEUgAAAAgAAAAICAIAAABLbSncAAAADklEQVQI12Ng+M9AEwYAGJgBgV6GPOYAAAAASUVORK5CYII=';
+
+ try {
+ let text = '';
+
+ if (provider.openaiCompat) {
+ const response = await fetch(`${provider.baseUrl}/chat/completions`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ ...provider.authHeader(apiKey),
+ },
+ body: JSON.stringify({
+ model,
+ max_tokens: 20,
+ messages: [{
+ role: 'user',
+ content: [
+ { type: 'image_url', image_url: { url: `data:image/png;base64,${GREEN_PIXEL_PNG}` } },
+ { type: 'text', text: 'What color is this image? Reply with one word.' },
+ ],
+ }],
+ }),
+ signal: AbortSignal.timeout(TIMEOUT),
+ });
+ const data = await response.json() as any;
+ if (data.error) {
+ return { ok: false, error: extractErrorMessage(data.error) };
+ }
+ text = data.choices?.[0]?.message?.content || '';
+ } else {
+ // Anthropic API — uses content blocks with type: image
+ const response = await fetch(`${provider.baseUrl}/messages`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ ...provider.authHeader(apiKey),
+ ...provider.extraHeaders,
+ },
+ body: JSON.stringify({
+ model,
+ max_tokens: 20,
+ messages: [{
+ role: 'user',
+ content: [
+ { type: 'image', source: { type: 'base64', media_type: 'image/png', data: GREEN_PIXEL_PNG } },
+ { type: 'text', text: 'What color is this image? Reply with one word.' },
+ ],
+ }],
+ }),
+ signal: AbortSignal.timeout(TIMEOUT),
+ });
+ const data = await response.json() as any;
+ if (data.type === 'error' && data.error) {
+ return { ok: false, error: extractErrorMessage(data.error) };
+ }
+ if (data.error) {
+ return { ok: false, error: extractErrorMessage(data.error) };
+ }
+ text = data.content?.[0]?.text || '';
+ }
+
+ if (!text) return { ok: false, error: 'Empty response — model may not support vision' };
+
+ // Any non-empty response proves the model accepted the image.
+ // We deliberately do NOT require the word "green" in the reply:
+ // some models describe the color differently ("lime", "a solid
+ // colour block", etc.), so requiring it would cause false
+ // negatives. Image acceptance is the capability being tested here.
+ return {
+ ok: true,
+ latencyMs: Math.round(performance.now() - start),
+ };
+ } catch (err: any) {
+ if (err.name === 'TimeoutError' || err.name === 'AbortError') {
+ return { ok: false, error: `Timeout (${TIMEOUT / 1000}s)` };
+ }
+ // Common error: model doesn't support multimodal input
+ const msg = err.message || String(err);
+ if (msg.includes('image') || msg.includes('multimodal') || msg.includes('vision')) {
+ return { ok: false, error: `Model does not support vision input: ${msg}` };
+ }
+ return { ok: false, error: msg };
+ }
+}
+
+/** Extract a human-readable error message from an API error response */
+function extractErrorMessage(error: unknown): string {
+ if (typeof error === 'string') return error;
+ if (typeof error === 'object' && error !== null) {
+ return (error as any).message || JSON.stringify(error);
+ }
+ return String(error);
+}
+
/**
* Load saved pipeline config from disk.
*/
diff --git a/src/format.ts b/src/format.ts
new file mode 100644
index 0000000..70689ba
--- /dev/null
+++ b/src/format.ts
@@ -0,0 +1,29 @@
+/**
+ * Terminal formatting utilities.
+ *
+ * Emoji gate: Windows terminals not in UTF-8 mode render emoji as garbled characters.
+ * Detects capable terminals and provides ASCII fallbacks.
+ */
+
+/** Whether the current terminal can render emoji safely */
+export const canEmoji = (() => {
+ if (process.env.NO_COLOR) return false;
+ if (process.platform !== 'win32') return true;
+ // Windows Terminal sets WT_SESSION
+ if (process.env.WT_SESSION) return true;
+ // VS Code integrated terminal
+ if (process.env.TERM_PROGRAM === 'vscode') return true;
+ // Check if console codepage is UTF-8
+ try {
+ const { execSync } = require('child_process');
+ const cp = execSync('chcp', { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] });
+ return cp.includes('65001');
+ } catch {
+ return false;
+ }
+})();
+
+/** Emoji with ASCII fallback for non-capable terminals */
+export function e(emoji: string, fallback: string): string {
+ return canEmoji ? emoji : fallback;
+}
diff --git a/src/generic-computer-use.ts b/src/generic-computer-use.ts
new file mode 100644
index 0000000..6a15ba4
--- /dev/null
+++ b/src/generic-computer-use.ts
@@ -0,0 +1,505 @@
+/**
+ * Generic Computer Use — Universal L3 Vision Fallback
+ *
+ * Implements the same screenshot → action → screenshot loop as the
+ * Anthropic Computer Use adapter, but using OpenAI function-calling
+ * format so it works with ANY vision-capable provider:
+ * OpenAI (gpt-4o, gpt-4o-mini)
+ * Google Gemini (via OpenAI-compat endpoint)
+ * Groq (llama-3.2-90b-vision-preview)
+ * Together AI, DeepSeek, Ollama, and any OpenAI-compat provider
+ *
+ * The LLM is given one tool: `desktop_action` with a discriminated
+ * union of action types. It calls the tool with a structured action,
+ * we execute it, take another screenshot, and repeat.
+ *
+ * When the LLM returns { action: "done" } the loop ends.
+ */
+
+import os from 'os';
+import { NativeDesktop } from './native-desktop';
+import { AccessibilityBridge } from './accessibility';
+import { SafetyLayer } from './safety';
+import { SafetyTier } from './types';
+import { normalizeKeyCombo } from './keys';
+import type { ClawdConfig, StepResult } from './types';
+import type { PipelineConfig } from './providers';
+import type { TaskLogger } from './task-logger';
+import type { TaskVerifier } from './verifiers';
+
+const MAX_ITERATIONS = 25;
+const IS_MAC = os.platform() === 'darwin';
+const LLM_TARGET_WIDTH = 1280;
+
+// ── OpenAI function-calling tool definition ───────────────────────────────────
+
+const DESKTOP_ACTION_TOOL = {
+ type: 'function' as const,
+ function: {
+ name: 'desktop_action',
+ description: 'Execute a desktop action (click, type, key press, scroll, screenshot, or done). Call this tool to interact with the desktop.',
+ parameters: {
+ type: 'object',
+ properties: {
+ action: {
+ type: 'string',
+ enum: ['screenshot', 'click', 'double_click', 'right_click', 'type', 'key', 'scroll', 'move', 'drag', 'done', 'wait'],
+ description: 'The action to perform.',
+ },
+ x: { type: 'number', description: 'X coordinate in screenshot space (0 to image width). Required for click/double_click/right_click/move/scroll/drag.' },
+ y: { type: 'number', description: 'Y coordinate in screenshot space (0 to image height). Required for click/double_click/right_click/move/scroll/drag.' },
+ end_x: { type: 'number', description: 'End X coordinate for drag action.' },
+ end_y: { type: 'number', description: 'End Y coordinate for drag action.' },
+ text: { type: 'string', description: 'Text to type. Required for action=type.' },
+ key: { type: 'string', description: 'Key or combo to press (e.g. "Return", "ctrl+c", "alt+F4"). Required for action=key.' },
+ direction: { type: 'string', enum: ['up', 'down', 'left', 'right'], description: 'Scroll direction. Default: down.' },
+ amount: { type: 'number', description: 'Scroll amount in ticks (1-10). Default: 3.' },
+ reason: { type: 'string', description: 'For action=done: explain what was accomplished. For action=wait: why waiting.' },
+ wait_ms: { type: 'number', description: 'For action=wait: milliseconds to wait (100-5000).' },
+ },
+ required: ['action'],
+ },
+ },
+};
+
+// ── System prompt ──────────────────────────────────────────────────────────────
+
+const SYSTEM_PROMPT = `You are a desktop automation agent. You control a computer by calling the desktop_action tool.
+
+WORKFLOW:
+1. Call desktop_action with action="screenshot" to see the current screen.
+2. Based on what you see, call desktop_action with the appropriate action.
+3. After each action that changes screen state (click, type, key), take another screenshot to verify.
+4. When the task is complete, call desktop_action with action="done" and explain what you accomplished.
+
+RULES:
+- ALWAYS start with action="screenshot" to orient yourself.
+- Take a screenshot after: opening apps, clicking buttons, typing text, navigating.
+- Batch predictable actions WITHOUT screenshots in between (e.g. type then press Return).
+- Coordinates are in screenshot-image space — I will scale them to the real screen for you.
+- Use keyboard shortcuts when faster: ${IS_MAC ? 'Cmd+C copy, Cmd+V paste, Cmd+Space Spotlight, Cmd+Tab switch apps' : 'Ctrl+C copy, Ctrl+V paste, Win search, Alt+Tab switch apps'}.
+- To open an app: press ${IS_MAC ? 'Cmd+Space' : 'the Windows key'}, type the app name, press Return.
+- If an action doesn't work, try a different approach — don't repeat the exact same action.
+- If the screen hasn't changed after an action, wait briefly and try again or use a different method.
+- NEVER call done unless you can see clear evidence the task is complete in the screenshot.
+
+PLATFORM: ${IS_MAC ? 'macOS' : process.platform === 'win32' ? 'Windows' : 'Linux'}`;
+
+// ── Result types ──────────────────────────────────────────────────────────────
+
+export interface GenericComputerUseResult {
+ success: boolean;
+ steps: StepResult[];
+ llmCalls: number;
+}
+
+// ── Provider capability check ─────────────────────────────────────────────────
+
+/**
+ * Returns true if the given provider/model can be used for generic Computer Use.
+ * Requires: OpenAI-compat API + a vision-capable model.
+ */
+export function isGenericComputerUseSupported(
+ config: ClawdConfig,
+ pipelineConfig?: PipelineConfig | null,
+): boolean {
+ // Anthropic has its own native CU — don't use generic for it
+ if (config.ai.provider === 'anthropic' && !config.ai.visionBaseUrl) return false;
+
+ // Need a vision model
+ const visionModel = pipelineConfig?.layer3?.model || config.ai.visionModel;
+ if (!visionModel) return false;
+
+ // Need an API key
+ const visionKey = pipelineConfig?.layer3?.apiKey || config.ai.visionApiKey || config.ai.apiKey;
+ if (!visionKey) return false;
+
+ return true;
+}
+
+// ── Main class ────────────────────────────────────────────────────────────────
+
+export class GenericComputerUse {
+ private llmWidth = LLM_TARGET_WIDTH;
+ private llmHeight = 720;
+ private verifier: TaskVerifier | null = null;
+
+ constructor(
+ private config: ClawdConfig,
+ private desktop: NativeDesktop,
+ private a11y: AccessibilityBridge,
+ private safety: SafetyLayer,
+ private pipelineConfig?: PipelineConfig | null,
+ ) {
+ const size = this.desktop.getScreenSize();
+ const scale = size.width > LLM_TARGET_WIDTH ? size.width / LLM_TARGET_WIDTH : 1;
+ this.llmWidth = LLM_TARGET_WIDTH;
+ this.llmHeight = Math.round(size.height / scale);
+ }
+
+ setVerifier(v: TaskVerifier) {
+ this.verifier = v;
+ }
+
+ /**
+ * Execute a subtask using the generic vision loop.
+ */
+ async executeSubtask(
+ subtask: string,
+ debugDir: string | null,
+ subtaskIndex: number,
+ priorSteps?: string[],
+ logger?: TaskLogger,
+ ): Promise<GenericComputerUseResult> {
+ const steps: StepResult[] = [];
+ let llmCalls = 0;
+
+ // Build initial context message
+ let userMessage = subtask;
+ if (priorSteps && priorSteps.length > 0) {
+ userMessage =
+ `CONTEXT — These steps were already completed:\n` +
+ priorSteps.map((s, i) => `${i + 1}. ${s}`).join('\n') +
+ `\n\nThe relevant app may already be open. Start with a screenshot to orient yourself.\n\nYOUR TASK: ${subtask}`;
+ }
+
+ const messages: any[] = [{ role: 'user', content: userMessage }];
+ const actionHistory: string[] = [];
+ let consecutiveScreenshots = 0;
+
+ console.log(` 🌐 Generic L3: "${subtask.substring(0, 80)}${subtask.length > 80 ? '...' : ''}"`);
+
+ for (let i = 0; i < MAX_ITERATIONS; i++) {
+ if ((this.config as any)._aborted) break;
+
+ // ── Call the vision LLM ───────────────────────────────────────────────
+ llmCalls++;
+ const response = await this.callVisionLLM(messages);
+
+ if (response.error) {
+ console.warn(` ⚠️ Generic CU API error: ${response.error}`);
+ steps.push({ action: 'error', description: `Vision LLM error: ${response.error}`, success: false, timestamp: Date.now() });
+ break;
+ }
+
+ // Parse tool call from response
+ const toolCall = this.extractToolCall(response);
+ if (!toolCall) {
+ // No tool call — LLM returned prose. Add it to context and ask again.
+ const text = this.extractText(response);
+ if (text) {
+ messages.push({ role: 'assistant', content: text });
+ messages.push({ role: 'user', content: 'Please use the desktop_action tool to perform the next action.' });
+ } else {
+ steps.push({ action: 'error', description: 'LLM returned no tool call and no text', success: false, timestamp: Date.now() });
+ break;
+ }
+ continue;
+ }
+
+ const { action, args } = toolCall;
+
+ // ── Handle tool call ─────────────────────────────────────────────────
+ if (action === 'done') {
+ const reason = args.reason || 'Task complete';
+ console.log(` ✅ Generic L3 done: ${reason}`);
+
+ // Ground truth verification
+ if (this.verifier) {
+ const vResult = await this.verifier.verify(subtask, () => this.a11y.readClipboard()).catch(() => null);
+ if (vResult && !vResult.pass && steps.length > 0) {
+ console.log(` 🚫 Generic L3 blocked done — verifier: ${vResult.detail}`);
+ // Feed the failure back to the LLM
+ messages.push(this.buildAssistantToolMessage(response, toolCall.rawCall));
+ messages.push(this.buildToolResult(toolCall.id, `VERIFICATION FAILED: ${vResult.detail}. The task is NOT complete. Continue working.`));
+ continue;
+ }
+ }
+
+ steps.push({ action: 'done', description: reason, success: true, timestamp: Date.now() });
+ logger?.logStep({ layer: 3, actionType: 'done', result: 'success', llmReasoning: reason.substring(0, 200) });
+ return { success: true, steps, llmCalls };
+ }
+
+ if (action === 'screenshot') {
+ consecutiveScreenshots++;
+ if (consecutiveScreenshots > 3) {
+ // LLM is stuck in a screenshot loop
+ messages.push(this.buildAssistantToolMessage(response, toolCall.rawCall));
+ messages.push(this.buildToolResult(toolCall.id, 'You have taken multiple screenshots without acting. Based on what you see, take the next action now — do not take another screenshot yet.'));
+ continue;
+ }
+
+ const screenshot = await this.desktop.captureForLLM();
+ const a11yTree = await this.a11y.getScreenContext().catch(() => '') ?? '';
+ const screenshotResult = this.buildScreenshotResult(screenshot, a11yTree);
+
+ messages.push(this.buildAssistantToolMessage(response, toolCall.rawCall));
+ messages.push(this.buildToolResult(toolCall.id, screenshotResult));
+
+ if (debugDir) {
+ require('fs').promises.writeFile(
+ require('path').join(debugDir, `generic-cu-${subtaskIndex}-${i}.png`),
+ screenshot.buffer,
+ ).catch(() => {});
+ }
+ continue;
+ }
+
+ // Reset consecutive screenshot counter for any real action
+ consecutiveScreenshots = 0;
+
+ if (action === 'wait') {
+ const waitMs = Math.min(Math.max(args.wait_ms ?? 1000, 100), 5000);
+ await this.delay(waitMs);
+ messages.push(this.buildAssistantToolMessage(response, toolCall.rawCall));
+ messages.push(this.buildToolResult(toolCall.id, `Waited ${waitMs}ms.`));
+ continue;
+ }
+
+ // ── Execute desktop action ────────────────────────────────────────────
+ const scaleFactor = await this.getScaleFactor();
+ const actionDesc = this.describeAction(action, args);
+ actionHistory.push(actionDesc);
+
+ // Safety check
+ const safetyAction = this.buildSafetyAction(action, args, scaleFactor);
+ const tier = safetyAction ? this.safety.classify(safetyAction, actionDesc) : SafetyTier.Auto;
+
+ if (tier === SafetyTier.Confirm && this.safety.isBlocked(actionDesc)) {
+ steps.push({ action: 'blocked', description: `Blocked: ${actionDesc}`, success: false, timestamp: Date.now() });
+ messages.push(this.buildAssistantToolMessage(response, toolCall.rawCall));
+ messages.push(this.buildToolResult(toolCall.id, `BLOCKED: This action is not allowed. Choose a different approach.`));
+ continue;
+ }
+
+ // Execute
+ let execResult: string;
+ try {
+ execResult = await this.executeAction(action, args, scaleFactor);
+ steps.push({ action, description: actionDesc, success: true, timestamp: Date.now() });
+ logger?.logStep({ layer: 3, actionType: action, result: 'success' });
+
+ // Brief settle delay after state-changing actions
+ if (['click', 'double_click', 'right_click', 'key'].includes(action)) {
+ await this.delay(200);
+ }
+ } catch (err) {
+ execResult = `Error: ${String(err).substring(0, 150)}`;
+ steps.push({ action, description: `Failed: ${actionDesc} — ${execResult}`, success: false, timestamp: Date.now() });
+ }
+
+ messages.push(this.buildAssistantToolMessage(response, toolCall.rawCall));
+ messages.push(this.buildToolResult(toolCall.id, execResult));
+ }
+
+ // Hit iteration limit
+ console.log(` ⚠️ Generic L3: iteration limit reached`);
+ return { success: false, steps, llmCalls };
+ }
+
+ // ── Vision LLM call ───────────────────────────────────────────────────────
+
+ private async callVisionLLM(messages: any[]): Promise<any> {
+ const controller = new AbortController();
+ const timeout = setTimeout(() => controller.abort(), 120_000);
+
+ try {
+ const visionModel = this.pipelineConfig?.layer3?.model || this.config.ai.visionModel;
+ const visionKey = this.pipelineConfig?.layer3?.apiKey || this.config.ai.visionApiKey || this.config.ai.apiKey;
+ const visionBaseUrl = (
+ this.pipelineConfig?.layer3?.baseUrl ||
+ this.config.ai.visionBaseUrl ||
+ this.config.ai.baseUrl ||
+ 'https://api.openai.com/v1'
+ ).replace(/\/+$/, '');
+
+ const response = await fetch(`${visionBaseUrl}/chat/completions`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ 'Authorization': `Bearer ${visionKey}`,
+ },
+ body: JSON.stringify({
+ model: visionModel,
+ max_tokens: 1024,
+ messages: [
+ { role: 'system', content: SYSTEM_PROMPT },
+ ...messages,
+ ],
+ tools: [DESKTOP_ACTION_TOOL],
+ tool_choice: 'required', // always call the tool — no prose-only responses
+ }),
+ signal: controller.signal,
+ });
+
+ clearTimeout(timeout);
+ const data = await response.json() as any;
+
+ if (!response.ok) {
+ const msg = data?.error?.message || `HTTP ${response.status}`;
+ return { error: msg };
+ }
+
+ return data;
+ } catch (err) {
+ clearTimeout(timeout);
+ return { error: String(err) };
+ }
+ }
+
+ // ── Message building ──────────────────────────────────────────────────────
+
+ private buildScreenshotResult(
+ screenshot: { buffer: Buffer; llmWidth: number; llmHeight: number; scaleFactor: number },
+ a11yTree: string,
+ ): any {
+ const base64 = screenshot.buffer.toString('base64');
+ const content: any[] = [
+ {
+ type: 'image_url',
+ image_url: { url: `data:image/png;base64,${base64}` },
+ },
+ ];
+ if (a11yTree && a11yTree.trim()) {
+ content.push({
+ type: 'text',
+ text: `Screen: ${screenshot.llmWidth}×${screenshot.llmHeight} (scale ${screenshot.scaleFactor.toFixed(2)}x)\nAccessibility tree:\n${a11yTree.substring(0, 2000)}`,
+ });
+ } else {
+ content.push({
+ type: 'text',
+ text: `Screen: ${screenshot.llmWidth}×${screenshot.llmHeight} (scale ${screenshot.scaleFactor.toFixed(2)}x). Coordinates are in screenshot space.`,
+ });
+ }
+ return content;
+ }
+
+ private buildAssistantToolMessage(response: any, rawCall: any): any {
+ const choice = response?.choices?.[0];
+ return {
+ role: 'assistant',
+ content: choice?.message?.content ?? null,
+ tool_calls: [rawCall],
+ };
+ }
+
+ private buildToolResult(toolCallId: string, result: any): any {
+ return {
+ role: 'tool',
+ tool_call_id: toolCallId,
+ content: typeof result === 'string' ? result : JSON.stringify(result),
+ };
+ }
+
+ // ── Tool call extraction ──────────────────────────────────────────────────
+
+ private extractToolCall(response: any): { id: string; action: string; args: any; rawCall: any } | null {
+ const choice = response?.choices?.[0];
+ const toolCalls = choice?.message?.tool_calls;
+ if (!toolCalls || toolCalls.length === 0) return null;
+
+ const call = toolCalls[0];
+ if (call.function?.name !== 'desktop_action') return null;
+
+ try {
+ const args = JSON.parse(call.function.arguments || '{}');
+ return { id: call.id, action: args.action, args, rawCall: call };
+ } catch {
+ return null;
+ }
+ }
+
+ private extractText(response: any): string | null {
+ return response?.choices?.[0]?.message?.content || null;
+ }
+
+ // ── Action execution ──────────────────────────────────────────────────────
+
+ private async getScaleFactor(): Promise<number> {
+ const size = this.desktop.getScreenSize();
+ return size.width > LLM_TARGET_WIDTH ? size.width / LLM_TARGET_WIDTH : 1;
+ }
+
+ private scaleCoord(val: number, scale: number): number {
+ return Math.round(val * scale);
+ }
+
+ private async executeAction(action: string, args: any, scale: number): Promise<string> {
+ const x = args.x != null ? this.scaleCoord(args.x, scale) : 0;
+ const y = args.y != null ? this.scaleCoord(args.y, scale) : 0;
+
+ switch (action) {
+ case 'click':
+ await this.desktop.executeMouseAction({ kind: 'click', x, y });
+ return `Clicked at (${args.x}, ${args.y})`;
+
+ case 'double_click':
+ await this.desktop.executeMouseAction({ kind: 'double_click', x, y });
+ return `Double-clicked at (${args.x}, ${args.y})`;
+
+ case 'right_click':
+ await this.desktop.executeMouseAction({ kind: 'right_click', x, y });
+ return `Right-clicked at (${args.x}, ${args.y})`;
+
+ case 'move':
+ await this.desktop.executeMouseAction({ kind: 'move', x, y });
+ return `Moved mouse to (${args.x}, ${args.y})`;
+
+ case 'scroll': {
+ const direction = args.direction ?? 'down';
+ const amount = Math.min(Math.max(args.amount ?? 3, 1), 10);
+ const scrollDelta = direction === 'up' || direction === 'left' ? -amount : amount;
+ await this.desktop.executeMouseAction({ kind: 'scroll', x, y, scrollDelta });
+ return `Scrolled ${direction} ${amount} ticks at (${args.x}, ${args.y})`;
+ }
+
+ case 'drag': {
+ const ex = args.end_x != null ? this.scaleCoord(args.end_x, scale) : x;
+ const ey = args.end_y != null ? this.scaleCoord(args.end_y, scale) : y;
+ await this.desktop.executeMouseAction({ kind: 'drag', x, y, endX: ex, endY: ey });
+ return `Dragged from (${args.x}, ${args.y}) to (${args.end_x}, ${args.end_y})`;
+ }
+
+ case 'type':
+ await this.desktop.executeKeyboardAction({ kind: 'type', text: args.text ?? '' });
+ return `Typed "${(args.text ?? '').substring(0, 60)}${(args.text ?? '').length > 60 ? '...' : ''}"`;
+
+ case 'key': {
+ const combo = normalizeKeyCombo(args.key ?? '');
+ await this.desktop.executeKeyboardAction({ kind: 'key_press', key: combo });
+ return `Pressed key "${args.key}"`;
+ }
+
+ default:
+ return `Unknown action: ${action}`;
+ }
+ }
+
+ private buildSafetyAction(action: string, args: any, scale: number): any | null {
+ const x = args.x != null ? this.scaleCoord(args.x, scale) : 0;
+ const y = args.y != null ? this.scaleCoord(args.y, scale) : 0;
+ if (action === 'type') return { kind: 'type', text: args.text ?? '' };
+ if (action === 'click') return { kind: 'click', x, y };
+ if (action === 'key') return { kind: 'key_press', key: args.key ?? '' };
+ return null;
+ }
+
+ private describeAction(action: string, args: any): string {
+ switch (action) {
+ case 'click': return `Click at (${args.x}, ${args.y})`;
+ case 'double_click': return `Double-click at (${args.x}, ${args.y})`;
+ case 'right_click': return `Right-click at (${args.x}, ${args.y})`;
+ case 'type': return `Type "${(args.text ?? '').substring(0, 60)}"`;
+ case 'key': return `Key "${args.key}"`;
+ case 'scroll': return `Scroll ${args.direction ?? 'down'} at (${args.x}, ${args.y})`;
+ case 'drag': return `Drag (${args.x},${args.y})→(${args.end_x},${args.end_y})`;
+ case 'wait': return `Wait ${args.wait_ms ?? 1000}ms`;
+ default: return action;
+ }
+ }
+
+ private delay(ms: number): Promise<void> {
+ return new Promise(resolve => setTimeout(resolve, ms));
+ }
+}
diff --git a/src/index.ts b/src/index.ts
index e6ca3ea..e640419 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -13,12 +13,88 @@ import { DEFAULT_CONFIG } from './types';
import type { ClawdConfig } from './types';
import { VERSION } from './version';
import dotenv from 'dotenv';
-import { resolveApiConfig } from './openclaw-credentials';
+import { resolveApiConfig } from './credentials';
import * as fs from 'fs';
import * as path from 'path';
+import { migrateFromLegacyDir } from './paths';
dotenv.config();
+// Migrate data from legacy ~/.openclaw/clawdcursor/ to ~/.clawdcursor/
+migrateFromLegacyDir();
+
+// ── Auth helper ──────────────────────────────────────────────────────────────
+// Reads the saved Bearer token from ~/.clawdcursor/token (written by start/serve).
+function loadAuthToken(): string {
+ try {
+ const tokenPath = path.join(require('os').homedir(), '.clawdcursor', 'token');
+ return fs.readFileSync(tokenPath, 'utf-8').trim();
+ } catch {
+ return '';
+ }
+}
+function authHeaders(): Record<string, string> {
+ const token = loadAuthToken();
+ return token ? { 'Authorization': `Bearer ${token}` } : {};
+}
+
+// ── Emoji gate (shared utility) ──────────────────────────────────────────────
+import { e } from './format';
+
+// ── Single-instance pidfile lock ─────────────────────────────────────────────
+// Prevents duplicate start/mcp/serve processes from accumulating (a common
+// source of stale processes when Cursor/editors restart the MCP server).
+
+const PID_DIR = path.join(require('os').homedir(), '.clawdcursor');
+
+function pidFilePath(mode: 'start' | 'mcp' | 'serve'): string {
+ return path.join(PID_DIR, `${mode}.pid`);
+}
+
+function isProcessAlive(pid: number): boolean {
+ try {
+ // Signal 0 checks existence without sending a real signal.
+ process.kill(pid, 0);
+ return true;
+ } catch {
+ return false;
+ }
+}
+
+/**
+ * Check if another instance is already running for this mode.
+ * Returns the stale pid if a live duplicate is found, otherwise null.
+ * Writes the current pid to the lockfile on success.
+ */
+function claimPidFile(mode: 'start' | 'mcp' | 'serve'): number | null {
+ try {
+ if (!fs.existsSync(PID_DIR)) fs.mkdirSync(PID_DIR, { recursive: true });
+ const pidFile = pidFilePath(mode);
+ if (fs.existsSync(pidFile)) {
+ const existing = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
+ if (!isNaN(existing) && existing !== process.pid && isProcessAlive(existing)) {
+ return existing; // live duplicate found
+ }
+ }
+ fs.writeFileSync(pidFile, String(process.pid), { encoding: 'utf-8', mode: 0o600 });
+ return null;
+ } catch {
+ return null; // non-fatal — lock is best-effort
+ }
+}
+
+function releasePidFile(mode: 'start' | 'mcp' | 'serve'): void {
+ try {
+ const pidFile = pidFilePath(mode);
+ if (fs.existsSync(pidFile)) {
+ const stored = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
+ if (stored === process.pid) fs.unlinkSync(pidFile);
+ }
+ } catch {
+ // non-fatal
+ }
+}
+
const program = new Command();
async function isClawdInstance(port: number): Promise<boolean> {
@@ -50,7 +126,7 @@ async function forceKillPort(port: number): Promise<boolean> {
if (pids.size === 0) return false;
for (const pid of pids) {
execSync(`taskkill /F /PID ${pid}`);
- console.log(`🐾 Killed process ${pid}`);
+ console.log(`${e('🐾', '>')} Killed process ${pid}`);
}
return true;
} catch {
@@ -67,7 +143,7 @@ async function forceKillPort(port: number): Promise {
}
program
- .name('clawd-cursor')
+ .name('clawdcursor')
.description('🐾 AI Desktop Agent — native screen control')
.version(VERSION);
@@ -82,17 +158,35 @@ program
.option('--base-url <url>', 'Custom API base URL (OpenAI-compatible)')
.option('--api-key <key>', 'AI provider API key')
.option('--debug', 'Save screenshots to debug/ folder (off by default)')
+ .option('--accept', 'Accept desktop control consent non-interactively and start')
.action(async (opts) => {
+ // Single-instance guard
+ const existingPid = claimPidFile('start');
+ if (existingPid !== null) {
+ console.error(`${e('❌', '[ERR]')} clawdcursor start is already running (pid ${existingPid}). Run \`clawdcursor stop\` first.`);
+ process.exit(1);
+ }
+
+ // Handle consent before anything else
+ const { hasConsent, writeConsentFile, runOnboarding } = await import('./onboarding');
+ if (opts.accept) {
+ writeConsentFile();
+ console.log(' Consent recorded.\n');
+ } else if (!hasConsent()) {
+ const accepted = await runOnboarding('start', parseInt(opts.port, 10) || 3847);
+ if (!accepted) process.exit(1);
+ }
+
// Auto-setup on first run
- const configPath = path.join(__dirname, '..', '.clawd-config.json');
+ const configPath = path.join(__dirname, '..', '.clawdcursor-config.json');
if (!fs.existsSync(configPath)) {
- console.log('🔍 First run — auto-detecting AI providers...\n');
+ console.log(`${e('🔍', '*')} First run — auto-detecting AI providers...\n`);
const { quickSetup } = await import('./doctor');
const pipeline = await quickSetup();
if (pipeline) {
- console.log('✅ Auto-configured! Run `clawdcursor doctor` to customize.\n');
+ console.log(`${e('✅', '[OK]')} Auto-configured! Run \`clawdcursor doctor\` to customize.\n`);
} else {
- console.log('⚠️ No AI providers found. Layer 1 (Action Router) will still work.');
+ console.log(`${e('⚠️', '[WARN]')} No AI providers found. Layer 1 (Action Router) will still work.`);
console.log(' Run `clawdcursor doctor` to set up AI providers.\n');
}
}
@@ -123,17 +217,10 @@ program
debug: opts.debug || false,
};
- console.log(`
-🐾 ╔═══════════════════════════════════════╗
- ║ CLAWD CURSOR v${VERSION} ║
- ║ AI Desktop Agent — Smart Pipeline ║
- ╚═══════════════════════════════════════╝
-`);
+ console.log(`\x1b[32m\u2713\x1b[0m \x1b[1mclawdcursor\x1b[0m \x1b[90mv${VERSION}\x1b[0m \x1b[90m\u2014 desktop control active on ${config.server.host}:${config.server.port}\x1b[0m`);
- if (resolvedApi.source === 'openclaw') {
- console.log('🔗 Using OpenClaw agent credentials for AI provider routing');
- console.log(` Text: ${resolvedApi.textModel || 'auto'} via ${resolvedApi.textBaseUrl || 'default'}`);
- console.log(` Vision: ${resolvedApi.visionModel || 'auto'} via ${resolvedApi.visionBaseUrl || 'default'}`);
+ if (resolvedApi.source === 'external') {
+ console.log(`${e('🔗', '--')} External credentials detected — pipeline config (.clawdcursor-config.json) takes priority`);
}
const agent = new Agent(config);
@@ -141,27 +228,61 @@ program
try {
await agent.connect();
} catch (err) {
- console.error(`\n❌ Failed to initialize native desktop control: ${err}`);
+ console.error(`\n${e('❌', '[ERR]')} Failed to initialize native desktop control: ${err}`);
console.error(`\nThis usually means @nut-tree-fork/nut-js couldn't access the screen.`);
console.error(`Make sure you're running this on a desktop with a display.`);
process.exit(1);
}
- // Start API server
+ // Start API server (agent API + tool API on same port)
const app = createServer(agent, config);
- app.listen(config.server.port, config.server.host, () => {
- console.log(`\n🌐 API server: http://${config.server.host}:${config.server.port}`);
- console.log(`\nEndpoints:`);
+
+ // Mount model-agnostic tool server alongside agent API
+ // POST /execute/* requires auth; GET /tools and GET /docs are public
+ try {
+ const { createToolServer } = await import('./tool-server');
+ const { requireAuth } = await import('./server');
+ const toolCtx = {
+ desktop: agent.getDesktop(),
+ a11y: (agent as any).a11y,
+ cdp: (agent as any).cdpDriver,
+ getMouseScaleFactor: () => 1, // start command uses agent's own scaling
+ getScreenshotScaleFactor: () => agent.getDesktop().getScaleFactor(),
+ ensureInitialized: async () => {}, // agent already initialized
+ };
+ app.use('/execute', requireAuth); // auth gate on all tool execution
+ app.use(createToolServer(toolCtx));
+ } catch (err) {
+ console.warn('Tool server not loaded:', (err as Error).message);
+ }
+
+ app.listen(config.server.port, config.server.host, async () => {
+ const { SERVER_TOKEN } = await import('./server');
+ const tokenPath = require('path').join(require('os').homedir(), '.clawdcursor', 'token');
+ console.log(`\n\x1b[32m${e('🌐', '[NET]')} API server:\x1b[0m http://${config.server.host}:${config.server.port}`);
+ console.log(`\x1b[33m${e('🔑', '[KEY]')} Auth token:\x1b[0m ${SERVER_TOKEN}`);
+ console.log(`\x1b[90m (also saved to ${tokenPath})\x1b[0m`);
+ console.log(`\nAgent endpoints:`);
console.log(` POST /task — {"task": "Open Chrome and go to github.com"}`);
console.log(` GET /status — Agent state`);
- console.log(` POST /confirm — {"approved": true|false}`);
console.log(` POST /abort — Stop current task`);
- console.log(`\nReady. Send a task to get started! 🐾`);
+ console.log(`\nTool server (model-agnostic):`);
+ console.log(` GET /tools — Tool schemas (OpenAI function format)`);
+ console.log(` POST /execute/{name} — Execute any tool`);
+ console.log(` GET /docs — Tool documentation`);
+ console.log(`\nAll mutating endpoints require: \x1b[36mAuthorization: Bearer <token>\x1b[0m`);
+ console.log(`\nReady. ${e('🐾', '')}`);
});
// Graceful shutdown
process.on('SIGINT', () => {
- console.log('\n👋 Shutting down...');
+ console.log(`\n${e('👋', '--')} Shutting down...`);
+ releasePidFile('start');
+ agent.disconnect();
+ process.exit(0);
+ });
+ process.on('SIGTERM', () => {
+ releasePidFile('start');
agent.disconnect();
process.exit(0);
});
@@ -182,15 +303,15 @@ program
});
if (opts.reset) {
- const configPath = path.join(__dirname, '..', '.clawd-config.json');
+ const configPath = path.join(__dirname, '..', '.clawdcursor-config.json');
if (fs.existsSync(configPath)) {
fs.unlinkSync(configPath);
- console.log('🗑️ Cleared saved config — re-detecting from scratch\n');
+ console.log(`${e('🗑️', '[DEL]')} Cleared saved config — re-detecting from scratch\n`);
}
}
// Only use explicit CLI flags for single-provider override.
- // OpenClaw auto-detected credentials should go through multi-provider scan.
+ // Auto-detected external credentials should go through multi-provider scan.
const isExplicit = !!(opts.apiKey || opts.provider);
await runDoctor({
apiKey: isExplicit ? resolvedApi.apiKey : undefined,
@@ -214,23 +335,23 @@ program
}
const isClawd = await isClawdInstance(port);
if (!isClawd) {
- console.log('🐾 No running instance found on port ' + port);
+ console.log(`${e('🐾', '>')} No running instance found on port ` + port);
return;
}
// Abort first so any active task exits quickly before shutdown.
try {
- await fetch(`http://127.0.0.1:${port}/abort`, { method: 'POST', signal: AbortSignal.timeout(2000) });
+ await fetch(`http://127.0.0.1:${port}/abort`, { method: 'POST', headers: authHeaders(), signal: AbortSignal.timeout(2000) });
} catch {
// Best effort only.
}
const url = `http://127.0.0.1:${port}/stop`;
try {
- const res = await fetch(url, { method: 'POST', signal: AbortSignal.timeout(5000) });
+ const res = await fetch(url, { method: 'POST', headers: authHeaders(), signal: AbortSignal.timeout(5000) });
const data = await res.json() as any;
if (data.stopped) {
- console.log('🐾 Clawd Cursor stopped');
+ console.log(`${e('🐾', '>')} Clawd Cursor stopped`);
} else {
console.error('Unexpected response:', JSON.stringify(data));
}
@@ -246,16 +367,16 @@ program
// Still alive — keep waiting
} catch {
// Connection refused = dead = success
- console.log('✅ Server confirmed stopped');
+ console.log(`${e('✅', '[OK]')} Server confirmed stopped`);
return;
}
}
- console.log('⚠️ Graceful stop did not complete — force killing...');
+ console.log(`${e('⚠️', '[WARN]')} Graceful stop did not complete — force killing...`);
const killed = await forceKillPort(port);
if (killed) {
- console.log('🐾 Clawd Cursor force stopped');
+ console.log(`${e('🐾', '>')} Clawd Cursor force stopped`);
} else {
- console.error('❌ Could not force stop process on port ' + port);
+ console.error(`${e('❌', '[ERR]')} Could not force stop process on port ` + port);
}
});
@@ -268,10 +389,10 @@ program
const sendTask = async (taskText: string) => {
try {
- console.log(`\n🐾 Sending: ${taskText}`);
+ console.log(`\n${e('🐾', '>')} Sending: ${taskText}`);
const res = await fetch(url, {
method: 'POST',
- headers: { 'Content-Type': 'application/json' },
+ headers: { 'Content-Type': 'application/json', ...authHeaders() },
body: JSON.stringify({ task: taskText }),
});
const data = await res.json();
@@ -291,6 +412,7 @@ program
const { execFile: spawnExec } = await import('child_process');
const platform = os.platform();
+ const token = loadAuthToken();
const scriptContent = platform === 'win32'
? // Windows: PowerShell script
`
@@ -298,15 +420,21 @@ $host.UI.RawUI.WindowTitle = "Clawd Cursor - Task Console"
Write-Host "Clawd Cursor - Interactive Task Mode" -ForegroundColor Cyan
Write-Host " Type a task and press Enter. Type 'quit' to exit." -ForegroundColor Gray
Write-Host ""
+$headers = @{ "Content-Type" = "application/json"${token ? `; "Authorization" = "Bearer ${token}"` : ''} }
while ($true) {
$task = Read-Host "Enter task"
if (-not $task -or $task -eq "quit" -or $task -eq "exit") {
- Write-Host "👋 Bye!"
+ Write-Host "Bye!"
break
}
- Write-Host "🐾 Sending: $task" -ForegroundColor Yellow
+ # Strip control characters (Ctrl+L, etc.) that break JSON
+ $task = $task -replace '[\\x00-\\x1f]', ''
+ $task = $task.Trim()
+ if (-not $task) { continue }
+ Write-Host "> Sending: $task" -ForegroundColor Yellow
try {
- $response = Invoke-RestMethod -Uri http://127.0.0.1:${opts.port}/task -Method POST -ContentType "application/json" -Body ('{"task": "' + $task.Replace('"', '\\"') + '"}')
+ $jsonBody = @{ task = $task } | ConvertTo-Json -Compress
+ $response = Invoke-RestMethod -Uri http://127.0.0.1:${opts.port}/task -Method POST -Headers $headers -Body $jsonBody
$response | ConvertTo-Json -Depth 5
} catch {
Write-Host "Failed to connect. Is clawdcursor start running?" -ForegroundColor Red
@@ -316,18 +444,19 @@ while ($true) {
`
: // macOS/Linux: bash script
`
-echo "🐾 Clawd Cursor — Interactive Task Mode"
+echo "Clawd Cursor - Interactive Task Mode"
echo " Type a task and press Enter. Type 'quit' to exit."
echo ""
+AUTH_HEADER="${token ? `Authorization: Bearer ${token}` : ''}"
while true; do
printf "Enter task: "
read task
if [ -z "$task" ] || [ "$task" = "quit" ] || [ "$task" = "exit" ]; then
- echo "👋 Bye!"
+ echo "Bye!"
break
fi
- echo "🐾 Sending: $task"
- curl -s -X POST http://127.0.0.1:${opts.port}/task -H "Content-Type: application/json" -d "{\\"task\\": \\"$task\\"}" | python3 -m json.tool 2>/dev/null || echo "Failed to connect. Is clawdcursor start running?"
+ echo "> Sending: $task"
+ curl -s -X POST http://127.0.0.1:${opts.port}/task -H "Content-Type: application/json"${token ? ' -H "$AUTH_HEADER"' : ''} -d "{\\"task\\": \\"$task\\"}" | python3 -m json.tool 2>/dev/null || echo "Failed to connect. Is clawdcursor start running?"
echo ""
done
`;
@@ -336,7 +465,7 @@ done
// Write temp PS1 and open in new Windows Terminal / PowerShell window
const fs = await import('fs');
const path = await import('path');
- const tmpScript = path.join(os.tmpdir(), `clawd-task-${Date.now()}.ps1`);
+ const tmpScript = path.join(os.tmpdir(), `clawdcursor-task-${Date.now()}.ps1`);
fs.writeFileSync(tmpScript, scriptContent);
spawnExec('powershell.exe', [
'-Command', `Start-Process powershell -ArgumentList '-NoExit','-ExecutionPolicy','Bypass','-File','${tmpScript}'`
@@ -344,19 +473,19 @@ done
} else if (platform === 'darwin') {
const fs = await import('fs');
const path = await import('path');
- const tmpScript = path.join(os.tmpdir(), `clawd-task-${Date.now()}.sh`);
+ const tmpScript = path.join(os.tmpdir(), `clawdcursor-task-${Date.now()}.sh`);
fs.writeFileSync(tmpScript, scriptContent, { mode: 0o755 });
spawnExec('open', ['-a', 'Terminal', tmpScript], { detached: true, stdio: 'ignore' } as any);
} else {
// Linux fallback
const fs = await import('fs');
const path = await import('path');
- const tmpScript = path.join(os.tmpdir(), `clawd-task-${Date.now()}.sh`);
+ const tmpScript = path.join(os.tmpdir(), `clawdcursor-task-${Date.now()}.sh`);
fs.writeFileSync(tmpScript, scriptContent, { mode: 0o755 });
spawnExec('x-terminal-emulator', ['-e', tmpScript], { detached: true, stdio: 'ignore' } as any);
}
- console.log('🐾 Task console opened in a new terminal window.');
+ console.log(`${e('🐾', '>')} Task console opened in a new terminal window.`);
}
});
@@ -366,7 +495,7 @@ program
.option('--port <port>', 'API server port', '3847')
.action(async (opts) => {
const url = `http://127.0.0.1:${opts.port}`;
- console.log('🐾 Opening dashboard... Make sure clawdcursor start is running.');
+ console.log(`${e('🐾', '>')} Opening dashboard... Make sure clawdcursor start is running.`);
const os = await import('os');
const { exec: execCmd } = await import('child_process');
@@ -396,13 +525,13 @@ program
// Verify it's actually a Clawd Cursor instance before killing
const isClawd = await isClawdInstance(port);
if (!isClawd) {
- console.log('🐾 No running instance found on port ' + port);
+ console.log(`${e('🐾', '>')} No running instance found on port ` + port);
return;
}
// Try graceful stop
try {
- await fetch(`http://127.0.0.1:${port}/stop`, { method: 'POST', signal: AbortSignal.timeout(3000) });
+ await fetch(`http://127.0.0.1:${port}/stop`, { method: 'POST', headers: authHeaders(), signal: AbortSignal.timeout(3000) });
} catch {
// May fail if server dies mid-response — that's OK
}
@@ -413,22 +542,22 @@ program
try {
await fetch(`http://127.0.0.1:${port}/health`, { signal: AbortSignal.timeout(1000) });
// Still alive — force kill
- console.log('⚠️ Graceful stop failed — force killing...');
+ console.log(`${e('⚠️', '[WARN]')} Graceful stop failed — force killing...`);
const killed = await forceKillPort(port);
if (killed) {
- console.log('🐾 Clawd Cursor force killed');
+ console.log(`${e('🐾', '>')} Clawd Cursor force killed`);
} else {
console.error('Could not find process to kill');
}
} catch {
// Connection refused = dead = success
- console.log('🐾 Clawd Cursor killed');
+ console.log(`${e('🐾', '>')} Clawd Cursor killed`);
}
});
program
.command('install')
- .description('Register Clawd Cursor as an OpenClaw skill and save config')
+ .description('Set up API key, configure pipeline, and auto-detect providers')
.option('--api-key <key>', 'AI provider API key')
.option('--provider <name>', 'AI provider (auto-detected, or specify: anthropic|openai|ollama|kimi|groq|...)')
.action(async (opts) => {
@@ -436,7 +565,7 @@ program
const path = await import('path');
const os = await import('os');
- console.log('\n🐾 Installing Clawd Cursor...\n');
+ console.log(`\n${e('🐾', '>')} Installing Clawd Cursor...\n`);
const clawdRoot = path.resolve(__dirname, '..');
@@ -445,10 +574,10 @@ program
const envPath = path.join(clawdRoot, '.env');
const envContent = `AI_API_KEY=${opts.apiKey}\n`;
fs.writeFileSync(envPath, envContent);
- console.log(' ✅ API key saved to .env');
+ console.log(` ${e('✅', '[OK]')} API key saved to .env`);
}
- // 2. Run doctor (auto-configures pipeline + registers OpenClaw skill)
+ // 2. Run doctor (auto-configures pipeline)
const { runDoctor } = await import('./doctor');
const resolvedApi = resolveApiConfig({
apiKey: opts.apiKey,
@@ -463,13 +592,18 @@ program
save: true,
});
- console.log('\n🐾 Installation complete! Run: clawdcursor start');
+ console.log(`\n${e('🐾', '>')} Installation complete! Run: clawdcursor start`);
});
program
.command('uninstall')
- .description('Remove all Clawd Cursor config, data, and OpenClaw skill registration')
+ .description('Remove all Clawd Cursor config, data, and skill registrations')
.action(async () => {
+ if (!process.stdin.isTTY || !process.stdout.isTTY) {
+ console.error(`\n${e('❌', '[ERR]')} clawdcursor uninstall requires an interactive terminal.\n`);
+ process.exit(1);
+ }
+
const fs = await import('fs');
const path = await import('path');
const os = await import('os');
@@ -477,7 +611,7 @@ program
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
const answer = await new Promise<string>((resolve) => {
- rl.question('\n⚠️ This will remove all Clawd Cursor config and data. Continue? (y/N) ', resolve);
+ rl.question(`\n${e('⚠️', '[WARN]')} This will remove all Clawd Cursor config and data. Continue? (y/N) `, resolve);
});
rl.close();
@@ -486,20 +620,20 @@ program
return;
}
- console.log('\n🗑️ Uninstalling Clawd Cursor...\n');
+ console.log(`\n${e('🗑️', '[DEL]')} Uninstalling Clawd Cursor...\n`);
const clawdRoot = path.resolve(__dirname, '..');
let removed = 0;
// 1. Remove config files
const configFiles = [
- path.join(clawdRoot, '.clawd-config.json'),
- path.join(clawdRoot, '.clawd-favorites.json'),
+ path.join(clawdRoot, '.clawdcursor-config.json'),
+ path.join(clawdRoot, '.clawdcursor-favorites.json'),
path.join(clawdRoot, '.env'),
];
for (const f of configFiles) {
if (fs.existsSync(f)) {
fs.unlinkSync(f);
- console.log(` 🗑️ Removed ${path.basename(f)}`);
+ console.log(` ${e('🗑️', '[DEL]')} Removed ${path.basename(f)}`);
removed++;
}
}
@@ -508,15 +642,17 @@ program
const debugDir = path.join(clawdRoot, 'debug');
if (fs.existsSync(debugDir)) {
fs.rmSync(debugDir, { recursive: true, force: true });
- console.log(' 🗑️ Removed debug/');
+ console.log(` ${e('🗑️', '[DEL]')} Removed debug/`);
removed++;
}
- // 3. Remove OpenClaw skill registration
+ // 3. Remove external skill registrations (OpenClaw, Codex, etc.)
const homeDir = os.homedir();
const skillPaths = [
path.join(homeDir, '.openclaw', 'workspace', 'skills', 'clawdcursor'),
path.join(homeDir, '.openclaw-dev', 'workspace', 'skills', 'clawdcursor'),
+ path.join(homeDir, '.openclaw', 'skills', 'clawdcursor'),
+ path.join(homeDir, '.codex', 'skills', 'clawdcursor'),
];
for (const sp of skillPaths) {
if (fs.existsSync(sp)) {
@@ -526,7 +662,7 @@ program
} else {
fs.rmSync(sp, { recursive: true, force: true });
}
- console.log(` 🗑️ Removed OpenClaw skill: ${sp}`);
+ console.log(` ${e('🗑️', '[DEL]')} Removed skill registration: ${sp}`);
removed++;
}
}
@@ -535,7 +671,7 @@ program
const distDir = path.join(clawdRoot, 'dist');
if (fs.existsSync(distDir)) {
fs.rmSync(distDir, { recursive: true, force: true });
- console.log(' 🗑️ Removed dist/');
+ console.log(` ${e('🗑️', '[DEL]')} Removed dist/`);
removed++;
}
@@ -543,8 +679,308 @@ program
console.log(' Nothing to clean up.');
}
- console.log(`\n🐾 Uninstalled. To fully remove, delete the clawd-cursor folder:`);
+ console.log(`\n${e('🐾', '>')} Uninstalled. To fully remove, delete the clawdcursor folder:`);
console.log(` ${clawdRoot}\n`);
});
+// ── Shared subsystem initialization (used by mcp + serve) ──
+
+async function createToolContext() {
+ const { NativeDesktop } = await import('./native-desktop');
+ const { AccessibilityBridge } = await import('./accessibility');
+ const { CDPDriver } = await import('./cdp-driver');
+ const { DEFAULT_CONFIG } = await import('./types');
+
+ const desktop = new NativeDesktop({ ...DEFAULT_CONFIG });
+ const a11y = new AccessibilityBridge();
+ const cdp = new CDPDriver(9223);
+
+ let initialized = false;
+ let initPromise: Promise | null = null;
+ let mouseScaleFactor = 1;
+ let screenshotScaleFactor = 1;
+
+ const ensureInitialized = async (): Promise<void> => {
+ if (initialized) return;
+ if (initPromise) return initPromise;
+ initPromise = (async () => {
+ await desktop.connect();
+ screenshotScaleFactor = desktop.getScaleFactor();
+ try {
+ const { execFileSync } = await import('child_process');
+ const result = execFileSync('powershell.exe', [
+ '-NoProfile', '-Command',
+ "Add-Type -AssemblyName System.Windows.Forms; $s=[System.Windows.Forms.Screen]::PrimaryScreen.Bounds; \"$($s.Width),$($s.Height)\"",
+ ], { timeout: 10000, encoding: 'utf-8' }).trim();
+ const [logicalW] = result.split(',').map(Number);
+ if (logicalW > 0) mouseScaleFactor = logicalW / 1280;
+ } catch {
+ mouseScaleFactor = screenshotScaleFactor;
+ }
+ await a11y.warmup();
+ initialized = true;
+ console.log('Subsystems initialized');
+ })();
+ return initPromise;
+ };
+
+ return {
+ desktop, a11y, cdp,
+ getMouseScaleFactor: () => mouseScaleFactor,
+ getScreenshotScaleFactor: () => screenshotScaleFactor,
+ ensureInitialized,
+ };
+}
+
+// ── MCP Mode (for Claude Code, Cursor, Windsurf, Zed, etc.) ──
+
+program
+ .command('mcp')
+ .description('Run as MCP tool server over stdio (for Claude Code, Cursor, Windsurf, Zed)')
+ .action(async () => {
+ // Single-instance guard (MCP servers can accumulate when editors restart them)
+ const existingMcpPid = claimPidFile('mcp');
+ if (existingMcpPid !== null) {
+ process.stderr.write(`[ERROR] clawdcursor mcp is already running (pid ${existingMcpPid}). Kill it first.\n`);
+ process.exit(1);
+ }
+
+ // MCP mode: stdout is protocol, logs go to stderr
+ const stderrWrite = (prefix: string, args: any[]) =>
+ process.stderr.write(`${prefix}${args.map(a => typeof a === 'string' ? a : JSON.stringify(a)).join(' ')}\n`);
+ console.log = (...args: any[]) => stderrWrite('', args);
+ console.warn = (...args: any[]) => stderrWrite('[WARN] ', args);
+ console.error = (...args: any[]) => stderrWrite('[ERROR] ', args);
+
+ // Consent gate — must be accepted before MCP tools become active
+ const { hasConsent } = await import('./onboarding');
+ if (!hasConsent()) {
+ process.stderr.write(
+ `\nERROR: clawdcursor requires one-time consent before use.\n` +
+ `This tool gives AI models full control of your desktop.\n\n` +
+ `Run one of the following, then retry:\n` +
+ ` clawdcursor consent # interactive consent prompt\n` +
+ ` clawdcursor consent --accept # non-interactive (CI/scripts)\n` +
+ ` clawdcursor start # consent + start agent\n\n`
+ );
+ process.exit(1);
+ }
+
+ console.log('clawdcursor MCP mode starting...');
+
+ const { getAllTools } = await import('./tools');
+ const ctx = await createToolContext();
+
+ // Dynamic import MCP SDK (ESM package from CJS)
+ const { McpServer } = await import('@modelcontextprotocol/sdk/server/mcp.js' as any);
+ const { StdioServerTransport } = await import('@modelcontextprotocol/sdk/server/stdio.js' as any);
+ const { z } = await import('zod');
+
+ const server = new McpServer({ name: 'clawdcursor', version: '0.7.0' });
+
+ // Register all tools from the unified registry
+ const tools = getAllTools();
+ for (const tool of tools) {
+ // Convert parameters to Zod schema
+ const zodParams: Record<string, any> = {};
+ for (const [key, def] of Object.entries(tool.parameters)) {
+ let schema: any;
+ if (def.type === 'number') schema = z.coerce.number();
+ else if (def.type === 'boolean') schema = z.coerce.boolean();
+ else schema = z.string();
+ if (def.enum) schema = z.enum(def.enum as [string, ...string[]]);
+ schema = schema.describe(def.description);
+ if (def.required === false) schema = schema.optional();
+ zodParams[key] = schema;
+ }
+
+ const handler = async (params: any) => {
+ const result = await tool.handler(params, ctx);
+ const content: any[] = [];
+ if (result.image) {
+ content.push({ type: 'image', data: result.image.data, mimeType: result.image.mimeType });
+ }
+ content.push({ type: 'text', text: result.text });
+ return { content, isError: result.isError };
+ };
+
+ if (Object.keys(zodParams).length > 0) {
+ server.tool(tool.name, tool.description, zodParams, handler);
+ } else {
+ server.tool(tool.name, tool.description, handler);
+ }
+ }
+
+ const transport = new StdioServerTransport();
+ await server.connect(transport);
+ console.log(`clawdcursor MCP ready — ${tools.length} tools registered`);
+
+ ctx.ensureInitialized().catch((err: any) => {
+ console.error('Subsystem init failed:', err?.message);
+ });
+
+ // Release pidfile on exit so a fresh restart can claim it immediately
+ const releaseMcp = () => { releasePidFile('mcp'); process.exit(0); };
+ process.on('SIGINT', releaseMcp);
+ process.on('SIGTERM', releaseMcp);
+ });
+
+// ── Tool Server (model-agnostic, no LLM needed) ──
+
+program
+ .command('serve')
+ .description('Start the tool server only (no autonomous agent, no LLM). Any AI model can connect via HTTP.')
+ .option('--port <port>', 'HTTP server port', '3847')
+ .option('--skip-consent', 'Skip consent prompt (requires NODE_ENV=development)')
+ .action(async (opts) => {
+ // Single-instance guard
+ const existingServePid = claimPidFile('serve');
+ if (existingServePid !== null) {
+ console.error(`${e('❌', '[ERR]')} clawdcursor serve is already running (pid ${existingServePid}). Run \`clawdcursor stop\` first.`);
+ process.exit(1);
+ }
+
+ const { runOnboarding, hasConsent } = await import('./onboarding');
+
+ // First-run consent — --skip-consent only works in development mode
+ const canSkip = opts.skipConsent && process.env.NODE_ENV === 'development';
+ if (!canSkip && !hasConsent()) {
+ const accepted = await runOnboarding();
+ if (!accepted) process.exit(1);
+ }
+
+ const port = parseInt(opts.port);
+ const express = (await import('express')).default;
+ const { createToolServer } = await import('./tool-server');
+ const { VERSION } = await import('./version');
+ const { randomBytes } = await import('crypto');
+ const os = await import('os');
+
+ // Generate auth token (same pattern as start mode)
+ const tokenDir = path.join(os.homedir(), '.clawdcursor');
+ if (!fs.existsSync(tokenDir)) fs.mkdirSync(tokenDir, { recursive: true });
+ const serveToken = randomBytes(32).toString('hex');
+ fs.writeFileSync(path.join(tokenDir, 'token'), serveToken, { encoding: 'utf-8', mode: 0o600 });
+
+ console.log(`\n${e('🐾', '>')} clawdcursor v${VERSION} — Tool Server mode`);
+ console.log(' No LLM. No autonomous agent. Just OS primitives over HTTP.\n');
+
+ const ctx = await createToolContext();
+
+ // Create HTTP server with tool routes
+ const app = express();
+ app.use(express.json());
+
+ // Auth middleware — require Bearer token on mutating (non-GET) endpoints
+ app.use((req: any, res: any, next: any) => {
+ if (req.method === 'GET') return next(); // GET /tools, /docs, /health are read-only
+ const authHeader = req.headers['authorization'] || '';
+ const bearer = authHeader.startsWith('Bearer ') ? authHeader.slice(7) : '';
+ if (!bearer || bearer !== serveToken) {
+ return res.status(401).json({ error: 'Unauthorized — include Authorization: Bearer <token> header. Token is at ~/.clawdcursor/token' });
+ }
+ next();
+ });
+
+ app.use(createToolServer(ctx));
+
+ app.listen(port, '127.0.0.1', () => {
+ console.log(` Tool server: http://127.0.0.1:${port}`);
+ console.log(` Tool schemas: http://127.0.0.1:${port}/tools`);
+ console.log(` Documentation: http://127.0.0.1:${port}/docs`);
+ console.log(` Execute: POST http://127.0.0.1:${port}/execute/{tool_name}`);
+ console.log(`\n ${e('🔑', '[KEY]')} Auth token: ${serveToken}`);
+ console.log(` (saved to ~/.clawdcursor/token)`);
+ console.log(` All POST endpoints require: Authorization: Bearer <token>`);
+ console.log(`\n Ready. Connect your AI model.\n`);
+ });
+
+ // Background init
+ ctx.ensureInitialized().catch((err: any) => {
+ console.error('Subsystem init failed:', err?.message);
+ });
+
+ process.on('SIGINT', () => {
+ console.log('\n Shutting down...');
+ releasePidFile('serve');
+ process.exit(0);
+ });
+ process.on('SIGTERM', () => {
+ releasePidFile('serve');
+ process.exit(0);
+ });
+ });
+
+program
+ .command('report')
+ .description('Send an error report to help improve clawdcursor. Shows a preview before sending.')
+ .option('--log ', 'Path to a specific task log file')
+ .option('--note ', 'Add a note describing what went wrong')
+ .option('--save-only', 'Save report locally without sending')
+ .action(async (opts) => {
+ const { interactiveReport, buildReport, saveReportLocally, submitReport } = await import('./report');
+
+ if (!process.stdin.isTTY) {
+ // Non-interactive: build and submit directly
+ const report = buildReport(opts.log, opts.note);
+ if (opts.saveOnly) {
+ const p = saveReportLocally(report);
+ console.log(`Report saved: ${p}`);
+ } else {
+ const result = await submitReport(report);
+ if (result.success) {
+ console.log(`Report sent. ID: ${result.reportId}`);
+ } else {
+ const p = saveReportLocally(report);
+ console.log(`Send failed: ${result.error}. Saved locally: ${p}`);
+ }
+ }
+ return;
+ }
+
+ // Interactive mode
+ await interactiveReport();
+ });
+
+// ── Consent management ──────────────────────────────────────────────────────
+program
+ .command('consent')
+ .description('Manage desktop control consent (required before MCP/REST use)')
+ .option('--accept', 'Accept consent non-interactively (CI/scripted environments)')
+ .option('--revoke', 'Remove stored consent')
+ .option('--status', 'Show current consent status')
+ .action(async (opts) => {
+ const { hasConsent, writeConsentFile, revokeConsent, runOnboarding } = await import('./onboarding');
+
+ if (opts.status) {
+ if (hasConsent()) {
+ console.log(`${e('✅', '[OK]')} Consent: accepted — clawdcursor is authorized to control this desktop.`);
+ } else {
+ console.log(`${e('❌', '[ERR]')} Consent: not given — run \`clawdcursor consent\` to authorize.`);
+ }
+ return;
+ }
+
+ if (opts.revoke) {
+ revokeConsent();
+ console.log(' Consent revoked. clawdcursor will require re-authorization before next use.');
+ return;
+ }
+
+ if (opts.accept) {
+ writeConsentFile();
+ console.log(' Consent accepted. clawdcursor can now control your desktop.');
+ console.log(' Run `clawdcursor start` or `clawdcursor mcp` to begin.\n');
+ return;
+ }
+
+ // Interactive flow
+ const accepted = await runOnboarding('consent');
+ if (accepted) {
+ console.log(' Run `clawdcursor start` or `clawdcursor mcp` to begin.\n');
+ } else {
+ process.exit(1);
+ }
+ });
+
program.parse();
diff --git a/src/local-parser.ts b/src/local-parser.ts
index 3b96d6f..df5750c 100644
--- a/src/local-parser.ts
+++ b/src/local-parser.ts
@@ -166,8 +166,9 @@ export class LocalTaskParser {
return `focus ${focusMatch[1].trim()}`;
}
- // 7. Type / Write / Enter [text]
- const typeMatch = normalized.match(/^(?:type|write|enter)\s+(\S.*)$/i);
+ // 7. Type / Enter [text] — literal keyboard input only
+ // "write" excluded: implies creative composition needing LLM
+ const typeMatch = normalized.match(/^(?:type|enter)\s+(\S.*)$/i);
if (typeMatch) {
const text = typeMatch[1].trim();
// Remove surrounding quotes if present
diff --git a/src/native-desktop.ts b/src/native-desktop.ts
index 534f977..393b31d 100644
--- a/src/native-desktop.ts
+++ b/src/native-desktop.ts
@@ -64,11 +64,22 @@ const KEY_MAP: Record = {
// At 2560 screen: 1280 → scale 2x (was 1024 → 2.5x). Icons go from ~12px to ~20px.
const LLM_TARGET_WIDTH = 1280;
+export interface MonitorInfo {
+ index: number;
+ x: number;
+ y: number;
+ width: number;
+ height: number;
+ primary: boolean;
+ name: string;
+}
+
export class NativeDesktop extends EventEmitter {
private config: ClawdConfig;
private screenWidth = 0;
private screenHeight = 0;
private connected = false;
+ private monitors: MonitorInfo[] = [];
/** Scale factor: LLM coordinates × scaleFactor = real screen coordinates */
private scaleFactor = 1;
@@ -78,6 +89,92 @@ export class NativeDesktop extends EventEmitter {
this.config = config;
}
+ /**
+ * Enumerate all connected monitors with their positions and sizes.
+   * Best-effort: falls back to a primary-only entry on errors (and always on macOS for now).
+ */
+ async getMonitors(): Promise {
+ if (this.monitors.length > 0) return this.monitors;
+
+ try {
+ if (process.platform === 'win32') {
+ const { exec } = await import('child_process');
+ const { promisify } = await import('util');
+ const execAsync = promisify(exec);
+ const ps = `Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.Screen]::AllScreens | ForEach-Object { "$($_.Bounds.X),$($_.Bounds.Y),$($_.Bounds.Width),$($_.Bounds.Height),$($_.Primary),$($_.DeviceName)" }`;
+ const { stdout } = await execAsync(`powershell -NoProfile -Command "${ps}"`);
+ const lines = stdout.trim().split('\n').filter(Boolean);
+ this.monitors = lines.map((line, i) => {
+ const [x, y, w, h, primary, name] = line.trim().split(',');
+ return {
+ index: i,
+ x: parseInt(x), y: parseInt(y),
+ width: parseInt(w), height: parseInt(h),
+ primary: primary.trim().toLowerCase() === 'true',
+ name: name?.trim() || `Monitor ${i + 1}`,
+ };
+ });
+ } else if (process.platform === 'darwin') {
+ const { exec } = await import('child_process');
+ const { promisify } = await import('util');
+ const execAsync = promisify(exec);
+ // system_profiler gives display info; for bounds we use osascript
+ const { stdout } = await execAsync(`osascript -e 'tell application "System Events" to get bounds of every desktop'`).catch(() => ({ stdout: '' }));
+ if (stdout.trim()) {
+        // TODO: parse osascript bounds output — currently primary-only is returned either way
+ this.monitors = [{ index: 0, x: 0, y: 0, width: this.screenWidth, height: this.screenHeight, primary: true, name: 'Primary' }];
+ } else {
+ this.monitors = [{ index: 0, x: 0, y: 0, width: this.screenWidth, height: this.screenHeight, primary: true, name: 'Primary' }];
+ }
+ } else {
+ // Linux: use xrandr
+ const { exec } = await import('child_process');
+ const { promisify } = await import('util');
+ const execAsync = promisify(exec);
+ const { stdout } = await execAsync('xrandr --query 2>/dev/null').catch(() => ({ stdout: '' }));
+ const re = /(\S+) connected(?: primary)? (\d+)x(\d+)\+(\d+)\+(\d+)/g;
+ let m; let i = 0; const results: MonitorInfo[] = [];
+ while ((m = re.exec(stdout)) !== null) {
+ results.push({ index: i++, x: parseInt(m[4]), y: parseInt(m[5]), width: parseInt(m[2]), height: parseInt(m[3]), primary: stdout.includes(m[1] + ' connected primary'), name: m[1] });
+ }
+ this.monitors = results.length > 0 ? results : [{ index: 0, x: 0, y: 0, width: this.screenWidth, height: this.screenHeight, primary: true, name: 'Primary' }];
+ }
+ } catch {
+ this.monitors = [{ index: 0, x: 0, y: 0, width: this.screenWidth, height: this.screenHeight, primary: true, name: 'Primary' }];
+ }
+
+ return this.monitors;
+ }
+
+ /**
+ * Capture a specific monitor by index.
+ * Falls back to primary grab if region capture fails.
+ */
+ async captureMonitor(monitorIndex = 0): Promise {
+ const monitors = await this.getMonitors();
+ const mon = monitors[monitorIndex] ?? monitors.find(m => m.primary) ?? monitors[0];
+ if (!mon) return this.captureForLLM();
+
+ try {
+ const { Region } = await import('@nut-tree-fork/nut-js');
+ const region = new Region(mon.x, mon.y, mon.width, mon.height);
+ const img = await screen.grabRegion(region);
+ const scaleFactor = mon.width > LLM_TARGET_WIDTH ? mon.width / LLM_TARGET_WIDTH : 1;
+ const llmW = Math.round(mon.width / scaleFactor);
+ const llmH = Math.round(mon.height / scaleFactor);
+ const processed = await sharp(img.data, { raw: { width: img.width, height: img.height, channels: 4 } })
+ .resize(llmW, llmH)
+ .png()
+ .toBuffer();
+ // Release the raw RGBA buffer immediately after processing
+ (img as any).data = null;
+ return { width: mon.width, height: mon.height, buffer: processed, timestamp: Date.now(), format: 'png', scaleFactor, llmWidth: llmW, llmHeight: llmH };
+ } catch {
+ // Fallback: full primary grab
+ return this.captureForLLM();
+ }
+ }
+
/**
* "Connect" to the native desktop — detects screen size and configures nut-js.
* No actual network connection; just initializes the local screen interface.
@@ -134,6 +231,8 @@ export class NativeDesktop extends EventEmitter {
this.screenWidth,
this.screenHeight,
);
+ // Release the raw RGBA buffer immediately after processing
+ (img as any).data = null;
return {
width: this.screenWidth,
@@ -178,6 +277,8 @@ export class NativeDesktop extends EventEmitter {
llmWidth,
llmHeight,
);
+ // Release the raw RGBA buffer immediately after processing
+ (img as any).data = null;
return {
width: this.screenWidth, // real screen width
@@ -228,6 +329,8 @@ export class NativeDesktop extends EventEmitter {
const buffer = format === 'jpeg'
? await pipeline.jpeg({ quality }).toBuffer()
: await pipeline.png().toBuffer();
+ // Release the raw RGBA buffer immediately after processing
+ (img as any).data = null;
return {
width: rw,
@@ -289,27 +392,27 @@ export class NativeDesktop extends EventEmitter {
async mouseClick(x: number, y: number, button: number = 1): Promise {
if (!this.connected) throw new Error('Not connected');
- console.log(` 🖱️ Click at (${x}, ${y})`);
await mouse.setPosition(new Point(x, y));
await this.delay(50);
const btn = this.mapButton(button);
await mouse.click(btn);
+ console.log(` 🖱️ Click at (${x}, ${y})`);
}
async mouseDoubleClick(x: number, y: number): Promise {
if (!this.connected) throw new Error('Not connected');
- console.log(` 🖱️ Double-click at (${x}, ${y})`);
await mouse.setPosition(new Point(x, y));
await this.delay(50);
await mouse.doubleClick(Button.LEFT);
+ console.log(` 🖱️ Double-click at (${x}, ${y})`);
}
async mouseRightClick(x: number, y: number): Promise {
if (!this.connected) throw new Error('Not connected');
- console.log(` 🖱️ Right-click at (${x}, ${y})`);
await mouse.setPosition(new Point(x, y));
await this.delay(50);
await mouse.rightClick();
+ console.log(` 🖱️ Right-click at (${x}, ${y})`);
}
async mouseMove(x: number, y: number): Promise {
@@ -319,7 +422,6 @@ export class NativeDesktop extends EventEmitter {
async mouseScroll(x: number, y: number, delta: number): Promise {
if (!this.connected) throw new Error('Not connected');
- console.log(` 🖱️ Scroll at (${x}, ${y}) delta=${delta}`);
await mouse.setPosition(new Point(x, y));
await this.delay(30);
const steps = Math.abs(Math.round(delta));
@@ -331,17 +433,17 @@ export class NativeDesktop extends EventEmitter {
}
await this.delay(30);
}
+ console.log(` 🖱️ Scroll at (${x}, ${y}) delta=${delta}`);
}
async typeText(text: string): Promise {
if (!this.connected) throw new Error('Not connected');
- console.log(` ⌨️ Typing: "${text.substring(0, 60)}${text.length > 60 ? '...' : ''}"`);
await keyboard.type(text);
+ console.log(` ⌨️ Typed: "${text.substring(0, 60)}${text.length > 60 ? '...' : ''}"`);
}
async keyPress(keyCombo: string): Promise {
if (!this.connected) throw new Error('Not connected');
- console.log(` ⌨️ Key press: ${keyCombo}`);
const parts = keyCombo.split('+').map(k => k.trim());
const keys = parts.map(k => this.mapKey(k));
@@ -361,6 +463,7 @@ export class NativeDesktop extends EventEmitter {
await this.delay(30);
}
}
+ console.log(` ⌨️ Key press: ${keyCombo}`);
}
async executeMouseAction(action: MouseAction): Promise {
@@ -474,6 +577,9 @@ export class NativeDesktop extends EventEmitter {
this.screenWidth = 0;
this.screenHeight = 0;
this.emit('disconnected');
+    // Remove all listeners so this instance can be GC'd after disconnect.
+ // Must come after emit so 'disconnected' handlers still fire.
+ this.removeAllListeners();
console.log('🐾 Native desktop disconnected');
}
diff --git a/src/ocr-engine.ts b/src/ocr-engine.ts
new file mode 100644
index 0000000..abe05fc
--- /dev/null
+++ b/src/ocr-engine.ts
@@ -0,0 +1,360 @@
+/**
+ * OcrEngine — OS-level OCR bridge.
+ *
+ * Takes a screenshot (or region), returns structured OCR results with bounding boxes.
+ * Coordinates are in REAL screen pixels — no scaleFactor conversion needed.
+ *
+ * Windows: Windows.Media.Ocr via PowerShell one-shot (scripts/ocr-recognize.ps1).
+ * macOS: Apple Vision framework via Swift script (scripts/mac/ocr-recognize.swift).
+ * Linux: Tesseract OCR via Python script (scripts/linux/ocr-recognize.py).
+ *
+ * Caching: last result is kept for 300ms. Invalidated on any action execution.
+ */
+
+import * as os from 'os';
+import * as path from 'path';
+import * as fs from 'fs';
+import { execFile } from 'child_process';
+import { promisify } from 'util';
+import { screen } from '@nut-tree-fork/nut-js';
+import sharp from 'sharp';
+
+const execFileAsync = promisify(execFile);
+
+const SCRIPTS_DIR = path.join(__dirname, '..', 'scripts');
+const OCR_SCRIPT = path.join(SCRIPTS_DIR, 'ocr-recognize.ps1');
+const MAC_OCR_SCRIPT = path.join(SCRIPTS_DIR, 'mac', 'ocr-recognize.swift');
+const LINUX_OCR_SCRIPT = path.join(SCRIPTS_DIR, 'linux', 'ocr-recognize.py');
+const CACHE_TTL_MS = 300;
+const OCR_TIMEOUT = 15000; // 15s — WinRT assembly load + recognition
+const MAC_OCR_TIMEOUT = 20000; // 20s — Swift compilation on first run
+const LINUX_OCR_TIMEOUT = 30000; // 30s — Tesseract can be slow on large images
+const MAX_BUFFER = 4 * 1024 * 1024; // 4MB — large screens with dense text
+
+// ─── Public types ─────────────────────────────────────────────────────────────
+
+export interface OcrElement {
+ text: string;
+ x: number; // left edge in screen pixels
+ y: number; // top edge
+ width: number;
+ height: number;
+ confidence: number; // 0.0–1.0
+ line: number; // line index (for grouping)
+}
+
+export interface OcrResult {
+ elements: OcrElement[];
+ fullText: string; // flat concatenation for quick search
+ durationMs: number;
+}
+
+const EMPTY_RESULT: OcrResult = Object.freeze({ elements: [], fullText: '', durationMs: 0 });
+
+// ─── OcrEngine ────────────────────────────────────────────────────────────────
+
+export class OcrEngine {
+ private cachedResult: OcrResult | null = null;
+ private cacheTimestamp = 0;
+ private available: boolean | null = null;
+
+ /**
+ * Check if OS-level OCR is available on this platform.
+ * Never throws.
+ */
+ isAvailable(): boolean {
+ if (this.available !== null) return this.available;
+
+ if (process.platform === 'win32') {
+ // Windows.Media.Ocr ships with Windows 10+.
+ // Actual availability (language packs) is verified on first recognizeScreen() call.
+ this.available = true;
+ return true;
+ }
+
+ if (process.platform === 'darwin') {
+ // macOS: Apple Vision framework via Swift script.
+ // Available on macOS 10.15+ (Catalina and later).
+      // swift should be available once Xcode command-line tools are installed (probed below).
+ try {
+ const { execFileSync } = require('child_process');
+ execFileSync('which', ['swift'], { timeout: 3000, stdio: 'pipe' });
+ this.available = fs.existsSync(MAC_OCR_SCRIPT);
+ if (this.available) {
+ console.log('[OCR] macOS Vision framework available via Swift');
+ }
+ } catch {
+ this.available = false;
+ }
+ return this.available;
+ }
+
+ if (process.platform === 'linux') {
+ // Linux: Tesseract OCR via Python script.
+ // Requires: sudo apt install tesseract-ocr && python3
+ try {
+ const { execFileSync } = require('child_process');
+ execFileSync('which', ['tesseract'], { timeout: 3000, stdio: 'pipe' });
+ execFileSync('which', ['python3'], { timeout: 3000, stdio: 'pipe' });
+ this.available = fs.existsSync(LINUX_OCR_SCRIPT);
+ if (this.available) {
+ console.log('[OCR] Linux Tesseract OCR available');
+ }
+ } catch {
+ this.available = false;
+ }
+ return this.available;
+ }
+
+ this.available = false;
+ return false;
+ }
+
+ /**
+ * Invalidate the cached OCR result.
+ * Call after any action execution so the next read is fresh.
+ */
+ invalidateCache(): void {
+ this.cachedResult = null;
+ this.cacheTimestamp = 0;
+ }
+
+ /**
+ * OCR the entire screen. Returns cached result if within 300ms window.
+ * Never throws — returns EMPTY_RESULT on failure and degrades gracefully.
+ */
+ async recognizeScreen(): Promise {
+ if (!this.isAvailable()) return { ...EMPTY_RESULT };
+
+ // Return cached if fresh
+ const now = Date.now();
+ if (this.cachedResult && (now - this.cacheTimestamp) < CACHE_TTL_MS) {
+ return this.cachedResult;
+ }
+
+ const start = Date.now();
+ try {
+ // Capture full-resolution screenshot via nut-js
+ const img = await screen.grab();
+ const pngBuffer = await sharp(img.data, {
+ raw: { width: img.width, height: img.height, channels: 4 },
+ }).png().toBuffer();
+
+ // Save to temp file — OS OCR reads from disk
+ const tmpPath = path.join(os.tmpdir(), `clawdcursor-ocr-${process.pid}.png`);
+ fs.writeFileSync(tmpPath, pngBuffer);
+
+ try {
+ const result = await this.runOcr(tmpPath);
+ result.durationMs = Date.now() - start;
+
+ // Cache
+ this.cachedResult = result;
+ this.cacheTimestamp = Date.now();
+
+ return result;
+ } finally {
+ try { fs.unlinkSync(tmpPath); } catch { /* non-fatal */ }
+ }
+ } catch (err: any) {
+ console.error(`[OCR] recognizeScreen failed: ${err?.message}`);
+ // If first call ever fails, mark unavailable so pipeline degrades to vision LLM
+ if (this.cachedResult === null) {
+ this.available = false;
+ }
+ return { ...EMPTY_RESULT, durationMs: Date.now() - start };
+ }
+ }
+
+ /**
+ * OCR a rectangular region of the screen.
+ * Coordinates are in real screen pixels (input and output).
+ * Never throws.
+ */
+ async recognizeRegion(x: number, y: number, w: number, h: number): Promise {
+ if (!this.isAvailable()) return { ...EMPTY_RESULT };
+
+ const start = Date.now();
+ try {
+ const img = await screen.grab();
+
+ // Clamp to screen bounds
+ const rx = Math.max(0, Math.min(x, img.width - 1));
+ const ry = Math.max(0, Math.min(y, img.height - 1));
+ const rw = Math.min(w, img.width - rx);
+ const rh = Math.min(h, img.height - ry);
+
+ const pngBuffer = await sharp(img.data, {
+ raw: { width: img.width, height: img.height, channels: 4 },
+ })
+ .extract({ left: rx, top: ry, width: rw, height: rh })
+ .png()
+ .toBuffer();
+
+ const tmpPath = path.join(os.tmpdir(), `clawdcursor-ocr-region-${process.pid}.png`);
+ fs.writeFileSync(tmpPath, pngBuffer);
+
+ try {
+ const result = await this.runOcr(tmpPath);
+ result.durationMs = Date.now() - start;
+
+ // Offset coordinates back to full-screen space
+ for (const el of result.elements) {
+ el.x += rx;
+ el.y += ry;
+ }
+
+ return result;
+ } finally {
+ try { fs.unlinkSync(tmpPath); } catch { /* non-fatal */ }
+ }
+ } catch (err: any) {
+ console.error(`[OCR] recognizeRegion failed: ${err?.message}`);
+ return { ...EMPTY_RESULT, durationMs: Date.now() - start };
+ }
+ }
+
+ // ─── Private ──────────────────────────────────────────────────────────────
+
+ /**
+ * Dispatch to the platform-specific OCR implementation.
+ */
+ private async runOcr(imagePath: string): Promise {
+ if (process.platform === 'win32') {
+ return this.runWindowsOcr(imagePath);
+ }
+ if (process.platform === 'darwin') {
+ return this.runMacOcr(imagePath);
+ }
+ if (process.platform === 'linux') {
+ return this.runLinuxOcr(imagePath);
+ }
+ return { ...EMPTY_RESULT };
+ }
+
+ /**
+ * Windows: spawn PowerShell to invoke Windows.Media.Ocr WinRT API.
+ * The script outputs a single JSON line with { elements, fullText }.
+ */
+ private async runWindowsOcr(imagePath: string): Promise {
+ const { stdout } = await execFileAsync('powershell.exe', [
+ '-NoProfile',
+ '-NonInteractive',
+ '-ExecutionPolicy', 'Bypass',
+ '-File', OCR_SCRIPT,
+ imagePath,
+ ], {
+ timeout: OCR_TIMEOUT,
+ maxBuffer: MAX_BUFFER,
+ windowsHide: true,
+ });
+
+ const trimmed = stdout.trim();
+ if (!trimmed) {
+ throw new Error('PowerShell OCR script returned empty output');
+ }
+
+ // Sanitize control characters that PowerShell's ConvertTo-Json may leave unescaped
+ // (e.g. bell \x07 from OCR'd icons). Keep \t, \n, \r which are valid in JSON.
+ const sanitized = trimmed.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, '');
+
+ const data = JSON.parse(sanitized);
+ if (data.error) {
+ throw new Error(data.error);
+ }
+
+ const rawElements = Array.isArray(data.elements) ? data.elements : [];
+ const elements: OcrElement[] = rawElements.map((el: Record) => ({
+ text: String(el.text ?? ''),
+ x: Number(el.x) || 0,
+ y: Number(el.y) || 0,
+ width: Number(el.width) || 0,
+ height: Number(el.height) || 0,
+ confidence: Number(el.confidence) || 0,
+ line: Number(el.line) || 0,
+ }));
+
+ return {
+ elements,
+ fullText: String(data.fullText ?? ''),
+ durationMs: 0, // filled by caller
+ };
+ }
+
+ /**
+ * macOS: run Swift script that uses Apple Vision framework (VNRecognizeTextRequest).
+ * Requires macOS 10.15+ and Xcode command-line tools (swift).
+ * The script outputs a single JSON line with { elements, fullText }.
+ */
+ private async runMacOcr(imagePath: string): Promise {
+ const { stdout } = await execFileAsync('swift', [
+ MAC_OCR_SCRIPT,
+ imagePath,
+ ], {
+ timeout: MAC_OCR_TIMEOUT,
+ maxBuffer: MAX_BUFFER,
+ });
+
+ const trimmed = stdout.trim();
+ if (!trimmed) {
+ throw new Error('Swift OCR script returned empty output');
+ }
+
+ const data = JSON.parse(trimmed);
+ if (data.error) {
+ throw new Error(data.error);
+ }
+
+ return this.parseOcrJson(data);
+ }
+
+ /**
+ * Linux: run Python script that uses Tesseract OCR.
+ * Requires: tesseract-ocr package + python3.
+ * The script outputs a single JSON line with { elements, fullText }.
+ */
+ private async runLinuxOcr(imagePath: string): Promise {
+ const { stdout } = await execFileAsync('python3', [
+ LINUX_OCR_SCRIPT,
+ imagePath,
+ ], {
+ timeout: LINUX_OCR_TIMEOUT,
+ maxBuffer: MAX_BUFFER,
+ });
+
+ const trimmed = stdout.trim();
+ if (!trimmed) {
+ throw new Error('Python OCR script returned empty output');
+ }
+
+ const data = JSON.parse(trimmed);
+ if (data.error) {
+ throw new Error(data.error);
+ }
+
+ return this.parseOcrJson(data);
+ }
+
+ /**
+ * Shared JSON parser for macOS/Linux OCR output.
+ * Both scripts emit the same { elements, fullText } format.
+ */
+ private parseOcrJson(data: Record): OcrResult {
+ const rawElements = Array.isArray(data.elements) ? data.elements : [];
+ const elements: OcrElement[] = rawElements.map((el: Record) => ({
+ text: String(el.text ?? ''),
+ x: Number(el.x) || 0,
+ y: Number(el.y) || 0,
+ width: Number(el.width) || 0,
+ height: Number(el.height) || 0,
+ confidence: Number(el.confidence) || 0,
+ line: Number(el.line) || 0,
+ }));
+
+ return {
+ elements,
+ fullText: String(data.fullText ?? ''),
+ durationMs: 0, // filled by caller
+ };
+ }
+}
diff --git a/src/ocr-reasoner.ts b/src/ocr-reasoner.ts
new file mode 100644
index 0000000..2732b0a
--- /dev/null
+++ b/src/ocr-reasoner.ts
@@ -0,0 +1,385 @@
+/**
+ * OCR Reasoner — Layer 2.5.
+ *
+ * Primary universal read layer. Takes a screenshot, runs OS-level OCR,
+ * builds a structured UI snapshot string, feeds it to a cheap text LLM,
+ * and executes the returned action. Loops until done or cannot_read.
+ *
+ * Coordinates are in REAL screen pixels (no scaleFactor conversion needed).
+ * This is simpler and more accurate than the vision LLM coordinate path.
+ *
+ * Falls through to vision LLM (L3) only when OCR genuinely cannot parse
+ * the UI (captchas, pure image content, etc.).
+ */
+
+import { OcrEngine, type OcrResult, type OcrElement } from './ocr-engine';
+import { NativeDesktop } from './native-desktop';
+import { AccessibilityBridge } from './accessibility';
+import type { PipelineConfig } from './providers';
+import type { StepResult } from './types';
+
+const MAX_OCR_STEPS = 15; // max actions before giving up
+const SETTLE_MS = 400; // wait after action before re-OCR
+const CANNOT_READ_RETRIES = 2; // retries before signaling vision fallback
+
+// ─── Action types returned by the LLM ────────────────────────────────────────
+
+export type OcrAction =
+ | { action: 'click'; x: number; y: number; description: string }
+ | { action: 'type'; text: string; description: string }
+ | { action: 'key'; key: string; description: string }
+ | { action: 'scroll'; x: number; y: number; direction: 'up' | 'down'; amount: number }
+ | { action: 'wait'; ms: number; reason: string }
+ | { action: 'done'; evidence: string }
+ | { action: 'cannot_read'; reason: string };
+
+// ─── Result from a single OcrReasoner run ────────────────────────────────────
+
+export interface OcrReasonerResult {
+ handled: boolean;
+ success: boolean;
+ description: string;
+ steps: number;
+ fallbackReason?: string; // set when cannot_read — tells agent.ts to try vision LLM
+ actionLog: Array<{ action: string; description: string }>;
+}
+
+// ─── System prompt for the text LLM ─────────────────────────────────────────
+
+const SYSTEM_PROMPT = `You are a desktop automation agent. You receive a UI snapshot with OCR text elements (with pixel coordinates) and optionally an accessibility tree. Your job is to decide the SINGLE NEXT ACTION to accomplish the user's task.
+
+COORDINATE SYSTEM: All coordinates are in REAL SCREEN PIXELS. Click coordinates should target the CENTER of the element you want to click.
+
+RESPONSE FORMAT — respond with ONLY valid JSON, no markdown:
+{"action":"click","x":150,"y":300,"description":"Click the Send button"}
+{"action":"type","text":"Hello world","description":"Type greeting into the text field"}
+{"action":"key","key":"ctrl+s","description":"Save the document"}
+{"action":"scroll","x":640,"y":400,"direction":"down","amount":3,"description":"Scroll down to see more content"}
+{"action":"wait","ms":1000,"reason":"Waiting for page to load"}
+{"action":"done","evidence":"The email was sent — confirmation banner visible at top"}
+{"action":"cannot_read","reason":"Screen contains a captcha image that OCR cannot parse"}
+
+RULES:
+1. Return exactly ONE action per response
+2. Use OCR element coordinates — click at the CENTER of the target element (x + width/2, y + height/2)
+3. Prefer keyboard shortcuts over mouse clicks when available
+4. Say "done" ONLY when you have clear evidence the task is complete
+5. Say "cannot_read" ONLY when the screen content is genuinely unreadable (images, captchas)
+6. NEVER repeat the same failed action — try an alternative approach
+7. If an accessibility tree is provided, use it for semantic context (button roles, field labels)`;
+
+// ─── OcrReasoner class ──────────────────────────────────────────────────────
+
+export class OcrReasoner {
+ constructor(
+ private ocr: OcrEngine,
+ private desktop: NativeDesktop,
+ private a11y: AccessibilityBridge,
+ private pipelineConfig: PipelineConfig,
+ ) {}
+
+ /**
+ * Run the OCR reasoning loop for a single task.
+ * Returns when done, failed, or signals cannot_read for vision fallback.
+ */
+ async run(task: string, priorContext?: string[]): Promise {
+ const actionLog: Array<{ action: string; description: string }> = [];
+ let cannotReadCount = 0;
+ let stepCount = 0;
+
+ // Build conversation history for context
+ const messages: Array<{ role: string; content: string }> = [
+ { role: 'system', content: SYSTEM_PROMPT },
+ ];
+
+ for (let step = 0; step < MAX_OCR_STEPS; step++) {
+ stepCount = step + 1;
+
+ // 1. OCR the screen
+ this.ocr.invalidateCache();
+ const ocrResult = await this.ocr.recognizeScreen();
+
+ // 2. Optionally read a11y tree for semantic context
+ let a11ySnippet = '';
+ try {
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ if (activeWin) {
+ const tree = await this.a11y.getScreenContext(activeWin.processId).catch(() => null);
+ if (tree) {
+ a11ySnippet = `\n=== A11Y TREE (${activeWin.processName}: ${activeWin.title}) ===\n${tree.substring(0, 2000)}`;
+ }
+ }
+ } catch { /* non-fatal — OCR is the primary source */ }
+
+ // 3. Build the UI snapshot string
+ const snapshot = this.buildSnapshot(ocrResult, a11ySnippet, actionLog, task, priorContext);
+
+ // 4. Ask the text LLM for the next action
+ messages.push({ role: 'user', content: snapshot });
+
+ let llmResponse: string;
+ try {
+ llmResponse = await this.callTextLLM(messages);
+ } catch (err: any) {
+ console.error(` [OCR Reasoner] LLM call failed: ${err.message}`);
+ return {
+ handled: false,
+ success: false,
+ description: `LLM call failed: ${err.message}`,
+ steps: stepCount,
+ actionLog,
+ };
+ }
+
+ messages.push({ role: 'assistant', content: llmResponse });
+
+ // 5. Parse the action
+ const action = this.parseAction(llmResponse);
+ if (!action) {
+ console.log(` [OCR] Step ${stepCount}: Failed to parse LLM response`);
+ actionLog.push({ action: 'parse_error', description: 'Could not parse LLM response' });
+ continue;
+ }
+
+ // 6. Execute the action
+ console.log(` [OCR] Step ${stepCount}: ${action.action} — ${this.describeAction(action)}`);
+ actionLog.push({ action: action.action, description: this.describeAction(action) });
+
+ if (action.action === 'done') {
+ return {
+ handled: true,
+ success: true,
+ description: `OCR Reasoner completed: ${action.evidence}`,
+ steps: stepCount,
+ actionLog,
+ };
+ }
+
+ if (action.action === 'cannot_read') {
+ cannotReadCount++;
+ if (cannotReadCount >= CANNOT_READ_RETRIES) {
+ return {
+ handled: false,
+ success: false,
+ description: `OCR cannot read UI: ${action.reason}`,
+ steps: stepCount,
+ fallbackReason: 'cannot_read',
+ actionLog,
+ };
+ }
+ // Retry — maybe the screen changed
+ await new Promise(r => setTimeout(r, SETTLE_MS));
+ continue;
+ }
+
+ try {
+ await this.executeAction(action);
+ } catch (err: any) {
+ console.error(` [OCR] Action failed: ${err.message}`);
+ actionLog.push({ action: 'error', description: `Action failed: ${err.message}` });
+ }
+
+ // Wait for UI to settle
+ await new Promise(r => setTimeout(r, SETTLE_MS));
+ }
+
+ // Exceeded max steps
+ return {
+ handled: false,
+ success: false,
+ description: `OCR Reasoner exhausted ${MAX_OCR_STEPS} steps without completing`,
+ steps: stepCount,
+ actionLog,
+ };
+ }
+
+ // ─── Private helpers ──────────────────────────────────────────────────────
+
+ /**
+ * Build the UI snapshot string from OCR results + a11y tree.
+ */
+ private buildSnapshot(
+ ocrResult: OcrResult,
+ a11ySnippet: string,
+ actionLog: Array<{ action: string; description: string }>,
+ task: string,
+ priorContext?: string[],
+ ): string {
+ // Group OCR elements by line for readability
+ const lines = new Map();
+ for (const el of ocrResult.elements) {
+ const lineEls = lines.get(el.line) ?? [];
+ lineEls.push(el);
+ lines.set(el.line, lineEls);
+ }
+
+ const ocrLines: string[] = [];
+ for (const [_lineIdx, lineEls] of [...lines.entries()].sort((a, b) => a[0] - b[0])) {
+ const parts = lineEls
+ .sort((a, b) => a.x - b.x)
+ .map(el => `(${el.x},${el.y},${el.width}x${el.height}) "${el.text}"`);
+ ocrLines.push(parts.join(' | '));
+ }
+
+ const ocrText = ocrLines.length > 0
+ ? ocrLines.join('\n')
+ : '(no text detected — screen may be blank or contain only images)';
+
+ // Build action history string
+ const historyStr = actionLog.length > 0
+ ? `\n=== ACTIONS TAKEN SO FAR ===\n${actionLog.map((a, i) => `${i + 1}. ${a.action}: ${a.description}`).join('\n')}`
+ : '';
+
+ // Prior context from earlier pipeline stages
+ const contextStr = priorContext?.length
+ ? `\n=== PRIOR CONTEXT ===\n${priorContext.join('\n')}`
+ : '';
+
+ return `=== TASK ===
+${task}
+${contextStr}
+=== SCREEN SNAPSHOT (OCR — coordinates in real screen pixels) ===
+${ocrText}
+${a11ySnippet}
+${historyStr}
+
+What is the SINGLE NEXT ACTION to accomplish this task? Respond with JSON only.`;
+ }
+
+ /**
+ * Call the text LLM (layer2 config — cheap model like Haiku/GPT-4o-mini).
+ */
+ private async callTextLLM(messages: Array<{ role: string; content: string }>): Promise {
+ const { model, baseUrl } = this.pipelineConfig.layer2;
+ const apiKey = this.pipelineConfig.apiKey || '';
+
+ // Use OpenAI-compatible chat completions API
+ const headers: Record = {
+ 'Content-Type': 'application/json',
+ };
+
+ if (this.pipelineConfig.provider.openaiCompat) {
+ if (apiKey) headers['Authorization'] = `Bearer ${apiKey}`;
+ } else {
+ // Anthropic — use their auth format
+ const authHeaders = this.pipelineConfig.provider.authHeader(apiKey);
+ Object.assign(headers, authHeaders);
+ }
+
+ // For Anthropic, use the Messages API format
+ if (!this.pipelineConfig.provider.openaiCompat) {
+ const response = await fetch(`${baseUrl}/messages`, {
+ method: 'POST',
+ headers,
+ body: JSON.stringify({
+ model,
+ max_tokens: 500,
+ system: messages[0].content,
+ messages: messages.slice(1).map(m => ({
+ role: m.role === 'system' ? 'user' : m.role,
+ content: m.content,
+ })),
+ temperature: 0,
+ }),
+ });
+
+ const data: any = await response.json();
+ if (data.error) throw new Error(data.error.message || JSON.stringify(data.error));
+ return data.content?.[0]?.text || '';
+ }
+
+ // OpenAI-compatible
+ const response = await fetch(`${baseUrl}/chat/completions`, {
+ method: 'POST',
+ headers,
+ body: JSON.stringify({
+ model,
+ messages,
+ temperature: 0,
+ max_tokens: 500,
+ }),
+ });
+
+ const data: any = await response.json();
+ if (data.error) throw new Error(data.error.message || JSON.stringify(data.error));
+ return data.choices?.[0]?.message?.content || '';
+ }
+
+ /**
+ * Parse an OcrAction from the LLM response string.
+ */
+ private parseAction(response: string): OcrAction | null {
+ try {
+ const jsonMatch = response.match(/\{[\s\S]*\}/);
+ if (!jsonMatch) return null;
+
+ const parsed = JSON.parse(jsonMatch[0]);
+ if (!parsed.action) return null;
+
+ return parsed as OcrAction;
+ } catch {
+ return null;
+ }
+ }
+
+ /**
+ * Execute a single OcrAction via NativeDesktop.
+ * Coordinates are in real screen pixels — no scaling needed.
+ */
+ private async executeAction(action: OcrAction): Promise {
+ switch (action.action) {
+ case 'click':
+ await this.desktop.mouseClick(action.x, action.y);
+ this.a11y.invalidateCache();
+ this.ocr.invalidateCache();
+ break;
+
+ case 'type':
+ // Use clipboard paste for reliability
+ await this.a11y.writeClipboard(action.text);
+ await new Promise(r => setTimeout(r, 50));
+ await this.desktop.keyPress('ctrl+v');
+ await new Promise(r => setTimeout(r, 100));
+ this.a11y.invalidateCache();
+ this.ocr.invalidateCache();
+ break;
+
+ case 'key':
+ await this.desktop.keyPress(action.key);
+ this.a11y.invalidateCache();
+ this.ocr.invalidateCache();
+ break;
+
+ case 'scroll':
+ const delta = action.direction === 'down' ? action.amount : -action.amount;
+ await this.desktop.mouseScroll(action.x, action.y, delta);
+ this.ocr.invalidateCache();
+ break;
+
+ case 'wait':
+ await new Promise(r => setTimeout(r, action.ms));
+ this.ocr.invalidateCache();
+ break;
+
+ case 'done':
+ case 'cannot_read':
+ // No execution needed — handled by caller
+ break;
+ }
+ }
+
+ /**
+ * Human-readable description of an action.
+ */
+ private describeAction(action: OcrAction): string {
+ switch (action.action) {
+ case 'click': return `${action.description} at (${action.x},${action.y})`;
+ case 'type': return `${action.description}: "${action.text.substring(0, 50)}"`;
+ case 'key': return `${action.description}: ${action.key}`;
+ case 'scroll': return `Scroll ${action.direction} ${action.amount} at (${action.x},${action.y})`;
+ case 'wait': return `Wait ${action.ms}ms: ${action.reason}`;
+ case 'done': return `Done: ${action.evidence}`;
+ case 'cannot_read': return `Cannot read: ${action.reason}`;
+ }
+ }
+}
diff --git a/src/onboarding.ts b/src/onboarding.ts
new file mode 100644
index 0000000..04b69bb
--- /dev/null
+++ b/src/onboarding.ts
@@ -0,0 +1,142 @@
+/**
+ * Onboarding — first-run consent flow for desktop control.
+ *
+ * On first run, warns the user about desktop control capabilities
+ * and requires explicit consent before tools become active.
+ * Consent is stored in ~/.clawdcursor/consent.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import * as readline from 'readline';
+
+const CONSENT_DIR = path.join(os.homedir(), '.clawdcursor');
+const CONSENT_FILE = path.join(CONSENT_DIR, 'consent');
+
+/** Check if the user has already given consent */
+export function hasConsent(): boolean {
+ return fs.existsSync(CONSENT_FILE);
+}
+
+/** Save consent to disk */
+function saveConsent(): void {
+ if (!fs.existsSync(CONSENT_DIR)) {
+ fs.mkdirSync(CONSENT_DIR, { recursive: true });
+ }
+ fs.writeFileSync(CONSENT_FILE, JSON.stringify({
+ accepted: true,
+ timestamp: new Date().toISOString(),
+ platform: process.platform,
+ version: '0.7.0',
+ }, null, 2));
+}
+
+/** Write consent file directly (for --accept flag / CI / scripted use) */
+export function writeConsentFile(): void {
+ saveConsent();
+}
+
+/** Print the big ASCII banner — only called during first-run onboarding */
+function printBanner(): void {
+ const G = '\x1b[32m', B = '\x1b[1m\x1b[32m', R = '\x1b[0m', D = '\x1b[90m';
+ process.stdout.write(
+ `\n${G}\n` +
+ ` /\\___/\\\n` +
+ ` ( >^.^< ) claw\n` +
+ ` ) ( claw\n` +
+ ` (_)_(_)_)\n` +
+ `${R}\n` +
+ `${B}\n` +
+ ` \u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2557\n` +
+ ` \u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2551 \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\n` +
+ ` \u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2551 \u2588\u2557 \u2588\u2588\u2551\u2588\u2588\u2551 \u2588\u2588\u2551\n` +
+ ` \u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2551\u2588\u2588\u2551\u2588\u2588\u2588\u2557\u2588\u2588\u2551\u2588\u2588\u2551 \u2588\u2588\u2551\n` +
+ ` \u255a\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551\u255a\u2588\u2588\u2588\u2554\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\n` +
+ ` \u255a\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u255d \u255a\u2550\u255d \u255a\u2550\u2550\u255d\u255a\u2550\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u255d\n` +
+ `${R}${G}\n` +
+ ` \u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2557 \u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2557\n` +
+ ` \u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\n` +
+ ` \u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\n` +
+ ` \u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u255a\u2550\u2550\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\n` +
+ ` \u255a\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u255a\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u255a\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2551 \u2588\u2588\u2551\n` +
+ ` \u255a\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u255d \u255a\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u255d \u255a\u2550\u255d\n` +
+ `${R}\n` +
+ `${D} OS-level Desktop Automation Server${R}\n` +
+ `${D} \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500${R}\n\n`
+ );
+}
+
+/** Run the onboarding consent flow (interactive terminal) */
+export async function runOnboarding(context: 'start' | 'consent' = 'start', startPort: number = 3847): Promise<boolean> {
+ // Non-interactive mode (piped stdin, CI, MCP stdio) — skip consent
+ if (!process.stdin.isTTY || !process.stdout.isTTY) {
+ return true;
+ }
+
+ // Show the big banner — this is the one moment every user sees it
+ printBanner();
+
+ const contextNote = context === 'start'
+ ? `\x1b[90m You are starting:\x1b[0m\n` +
+ `\x1b[90m \u2192 AI Agent + REST API on \x1b[0m\x1b[36mlocalhost:${startPort}\x1b[0m\n` +
+ `\x1b[90m \u2192 Any local process can call tool endpoints on that port\x1b[0m\n`
+ : `\x1b[90m This one-time consent covers all transport modes:\x1b[0m\n` +
+ `\x1b[90m \u2192 MCP server (Claude Code, Cursor, Windsurf, Zed)\x1b[0m\n` +
+ `\x1b[90m \u2192 REST API (clawdcursor start)\x1b[0m\n` +
+ `\x1b[90m \u2192 Direct agent tasks\x1b[0m\n`;
+
+ console.log(`
+\x1b[33m
+ \u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557
+ \u2551 \u2551
+ \u2551 \u26a0 DESKTOP CONTROL WARNING \u26a0 \u2551
+ \u2551 \u2551
+ \u255a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255d
+\x1b[0m
+\x1b[90m clawdcursor gives AI models full control of your desktop:\x1b[0m
+
+\x1b[31m ●\x1b[0m Mouse clicks and keyboard input anywhere on screen
+\x1b[31m ●\x1b[0m Screenshot capture of your entire display
+\x1b[31m ●\x1b[0m Read and write OS clipboard
+\x1b[31m ●\x1b[0m Open, close, and switch between applications
+\x1b[31m ●\x1b[0m Browser DOM interaction via Chrome DevTools Protocol
+\x1b[31m ●\x1b[0m Read accessibility tree (window contents, UI elements)
+
+${contextNote}
+\x1b[32m SAFETY NOTES:\x1b[0m
+\x1b[90m ● Only run on a machine you control\x1b[0m
+\x1b[90m ● Only connect AI models you trust\x1b[0m
+\x1b[90m ● Server binds to localhost only (127.0.0.1)\x1b[0m
+\x1b[90m ● Dangerous key combos (Alt+F4, Ctrl+Alt+Del) are blocked\x1b[0m
+\x1b[90m ● Run \x1b[0m\x1b[36mclawdcursor stop\x1b[0m\x1b[90m to shut down when not in use\x1b[0m
+
+\x1b[90m ──────────────────────────────────────────────────────────\x1b[0m
+`);
+
+ const rl = readline.createInterface({
+ input: process.stdin,
+ output: process.stdout,
+ });
+
+ const answer = await new Promise<string>((resolve) => {
+ rl.question(' Accept and continue? (y/N) ', resolve);
+ });
+ rl.close();
+
+ if (answer.toLowerCase() === 'y' || answer.toLowerCase() === 'yes') {
+ saveConsent();
+ console.log('\n Consent saved. You won\'t be asked again.\n');
+ return true;
+ }
+
+ console.log('\n Declined. clawdcursor will not start.\n');
+ return false;
+}
+
+/** Revoke consent (for uninstall) */
+export function revokeConsent(): void {
+ if (fs.existsSync(CONSENT_FILE)) {
+ fs.unlinkSync(CONSENT_FILE);
+ }
+}
diff --git a/src/paths.ts b/src/paths.ts
new file mode 100644
index 0000000..6cdfc46
--- /dev/null
+++ b/src/paths.ts
@@ -0,0 +1,82 @@
+/**
+ * Paths — central data directory for Clawd Cursor.
+ *
+ * All persistent data lives under ~/.clawdcursor/:
+ * task-logs/ — JSONL per-task execution logs
+ * reports/ — locally saved error reports
+ * consent — first-run consent flag
+ * ui-knowledge/ — local app workflow instruction sets
+ *
+ * Migrates from legacy paths if found:
+ * ~/.openclaw/clawd-cursor/ (v0.5.x)
+ * ~/.clawd-cursor/ (v0.6.x–v0.7.0 pre-rename)
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/** Root data directory: ~/.clawdcursor */
+export const DATA_DIR = path.join(os.homedir(), '.clawdcursor');
+
+/** Sub-directories */
+export const TASK_LOGS_DIR = path.join(DATA_DIR, 'task-logs');
+export const REPORTS_DIR = path.join(DATA_DIR, 'reports');
+export const UI_KNOWLEDGE_DIR = path.join(DATA_DIR, 'ui-knowledge');
+
+/** Persistent files */
+export const FAVORITES_PATH = path.join(DATA_DIR, '.clawdcursor-favorites.json');
+export const TOKEN_PATH = path.join(DATA_DIR, 'token');
+
+/**
+ * Migrate data from legacy directories to ~/.clawdcursor/.
+ * Checks both ~/.openclaw/clawd-cursor/ (v0.5.x) and ~/.clawd-cursor/ (v0.6–v0.7 pre-rename).
+ * Only runs once — if the new dir already has content, skips.
+ * Safe: copies, doesn't delete originals.
+ */
+export function migrateFromLegacyDir(): void {
+ // Try most recent legacy path first, then oldest
+ const legacyCandidates = [
+ path.join(os.homedir(), '.clawd-cursor'), // v0.6.x–v0.7.0 pre-rename
+ path.join(os.homedir(), '.openclaw', 'clawd-cursor'), // v0.5.x
+ ];
+ const legacyDir = legacyCandidates.find(d => fs.existsSync(d));
+ if (!legacyDir) return;
+
+ // If new dir already has task-logs, skip migration (already migrated)
+ if (fs.existsSync(TASK_LOGS_DIR) && fs.readdirSync(TASK_LOGS_DIR).length > 0) return;
+
+ try {
+ // Ensure new dirs exist
+ fs.mkdirSync(TASK_LOGS_DIR, { recursive: true });
+ fs.mkdirSync(REPORTS_DIR, { recursive: true });
+
+ // Copy task-logs
+ const legacyLogs = path.join(legacyDir, 'task-logs');
+ if (fs.existsSync(legacyLogs)) {
+ for (const file of fs.readdirSync(legacyLogs)) {
+ const src = path.join(legacyLogs, file);
+ const dst = path.join(TASK_LOGS_DIR, file);
+ if (!fs.existsSync(dst)) {
+ fs.copyFileSync(src, dst);
+ }
+ }
+ }
+
+ // Copy reports
+ const legacyReports = path.join(legacyDir, 'reports');
+ if (fs.existsSync(legacyReports)) {
+ for (const file of fs.readdirSync(legacyReports)) {
+ const src = path.join(legacyReports, file);
+ const dst = path.join(REPORTS_DIR, file);
+ if (!fs.existsSync(dst)) {
+ fs.copyFileSync(src, dst);
+ }
+ }
+ }
+
+ console.log(`📦 Migrated data from ${legacyDir} → ${DATA_DIR}`);
+ } catch {
+ // Non-critical — old data still accessible at original path
+ }
+}
diff --git a/src/postbuild.ts b/src/postbuild.ts
index d35e0a3..660ecb1 100644
--- a/src/postbuild.ts
+++ b/src/postbuild.ts
@@ -1,78 +1,20 @@
/**
* Post-build script — runs after tsc.
- * 1. Prints available commands
- * 2. Auto-registers as OpenClaw skill if OpenClaw is installed
+ * Prints available commands.
*/
-import * as fs from 'fs';
-import * as path from 'path';
-import * as os from 'os';
-
// Print available commands
console.log(`
🐾 Clawd Cursor installed! Available commands:
- clawdcursor install Set up API key, configure pipeline, register as OpenClaw skill
+ clawdcursor install Set up API key and configure pipeline
clawdcursor doctor Auto-detect and configure your AI
clawdcursor start Start the agent
+ clawdcursor serve Start tools-only server (no built-in LLM)
+ clawdcursor mcp Run as MCP tool server (for Claude Code, Cursor, etc.)
clawdcursor task Send a task
clawdcursor stop Stop the agent
clawdcursor dashboard Open the web dashboard
- clawdcursor uninstall Remove all config, data, and OpenClaw registration
+ clawdcursor report Send an error report to help improve the agent
+ clawdcursor uninstall Remove all config and data
`);
-
-// Auto-register as OpenClaw skill
-const homeDir = os.homedir();
-const skillsDir = path.join(homeDir, '.openclaw', 'skills');
-const skillTarget = path.join(skillsDir, 'clawdcursor');
-const clawdRoot = path.resolve(__dirname, '..');
-
-if (fs.existsSync(skillsDir)) {
- if (fs.existsSync(skillTarget)) {
- console.log('🔗 OpenClaw skill: already registered');
- } else {
- try {
- fs.symlinkSync(clawdRoot, skillTarget, process.platform === 'win32' ? 'junction' : 'dir');
- console.log(`🔗 OpenClaw skill: registered → ${skillTarget}`);
- } catch {
- // Symlink failed — copy SKILL.md
- try {
- fs.mkdirSync(skillTarget, { recursive: true });
- fs.copyFileSync(
- path.join(clawdRoot, 'SKILL.md'),
- path.join(skillTarget, 'SKILL.md')
- );
- console.log('🔗 OpenClaw skill: registered (copied SKILL.md)');
- } catch {
- console.log('🔗 OpenClaw skill: failed to register (run clawdcursor install to retry)');
- }
- }
- }
-} else {
- console.log('🔗 OpenClaw not detected — install OpenClaw (https://openclaw.ai) to use as an AI skill');
-}
-
-// Auto-register as Codex skill (Codex UI / Codex agent)
-const codexSkillsDir = path.join(homeDir, '.codex', 'skills');
-const codexTarget = path.join(codexSkillsDir, 'clawd-cursor');
-if (fs.existsSync(codexSkillsDir)) {
- if (fs.existsSync(codexTarget)) {
- console.log('🔗 Codex skill: already registered');
- } else {
- try {
- fs.symlinkSync(clawdRoot, codexTarget, process.platform === 'win32' ? 'junction' : 'dir');
- console.log(`🔗 Codex skill: registered → ${codexTarget}`);
- } catch {
- try {
- fs.mkdirSync(codexTarget, { recursive: true });
- fs.copyFileSync(
- path.join(clawdRoot, 'SKILL.md'),
- path.join(codexTarget, 'SKILL.md')
- );
- console.log('🔗 Codex skill: registered (copied SKILL.md)');
- } catch {
- console.log('🔗 Codex skill: failed to register — symlink/copy ~/.codex/skills/clawd-cursor manually');
- }
- }
- }
-}
diff --git a/src/providers.ts b/src/providers.ts
index c457d4a..6921fbf 100644
--- a/src/providers.ts
+++ b/src/providers.ts
@@ -1,4 +1,4 @@
-import { resolveApiConfig } from './openclaw-credentials';
+import { resolveApiConfig } from './credentials';
/**
* Provider Model Map — auto-selects cheap/expensive models per provider.
@@ -90,6 +90,33 @@ export const PROVIDERS: Record = {
openaiCompat: true,
computerUse: false,
},
+ gemini: {
+ name: 'Google Gemini',
+ baseUrl: 'https://generativelanguage.googleapis.com/v1beta/openai',
+ authHeader: (key) => ({ 'Authorization': `Bearer ${key}` }),
+ textModel: 'gemini-2.0-flash',
+ visionModel: 'gemini-2.0-flash',
+ openaiCompat: true,
+ computerUse: false,
+ },
+ mistral: {
+ name: 'Mistral AI',
+ baseUrl: 'https://api.mistral.ai/v1',
+ authHeader: (key) => ({ 'Authorization': `Bearer ${key}` }),
+ textModel: 'mistral-small-latest',
+ visionModel: 'pixtral-large-latest',
+ openaiCompat: true,
+ computerUse: false,
+ },
+ xai: {
+ name: 'xAI (Grok)',
+ baseUrl: 'https://api.x.ai/v1',
+ authHeader: (key) => ({ 'Authorization': `Bearer ${key}` }),
+ textModel: 'grok-3-mini',
+ visionModel: 'grok-2-vision-1212',
+ openaiCompat: true,
+ computerUse: false,
+ },
generic: {
name: 'OpenAI-Compatible',
baseUrl: '', // set from config
@@ -113,6 +140,8 @@ export function detectProvider(apiKey: string, explicitProvider?: string): strin
if (!apiKey) return 'ollama'; // No key = local mode
if (apiKey.startsWith('sk-ant-')) return 'anthropic';
+ if (apiKey.startsWith('AIza')) return 'gemini'; // Google Gemini API keys start with AIza
+ if (apiKey.startsWith('xai-')) return 'xai'; // xAI Grok
if (apiKey.startsWith('sk-') && apiKey.length > 60) return 'kimi'; // Kimi keys are longer than OpenAI
if (apiKey.startsWith('sk-')) return 'openai';
if (apiKey.startsWith('gsk_')) return 'groq';
@@ -143,6 +172,10 @@ export interface PipelineConfig {
computerUse: boolean;
apiKey?: string;
};
+ /** OCR-first pipeline — enabled when OS OCR is available */
+ ocrEnabled?: boolean;
+ /** Skill cache — learns from successful task completions */
+ skillCacheEnabled?: boolean;
}
/**
@@ -234,7 +267,7 @@ function isOllamaVisionModel(modelId: string): boolean {
/**
* Env var names we check per provider key.
- * AI_API_KEY is a generic fallback; OpenClaw-provided provider hints are preferred.
+ * AI_API_KEY is a generic fallback; external config provider hints are preferred.
*/
const PROVIDER_ENV_VARS: Record = {
@@ -259,11 +292,11 @@ export async function scanProviders(): Promise {
const resolvedApi = resolveApiConfig();
const genericKey = resolvedApi.apiKey || process.env.AI_API_KEY || '';
const genericProviderHint = resolvedApi.provider || '';
- const genericIsOpenClaw = resolvedApi.source === 'openclaw';
+ const isExternalSource = resolvedApi.source === 'external';
- // When OpenClaw is the source, load ALL provider keys from config files
- const openclawProviderKeys: Record = {};
- if (resolvedApi.source === 'openclaw') {
+ // When credentials come from external config, load ALL provider keys from config files
+ const externalProviderKeys: Record<string, { apiKey: string; baseUrl?: string }> = {};
+ if (resolvedApi.source === 'external') {
// resolveApiConfig only returns the "best" provider.
// We need ALL of them for scanning. Read auth-profiles directly.
try {
@@ -293,7 +326,7 @@ export async function scanProviders(): Promise {
const apiKey = val?.key || val?.apiKey || val?.api_key || '';
if (!apiKey) continue;
- // Map OpenClaw provider names to Clawd Cursor provider keys
+ // Map external provider names to Clawd Cursor provider keys
const providerMap: Record = {
'anthropic': 'anthropic',
'openai': 'openai',
@@ -305,8 +338,8 @@ export async function scanProviders(): Promise {
};
const clawdKey = providerMap[providerName];
- if (clawdKey && !openclawProviderKeys[clawdKey]) {
- openclawProviderKeys[clawdKey] = { apiKey };
+ if (clawdKey && !externalProviderKeys[clawdKey]) {
+ externalProviderKeys[clawdKey] = { apiKey };
}
}
} catch { /* skip */ }
@@ -340,17 +373,17 @@ export async function scanProviders(): Promise {
};
const clawdKey = providerMap[provName.toLowerCase()];
- if (clawdKey && openclawProviderKeys[clawdKey] && baseUrl) {
- openclawProviderKeys[clawdKey].baseUrl = baseUrl;
+ if (clawdKey && externalProviderKeys[clawdKey] && baseUrl) {
+ externalProviderKeys[clawdKey].baseUrl = baseUrl;
}
}
} catch { /* skip */ }
}
}
- } catch { /* OpenClaw config read failed, continue with existing logic */ }
-
- if (Object.keys(openclawProviderKeys).length > 0) {
- console.log(` 🔗 OpenClaw providers detected: ${Object.keys(openclawProviderKeys).join(', ')}`);
+ } catch { /* External config read failed, continue with existing logic */ }
+
+ if (Object.keys(externalProviderKeys).length > 0) {
+ console.log(` 🔗 External providers detected: ${Object.keys(externalProviderKeys).join(', ')}`);
}
}
@@ -361,8 +394,8 @@ export async function scanProviders(): Promise {
if (genericProviderHint === providerKey && genericKey) {
key = genericKey;
- } else if (genericIsOpenClaw && !genericProviderHint && providerKey === 'openai' && genericKey) {
- // OpenClaw may provide an OpenAI-compatible endpoint without a provider label.
+ } else if (isExternalSource && !genericProviderHint && providerKey === 'openai' && genericKey) {
+ // External config may provide an OpenAI-compatible endpoint without a provider label.
key = genericKey;
}
@@ -374,13 +407,13 @@ export async function scanProviders(): Promise {
}
}
- // OpenClaw multi-provider keys
- if (!key && openclawProviderKeys[providerKey]) {
- key = openclawProviderKeys[providerKey].apiKey;
+ // External multi-provider keys
+ if (!key && externalProviderKeys[providerKey]) {
+ key = externalProviderKeys[providerKey].apiKey;
}
// For standalone AI_API_KEY, infer provider by key format as a best-effort fallback.
- if (!key && genericKey && !(genericIsOpenClaw && !genericProviderHint)) {
+ if (!key && genericKey && !(isExternalSource && !genericProviderHint)) {
const detected = detectProvider(genericKey);
if (detected === providerKey) {
key = genericKey;
@@ -448,8 +481,8 @@ export async function scanProviders(): Promise {
results.push(ollamaResult);
- // ── Create dynamic provider entries for unknown OpenClaw providers ──────
- if (resolvedApi.source === 'openclaw') {
+ // ── Create dynamic provider entries for unknown external providers ──────
+ if (resolvedApi.source === 'external') {
try {
const os = await import('os');
const fs = await import('fs');
@@ -510,7 +543,7 @@ export async function scanProviders(): Promise {
if (!apiKey) continue;
- // Extract model names from OpenClaw config
+ // Extract model names from external config
const textModels = Object.keys(models).filter(m =>
!m.toLowerCase().includes('vision') &&
!m.toLowerCase().includes('dall-e') &&
@@ -547,7 +580,7 @@ export async function scanProviders(): Promise {
key: dynamicProviderKey,
name: provName,
available: true,
- detail: `OpenClaw config (${maskKey(apiKey)})`,
+ detail: `external config (${maskKey(apiKey)})`,
apiKey: apiKey,
});
@@ -558,22 +591,22 @@ export async function scanProviders(): Promise {
} catch { /* skip */ }
}
}
- } catch { /* OpenClaw dynamic provider creation failed, continue */ }
+ } catch { /* External dynamic provider creation failed, continue */ }
}
- // Apply OpenClaw base URLs to custom providers (e.g., moonshot uses api.moonshot.cn, not openai.com)
+ // Apply external base URLs to custom providers (e.g., moonshot uses api.moonshot.cn, not openai.com)
for (const result of results) {
- if (openclawProviderKeys[result.key]?.baseUrl && result.available) {
+ if (externalProviderKeys[result.key]?.baseUrl && result.available) {
// Store for later use in pipeline building
- (result as any).openclawBaseUrl = openclawProviderKeys[result.key].baseUrl;
+ (result as any).externalBaseUrl = externalProviderKeys[result.key].baseUrl;
}
}
return results;
}
-/** Text model preference: cheapest/fastest first */
-const TEXT_MODEL_PREFERENCE: string[] = ['ollama', 'groq', 'together', 'deepseek', 'kimi', 'openai', 'anthropic'];
+/** Text model preference: fastest/most-reliable first */
+const TEXT_MODEL_PREFERENCE: string[] = ['ollama', 'groq', 'together', 'deepseek', 'anthropic', 'openai', 'kimi'];
/** Vision model preference: best vision capability first */
const VISION_MODEL_PREFERENCE: string[] = ['anthropic', 'openai', 'groq', 'together', 'kimi', 'deepseek', 'ollama'];
diff --git a/src/ps-runner.ts b/src/ps-runner.ts
new file mode 100644
index 0000000..65a2668
--- /dev/null
+++ b/src/ps-runner.ts
@@ -0,0 +1,177 @@
+/**
+ * PSRunner — Persistent PowerShell UIA bridge.
+ *
+ * Keeps one powershell.exe alive for the entire session.
+ * UI Automation assemblies are loaded once at startup (~800ms).
+ * Each subsequent command costs only the actual work — no 200-500ms spawn overhead.
+ *
+ * Protocol: newline-delimited JSON on stdin/stdout.
+ * Send: {"cmd":"invoke-element","processId":123,...}\n
+ * Recv: {"success":true,...}\n
+ *
+ * Commands are serialized (one at a time), queued if a call is in-flight.
+ */
+
+import { spawn, type ChildProcessWithoutNullStreams } from 'child_process';
+import * as readline from 'readline';
+import * as path from 'path';
+
+const BRIDGE_SCRIPT = path.join(__dirname, '..', 'scripts', 'ps-bridge.ps1');
+const READY_TIMEOUT = 12000; // initial PS startup + assembly load
+const CALL_TIMEOUT = 45000; // per command (complex web pages need more time)
+const MAX_QUEUE_SIZE = 100; // backpressure — reject if queue exceeds this
+
+interface PendingCall {
+ command: Record<string, unknown>;
+ resolve: (value: unknown) => void;
+ reject: (reason: unknown) => void;
+ timer: ReturnType<typeof setTimeout>;
+}
+
+export class PSRunner {
+ private proc: ChildProcessWithoutNullStreams | null = null;
+ private rl: readline.Interface | null = null;
+ private ready = false;
+ private dead = false;
+ private queue: PendingCall[] = [];
+ private current: PendingCall | null = null;
+ private startPromise: Promise<void> | null = null;
+
+ async start(): Promise<void> {
+ if (this.startPromise) return this.startPromise;
+ this.startPromise = this._start().catch(err => {
+ this.startPromise = null;
+ throw err;
+ });
+ return this.startPromise;
+ }
+
+ private _start(): Promise<void> {
+ return new Promise<void>((resolve, reject) => {
+ this.dead = false;
+ this.ready = false;
+
+ this.proc = spawn('powershell.exe', [
+ '-NoProfile',
+ '-NonInteractive',
+ '-ExecutionPolicy', 'Bypass',
+ '-File', BRIDGE_SCRIPT,
+ ], { stdio: ['pipe', 'pipe', 'pipe'] });
+
+ this.rl = readline.createInterface({ input: this.proc.stdout! });
+
+ const readyTimer = setTimeout(() => {
+ reject(new Error('PSRunner: timed out waiting for bridge ready'));
+ }, READY_TIMEOUT);
+
+ this.rl.on('line', (line) => {
+ line = line.trim();
+ if (!line) return;
+
+ let data: any;
+ try { data = JSON.parse(line); } catch { return; }
+
+ if (!this.ready) {
+ if (data.ready) {
+ this.ready = true;
+ clearTimeout(readyTimer);
+ console.log('[PSBridge] Ready — UIA assemblies loaded');
+ resolve();
+ } else if (data.error) {
+ clearTimeout(readyTimer);
+ reject(new Error(`PSRunner startup: ${data.error}`));
+ }
+ return;
+ }
+
+ // Deliver to in-flight call
+ const call = this.current;
+ this.current = null;
+ if (call) {
+ clearTimeout(call.timer);
+ if (data.error) call.reject(new Error(data.error));
+ else call.resolve(data);
+ }
+ this._drain();
+ });
+
+ this.proc.stderr!.on('data', (chunk: Buffer) => {
+ const msg = chunk.toString().trim();
+ if (msg) console.error(`[PSBridge] ${msg}`);
+ });
+
+ this.proc.on('exit', (code) => {
+ const pending = this.current ? [this.current, ...this.queue] : [...this.queue];
+ this.dead = true;
+ this.ready = false;
+ this.startPromise = null;
+ clearTimeout(readyTimer);
+ if (pending.length > 0) {
+ console.error(`[PSBridge] Process exited (code ${code}) with ${pending.length} pending command(s) — will restart on next call`);
+ }
+ this.current = null;
+ this.queue = [];
+ const err = new Error(`PSRunner exited (code ${code})`);
+ for (const c of pending) { clearTimeout(c.timer); c.reject(err); }
+ });
+ });
+ }
+
+ async run(command: Record<string, unknown>): Promise<unknown> {
+ // Auto-start or auto-restart
+ if (!this.startPromise || this.dead) {
+ if (this.dead) console.log('[PSBridge] Restarting crashed bridge process...');
+ this.dead = false;
+ await this.start();
+ } else {
+ await this.startPromise;
+ }
+
+ return new Promise((resolve, reject) => {
+ if (this.queue.length >= MAX_QUEUE_SIZE) {
+ reject(new Error(`PSRunner queue full (${MAX_QUEUE_SIZE}) — backpressure. Try again later.`));
+ return;
+ }
+ const call: PendingCall = {
+ command,
+ resolve,
+ reject,
+ timer: setTimeout(() => {
+ if (this.current === call) this.current = null;
+ console.error(`[PSBridge] Command timeout after ${CALL_TIMEOUT}ms: ${String(command.cmd)}`);
+ reject(new Error(`PSRunner timeout: ${String(command.cmd)}`));
+ this._drain();
+ }, CALL_TIMEOUT),
+ };
+ this.queue.push(call);
+ this._drain();
+ });
+ }
+
+ private _drain(): void {
+ if (this.current || this.queue.length === 0 || !this.proc || this.dead) return;
+ this.current = this.queue.shift()!;
+ try {
+ const line = JSON.stringify(this.current.command) + '\n';
+ this.proc.stdin!.write(line);
+ } catch (err) {
+ const call = this.current;
+ this.current = null;
+ clearTimeout(call.timer);
+ call.reject(err);
+ this._drain();
+ }
+ }
+
+ stop(): void {
+ if (this.proc) {
+ try { this.proc.stdin!.write('EXIT\n'); } catch {}
+ setTimeout(() => { try { this.proc?.kill(); } catch {} }, 500);
+ }
+ this.ready = false;
+ this.dead = true;
+ }
+}
+
+// Singleton — shared across all AccessibilityBridge instances
+export const psRunner = new PSRunner();
diff --git a/src/report.ts b/src/report.ts
new file mode 100644
index 0000000..1ab5df6
--- /dev/null
+++ b/src/report.ts
@@ -0,0 +1,382 @@
+/**
+ * Error Report — opt-in user report submission.
+ *
+ * Users can send task logs + system info to help improve the agent.
+ * All data is redacted before sending (no clipboard, no typed text,
+ * no file paths with usernames, no API keys).
+ *
+ * Privacy-first: never automatic, user must explicitly trigger.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import * as readline from 'readline';
+import { getVersion } from './version';
+import { TASK_LOGS_DIR, REPORTS_DIR } from './paths';
+
+// ─── Configuration ──────────────────────────────────────────
+
+const REPORT_ENDPOINT = process.env.CLAWD_REPORT_URL || 'https://api.clawdcursor.com/reports';
+const LOG_DIR = TASK_LOGS_DIR;
+
+// ─── Types ──────────────────────────────────────────────────
+
+// Shape of a submitted report. Every string in here has already been redacted.
+export interface ErrorReport {
+ // Unique id, e.g. "rpt_<base36 time>_<random suffix>" (see buildReport).
+ reportId: string;
+ timestamp: string;
+ version: string;
+ system: {
+ platform: string;
+ arch: string;
+ nodeVersion: string;
+ osRelease: string;
+ };
+ // Present only when the log contained a task_summary entry.
+ task?: {
+ description: string;
+ status: string;
+ totalSteps: number;
+ durationMs: number;
+ layersUsed: (string | number)[];
+ llmCallCount: number;
+ };
+ steps: RedactedStep[];
+ // Optional free-text note supplied by the user at submit time.
+ userNote?: string;
+ // Redacted error text from the final step, when that step failed.
+ errorContext?: string;
+}
+
+// One privacy-safe step. actionParams, llmReasoning and uiStateSummary are
+// intentionally absent — they are dropped, not redacted (see redactStep).
+interface RedactedStep {
+ stepIndex: number;
+ timestamp: string;
+ layer: string | number;
+ actionType: string;
+ result: string;
+ durationMs?: number;
+ error?: string;
+ verification?: {
+ method: string;
+ verified: boolean;
+ };
+}
+
+// ─── Redaction ───────────────────────────────────────────────
+
+// Patterns replaced with '[REDACTED]' by redactSensitive(). All carry the /g
+// flag so every occurrence in a string is scrubbed, not just the first.
+const SENSITIVE_PATTERNS = [
+ // API keys
+ /sk-[a-zA-Z0-9_-]{20,}/g,
+ /api[_-]?key["\s:=]+["']?[a-zA-Z0-9_-]{16,}/gi,
+ /bearer\s+[a-zA-Z0-9_.-]{20,}/gi,
+ // Auth tokens in URLs
+ /token=[a-zA-Z0-9_.-]{10,}/gi,
+ /auth=[a-zA-Z0-9_.-]{10,}/gi,
+ // Email addresses — deliberately broad; will also redact addresses the user
+ // typed into the task description itself (acceptable over-redaction).
+ /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g,
+];
+
+/**
+ * Replace the user's home directory (both slash styles) with '~' and any
+ * bare /Users/<name>, \Users\<name>, /home/<name> occurrences with a
+ * [REDACTED] placeholder.
+ */
+function redactPaths(text: string): string {
+  const homeFwd = os.homedir().replace(/\\/g, '/');
+  const homeBack = os.homedir().replace(/\//g, '\\');
+  let out = text
+    .replace(new RegExp(escapeRegex(homeFwd), 'gi'), '~')
+    .replace(new RegExp(escapeRegex(homeBack), 'gi'), '~');
+  const user = os.userInfo().username;
+  // Very short usernames ("ab") are too ambiguous to scrub safely.
+  if (user.length > 2) {
+    const u = escapeRegex(user);
+    out = out
+      .replace(new RegExp(`/Users/${u}`, 'gi'), '/Users/[REDACTED]')
+      .replace(new RegExp(`\\\\Users\\\\${u}`, 'gi'), '\\Users\\[REDACTED]')
+      .replace(new RegExp(`/home/${u}`, 'gi'), '/home/[REDACTED]');
+  }
+  return out;
+}
+
+/** Backslash-escape every RegExp metacharacter so the input matches literally. */
+function escapeRegex(raw: string): string {
+  return raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
+
+/** Scrub API keys, tokens and emails, then normalize user-identifying paths. */
+function redactSensitive(text: string): string {
+  const scrubbed = SENSITIVE_PATTERNS.reduce(
+    (acc, pattern) => acc.replace(pattern, '[REDACTED]'),
+    text,
+  );
+  return redactPaths(scrubbed);
+}
+
+/** Redact a step entry — strips typed text, clipboard, actionParams with sensitive data */
+function redactStep(raw: Record): RedactedStep {
+ // Allow-list copy: only fields named here are carried over. Anything not
+ // listed (typed text, clipboard contents, ...) never reaches the report.
+ const step: RedactedStep = {
+ stepIndex: raw.stepIndex as number ?? 0,
+ timestamp: raw.timestamp as string ?? '',
+ layer: raw.layer as string | number ?? '',
+ actionType: raw.actionType as string ?? '',
+ result: raw.result as string ?? '',
+ };
+
+ if (raw.durationMs !== undefined) step.durationMs = raw.durationMs as number;
+ // Error text can embed keys/paths, so it goes through the scrubber.
+ if (raw.error) step.error = redactSensitive(String(raw.error));
+ if (raw.verification) {
+ const v = raw.verification as Record;
+ step.verification = {
+ method: v.method as string ?? 'unknown',
+ verified: v.verified as boolean ?? false,
+ };
+ }
+
+ // Deliberately omit: actionParams (may contain typed text, selectors with user data),
+ // llmReasoning (may reference user content), uiStateSummary (may contain screen text)
+
+ return step;
+}
+
+// ─── Report Building ────────────────────────────────────────
+
+/**
+ * Parse a JSONL task log into entries. Unparseable or falsy lines are
+ * dropped; a missing/unreadable file yields an empty array.
+ */
+function readTaskLog(logPath: string): Record[] {
+  let content: string;
+  try {
+    content = fs.readFileSync(logPath, 'utf-8');
+  } catch {
+    return [];
+  }
+  const entries: Record[] = [];
+  for (const line of content.trim().split('\n')) {
+    try {
+      const parsed = JSON.parse(line);
+      if (parsed) entries.push(parsed); // matches original filter(Boolean)
+    } catch {
+      // skip malformed line
+    }
+  }
+  return entries;
+}
+
+/** Get the most recent task log file path, or null when none exist. */
+function getMostRecentLog(): string | null {
+  // Delegate to getRecentLogs so both helpers share a single listing/sorting
+  // implementation instead of duplicating it (function declarations hoist,
+  // so the later definition is already in scope here).
+  const [latest] = getRecentLogs(1);
+  return latest ?? null;
+}
+
+/**
+ * Get the `count` most recent log files, newest first.
+ * Ordering relies on lexicographic filename sort — assumes log filenames
+ * embed a sortable timestamp prefix (TODO confirm against the logger's
+ * naming scheme). Returns [] when the log directory is missing or unreadable.
+ */
+function getRecentLogs(count: number): string[] {
+ try {
+ return fs.readdirSync(LOG_DIR)
+ .filter(f => f.endsWith('.jsonl'))
+ .sort()
+ .reverse()
+ .slice(0, count)
+ .map(f => path.join(LOG_DIR, f));
+ } catch {
+ return [];
+ }
+}
+
+/**
+ * Build a redacted report from a task log.
+ * Falls back to the most recent log when logPath is omitted; when no log
+ * exists at all, the report is still produced with an empty steps array.
+ */
+export function buildReport(logPath?: string, userNote?: string): ErrorReport {
+ const targetPath = logPath || getMostRecentLog();
+ const entries = targetPath ? readTaskLog(targetPath) : [];
+
+ // Separate summary from steps
+ const summary = entries.find(e => e._type === 'task_summary') as Record | undefined;
+ const steps = entries.filter(e => e._type !== 'task_summary');
+
+ // Non-cryptographic id: base36 timestamp + 4 random chars. Collisions are
+ // only a labeling concern, not a security one.
+ const reportId = `rpt_${Date.now().toString(36)}_${Math.random().toString(36).substring(2, 6)}`;
+
+ const report: ErrorReport = {
+ reportId,
+ timestamp: new Date().toISOString(),
+ version: getVersion(),
+ system: {
+ platform: process.platform,
+ arch: process.arch,
+ nodeVersion: process.version,
+ osRelease: os.release(),
+ },
+ steps: steps.map(redactStep),
+ };
+
+ // Task metadata is attached only when the log recorded a summary entry.
+ if (summary) {
+ report.task = {
+ description: redactSensitive(String(summary.task || '')),
+ status: String(summary.status || 'unknown'),
+ totalSteps: summary.totalSteps as number ?? 0,
+ durationMs: summary.durationMs as number ?? 0,
+ layersUsed: summary.layersUsed as (string | number)[] ?? [],
+ llmCallCount: summary.llmCallCount as number ?? 0,
+ };
+ }
+
+ if (userNote) {
+ report.userNote = userNote;
+ }
+
+ // Check if the last step had an error — surfaced as top-level errorContext.
+ const lastStep = steps[steps.length - 1];
+ if (lastStep?.error) {
+ report.errorContext = redactSensitive(String(lastStep.error));
+ }
+
+ return report;
+}
+
+// ─── Submission ─────────────────────────────────────────────
+
+/**
+ * Submit a report to the backend.
+ * Never throws: network failures and non-2xx responses are returned as
+ * { success: false, error }. The request aborts after 15 seconds.
+ */
+export async function submitReport(report: ErrorReport): Promise<{ success: boolean; reportId: string; error?: string }> {
+ try {
+ const resp = await fetch(REPORT_ENDPOINT, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(report),
+ signal: AbortSignal.timeout(15000),
+ });
+
+ if (resp.ok) {
+ // Prefer the server-assigned id; fall back to our local one if the
+ // response body is empty or unparseable.
+ const data = await resp.json().catch(() => ({})) as Record;
+ return { success: true, reportId: data.reportId as string ?? report.reportId };
+ }
+
+ return { success: false, reportId: report.reportId, error: `Server responded ${resp.status}` };
+ } catch (err: unknown) {
+ const message = err instanceof Error ? err.message : 'Network error';
+ return { success: false, reportId: report.reportId, error: message };
+ }
+}
+
+/** Persist the report as pretty-printed JSON under REPORTS_DIR; returns the path. */
+export function saveReportLocally(report: ErrorReport): string {
+  fs.mkdirSync(REPORTS_DIR, { recursive: true });
+  const target = path.join(REPORTS_DIR, `${report.reportId}.json`);
+  fs.writeFileSync(target, JSON.stringify(report, null, 2));
+  return target;
+}
+
+// ─── Interactive CLI ────────────────────────────────────────
+
+/**
+ * Interactive report flow — shows what will be sent, asks for confirmation.
+ * Declining to send (or a failed upload) saves the report locally instead,
+ * so the user's answer never silently discards the data.
+ */
+export async function interactiveReport(): Promise {
+ const logPath = getMostRecentLog();
+
+ if (!logPath) {
+ console.log('\n No task logs found. Run a task first, then try again.\n');
+ return;
+ }
+
+ const logName = path.basename(logPath);
+ console.log(`\n Most recent task log: ${logName}`);
+
+ // Show available logs (display only — the report is always built from the
+ // latest log; the list is informational).
+ const recentLogs = getRecentLogs(5);
+ if (recentLogs.length > 1) {
+ console.log('\n Recent logs:');
+ recentLogs.forEach((l, i) => {
+ const entries = readTaskLog(l);
+ const summary = entries.find(e => e._type === 'task_summary') as Record | undefined;
+ const task = summary?.task ? redactSensitive(String(summary.task)).substring(0, 60) : '(no summary)';
+ const status = summary?.status ?? 'unknown';
+ const marker = i === 0 ? ' [latest]' : '';
+ console.log(` ${i + 1}. ${path.basename(l)} — ${status} — "${task}"${marker}`);
+ });
+ }
+
+ // Build the report
+ const report = buildReport(logPath);
+
+ // Show preview — everything printed here is exactly what would be sent.
+ console.log('\n ── Report Preview ──────────────────────────────');
+ console.log(` Report ID: ${report.reportId}`);
+ console.log(` Version: ${report.version}`);
+ console.log(` Platform: ${report.system.platform}/${report.system.arch}`);
+ console.log(` Node: ${report.system.nodeVersion}`);
+ if (report.task) {
+ console.log(` Task: "${report.task.description}"`);
+ console.log(` Status: ${report.task.status}`);
+ console.log(` Steps: ${report.task.totalSteps}`);
+ console.log(` Duration: ${(report.task.durationMs / 1000).toFixed(1)}s`);
+ console.log(` LLM Calls: ${report.task.llmCallCount}`);
+ }
+ console.log(` Step data: ${report.steps.length} entries (redacted)`);
+ if (report.errorContext) {
+ console.log(` Error: ${report.errorContext}`);
+ }
+ console.log(' ────────────────────────────────────────────────');
+ console.log('\n Privacy: No typed text, clipboard data, screenshots,');
+ console.log(' or personal file paths are included.');
+
+ // Ask for optional note
+ const rl = readline.createInterface({
+ input: process.stdin,
+ output: process.stdout,
+ });
+
+ const note = await new Promise((resolve) => {
+ rl.question('\n Add a note (optional, press Enter to skip): ', resolve);
+ });
+
+ if (note.trim()) {
+ report.userNote = note.trim();
+ }
+
+ // Confirm
+ const confirm = await new Promise((resolve) => {
+ rl.question(' Send this report? (y/N) ', resolve);
+ });
+ rl.close();
+
+ // Default is NO — only an explicit y/yes sends anything over the network.
+ if (confirm.toLowerCase() !== 'y' && confirm.toLowerCase() !== 'yes') {
+ // Save locally as fallback
+ const savedPath = saveReportLocally(report);
+ console.log(`\n Report saved locally: ${savedPath}`);
+ console.log(' You can manually share this file if needed.\n');
+ return;
+ }
+
+ // Submit
+ console.log('\n Sending report...');
+ const result = await submitReport(report);
+
+ if (result.success) {
+ console.log(` Report sent. ID: ${result.reportId}`);
+ console.log(' Thank you — this helps us make clawdcursor better.\n');
+ } else {
+ // Save locally on failure
+ const savedPath = saveReportLocally(report);
+ console.log(` Failed to send: ${result.error}`);
+ console.log(` Report saved locally: ${savedPath}`);
+ console.log(' You can manually share this file if needed.\n');
+ }
+}
+
+// ─── Server API Helpers ─────────────────────────────────────
+
+/**
+ * Build and submit a report programmatically (for REST API).
+ * Path resolution order: explicit logPath → logIndex into the recent-log
+ * list → most recent log. NOTE(review): an out-of-range logIndex silently
+ * falls through to the most recent log rather than erroring — confirm that
+ * is the intended API behavior. On submit failure the report is still
+ * persisted locally via saveReportLocally.
+ */
+export async function apiSubmitReport(opts: {
+ logPath?: string;
+ userNote?: string;
+ logIndex?: number;
+}): Promise<{ success: boolean; reportId: string; preview?: ErrorReport; error?: string }> {
+ let targetPath = opts.logPath;
+
+ if (!targetPath && opts.logIndex !== undefined) {
+ const logs = getRecentLogs(opts.logIndex + 1);
+ targetPath = logs[opts.logIndex];
+ }
+
+ if (!targetPath) {
+ targetPath = getMostRecentLog() ?? undefined;
+ }
+
+ if (!targetPath) {
+ return { success: false, reportId: '', error: 'No task logs found' };
+ }
+
+ const report = buildReport(targetPath, opts.userNote);
+ const result = await submitReport(report);
+
+ if (!result.success) {
+ saveReportLocally(report);
+ }
+
+ return { ...result, preview: report };
+}
diff --git a/src/safe-json.ts b/src/safe-json.ts
new file mode 100644
index 0000000..5877d9e
--- /dev/null
+++ b/src/safe-json.ts
@@ -0,0 +1,81 @@
+/**
+ * Safe JSON extraction from LLM responses.
+ *
+ * LLMs often return JSON wrapped in markdown fences, explanation text,
+ * or multiple JSON fragments. The greedy regex /\{[\s\S]*\}/ can match
+ * from the first { to the LAST }, capturing invalid JSON. This module
+ * uses balanced-brace counting for reliable extraction.
+ */
+
+/**
+ * Extract the first valid JSON object from a string.
+ * Uses balanced-brace matching instead of greedy regex. If a brace-balanced
+ * candidate fails JSON.parse (e.g. a stray '{' in surrounding prose),
+ * scanning resumes at the next '{' instead of giving up — the old code
+ * returned null here despite its "keep scanning" comment.
+ */
+export function extractJsonObject(text: string): unknown | null {
+  let from = 0;
+  while (true) {
+    const start = text.indexOf('{', from);
+    if (start === -1) return null;
+
+    let depth = 0;
+    let inString = false;
+    let escape = false;
+    let end = -1;
+
+    for (let i = start; i < text.length; i++) {
+      const ch = text[i];
+      if (escape) { escape = false; continue; }
+      if (ch === '\\' && inString) { escape = true; continue; }
+      if (ch === '"') { inString = !inString; continue; }
+      if (inString) continue;
+      if (ch === '{') depth++;
+      else if (ch === '}') {
+        depth--;
+        if (depth === 0) { end = i; break; }
+      }
+    }
+
+    // Braces never balanced before end of text — no further candidates exist.
+    if (end === -1) return null;
+
+    try {
+      return JSON.parse(text.substring(start, end + 1));
+    } catch {
+      // Brace-matched but invalid JSON — keep scanning from the next '{'.
+      from = start + 1;
+    }
+  }
+}
+
+/**
+ * Extract the first valid JSON array from a string.
+ * Balanced-bracket matching; mirrors extractJsonObject — an invalid
+ * candidate resumes scanning at the next '[' instead of returning null.
+ */
+export function extractJsonArray(text: string): unknown[] | null {
+  let from = 0;
+  while (true) {
+    const start = text.indexOf('[', from);
+    if (start === -1) return null;
+
+    let depth = 0;
+    let inString = false;
+    let escape = false;
+    let end = -1;
+
+    for (let i = start; i < text.length; i++) {
+      const ch = text[i];
+      if (escape) { escape = false; continue; }
+      if (ch === '\\' && inString) { escape = true; continue; }
+      if (ch === '"') { inString = !inString; continue; }
+      if (inString) continue;
+      if (ch === '[') depth++;
+      else if (ch === ']') {
+        depth--;
+        if (depth === 0) { end = i; break; }
+      }
+    }
+
+    if (end === -1) return null; // brackets never balanced — nothing left to try
+
+    try {
+      const parsed = JSON.parse(text.substring(start, end + 1));
+      if (Array.isArray(parsed)) return parsed;
+      from = start + 1; // valid JSON but not an array — keep scanning
+    } catch {
+      from = start + 1; // invalid candidate — keep scanning from the next '['
+    }
+  }
+}
diff --git a/src/safety.ts b/src/safety.ts
index a686d86..ca70f02 100644
--- a/src/safety.ts
+++ b/src/safety.ts
@@ -38,9 +38,15 @@ export class SafetyLayer {
}
}
- // Typing is preview tier (user can see what's being typed)
+ // Typing in a terminal context is Confirm tier — keystrokes could execute
+ // shell commands. Typing in other contexts is Preview (user can see output).
if ('text' in action && action.kind === 'type') {
- return SafetyTier.Preview;
+ const terminalProcesses = ['cmd', 'powershell', 'pwsh', 'bash', 'zsh', 'sh',
+ 'wt', 'windowsterminal', 'terminal', 'iterm2', 'alacritty', 'hyper', 'conhost',
+ 'mintty', 'git bash', 'wsl'];
+ const descLower = description.toLowerCase();
+ // FIX: match single-word process names as whole tokens — plain substring
+ // matching made short entries like 'sh' or 'wt' false-positive on words
+ // such as "shopping" or "software". Multi-word entries (e.g. 'git bash')
+ // still use substring search since they cannot be a single token.
+ const tokens = new Set(descLower.split(/[^a-z0-9]+/));
+ const isTerminalContext = terminalProcesses.some(t =>
+ t.includes(' ') ? descLower.includes(t) : tokens.has(t));
+ return isTerminalContext ? SafetyTier.Confirm : SafetyTier.Preview;
}
// Everything else is auto
diff --git a/src/server.ts b/src/server.ts
index c44628a..1d5fb7e 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -14,19 +14,61 @@
* GET /favorites — list saved favorite commands
* POST /favorites — add a command to favorites
* DELETE /favorites — remove a command from favorites
+ * POST /report — submit an error report (opt-in)
*/
import express from 'express';
-import { readFileSync, writeFileSync, existsSync } from 'fs';
+import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
import { join } from 'path';
+import { randomBytes } from 'crypto';
import { z } from 'zod';
import type { ClawdConfig } from './types';
import { Agent } from './agent';
import { mountDashboard } from './dashboard';
import { VERSION } from './version';
+import { DATA_DIR } from './paths';
+import { e } from './format';
-// Favorites persistence
-const FAVORITES_PATH = join(process.cwd(), '.clawd-favorites.json');
+// Favorites persistence — stored in ~/.clawdcursor/ so it persists across cwd changes
+const FAVORITES_PATH = join(DATA_DIR, '.clawdcursor-favorites.json');
+
+// ── Bearer token auth ─────────────────────────────────────────────────────────
+// Generated once at startup, persisted to ~/.clawdcursor/token so the
+// dashboard and external callers can read it. Rotates on every fresh start.
+const TOKEN_PATH = join(DATA_DIR, 'token');
+
+// Create a fresh bearer token and best-effort persist it for the dashboard.
+function generateToken(): string {
+ // 32 random bytes → 64 hex chars, from crypto.randomBytes (CSPRNG).
+ const token = randomBytes(32).toString('hex');
+ try {
+ if (!existsSync(DATA_DIR)) mkdirSync(DATA_DIR, { recursive: true });
+ // mode 0o600 restricts the file to the owner. NOTE(review): POSIX file
+ // modes are not enforced the same way on Windows — confirm acceptable there.
+ writeFileSync(TOKEN_PATH, token, { encoding: 'utf-8', mode: 0o600 });
+ } catch (tokenErr) {
+ // The in-memory token still works; only on-disk persistence failed.
+ console.warn(`${e('⚠', '[WARN]')} Could not write auth token file:`, (tokenErr as Error).message);
+ }
+ return token;
+}
+
+// Token is generated lazily (only when createServer is first called, i.e. `start`).
+// This prevents CLI commands like `stop`, `task`, `consent` from overwriting the
+// running server's token file on import.
+export let SERVER_TOKEN = '';
+
+/**
+ * Initialize the auth token. Called once from createServer().
+ * Overwrites ~/.clawdcursor/token, so the token rotates on every server start
+ * and stale tokens from previous runs stop working.
+ */
+export function initServerToken(): string {
+ SERVER_TOKEN = generateToken();
+ return SERVER_TOKEN;
+}
+
+/**
+ * Middleware: require Authorization: Bearer on mutating endpoints.
+ * Before initServerToken() runs, SERVER_TOKEN is '' — the `!token` guard
+ * below means an empty bearer is rejected then too, so there is no
+ * pre-initialization bypass.
+ * NOTE(review): `!==` is not a constant-time comparison; for a
+ * localhost-only token this is likely acceptable, but consider
+ * crypto.timingSafeEqual if the server is ever exposed beyond loopback.
+ */
+export function requireAuth(req: express.Request, res: express.Response, next: express.NextFunction): void {
+ const authHeader = req.headers['authorization'] || '';
+ const token = authHeader.startsWith('Bearer ') ? authHeader.slice(7) : '';
+ if (!token || token !== SERVER_TOKEN) {
+ res.status(401).json({ error: 'Unauthorized — include Authorization: Bearer header. Token is at ~/.clawdcursor/token' });
+ return;
+ }
+ next();
+}
function loadFavorites(): string[] {
try {
@@ -35,8 +77,8 @@ function loadFavorites(): string[] {
const parsed = JSON.parse(data);
if (Array.isArray(parsed)) return parsed;
}
- } catch (e) {
- console.warn('⚠ Failed to load favorites:', (e as Error).message);
+ } catch (favErr) {
+ console.warn(`${e('⚠', '[WARN]')} Failed to load favorites:`, (favErr as Error).message);
}
return [];
}
@@ -44,8 +86,8 @@ function loadFavorites(): string[] {
function saveFavorites(favorites: string[]): void {
try {
writeFileSync(FAVORITES_PATH, JSON.stringify(favorites, null, 2), 'utf-8');
- } catch (e) {
- console.error('❌ Failed to save favorites:', (e as Error).message);
+ } catch (saveErr) {
+ console.error(`${e('❌', '[ERR]')} Failed to save favorites:`, (saveErr as Error).message);
}
}
@@ -59,8 +101,14 @@ interface LogEntry {
const MAX_LOGS = 200;
const logBuffer: LogEntry[] = [];
+const MAX_LOG_MSG_LEN = 500;
+
function addLog(level: LogEntry['level'], message: string): void {
- logBuffer.push({ timestamp: Date.now(), level, message });
+ // Truncate oversized messages (e.g. full LLM responses) to keep the buffer lean
+ const truncated = message.length > MAX_LOG_MSG_LEN
+ ? message.slice(0, MAX_LOG_MSG_LEN) + '…'
+ : message;
+ logBuffer.push({ timestamp: Date.now(), level, message: truncated });
if (logBuffer.length > MAX_LOGS) {
logBuffer.splice(0, logBuffer.length - MAX_LOGS);
}
@@ -117,12 +165,51 @@ const confirmSchema = z.object({
});
export function createServer(agent: Agent, config: ClawdConfig): express.Express {
+ // Generate auth token (only on actual server start, not on module import)
+ initServerToken();
+
// Hook console to capture logs
hookConsole();
const app = express();
app.use(express.json());
+ // ── CORS: block browser-origin requests to prevent SSRF/localhost-bypass attacks ──
+ // The dashboard at GET / is exempt (browser tab). All API routes require:
+ // 1. Non-browser origin (no Origin header), OR same origin, OR explicit allowlist
+ // 2. Bearer token (on mutating endpoints)
+ app.use((req, res, next) => {
+ const origin = req.headers['origin'];
+ // Allow: no origin (curl, CLI, direct), or localhost origins
+ const allowedOrigins = [
+ 'http://localhost:3847',
+ 'http://127.0.0.1:3847',
+ ];
+ if (origin) {
+ if (allowedOrigins.includes(origin)) {
+ res.setHeader('Access-Control-Allow-Origin', origin);
+ res.setHeader('Access-Control-Allow-Methods', 'GET,POST,DELETE,OPTIONS');
+ res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Authorization');
+ res.setHeader('Vary', 'Origin');
+ } else {
+ // Cross-origin browser request — block it
+ if (req.method === 'OPTIONS') { res.status(204).end(); return; }
+ res.status(403).json({ error: 'Cross-origin requests not allowed' });
+ return;
+ }
+ }
+ if (req.method === 'OPTIONS') { res.status(204).end(); return; }
+ next();
+ });
+
+ // Handle malformed JSON gracefully (e.g. control characters from terminal)
+ app.use((err: any, _req: any, res: any, next: any) => {
+ if (err.type === 'entity.parse.failed') {
+ return res.status(400).json({ error: 'Invalid JSON in request body' });
+ }
+ next(err);
+ });
+
// Mount the web dashboard at GET /
mountDashboard(app);
@@ -134,7 +221,7 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
});
// Add a favorite
- app.post('/favorites', (req, res) => {
+ app.post('/favorites', requireAuth, (req, res) => {
const parsed = taskSchema.safeParse(req.body);
if (!parsed.success) {
return res.status(400).json({ error: 'Missing "task" string in body' });
@@ -149,7 +236,7 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
});
// Remove a favorite
- app.delete('/favorites', (req, res) => {
+ app.delete('/favorites', requireAuth, (req, res) => {
const parsed = taskSchema.safeParse(req.body);
if (!parsed.success) {
return res.status(400).json({ error: 'Missing "task" string in body' });
@@ -166,7 +253,7 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
});
// Submit a task
- app.post('/task', async (req, res) => {
+ app.post('/task', requireAuth, async (req, res) => {
const parsed = taskSchema.safeParse(req.body);
if (!parsed.success) {
return res.status(400).json({ error: 'Missing "task" in body' });
@@ -181,13 +268,13 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
});
}
- console.log(`\n📨 New task received: ${task}`);
+ console.log(`\n${e('📨', '>')} New task received: ${task}`);
// Execute async — respond immediately
agent.executeTask(task).then(result => {
- console.log(`\n📋 Task result:`, JSON.stringify(result, null, 2));
+ console.log(`\n${e('📋', '>')} Task result:`, JSON.stringify(result, null, 2));
}).catch(err => {
- console.error(`\n❌ Task execution failed:`, err);
+ console.error(`\n${e('❌', '[ERR]')} Task execution failed:`, err);
});
res.json({ accepted: true, task });
@@ -198,8 +285,30 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
res.json(agent.getState());
});
+ // Task logs — structured JSONL logs for every task
+ app.get('/task-logs', (_req, res) => {
+ try {
+ const logger = (agent as any).logger;
+ if (!logger) return res.json([]);
+ res.json(logger.getRecentSummaries(50));
+ } catch { res.json([]); }
+ });
+
+ app.get('/task-logs/current', (_req, res) => {
+ try {
+ const logger = (agent as any).logger;
+ const logPath = logger?.getCurrentLogPath();
+ if (!logPath || !require('fs').existsSync(logPath)) {
+ return res.status(404).json({ error: 'No current log' });
+ }
+ const content = require('fs').readFileSync(logPath, 'utf-8');
+ const entries = content.trim().split('\n').map((l: string) => { try { return JSON.parse(l); } catch { return null; } }).filter(Boolean);
+ res.json(entries);
+ } catch { res.status(500).json({ error: 'Failed to read log' }); }
+ });
+
// Approve or reject a pending confirmation
- app.post('/confirm', (req, res) => {
+ app.post('/confirm', requireAuth, (req, res) => {
const parsed = confirmSchema.safeParse(req.body);
if (!parsed.success) {
return res.status(400).json({ error: 'Missing "approved" boolean in body' });
@@ -221,7 +330,7 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
});
// Abort current task
- app.post('/abort', (req, res) => {
+ app.post('/abort', requireAuth, (req, res) => {
agent.abort();
res.json({ aborted: true });
});
@@ -231,13 +340,106 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
res.json(logBuffer);
});
+ // Screenshot — returns PNG image of current screen
+ app.get('/screenshot', requireAuth, async (_req, res) => {
+ try {
+ const desktop = agent.getDesktop();
+ const frame = await desktop.captureForLLM();
+ res.set('Content-Type', 'image/png');
+ res.set('X-Scale-Factor', String(frame.scaleFactor));
+ res.set('X-Screen-Width', String(frame.llmWidth));
+ res.set('X-Screen-Height', String(frame.llmHeight));
+ res.send(frame.buffer);
+ } catch (err) {
+ res.status(500).json({ error: `Screenshot failed: ${(err as Error).message}` });
+ }
+ });
+
+ // Direct action execution — lets an external brain (e.g. Claude Code) drive the agent
+ // Coordinates are in LLM-space (1280px wide) — auto-scaled to real screen
+ app.post('/action', requireAuth, async (req, res) => {
+ try {
+ const { action, x, y, text, key, button, scrollDelta } = req.body;
+ if (!action) return res.status(400).json({ error: 'Missing "action" field' });
+
+ const desktop = agent.getDesktop();
+ const screen = desktop.getScreenSize();
+ const LLM_WIDTH = 1280;
+ const scale = screen.width > LLM_WIDTH ? screen.width / LLM_WIDTH : 1;
+
+ const realX = x != null ? Math.round(Number(x) * scale) : 0;
+ const realY = y != null ? Math.round(Number(y) * scale) : 0;
+
+ switch (action) {
+ case 'click':
+ if (x == null || y == null) return res.status(400).json({ error: 'click requires x, y' });
+ await desktop.executeMouseAction({ kind: button === 'right' ? 'right_click' : 'click', x: realX, y: realY });
+ res.json({ ok: true, action: 'click', x, y, realX, realY });
+ break;
+ case 'double_click':
+ if (x == null || y == null) return res.status(400).json({ error: 'double_click requires x, y' });
+ await desktop.executeMouseAction({ kind: 'double_click', x: realX, y: realY });
+ res.json({ ok: true, action: 'double_click', x, y, realX, realY });
+ break;
+ case 'type':
+ if (!text) return res.status(400).json({ error: 'type requires text' });
+ await desktop.executeKeyboardAction({ kind: 'type', text });
+ res.json({ ok: true, action: 'type', length: text.length });
+ break;
+ case 'key':
+ if (!key) return res.status(400).json({ error: 'key requires key' });
+ await desktop.executeKeyboardAction({ kind: 'key_press', key });
+ res.json({ ok: true, action: 'key', key });
+ break;
+ case 'scroll':
+ if (x == null || y == null) return res.status(400).json({ error: 'scroll requires x, y' });
+ await desktop.executeMouseAction({ kind: 'scroll', x: realX, y: realY, scrollDelta: Number(scrollDelta || 3) });
+ res.json({ ok: true, action: 'scroll', x, y, realX, realY, scrollDelta: scrollDelta || 3 });
+ break;
+ case 'move':
+ if (x == null || y == null) return res.status(400).json({ error: 'move requires x, y' });
+ await desktop.executeMouseAction({ kind: 'move', x: realX, y: realY });
+ res.json({ ok: true, action: 'move', x, y, realX, realY });
+ break;
+ default:
+ res.status(400).json({ error: `Unknown action: ${action}` });
+ }
+ } catch (err) {
+ res.status(500).json({ error: `Action failed: ${(err as Error).message}` });
+ }
+ });
+
+ // Error report — opt-in submission of redacted task logs
+ app.post('/report', requireAuth, async (req, res) => {
+ try {
+ const { apiSubmitReport } = await import('./report');
+ const { userNote, logIndex } = req.body || {};
+ const result = await apiSubmitReport({
+ userNote: typeof userNote === 'string' ? userNote : undefined,
+ logIndex: typeof logIndex === 'number' ? logIndex : undefined,
+ });
+ if (result.success) {
+ res.json({ success: true, reportId: result.reportId, preview: result.preview });
+ } else {
+ res.status(result.error === 'No task logs found' ? 404 : 502).json({
+ success: false,
+ error: result.error,
+ reportId: result.reportId,
+ preview: result.preview,
+ });
+ }
+ } catch (err) {
+ res.status(500).json({ error: `Report failed: ${(err as Error).message}` });
+ }
+ });
+
// Health check
app.get('/health', (req, res) => {
res.json({ status: 'ok', version: VERSION });
});
// Graceful shutdown (localhost only)
- app.post('/stop', (req, res) => {
+ app.post('/stop', requireAuth, (req, res) => {
const ip = req.ip || req.socket.remoteAddress || '';
const isLocal = ip === '127.0.0.1' || ip === '::1' || ip === '::ffff:127.0.0.1';
if (!isLocal) {
@@ -249,7 +451,7 @@ export function createServer(agent: Agent, config: ClawdConfig): express.Express
res.writeHead(200, { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(body) });
res.end(body, () => {
// Response fully flushed — now shut down
- console.log('\n👋 Shutting down (stop command received)...');
+ console.log(`\n${e('👋', '--')} Shutting down (stop command received)...`);
agent.disconnect();
// Force exit after short delay (covers Windows edge cases)
setTimeout(() => process.exit(0), 500);
diff --git a/src/shortcuts.ts b/src/shortcuts.ts
index d7eca2e..294002d 100644
--- a/src/shortcuts.ts
+++ b/src/shortcuts.ts
@@ -69,7 +69,7 @@ export const SHORTCUTS: ShortcutDefinition[] = [
shortcut('undo', 'editing', 'Undo previous action', 'undo', ['undo'], { default: `${MOD}+z`, darwin: `${CMD}+z` }),
shortcut('redo', 'editing', 'Redo previous action', 'redo', ['redo'], { default: `${MOD}+y`, darwin: `${CMD}+Shift+z` }),
shortcut('select-all', 'editing', 'Select all content', 'select all', ['select all'], { default: `${MOD}+a`, darwin: `${CMD}+a` }),
- shortcut('find', 'editing', 'Find in current context', 'find', ['find', 'search in page', 'find text'], { default: `${MOD}+f`, darwin: `${CMD}+f` }),
+ shortcut('find', 'editing', 'Find in current context', 'find', ['search in page', 'find in page', 'find on page', 'find text'], { default: `${MOD}+f`, darwin: `${CMD}+f` }),
shortcut('replace', 'editing', 'Find and replace', 'replace', ['replace', 'find and replace'], { default: `${MOD}+h`, darwin: `${CMD}+Option+f` }),
// Social media (context-aware: these are risky one-key shortcuts)
@@ -111,6 +111,86 @@ export const SHORTCUTS: ShortcutDefinition[] = [
shortcut('delete', 'quick', 'Delete selection', 'press delete', ['delete', 'press delete'], { default: 'Delete' }),
shortcut('system-search', 'quick', 'Open system search', 'system search', ['system search', 'open search', 'search apps'], { default: 'Super+s', darwin: `${CMD}+Space` }),
shortcut('lock-screen', 'quick', 'Lock current session', 'lock screen', ['lock screen'], { default: 'Super+l', darwin: 'Control+Super+q' }),
+
+ // ─── Outlook shortcuts ──────────────────────────────────────────
+ // New Outlook (olk) — same as Outlook on the web with some desktop overrides
+ shortcut('outlook-new-message', 'editing', 'Create new email message', 'new email',
+ ['new email', 'new message', 'compose email', 'compose message', 'write email', 'create email', 'new mail'],
+ { default: `${MOD}+n`, darwin: `${CMD}+n` }, ['outlook', 'olk']),
+ shortcut('outlook-send', 'editing', 'Send email message', 'send email',
+ ['send email', 'send message', 'send mail', 'send this email', 'send it'],
+ { default: `${MOD}+Return`, darwin: `${CMD}+Return` }, ['outlook', 'olk']),
+ shortcut('outlook-reply', 'editing', 'Reply to email', 'reply',
+ ['reply', 'reply to email', 'reply to message', 'reply to this'],
+ { default: `${MOD}+r`, darwin: `${CMD}+r` }, ['outlook', 'olk']),
+ shortcut('outlook-reply-all', 'editing', 'Reply all to email', 'reply all',
+ ['reply all', 'reply to all', 'respond to all'],
+ { default: `${MOD}+Shift+r`, darwin: `${CMD}+Shift+r` }, ['outlook', 'olk']),
+ shortcut('outlook-forward', 'editing', 'Forward email', 'forward email',
+ ['forward', 'forward email', 'forward message', 'forward this'],
+ { default: `${MOD}+f`, darwin: `${CMD}+f` }, ['outlook', 'olk']),
+ shortcut('outlook-mark-read', 'editing', 'Mark email as read', 'mark as read',
+ ['mark as read', 'mark read'],
+ { default: `${MOD}+q`, darwin: `${CMD}+t` }, ['outlook', 'olk']), // Cmd+Q is reserved by macOS to quit the app; Outlook for Mac uses Cmd+T for mark-as-read
+ shortcut('outlook-mark-unread', 'editing', 'Mark email as unread', 'mark as unread',
+ ['mark as unread', 'mark unread'],
+ { default: `${MOD}+u`, darwin: `${CMD}+u` }, ['outlook', 'olk']),
+ shortcut('outlook-flag', 'editing', 'Flag email for follow up', 'flag email',
+ ['flag', 'flag email', 'flag message', 'flag for follow up'],
+ { default: 'Insert' }, ['outlook', 'olk']),
+ shortcut('outlook-delete', 'editing', 'Delete email', 'delete email',
+ ['delete email', 'delete message', 'delete this email'],
+ { default: 'Delete' }, ['outlook', 'olk']),
+ shortcut('outlook-search', 'editing', 'Search in Outlook', 'search outlook',
+ ['search', 'search email', 'search emails', 'search outlook', 'find email'],
+ { default: `${MOD}+e`, darwin: `${CMD}+e` }, ['outlook', 'olk']),
+ shortcut('outlook-go-calendar', 'navigation', 'Go to Outlook Calendar', 'go to calendar',
+ ['go to calendar', 'open calendar', 'switch to calendar', 'calendar'],
+ { default: `${MOD}+2`, darwin: `${CMD}+2` }, ['outlook', 'olk']),
+ shortcut('outlook-go-mail', 'navigation', 'Go to Outlook Mail', 'go to mail',
+ ['go to mail', 'go to inbox', 'switch to mail', 'open mail', 'inbox'],
+ { default: `${MOD}+1`, darwin: `${CMD}+1` }, ['outlook', 'olk']),
+ shortcut('outlook-go-people', 'navigation', 'Go to Outlook People/Contacts', 'go to contacts',
+ ['go to contacts', 'go to people', 'open contacts', 'address book'],
+ { default: `${MOD}+4`, darwin: `${CMD}+4` }, ['outlook', 'olk']),
+ shortcut('outlook-new-appointment', 'editing', 'Create new calendar appointment', 'new appointment',
+ ['new appointment', 'new event', 'create appointment', 'create event', 'new calendar event'],
+ { default: `${MOD}+n`, darwin: `${CMD}+n` }, ['outlook', 'calendar']),
+ shortcut('outlook-open-message', 'editing', 'Open email in new window', 'open message',
+ ['open message', 'open email', 'open in new window'],
+ { default: 'Shift+Return' }, ['outlook', 'olk']),
+
+ // Classic Outlook (OUTLOOK / WINWORD process) — different shortcuts
+ shortcut('outlook-classic-new-message', 'editing', 'Create new email (classic)', 'new email classic',
+ ['new email', 'new message', 'compose email', 'new mail'],
+ { default: `${MOD}+Shift+m` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-send', 'editing', 'Send email (classic)', 'send email classic',
+ ['send email', 'send message', 'send mail'],
+ { default: 'Alt+s' }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-go-inbox', 'navigation', 'Go to Inbox (classic)', 'go to inbox classic',
+ ['go to inbox', 'switch to inbox'],
+ { default: `${MOD}+Shift+i` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-go-outbox', 'navigation', 'Go to Outbox (classic)', 'go to outbox classic',
+ ['go to outbox', 'switch to outbox'],
+ { default: `${MOD}+Shift+o` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-check-mail', 'editing', 'Check for new messages (classic)', 'check mail classic',
+ ['check mail', 'check messages', 'check for new messages', 'get mail'],
+ { default: `${MOD}+m` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-address-book', 'editing', 'Open Address Book (classic)', 'open address book classic',
+ ['address book', 'open address book'],
+ { default: `${MOD}+Shift+b` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-new-task', 'editing', 'Create new task (classic)', 'new task classic',
+ ['new task', 'create task'],
+ { default: `${MOD}+Shift+k` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-new-appointment', 'editing', 'Create new appointment (classic)', 'new appointment classic',
+ ['new appointment', 'create appointment', 'new event'],
+ { default: `${MOD}+Shift+a` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-new-contact', 'editing', 'Create new contact (classic)', 'new contact classic',
+ ['new contact', 'create contact'],
+ { default: `${MOD}+Shift+c` }, ['winword', 'outlook classic']),
+ shortcut('outlook-classic-new-meeting', 'editing', 'Create new meeting (classic)', 'new meeting classic',
+ ['new meeting', 'create meeting', 'schedule meeting'],
+ { default: `${MOD}+Shift+q` }, ['winword', 'outlook classic']),
];
function shortcut(
diff --git a/src/skill-cache.ts b/src/skill-cache.ts
new file mode 100644
index 0000000..4d32bd8
--- /dev/null
+++ b/src/skill-cache.ts
@@ -0,0 +1,304 @@
+/**
+ * Skill Cache — Layer 2.
+ *
+ * Stores and replays learned task paths. When the same task+app pattern
+ * succeeds 2+ times via OcrReasoner, the action sequence is promoted to
+ * a skill. Future runs skip OCR+LLM and execute directly from cache.
+ *
+ * This is the "growing a11y tree" — each successful interaction teaches
+ * the system a faster path for next time.
+ *
+ * Storage: ~/.clawdcursor/skills.json
+ * Matching: token overlap ratio (no new npm deps)
+ * Promotion: auto after 2 successes for the same task+app pair
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { DATA_DIR } from './paths';
+import { NativeDesktop } from './native-desktop';
+import { AccessibilityBridge } from './accessibility';
+
+const SKILLS_PATH = path.join(DATA_DIR, 'skills.json');
+const MATCH_THRESHOLD = 0.75; // token overlap ratio for fuzzy matching
+const PROMOTE_THRESHOLD = 2; // successes before auto-promotion
+const MAX_SKILLS = 200; // cap to prevent unbounded growth
+const MAX_SKILL_AGE_MS = 30 * 24 * 60 * 60 * 1000; // 30 days unused → evict
+
+// ─── Skill types ─────────────────────────────────────────────────────────────
+
+export interface SkillStep {
+ type: 'click' | 'type' | 'key' | 'scroll' | 'wait';
+ /** For click — pixel coordinates (real screen) */
+ x?: number;
+ y?: number;
+ /** For type — the text (may contain {variable} placeholders) */
+ text?: string;
+ /** For key — the key combo */
+ key?: string;
+ /** For scroll */
+ direction?: 'up' | 'down';
+ amount?: number;
+ /** For wait */
+ ms?: number;
+ /** Human-readable description */
+ description: string;
+}
+
+export interface Skill {
+ id: string;
+ taskPattern: string; // normalized task string
+ appName: string; // process name (e.g. "msedge", "OUTLOOK")
+ steps: SkillStep[];
+ successCount: number;
+ lastUsed: number; // timestamp
+ createdAt: number;
+}
+
+// ─── Token matching utilities ────────────────────────────────────────────────
+
+function tokenize(input: string): string[] {
+ return input
+ .toLowerCase()
+ .replace(/[^a-z0-9\s]/g, '')
+ .split(/\s+/)
+ .filter(t => t.length > 1); // drop single chars
+}
+
+function tokenOverlap(a: string[], b: string[]): number {
+ if (a.length === 0 || b.length === 0) return 0;
+ const setA = new Set(a);
+ const setB = new Set(b);
+ let overlap = 0;
+ for (const token of setA) {
+ if (setB.has(token)) overlap++;
+ }
+ return overlap / Math.max(setA.size, setB.size);
+}
+
+// ─── SkillCache class ────────────────────────────────────────────────────────
+
+export class SkillCache {
+ private skills: Skill[] = [];
+ private loaded = false;
+ /** Pending recordings — tasks that succeeded once but aren't promoted yet */
+ private pending: Map<string, { task: string; app: string; steps: SkillStep[]; count: number }> = new Map();
+
+ /**
+ * Load skills from disk. Safe to call multiple times (no-op after first).
+ */
+ load(): void {
+ if (this.loaded) return;
+ this.loaded = true;
+
+ try {
+ if (fs.existsSync(SKILLS_PATH)) {
+ const raw = fs.readFileSync(SKILLS_PATH, 'utf-8');
+ const data = JSON.parse(raw);
+ this.skills = Array.isArray(data) ? data : [];
+ // Evict stale skills
+ const now = Date.now();
+ this.skills = this.skills.filter(s => (now - s.lastUsed) < MAX_SKILL_AGE_MS);
+ console.log(` 📚 Skill cache loaded: ${this.skills.length} skills`);
+ }
+ } catch {
+ this.skills = [];
+ }
+ }
+
+ /**
+ * Save skills to disk.
+ */
+ private save(): void {
+ try {
+ fs.mkdirSync(path.dirname(SKILLS_PATH), { recursive: true });
+ fs.writeFileSync(SKILLS_PATH, JSON.stringify(this.skills, null, 2), 'utf-8');
+ } catch (err: any) {
+ console.error(` [SkillCache] Save failed: ${err.message}`);
+ }
+ }
+
+ /**
+ * Find the best matching skill for a task+app pair.
+ * Returns null if no skill matches above the threshold.
+ */
+ findSkill(task: string, appName: string): Skill | null {
+ this.load();
+ if (this.skills.length === 0) return null;
+
+ const taskTokens = tokenize(task);
+ const appLower = (appName || '').toLowerCase();
+
+ let bestSkill: Skill | null = null;
+ let bestScore = 0;
+
+ for (const skill of this.skills) {
+ // App must match (case-insensitive)
+ if (skill.appName.toLowerCase() !== appLower) continue;
+
+ const skillTokens = tokenize(skill.taskPattern);
+ const score = tokenOverlap(taskTokens, skillTokens);
+
+ if (score >= MATCH_THRESHOLD && score > bestScore) {
+ bestScore = score;
+ bestSkill = skill;
+ }
+ }
+
+ return bestSkill;
+ }
+
+ /**
+ * Execute a cached skill. Returns 'success' if all steps completed,
+ * 'miss' if an element wasn't found or an action failed.
+ */
+ async executeSkill(
+ skill: Skill,
+ desktop: NativeDesktop,
+ a11y: AccessibilityBridge,
+ ): Promise<'success' | 'miss'> {
+ console.log(` ⚡ Skill cache HIT: "${skill.taskPattern}" (${skill.steps.length} steps, used ${skill.successCount}x)`);
+
+ try {
+ for (const step of skill.steps) {
+ switch (step.type) {
+ case 'click':
+ if (step.x !== undefined && step.y !== undefined) {
+ await desktop.mouseClick(step.x, step.y);
+ a11y.invalidateCache();
+ }
+ break;
+
+ case 'type':
+ if (step.text) {
+ await a11y.writeClipboard(step.text);
+ await new Promise(r => setTimeout(r, 50));
+ await desktop.keyPress('ctrl+v');
+ await new Promise(r => setTimeout(r, 100));
+ a11y.invalidateCache();
+ }
+ break;
+
+ case 'key':
+ if (step.key) {
+ await desktop.keyPress(step.key);
+ a11y.invalidateCache();
+ }
+ break;
+
+ case 'scroll':
+ if (step.x !== undefined && step.y !== undefined) {
+ const delta = step.direction === 'down' ? (step.amount ?? 3) : -(step.amount ?? 3);
+ await desktop.mouseScroll(step.x, step.y, delta);
+ }
+ break;
+
+ case 'wait':
+ await new Promise(r => setTimeout(r, step.ms ?? 500));
+ break;
+ }
+
+ // Brief pause between steps
+ await new Promise(r => setTimeout(r, 200));
+ }
+
+ // Update skill metadata
+ skill.successCount++;
+ skill.lastUsed = Date.now();
+ this.save();
+
+ console.log(` ✅ Skill replayed successfully (${skill.steps.length} steps)`);
+ return 'success';
+ } catch (err: any) {
+ console.log(` ❌ Skill replay failed: ${err.message} — falling through`);
+ // Decrement success count — the UI may have changed
+ skill.successCount = Math.max(0, skill.successCount - 1);
+ if (skill.successCount <= 0) {
+ // Remove broken skill
+ this.skills = this.skills.filter(s => s.id !== skill.id);
+ console.log(` 🗑️ Removed broken skill: "${skill.taskPattern}"`);
+ }
+ this.save();
+ return 'miss';
+ }
+ }
+
+ /**
+ * Record a successful task completion. After PROMOTE_THRESHOLD successes
+ * for the same task+app pair, auto-promote to a cached skill.
+ */
+ recordSuccess(task: string, appName: string, steps: SkillStep[]): void {
+ this.load();
+ if (steps.length === 0) return;
+
+ const key = `${appName.toLowerCase()}::${tokenize(task).sort().join(' ')}`;
+
+ const existing = this.pending.get(key);
+ if (existing) {
+ existing.count++;
+ existing.steps = steps; // use latest steps (may be more refined)
+
+ if (existing.count >= PROMOTE_THRESHOLD) {
+ this.promote(task, appName, steps);
+ this.pending.delete(key);
+ }
+ } else {
+ this.pending.set(key, { task, app: appName, steps, count: 1 });
+ }
+ }
+
+ /**
+ * Promote a task to a cached skill.
+ */
+ private promote(taskPattern: string, appName: string, steps: SkillStep[]): void {
+ // Check if skill already exists
+ const taskTokens = tokenize(taskPattern);
+ const exists = this.skills.some(s => {
+ if (s.appName.toLowerCase() !== appName.toLowerCase()) return false;
+ return tokenOverlap(taskTokens, tokenize(s.taskPattern)) >= 0.9;
+ });
+
+ if (exists) return; // Already have this skill
+
+ // Enforce max skills limit
+ if (this.skills.length >= MAX_SKILLS) {
+ // Evict least-recently-used
+ this.skills.sort((a, b) => a.lastUsed - b.lastUsed);
+ this.skills.shift();
+ }
+
+ const skill: Skill = {
+ id: `skill-${Date.now()}-${Math.random().toString(36).substring(2, 8)}`,
+ taskPattern,
+ appName,
+ steps,
+ successCount: PROMOTE_THRESHOLD,
+ lastUsed: Date.now(),
+ createdAt: Date.now(),
+ };
+
+ this.skills.push(skill);
+ this.save();
+ console.log(` 🎓 Skill promoted: "${taskPattern}" on ${appName} (${steps.length} steps)`);
+ }
+
+ /**
+ * Get skill cache stats.
+ */
+ getStats(): { total: number; pending: number } {
+ this.load();
+ return {
+ total: this.skills.length,
+ pending: this.pending.size,
+ };
+ }
+
+ /**
+ * Clear all skills (for testing or user reset).
+ */
+ clear(): void {
+ this.skills = [];
+ this.pending.clear();
+ try { fs.unlinkSync(SKILLS_PATH); } catch { /* non-fatal */ }
+ }
+}
diff --git a/src/smart-interaction.ts b/src/smart-interaction.ts
index b424a4b..140de0c 100644
--- a/src/smart-interaction.ts
+++ b/src/smart-interaction.ts
@@ -29,7 +29,10 @@ import { CDPDriver } from './cdp-driver';
import { UIDriver } from './ui-driver';
import { AccessibilityBridge } from './accessibility';
import { BrowserLayer } from './browser-layer';
+import { NativeDesktop } from './native-desktop';
+import { extractJsonObject } from './safe-json';
import { PROVIDERS } from './providers';
+import { uiKnowledge } from './ui-knowledge';
import type { PipelineConfig, ProviderProfile } from './providers';
import type { ClawdConfig, StepResult } from './types';
@@ -65,6 +68,8 @@ export interface SmartInteractionResult {
llmCalls: number;
/** Optional description */
description?: string;
+ /** Summary of what was accomplished before falling through — passed to Computer Use as prior context */
+ contextSummary?: string;
}
// ── System prompt for the planning LLM call ──
@@ -97,6 +102,7 @@ AVAILABLE ACTIONS (browser tasks via CDP):
AVAILABLE ACTIONS (native app tasks via UIDriver):
- click: Click by element name. method="name" or "automationId"
- type: Type into element. method="name" or "automationId", text=value
+- typeAtFocus: Type text directly at the currently focused element (no element lookup needed). Use after Tab navigation. text=value to type.
- pressKey: Press keyboard key. target=key combo
- focus: Focus element. method="name"
- select: Select item. method="name"
@@ -116,7 +122,60 @@ RULES:
9. IMPORTANT: UI elements only exist AFTER their parent action. For example, Gmail compose fields only appear AFTER clicking "Compose". Plan sequentially: click to open a dialog/form FIRST, then add a wait step (1000-2000ms), then interact with the new elements.
10. For email compose flows: click Compose → wait 2000ms → click/type each field individually. Do NOT use fillForm unless all fields are visible in the current context.
11. When the context shows an inbox/list view, you MUST click "Compose"/"New"/"Reply" first before trying to fill email fields.
-12. Prefer clicking fields by selector (e.g. [aria-label="To recipients"]) then typing, over fillForm — it's more reliable.`;
+12. Prefer clicking fields by selector (e.g. [aria-label="To recipients"]) then typing, over fillForm — it's more reliable.
+13. CRITICAL: Window titles (e.g. "Mail - John - Outlook", "Inbox - Gmail") are NOT clickable UI elements. Never use a window title as a click target. The window is already focused — proceed directly to interacting with its contents (buttons, fields, etc.).
+14. For Outlook: to compose a new email use pressKey "Control+n" (new message shortcut) rather than trying to click "New Email" button — it's more reliable.`;
+
+/** System prompt for the ReAct-style per-step native task handler */
+const REACT_STEP_SYSTEM_PROMPT = `You are a UI automation agent controlling a native desktop app via accessibility APIs. You operate ONE STEP AT A TIME in a reactive loop.
+
+You will receive:
+- The current TASK
+- The current ACCESSIBILITY TREE (fresh snapshot of visible UI elements)
+- The HISTORY of actions taken so far (with results)
+
+You must decide the SINGLE NEXT action to take. Respond with ONLY valid JSON, no other text.
+
+RESPONSE FORMAT — one of:
+{"action": "click", "target": "Button Name", "method": "name", "reasoning": "Why this click"}
+{"action": "type", "target": "Field Name", "text": "text to type", "method": "name", "reasoning": "Why typing this"}
+{"action": "pressKey", "target": "Control+n", "reasoning": "Keyboard shortcut to open new message"}
+{"action": "focus", "target": "Element Name", "method": "name", "reasoning": "Focus this element"}
+{"action": "select", "target": "Item Name", "method": "name", "reasoning": "Select this item"}
+{"action": "toggle", "target": "Checkbox Name", "method": "name", "reasoning": "Toggle checkbox"}
+{"action": "expand", "target": "Tree Item", "method": "name", "reasoning": "Expand this item"}
+{"action": "menuPath", "target": "File,Save As...", "reasoning": "Navigate menu path"}
+{"action": "wait", "waitMs": 1000, "reasoning": "Wait for UI to update"}
+{"action": "done", "reasoning": "Task is complete because X"}
+{"action": "give_up", "reasoning": "Cannot complete because X"}
+
+AVAILABLE ACTIONS:
+- click: Click by element name/automationId. method="name" or "automationId"
+- type: Type into element. method="name" or "automationId", text=value
+- typeAtFocus: Type text directly at the currently focused element (no element lookup needed). Use after Tab navigation. text=value to type.
+- pressKey: Press keyboard key combo. target=key combo (e.g. "Control+n", "Tab", "Return", "Escape")
+- focus: Focus element. method="name"
+- select: Select item. method="name"
+- toggle: Toggle checkbox. method="name"
+- expand: Expand tree/combo. method="name"
+- menuPath: Navigate menu. target=comma-separated path
+- wait: Wait for UI to settle. waitMs=duration in ms
+- done: Task is complete — explain why in reasoning
+- give_up: Cannot complete — explain why in reasoning
+
+RULES:
+1. Decide ONE action at a time based on what you SEE in the current accessibility tree.
+2. Use ONLY elements visible in the current tree. Don't guess at elements that might appear later.
+3. If the previous action failed, adapt — try an alternative approach (different element, keyboard shortcut, etc.).
+4. If you see an unexpected dialog/popup in the tree, handle it first (Escape or click its button) before continuing.
+5. Window titles like "Mail - John - Outlook" are NOT clickable. The window is already focused.
+6. For Outlook: use pressKey "Control+n" for new message rather than clicking buttons.
+7. After typing in a recipient/to field, use pressKey "Tab" to confirm.
+8. If you've been going in circles or the same action keeps failing, give_up.
+9. When the task appears complete based on what you see, return done.
+10. Prefer keyboard shortcuts when they're reliable (Control+s for save, Control+n for new, etc.).
+11. If an element is not found by name, try Tab key navigation or keyboard shortcuts instead. Do NOT give_up immediately — try at least 2 alternatives first.
+12. KEYBOARD-FIRST RULE: For email composition in any mail app (Outlook, Gmail, etc.), ALWAYS use this exact sequence: pressKey Control+n → pressKey Tab → typeAtFocus (recipient) → pressKey Tab → typeAtFocus (subject) → pressKey Tab → typeAtFocus (body) → pressKey Control+Return (send). Never try to click field names — use Tab navigation + typeAtFocus instead.
/**
* SmartInteractionLayer — the orchestration layer between BrowserLayer and Computer Use.
@@ -125,6 +184,7 @@ export class SmartInteractionLayer {
private a11y: AccessibilityBridge;
private config: ClawdConfig;
private pipelineConfig: PipelineConfig | null;
+ private desktop: NativeDesktop;
// Lazy-initialized drivers
private cdpDriver: CDPDriver | null = null;
@@ -135,14 +195,19 @@ export class SmartInteractionLayer {
private readonly MAX_FAILURES = 3;
private disabled = false;
+ // For state change polling
+ private lastWindowTitle = '';
+
constructor(
a11y: AccessibilityBridge,
config: ClawdConfig,
pipelineConfig: PipelineConfig | null,
+ desktop: NativeDesktop,
) {
this.a11y = a11y;
this.config = config;
this.pipelineConfig = pipelineConfig;
+ this.desktop = desktop;
}
/** Check if this layer is available (has a text model configured) */
@@ -407,13 +472,15 @@ export class SmartInteractionLayer {
private async handleNativeTask(task: string): Promise<SmartInteractionResult> {
const steps: StepResult[] = [];
+ const MAX_REACT_STEPS = 10;
+ let llmCalls = 0;
// Lazy-create UIDriver (no connection needed)
if (!this.uiDriver) {
this.uiDriver = new UIDriver();
}
- // Get accessibility tree context
+ // Get accessibility tree context (initial check)
console.log(` ♿ Smart Interaction: getting accessibility context...`);
const activeWindow = await this.a11y.getActiveWindow();
@@ -430,65 +497,217 @@ export class SmartInteractionLayer {
}
}
- const a11yContext = await this.a11y.getScreenContext(activeWindow?.processId).catch(() => '');
+ // ── UI Knowledge Layer: load app-specific instruction set ──
+ const currentWindow = await this.a11y.getActiveWindow();
+ const windowTitle = currentWindow?.title || '';
+ const knowledgeContext = await uiKnowledge.getContextForTask(task, windowTitle).catch(() => null);
- if (!a11yContext || a11yContext.includes('unavailable')) {
- return { handled: false, success: false, steps: [], llmCalls: 0, description: 'A11y context unavailable' };
- }
+ // ── ReAct Loop: step-by-step reactive execution ──
+ const actionHistory: Array<{ action: string; target?: string; text?: string; success: boolean; error?: string; stateAfter?: string }> = [];
+ let successfulActions = 0; // Track meaningful progress for partial-success detection
+ let elementNotFoundRetries = 0; // Separate retry counter for Tab-key fallbacks
- // Make ONE LLM call to plan actions
- console.log(` 🧠 Smart Interaction: planning with text LLM...`);
- const plan = await this.planActions(task, a11yContext, 'native');
+ console.log(` 🔄 Smart Interaction: starting ReAct loop (max ${MAX_REACT_STEPS} steps)...`);
- if (!plan || !plan.canHandle) {
- console.log(` 🤷 Smart Interaction: LLM says can't handle — ${plan?.reasoning || 'no plan'}`);
- return {
- handled: false,
- success: false,
- steps: [{ action: 'plan', description: `Can't handle: ${plan?.reasoning || 'unknown'}`, success: false, timestamp: Date.now() }],
- llmCalls: 1,
- description: plan?.reasoning,
+ // Track current a11y state for polling
+ let currentA11yState = '';
+
+ for (let step = 0; step < MAX_REACT_STEPS; step++) {
+ // 1. Get FRESH a11y snapshot each iteration
+ const currentWindow = await this.a11y.getActiveWindow();
+ const a11yContext = await this.a11y.getScreenContext(currentWindow?.processId).catch(() => '');
+ currentA11yState = a11yContext;
+
+ if (!a11yContext || a11yContext.includes('unavailable')) {
+ if (step === 0) {
+ return { handled: false, success: false, steps, llmCalls, description: 'A11y context unavailable' };
+ }
+ // Mid-loop: context lost, give up
+ console.log(` ⚠️ ReAct step ${step + 1}: a11y context lost — giving up`);
+ break;
+ }
+
+ // 2. Build history string for LLM context
+ const historyStr = actionHistory.length > 0
+ ? actionHistory.map((h, i) =>
+ `Step ${i + 1}: ${h.action}${h.target ? ` "${h.target}"` : ''}${h.text ? ` text="${h.text}"` : ''} → ${h.success ? 'SUCCESS' : `FAILED: ${h.error || 'unknown'}`}${h.stateAfter ? `\n State after: ${h.stateAfter}` : ''}`
+ ).join('\n')
+ : '(no actions taken yet)';
+
+ const knowledgeSection = knowledgeContext ? `\nAPP INSTRUCTION MANUAL:\n${knowledgeContext}\n` : '';
+ const userMessage = `TASK: ${task}${knowledgeSection}\n\nACTION HISTORY:\n${historyStr}\n\nCURRENT ACCESSIBILITY TREE:\n${a11yContext}`;
+
+ // 3. LLM decides ONE next action
+ console.log(` 🧠 ReAct step ${step + 1}/${MAX_REACT_STEPS}: asking LLM...`);
+ llmCalls++;
+
+ let response: string;
+ try {
+ response = await this.callTextModel(userMessage, REACT_STEP_SYSTEM_PROMPT);
+ } catch (err) {
+ console.log(` ⚠️ ReAct step ${step + 1}: LLM call failed — ${err}`);
+ steps.push({ action: 'error', description: `LLM call failed: ${err}`, success: false, timestamp: Date.now() });
+ break;
+ }
+
+ // 4. Parse LLM response
+ const decision = extractJsonObject(response) as any;
+ if (!decision) {
+ console.log(` ⚠️ ReAct step ${step + 1}: no valid JSON in LLM response`);
+ actionHistory.push({ action: 'parse_error', success: false, error: 'LLM returned no valid JSON' });
+ continue;
+ }
+
+ const action = decision.action;
+ const reasoning = decision.reasoning || '';
+
+ // Handle "wait" action — make it free (no step counter increment)
+ if (action === 'wait') {
+ // Don't count as a step — just poll silently
+ console.log(` ⏳ Auto-polling for state change (free step)...`);
+ currentA11yState = await this.pollUntilStateChanges(currentA11yState, 3000);
+ continue; // don't increment step counter
+ }
+
+ // 5. Handle terminal actions
+ if (action === 'done') {
+ console.log(` ✅ ReAct: done — ${reasoning}`);
+ steps.push({ action: 'done', description: `ReAct complete: ${reasoning}`, success: true, timestamp: Date.now() });
+ return {
+ handled: true,
+ success: true,
+ steps,
+ llmCalls,
+ description: `ReAct completed in ${step + 1} steps: ${reasoning}`,
+ };
+ }
+
+ if (action === 'give_up') {
+ if (successfulActions > 0) {
+ // Partial progress was made — report success so caller can continue with Computer Use
+ console.log(` 🔄 ReAct: give_up but ${successfulActions} actions succeeded — reporting partial progress`);
+ const completedSteps = actionHistory
+ .filter(h => h.success && h.action !== 'parse_error' && h.action !== 'wait')
+ .map(h => `${h.action}${h.target ? ` "${h.target}"` : ''}${h.text ? ` text="${h.text}"` : ''}`)
+ .join(', ');
+ const contextSummary = `Completed ${successfulActions} actions: ${completedSteps}. Gave up because: ${reasoning}`;
+ steps.push({ action: 'give_up_partial', description: `ReAct partial progress: ${reasoning}`, success: true, timestamp: Date.now() });
+ return {
+ handled: false,
+ success: false,
+ steps,
+ llmCalls,
+ description: `ReAct gave up after ${step + 1} steps (${successfulActions} succeeded): ${reasoning}`,
+ contextSummary,
+ };
+ }
+ console.log(` 🤷 ReAct: give_up (zero progress) — ${reasoning}`);
+ steps.push({ action: 'give_up', description: `ReAct gave up: ${reasoning}`, success: false, timestamp: Date.now() });
+ return {
+ handled: false,
+ success: false,
+ steps,
+ llmCalls,
+ description: `ReAct gave up after ${step + 1} steps: ${reasoning}`,
+ };
+ }
+
+ // 6. Build PlannedStep and execute
+ const plannedStep: PlannedStep = {
+ action: action || 'click',
+ target: decision.target || '',
+ method: decision.method || 'name',
+ text: decision.text,
+ key: decision.key,
+ waitMs: decision.waitMs,
+ fields: decision.fields,
};
- }
- steps.push({
- action: 'plan',
- description: `Planned ${plan.steps.length} native steps: ${plan.reasoning || ''}`,
- success: true,
- timestamp: Date.now(),
- });
+ console.log(` ▶️ ReAct step ${step + 1}: ${action} "${plannedStep.target || ''}" — ${reasoning}`);
+
+ let stepResult = await this.executeNativeStep(plannedStep);
+
+ // After pressKey or typeAtFocus — poll for state change instead of burning LLM steps
+ if (stepResult.action === 'pressKey' || stepResult.action === 'typeAtFocus') {
+ const updatedState = await this.pollUntilStateChanges(currentA11yState, 5000);
+ if (updatedState !== currentA11yState) {
+ currentA11yState = updatedState;
+ console.log(` ✅ State changed after ${stepResult.description} — proceeding`);
+ }
+ }
+
+ // Fallback: if click fails with "Element not found", try Tab key navigation
+ if (!stepResult.success && action === 'click' && stepResult.error?.includes('not found') && elementNotFoundRetries < 3) {
+ elementNotFoundRetries++;
+ console.log(` 🔄 Element not found — trying Tab key fallback (retry ${elementNotFoundRetries}/3)`);
+ const tabStep: PlannedStep = { action: 'pressKey', target: 'Tab', method: 'name' };
+ const tabResult = await this.executeNativeStep(tabStep);
+ steps.push(tabResult);
+ await this.delay(300);
+
+ // Retry the original click after Tab
+ stepResult = await this.executeNativeStep(plannedStep);
+ if (stepResult.success) {
+ console.log(` ✅ Tab fallback worked — element found after keyboard navigation`);
+ elementNotFoundRetries = 0; // Reset on success
+ }
+ // If fallback also fails, count as a normal step (LLM will adapt)
+ }
- // Execute each planned step — continue on non-critical failures
- let nativeCriticalFail = false;
- for (const plannedStep of plan.steps) {
- const stepResult = await this.executeNativeStep(plannedStep);
steps.push(stepResult);
- if (!stepResult.success) {
- console.log(` ⚠️ Step failed: ${stepResult.description}`);
- const criticalActions = ['type', 'fillForm', 'click'];
- if (criticalActions.includes(plannedStep.action)) {
- console.log(` ❌ Critical step failed — falling through`);
- nativeCriticalFail = true;
- break;
+ // 7. Capture fresh a11y state AFTER the action for verification
+ // Small delay for UI to settle before capturing state
+ const postActionDelay = action === 'typeAtFocus' ? 300 : action === 'pressKey' ? 500 : 400;
+ await this.delay(postActionDelay);
+
+ let stateAfter: string | undefined;
+ try {
+ const postWindow = await this.a11y.getActiveWindow();
+ const postA11y = await this.a11y.getScreenContext(postWindow?.processId).catch(() => '');
+ if (postA11y && !postA11y.includes('unavailable')) {
+ stateAfter = postA11y.substring(0, 500);
}
- console.log(` ⏭️ Non-critical, continuing...`);
+ } catch {
+ // Non-fatal — state capture failed
}
- // Delay between actions for UI to settle
- await this.delay(300);
- }
+ // Record result with post-action state for next iteration's history
+ actionHistory.push({
+ action,
+ target: plannedStep.target,
+ text: plannedStep.text,
+ success: stepResult.success,
+ error: stepResult.error,
+ stateAfter,
+ });
- if (nativeCriticalFail) {
- return { handled: false, success: false, steps, llmCalls: 1, description: 'Critical step failed' };
+ if (stepResult.success && action !== 'wait') {
+ successfulActions++;
+ }
+
+ if (!stepResult.success) {
+ console.log(` ⚠️ ReAct step ${step + 1} failed: ${stepResult.error || stepResult.description} — LLM will adapt`);
+ }
}
+ // Exhausted max steps without done/give_up — build context summary of what was accomplished
+ console.log(` ⏰ ReAct: exhausted ${MAX_REACT_STEPS} steps — falling through to Computer Use`);
+ let contextSummary: string | undefined;
+ if (successfulActions > 0) {
+ const completedSteps = actionHistory
+ .filter(h => h.success && h.action !== 'parse_error' && h.action !== 'wait')
+ .map(h => `${h.action}${h.target ? ` "${h.target}"` : ''}${h.text ? ` text="${h.text}"` : ''}`)
+ .join(', ');
+ contextSummary = `Completed ${successfulActions} of ${MAX_REACT_STEPS} steps: ${completedSteps}. Exhausted max steps — remaining work needs to be continued.`;
+ }
return {
- handled: true,
- success: true,
+ handled: false,
+ success: false,
steps,
- llmCalls: 1,
- description: `Completed ${plan.steps.length} native actions`,
+ llmCalls,
+ description: `ReAct exhausted ${MAX_REACT_STEPS} steps without completing`,
+ contextSummary,
};
}
@@ -499,6 +718,14 @@ export class SmartInteractionLayer {
try {
switch (step.action) {
case 'click': {
+ // Window titles look like "AppName - WindowTitle" or "Title - AppName".
+ // They are NOT clickable elements — the window is already focused.
+ // Detect and skip them to avoid cascading failures.
+ const isWindowTitle = /\s[-–]\s/.test(step.target) && step.target.length > 20;
+ if (isWindowTitle) {
+ console.log(` ⏭️ Skipping window-title click target "${step.target}" — window already focused`);
+ return { action: 'click', description: `Skipped window-title: "${step.target}"`, success: true, timestamp: ts };
+ }
const result = await ui.clickElement(step.target, {
controlType: step.method === 'automationId' ? undefined : undefined,
});
@@ -507,16 +734,30 @@ export class SmartInteractionLayer {
case 'type': {
const result = await ui.typeInElement(step.target, step.text || '');
+ if (!result.success) {
+ // Fallback: type at current focus
+ console.log(` 🔄 Element type failed — falling back to typeAtFocus`);
+ const fallback = await ui.typeAtCurrentFocus(step.text || '');
+ return { action: 'type', description: `Type "${step.text}" (typeAtFocus fallback)`, success: fallback.success, error: fallback.error, timestamp: ts };
+ }
return { action: 'type', description: `Type "${step.text}" into "${step.target}"`, success: result.success, error: result.error, timestamp: ts };
}
+ case 'typeAtFocus': {
+ // Type directly at current focus — no element lookup
+ const result = await ui.typeAtCurrentFocus(step.text || '');
+ const desc = result.success
+ ? `Typed at focus: '${step.text}' — ⚠️ UNVERIFIED`
+ : `Typed at focus: '${step.text}' — FAILED`;
+ return { action: 'typeAtFocus', description: desc, success: result.success, error: result.error, timestamp: ts };
+ }
+
case 'pressKey': {
- // UIDriver doesn't have pressKey — use a11y bridge keyboard action
- // Return the key info so the agent can execute via native desktop
+ await this.desktop.keyPress(step.target);
return {
action: 'pressKey',
- description: `Press ${step.target} (delegated to native)`,
- success: true, // Mark as success — the key press step is informational
+ description: `Pressed ${step.target}`,
+ success: true,
timestamp: ts,
};
}
@@ -803,4 +1044,31 @@ export class SmartInteractionLayer {
private delay(ms: number): Promise {
return new Promise(resolve => setTimeout(resolve, ms));
}
+
+ /**
+ * Poll the a11y tree every 500ms until it changes from the baseline, or until timeoutMs elapses.
+ * Returns the new state string, or the original if nothing changed.
+ * This replaces burning LLM "wait" steps — completely free, zero LLM calls.
+ */
+ private async pollUntilStateChanges(baselineState: string, timeoutMs = 5000): Promise {
+ const interval = 500;
+ const attempts = Math.ceil(timeoutMs / interval);
+ for (let i = 0; i < attempts; i++) {
+ await new Promise(r => setTimeout(r, interval));
+ try {
+ const activeWindow = await this.a11y.getActiveWindow();
+ const newState = await this.a11y.getScreenContext(activeWindow?.processId).catch(() => '');
+ // Meaningful change = active window title changed OR element count differs significantly
+ const baselineLines = baselineState.split('\n').length;
+ const newLines = newState.split('\n').length;
+ const titleChanged = (activeWindow?.title || '') !== this.lastWindowTitle;
+ const elementCountChanged = Math.abs(newLines - baselineLines) > 3;
+ if (titleChanged || elementCountChanged) {
+ this.lastWindowTitle = activeWindow?.title || '';
+ return newState; // state changed — return new snapshot
+ }
+ } catch { /* continue polling */ }
+ }
+ return baselineState; // timed out — return original
+ }
}
diff --git a/src/task-logger.ts b/src/task-logger.ts
new file mode 100644
index 0000000..5ef657a
--- /dev/null
+++ b/src/task-logger.ts
@@ -0,0 +1,235 @@
+/**
+ * Structured Task Logger — persistent JSONL logs for every task execution.
+ *
+ * Each task gets a separate .jsonl file with one JSON object per line:
+ * - Step entries: individual actions with params, results, verification status
+ * - Summary entry: final line with task-level stats (status, duration, LLM calls, cost)
+ *
+ * Key distinction: "verified_success" vs "unverified_success" — tracks whether
+ * completion was independently confirmed or just LLM-declared.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { TASK_LOGS_DIR } from './paths';
+
+// ─── Types ───────────────────────────────────────────────────
+
+export type PipelineLayer = 0 | 1 | 1.5 | 2 | 2.5 | 3 | 'preprocess' | 'decompose';
+
+export type CompletionStatus =
+ | 'verified_success'
+ | 'unverified_success'
+ | 'failed'
+ | 'timeout'
+ | 'aborted'
+ | 'needs_human'
+ | 'in_progress';
+
+export interface VerificationInfo {
+ method: 'action_verifier' | 'cdp_readback' | 'a11y_readback' | 'vision' | 'window_state' | 'checkpoint' | 'contradiction_check' | 'none';
+ verified: boolean;
+ detail?: string;
+}
+
+export interface StepLogEntry {
+ stepIndex: number;
+ timestamp: string;
+ layer: PipelineLayer;
+ actionType: string;
+ actionParams?: Record;
+ llmReasoning?: string;
+ uiStateSummary?: string;
+ result: 'success' | 'fail' | 'timeout' | 'skipped' | 'blocked';
+ verification?: VerificationInfo;
+ error?: string;
+ durationMs?: number;
+}
+
+export interface TaskSummary {
+ _type: 'task_summary';
+ task: string;
+ refinedTask?: string;
+ status: CompletionStatus;
+ totalSteps: number;
+ layersUsed: PipelineLayer[];
+ llmCallCount: number;
+ estimatedCostUsd?: number;
+ durationMs: number;
+ startedAt: string;
+ completedAt: string;
+ targetApp?: string;
+ navigatedUrl?: string;
+}
+
+// ─── TaskLogger Class ────────────────────────────────────────
+
+export class TaskLogger {
+ private logDir: string;
+ private stream: fs.WriteStream | null = null;
+ private currentLogPath: string | null = null;
+ private stepIndex = 0;
+ private startTime = 0;
+ private llmCallCount = 0;
+ private layersUsed = new Set();
+ private currentTask = '';
+
+ constructor(logDir?: string) {
+ this.logDir = logDir ?? TASK_LOGS_DIR;
+ try {
+ fs.mkdirSync(this.logDir, { recursive: true });
+ } catch { /* directory may already exist */ }
+ this.pruneOldLogs(30);
+ }
+
+ /**
+ * Start logging a new task. Opens a JSONL file stream.
+ */
+ startTask(task: string): string {
+ // Close any previous stream
+ this.endTask('failed');
+
+ const now = new Date();
+ const dateStr = now.toISOString().replace(/[:.]/g, '-').substring(0, 19);
+ const id = Math.random().toString(36).substring(2, 6);
+ const filename = `${dateStr}_${id}.jsonl`;
+ this.currentLogPath = path.join(this.logDir, filename);
+
+ try {
+ this.stream = fs.createWriteStream(this.currentLogPath, { flags: 'a' });
+ this.stream.on('error', () => { this.stream = null; });
+ } catch {
+ this.stream = null;
+ }
+
+ this.stepIndex = 0;
+ this.startTime = Date.now();
+ this.llmCallCount = 0;
+ this.layersUsed.clear();
+ this.currentTask = task;
+
+ return id;
+ }
+
+ /**
+ * Log a single step. Fire-and-forget — never blocks the agent loop.
+ */
+ logStep(entry: Partial & { layer: PipelineLayer; actionType: string; result: StepLogEntry['result'] }): void {
+ if (!this.stream) return;
+
+ this.layersUsed.add(entry.layer);
+
+ const full: StepLogEntry = {
+ stepIndex: this.stepIndex++,
+ timestamp: new Date().toISOString(),
+ layer: entry.layer,
+ actionType: entry.actionType,
+ result: entry.result,
+ ...(entry.actionParams && { actionParams: entry.actionParams }),
+ ...(entry.llmReasoning && { llmReasoning: entry.llmReasoning.substring(0, 500) }),
+ ...(entry.uiStateSummary && { uiStateSummary: entry.uiStateSummary.substring(0, 300) }),
+ ...(entry.verification && { verification: entry.verification }),
+ ...(entry.error && { error: entry.error.substring(0, 300) }),
+ ...(entry.durationMs !== undefined && { durationMs: entry.durationMs }),
+ };
+
+ try {
+ this.stream.write(JSON.stringify(full) + '\n');
+ } catch { /* never crash the agent */ }
+ }
+
+ /**
+ * Record an LLM API call (for cost tracking).
+ */
+ recordLlmCall(): void {
+ this.llmCallCount++;
+ }
+
+ /**
+ * Finalize the task log with a summary line. Closes the stream.
+ */
+ endTask(status: CompletionStatus, extras?: Partial): void {
+ if (!this.stream) return;
+
+ const summary: TaskSummary = {
+ _type: 'task_summary',
+ task: this.currentTask,
+ status,
+ totalSteps: this.stepIndex,
+ layersUsed: Array.from(this.layersUsed),
+ llmCallCount: this.llmCallCount,
+ durationMs: Date.now() - this.startTime,
+ startedAt: new Date(this.startTime).toISOString(),
+ completedAt: new Date().toISOString(),
+ ...extras,
+ };
+
+ try {
+ this.stream.write(JSON.stringify(summary) + '\n');
+ this.stream.end();
+ } catch { /* never crash */ }
+
+ this.stream = null;
+ // Keep currentLogPath for API access
+ }
+
+ /**
+ * Get the path to the current/last log file.
+ */
+ getCurrentLogPath(): string | null {
+ return this.currentLogPath;
+ }
+
+ /**
+ * Get the log directory path.
+ */
+ getLogDir(): string {
+ return this.logDir;
+ }
+
+ /**
+ * Delete log files older than maxAgeDays.
+ */
+ private pruneOldLogs(maxAgeDays: number): void {
+ try {
+ const cutoff = Date.now() - maxAgeDays * 24 * 60 * 60 * 1000;
+ const files = fs.readdirSync(this.logDir).filter(f => f.endsWith('.jsonl'));
+ for (const file of files) {
+ const filePath = path.join(this.logDir, file);
+ const stat = fs.statSync(filePath);
+ if (stat.mtimeMs < cutoff) {
+ fs.unlinkSync(filePath);
+ }
+ }
+ } catch { /* non-critical */ }
+ }
+
+ /**
+ * Read the most recent N task summaries (for dashboard/API).
+ */
+ getRecentSummaries(count = 20): TaskSummary[] {
+ try {
+ const files = fs.readdirSync(this.logDir)
+ .filter(f => f.endsWith('.jsonl'))
+ .sort()
+ .reverse()
+ .slice(0, count);
+
+ const summaries: TaskSummary[] = [];
+ for (const file of files) {
+ const content = fs.readFileSync(path.join(this.logDir, file), 'utf-8');
+ const lines = content.trim().split('\n');
+ const lastLine = lines[lines.length - 1];
+ try {
+ const parsed = JSON.parse(lastLine);
+ if (parsed._type === 'task_summary') {
+ summaries.push(parsed);
+ }
+ } catch { /* skip malformed */ }
+ }
+ return summaries;
+ } catch {
+ return [];
+ }
+ }
+}
diff --git a/src/tool-server.ts b/src/tool-server.ts
new file mode 100644
index 0000000..fa9ac12
--- /dev/null
+++ b/src/tool-server.ts
@@ -0,0 +1,173 @@
+/**
+ * HTTP Tool Server — REST API for any AI model to discover and execute tools.
+ *
+ * Endpoints:
+ * GET /tools — Tool schemas (OpenAI function-calling format by default)
+ * GET /tools?format=raw — Raw tool definitions with categories
+ * POST /execute/:name — Execute a tool by name
+ * GET /docs — Human-readable tool documentation
+ * GET /health — Server health check
+ *
+ * This is the model-agnostic interface. Any AI that can do function calling
+ * (OpenAI, Anthropic, Google, Meta, Mistral, local models) can use this.
+ */
+
+import express from 'express';
+import { getAllTools, toOpenAiFunctions, getTool, toJsonSchema } from './tools';
+import type { ToolContext } from './tools';
+import type { ToolDefinition } from './tools/types';
+import { VERSION } from './version';
+
+/** Validate request body against a tool's parameter schema. Returns error string or null. */
+function validateParams(body: Record, tool: ToolDefinition): string | null {
+ const params = tool.parameters;
+ for (const [name, def] of Object.entries(params)) {
+ const value = body[name];
+ if (def.required !== false && value === undefined) {
+ return `Missing required parameter: "${name}"`;
+ }
+ if (value !== undefined) {
+ const expected = def.type;
+ const actual = typeof value;
+ if (expected === 'number' && actual !== 'number') return `Parameter "${name}" must be a number, got ${actual}`;
+ if (expected === 'string' && actual !== 'string') return `Parameter "${name}" must be a string, got ${actual}`;
+ if (expected === 'boolean' && actual !== 'boolean') return `Parameter "${name}" must be a boolean, got ${actual}`;
+ }
+ }
+ // Reject unknown parameters to catch typos
+ for (const key of Object.keys(body)) {
+ if (!(key in params)) {
+ return `Unknown parameter: "${key}". Valid: ${Object.keys(params).join(', ') || '(none)'}`;
+ }
+ }
+ return null;
+}
+
+export function createToolServer(ctx: ToolContext): express.Router {
+ const router = express.Router();
+
+ // ── Tool Discovery ──
+
+ router.get('/tools', (_req, res) => {
+ const tools = getAllTools();
+ const format = _req.query.format as string;
+
+ if (format === 'raw') {
+ // Raw format with categories and full metadata
+ res.json(tools.map(t => ({
+ name: t.name,
+ description: t.description,
+ category: t.category,
+ parameters: toJsonSchema(t.parameters),
+ })));
+ } else {
+ // Default: OpenAI function-calling format (universal standard)
+ res.json(toOpenAiFunctions(tools));
+ }
+ });
+
+ // ── Tool Execution ──
+
+ router.post('/execute/:name', async (req, res) => {
+ const { name } = req.params;
+ const tool = getTool(name);
+
+ if (!tool) {
+ return res.status(404).json({
+ error: `Tool "${name}" not found`,
+ available: getAllTools().map(t => t.name),
+ });
+ }
+
+ try {
+ const body = req.body || {};
+ const validationError = validateParams(body, tool);
+ if (validationError) {
+ return res.status(400).json({ tool: name, text: validationError, isError: true });
+ }
+ const result = await tool.handler(body, ctx);
+
+ // Build response
+ const response: any = {
+ tool: name,
+ text: result.text,
+ };
+ if (result.image) {
+ response.image = result.image;
+ }
+ if (result.isError) {
+ response.isError = true;
+ return res.status(400).json(response);
+ }
+ res.json(response);
+ } catch (err: any) {
+ res.status(500).json({
+ tool: name,
+ text: `Internal error: ${err.message}`,
+ isError: true,
+ });
+ }
+ });
+
+ // ── Documentation ──
+
+ router.get('/docs', (_req, res) => {
+ const tools = getAllTools();
+ const categories = new Map();
+
+ for (const t of tools) {
+ const cat = categories.get(t.category) || [];
+ cat.push(t);
+ categories.set(t.category, cat);
+ }
+
+ let md = `# clawdcursor Tool API\n\n`;
+ md += `**${tools.length} tools** for OS-level desktop automation.\n\n`;
+ md += `## Endpoints\n\n`;
+ md += `- \`GET /tools\` — Tool schemas (OpenAI function format)\n`;
+ md += `- \`POST /execute/{name}\` — Execute a tool\n`;
+ md += `- \`GET /docs\` — This page\n\n`;
+
+ const categoryLabels: Record = {
+ perception: 'Perception (Screen Reading)',
+ mouse: 'Mouse Actions',
+ keyboard: 'Keyboard Actions',
+ window: 'Window & App Management',
+ clipboard: 'Clipboard',
+ browser: 'Browser (CDP)',
+ orchestration: 'Orchestration',
+ };
+
+ for (const [cat, catTools] of categories) {
+ md += `## ${categoryLabels[cat] || cat}\n\n`;
+ for (const t of catTools) {
+ md += `### \`${t.name}\`\n`;
+ md += `${t.description}\n\n`;
+ const params = Object.entries(t.parameters);
+ if (params.length > 0) {
+ md += `| Parameter | Type | Required | Description |\n`;
+ md += `|-----------|------|----------|-------------|\n`;
+ for (const [pname, pdef] of params) {
+ md += `| ${pname} | ${pdef.type} | ${pdef.required !== false ? 'yes' : 'no'} | ${pdef.description} |\n`;
+ }
+ md += `\n`;
+ }
+ }
+ }
+
+ res.type('text/markdown').send(md);
+ });
+
+ // ── Health ──
+
+ router.get('/health', (_req, res) => {
+ res.json({
+ status: 'ok',
+ version: VERSION,
+ tools: getAllTools().length,
+ platform: process.platform,
+ });
+ });
+
+ return router;
+}
diff --git a/src/tools/a11y.ts b/src/tools/a11y.ts
new file mode 100644
index 0000000..3282498
--- /dev/null
+++ b/src/tools/a11y.ts
@@ -0,0 +1,189 @@
+/**
+ * Accessibility & window management tools.
+ *
+ * Text-first strategy: always try read_screen before desktop_screenshot.
+ * Provides structured perception of the desktop without vision models.
+ */
+
+import type { ToolDefinition, ToolContext } from './types';
+import { a11yToMouse } from './types';
+
+export function getA11yTools(): ToolDefinition[] {
+ return [
+ {
+ name: 'read_screen',
+ description: 'Read the accessibility tree of the screen. Returns structured text showing: WINDOWS (all open windows), FOCUSED WINDOW UI TREE (buttons, inputs, text elements with coordinates), and FOCUSED ELEMENT (keyboard focus). This is fast, small, and structured — prefer this over screenshots.',
+ parameters: {
+ processId: { type: 'number', description: 'Focus on a specific process ID (optional — reads foreground window by default)', required: false },
+ },
+ category: 'perception',
+ handler: async ({ processId }, ctx) => {
+ await ctx.ensureInitialized();
+ const active = processId ?? (await ctx.a11y.getActiveWindow())?.processId;
+ const context = await ctx.a11y.getScreenContext(active);
+ return { text: context };
+ },
+ },
+
+ {
+ name: 'get_windows',
+ description: 'List all visible windows with their title, process name, PID, and bounds.',
+ parameters: {},
+ category: 'window',
+ handler: async (_params, ctx) => {
+ await ctx.ensureInitialized();
+ const windows = await ctx.a11y.getWindows(true);
+ if (!windows?.length) return { text: '(no windows found)' };
+ const lines = windows.map((w: any) =>
+ `${w.isMinimized ? '[MIN]' : '[OK]'} [${w.processName}] "${w.title}" pid:${w.processId}` +
+ (!w.isMinimized ? ` at (${w.bounds.x},${w.bounds.y}) ${w.bounds.width}x${w.bounds.height}` : ' (minimized)')
+ );
+ return { text: lines.join('\n') };
+ },
+ },
+
+ {
+ name: 'get_active_window',
+ description: 'Get the currently focused/foreground window.',
+ parameters: {},
+ category: 'window',
+ handler: async (_params, ctx) => {
+ await ctx.ensureInitialized();
+ const win = await ctx.a11y.getActiveWindow();
+ if (!win) return { text: '(no active window)' };
+ return {
+ text: JSON.stringify({
+ title: win.title, processName: win.processName,
+ processId: win.processId, bounds: win.bounds,
+ }),
+ };
+ },
+ },
+
+ {
+ name: 'get_focused_element',
+ description: 'Get the currently focused UI element (keyboard focus). Returns name, control type, value, bounds, and process ID.',
+ parameters: {},
+ category: 'window',
+ handler: async (_params, ctx) => {
+ await ctx.ensureInitialized();
+ const el = await ctx.a11y.getFocusedElement();
+ if (!el) return { text: '(no focused element)' };
+ return { text: JSON.stringify(el) };
+ },
+ },
+
+ {
+ name: 'focus_window',
+ description: 'Bring a window to the foreground. Matches by process name, PID, or title substring. Verifies focus after attempt.',
+ parameters: {
+ processName: { type: 'string', description: 'Process name to focus (e.g. "notepad", "msedge")', required: false },
+ processId: { type: 'number', description: 'Process ID to focus', required: false },
+ title: { type: 'string', description: 'Window title substring to match', required: false },
+ },
+ category: 'window',
+ handler: async ({ processName, processId, title }, ctx) => {
+ await ctx.ensureInitialized();
+ let targetBounds: any = null;
+ let targetPid = processId;
+
+ if (processName && !targetPid) {
+ const win = await ctx.a11y.findWindow(processName);
+ if (win) { targetPid = win.processId; targetBounds = win.bounds; }
+ else return { text: `No window found for process "${processName}"`, isError: true };
+ }
+ if (!targetBounds && targetPid) {
+ const windows = await ctx.a11y.getWindows(true);
+ const win = windows?.find((w: any) => w.processId === targetPid);
+ if (win?.bounds) targetBounds = win.bounds;
+ }
+
+ const result = await ctx.a11y.focusWindow(title, targetPid);
+ ctx.a11y.invalidateCache();
+ if (!result.success) return { text: `Failed to focus: ${result.error}`, isError: true };
+
+ // Click window center to physically assert focus
+ if (targetBounds && targetBounds.x > -100 && targetBounds.width > 0) {
+ const centerX = a11yToMouse(targetBounds.x + Math.round(targetBounds.width / 2), ctx);
+ const centerY = a11yToMouse(targetBounds.y + Math.round(targetBounds.height / 4), ctx);
+ await ctx.desktop.mouseClick(centerX, centerY);
+ await new Promise(r => setTimeout(r, 200));
+ }
+
+ await new Promise(r => setTimeout(r, 200));
+ ctx.a11y.invalidateCache();
+ const active = await ctx.a11y.getActiveWindow();
+ const verified = active?.processId === targetPid ||
+ (processName && active?.processName.toLowerCase().includes(processName.toLowerCase())) ||
+ (title && active?.title.toLowerCase().includes(title.toLowerCase()));
+
+ if (!verified && targetBounds && targetBounds.x > -100 && targetBounds.width > 0) {
+ const centerX = a11yToMouse(targetBounds.x + Math.round(targetBounds.width / 2), ctx);
+ const centerY = a11yToMouse(targetBounds.y + Math.round(targetBounds.height / 2), ctx);
+ await ctx.desktop.mouseClick(centerX, centerY);
+ await new Promise(r => setTimeout(r, 300));
+ ctx.a11y.invalidateCache();
+ const a2 = await ctx.a11y.getActiveWindow();
+ const v2 = a2?.processId === targetPid ||
+ (processName && a2?.processName.toLowerCase().includes(processName.toLowerCase())) ||
+ (title && a2?.title.toLowerCase().includes(title.toLowerCase()));
+ return {
+ text: v2 ? `Focused (retry): "${a2?.title}" [${a2?.processName}] pid:${a2?.processId}`
+ : `Focus FAILED. Foreground: "${a2?.title}" [${a2?.processName}]`,
+ isError: !v2,
+ };
+ }
+ return { text: `Focused: "${active?.title}" [${active?.processName}] pid:${active?.processId}` };
+ },
+ },
+
+ {
+ name: 'find_element',
+ description: 'Search for UI elements by name, control type, or automation ID within a process. Returns matching elements with bounds.',
+ parameters: {
+ name: { type: 'string', description: 'Element name to search for', required: false },
+ controlType: { type: 'string', description: 'UI Automation control type (e.g. "ControlType.Button")', required: false },
+ automationId: { type: 'string', description: 'Automation ID', required: false },
+ processId: { type: 'number', description: 'Process ID to search within', required: false },
+ },
+ category: 'window',
+ handler: async ({ name, controlType, automationId, processId }, ctx) => {
+ await ctx.ensureInitialized();
+ const elements = await ctx.a11y.findElement({ name, controlType, automationId, processId });
+ if (!elements?.length) return { text: '(no elements found)' };
+ const lines = elements.slice(0, 20).map((el: any) =>
+ `[${el.controlType}] "${el.name}" id:${el.automationId} @${el.bounds.x},${el.bounds.y} ${el.bounds.width}x${el.bounds.height}` +
+ (el.isEnabled === false ? ' DISABLED' : '')
+ );
+ if (elements.length > 20) lines.push(`... and ${elements.length - 20} more`);
+ return { text: lines.join('\n') };
+ },
+ },
+
+ {
+ name: 'read_clipboard',
+ description: 'Read the current text content of the OS clipboard.',
+ parameters: {},
+ category: 'clipboard',
+ handler: async (_params, ctx) => {
+ await ctx.ensureInitialized();
+ const text = await ctx.a11y.readClipboard();
+ return { text: text || '(clipboard empty or non-text)' };
+ },
+ },
+
+ {
+ name: 'write_clipboard',
+ description: 'Write text to the OS clipboard.',
+ parameters: {
+ text: { type: 'string', description: 'Text to write to clipboard', required: true },
+ },
+ category: 'clipboard',
+ handler: async ({ text }, ctx) => {
+ await ctx.ensureInitialized();
+ await ctx.a11y.writeClipboard(text);
+ return { text: `Clipboard set (${text.length} chars)` };
+ },
+ },
+ ];
+}
diff --git a/src/tools/cdp.ts b/src/tools/cdp.ts
new file mode 100644
index 0000000..4b7f45a
--- /dev/null
+++ b/src/tools/cdp.ts
@@ -0,0 +1,184 @@
+/**
+ * Browser CDP tools — interact with web page DOM via Chrome DevTools Protocol.
+ *
+ * Structured access to browser page elements without screenshots.
+ * Requires: Edge/Chrome running with --remote-debugging-port
+ */
+
+import type { ToolDefinition } from './types';
+
// Fixed debugging port the CDP driver attaches to; navigate_browser launches
// Edge with --remote-debugging-port set to this value.
const CDP_PORT = 9223;

/**
 * Build the browser (Chrome DevTools Protocol) tool set.
 * All tools except cdp_connect / cdp_list_tabs require a prior cdp_connect.
 */
export function getCdpTools(): ToolDefinition[] {
  return [
    {
      name: 'cdp_connect',
      description: `Connect to Edge/Chrome browser via Chrome DevTools Protocol (port ${CDP_PORT}). Must be called before other cdp_* tools. Use navigate_browser to launch Edge with CDP enabled.`,
      parameters: {},
      category: 'browser',
      handler: async (_params, ctx) => {
        // Drop any stale connection before reconnecting.
        try { await ctx.cdp.disconnect(); } catch { /* ignore */ }
        const ok = await ctx.cdp.connect();
        if (ok) {
          const url = await ctx.cdp.getUrl();
          const title = await ctx.cdp.getTitle();
          return { text: `Connected to: "${title}" at ${url}` };
        }
        return { text: `Failed to connect to CDP on port ${CDP_PORT}. Use navigate_browser to launch Edge with CDP.`, isError: true };
      },
    },

    {
      name: 'cdp_page_context',
      description: 'Get a structured list of interactive elements on the current browser page (inputs, buttons, links with selectors and positions).',
      parameters: {},
      category: 'browser',
      handler: async (_params, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        const context = await ctx.cdp.getPageContext();
        return { text: context };
      },
    },

    {
      name: 'cdp_read_text',
      description: 'Read text content from a DOM element. Useful for extracting information from web pages.',
      parameters: {
        selector: { type: 'string', description: 'CSS selector (default: "body" for full page text)', required: false },
        maxLength: { type: 'number', description: 'Max characters to return (default: 3000)', required: false },
      },
      category: 'browser',
      handler: async ({ selector, maxLength }, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        const text = await ctx.cdp.readText(selector ?? 'body', maxLength ?? 3000);
        return { text };
      },
    },

    {
      name: 'cdp_click',
      description: 'Click a DOM element by CSS selector or by visible text content.',
      parameters: {
        selector: { type: 'string', description: 'CSS selector to click (e.g. "#submit", "button.primary")', required: false },
        text: { type: 'string', description: 'Visible text of the element to click (alternative to selector)', required: false },
      },
      category: 'browser',
      handler: async ({ selector, text }, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        if (!selector && !text) return { text: 'Provide either selector or text parameter.', isError: true };
        // Text match takes precedence when both are supplied.
        const result = text ? await ctx.cdp.clickByText(text) : await ctx.cdp.click(selector!);
        return {
          text: result.success ? `Clicked: ${selector || `"${text}"`} (method: ${result.method})` : `Click failed: ${result.error}`,
          isError: !result.success,
        };
      },
    },

    {
      name: 'cdp_type',
      description: 'Type text into a DOM input field by CSS selector or by associated label text.',
      parameters: {
        selector: { type: 'string', description: 'CSS selector for the input field', required: false },
        label: { type: 'string', description: 'Label text associated with the input (alternative to selector)', required: false },
        text: { type: 'string', description: 'Text to type into the field', required: true },
      },
      category: 'browser',
      handler: async ({ selector, label, text }, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        if (!selector && !label) return { text: 'Provide either selector or label parameter.', isError: true };
        // Label match takes precedence when both are supplied.
        const result = label ? await ctx.cdp.typeByLabel(label, text) : await ctx.cdp.typeInField(selector!, text);
        return {
          text: result.success ? `Typed "${(text as string).substring(0, 60)}" into ${selector || `label="${label}"`}` : `Type failed: ${result.error}`,
          isError: !result.success,
        };
      },
    },

    {
      name: 'cdp_select_option',
      description: 'Select an option in a dropdown by value or visible text.',
      parameters: {
        selector: { type: 'string', description: 'CSS selector for the element', required: true },
        value: { type: 'string', description: 'Option value or visible text to select', required: true },
      },
      category: 'browser',
      handler: async ({ selector, value }, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        const result = await ctx.cdp.selectOption(selector, value);
        return { text: result.success ? `Selected "${value}" in ${selector}` : `Select failed: ${result.error}`, isError: !result.success };
      },
    },

    {
      name: 'cdp_evaluate',
      description: 'Execute JavaScript in the browser page context. Returns the result.',
      parameters: {
        javascript: { type: 'string', description: 'JavaScript code to evaluate in the page', required: true },
      },
      category: 'browser',
      handler: async ({ javascript }, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        try {
          const result = await ctx.cdp.evaluate(javascript);
          // Non-string results are serialized for the model to read.
          const text = typeof result === 'string' ? result : JSON.stringify(result, null, 2);
          return { text: text ?? '(undefined)' };
        } catch (e: any) {
          return { text: `JS error: ${e.message}`, isError: true };
        }
      },
    },

    {
      name: 'cdp_wait_for_selector',
      description: 'Wait for a DOM element matching a CSS selector to appear and become visible.',
      parameters: {
        selector: { type: 'string', description: 'CSS selector to wait for', required: true },
        timeout: { type: 'number', description: 'Timeout in milliseconds (default: 10000)', required: false },
      },
      category: 'browser',
      handler: async ({ selector, timeout }, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        const result = await ctx.cdp.waitForSelector(selector, timeout ?? 10000);
        return { text: result.success ? `Element "${selector}" found` : `Wait failed: ${result.error}`, isError: !result.success };
      },
    },

    {
      name: 'cdp_list_tabs',
      description: 'List all open browser tabs with their URLs and titles.',
      parameters: {},
      category: 'browser',
      handler: async () => {
        // Talks to the browser's /json endpoint directly — works without cdp_connect.
        try {
          const resp = await fetch(`http://127.0.0.1:${CDP_PORT}/json`);
          const tabs: any[] = await resp.json();
          // Filter out internal browser pages (devtools, settings, etc.).
          const pages = tabs.filter((t: any) => t.type === 'page' && !t.url.startsWith('edge://') && !t.url.startsWith('chrome://'));
          if (!pages.length) return { text: '(no tabs found)' };
          const lines = pages.map((t: any, i: number) => `${i + 1}. "${t.title}" — ${t.url}`);
          return { text: lines.join('\n') };
        } catch {
          return { text: `Cannot list tabs. Use navigate_browser first to launch Edge with CDP on port ${CDP_PORT}.`, isError: true };
        }
      },
    },

    {
      name: 'cdp_switch_tab',
      description: 'Switch CDP connection to a different browser tab by URL or title substring.',
      parameters: {
        target: { type: 'string', description: 'URL or title substring to match', required: true },
      },
      category: 'browser',
      handler: async ({ target }, ctx) => {
        if (!(await ctx.cdp.isConnected())) return { text: 'Not connected to CDP. Call cdp_connect first.', isError: true };
        const ok = await ctx.cdp.switchTab(target);
        if (ok) {
          const url = await ctx.cdp.getUrl();
          const title = await ctx.cdp.getTitle();
          return { text: `Switched to: "${title}" at ${url}` };
        }
        return { text: `No tab matching "${target}" found.`, isError: true };
      },
    },
  ];
}
diff --git a/src/tools/desktop.ts b/src/tools/desktop.ts
new file mode 100644
index 0000000..f88a959
--- /dev/null
+++ b/src/tools/desktop.ts
@@ -0,0 +1,237 @@
+/**
+ * Desktop tools — screenshot, mouse, keyboard, screen info.
+ *
+ * Coordinate system: All mouse tools accept IMAGE-SPACE coordinates
+ * (matching the 1280px-wide screenshots from desktop_screenshot).
+ * The server auto-scales to Windows LOGICAL coordinates via mouseScaleFactor.
+ */
+
+import type { ToolDefinition } from './types';
+
+/** Dangerous key combos that are blocked (matched lowercase with whitespace stripped — see key_press). */
+const BLOCKED_KEYS = ['alt+f4', 'ctrl+alt+delete', 'ctrl+alt+del'];
+
+/**
+ * Build the desktop tool set: screenshot/perception, mouse, and keyboard tools.
+ * Handlers receive a ToolContext (`ctx`) supplying the desktop driver, the
+ * a11y bridge, and the image-space↔logical scale factors.
+ */
+export function getDesktopTools(): ToolDefinition[] {
+  return [
+    // ── PERCEPTION ──
+
+    {
+      name: 'desktop_screenshot',
+      description: 'Take a screenshot of the entire screen, resized to 1280px wide. Returns the image and scale metadata. Use read_screen (accessibility tree) first — only screenshot when you need visual confirmation.',
+      parameters: {},
+      category: 'perception',
+      handler: async (_params, ctx) => {
+        await ctx.ensureInitialized();
+        // captureForLLM returns the downscaled frame plus the scale it used.
+        const frame = await ctx.desktop.captureForLLM();
+        const base64 = frame.buffer.toString('base64');
+        return {
+          text: `Screenshot: ${frame.llmWidth}x${frame.llmHeight}px (real: ${frame.width}x${frame.height}, scale: ${frame.scaleFactor.toFixed(2)}x). Mouse tools accept these image-space coordinates.`,
+          image: { data: base64, mimeType: 'image/jpeg' },
+        };
+      },
+    },
+
+    {
+      name: 'desktop_screenshot_region',
+      description: 'Take a zoomed screenshot of a specific screen region for detailed inspection. Coordinates are in image-space (from desktop_screenshot).',
+      parameters: {
+        x: { type: 'number', description: 'Left edge X in image-space coordinates', required: true },
+        y: { type: 'number', description: 'Top edge Y in image-space coordinates', required: true },
+        width: { type: 'number', description: 'Width in image-space pixels', required: true },
+        height: { type: 'number', description: 'Height in image-space pixels', required: true },
+      },
+      category: 'perception',
+      handler: async ({ x, y, width, height }, ctx) => {
+        await ctx.ensureInitialized();
+        // Caller passes image-space; scale up by the screenshot factor to get
+        // the real-pixel rectangle before capturing.
+        const sf = ctx.getScreenshotScaleFactor();
+        const frame = await ctx.desktop.captureRegionForLLM(
+          Math.round(x * sf), Math.round(y * sf),
+          Math.round(width * sf), Math.round(height * sf),
+        );
+        const base64 = frame.buffer.toString('base64');
+        return {
+          text: `Region: (${x},${y}) ${width}x${height} image-space → zoomed to ${frame.llmWidth}x${frame.llmHeight}px.`,
+          image: { data: base64, mimeType: 'image/jpeg' },
+        };
+      },
+    },
+
+    {
+      name: 'get_screen_size',
+      description: 'Get the screen dimensions and scale factor.',
+      parameters: {},
+      category: 'perception',
+      handler: async (_params, ctx) => {
+        await ctx.ensureInitialized();
+        const size = ctx.desktop.getScreenSize();
+        const msf = ctx.getMouseScaleFactor();
+        const ssf = ctx.getScreenshotScaleFactor();
+        // Report both scale factors plus derived image-space dimensions so
+        // callers can sanity-check their coordinate math.
+        return {
+          text: JSON.stringify({
+            physicalWidth: size.width,
+            physicalHeight: size.height,
+            screenshotScaleFactor: ssf,
+            mouseScaleFactor: msf,
+            imageWidth: Math.round(size.width / ssf),
+            imageHeight: Math.round(size.height / ssf),
+          }),
+        };
+      },
+    },
+
+    // ── MOUSE ──
+
+    {
+      name: 'mouse_click',
+      description: 'Click the left mouse button at the given image-space coordinates.',
+      parameters: {
+        x: { type: 'number', description: 'X coordinate in image-space', required: true },
+        y: { type: 'number', description: 'Y coordinate in image-space', required: true },
+      },
+      category: 'mouse',
+      handler: async ({ x, y }, ctx) => {
+        await ctx.ensureInitialized();
+        // Image-space → logical coordinates via mouseScaleFactor.
+        const sf = ctx.getMouseScaleFactor();
+        const rx = Math.round(x * sf), ry = Math.round(y * sf);
+        await ctx.desktop.mouseClick(rx, ry);
+        // A click likely changed the UI — drop any cached a11y tree.
+        ctx.a11y.invalidateCache();
+        return { text: `Clicked at (${x}, ${y}) → logical (${rx}, ${ry})` };
+      },
+    },
+
+    {
+      name: 'mouse_double_click',
+      description: 'Double-click the left mouse button at the given image-space coordinates.',
+      parameters: {
+        x: { type: 'number', description: 'X coordinate in image-space', required: true },
+        y: { type: 'number', description: 'Y coordinate in image-space', required: true },
+      },
+      category: 'mouse',
+      handler: async ({ x, y }, ctx) => {
+        await ctx.ensureInitialized();
+        const sf = ctx.getMouseScaleFactor();
+        await ctx.desktop.mouseDoubleClick(Math.round(x * sf), Math.round(y * sf));
+        ctx.a11y.invalidateCache();
+        return { text: `Double-clicked at (${x}, ${y})` };
+      },
+    },
+
+    {
+      name: 'mouse_right_click',
+      description: 'Right-click at the given image-space coordinates (opens context menu).',
+      parameters: {
+        x: { type: 'number', description: 'X coordinate in image-space', required: true },
+        y: { type: 'number', description: 'Y coordinate in image-space', required: true },
+      },
+      category: 'mouse',
+      handler: async ({ x, y }, ctx) => {
+        await ctx.ensureInitialized();
+        const sf = ctx.getMouseScaleFactor();
+        await ctx.desktop.mouseRightClick(Math.round(x * sf), Math.round(y * sf));
+        ctx.a11y.invalidateCache();
+        return { text: `Right-clicked at (${x}, ${y})` };
+      },
+    },
+
+    {
+      name: 'mouse_hover',
+      description: 'Move the mouse to the given image-space coordinates without clicking. Useful for revealing tooltips or hover menus.',
+      parameters: {
+        x: { type: 'number', description: 'X coordinate in image-space', required: true },
+        y: { type: 'number', description: 'Y coordinate in image-space', required: true },
+      },
+      category: 'mouse',
+      handler: async ({ x, y }, ctx) => {
+        await ctx.ensureInitialized();
+        const sf = ctx.getMouseScaleFactor();
+        // No cache invalidation here: a plain move does not click anything.
+        await ctx.desktop.mouseMove(Math.round(x * sf), Math.round(y * sf));
+        return { text: `Mouse moved to (${x}, ${y})` };
+      },
+    },
+
+    {
+      name: 'mouse_scroll',
+      description: 'Scroll the mouse wheel at the given image-space coordinates.',
+      parameters: {
+        x: { type: 'number', description: 'X coordinate in image-space', required: true },
+        y: { type: 'number', description: 'Y coordinate in image-space', required: true },
+        direction: { type: 'string', description: 'Scroll direction', required: true, enum: ['up', 'down'] },
+        amount: { type: 'number', description: 'Scroll amount in wheel ticks (default: 3)', required: false, default: 3 },
+      },
+      category: 'mouse',
+      handler: async ({ x, y, direction, amount }, ctx) => {
+        await ctx.ensureInitialized();
+        const sf = ctx.getMouseScaleFactor();
+        const ticks = amount ?? 3;
+        // Positive delta = scroll down, negative = up (desktop driver convention here).
+        const delta = direction === 'down' ? ticks : -ticks;
+        await ctx.desktop.mouseScroll(Math.round(x * sf), Math.round(y * sf), delta);
+        return { text: `Scrolled ${direction} ${ticks} ticks at (${x}, ${y})` };
+      },
+    },
+
+    {
+      name: 'mouse_drag',
+      description: 'Drag from one image-space coordinate to another (click-hold-move-release). Useful for selecting text, moving objects, or resizing.',
+      parameters: {
+        startX: { type: 'number', description: 'Start X in image-space', required: true },
+        startY: { type: 'number', description: 'Start Y in image-space', required: true },
+        endX: { type: 'number', description: 'End X in image-space', required: true },
+        endY: { type: 'number', description: 'End Y in image-space', required: true },
+      },
+      category: 'mouse',
+      handler: async ({ startX, startY, endX, endY }, ctx) => {
+        await ctx.ensureInitialized();
+        const sf = ctx.getMouseScaleFactor();
+        await ctx.desktop.mouseDrag(
+          Math.round(startX * sf), Math.round(startY * sf),
+          Math.round(endX * sf), Math.round(endY * sf),
+        );
+        ctx.a11y.invalidateCache();
+        return { text: `Dragged (${startX},${startY}) → (${endX},${endY})` };
+      },
+    },
+
+    // ── KEYBOARD ──
+
+    {
+      name: 'type_text',
+      description: 'Type text into the currently focused element via clipboard paste (reliable, no dropped chars).',
+      parameters: {
+        text: { type: 'string', description: 'The text to type', required: true },
+      },
+      category: 'keyboard',
+      handler: async ({ text }, ctx) => {
+        await ctx.ensureInitialized();
+        const active = await ctx.a11y.getActiveWindow();
+        const activeInfo = active ? `[${active.processName}] "${active.title}"` : '(unknown)';
+        // Paste path: write clipboard, short settle delay, Ctrl+V, settle again.
+        // NOTE(review): this clobbers the user's clipboard and never restores it — confirm acceptable.
+        await ctx.a11y.writeClipboard(text);
+        await new Promise(r => setTimeout(r, 50));
+        await ctx.desktop.keyPress('ctrl+v');
+        await new Promise(r => setTimeout(r, 100));
+        ctx.a11y.invalidateCache();
+        return { text: `Typed ${text.length} chars into ${activeInfo}` };
+      },
+    },
+
+    {
+      name: 'key_press',
+      description: 'Press a keyboard key or key combination. Use "+" for combos (e.g. "ctrl+s", "shift+enter", "alt+tab"). Single keys: "Return", "Tab", "Escape", "Backspace", "Delete", "F1"-"F12", "Left/Right/Up/Down".',
+      parameters: {
+        key: { type: 'string', description: 'Key or combo to press (e.g. "Return", "ctrl+a", "F5", "Escape")', required: true },
+      },
+      category: 'keyboard',
+      handler: async ({ key }, ctx) => {
+        await ctx.ensureInitialized();
+        // Normalize for the block list: lowercase, strip all whitespace
+        // (so "Alt + F4" still matches 'alt+f4').
+        const lower = (key as string).toLowerCase().replace(/\s+/g, '');
+        if (BLOCKED_KEYS.some(b => lower === b)) {
+          return { text: `BLOCKED: "${key}" is a dangerous key combo.`, isError: true };
+        }
+        const active = await ctx.a11y.getActiveWindow();
+        const activeInfo = active ? `[${active.processName}] "${active.title}"` : '(unknown)';
+        // The original (un-normalized) key string is what gets pressed.
+        await ctx.desktop.keyPress(key);
+        ctx.a11y.invalidateCache();
+        return { text: `Key pressed: ${key} in ${activeInfo}` };
+      },
+    },
+  ];
+}
diff --git a/src/tools/index.ts b/src/tools/index.ts
new file mode 100644
index 0000000..d3b9d92
--- /dev/null
+++ b/src/tools/index.ts
@@ -0,0 +1,42 @@
+/**
+ * Tool Registry — central registry of all clawdcursor tools.
+ *
+ * Import this to get every registered tool in a transport-agnostic format.
+ * Adapters (HTTP, MCP) consume this registry.
+ */
+
+import { getDesktopTools } from './desktop';
+import { getA11yTools } from './a11y';
+import { getCdpTools } from './cdp';
+import { getOrchestrationTools } from './orchestration';
+import { getShortcutTools } from './shortcuts';
+import { getOcrTools } from './ocr';
+import { getSmartTools } from './smart';
+import type { ToolDefinition, ToolContext, ToolResult } from './types';
+import { toOpenAiFunctions, toJsonSchema } from './types';
+
+export type { ToolDefinition, ToolContext, ToolResult };
+export { toOpenAiFunctions, toJsonSchema };
+
+/** Collect every registered tool from all category factories. */
+export function getAllTools(): ToolDefinition[] {
+  const groups: ToolDefinition[][] = [
+    getDesktopTools(),
+    getA11yTools(),
+    getCdpTools(),
+    getOrchestrationTools(),
+    getShortcutTools(),
+    getOcrTools(),
+    getSmartTools(),
+  ];
+  return groups.flat();
+}
+
+/** Return only the tools whose category matches exactly. */
+export function getToolsByCategory(category: string): ToolDefinition[] {
+  const matches: ToolDefinition[] = [];
+  for (const tool of getAllTools()) {
+    if (tool.category === category) matches.push(tool);
+  }
+  return matches;
+}
+
+/** Look up a single tool by its exact name, or undefined if absent. */
+export function getTool(name: string): ToolDefinition | undefined {
+  for (const tool of getAllTools()) {
+    if (tool.name === name) return tool;
+  }
+  return undefined;
+}
diff --git a/src/tools/ocr.ts b/src/tools/ocr.ts
new file mode 100644
index 0000000..2121259
--- /dev/null
+++ b/src/tools/ocr.ts
@@ -0,0 +1,70 @@
+/**
+ * OCR tools — expose OS-level OCR to MCP clients.
+ *
+ * Provides `ocr_read_screen` which returns structured text with bounding
+ * box coordinates — cheaper than a screenshot + vision LLM call.
+ */
+
+import { OcrEngine } from '../ocr-engine';
+import type { ToolDefinition } from './types';
+
+// Shared OcrEngine instance — constructed lazily on first use.
+let ocrEngine: OcrEngine | null = null;
+
+/** Return the process-wide OcrEngine, creating it on first call. */
+function getOcrEngine(): OcrEngine {
+  if (ocrEngine === null) {
+    ocrEngine = new OcrEngine();
+  }
+  return ocrEngine;
+}
+
+/**
+ * Build the OCR tool set. Currently a single tool, ocr_read_screen, which
+ * returns structured text elements with bounding boxes from the OS OCR engine.
+ */
+export function getOcrTools(): ToolDefinition[] {
+  return [
+    {
+      name: 'ocr_read_screen',
+      description:
+        'Read all text on screen using OS-level OCR. Returns text elements with pixel coordinates (bounding boxes). Much cheaper than a screenshot — use this to find text, buttons, labels, and their positions. Coordinates are in real screen pixels.',
+      parameters: {},
+      category: 'perception',
+      handler: async (_params, ctx) => {
+        await ctx.ensureInitialized();
+        const engine = getOcrEngine();
+
+        // OCR backend is platform-dependent; bail out with a usable alternative.
+        if (!engine.isAvailable()) {
+          return {
+            text: 'OCR is not available on this platform. Use desktop_screenshot + read_screen instead.',
+            isError: true,
+          };
+        }
+
+        const result = await engine.recognizeScreen();
+
+        // Empty result is not an error — the screen may simply have no text.
+        if (result.elements.length === 0) {
+          return {
+            text: JSON.stringify({
+              elements: [],
+              fullText: '',
+              durationMs: result.durationMs,
+              hint: 'No text detected. Screen may be blank or contain only images. Try desktop_screenshot for visual content.',
+            }),
+          };
+        }
+
+        // Compute scale factor for MCP clients that need to convert OCR→mouse coordinates
+        // NOTE(review): dpiRatio = screenshotScale / mouseScale. Verify this matches the
+        // OCR engine's actual coordinate space — smart_click passes OCR coordinates to the
+        // desktop driver unscaled, which implies a different conversion. TODO confirm.
+        const ssf = ctx.getScreenshotScaleFactor();
+        const msf = ctx.getMouseScaleFactor();
+        const dpiRatio = ssf / msf;
+
+        return {
+          text: JSON.stringify({
+            elementCount: result.elements.length,
+            elements: result.elements,
+            fullText: result.fullText,
+            durationMs: result.durationMs,
+            coordinateSystem: 'real_screen_pixels',
+            toMouseClick: `Divide coordinates by ${dpiRatio.toFixed(4)} to convert to mouse_click image-space. Or better: use smart_click("element text") which handles conversion automatically.`,
+            hint: 'Coordinates are in real screen pixels. Prefer smart_click(target) over manual coordinate math. If you must use mouse_click, divide OCR coordinates by the dpiRatio above.',
+          }, null, 2),
+        };
+      },
+    },
+  ];
+}
diff --git a/src/tools/orchestration.ts b/src/tools/orchestration.ts
new file mode 100644
index 0000000..ca3c7a5
--- /dev/null
+++ b/src/tools/orchestration.ts
@@ -0,0 +1,163 @@
+/**
+ * Orchestration tools — delegate tasks, launch apps, navigate browser.
+ */
+
+import { execFile } from 'child_process';
+import { promisify } from 'util';
+import * as path from 'path';
+import * as fs from 'fs';
+import * as os from 'os';
+import type { ToolDefinition } from './types';
+
+const execFileAsync = promisify(execFile);
+
+/** Read auth token from ~/.clawdcursor/token for agent API calls. */
+function loadAgentToken(): string {
+  const tokenPath = path.join(os.homedir(), '.clawdcursor', 'token');
+  try {
+    const raw = fs.readFileSync(tokenPath, 'utf-8');
+    return raw.trim();
+  } catch {
+    // Missing/unreadable token file means "no auth configured".
+    return '';
+  }
+}
+/**
+ * Build request headers for agent API calls.
+ * Always sets Content-Type; adds a Bearer Authorization header when a
+ * token is configured on disk.
+ */
+function agentHeaders(): Record<string, string> {
+  // Fix: bare `Record` is an invalid TypeScript generic reference —
+  // Record requires both key and value type arguments.
+  const token = loadAgentToken();
+  return {
+    'Content-Type': 'application/json',
+    ...(token ? { 'Authorization': `Bearer ${token}` } : {}),
+  };
+}
+const CDP_PORT = 9223;
+
+/**
+ * Build the orchestration tool set: delegate_to_agent, open_app,
+ * navigate_browser, and wait.
+ *
+ * Fixes in this revision:
+ *  - /status polling now sends auth headers (matching /task and /abort);
+ *    with an auth-enforcing agent the old code rejected every poll and
+ *    always "timed out".
+ *  - App names / URLs interpolated into PowerShell -Command strings are
+ *    escaped to prevent command injection.
+ *  - `wait` enforces the 0.1–30s bounds its schema already declares.
+ */
+export function getOrchestrationTools(): ToolDefinition[] {
+  // Escape a value for interpolation inside a double-quoted PowerShell string
+  // (backtick is PowerShell's escape character; `"` and `$` would otherwise
+  // break out of the string / expand expressions).
+  const psEscape = (s: string): string => String(s).replace(/[`"$]/g, '`$&');
+
+  return [
+    {
+      name: 'delegate_to_agent',
+      description: "Delegate a task to clawdcursor's autonomous pipeline (runs independently with its own LLM reasoning). Returns when the task completes or times out.",
+      parameters: {
+        task: { type: 'string', description: 'Natural language task description', required: true },
+        timeout: { type: 'number', description: 'Timeout in seconds (default: 300)', required: false },
+      },
+      category: 'orchestration',
+      handler: async ({ task, timeout }) => {
+        const timeoutMs = (timeout ?? 300) * 1000;
+        const start = Date.now();
+        try {
+          const resp = await fetch('http://127.0.0.1:3847/task', {
+            method: 'POST',
+            headers: agentHeaders(),
+            body: JSON.stringify({ task }),
+          });
+          if (!resp.ok) {
+            const body = await resp.text().catch(() => '');
+            return { text: `Agent API error ${resp.status}: ${body || resp.statusText}`, isError: true };
+          }
+          // Poll /status every 2s until the pipeline returns to idle or we
+          // hit the deadline.
+          while (Date.now() - start < timeoutMs) {
+            await new Promise(r => setTimeout(r, 2000));
+            try {
+              // Fix: send auth headers on the status poll too — /task and
+              // /abort already do.
+              const status = await fetch('http://127.0.0.1:3847/status', { headers: agentHeaders() });
+              const data: any = await status.json();
+              if (data.status === 'idle') {
+                const result = data.lastResult;
+                return {
+                  text: JSON.stringify({
+                    success: result?.success ?? false,
+                    verified: result?.verified ?? false,
+                    steps: result?.steps?.length ?? 0,
+                    duration: `${((Date.now() - start) / 1000).toFixed(1)}s`,
+                    lastAction: result?.steps?.slice(-1)?.[0]?.description ?? '(unknown)',
+                  }, null, 2),
+                };
+              }
+            } catch { /* keep polling */ }
+          }
+          // Deadline hit — best-effort abort so the agent doesn't keep running.
+          await fetch('http://127.0.0.1:3847/abort', { method: 'POST', headers: agentHeaders() }).catch(() => {});
+          return { text: `Agent timed out after ${timeout ?? 300}s. Task aborted.`, isError: true };
+        } catch (err: any) {
+          return { text: `Agent unavailable: ${err.message}. Is clawdcursor running on port 3847?`, isError: true };
+        }
+      },
+    },
+
+    {
+      name: 'open_app',
+      description: 'Open an application by name. Uses platform-native launch.',
+      parameters: {
+        name: { type: 'string', description: 'Application name (e.g. "notepad", "calc", "mspaint")', required: true },
+      },
+      category: 'orchestration',
+      handler: async ({ name }, ctx) => {
+        await ctx.ensureInitialized();
+        try {
+          if (process.platform === 'win32') {
+            // Escaped to keep a hostile name from injecting PowerShell.
+            await execFileAsync('powershell.exe', ['-NoProfile', '-Command', `Start-Process "${psEscape(name)}"`], { timeout: 10000 });
+          } else if (process.platform === 'darwin') {
+            await execFileAsync('open', ['-a', name], { timeout: 10000 });
+          } else {
+            await execFileAsync(name, [], { timeout: 10000 });
+          }
+          // Give the app a moment to appear before callers read the screen.
+          await new Promise(r => setTimeout(r, 2000));
+          ctx.a11y.invalidateCache();
+          return { text: `Launched: ${name}` };
+        } catch (err: any) {
+          return { text: `Failed to launch "${name}": ${err.message}`, isError: true };
+        }
+      },
+    },
+
+    {
+      name: 'navigate_browser',
+      description: `Open a URL in the browser. Launches with CDP enabled (port ${CDP_PORT}) for DOM interaction. Call cdp_connect after.`,
+      parameters: {
+        url: { type: 'string', description: 'URL to navigate to', required: true },
+      },
+      category: 'orchestration',
+      handler: async ({ url }, ctx) => {
+        await ctx.ensureInitialized();
+        // Fast path: reuse the existing CDP session instead of spawning a browser.
+        if (await ctx.cdp.isConnected()) {
+          try {
+            const page = ctx.cdp.getPage();
+            if (page) {
+              await page.goto(url, { timeout: 30000, waitUntil: 'domcontentloaded' });
+              const title = await page.title().catch(() => '(loading)');
+              return { text: `Navigated to: "${title}" at ${url}` };
+            }
+          } catch { /* fall through */ }
+        }
+        try {
+          const userDataDir = path.join(process.env.TEMP || process.env.TMPDIR || '/tmp', 'clawdcursor-edge');
+          if (process.platform === 'win32') {
+            // url and userDataDir are escaped — they end up inside a
+            // double-quoted PowerShell string.
+            await execFileAsync('powershell.exe', ['-NoProfile', '-Command',
+              `Start-Process "msedge" -ArgumentList @("--remote-debugging-port=${CDP_PORT}","--user-data-dir=${psEscape(userDataDir)}","--no-first-run","--disable-default-apps","${psEscape(url)}")`
+            ], { timeout: 10000 });
+          } else if (process.platform === 'darwin') {
+            await execFileAsync('open', ['-a', 'Google Chrome', '--args',
+              `--remote-debugging-port=${CDP_PORT}`, `--user-data-dir=${userDataDir}`, '--no-first-run', url
+            ], { timeout: 10000 });
+          } else {
+            await execFileAsync('google-chrome', [
+              `--remote-debugging-port=${CDP_PORT}`, `--user-data-dir=${userDataDir}`, '--no-first-run', url
+            ], { timeout: 10000 });
+          }
+          // Browser startup is slow; wait before handing control back.
+          await new Promise(r => setTimeout(r, 3000));
+          ctx.a11y.invalidateCache();
+          return { text: `Opened: ${url} (CDP port ${CDP_PORT} enabled)` };
+        } catch (err: any) {
+          return { text: `Navigation failed: ${err.message}`, isError: true };
+        }
+      },
+    },
+
+    {
+      name: 'wait',
+      description: 'Wait for a specified duration. Useful after animations or page loads.',
+      parameters: {
+        seconds: { type: 'number', description: 'Duration to wait (0.1 to 30)', required: true, minimum: 0.1, maximum: 30 },
+      },
+      category: 'orchestration',
+      handler: async ({ seconds }) => {
+        // Enforce the declared 0.1–30s bounds even if the client ignores the schema.
+        const clamped = Math.min(30, Math.max(0.1, seconds));
+        await new Promise(r => setTimeout(r, clamped * 1000));
+        return { text: `Waited ${clamped}s` };
+      },
+    },
+  ];
+}
diff --git a/src/tools/shortcuts.ts b/src/tools/shortcuts.ts
new file mode 100644
index 0000000..03c28c3
--- /dev/null
+++ b/src/tools/shortcuts.ts
@@ -0,0 +1,173 @@
+/**
+ * Shortcut tools — expose the keyboard shortcuts database to MCP clients.
+ *
+ * Two tools:
+ * shortcuts_list — query available shortcuts by category/context
+ * shortcuts_execute — run a shortcut by intent (fuzzy-matched)
+ *
+ * This bridges the gap between the internal ActionRouter (which knows all
+ * shortcuts) and external agents calling MCP tools (which previously had
+ * to independently know keyboard combos).
+ */
+
+import * as os from 'os';
+import {
+ SHORTCUTS,
+ findShortcut,
+ resolveShortcutKey,
+ type ShortcutCategory,
+ type ShortcutDefinition,
+} from '../shortcuts';
+import type { ToolDefinition } from './types';
+
+/** Shortcut categories accepted by the shortcuts_list `category` filter. */
+const VALID_CATEGORIES: ShortcutCategory[] = [
+  'navigation', 'browser', 'editing', 'social', 'window', 'file', 'view', 'quick',
+];
+
+/**
+ * Build a compact shortcut entry for the list response.
+ * Includes the resolved key combo for the current platform; `context` is
+ * present only when the shortcut is app-specific.
+ */
+function formatShortcut(s: ShortcutDefinition, platform: NodeJS.Platform): object {
+  const { id, category, description, canonicalIntent, contextHints } = s;
+  const hasContext = contextHints !== undefined && contextHints.length > 0;
+  return {
+    id,
+    category,
+    description,
+    intent: canonicalIntent,
+    key: resolveShortcutKey(s, platform),
+    context: hasContext ? contextHints : undefined,
+  };
+}
+
+/**
+ * Build the shortcuts tool set — a query tool (shortcuts_list) and an
+ * execution tool (shortcuts_execute) over the shared SHORTCUTS database.
+ */
+export function getShortcutTools(): ToolDefinition[] {
+  return [
+    {
+      name: 'shortcuts_list',
+      description:
+        'List available keyboard shortcuts. Filter by category (navigation, browser, editing, social, window, file, view, quick) and/or context (e.g. "reddit", "outlook"). Returns shortcut names, descriptions, and key combos for the current platform. Use this BEFORE reaching for mouse_scroll or mouse_click — there is often a faster keyboard shortcut.',
+      parameters: {
+        category: {
+          type: 'string',
+          description: `Filter by category: ${VALID_CATEGORIES.join(', ')}. Omit to list all.`,
+          required: false,
+          enum: VALID_CATEGORIES as unknown as string[],
+        },
+        context: {
+          type: 'string',
+          description: 'Filter by app context (e.g. "reddit", "outlook", "x.com"). Shows context-specific shortcuts that match.',
+          required: false,
+        },
+      },
+      category: 'keyboard',
+      handler: async ({ category, context }) => {
+        const platform = os.platform();
+        let filtered = SHORTCUTS;
+
+        // Filter by category
+        if (category) {
+          filtered = filtered.filter(s => s.category === category);
+        }
+
+        // Filter by context — include shortcuts with no context hints (universal)
+        // plus those whose contextHints match the given context
+        if (context) {
+          const ctx = context.toLowerCase();
+          filtered = filtered.filter(s => {
+            if (!s.contextHints?.length) return true; // universal shortcut
+            // Substring match in either direction ("reddit" ↔ "old.reddit.com").
+            return s.contextHints.some(h => h.toLowerCase().includes(ctx) || ctx.includes(h.toLowerCase()));
+          });
+        } else {
+          // When no context given, exclude context-specific shortcuts
+          // (they'd be confusing without context — e.g. reddit "a" for upvote)
+          filtered = filtered.filter(s => !s.contextHints?.length);
+        }
+
+        const results = filtered.map(s => formatShortcut(s, platform));
+
+        if (results.length === 0) {
+          return {
+            text: `No shortcuts found${category ? ` in category "${category}"` : ''}${context ? ` for context "${context}"` : ''}. Available categories: ${VALID_CATEGORIES.join(', ')}`,
+          };
+        }
+
+        return {
+          text: JSON.stringify({
+            platform,
+            count: results.length,
+            shortcuts: results,
+            hint: 'Use shortcuts_execute with the intent string to run a shortcut, or key_press with the key combo directly.',
+          }, null, 2),
+        };
+      },
+    },
+
+    {
+      name: 'shortcuts_execute',
+      description:
+        'Execute a keyboard shortcut by describing what you want to do (e.g. "scroll down", "close tab", "upvote"). Uses fuzzy matching against the shortcuts database. Provide context (active app name) for app-specific shortcuts like Reddit or Outlook.',
+      parameters: {
+        intent: {
+          type: 'string',
+          description: 'What you want to do (e.g. "scroll down", "new tab", "copy", "upvote", "reply to email")',
+          required: true,
+        },
+        context: {
+          type: 'string',
+          description: 'Active app context for app-specific shortcuts (e.g. "reddit", "outlook"). Auto-detected from active window if omitted.',
+          required: false,
+        },
+      },
+      category: 'keyboard',
+      handler: async ({ intent, context }, ctx) => {
+        await ctx.ensureInitialized();
+
+        // Build context hint — use provided context or detect from active window
+        // (process name + window title, e.g. "msedge Reddit - ...").
+        let contextHint = context ?? '';
+        if (!contextHint) {
+          try {
+            const win = await ctx.a11y.getActiveWindow();
+            if (win) {
+              contextHint = `${win.processName ?? ''} ${win.title ?? ''}`;
+            }
+          } catch { /* non-fatal — proceed without context */ }
+        }
+
+        const match = findShortcut(intent, os.platform(), {
+          contextHint,
+          enableFuzzy: true,
+        });
+
+        if (!match) {
+          // Return helpful error with similar shortcuts: universal ones plus
+          // context-specific ones whose hints match the current context.
+          const suggestions = SHORTCUTS
+            .filter(s => !s.contextHints?.length || (contextHint && s.contextHints.some(h => contextHint.toLowerCase().includes(h.toLowerCase()))))
+            .slice(0, 10)
+            .map(s => ` • "${s.canonicalIntent}" → ${resolveShortcutKey(s, os.platform())} (${s.description})`);
+
+          return {
+            text: `No shortcut matched intent "${intent}". Try one of these:\n${suggestions.join('\n')}\n\nOr use key_press directly with a specific key combo.`,
+            isError: true,
+          };
+        }
+
+        // Execute the matched shortcut
+        const active = await ctx.a11y.getActiveWindow().catch(() => null);
+        const activeInfo = active ? `[${active.processName}] "${active.title}"` : '(unknown)';
+
+        await ctx.desktop.keyPress(match.combo);
+        // The shortcut may have changed UI state — drop cached a11y data.
+        ctx.a11y.invalidateCache();
+
+        return {
+          text: JSON.stringify({
+            executed: match.combo,
+            intent: match.canonicalIntent,
+            matched: match.matchedIntent,
+            matchType: match.matchType,
+            description: match.shortcut.description,
+            window: activeInfo,
+          }),
+        };
+      },
+    },
+  ];
+}
diff --git a/src/tools/smart.ts b/src/tools/smart.ts
new file mode 100644
index 0000000..420e1d6
--- /dev/null
+++ b/src/tools/smart.ts
@@ -0,0 +1,511 @@
+/**
+ * Smart tools — high-level name-based interaction for blind agents.
+ *
+ * These tools let MCP clients interact with the desktop WITHOUT needing
+ * screenshots or coordinate math.
+ *
+ * Perception order: OCR first (primary), a11y tree in parallel (supplement).
+ * If combined OCR+a11y can't handle it → CDP fallback → fail.
+ * Vision is never called from here — that's the caller's last resort.
+ *
+ * Key design: NO coordinate conversion needed by the caller.
+ * OCR coords and a11y coords both match nut-js mouseClick coords directly.
+ */
+
+import type { ToolDefinition, ToolContext } from './types';
+import { OcrEngine } from '../ocr-engine';
+
+// Shared OCR engine singleton — avoids re-initialization per call
+let sharedOcr: OcrEngine | null = null;
+
+/** Return the module-wide OcrEngine, creating it lazily on first use. */
+function getOcr(): OcrEngine {
+  if (sharedOcr === null) {
+    sharedOcr = new OcrEngine();
+  }
+  return sharedOcr;
+}
+
+// ── Known apps with empty accessibility trees ──
+// These apps expose no useful UIA nodes — skip a11y, go straight to OCR.
+// Entries are lowercase process names (compared against processName.toLowerCase()).
+const EMPTY_A11Y_APPS = new Set([
+  'windowsterminal', 'terminal', 'wt', 'alacritty', 'wezterm',
+  'hyper', 'mintty', 'conhost',
+]);
+
+export function getSmartTools(): ToolDefinition[] {
+ return [
+ // ─── smart_read ──────────────────────────────────────────────────────
+ {
+ name: 'smart_read',
+ description:
+ 'Read text from the screen with automatic fallback. ' +
+ 'OCR-first pipeline: runs OCR (primary) and a11y tree (supplement) in parallel. ' +
+ 'Returns structured text without needing a screenshot. Use this as your primary perception tool.',
+ parameters: {
+ scope: {
+ type: 'string',
+ description: 'Read scope: "focused" for focused element, "window" for active window, "screen" for full screen',
+ required: false,
+ enum: ['focused', 'window', 'screen'],
+ },
+ target: {
+ type: 'string',
+ description: 'Element name to read from specifically',
+ required: false,
+ },
+ processId: {
+ type: 'number',
+ description: 'Limit to specific process',
+ required: false,
+ },
+ },
+ category: 'perception',
+ handler: async (params, ctx) => {
+ await ctx.ensureInitialized();
+ const scope = (params.scope as string) || 'window';
+ const target = params.target as string | undefined;
+ const processId = params.processId as number | undefined;
+
+ // ── Focused element read (shortcut — no OCR needed) ──
+ if (scope === 'focused') {
+ try {
+ const el = await ctx.a11y.getFocusedElement();
+ if (el) {
+ return {
+ text: `[via UI Automation focused element]\n${JSON.stringify(el, null, 2)}`,
+ };
+ }
+ } catch { /* fall through */ }
+ }
+
+ // ── Target-specific read (a11y search — precise) ──
+ if (target) {
+ try {
+ const elements = await ctx.a11y.findElement({ name: target, processId });
+ if (elements?.length) {
+ const lines = elements.slice(0, 10).map((el: any) =>
+ `[${el.controlType}] "${el.name}" id:${el.automationId} @${el.bounds.x},${el.bounds.y} ` +
+ `${el.bounds.width}x${el.bounds.height}` +
+ (el.value ? ` value="${el.value}"` : '') +
+ (el.isEnabled === false ? ' DISABLED' : '')
+ );
+ return { text: `[via UI Automation search]\n${lines.join('\n')}` };
+ }
+ } catch { /* fall through */ }
+ }
+
+ // ── OCR + a11y in parallel (OCR is primary, a11y supplements) ──
+ const activeWin = await ctx.a11y.getActiveWindow().catch(() => null);
+ const appName = activeWin?.processName?.toLowerCase() || '';
+ const hasA11y = !EMPTY_A11Y_APPS.has(appName);
+
+ // Launch both in parallel
+ const ocrPromise = (async () => {
+ try {
+ const engine = getOcr();
+ if (!engine.isAvailable()) return null;
+ const result = await engine.recognizeScreen();
+ if (result.elements.length === 0) return null;
+
+ // Group by line for readability
+ const lines = new Map();
+ for (const el of result.elements) {
+ const lineEls = lines.get(el.line) ?? [];
+ lineEls.push(el);
+ lines.set(el.line, lineEls);
+ }
+ const ocrLines: string[] = [];
+ for (const [, lineEls] of [...lines.entries()].sort((a, b) => a[0] - b[0])) {
+ ocrLines.push(lineEls.sort((a, b) => a.x - b.x).map(el => el.text).join(' '));
+ }
+ return { text: ocrLines.join('\n'), count: result.elements.length, ms: result.durationMs };
+ } catch { return null; }
+ })();
+
+ const a11yPromise = (async () => {
+ if (!hasA11y || scope === 'screen') return null;
+ try {
+ const active = processId ?? activeWin?.processId;
+ const context = await ctx.a11y.getScreenContext(active);
+ if (context && context.length > 50) return context;
+ } catch { /* */ }
+ return null;
+ })();
+
+ const [ocrResult, a11yResult] = await Promise.all([ocrPromise, a11yPromise]);
+
+ // OCR succeeded — return OCR text, attach a11y tree if available
+ if (ocrResult) {
+ const a11ySuffix = a11yResult
+ ? `\n\n=== A11Y TREE (supplement) ===\n${a11yResult}`
+ : '';
+ return {
+ text: `[via OCR — ${ocrResult.count} lines, ${ocrResult.ms}ms]\n${ocrResult.text}${a11ySuffix}`,
+ };
+ }
+
+ // OCR failed but a11y succeeded — return a11y alone
+ if (a11yResult) {
+ return { text: `[via UI Automation active window]\n${a11yResult}` };
+ }
+
+ // ── CDP fallback (browser content) ──
+ try {
+ if (await ctx.cdp.isConnected()) {
+ const page = ctx.cdp.getPage();
+ if (page) {
+ const title = await page.title().catch(() => '');
+ const text = await page.evaluate(() => document.body?.innerText?.substring(0, 5000) || '').catch(() => '');
+ if (text) {
+ return { text: `[via CDP — "${title}"]\n${text}` };
+ }
+ }
+ }
+ } catch { /* fall through */ }
+
+ return { text: '(could not read screen via any method)', isError: true };
+ },
+ },
+
+ // ─── smart_click ─────────────────────────────────────────────────────
+ {
+ name: 'smart_click',
+ description:
+ 'Click a UI element by name with automatic fallback. ' +
+ 'OCR-first: scans screen text and clicks by coordinates. ' +
+ 'Also tries a11y invoke (in parallel) and CDP as fallbacks. ' +
+ 'No screenshot or coordinate math needed — just provide the element text.',
+ parameters: {
+ target: {
+ type: 'string',
+ description: 'Element name/text to click (e.g., "Send", "Submit", "New Email")',
+ required: true,
+ },
+ processId: {
+ type: 'number',
+ description: 'Limit search to a specific process',
+ required: false,
+ },
+ timeout: {
+ type: 'number',
+ description: 'Max time in ms (default 5000)',
+ required: false,
+ },
+ },
+ category: 'orchestration',
+ handler: async (params, ctx) => {
+ await ctx.ensureInitialized();
+ const target = params.target as string;
+ const processId = params.processId as number | undefined;
+ const attempted: string[] = [];
+
+ // Detect active window and check traits
+ const activeWin = await ctx.a11y.getActiveWindow().catch(() => null);
+ const appName = activeWin?.processName?.toLowerCase() || '';
+ const isBrowser = ['msedge', 'chrome', 'firefox', 'brave', 'arc', 'safari'].includes(appName);
+ const emptyA11y = EMPTY_A11Y_APPS.has(appName);
+
+ // ── Step 1: OCR + a11y in parallel ──
+ // OCR finds text coordinates, a11y tries invoke — whoever succeeds first wins.
+
+ // Start OCR scan
+ const ocrPromise = (async (): Promise<{ x: number; y: number; text: string } | null> => {
+ try {
+ const engine = getOcr();
+ if (!engine.isAvailable()) return null;
+ const result = await engine.recognizeScreen();
+ const targetLower = target.toLowerCase();
+
+ let bestMatch: any = null;
+ let bestScore = 0;
+
+ for (const el of result.elements) {
+ const elText = el.text.toLowerCase();
+ if (elText === targetLower) {
+ bestMatch = el; bestScore = 1; break;
+ }
+ if (elText.includes(targetLower) || targetLower.includes(elText)) {
+ const score = Math.min(elText.length, targetLower.length) / Math.max(elText.length, targetLower.length);
+ if (score > bestScore) { bestMatch = el; bestScore = score; }
+ }
+ }
+
+ if (bestMatch && bestScore > 0.3) {
+ return {
+ x: bestMatch.x + Math.round(bestMatch.width / 2),
+ y: bestMatch.y + Math.round(bestMatch.height / 2),
+ text: bestMatch.text,
+ };
+ }
+ return null;
+ } catch { return null; }
+ })();
+
+ // Start a11y invoke in parallel
+ const a11yPromise = (async (): Promise<{ method: string; clickPoint?: { x: number; y: number } } | null> => {
+ if (emptyA11y) return null;
+ try {
+ const result = await ctx.a11y.invokeElement({
+ name: target,
+ processId: processId || activeWin?.processId,
+ action: 'click',
+ });
+ if (result.success) return { method: 'invoke' };
+ if (result.clickPoint) return { method: 'bounds', clickPoint: result.clickPoint };
+ return null;
+ } catch { return null; }
+ })();
+
+ const [ocrMatch, a11yResult] = await Promise.all([ocrPromise, a11yPromise]);
+
+ // a11y invoke succeeded — best outcome (OS-level click, most reliable)
+ if (a11yResult?.method === 'invoke') {
+ ctx.a11y.invalidateCache();
+ return { text: `Clicked "${target}" via UI Automation (invoke_element)` };
+ }
+
+ // OCR found the element — coordinate click
+ if (ocrMatch) {
+ await ctx.desktop.mouseClick(ocrMatch.x, ocrMatch.y);
+ ctx.a11y.invalidateCache();
+ return { text: `Clicked "${target}" via OCR (matched "${ocrMatch.text}" at ${ocrMatch.x},${ocrMatch.y})` };
+ }
+
+ // a11y had bounds but couldn't invoke — coordinate fallback
+ if (a11yResult?.clickPoint) {
+ await ctx.desktop.mouseClick(a11yResult.clickPoint.x, a11yResult.clickPoint.y);
+ ctx.a11y.invalidateCache();
+ return { text: `Clicked "${target}" via a11y bounds (coordinate fallback at ${a11yResult.clickPoint.x},${a11yResult.clickPoint.y})` };
+ }
+
+ // Track what was attempted for diagnostics
+ if (emptyA11y) {
+ attempted.push(`UIA(skipped): app "${appName}" has known traits: emptyAxTree`);
+ } else {
+ attempted.push('UIA(invoke): element not found or not invocable');
+ }
+ attempted.push('ocr: unavailable or no text match found'); // ocrMatch is always null here — non-null matches returned above
+
+ // ── Step 2: CDP click (browser content) ──
+ if (isBrowser || await ctx.cdp.isConnected().catch(() => false)) {
+ try {
+ const connected = await ctx.cdp.isConnected();
+ if (connected) {
+ const page = ctx.cdp.getPage();
+ if (page) {
+ const clicked = await page.evaluate((text: string) => {
+ const selectors = 'button, a, [role="button"], [role="link"], [role="menuitem"], input[type="submit"], input[type="button"], [onclick]';
+ const elements = document.querySelectorAll(selectors);
+ for (const el of elements) {
+ const htmlEl = el as HTMLElement;
+ const elText = htmlEl.textContent?.trim() || htmlEl.getAttribute('aria-label') || htmlEl.getAttribute('title') || '';
+ if (elText.toLowerCase().includes(text.toLowerCase())) {
+ htmlEl.click();
+ return true;
+ }
+ }
+ return false;
+ }, target).catch(() => false);
+
+ if (clicked) {
+ ctx.a11y.invalidateCache();
+ return { text: `Clicked "${target}" via CDP (JS click)` };
+ }
+ attempted.push('CDP: no text match found');
+ }
+ }
+ } catch (err: any) {
+ attempted.push(`CDP: ${err.message?.substring(0, 80)}`);
+ }
+ } else {
+ attempted.push(`CDP(skipped): foreground app "${appName}" is not a browser`);
+ }
+
+ // All methods failed
+ return {
+ text: `smart_click failed: could not click "${target}" after all fallback methods.\nAttempted:\n${attempted.map((a, i) => ` ${i + 1}. ${a}`).join('\n')}\nDiagnosis:\n No specific failure pattern detected`,
+ isError: true,
+ };
+ },
+ },
+
+ // ─── smart_type ──────────────────────────────────────────────────────
+ {
+ name: 'smart_type',
+ description:
+ 'Type text into a UI element. If target is specified, finds and focuses the element first. ' +
+ 'Uses clipboard paste for reliability (no dropped characters).',
+ parameters: {
+ text: {
+ type: 'string',
+ description: 'The text to type',
+ required: true,
+ },
+ target: {
+ type: 'string',
+ description: 'Element name to focus before typing (optional — types into currently focused element if omitted)',
+ required: false,
+ },
+ processId: {
+ type: 'number',
+ description: 'Limit search to a specific process',
+ required: false,
+ },
+ },
+ category: 'keyboard',
+ handler: async (params, ctx) => {
+ await ctx.ensureInitialized();
+ const text = params.text as string;
+ const target = params.target as string | undefined;
+ const processId = params.processId as number | undefined;
+
+ // If target specified, find and focus it first
+ if (target) {
+ let focused = false;
+
+ // Try UIA focus
+ try {
+ const activeWin = await ctx.a11y.getActiveWindow().catch(() => null);
+ const appName = activeWin?.processName?.toLowerCase() || '';
+
+ if (!EMPTY_A11Y_APPS.has(appName)) {
+ const result = await ctx.a11y.invokeElement({
+ name: target,
+ processId: processId || activeWin?.processId,
+ action: 'focus',
+ });
+ if (result.success) {
+ focused = true;
+ } else if (result.clickPoint) {
+ // Focus failed but we have bounds — click to focus
+ // a11y coords match nut-js mouse coords directly
+ await ctx.desktop.mouseClick(result.clickPoint.x, result.clickPoint.y);
+ await new Promise(r => setTimeout(r, 100));
+ focused = true;
+ }
+ }
+ } catch { /* fall through */ }
+
+ // Try CDP focus (browser)
+ if (!focused) {
+ try {
+ if (await ctx.cdp.isConnected()) {
+ const page = ctx.cdp.getPage();
+ if (page) {
+ const found = await page.evaluate((label: string) => {
+ const inputs = document.querySelectorAll('input, textarea, [contenteditable]');
+ for (const el of inputs) {
+ const htmlEl = el as HTMLElement;
+ const ariaLabel = htmlEl.getAttribute('aria-label') || '';
+ const placeholder = htmlEl.getAttribute('placeholder') || '';
+ const name = htmlEl.getAttribute('name') || '';
+ if ([ariaLabel, placeholder, name].some(a => a.toLowerCase().includes(label.toLowerCase()))) {
+ htmlEl.focus();
+ return true;
+ }
+ }
+ return false;
+ }, target).catch(() => false);
+ if (found) focused = true;
+ }
+ }
+ } catch { /* fall through */ }
+ }
+
+ if (!focused) {
+ return { text: `Could not find element "${target}" to focus before typing`, isError: true };
+ }
+ }
+
+ // Type via clipboard paste
+ await ctx.a11y.writeClipboard(text);
+ await new Promise(r => setTimeout(r, 50));
+ await ctx.desktop.keyPress('ctrl+v');
+ await new Promise(r => setTimeout(r, 100));
+ ctx.a11y.invalidateCache();
+
+ const active = await ctx.a11y.getActiveWindow().catch(() => null);
+ const activeInfo = active ? `[${active.processName}] "${active.title}"` : '(unknown)';
+ return { text: `Typed ${text.length} chars${target ? ` into "${target}"` : ''} in ${activeInfo}` };
+ },
+ },
+
+ // ─── invoke_element ──────────────────────────────────────────────────
+ {
+ name: 'invoke_element',
+ description:
+ 'Invoke a UI Automation action on an element. More precise than smart_click — ' +
+ 'supports set-value, get-value, focus, expand, collapse in addition to click.',
+ parameters: {
+ name: {
+ type: 'string',
+ description: 'Element name to find',
+ required: false,
+ },
+ automationId: {
+ type: 'string',
+ description: 'Element automation ID (more precise than name)',
+ required: false,
+ },
+ controlType: {
+ type: 'string',
+ description: 'Filter by control type (e.g., "ControlType.Button")',
+ required: false,
+ },
+ processId: {
+ type: 'number',
+ description: 'Target process ID',
+ required: false,
+ },
+ action: {
+ type: 'string',
+ description: 'Action to perform',
+ required: false,
+ enum: ['click', 'set-value', 'get-value', 'focus', 'expand', 'collapse'],
+ },
+ value: {
+ type: 'string',
+ description: 'Value for set-value action',
+ required: false,
+ },
+ },
+ category: 'window',
+ handler: async (params, ctx) => {
+ await ctx.ensureInitialized();
+
+ if (!params.name && !params.automationId) {
+ return { text: 'Either "name" or "automationId" is required', isError: true };
+ }
+
+ try {
+ const result = await ctx.a11y.invokeElement({
+ name: params.name,
+ automationId: params.automationId,
+ controlType: params.controlType,
+ processId: params.processId,
+ action: params.action || 'click',
+ value: params.value,
+ });
+
+ if (result.success) {
+ ctx.a11y.invalidateCache();
+ const valueInfo = result.value ? ` → value: "${result.value}"` : '';
+ return { text: `Invoked "${params.name || params.automationId}" (${params.action || 'click'})${valueInfo}` };
+ }
+
+ // Coordinate fallback for click actions
+ // a11y coords match nut-js mouse coords directly
+ if (result.clickPoint && (params.action === 'click' || !params.action)) {
+ await ctx.desktop.mouseClick(result.clickPoint.x, result.clickPoint.y);
+ ctx.a11y.invalidateCache();
+ return { text: `Invoked "${params.name || params.automationId}" via coordinate fallback (${result.clickPoint.x},${result.clickPoint.y})` };
+ }
+
+ return {
+ text: `invoke_element failed: ${result.error || 'element not found or action not supported'}`,
+ isError: true,
+ };
+ } catch (err: any) {
+ return { text: `invoke_element error: ${err.message}`, isError: true };
+ }
+ },
+ },
+ ];
+}
diff --git a/src/tools/types.ts b/src/tools/types.ts
new file mode 100644
index 0000000..422916d
--- /dev/null
+++ b/src/tools/types.ts
@@ -0,0 +1,111 @@
+/**
+ * Transport-agnostic tool definitions.
+ *
+ * Tools are defined once here and adapted to:
+ * - HTTP REST API (GET /tools, POST /execute/:name)
+ * - MCP protocol (stdio or SSE)
+ * - OpenAI function-calling format (GET /tools?format=openai)
+ *
+ * No MCP, no Zod, no framework dependency — just plain TypeScript.
+ */
+
+/** Parameter definition for a tool (maps to JSON Schema) */
+export interface ParameterDef {
+ type: 'string' | 'number' | 'boolean';
+ description: string;
+ required?: boolean;
+ enum?: string[];
+ minimum?: number;
+ maximum?: number;
+ default?: any;
+}
+
+/** Result returned by a tool handler */
+export interface ToolResult {
+ /** Text output */
+ text: string;
+ /** Optional image (base64 encoded) */
+ image?: { data: string; mimeType: string };
+ /** Whether this result represents an error */
+ isError?: boolean;
+}
+
+/** Shared context passed to tool handlers — initialized subsystems */
+export interface ToolContext {
+ /** NativeDesktop — mouse, keyboard, screenshot control */
+ desktop: any;
+ /** AccessibilityBridge — UI automation, windows, clipboard */
+ a11y: any;
+ /** CDPDriver — browser DOM interaction via Chrome DevTools Protocol */
+ cdp: any;
+ /** Image-space → logical (mouse) coords. mouseCoord = imageCoord * factor */
+ getMouseScaleFactor: () => number;
+ /** Image-space → physical pixel coords (for screenshot region crop) */
+ getScreenshotScaleFactor: () => number;
+ /** Ensure subsystems are initialized (lazy init gate) */
+ ensureInitialized: () => Promise<void>;
+}
+
+/** A single tool definition — transport agnostic */
+export interface ToolDefinition {
+ /** Unique tool name (e.g. "mouse_click", "read_screen") */
+ name: string;
+ /** Human-readable description of what the tool does */
+ description: string;
+ /** Parameter schema — empty object for no-param tools */
+ parameters: Record<string, ParameterDef>;
+ /** Tool category for organization */
+ category: 'perception' | 'mouse' | 'keyboard' | 'window' | 'clipboard' | 'browser' | 'orchestration';
+ /** The handler function */
+ handler: (params: Record<string, any>, ctx: ToolContext) => Promise<ToolResult>;
+}
+
+// ── Schema Conversion Helpers ──
+
+/** Convert tool parameters to JSON Schema (for REST API /tools endpoint) */
+export function toJsonSchema(params: Record<string, ParameterDef>): object {
+ const properties: Record<string, any> = {};
+ const required: string[] = [];
+
+ for (const [key, def] of Object.entries(params)) {
+ const prop: any = { type: def.type, description: def.description };
+ if (def.enum) prop.enum = def.enum;
+ if (def.minimum !== undefined) prop.minimum = def.minimum;
+ if (def.maximum !== undefined) prop.maximum = def.maximum;
+ if (def.default !== undefined) prop.default = def.default;
+ properties[key] = prop;
+ if (def.required !== false) required.push(key);
+ }
+
+ return {
+ type: 'object',
+ properties,
+ required: required.length > 0 ? required : undefined,
+ };
+}
+
+/** Convert tools to OpenAI function-calling format */
+export function toOpenAiFunctions(tools: ToolDefinition[]): object[] {
+ return tools.map(t => ({
+ type: 'function',
+ function: {
+ name: t.name,
+ description: t.description,
+ parameters: toJsonSchema(t.parameters),
+ },
+ }));
+}
+
+/**
+ * Convert a11y coordinate to mouseClick coordinate.
+ *
+ * NOTE: Empirical testing shows a11y bounds and nut-js mouseClick share the
+ * same coordinate system on most Windows configs (both use screen coords from
+ * the same DPI-awareness level). This function may divide unnecessarily on
+ * some setups. The smart tools (smart_click, invoke_element) pass coords
+ * directly for this reason. Only focus_window uses this helper as a fallback.
+ */
+export function a11yToMouse(physicalCoord: number, ctx: ToolContext): number {
+ const dpiRatio = ctx.getScreenshotScaleFactor() / ctx.getMouseScaleFactor();
+ return Math.round(physicalCoord / dpiRatio);
+}
diff --git a/src/ui-driver.ts b/src/ui-driver.ts
index 35ecc35..df40879 100644
--- a/src/ui-driver.ts
+++ b/src/ui-driver.ts
@@ -612,6 +612,45 @@ export class UIDriver {
});
}
+ // ════════════════════════════════════════════════════════════════════
+ // KEYBOARD-FIRST TYPING (no element lookup)
+ // ════════════════════════════════════════════════════════════════════
+
+ /**
+ * Type text directly at the currently focused element — no element lookup.
+ *
+ * Uses PowerShell SendKeys (Windows) or osascript keystroke (macOS) to type
+ * at whatever UI element currently has keyboard focus. Perfect for use after
+ * Tab navigation has moved focus to the desired field.
+ *
+ * @param text The text to type at the current focus
+ */
+ async typeAtCurrentFocus(text: string): Promise<{ success: boolean; error?: string }> {
+ try {
+ if (this.platform === 'win32') {
+ // Use PowerShell SendKeys to type at current focus
+ // Escape special SendKeys characters: +^%~(){}
+ const escaped = text.replace(/'/g, "''").replace(/[+^%~(){}]/g, '{$&}');
+ await execFileAsync('powershell.exe', [
+ '-NoProfile', '-NonInteractive', '-Command',
+ `Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${escaped}')`,
+ ], { timeout: SCRIPT_TIMEOUT });
+ return { success: true };
+ } else if (this.platform === 'darwin') {
+ // Use osascript to type at current focus via System Events
+ const escaped = text.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
+ await execFileAsync('osascript', [
+ '-e', `tell application "System Events" to keystroke "${escaped}"`,
+ ], { timeout: SCRIPT_TIMEOUT });
+ return { success: true };
+ } else {
+ return { success: false, error: 'typeAtCurrentFocus: unsupported platform' };
+ }
+ } catch (err) {
+ return { success: false, error: `typeAtCurrentFocus failed: ${err instanceof Error ? err.message : String(err)}` };
+ }
+ }
+
// ════════════════════════════════════════════════════════════════════
// POLLING / WAITING
// ════════════════════════════════════════════════════════════════════
@@ -829,7 +868,7 @@ export class UIDriver {
/**
* Run a PowerShell script and parse its JSON output. (Windows only)
*
- * All PowerShell scripts in clawd-cursor follow the convention:
+ * All PowerShell scripts in clawdcursor follow the convention:
* - Output JSON to stdout
* - Include 'error' key on failure
* - Include 'success' key for action scripts
diff --git a/src/ui-knowledge.ts b/src/ui-knowledge.ts
new file mode 100644
index 0000000..0d0f387
--- /dev/null
+++ b/src/ui-knowledge.ts
@@ -0,0 +1,176 @@
+/**
+ * UI Knowledge Layer
+ *
+ * Loads app-specific instruction sets (shortcuts, workflows, selectors)
+ * from a local knowledge base. In production this becomes a Cloudana DB query.
+ *
+ * Think of it as a "blind person's instruction manual" — the AI knows exactly
+ * how to drive an app before it even looks at the screen.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { UI_KNOWLEDGE_DIR } from './paths';
+
+export interface AppWorkflowStep {
+ action: 'pressKey' | 'typeAtFocus' | 'click' | 'wait';
+ key?: string;
+ field?: string;
+ note?: string;
+}
+
+export interface AppWorkflow {
+ description: string;
+ steps: AppWorkflowStep[];
+}
+
+export interface AppKnowledge {
+ app: string;
+ domain: string;
+ shortcuts: Record<string, string>;
+ workflows: Record<string, AppWorkflow>;
+ selectors?: Record<string, string>;
+ notes?: string[];
+}
+
+// Domain → app name mapping
+const DOMAIN_MAP: Record<string, string> = {
+ 'mail.google.com': 'gmail',
+ 'gmail.com': 'gmail',
+ 'app.asana.com': 'asana',
+ 'asana.com': 'asana',
+ 'figma.com': 'figma',
+ 'app.slack.com': 'slack',
+ 'slack.com': 'slack',
+ 'monday.com': 'monday',
+ 'notion.so': 'notion',
+ 'app.posthog.com': 'posthog',
+ 'canva.com': 'canva',
+ 'app.hex.tech': 'hex',
+ 'amplitude.com': 'amplitude',
+ 'app.gusto.com': 'gusto',
+ 'box.com': 'box',
+};
+
+// Local knowledge base path — in production, replace with cloud DB fetch
+const KNOWLEDGE_BASE_DIR = UI_KNOWLEDGE_DIR;
+
+export class UIKnowledgeLayer {
+ private cache: Map<string, AppKnowledge | null> = new Map();
+
+ /**
+ * Detect which app is being used from a URL or window title.
+ */
+ detectApp(urlOrTitle: string): string | null {
+ const lower = urlOrTitle.toLowerCase();
+
+ // Check domain map
+ for (const [domain, appName] of Object.entries(DOMAIN_MAP)) {
+ if (lower.includes(domain)) return appName;
+ }
+
+ // Fallback: title-based detection
+ if (lower.includes('gmail')) return 'gmail';
+ if (lower.includes('slack')) return 'slack';
+ if (lower.includes('figma')) return 'figma';
+ if (lower.includes('asana')) return 'asana';
+ if (lower.includes('notion')) return 'notion';
+
+ return null;
+ }
+
+ /**
+ * Load knowledge for a detected app.
+ * Returns null if no knowledge available for this app.
+ */
+ async loadKnowledge(appName: string): Promise<AppKnowledge | null> {
+ if (this.cache.has(appName)) return this.cache.get(appName) ?? null;
+
+ const filePath = path.join(KNOWLEDGE_BASE_DIR, `${appName}.json`);
+
+ // TODO: Replace with Cloudana DB fetch:
+ // const knowledge = await fetch(`https://api.cloudana.io/ui-knowledge/${appName}`).then(r => r.json());
+
+ if (!fs.existsSync(filePath)) {
+ this.cache.set(appName, null);
+ return null;
+ }
+
+ try {
+ const raw = fs.readFileSync(filePath, 'utf-8');
+ const knowledge = JSON.parse(raw) as AppKnowledge;
+ this.cache.set(appName, knowledge);
+ return knowledge;
+ } catch {
+ this.cache.set(appName, null);
+ return null;
+ }
+ }
+
+ /**
+ * Get the best workflow for a task + app combination.
+ * Returns the workflow steps as a ready-to-inject prompt string.
+ */
+ getWorkflowPrompt(knowledge: AppKnowledge, taskDescription: string): string | null {
+ const taskLower = taskDescription.toLowerCase();
+
+ // Match task to workflow
+ let bestWorkflow: AppWorkflow | null = null;
+ let bestKey = '';
+
+ const workflowMatchMap: Record<string, string[]> = {
+ 'compose_and_send': ['send email', 'compose', 'write email', 'new email', 'email to'],
+ 'reply': ['reply', 'respond'],
+ 'reply_all': ['reply all'],
+ 'forward': ['forward'],
+ 'search': ['search', 'find email', 'look for'],
+ 'archive': ['archive'],
+ 'delete': ['delete email', 'trash'],
+ 'go_to_inbox': ['go to inbox', 'open inbox', 'inbox'],
+ };
+
+ for (const [workflowKey, keywords] of Object.entries(workflowMatchMap)) {
+ if (keywords.some(kw => taskLower.includes(kw)) && knowledge.workflows[workflowKey]) {
+ bestKey = workflowKey;
+ bestWorkflow = knowledge.workflows[workflowKey];
+ break;
+ }
+ }
+
+ if (!bestWorkflow) return null;
+
+ const stepsText = bestWorkflow.steps.map((s, i) => {
+ if (s.action === 'pressKey') return `${i + 1}. pressKey "${s.key}"${s.note ? ` (${s.note})` : ''}`;
+ if (s.action === 'typeAtFocus') return `${i + 1}. typeAtFocus — type the ${s.field}${s.note ? ` (${s.note})` : ''}`;
+ return `${i + 1}. ${s.action}`;
+ }).join('\n');
+
+ return `APP KNOWLEDGE — ${knowledge.app.toUpperCase()} (${knowledge.domain}):
+Use this EXACT sequence for "${bestWorkflow.description}":
+${stepsText}
+
+Key shortcuts available: ${Object.entries(knowledge.shortcuts).slice(0, 10).map(([k, v]) => `${k}=${v}`).join(', ')}
+${knowledge.notes ? `Notes: ${knowledge.notes.join('; ')}` : ''}
+
+Follow this sequence precisely. Do not try to click UI elements — use the keyboard sequence above.`;
+ }
+
+ /**
+ * Full context string to inject into the ReAct prompt when an app is detected.
+ */
+ async getContextForTask(taskDescription: string, urlOrTitle: string): Promise<string | null> {
+ const appName = this.detectApp(urlOrTitle);
+ if (!appName) return null;
+
+ const knowledge = await this.loadKnowledge(appName);
+ if (!knowledge) return null;
+
+ const workflowPrompt = this.getWorkflowPrompt(knowledge, taskDescription);
+ if (!workflowPrompt) return null;
+
+ console.log(` 📚 UI Knowledge: loaded ${appName} instruction set`);
+ return workflowPrompt;
+ }
+}
+
+export const uiKnowledge = new UIKnowledgeLayer();
diff --git a/src/verifiers.ts b/src/verifiers.ts
new file mode 100644
index 0000000..c1e5a8f
--- /dev/null
+++ b/src/verifiers.ts
@@ -0,0 +1,476 @@
+/**
+ * Verifiers — Ground-truth task completion verification.
+ *
+ * v0.7.0 redesign: LLM-as-primary-verifier.
+ *
+ * Instead of hardcoded regex patterns that silently pass on unrecognized tasks,
+ * the text LLM reads the actual a11y tree state and makes a semantic judgment
+ * about whether the task was FULLY completed. Evidence is required — vague
+ * confirmations are rejected.
+ *
+ * Fast-path heuristics still run first for trivial checks (app open, clipboard),
+ * but the LLM verifier is the authoritative fallback for anything semantic.
+ *
+ * Key design principles:
+ * - Default is UNCERTAIN, not PASS. Unrecognized tasks go to LLM, never auto-pass.
+ * - Error passthrough is FAIL, not PASS. Broken verifiers are never silent.
+ * - LLM must cite specific screen evidence to return PASS.
+ * - All verification attempts are logged in full detail for debugging.
+ */
+
+import { AccessibilityBridge } from './accessibility';
+import type { PipelineConfig } from './providers';
+
+export interface VerifyResult {
+ pass: boolean;
+ method: string;
+ detail: string;
+ confidence: number; // 0-1
+ evidence?: string; // exact text/state cited as proof
+ attemptLog: VerifyAttempt[]; // full audit trail of every check run
+}
+
+export interface VerifyAttempt {
+ checkName: string;
+ pass: boolean;
+ confidence: number;
+ detail: string;
+ durationMs: number;
+ error?: string;
+}
+
+export class TaskVerifier {
+ private pipelineConfig: PipelineConfig | null = null;
+
+ constructor(
+ private a11y: AccessibilityBridge,
+ pipelineConfig?: PipelineConfig,
+ ) {
+ this.pipelineConfig = pipelineConfig ?? null;
+ }
+
+ /**
+ * Run all applicable verifiers for the given task.
+ *
+ * Strategy:
+ * 1. Fast-path heuristics (zero LLM cost) for trivial cases.
+ * 2. If no fast-path gave high-confidence result, run LLM verifier.
+ * 3. Any failure from any check wins over passes.
+ * 4. Full attempt log is always returned for logging.
+ */
+ async verify(task: string, readClipboard?: () => Promise<string>): Promise<VerifyResult> {
+ const taskLower = task.toLowerCase();
+ const attempts: VerifyAttempt[] = [];
+ const fastResults: VerifyResult[] = [];
+
+ // ── Fast-path heuristics ─────────────────────────────────────────────────
+
+ // App-open check
+ if (/^open\s/i.test(taskLower) && !/\band\b/i.test(taskLower)) {
+ const r = await this.timed('app_open_check', () => this.verifyAppOpen(task));
+ attempts.push(r.attempt);
+ if (r.result.confidence >= 0.8) fastResults.push(r.result);
+ }
+
+ // Clipboard copy check
+ if (/\bcopy\b/i.test(taskLower) && readClipboard) {
+ const r = await this.timed('clipboard_check', () => this.verifyClipboardHasContent(readClipboard));
+ attempts.push(r.attempt);
+ if (r.result.confidence >= 0.8) fastResults.push(r.result);
+ }
+
+ // Browser navigation check
+ if (/^(go to|navigate to|open|visit|browse)\s+https?:\/\//i.test(taskLower)) {
+ const r = await this.timed('navigation_check', () => this.verifyNavigation(task));
+ attempts.push(r.attempt);
+ if (r.result.confidence >= 0.75) fastResults.push(r.result);
+ }
+
+ // If a fast-path check already failed with high confidence, short-circuit
+ const highConfidenceFailure = fastResults.find(r => !r.pass && r.confidence >= 0.85);
+ if (highConfidenceFailure) {
+ return { ...highConfidenceFailure, attemptLog: attempts };
+ }
+
+ // If a fast-path check passed with high confidence AND task is simple, accept it
+ const highConfidencePass = fastResults.find(r => r.pass && r.confidence >= 0.85);
+ const isComplexTask = /\band\b|\bthen\b|,/i.test(taskLower);
+ if (highConfidencePass && !isComplexTask) {
+ return { ...highConfidencePass, attemptLog: attempts };
+ }
+
+ // ── LLM Verifier (semantic, evidence-based) ──────────────────────────────
+
+ if (this.pipelineConfig?.layer2.enabled) {
+ const r = await this.timed('llm_semantic_verify', () => this.verifyWithLLM(task));
+ attempts.push(r.attempt);
+ return { ...r.result, attemptLog: attempts };
+ }
+
+ // ── Fallback: no LLM available, read a11y tree and do best-effort check ──
+
+ const r = await this.timed('a11y_fallback_check', () => this.verifyWithA11yOnly(task));
+ attempts.push(r.attempt);
+ return { ...r.result, attemptLog: attempts };
+ }
+
+ // ── LLM Verifier ────────────────────────────────────────────────────────────
+
+ private async verifyWithLLM(task: string): Promise<VerifyResult> {
+ try {
+ // Read the current a11y tree
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ const screenContext = await this.a11y.getScreenContext(activeWin?.processId).catch(() => null);
+ const focusedEl = await this.a11y.getFocusedElement().catch(() => null);
+
+ const stateLines: string[] = [];
+ if (activeWin) {
+ stateLines.push(`Active window: "${activeWin.title}" (process: ${activeWin.processName})`);
+ }
+ if (focusedEl) {
+ const val = focusedEl.value ? ` | value: "${focusedEl.value.substring(0, 200)}"` : '';
+ stateLines.push(`Focused element: ${focusedEl.name || '(unnamed)'}${val}`);
+ }
+ if (screenContext) {
+ // Include a trimmed version of the a11y tree — cap at 2000 chars to stay within token budget
+ stateLines.push(`\nAccessibility tree (truncated):\n${screenContext.substring(0, 2000)}`);
+ }
+
+ const screenState = stateLines.length > 0
+ ? stateLines.join('\n')
+ : 'Screen state unavailable.';
+
+ const prompt = `You are a strict task completion verifier. Your ONLY job is to determine if a desktop task was FULLY completed based on the current screen state.
+
+TASK: "${task}"
+
+CURRENT SCREEN STATE:
+${screenState}
+
+Answer in this exact JSON format:
+{
+ "verdict": "PASS" | "FAIL" | "UNCERTAIN",
+ "evidence": "",
+ "confidence": <0.0-1.0>,
+ "reasoning": ""
+}
+
+STRICT RULES:
+- PASS requires SPECIFIC evidence you can cite from the screen state above. "App is open" alone is NOT evidence for a task that requires writing, filling, or sending something.
+- FAIL if the required outcome (text written, form filled, message sent, file saved) is not clearly visible in the state.
+- UNCERTAIN if the screen state does not have enough information to judge — do NOT default to PASS.
+- For writing tasks: the actual content must be visible in the accessibility tree value field.
+- For send/submit tasks: the compose/form window must be GONE, not still open.
+- Confidence below 0.6 = UNCERTAIN.`;
+
+ const response = await this.callTextModel(prompt);
+
+ let parsed: any;
+ try {
+ parsed = JSON.parse(response);
+ } catch {
+ // Try to extract JSON from response
+ const match = response.match(/\{[\s\S]*\}/);
+ if (match) {
+ try { parsed = JSON.parse(match[0]); } catch { parsed = null; }
+ }
+ }
+
+ if (!parsed || !parsed.verdict) {
+ return {
+ pass: false,
+ method: 'llm_semantic_verify',
+ detail: `LLM returned unparseable response: ${response.substring(0, 100)}`,
+ confidence: 0.1,
+ attemptLog: [],
+ };
+ }
+
+ const verdict = String(parsed.verdict).toUpperCase();
+ const confidence = typeof parsed.confidence === 'number'
+ ? Math.max(0, Math.min(1, parsed.confidence))
+ : 0.5;
+ const evidence = String(parsed.evidence || '').substring(0, 300);
+ const reasoning = String(parsed.reasoning || '').substring(0, 200);
+
+ const pass = verdict === 'PASS' && confidence >= 0.65;
+ const detail = `[${verdict}] ${reasoning} | evidence: ${evidence}`;
+
+ return {
+ pass,
+ method: 'llm_semantic_verify',
+ detail,
+ confidence,
+ evidence,
+ attemptLog: [],
+ };
+
+ } catch (err) {
+ // LLM call failed — do NOT silently pass. Return fail with error detail.
+ return {
+ pass: false,
+ method: 'llm_semantic_verify',
+ detail: `LLM verifier error: ${String(err).substring(0, 200)}`,
+ confidence: 0.0,
+ attemptLog: [],
+ };
+ }
+ }
+
+ // ── A11y-only fallback (no LLM) ─────────────────────────────────────────────
+
+ private async verifyWithA11yOnly(task: string): Promise<VerifyResult> {
+ try {
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ const focused = await this.a11y.getFocusedElement().catch(() => null);
+ const taskLower = task.toLowerCase();
+
+ // For writing tasks: require non-trivial content in focused element
+ if (/\b(write|type|compose|draft|enter)\b/i.test(taskLower)) {
+ if (focused?.value && focused.value.trim().length > 20) {
+ return {
+ pass: true,
+ method: 'a11y_fallback',
+ detail: `Focused element has ${focused.value.length} chars: "${focused.value.substring(0, 80)}..."`,
+ confidence: 0.7,
+ evidence: focused.value.substring(0, 80),
+ attemptLog: [],
+ };
+ }
+ return {
+ pass: false,
+ method: 'a11y_fallback',
+ detail: `Writing task but focused element empty or short: "${focused?.value?.substring(0, 50) || '(none)'}"`,
+ confidence: 0.75,
+ attemptLog: [],
+ };
+ }
+
+ // For send/submit tasks: check compose window is gone
+ if (/\b(send|submit|click send|click submit)\b/i.test(taskLower)) {
+ const title = (activeWin?.title || '').toLowerCase();
+ if (/compose|new message|untitled|draft/i.test(title)) {
+ return {
+ pass: false,
+ method: 'a11y_fallback',
+ detail: `Compose window still open: "${activeWin?.title}"`,
+ confidence: 0.85,
+ attemptLog: [],
+ };
+ }
+ }
+
+ // General: return uncertain rather than pass — LLM unavailable
+ return {
+ pass: false,
+ method: 'a11y_fallback',
+ detail: `No LLM available for semantic verification. A11y state: window="${activeWin?.title || 'none'}", focused="${focused?.name || 'none'}". Marking uncertain/fail to avoid false positive.`,
+ confidence: 0.4,
+ attemptLog: [],
+ };
+ } catch (err) {
+ return {
+ pass: false,
+ method: 'a11y_fallback_error',
+ detail: `A11y fallback verifier error: ${String(err).substring(0, 200)}`,
+ confidence: 0.0,
+ attemptLog: [],
+ };
+ }
+ }
+
+ // ── Fast-path heuristics ────────────────────────────────────────────────────
+
+ private async verifyAppOpen(_task: string): Promise<VerifyResult> {
+ try {
+ const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+ if (activeWin?.processName) {
+ return {
+ pass: true,
+ method: 'app_open_check',
+ detail: `Active window: "${activeWin.title}" (${activeWin.processName})`,
+ confidence: 0.85,
+ evidence: `${activeWin.processName} window open: "${activeWin.title}"`,
+ attemptLog: [],
+ };
+ }
+ return {
+ pass: false,
+ method: 'app_open_check',
+ detail: 'No active window detected after open command',
+ confidence: 0.8,
+ attemptLog: [],
+ };
+ } catch (err) {
+ return {
+ pass: false,
+ method: 'app_open_check_error',
+ detail: `App open check error: ${String(err).substring(0, 100)}`,
+ confidence: 0.0,
+ attemptLog: [],
+ };
+ }
+ }
+
+  /**
+   * Fast-path verifier for copy tasks: passes when the clipboard holds more
+   * than 5 non-whitespace-trimmed characters.
+   * @param readClipboard - injected clipboard reader (may resolve null/empty).
+   * Note: restored the generic type parameters (`Promise<string | null>`,
+   * `Promise<VerifyResult>`) that were stripped to bare `Promise`; the
+   * `clip?.` optional chaining below shows the reader can yield a nullish
+   * value — TODO confirm exact nullable type against the caller.
+   */
+  private async verifyClipboardHasContent(readClipboard: () => Promise<string | null>): Promise<VerifyResult> {
+    try {
+      const clip = await readClipboard();
+      if (clip && clip.trim().length > 5) {
+        return {
+          pass: true,
+          method: 'clipboard_check',
+          detail: `Clipboard has ${clip.length} chars: "${clip.substring(0, 80)}..."`,
+          confidence: 0.92,
+          evidence: clip.substring(0, 80),
+          attemptLog: [],
+        };
+      }
+      return {
+        pass: false,
+        method: 'clipboard_check',
+        detail: `Clipboard empty or too short: "${clip?.substring(0, 30) || '(empty)'}"`,
+        confidence: 0.9,
+        attemptLog: [],
+      };
+    } catch (err) {
+      return {
+        pass: false,
+        method: 'clipboard_check_error',
+        detail: `Clipboard read error: ${String(err).substring(0, 100)}`,
+        confidence: 0.0,
+        attemptLog: [],
+      };
+    }
+  }
+
+  /**
+   * Fast-path verifier for navigation tasks.
+   * Extracts the first URL from the task, then checks that (a) the active
+   * process is a known browser and (b) the window title mentions the expected
+   * domain. Browser-active-but-domain-unconfirmed still passes at low
+   * confidence. Restored the stripped `Promise<VerifyResult>` return type.
+   */
+  private async verifyNavigation(task: string): Promise<VerifyResult> {
+    try {
+      const urlMatch = task.match(/https?:\/\/[^\s]+/i);
+      const expectedDomain = urlMatch
+        ? new URL(urlMatch[0]).hostname.replace(/^www\./, '')
+        : null;
+
+      const activeWin = await this.a11y.getActiveWindow().catch(() => null);
+      const title = (activeWin?.title || '').toLowerCase();
+      const pn = (activeWin?.processName || '').toLowerCase();
+
+      if (!/msedge|chrome|firefox/i.test(pn)) {
+        return {
+          pass: false,
+          method: 'navigation_check',
+          detail: `Expected browser but active process is: ${activeWin?.processName || 'none'}`,
+          confidence: 0.8,
+          attemptLog: [],
+        };
+      }
+
+      // Strip the trailing TLD (".com", ".org", ".io", ...) so the bare site
+      // name can be matched inside the window title. Generalized from two
+      // hardcoded `.replace('.com')/.replace('.org')` calls.
+      const bareName = expectedDomain ? expectedDomain.replace(/\.[a-z]{2,}$/i, '') : null;
+      if (expectedDomain && bareName && title.includes(bareName)) {
+        return {
+          pass: true,
+          method: 'navigation_check',
+          detail: `Browser title matches expected domain "${expectedDomain}": "${activeWin?.title}"`,
+          confidence: 0.85,
+          evidence: `title: "${activeWin?.title}"`,
+          attemptLog: [],
+        };
+      }
+
+      // Browser focused but title does not confirm the domain — weak pass.
+      return {
+        pass: true,
+        method: 'navigation_check',
+        detail: `Browser is active: "${activeWin?.title}" — domain match not confirmed`,
+        confidence: 0.6,
+        attemptLog: [],
+      };
+    } catch (err) {
+      // Covers malformed URLs thrown by `new URL` as well as a11y failures.
+      return {
+        pass: false,
+        method: 'navigation_check_error',
+        detail: `Navigation check error: ${String(err).substring(0, 100)}`,
+        confidence: 0.0,
+        attemptLog: [],
+      };
+    }
+  }
+
+ // ── LLM call ────────────────────────────────────────────────────────────────
+
+  /**
+   * Send a single-turn prompt to the configured text model and return the raw
+   * completion string (expected to be JSON).
+   * Routing: OpenAI-compatible endpoints (flagged by provider, or localhost /
+   * Ollama port 11434) use `/chat/completions` with `response_format:
+   * json_object`; everything else is treated as Anthropic `/messages`, where a
+   * prefilled `{` assistant turn forces JSON output and is re-prepended to the
+   * reply. Both paths abort after 10s.
+   * @throws when no pipeline config is loaded or the API returns an error.
+   * Note: restored the `Promise<string>` return type stripped to bare `Promise`.
+   */
+  private async callTextModel(prompt: string): Promise<string> {
+    if (!this.pipelineConfig) throw new Error('No pipeline config');
+
+    const { model, baseUrl } = this.pipelineConfig.layer2;
+    const apiKey = this.pipelineConfig.apiKey;
+    const provider = this.pipelineConfig.provider;
+
+    if (provider.openaiCompat || baseUrl.includes('localhost') || baseUrl.includes('11434')) {
+      const res = await fetch(`${baseUrl}/chat/completions`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json', ...provider.authHeader(apiKey) },
+        body: JSON.stringify({
+          model,
+          max_tokens: 300,
+          temperature: 0,
+          response_format: { type: 'json_object' },
+          messages: [{ role: 'user', content: prompt }],
+        }),
+        signal: AbortSignal.timeout(10000),
+      });
+      const data = await res.json() as any;
+      if (data.error) throw new Error(data.error.message ?? JSON.stringify(data.error));
+      return data.choices?.[0]?.message?.content ?? '';
+    } else {
+      // Anthropic: prefill an assistant "{" turn to coerce JSON-only output.
+      const res = await fetch(`${baseUrl}/messages`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json', ...provider.authHeader(apiKey) },
+        body: JSON.stringify({
+          model,
+          max_tokens: 300,
+          messages: [
+            { role: 'user', content: prompt },
+            { role: 'assistant', content: '{' },
+          ],
+        }),
+        signal: AbortSignal.timeout(10000),
+      });
+      const data = await res.json() as any;
+      if (data.error) throw new Error(data.error.message ?? JSON.stringify(data.error));
+      const text = data.content?.[0]?.text ?? '';
+      // The model's reply continues after the prefilled "{": re-attach it.
+      return text.startsWith('{') ? text : '{' + text;
+    }
+  }
+
+ // ── Timing wrapper ───────────────────────────────────────────────────────────
+
+  /**
+   * Run a verifier check, capturing its duration and converting any thrown
+   * error into a zero-confidence failure result.
+   * @param checkName - label used for the attempt log and error method suffix.
+   * @param fn - the check to execute.
+   * @returns the check's result plus a timing/attempt record.
+   * Note: restored `fn`'s stripped generic — `() => Promise<VerifyResult>` —
+   * matching the intact return generic on this signature.
+   */
+  private async timed(
+    checkName: string,
+    fn: () => Promise<VerifyResult>,
+  ): Promise<{ result: VerifyResult; attempt: VerifyAttempt }> {
+    const t0 = Date.now();
+    let result: VerifyResult;
+    let error: string | undefined;
+    try {
+      result = await fn();
+    } catch (err) {
+      error = String(err).substring(0, 200);
+      result = {
+        pass: false,
+        method: checkName + '_error',
+        detail: `Unexpected error in ${checkName}: ${error}`,
+        confidence: 0.0,
+        attemptLog: [],
+      };
+    }
+    return {
+      result,
+      attempt: {
+        checkName,
+        pass: result.pass,
+        confidence: result.confidence,
+        detail: result.detail,
+        durationMs: Date.now() - t0,
+        // Only attach `error` when the check actually threw.
+        ...(error && { error }),
+      },
+    };
+  }
+}
diff --git a/src/workspace-state.ts b/src/workspace-state.ts
new file mode 100644
index 0000000..93cf6f0
--- /dev/null
+++ b/src/workspace-state.ts
@@ -0,0 +1,164 @@
+/**
+ * WorkspaceState — tracks the desktop workspace across task steps.
+ * Maintains windows, browser tabs, clipboard, and task artifacts.
+ * Updated after every action to enable cross-app orchestration and progress detection.
+ */
+
+// A desktop window as observed via the accessibility bridge.
+export interface TrackedWindow {
+  processName: string; // executable name, e.g. "msedge"
+  title: string; // current window title
+  processId: number; // OS process id — used as the map key in WorkspaceState
+  isMinimized: boolean;
+  lastSeenAt: number; // epoch ms of the last updateWindows() sighting
+  url?: string; // for browser windows
+  tabId?: string; // for browser tabs
+}
+
+// Last observed clipboard contents and when/where they changed.
+export interface ClipboardSnapshot {
+  text: string;
+  changedAt: number; // epoch ms; 0 means never observed
+  source?: string; // which app/action produced it
+}
+
+export class WorkspaceState {
+  activeWindowId?: number; // PID of active window
+  // Note: restored the stripped generic — `Map<number, TrackedWindow>` —
+  // keyed by PID (cf. the intact `Array<{...}>` param on updateWindows).
+  windows: Map<number, TrackedWindow> = new Map();
+  clipboard: ClipboardSnapshot = { text: '', changedAt: 0 };
+  lastStateHash: string = ''; // for no-progress detection
+  stateHistory: string[] = []; // rolling window of last 10 state hashes
+
+  /**
+   * Update window list from accessibility bridge data.
+   * Preserves previously-learned url/tabId for windows that persist, and
+   * prunes windows not seen for 60 seconds.
+   */
+  updateWindows(windowList: Array<{ processName: string; title: string; processId: number; isMinimized: boolean }>): void {
+    const now = Date.now();
+    for (const w of windowList) {
+      this.windows.set(w.processId, {
+        ...w,
+        lastSeenAt: now,
+        url: this.windows.get(w.processId)?.url,
+        tabId: this.windows.get(w.processId)?.tabId,
+      });
+    }
+    // Prune windows not seen in 60s
+    for (const [pid, w] of this.windows) {
+      if (now - w.lastSeenAt > 60000) this.windows.delete(pid);
+    }
+  }
+
+  /**
+   * Set the active window by PID.
+   */
+  setActiveWindow(pid: number): void {
+    this.activeWindowId = pid;
+  }
+
+  /**
+   * Update browser tab info for a tracked window; no-op for unknown PIDs.
+   */
+  updateBrowserTab(pid: number, url: string, tabId?: string): void {
+    const w = this.windows.get(pid);
+    if (w) {
+      w.url = url;
+      w.tabId = tabId;
+    }
+  }
+
+  /**
+   * Update clipboard state; only records a snapshot when the text changed.
+   */
+  updateClipboard(text: string, source?: string): void {
+    if (text !== this.clipboard.text) {
+      this.clipboard = { text, changedAt: Date.now(), source };
+    }
+  }
+
+  /**
+   * Find a non-minimized window whose process name contains the given string
+   * (case-insensitive substring match).
+   */
+  findWindow(processName: string): TrackedWindow | undefined {
+    const lower = processName.toLowerCase();
+    for (const w of this.windows.values()) {
+      if (w.processName.toLowerCase().includes(lower) && !w.isMinimized) {
+        return w;
+      }
+    }
+    return undefined;
+  }
+
+  /**
+   * Find a browser window whose tracked URL contains the given substring
+   * (case-insensitive).
+   */
+  findBrowserTab(urlSubstring: string): TrackedWindow | undefined {
+    const lower = urlSubstring.toLowerCase();
+    for (const w of this.windows.values()) {
+      if (w.url && w.url.toLowerCase().includes(lower)) {
+        return w;
+      }
+    }
+    return undefined;
+  }
+
+  /**
+   * Compute a state hash for no-progress detection.
+   * Combines: active process + window title (50 chars) + URL (80 chars) +
+   * clipboard length. A '|'-joined string, not a cryptographic hash.
+   */
+  computeStateHash(): string {
+    const activeWin = this.activeWindowId ? this.windows.get(this.activeWindowId) : undefined;
+    const hash = [
+      activeWin?.processName || '?',
+      activeWin?.title?.substring(0, 50) || '?',
+      activeWin?.url?.substring(0, 80) || '',
+      this.clipboard.text.length.toString(),
+    ].join('|');
+    return hash;
+  }
+
+  /**
+   * Check if state has changed since last snapshot.
+   * Call after each action to detect stalls.
+   * Returns true if state changed, false if stuck.
+   */
+  checkProgress(): boolean {
+    const current = this.computeStateHash();
+    const changed = current !== this.lastStateHash;
+    this.lastStateHash = current;
+    this.stateHistory.push(current);
+    if (this.stateHistory.length > 10) this.stateHistory.shift();
+    return changed;
+  }
+
+  /**
+   * Count how many consecutive prior steps had the same state as the latest.
+   * Useful for detecting deep stalls (e.g., clicking same button 10 times).
+   * Capped by the 10-entry history window.
+   */
+  getStallCount(): number {
+    if (this.stateHistory.length < 2) return 0;
+    const current = this.stateHistory[this.stateHistory.length - 1];
+    let count = 0;
+    for (let i = this.stateHistory.length - 2; i >= 0; i--) {
+      if (this.stateHistory[i] === current) count++;
+      else break;
+    }
+    return count;
+  }
+
+  /**
+   * Get a one-line summary string for logging/LLM context.
+   */
+  getSummary(): string {
+    const activeWin = this.activeWindowId ? this.windows.get(this.activeWindowId) : undefined;
+    const windowCount = this.windows.size;
+    const clip = this.clipboard.text ? `${this.clipboard.text.length} chars` : 'empty';
+    return `Active: ${activeWin?.processName || 'none'} "${activeWin?.title?.substring(0, 40) || ''}" | Windows: ${windowCount} | Clipboard: ${clip}`;
+  }
+
+  /**
+   * Reset progress tracking for a new task.
+   */
+  reset(): void {
+    this.lastStateHash = '';
+    this.stateHistory = [];
+    // Keep windows and clipboard — they persist across tasks
+  }
+}
diff --git a/tests/openclaw-credentials.test.ts b/tests/credentials.test.ts
similarity index 91%
rename from tests/openclaw-credentials.test.ts
rename to tests/credentials.test.ts
index e41064c..6c2b851 100644
--- a/tests/openclaw-credentials.test.ts
+++ b/tests/credentials.test.ts
@@ -2,14 +2,14 @@ import fs from 'fs';
import os from 'os';
import path from 'path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
-import { resolveApiConfig } from '../src/openclaw-credentials';
+import { resolveApiConfig } from '../src/credentials';
function writeJson(filePath: string, value: unknown) {
fs.mkdirSync(path.dirname(filePath), { recursive: true });
fs.writeFileSync(filePath, JSON.stringify(value, null, 2), 'utf-8');
}
-describe.sequential('openclaw credential resolution', () => {
+describe.sequential('credential resolution', () => {
const originalCwd = process.cwd();
const originalHome = os.homedir();
@@ -18,7 +18,7 @@ describe.sequential('openclaw credential resolution', () => {
let tempCwd: string;
beforeEach(() => {
- tempRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'clawd-credentials-'));
+ tempRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'clawdcursor-credentials-'));
tempHome = path.join(tempRoot, 'home');
tempCwd = path.join(tempRoot, 'project');
fs.mkdirSync(tempHome, { recursive: true });
@@ -57,14 +57,14 @@ describe.sequential('openclaw credential resolution', () => {
});
const resolved = resolveApiConfig();
- expect(resolved.source).toBe('openclaw');
+ expect(resolved.source).toBe('external');
expect(resolved.apiKey).toBe('moonshot-real-key');
expect(resolved.baseUrl).toBe('https://api.moonshot.ai/v1');
expect(resolved.visionApiKey).toBe('moonshot-real-key');
expect(resolved.visionBaseUrl).toBe('https://api.moonshot.ai/v1');
});
- it('prefers doctor-configured provider from .clawd-config.json', () => {
+ it('prefers doctor-configured provider from .clawdcursor-config.json', () => {
writeJson(path.join(tempHome, '.openclaw', 'agents', 'main', 'agent', 'auth-profiles.json'), {
anthropic: {
apiKey: 'anthropic-auth-profile-key',
@@ -93,12 +93,12 @@ describe.sequential('openclaw credential resolution', () => {
},
});
- writeJson(path.join(tempCwd, '.clawd-config.json'), {
+ writeJson(path.join(tempCwd, '.clawdcursor-config.json'), {
provider: 'anthropic',
});
const resolved = resolveApiConfig();
- expect(resolved.source).toBe('openclaw');
+ expect(resolved.source).toBe('external');
expect(resolved.provider).toBe('anthropic');
expect(resolved.apiKey).toBe('anthropic-auth-profile-key');
expect(resolved.baseUrl).toBe('https://api.anthropic.com/v1');
@@ -131,7 +131,7 @@ describe.sequential('openclaw credential resolution', () => {
});
const resolved = resolveApiConfig();
- expect(resolved.source).toBe('openclaw');
+ expect(resolved.source).toBe('external');
expect(resolved.provider).toBe('anthropic');
expect(resolved.apiKey).toBe('anthropic-auth-profile-key');
expect(resolved.baseUrl).toBe('https://api.anthropic.com/v1');
@@ -160,7 +160,7 @@ describe.sequential('openclaw credential resolution', () => {
it('supports provider-scoped env keys for arbitrary providers', () => {
process.env.MY_CUSTOM_PROVIDER_API_KEY = 'custom-provider-key';
- writeJson(path.join(tempCwd, '.clawd-config.json'), {
+ writeJson(path.join(tempCwd, '.clawdcursor-config.json'), {
provider: 'my-custom-provider',
});
diff --git a/tests/test-loop.sh b/tests/test-loop.sh
new file mode 100644
index 0000000..809189e
--- /dev/null
+++ b/tests/test-loop.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# Test loop: send email task, 60s timeout, inspect logs after each
+RESULTS_DIR="$HOME/.clawdcursor/test-results"
+mkdir -p "$RESULTS_DIR"
+TASK='open outlook and send an email to test@hotmail.com saying hello'
+TIMEOUT=60
+TOTAL=10
+LOGFILE=$(ls -t /c/Users/amr_d/AppData/Local/Temp/claude/C--Users-amr-d/tasks/*.output 2>/dev/null | head -1)
+
+echo "=== Test Loop: $TOTAL iterations, ${TIMEOUT}s timeout ===" | tee "$RESULTS_DIR/summary.txt"
+echo "Agent log: $LOGFILE"
+echo ""
+
+for i in $(seq 1 $TOTAL); do
+ echo "──── TEST $i/$TOTAL — $(date +%H:%M:%S) ────"
+
+ # Make sure agent is idle — abort + wait
+ curl -s -X POST http://127.0.0.1:3847/abort > /dev/null 2>&1
+ sleep 2
+
+ # Verify idle
+ for retry in 1 2 3 4 5; do
+ STATUS=$(curl -s http://127.0.0.1:3847/status 2>/dev/null)
+ IS_IDLE=$(echo "$STATUS" | grep -o '"status":"idle"')
+ if [ -n "$IS_IDLE" ]; then
+ break
+ fi
+ echo " Waiting for idle (attempt $retry)..."
+ curl -s -X POST http://127.0.0.1:3847/abort > /dev/null 2>&1
+ sleep 3
+ done
+
+ if [ -z "$IS_IDLE" ]; then
+ echo " Agent stuck — skipping"
+ echo "TEST $i: SKIP (agent stuck)" >> "$RESULTS_DIR/summary.txt"
+ continue
+ fi
+
+ # Record log position before test
+ LOG_LINES_BEFORE=0
+ if [ -n "$LOGFILE" ] && [ -f "$LOGFILE" ]; then
+ LOG_LINES_BEFORE=$(wc -l < "$LOGFILE")
+ fi
+
+ # Send task
+ RESPONSE=$(curl -s -X POST http://127.0.0.1:3847/task \
+ -H "Content-Type: application/json" \
+ -d "{\"task\": \"$TASK\"}")
+
+ ACCEPTED=$(echo "$RESPONSE" | grep -o '"accepted":true')
+ if [ -z "$ACCEPTED" ]; then
+ echo " NOT ACCEPTED: $RESPONSE"
+ echo "TEST $i: SKIP (not accepted)" >> "$RESULTS_DIR/summary.txt"
+ sleep 2
+ continue
+ fi
+
+ # Poll for completion
+ START_TIME=$(date +%s)
+ RESULT="TIMEOUT"
+
+ while true; do
+ sleep 3
+ ELAPSED=$(( $(date +%s) - START_TIME ))
+
+ STATUS=$(curl -s http://127.0.0.1:3847/status 2>/dev/null)
+ AGENT_STATUS=$(echo "$STATUS" | grep -o '"status":"[^"]*"' | head -1 | cut -d'"' -f4)
+
+ if [ "$AGENT_STATUS" = "idle" ] && [ $ELAPSED -gt 5 ]; then
+ RESULT="COMPLETED"
+ break
+ fi
+
+ if [ $ELAPSED -ge $TIMEOUT ]; then
+ curl -s -X POST http://127.0.0.1:3847/abort > /dev/null 2>&1
+ RESULT="TIMEOUT"
+ sleep 3
+ break
+ fi
+ done
+
+ DURATION=$(( $(date +%s) - START_TIME ))
+
+ # Extract this test's logs
+ if [ -n "$LOGFILE" ] && [ -f "$LOGFILE" ]; then
+ tail -n +$((LOG_LINES_BEFORE + 1)) "$LOGFILE" > "$RESULTS_DIR/test${i}.log" 2>/dev/null
+ # Check for success/failure in log
+ HAS_SUCCESS=$(grep -c "SUCCESS" "$RESULTS_DIR/test${i}.log" 2>/dev/null || echo 0)
+ HAS_FAILED=$(grep -c "FAILED" "$RESULTS_DIR/test${i}.log" 2>/dev/null || echo 0)
+ if [ "$HAS_SUCCESS" -gt 0 ]; then
+ RESULT="SUCCESS"
+ elif [ "$HAS_FAILED" -gt 0 ] && [ "$RESULT" != "TIMEOUT" ]; then
+ RESULT="FAILED"
+ fi
+ fi
+
+ echo " => $RESULT (${DURATION}s)"
+ echo "TEST $i: $RESULT (${DURATION}s)" >> "$RESULTS_DIR/summary.txt"
+
+ # Print key lines from this test's log
+ if [ -f "$RESULTS_DIR/test${i}.log" ]; then
+ echo " --- Key log lines ---"
+ grep -E "Layer [23]|Subtask|SUCCESS|FAILED|TIMEOUT|stuck|error|visual hint|Clicked|max steps" "$RESULTS_DIR/test${i}.log" | tail -15
+ echo " ---"
+ fi
+
+ echo ""
+ sleep 2
+done
+
+echo ""
+echo "========== SUMMARY =========="
+cat "$RESULTS_DIR/summary.txt"
diff --git a/test-shortcuts.js b/tests/test-shortcuts.js
similarity index 100%
rename from test-shortcuts.js
rename to tests/test-shortcuts.js
diff --git a/vitest.config.ts b/vitest.config.ts
new file mode 100644
index 0000000..02b68b0
--- /dev/null
+++ b/vitest.config.ts
@@ -0,0 +1,13 @@
+import { defineConfig } from 'vitest/config';
+
+export default defineConfig({
+  test: {
+    globals: true,
+    environment: 'node',
+    // Cover both co-located suites and the top-level tests/ directory:
+    // this same change set renames a suite to tests/credentials.test.ts,
+    // which the original src/__tests__-only glob would never run.
+    include: ['src/__tests__/**/*.test.ts', 'tests/**/*.test.ts'],
+    testTimeout: 15000,
+    fakeTimers: {
+      // Tests that need real timers opt out with vi.useRealTimers()
+    },
+  },
+});