From 1544e1b09afb9a331c9c89caa4ba96050b556fa4 Mon Sep 17 00:00:00 2001 From: Najeem Laaroussi Date: Thu, 23 Apr 2026 19:00:13 +0100 Subject: [PATCH 1/2] Major upgrade: memory system, Firefox default, voice config, and system control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Core changes: - Persistent 3-tier conversation memory: per-message SQLite logging, rolling session summaries (every 5 messages), end-of-session Haiku compaction, and cross-session context injected into every LLM call on reconnect - Memory extraction: Haiku extracts facts/preferences/decisions after every exchange and stores them as searchable memories - Edge TTS migration: replaced Fish Audio with edge-tts (free, no API key). Voice defaults to en-GB-RyanNeural - Firefox set as default browser across all browse actions - "pull up" is now a universal Firefox command — any descriptive phrase opens Firefox search; bare app names (1-2 words, no articles) still switch the app - Weather location corrected from St. 
Petersburg FL to Leicester UK (°C) - Calendar and Mail no longer auto-launch on startup — only used if already open - Model config externalised: JARVIS_FAST_MODEL and JARVIS_SMART_MODEL env vars - Local LLM support via LM Studio (LOCAL_LLM_MODEL) with Anthropic fallback - system_control.py: AppleScript-based macOS system automation (app switching, volume, keyboard shortcuts, window management) - dispatch_registry.py: blueprint column, increased response storage to 10k chars - work_mode.py: Claude Code session timeout raised from 5 to 15 minutes - planner.py: plans serialise to JSON blueprints for sub-agent reading - HANDBOOK.md: operations and configuration guide Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 4 + HANDBOOK.md | 250 ++++++++++ actions.py | 138 +++++- calendar_access.py | 15 +- dispatch_registry.py | 21 +- frontend/src/main.ts | 64 ++- frontend/src/settings.ts | 79 ++- frontend/src/voice.ts | 46 +- frontend/vite.config.ts | 3 +- mail_access.py | 15 +- memory.py | 205 +++++++- notes_access.py | 16 +- planner.py | 65 ++- requirements.txt | 1 + server.py | 1004 +++++++++++++++++++++++++++++++------- setup_permissions.sh | 245 ++++++++++ system_control.py | 584 ++++++++++++++++++++++ work_mode.py | 37 +- 18 files changed, 2502 insertions(+), 290 deletions(-) create mode 100644 HANDBOOK.md create mode 100755 setup_permissions.sh create mode 100644 system_control.py diff --git a/.env.example b/.env.example index 76f8e29..cde1dcc 100644 --- a/.env.example +++ b/.env.example @@ -11,3 +11,7 @@ FISH_API_KEY=your-fish-audio-api-key-here # Optional: Specific Apple Calendar accounts to read (comma-separated emails) # If not set, JARVIS reads ALL calendars from Apple Calendar # CALENDAR_ACCOUNTS=you@gmail.com,work@company.com + +# Optional: Override AI model names if Anthropic renames them +# JARVIS_FAST_MODEL=claude-haiku-4-5-20251001 +# JARVIS_SMART_MODEL=claude-opus-4-6 diff --git a/HANDBOOK.md b/HANDBOOK.md new file mode 100644 index 0000000..3e2ea55 --- 
/dev/null +++ b/HANDBOOK.md @@ -0,0 +1,250 @@ +# JARVIS — Operations Handbook + +**Your voice-first AI assistant for macOS. Talks back, takes action, builds software.** + +--- + +## How to Start and Stop + +**Start:** +```bash +cd ~/jarvis +(unset ANTHROPIC_API_KEY && source venv/bin/activate && python server.py &) +cd frontend && npm run dev & +``` +Then open Firefox and go to: `http://localhost:5200` +Click anywhere to enable audio. You're live. + +**Stop:** +```bash +pkill -f "python server.py" && pkill -f "vite" +``` + +**Interface:** +- The glowing orb is Jarvis. It pulses when he speaks. +- Three-dot menu (top right): Settings, Restart Server, Fix Yourself +- Settings panel: change your name, voice, API key, check system status +- Mute button: toggle listening on/off + +--- + +## How to Talk to Him + +Just speak naturally. Jarvis listens continuously and responds by voice. +Short commands get short answers. Open-ended questions get a conversation. + +He addresses you as **"sir"** by default. Change it in `.env` → `HONORIFIC=`. + +--- + +## Commands and Speech Patterns + +### "Pull up [anything]" +Your universal Firefox command. Whatever follows goes straight to Firefox as a Google search. + +``` +"Pull up the weather forecast in Ireland" → Firefox, weather search +"Pull up the mathematical symbol for pi" → Firefox, Google search +"Pull up BBC News" → Firefox, Google search +"Pull up flights from Leicester to Lisbon" → Firefox, Google search +``` + +Exception: bare app names with no articles open the app instead. +``` +"Pull up Spotify" → switches to Spotify app +"Pull up Slack" → switches to Slack app +``` + +--- + +### Browser & Web +``` +"Search for..." → Firefox, Google search +"Go to [website]" → Firefox, opens that site +"Pull up [anything]" → Firefox (see above) +"Go back / go forward" → browser navigation +"Reload / refresh" → reloads current tab +"Close this tab" → closes active tab +"What page is this?" 
→ reads you the current URL and title +``` + +--- + +### Building Software +``` +"Build me a [description]" → Jarvis asks 1-2 clarifying questions, then + spawns a Claude Code session to build it. + A full project lands on your Desktop. +"Jump into [project name]" → connects to an existing project via Claude Code +"Resume where we left off on X" → picks up from the last session on that project +"Check for improvements on X" → reviews the project and suggests next steps +"Pull up what you built" → opens the last completed build in Firefox +``` + +--- + +### Research +``` +"Research [topic]" → Claude Code browses the web, gathers real data, + and produces a formatted HTML report on your Desktop. + More thorough than a browser search — takes 2-3 minutes. +``` + +--- + +### Calendar, Mail & Notes +**Calendar and Mail only respond if you already have them open.** +They do not auto-launch. This is by design — no background surprises. + +``` +"What's on my schedule today?" → reads from Apple Calendar +"Any meetings this week?" → calendar summary +"Any unread emails?" → reads from Apple Mail (read-only) +"Who emailed me today?" 
→ mail scan +"Create a note: [title] / [content]" → saves to Apple Notes +"Read my note about [topic]" → reads a note back to you +"Note that [fact]" → saves to Jarvis's internal memory, not Notes +``` + +--- + +### Tasks & Memory +``` +"Remind me to [task] tomorrow" → creates a task with a due date +"Add a high-priority task: [title]" → adds to task list +"Remember that I prefer [X] over [Y]" → Jarvis stores this and uses it in future +``` + +--- + +### App & Window Control +``` +"Open [app name]" → launches or switches to that app +"Switch to [app name]" → brings app to foreground +"Quit [app name]" → asks confirmation, then quits +"Hide [app name]" → hides the app (Cmd+H) +"Minimise window" → minimises front window +"Snap left / snap right" → moves window to half-screen +``` + +--- + +### System & Audio +``` +"Volume up / volume down" → adjusts system volume +"Mute / unmute" → mutes audio +"What's the volume?" → reads current level +"Screenshot" → takes a screenshot +"What's on my screen?" → Jarvis describes what's currently visible +``` + +--- + +### Self-Awareness +``` +"How are you running?" → Jarvis checks his own code and reports status +"Fix yourself" → opens Claude Code in Jarvis's own project directory +"Restart server" → available in the three-dot menu +``` + +--- + +## What Jarvis Does Not Do + +- **Read-only on Mail.** He can read your emails. He cannot send, delete, or move them. +- **No auto-launch of Calendar or Mail.** Have them open if you want him to use them. +- **No financial or sensitive data.** Keep that away from him. +- **Builds go to your Desktop.** Every project Claude Code creates lands in `~/Desktop`. + +--- + +## Customisation — The `.env` File + +Located at `~/jarvis/.env`. Edit this to change core behaviour. Restart Jarvis after saving. + +| Variable | What It Does | Current Value | +|---|---|---| +| `ANTHROPIC_API_KEY` | Your Anthropic API key — Jarvis's brain | set | +| `EDGE_TTS_VOICE` | His voice. 
Run `edge-tts --list-voices` to browse options | `en-GB-RyanNeural` | +| `USER_NAME` | Your name — he uses it in conversation | *(empty — set this)* | +| `HONORIFIC` | How he addresses you | `sir` | +| `CALENDAR_ACCOUNTS` | Filter which calendars he reads. `auto` = all | `auto` | +| `LOCAL_LLM_URL` | LM Studio URL for local fast responses | `http://localhost:1234/v1` | +| `LOCAL_LLM_MODEL` | Local model for fast voice responses (Gemma 4 E4B) | `google/gemma-4-e4b` | +| `JARVIS_FAST_MODEL` | Claude model for quick responses | `claude-haiku-4-5-20251001` | +| `JARVIS_SMART_MODEL` | Claude model for research and complex tasks | `claude-opus-4-6` | + +**Voice options worth trying:** +- `en-GB-RyanNeural` — British male (current, suits the JARVIS character) +- `en-GB-SoniaNeural` — British female +- `en-US-GuyNeural` — American male +- Run `edge-tts --list-voices` for the full list + +--- + +## Customisation — The System Prompt + +Located in `~/jarvis/server.py` around **line 85**. +This is Jarvis's personality, rules, and instructions. It's plain English — edit it directly. + +**Things worth tweaking here:** +- His personality and tone (currently: dry British butler, economy of language) +- Response length rules (currently: 1-2 sentences for commands, up to 5 for discussion) +- Default behaviour for specific phrases (the "pull up" rule lives here) +- What he should do when you say specific things + +Restart Jarvis after any changes to `server.py`. + +--- + +## How the AI Brain Works + +Jarvis uses two AI models and picks one per request, depending on the task: + +**Fast model** (`claude-haiku-4-5-20251001`) — or your local Gemma 4 if LM Studio is running +- Used for: voice conversation, quick commands, task creation, browsing decisions +- Response time: under 1 second with local model, 1-2 seconds via Anthropic + +**Smart model** (`claude-opus-4-6`) +- Used for: deep research, complex builds, anything that needs real thinking +- Response time: 2-5 seconds. Worth the wait. 
+ +**Local LLM (LM Studio + Gemma 4 E4B)** +If LM Studio is running with Gemma 4 loaded, fast responses are routed there instead of Anthropic. This means zero API cost and minimal latency for everyday commands. If LM Studio is off, Jarvis falls back to Claude Haiku automatically. + +--- + +## Files Worth Knowing + +``` +~/jarvis/ +├── server.py — the brain: all logic, actions, personality, LLM wiring +├── .env — your config: API keys, voice, name, models +├── actions.py — system actions: browser, apps, volume, screenshots +├── memory.py — conversation memory stored in SQLite +├── planner.py — multi-step task planning for builds +├── calendar_access.py — Apple Calendar integration (read-only) +├── mail_access.py — Apple Mail integration (read-only) +├── notes_access.py — Apple Notes integration +├── browser.py — Playwright web automation for research +├── work_mode.py — persistent Claude Code session management +├── frontend/ — the orb UI (Vite + TypeScript + Three.js) +└── data/jarvis.db — SQLite: memory, tasks, notes, dispatch history +``` + +--- + +## Quick Troubleshooting + +| Problem | Fix | +|---|---| +| Jarvis doesn't respond | Check if ports 8340 and 5200 are in use: `lsof -i :8340 -i :5200` | +| No voice / silent | Confirm `edge-tts` is installed: `cd ~/jarvis && source venv/bin/activate && edge-tts --list-voices` | +| API errors / LLM not working | Check `ANTHROPIC_API_KEY` is set in `.env` | +| Calendar/Mail not working | Make sure you have those apps open before asking | +| Local model not working | Make sure LM Studio is running with Gemma 4 E4B loaded | +| Port already in use on restart | Run: `pkill -f "python server.py" && pkill -f "vite"` then restart | + +--- + +*JARVIS — Just A Rather Very Intelligent System.* diff --git a/actions.py b/actions.py index ac433d2..3f6d456 100644 --- a/actions.py +++ b/actions.py @@ -6,6 +6,7 @@ """ import asyncio +import json import logging import os import re @@ -113,7 +114,7 @@ async def open_terminal(command: str = 
"") -> dict: } -async def open_browser(url: str, browser: str = "chrome") -> dict: +async def open_browser(url: str, browser: str = "firefox") -> dict: """Open URL in user's browser (Chrome or Firefox).""" escaped_url = url.replace('"', '\\"') @@ -166,10 +167,11 @@ async def open_claude_in_project(project_dir: str, prompt: str) -> dict: claude_md.write_text(f"# Task\n\n{prompt}\n\nBuild this completely. If web app, make index.html work standalone.\n") # Launch claude interactive — it reads CLAUDE.md on its own + escaped_dir = project_dir.replace('"', '\\"') script = ( 'tell application "Terminal"\n' " activate\n" - f' do script "cd {project_dir} && claude --dangerously-skip-permissions"\n' + f' do script "cd \\"{escaped_dir}\\" && claude --dangerously-skip-permissions"\n' "end tell" ) proc = await asyncio.create_subprocess_exec( @@ -275,6 +277,136 @@ async def prompt_existing_terminal(project_name: str, prompt: str) -> dict: return {"success": False, "confirmation": "Something went wrong reaching that terminal, sir."} +async def close_tab() -> dict: + """Close the active tab in Google Chrome via AppleScript. + + Guards against closing the JARVIS interface tab (localhost:5173/5174). + If the front window is JARVIS itself, targets the next available window. 
+ """ + script = ''' +tell application "Google Chrome" + if (count of windows) = 0 then return "NO_WINDOW" + + set targetWindow to missing value + set activeURL to "" + + -- Try front window first + set w to front window + set activeURL to URL of active tab of w + + -- If active tab is JARVIS itself, find another window + if activeURL contains "localhost:5173" or activeURL contains "localhost:5174" then + repeat with i from 1 to count of windows + set candidate to item i of windows + set candidateURL to URL of active tab of candidate + if candidateURL does not contain "localhost:5173" and candidateURL does not contain "localhost:5174" then + set targetWindow to candidate + exit repeat + end if + end repeat + if targetWindow is missing value then + return "IS_JARVIS" + end if + else + set targetWindow to w + end if + + -- Guard: don't close last tab in last window + if (count of tabs of targetWindow) = 1 and (count of windows) = 1 then + return "LAST_TAB" + end if + + close active tab of targetWindow + return "OK" +end tell +''' + try: + proc = await asyncio.create_subprocess_exec( + "osascript", "-e", script, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=10) + result = stdout.decode().strip() + + if result == "OK": + return {"success": True, "confirmation": "Tab closed, sir."} + elif result == "IS_JARVIS": + return {"success": False, "confirmation": "That's our interface tab, sir. I won't close that one."} + elif result == "LAST_TAB": + return {"success": False, "confirmation": "That's the last tab open, sir. 
Closing it would shut Chrome entirely."} + elif result == "NO_WINDOW": + return {"success": False, "confirmation": "Chrome doesn't appear to have any windows open, sir."} + else: + log.error(f"close_tab unexpected result: {result}, stderr: {stderr.decode()[:200]}") + return {"success": False, "confirmation": "Couldn't close that tab, sir."} + except asyncio.TimeoutError: + return {"success": False, "confirmation": "Close tab timed out, sir."} + except Exception as e: + log.error(f"close_tab failed: {e}") + return {"success": False, "confirmation": "Couldn't close that tab, sir."} + + +async def click_element(target: str) -> dict: + """Click an element in Chrome's active tab via JavaScript injection. + + target: CSS selector (e.g. '#submit-btn') or visible text (e.g. 'Sign in'). + Tries CSS selector first, falls back to matching visible text in links/buttons. + """ + # JS uses single-quoted strings so embedding in AppleScript double-quoted string + # only requires escaping double quotes and backslashes in the target value. 
+ js_target = target.replace("\\", "\\\\").replace("'", "\\'") + js_target_lower = js_target.lower() + + js = ( + "(function(){" + f"var el=document.querySelector('{js_target}');" + "if(!el){" + "var all=document.querySelectorAll('a,button,[role=button],input[type=submit],[onclick]');" + "for(var i=0;i=0){{" + "el=all[i];break;" + "}}" # closes inner if + for loop + "}" # closes if(!el) + "if(el){el.click();return 'clicked';}" + "return 'not_found';" + "})()" + ) + + # Escape double quotes for embedding in AppleScript string literal + js_escaped = js.replace('"', '\\"') + + script = ( + 'tell application "Google Chrome"\n' + f' set res to execute javascript "{js_escaped}" in active tab of front window\n' + ' return res\n' + 'end tell' + ) + + try: + proc = await asyncio.create_subprocess_exec( + "osascript", "-e", script, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=10) + result = stdout.decode().strip() + success = proc.returncode == 0 and result == "clicked" + if result == "not_found": + log.warning(f"click_element: '{target}' not found on page") + elif not success: + log.error(f"click_element failed: {stderr.decode()[:200]}") + return { + "success": success, + "confirmation": "Done, sir." 
if success else f"Couldn't find '{target}' on the page, sir.", + } + except asyncio.TimeoutError: + return {"success": False, "confirmation": "Click operation timed out, sir."} + except Exception as e: + log.error(f"click_element error: {e}") + return {"success": False, "confirmation": "Something went wrong with the click, sir."} + + async def get_chrome_tab_info() -> dict: """Read the current Chrome tab's title and URL via AppleScript.""" script = ( @@ -324,7 +456,7 @@ async def monitor_build(project_dir: str, ws=None, synthesize_fn=None) -> None: encoded = base64.b64encode(audio_bytes).decode() await ws.send_json({"type": "status", "state": "speaking"}) await ws.send_json({"type": "audio", "data": encoded, "text": msg}) - await ws.send_json({"type": "status", "state": "idle"}) + # No "idle" send — frontend audioPlayer.onFinished handles it. except Exception as e: log.warning(f"Build notification failed: {e}") return diff --git a/calendar_access.py b/calendar_access.py index c91d090..db5593b 100644 --- a/calendar_access.py +++ b/calendar_access.py @@ -45,22 +45,21 @@ async def _ensure_calendar_running(): - """Launch Calendar.app if not already running.""" + """Check if Calendar.app is already running — does not auto-launch it.""" global _calendar_launched if _calendar_launched: return + check = 'tell application "System Events" to return (name of every application process) contains "Calendar"' try: proc = await asyncio.create_subprocess_exec( - "open", "-a", "Calendar", "-g", + "osascript", "-e", check, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - await asyncio.wait_for(proc.communicate(), timeout=5) - await asyncio.sleep(2) - _calendar_launched = True - log.info("Calendar.app launched") - except Exception as e: - log.warning(f"Failed to launch Calendar: {e}") + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=3) + _calendar_launched = "true" in stdout.decode().lower() + except Exception: + pass async def 
_fetch_calendar_events(cal_name: str, timeout: float = 12.0) -> list[dict]: diff --git a/dispatch_registry.py b/dispatch_registry.py index ea37310..de4ad6c 100644 --- a/dispatch_registry.py +++ b/dispatch_registry.py @@ -5,6 +5,7 @@ what just finished, and what the user is likely referring to. """ +import json import logging import sqlite3 import time @@ -32,6 +33,7 @@ def init_dispatch_db(): project_path TEXT NOT NULL, original_prompt TEXT NOT NULL, refined_prompt TEXT DEFAULT '', + blueprint TEXT DEFAULT '', status TEXT DEFAULT 'pending', claude_response TEXT DEFAULT '', summary TEXT DEFAULT '', @@ -42,6 +44,12 @@ def init_dispatch_db(): CREATE INDEX IF NOT EXISTS idx_dispatch_status ON dispatches(status); CREATE INDEX IF NOT EXISTS idx_dispatch_updated ON dispatches(updated_at DESC); """) + # Migrate existing tables that predate the blueprint column + try: + conn.execute("ALTER TABLE dispatches ADD COLUMN blueprint TEXT DEFAULT ''") + conn.commit() + except Exception: + pass # Column already exists conn.close() @@ -73,7 +81,7 @@ def update_status(self, dispatch_id: int, status: str, conn.execute( "UPDATE dispatches SET status=?, claude_response=?, summary=?, updated_at=?, " "completed_at=? WHERE id=?", - (status, response[:5000], summary or "", now, + (status, response[:10000], summary or "", now, now if status in ("completed", "failed", "timeout") else None, dispatch_id) ) @@ -85,6 +93,17 @@ def update_status(self, dispatch_id: int, status: str, conn.commit() conn.close() + def update_blueprint(self, dispatch_id: int, blueprint: dict): + """Store the structured plan/blueprint JSON for a dispatch.""" + conn = _get_db() + conn.execute( + "UPDATE dispatches SET blueprint=?, updated_at=? 
WHERE id=?", + (json.dumps(blueprint), time.time(), dispatch_id) + ) + conn.commit() + conn.close() + log.debug(f"Blueprint stored for dispatch #{dispatch_id}") + def get_most_recent(self) -> dict | None: """Get the most recently updated dispatch.""" conn = _get_db() diff --git a/frontend/src/main.ts b/frontend/src/main.ts index ca5d186..87c807e 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -82,14 +82,19 @@ function transition(newState: State) { const voiceInput = createVoiceInput( (text: string) => { - // Cancel any current JARVIS response before sending new input + // Full thought received — cancel any current JARVIS response and send audioPlayer.stop(); - // User spoke — send transcript socket.send({ type: "transcript", text, isFinal: true }); + statusEl.textContent = ""; transition("thinking"); }, (msg: string) => { showError(msg); + }, + (partial: string) => { + // Accumulating — still listening, show word count as subtle cue + const words = partial.trim().split(/\s+/).length; + statusEl.textContent = words >= 3 ? "listening..." : ""; } ); @@ -98,7 +103,10 @@ const voiceInput = createVoiceInput( // --------------------------------------------------------------------------- audioPlayer.onFinished(() => { - transition("idle"); + // Debounce before resuming the mic — wait for room echo to clear. + // 300ms was not enough for some environments; 1000ms prevents JARVIS's own + // voice being captured and re-sent as the next user message. 
+ setTimeout(() => transition("idle"), 1000); }); // --------------------------------------------------------------------------- @@ -117,9 +125,18 @@ socket.onMessage((msg) => { } audioPlayer.enqueue(audioData); } else { - // TTS failed — no audio but still need to return to idle - console.warn("[audio] no data received, returning to idle"); - transition("idle"); + // TTS failed — fall back to browser speech synthesis + const fallbackText = msg.text as string; + console.warn("[audio] no data received, falling back to speechSynthesis"); + if (fallbackText) { + transition("speaking"); + const utterance = new SpeechSynthesisUtterance(fallbackText); + utterance.onend = () => transition("idle"); + utterance.onerror = () => transition("idle"); + speechSynthesis.speak(utterance); + } else { + transition("idle"); + } } // Log text for debugging if (msg.text) console.log("[JARVIS]", msg.text); @@ -135,8 +152,16 @@ socket.onMessage((msg) => { transition("idle"); } } else if (type === "text") { - // Text fallback when TTS fails - console.log("[JARVIS]", msg.text); + // Text fallback when TTS fails — use browser speech synthesis + const text = msg.text as string; + console.log("[JARVIS]", text); + if (text) { + transition("speaking"); + const utterance = new SpeechSynthesisUtterance(text); + utterance.onend = () => transition("idle"); + utterance.onerror = () => transition("idle"); + speechSynthesis.speak(utterance); + } } else if (type === "task_spawned") { console.log("[task]", "spawned:", msg.task_id, msg.prompt); } else if (type === "task_complete") { @@ -148,25 +173,36 @@ socket.onMessage((msg) => { // Kick off // --------------------------------------------------------------------------- -// Start listening after a brief delay for the orb to render -setTimeout(() => { +// Chrome requires a user gesture before granting microphone to a new origin. +// We defer voice start until first click/key so the permission dialog shows. 
+let voiceStarted = false; + +function activateVoice() { + if (voiceStarted) return; + voiceStarted = true; + statusEl.textContent = ""; voiceInput.start(); transition("listening"); -}, 1000); +} -// Resume AudioContext on ANY user interaction (browser autoplay policy) +// Resume AudioContext, warm up speechSynthesis, and kick off voice on first gesture function ensureAudioContext() { const ctx = audioPlayer.getAnalyser().context as AudioContext; if (ctx.state === "suspended") { ctx.resume().then(() => console.log("[audio] context resumed")); } + // Warm up speechSynthesis — Chrome blocks it until first user gesture + if (speechSynthesis.paused) speechSynthesis.resume(); + const warmup = new SpeechSynthesisUtterance(""); + speechSynthesis.speak(warmup); + activateVoice(); } document.addEventListener("click", ensureAudioContext); document.addEventListener("touchstart", ensureAudioContext); document.addEventListener("keydown", ensureAudioContext, { once: true }); -// Try to resume audio context on load -ensureAudioContext(); +// Show a tap hint until the user interacts +statusEl.textContent = "tap to activate"; // --------------------------------------------------------------------------- // UI Controls diff --git a/frontend/src/settings.ts b/frontend/src/settings.ts index 7e945ef..4b839b0 100644 --- a/frontend/src/settings.ts +++ b/frontend/src/settings.ts @@ -20,8 +20,7 @@ interface StatusResponse { uptime_seconds: number; env_keys_set: { anthropic: boolean; - fish_audio: boolean; - fish_voice_id: boolean; + edge_tts_voice: string; user_name: string; }; } @@ -39,7 +38,7 @@ interface PreferencesResponse { let panelEl: HTMLElement | null = null; let isOpen = false; let isFirstTimeSetup = false; -let setupStep = 0; // 0=anthropic, 1=fish, 2=name, 3=done +let setupStep = 0; // 0=anthropic, 1=name, 2=done // --------------------------------------------------------------------------- // API helpers @@ -92,19 +91,18 @@ function buildPanelHTML(): string {
- +
- - - -
-
- -
- -
- - + + +
@@ -216,7 +214,12 @@ async function loadStatus() { // API key status dots setDotStatus("status-anthropic", status.env_keys_set.anthropic ? "green" : "red"); - setDotStatus("status-fish", status.env_keys_set.fish_audio ? "green" : "red"); + + // Pre-select current voice + const voiceEl = document.getElementById("input-edge-voice") as HTMLSelectElement; + if (voiceEl && status.env_keys_set.edge_tts_voice) { + voiceEl.value = status.env_keys_set.edge_tts_voice; + } // System info const memEl = document.getElementById("sysinfo-memory"); @@ -258,25 +261,17 @@ function wireEvents() { // Save keys document.getElementById("btn-save-keys")?.addEventListener("click", async () => { const anthropicKey = (document.getElementById("input-anthropic-key") as HTMLInputElement).value.trim(); - const fishKey = (document.getElementById("input-fish-key") as HTMLInputElement).value.trim(); + const voice = (document.getElementById("input-edge-voice") as HTMLSelectElement).value; if (anthropicKey) { await apiPost("/api/settings/keys", { key_name: "ANTHROPIC_API_KEY", key_value: anthropicKey }); } - if (fishKey) { - await apiPost("/api/settings/keys", { key_name: "FISH_API_KEY", key_value: fishKey }); + if (voice) { + await apiPost("/api/settings/keys", { key_name: "EDGE_TTS_VOICE", key_value: voice }); } await loadStatus(); }); - // Save voice ID - document.getElementById("btn-save-voice-id")?.addEventListener("click", async () => { - const voiceId = (document.getElementById("input-fish-voice-id") as HTMLInputElement).value.trim(); - if (voiceId) { - await apiPost("/api/settings/keys", { key_name: "FISH_VOICE_ID", key_value: voiceId }); - } - }); - // Test Anthropic document.getElementById("btn-test-anthropic")?.addEventListener("click", async () => { setDotStatus("status-anthropic", "yellow"); @@ -289,15 +284,19 @@ function wireEvents() { } }); - // Test Fish - document.getElementById("btn-test-fish")?.addEventListener("click", async () => { - setDotStatus("status-fish", "yellow"); - 
const key = (document.getElementById("input-fish-key") as HTMLInputElement).value.trim(); + // Test TTS + document.getElementById("btn-test-tts")?.addEventListener("click", async () => { + // Save selected voice first, then test + const voice = (document.getElementById("input-edge-voice") as HTMLSelectElement).value; + if (voice) { + await apiPost("/api/settings/keys", { key_name: "EDGE_TTS_VOICE", key_value: voice }); + } + setDotStatus("status-tts", "yellow"); try { - const result = await apiPost<{ valid: boolean; error?: string }>("/api/settings/test-fish", { key_value: key || undefined }); - setDotStatus("status-fish", result.valid ? "green" : "red"); + const result = await apiPost<{ valid: boolean; error?: string }>("/api/settings/test-tts", {}); + setDotStatus("status-tts", result.valid ? "green" : "red"); } catch { - setDotStatus("status-fish", "red"); + setDotStatus("status-tts", "red"); } }); @@ -338,24 +337,22 @@ function showSetupStep(step: number) { const el = document.getElementById(id); if (!el) return; if (step === 0 && i === 0) el.style.display = ""; - else if (step === 1 && i === 0) el.style.display = ""; - else if (step === 2 && i === 2) el.style.display = ""; - else if (step === 3) el.style.display = ""; + else if (step === 1 && i === 2) el.style.display = ""; + else if (step === 2) el.style.display = ""; else el.style.display = "none"; }); const nextBtn = document.getElementById("btn-setup-next"); if (nextBtn) { - if (step === 0) nextBtn.textContent = "Next: Test Keys"; - else if (step === 1) nextBtn.textContent = "Next: Set Your Name"; - else if (step === 2) nextBtn.textContent = "Finish Setup"; + if (step === 0) nextBtn.textContent = "Next: Set Your Name"; + else if (step === 1) nextBtn.textContent = "Finish Setup"; else nextBtn.style.display = "none"; } } async function advanceSetup() { setupStep++; - if (setupStep >= 3) { + if (setupStep >= 2) { // Done — save everything and close isFirstTimeSetup = false; const welcome = 
document.getElementById("settings-welcome"); diff --git a/frontend/src/voice.ts b/frontend/src/voice.ts index 8ca5e0a..905ccd5 100644 --- a/frontend/src/voice.ts +++ b/frontend/src/voice.ts @@ -18,7 +18,8 @@ declare const webkitSpeechRecognition: any; export function createVoiceInput( onTranscript: (text: string) => void, - onError: (msg: string) => void + onError: (msg: string) => void, + onAccumulating?: (partial: string) => void, ): VoiceInput { // eslint-disable-next-line @typescript-eslint/no-explicit-any const SR = (window as any).SpeechRecognition || (typeof webkitSpeechRecognition !== "undefined" ? webkitSpeechRecognition : null); @@ -35,11 +36,49 @@ export function createVoiceInput( let shouldListen = false; let paused = false; + // ── Adaptive send debounce ────────────────────────────────────────────────── + // The Web Speech API fires isFinal on every natural pause, which is too eager. + // Instead we accumulate final segments and wait for silence before sending. + // The wait scales with how much has been said: short phrases get a fast response, + // longer thoughts get full space to finish. + let pendingText = ""; + let sendTimer: ReturnType | null = null; + + function adaptiveDelay(wordCount: number): number { + if (wordCount < 5) return 1800; // quick question — respond in ~2 s + if (wordCount < 15) return 2500; // normal sentence + return 3200; // longer thought — give it full room + } + + function flush() { + sendTimer = null; + const text = pendingText.trim(); + pendingText = ""; + if (text) onTranscript(text); + } + + function cancelPending() { + if (sendTimer) { clearTimeout(sendTimer); sendTimer = null; } + pendingText = ""; + } + // ─────────────────────────────────────────────────────────────────────────── + recognition.onresult = (event: any) => { + // Guard: recognition.stop() during pause flushes a late final result — + // without this, JARVIS's own TTS would re-enter as the next user message. 
+ if (paused) return; + for (let i = event.resultIndex; i < event.results.length; i++) { if (event.results[i].isFinal) { const text = event.results[i][0].transcript.trim(); - if (text) onTranscript(text); + if (!text) continue; + + // Accumulate and reset the silence timer + pendingText = pendingText ? pendingText + " " + text : text; + onAccumulating?.(pendingText); + + if (sendTimer) clearTimeout(sendTimer); + sendTimer = setTimeout(flush, adaptiveDelay(pendingText.split(/\s+/).length)); } } }; @@ -80,10 +119,13 @@ export function createVoiceInput( stop() { shouldListen = false; paused = false; + cancelPending(); recognition.stop(); }, pause() { paused = true; + // Discard accumulated text — JARVIS is about to speak, not listen + cancelPending(); recognition.stop(); }, resume() { diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts index 9c854c5..59eb0c3 100644 --- a/frontend/vite.config.ts +++ b/frontend/vite.config.ts @@ -2,7 +2,8 @@ import { defineConfig } from "vite"; export default defineConfig({ server: { - port: 5173, + port: 5200, + strictPort: true, proxy: { "/ws": { target: "https://localhost:8340", diff --git a/mail_access.py b/mail_access.py index e68ebfb..ddd070a 100644 --- a/mail_access.py +++ b/mail_access.py @@ -18,7 +18,7 @@ async def _ensure_mail_running(): - """Launch Mail.app if not already running.""" + """Check if Mail.app is already running — does not auto-launch it.""" global _mail_launched if _mail_launched: return @@ -37,18 +37,7 @@ async def _ensure_mail_running(): except Exception: pass - try: - proc = await asyncio.create_subprocess_exec( - "open", "-a", "Mail", "-g", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - await asyncio.wait_for(proc.communicate(), timeout=5) - await asyncio.sleep(2) - _mail_launched = True - log.info("Mail.app launched") - except Exception as e: - log.warning(f"Failed to launch Mail: {e}") + # Auto-launch disabled — Mail is only used if already open async def 
_run_mail_script(script: str, timeout: float = 20) -> str: diff --git a/memory.py b/memory.py index b041581..9a15763 100644 --- a/memory.py +++ b/memory.py @@ -12,11 +12,14 @@ import json import logging +import os import sqlite3 import time from datetime import datetime, timedelta from pathlib import Path +FAST_MODEL = os.getenv("JARVIS_FAST_MODEL", "claude-haiku-4-5-20251001") + log = logging.getLogger("jarvis.memory") DB_PATH = Path(__file__).parent / "data" / "jarvis.db" @@ -84,6 +87,28 @@ def init_db(): title, content, topic, content='notes', content_rowid='id' ); + + CREATE TABLE IF NOT EXISTS conversations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL, + role TEXT NOT NULL, + content TEXT NOT NULL, + timestamp REAL NOT NULL + ); + + CREATE VIRTUAL TABLE IF NOT EXISTS conversation_fts USING fts5( + content, session_id, + content='conversations', content_rowid='id' + ); + + CREATE TABLE IF NOT EXISTS sessions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL UNIQUE, + started_at REAL NOT NULL, + ended_at REAL, + summary TEXT DEFAULT '', + message_count INTEGER DEFAULT 0 + ); """) conn.close() log.info("Memory database initialized") @@ -317,12 +342,13 @@ def get_notes_by_topic(topic: str) -> list[dict]: # --------------------------------------------------------------------------- def build_memory_context(user_message: str) -> str: - """Build relevant context from memories, tasks, and notes for the LLM. + """Build relevant context from memories, tasks, and conversation history for the LLM. - Searches for relevant memories based on what the user is talking about. + Searches for relevant memories and past exchanges based on what the user is saying. Fast — runs FTS queries, no heavy computation. 
""" parts = [] + relevant = [] # Always include: open high-priority tasks high_tasks = [t for t in get_open_tasks() if t["priority"] == "high"] @@ -334,18 +360,33 @@ def build_memory_context(user_message: str) -> str: # Search memories relevant to what user is saying if len(user_message) > 5: - relevant = recall(user_message, limit=3) + relevant = recall(user_message, limit=4) if relevant: mem_lines = [f" - [{m['type']}] {m['content']}" for m in relevant] parts.append("RELEVANT MEMORIES:\n" + "\n".join(mem_lines)) # Recent important memories (always available) - important = get_important_memories(limit=3) + important = get_important_memories(limit=4) if important: - imp_lines = [f" - {m['content']}" for m in important - if not any(m["content"] == r["content"] for r in (relevant if 'relevant' in dir() else []))] + seen = {m["content"] for m in relevant} + imp_lines = [f" - {m['content']}" for m in important if m["content"] not in seen] if imp_lines: - parts.append("KEY FACTS:\n" + "\n".join(imp_lines[:3])) + parts.append("KEY FACTS:\n" + "\n".join(imp_lines[:4])) + + # Search past conversations for relevant exchanges + if len(user_message) > 10: + past = search_conversations(user_message, limit=3) + if past: + seen_content = {m["content"] for m in relevant} + past_lines = [] + for c in past: + snippet = c["content"][:120].replace("\n", " ") + if snippet not in seen_content: + dt = datetime.fromtimestamp(c["timestamp"]).strftime("%d %b") + who = "You" if c["role"] == "user" else "JARVIS" + past_lines.append(f" - [{dt}] {who}: {snippet}") + if past_lines: + parts.append("RELEVANT PAST EXCHANGES:\n" + "\n".join(past_lines)) return "\n\n".join(parts) if parts else "" @@ -413,14 +454,23 @@ async def extract_memories(user_text: str, jarvis_response: str, anthropic_clien try: response = await anthropic_client.messages.create( - model="claude-haiku-4-5-20251001", - max_tokens=200, + model=FAST_MODEL, + max_tokens=400, system=( - "Extract facts worth remembering from this 
conversation. " - "Only extract CONCRETE facts: preferences, decisions, names, dates, plans, goals. " - "NOT opinions, greetings, or casual chat. " - "Return JSON array of objects: [{\"type\": \"fact|preference|project|person|decision\", \"content\": \"...\", \"importance\": 1-10}] " - "Return [] if nothing worth remembering. Be very selective." + "You are a memory extraction engine for JARVIS, an AI assistant. " + "Extract every fact worth remembering from this conversation exchange. " + "Be thorough — capture:\n" + "- Personal facts: name, location, job, family, finances, health\n" + "- Preferences: likes, dislikes, habits, routines, communication style\n" + "- Projects: what's being built, tech stack, status, goals\n" + "- Decisions: choices made, approaches agreed on\n" + "- Goals & plans: short and long-term intentions\n" + "- People mentioned: names, roles, relationships\n" + "- Recurring topics: anything the user cares deeply about\n" + "Do NOT extract greetings, filler, or things already universally known. " + "Return a JSON array: [{\"type\": \"fact|preference|project|person|decision|goal\", " + "\"content\": \"concise statement of the fact\", \"importance\": 1-10}]. " + "Return [] only if truly nothing notable was said. Importance 8-10 for personal/financial/health facts." 
), messages=[{"role": "user", "content": f"User: {user_text}\nJARVIS: {jarvis_response}"}], ) @@ -446,5 +496,132 @@ async def extract_memories(user_text: str, jarvis_response: str, anthropic_clien return [] +# --------------------------------------------------------------------------- +# Conversation log — every exchange persisted across sessions +# --------------------------------------------------------------------------- + +def log_message(session_id: str, role: str, content: str) -> None: + """Append a single message to the persistent conversation log.""" + conn = _get_db() + cur = conn.execute( + "INSERT INTO conversations (session_id, role, content, timestamp) VALUES (?,?,?,?)", + (session_id, role, content, time.time()) + ) + conn.execute( + "INSERT INTO conversation_fts (rowid, content, session_id) VALUES (?,?,?)", + (cur.lastrowid, content, session_id) + ) + conn.commit() + conn.close() + + +def search_conversations(query: str, limit: int = 5) -> list[dict]: + """Full-text search across all logged conversation messages.""" + fts_query = _sanitize_fts_query(query) + if not fts_query: + return [] + conn = _get_db() + try: + rows = conn.execute( + "SELECT c.session_id, c.role, c.content, c.timestamp " + "FROM conversation_fts f " + "JOIN conversations c ON c.id = f.rowid " + "WHERE conversation_fts MATCH ? 
" + "ORDER BY c.timestamp DESC LIMIT ?", + (fts_query, limit) + ).fetchall() + except Exception: + rows = [] + conn.close() + return [dict(r) for r in rows] + + +# --------------------------------------------------------------------------- +# Session tracking — summaries persisted across server restarts +# --------------------------------------------------------------------------- + +def start_session(session_id: str) -> None: + """Record the start of a new JARVIS session.""" + conn = _get_db() + conn.execute( + "INSERT OR IGNORE INTO sessions (session_id, started_at) VALUES (?,?)", + (session_id, time.time()) + ) + conn.commit() + conn.close() + log.info(f"Session started: {session_id}") + + +def end_session(session_id: str, summary: str = "", message_count: int = 0) -> None: + """Record session end with a Haiku-generated summary.""" + conn = _get_db() + conn.execute( + "UPDATE sessions SET ended_at=?, summary=?, message_count=? WHERE session_id=?", + (time.time(), summary, message_count, session_id) + ) + conn.commit() + conn.close() + log.info(f"Session saved: {session_id} ({message_count} exchanges)") + + +def get_recent_sessions(limit: int = 5) -> list[dict]: + """Return the most recent sessions that have a summary (most recent first).""" + conn = _get_db() + rows = conn.execute( + "SELECT session_id, started_at, ended_at, summary, message_count " + "FROM sessions WHERE summary != '' AND summary IS NOT NULL " + "ORDER BY started_at DESC LIMIT ?", + (limit,) + ).fetchall() + conn.close() + return [dict(r) for r in rows] + + +def build_session_context() -> str: + """Build a cross-session memory block from recent past session summaries. + + Called once per WebSocket connection so JARVIS remembers previous conversations. 
+ """ + sessions = get_recent_sessions(limit=5) + if not sessions: + return "" + + lines = [] + for s in sessions: + if not s.get("summary"): + continue + dt = datetime.fromtimestamp(s["started_at"]).strftime("%a %d %b") + count = s.get("message_count") or 0 + suffix = f" ({count} exchanges)" if count else "" + lines.append(f"[{dt}{suffix}] {s['summary']}") + + if not lines: + return "" + + return "PREVIOUS SESSIONS (most recent first):\n" + "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Maintenance +# --------------------------------------------------------------------------- + +def prune_old_conversations(days: int = 60) -> int: + """Delete conversation log entries older than `days` days. Returns rows deleted.""" + cutoff = time.time() - days * 86400 + conn = _get_db() + # Remove FTS entries first + conn.execute( + "DELETE FROM conversation_fts WHERE rowid IN " + "(SELECT id FROM conversations WHERE timestamp < ?)", (cutoff,) + ) + cur = conn.execute("DELETE FROM conversations WHERE timestamp < ?", (cutoff,)) + deleted = cur.rowcount + conn.commit() + conn.close() + if deleted: + log.info(f"Pruned {deleted} conversation entries older than {days} days") + return deleted + + # Initialize on import init_db() diff --git a/notes_access.py b/notes_access.py index 1d4c06f..1d81813 100644 --- a/notes_access.py +++ b/notes_access.py @@ -41,11 +41,17 @@ async def get_recent_notes(count: int = 10) -> list[dict]: set limit to count of allNotes if limit > {count} then set limit to {count} repeat with i from 1 to limit - set n to item i of allNotes - set nName to name of n - set nDate to creation date of n as string - set nFolder to name of container of n - set output to output & nName & "|||" & nDate & "|||" & nFolder & linefeed + try + set n to item i of allNotes + set nName to name of n + set nDate to creation date of n as string + try + set nFolder to name of container of n + on error + set nFolder to "Notes" + end 
try + set output to output & nName & "|||" & nDate & "|||" & nFolder & linefeed + end try end repeat return output end tell diff --git a/planner.py b/planner.py index 24c55f2..a2451e8 100644 --- a/planner.py +++ b/planner.py @@ -15,10 +15,15 @@ from pathlib import Path from typing import Optional +import os + import anthropic from templates import TEMPLATES, get_template +# Honour the same env vars as server.py so models stay in sync +FAST_MODEL = os.getenv("JARVIS_FAST_MODEL", "claude-haiku-4-5-20251001") + log = logging.getLogger("jarvis.planner") DESKTOP_PATH = Path.home() / "Desktop" @@ -129,7 +134,7 @@ async def _classify_planning_mode_llm( """Use Haiku to classify request and identify missing info.""" try: response = await client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=400, system=( "You analyze development requests to decide if they need planning.\n" @@ -311,6 +316,18 @@ def current_question(self) -> Optional[dict]: return self.pending_questions[self.current_question_index] return None + def to_dict(self) -> dict: + """Serialize plan to a structured dict for blueprint storage / sub-agent passing.""" + return { + "task_type": self.task_type, + "original_request": self.original_request, + "project": self.project, + "project_path": self.project_path, + "answers": self.answers, + "confirmed": self.confirmed, + "skipped": self.skipped, + } + # --------------------------------------------------------------------------- # Context Gatherer @@ -614,8 +631,13 @@ async def get_confirmation_summary(self) -> str: summary = " ".join(parts) + ". Shall I proceed, sir?" return summary - async def build_prompt(self) -> str: - """Build the structured claude -p prompt from the finalized plan.""" + async def build_prompt(self, memory_context: str = "") -> str: + """Build the structured claude -p prompt from the finalized plan. 
+ + Args: + memory_context: Optional JARVIS memory context injected before instructions + so Claude Code has user preferences / past decisions available. + """ plan = self.active_plan if not plan: return "" @@ -659,8 +681,43 @@ async def build_prompt(self) -> str: if context_section: prompt += "\n\n" + context_section + # Prepend structured blueprint as a JSON block so sub-agents can parse it + blueprint_json = json.dumps(plan.to_dict(), indent=2) + blueprint_header = ( + "\n\n" + ) + prompt = blueprint_header + prompt + + # Prepend memory context if JARVIS has relevant memories for this task + if memory_context: + prompt = ( + "\n" + f"{memory_context}\n" + "\n\n" + ) + prompt + return prompt + def write_blueprint(self, project_path: str) -> Optional[str]: + """Write the current plan as a JSON blueprint file in the project directory. + + Returns the file path on success, None on failure. + This lets sub-agents read structured plan data independently. + """ + plan = self.active_plan + if not plan: + return None + try: + bp_path = Path(project_path) / ".jarvis_blueprint.json" + bp_path.write_text(json.dumps(plan.to_dict(), indent=2)) + log.info(f"Blueprint written to {bp_path}") + return str(bp_path) + except Exception as e: + log.warning(f"Failed to write blueprint: {e}") + return None + def get_working_dir(self) -> str: """Get the working directory for the current plan.""" if self.active_plan and self.active_plan.project_path: @@ -677,7 +734,7 @@ async def _classify_request(self, text: str, client: anthropic.AsyncAnthropic) - """Use Haiku to classify request type and extract known info.""" try: response = await client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=300, system=( "Classify this development request. 
Respond with JSON only, no markdown.\n" diff --git a/requirements.txt b/requirements.txt index e9b967f..955f81f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ anthropic>=0.39.0 +edge-tts>=7.0.0 httpx>=0.27.0 fastapi>=0.115.0 uvicorn[standard]>=0.32.0 diff --git a/server.py b/server.py index acabce2..0b86156 100644 --- a/server.py +++ b/server.py @@ -33,13 +33,15 @@ from typing import Optional import anthropic +import edge_tts import httpx from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from pydantic import BaseModel -from actions import execute_action, monitor_build, open_terminal, open_browser, open_claude_in_project, _generate_project_name, prompt_existing_terminal +from actions import execute_action, monitor_build, open_terminal, open_browser, open_claude_in_project, _generate_project_name, prompt_existing_terminal, close_tab, click_element +import system_control from work_mode import WorkSession, is_casual_question from screen import get_active_windows, take_screenshot, describe_screen, format_windows_for_context from calendar_access import get_todays_events, get_upcoming_events, get_next_event, format_events_for_context, format_schedule_summary, refresh_cache as refresh_calendar_cache @@ -48,6 +50,8 @@ remember, recall, get_open_tasks, create_task, complete_task, search_tasks, create_note, search_notes, get_tasks_for_date, build_memory_context, format_tasks_for_voice, extract_memories, get_important_memories, + log_message, start_session, end_session, build_session_context, + prune_old_conversations, ) from notes_access import get_recent_notes, read_note, search_notes_apple, create_apple_note from dispatch_registry import DispatchRegistry @@ -61,12 +65,21 @@ # --------------------------------------------------------------------------- ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "") -FISH_API_KEY = os.getenv("FISH_API_KEY", "") 
-FISH_VOICE_ID = os.getenv("FISH_VOICE_ID", "612b878b113047d9a770c069c8b4fdfe") # JARVIS (MCU) -FISH_API_URL = "https://api.fish.audio/v1/tts" +EDGE_TTS_VOICE = os.getenv("EDGE_TTS_VOICE", "en-GB-RyanNeural") # British male, suits JARVIS USER_NAME = os.getenv("USER_NAME", "sir") PROJECT_DIR = os.path.dirname(os.path.abspath(__file__)) +# Model names — override via .env if Anthropic renames them +# e.g. JARVIS_FAST_MODEL=claude-haiku-4-5-20251001 +FAST_MODEL = os.getenv("JARVIS_FAST_MODEL", "claude-haiku-4-5-20251001") +SMART_MODEL = os.getenv("JARVIS_SMART_MODEL", "claude-opus-4-6") + +# Local LLM via LM Studio (OpenAI-compatible). Set LOCAL_LLM_MODEL in .env to enable. +# e.g. LOCAL_LLM_MODEL=gemma-3-4b-it-qat +# Leave empty to use Anthropic for all fast responses. +LOCAL_LLM_URL = os.getenv("LOCAL_LLM_URL", "http://localhost:1234/v1") +LOCAL_LLM_MODEL = os.getenv("LOCAL_LLM_MODEL", "") + DESKTOP_PATH = Path.home() / "Desktop" JARVIS_SYSTEM_PROMPT = """\ @@ -95,7 +108,7 @@ - When you don't know something: "I'm afraid I don't have that information, sir" not "I don't know" SELF-AWARENESS: -You ARE the JARVIS project at {project_dir} on {user_name}'s computer. Your code is Python (FastAPI server, WebSocket voice, Fish Audio TTS, Anthropic API). You were built by {user_name}. If asked about yourself, your code, how you work, or your line count — use [ACTION:PROMPT_PROJECT] to check the jarvis project. You have full access to your own source code. +You ARE the JARVIS project at {project_dir} on {user_name}'s computer. Your code is Python (FastAPI server, WebSocket voice, Edge TTS, Anthropic API). You were built by {user_name}. If asked about yourself, your code, how you work, or your line count — use [ACTION:PROMPT_PROJECT] to check the jarvis project. You have full access to your own source code. 
YOUR CAPABILITIES (these are REAL and ACTIVE — you CAN do all of these RIGHT NOW): - You CAN open Terminal.app via AppleScript @@ -111,6 +124,13 @@ - You CAN manage tasks — create, complete, and list to-do items with priorities and due dates - You CAN help plan {user_name}'s day — combine calendar events, tasks, and priorities into an organized plan - You CAN remember facts about {user_name} — preferences, decisions, goals. Use [ACTION:REMEMBER] to store important info. +- You CAN close Chrome tabs and click page elements via JavaScript — use [ACTION:CLOSE_TAB] and [ACTION:CLICK]. +- You CAN control Chrome: open new tabs, close windows, navigate back/forward, reload, get current tab info. +- You CAN switch focus to any running app, hide apps, minimise/maximise windows, snap windows to left or right half. +- You CAN trigger keyboard shortcuts: copy, paste, undo, redo, select all, save, take a screenshot. +- You CAN scroll the front window up or down. +- You CAN control system volume: set level, mute, unmute, query current level. +- You CAN open folders in Finder, reveal files in Finder, and move files to the Trash (not permanent delete). DAY PLANNING: When {user_name} asks to plan his day or schedule, DO NOT dispatch to a project. Instead: @@ -132,13 +152,15 @@ - NEVER hallucinate progress. If the build is still running, say "Still working on it, sir" — don't make up details about what's happening. - NEVER guess localhost ports. Check the DISPATCHES section for the actual URL. If a dispatch says "Running at http://localhost:5174" — use THAT URL, not a guess. - When asked to "pull it up" or "show me" — use [ACTION:BROWSE] with the URL from DISPATCHES. Do NOT dispatch to the project again just to find the URL. +- "pull up [anything]" is a universal Firefox command. It means: open a new page in Firefox and search for or navigate to whatever follows. 
Examples: "pull up the weather forecast in Ireland", "pull up the mathematical symbol for pi", "pull up BBC News", "pull up flights to Lisbon" — all go straight to Firefox. Never ask for confirmation. Never treat it as an app-switch unless it's a bare 1-2 word app name with no articles (e.g. "pull up Spotify"). +- Firefox is the default browser. Use Firefox for all [ACTION:BROWSE] actions unless the user specifically says Chrome. IMPORTANT: Actions like opening Terminal, Chrome, or building projects are handled AUTOMATICALLY by your system — you do NOT need to describe doing them. If the user asks you to build something or search something, your system will handle the execution separately. In your response, just TALK — have a conversation. Don't say "I'll build that now" or "Claude Code is working on..." unless your system has actually triggered the action. If the user asks you to do something you genuinely can't do, say "I'm afraid that's beyond my current reach, sir." Don't fake executing actions. YOUR INTERFACE: The user interacts with you through a web browser showing a particle orb visualization that reacts to your voice. The interface has these controls: - **Three-dot menu** (top right): contains Settings, Restart Server, and Fix Yourself options -- **Settings panel**: Opens from the menu. Users can enter API keys (Anthropic, Fish Audio), test connections, set their name and preferences, and see system status (calendar, mail, notes connectivity). Keys are saved to the .env file. +- **Settings panel**: Opens from the menu. Users can enter their Anthropic API key, choose a voice, set their name and preferences, and see system status (calendar, mail, notes connectivity). Settings are saved to the .env file. - **Mute button**: Toggles your listening on/off. When muted, you can't hear the user. They click it again to unmute. - **Restart Server**: Restarts your backend process. Useful if something seems stuck. 
- **Fix Yourself**: Opens Claude Code in your own project directory so you can debug and fix issues in your own code. @@ -152,10 +174,27 @@ - "clock code" = "Claude Code" RESPONSE LENGTH — THIS IS CRITICAL: -ONE sentence is ideal. TWO is the maximum for the spoken part. Never three. +For COMMANDS, SIMPLE QUESTIONS, and CONFIRMATIONS: one sentence. Two maximum. +For IDEATION, PLANNING, and OPEN-ENDED DISCUSSION: up to five sentences. + Use this structure: (1) Mirror back what you understood — one sentence. + (2) Surface two or three possible directions or considerations — one sentence each. + (3) Close with a question that helps the user choose their path. + Example: "So the idea is a real-time portfolio dashboard with live price feeds — solid concept, sir. + You could go client-side with a public API proxy for simplicity, server-side with scheduled syncs + for reliability, or WebSocket streaming if real-time is non-negotiable. + The first two ship in a day; the third takes longer but impresses. + Which matters more — speed to launch or live data, sir?" No markdown, no bullet points, no code blocks in voice responses. Action tags at the end do NOT count toward your sentence limit. +IDEATION MODE — when {user_name} is thinking through an idea out loud: +- He has just given you a full thought (the input will be longer and exploratory) +- Do NOT immediately dispatch a build or execute — confirm understanding first +- Your job is to be a thinking partner: reflect, surface options, invite direction +- One sentence confirming understanding, two or three sentences exploring avenues, one closing question +- Never jump to "I'll build that now" until the user explicitly says proceed or confirms a direction +- The goal is to get the idea out, shaped, and agreed on — THEN execute + BANNED PHRASES — NEVER USE THESE: - "Absolutely" / "Absolutely right" - "Great question" @@ -178,13 +217,13 @@ - "Consider it done." - "Done, sir." - "Terminal is open." 
-- "Pulled that up in Chrome." +- "Pulled that up in Firefox." ACTION SYSTEM: When you decide the user needs something DONE (not just discussed), include an action tag in your response: -- [ACTION:SCREEN] — capture and describe what's visible on the user's screen. Use when user says "look at my screen", "what's running", "what do you see", etc. Do NOT use PROMPT_PROJECT for screen requests. +- [ACTION:SCREEN] — capture and describe what's visible on the user's screen. Use ONLY when user EXPLICITLY asks to see/look at/describe the screen ("look at my screen", "what do you see", "what's on my screen"). NEVER use [ACTION:SCREEN] for opening apps, switching windows, or any non-visual request. - [ACTION:BUILD] description — when user wants a project built. Claude Code does the work. -- [ACTION:BROWSE] url or search query — when user wants to see a webpage or search result in Chrome +- [ACTION:BROWSE] url or search query — when user wants to see a webpage or search result in Firefox (default browser) - [ACTION:RESEARCH] detailed research brief — when user wants real research with real data. Claude Code will browse the web, find real listings/data, and create a report document. Give it a detailed brief of what to find. - [ACTION:OPEN_TERMINAL] — when user just wants a fresh Claude Code terminal with no specific project CRITICAL: When the user asks about their SCREEN, what's RUNNING, or what they're LOOKING AT — ALWAYS use [ACTION:SCREEN] or let the fast action system handle it. NEVER use [ACTION:PROMPT_PROJECT] for screen requests. PROMPT_PROJECT is ONLY for working on code projects. @@ -203,6 +242,50 @@ - [ACTION:CREATE_NOTE] title ||| body — create a new Apple Note. For saving plans, ideas, lists. "save that as a note" → [ACTION:CREATE_NOTE] Day Plan March 19 ||| Morning: client calls. Afternoon: TikTok dashboard. Evening: JARVIS improvements. - [ACTION:READ_NOTE] title search — read an existing Apple Note by title keyword. 
+- [ACTION:CLOSE_TAB] — close the active Chrome tab. Use when user says "close this tab", "close the tab", "close that", etc. +- [ACTION:CLICK] css_selector — click an element in the active Chrome tab by CSS selector. Use when user says "click the submit button", "click sign in", "click that link", etc. Generate a CSS selector from context (e.g. "click sign in" → [ACTION:CLICK] button[type=submit], "click the login link" → [ACTION:CLICK] a.login). Prefer attribute selectors without quotes: button[type=submit] not button[type="submit"]. + +BROWSER CONTROL: +- [ACTION:OPEN_TAB] url — open a new Chrome tab at the given URL (or blank if no URL). +- [ACTION:CLOSE_WINDOW] — close the front Chrome window. ASK CONFIRMATION first: "Shall I close that window, sir?" +- [ACTION:BROWSER_BACK] — go back in Chrome. +- [ACTION:BROWSER_FORWARD] — go forward in Chrome. +- [ACTION:RELOAD] — reload the current Chrome tab. +- [ACTION:GET_TAB] — get the current tab title and URL. Use when user asks "what site is this", "what page am I on", etc. + +APP & WINDOW CONTROL: +- [ACTION:SWITCH_APP] AppName — bring an app to the foreground and unminimise it. e.g. "switch to Slack" → [ACTION:SWITCH_APP] Slack, "open Firefox" → [ACTION:SWITCH_APP] Firefox, "pull up Spotify" → [ACTION:SWITCH_APP] Spotify. Use this for ANY request to open, switch to, or bring up an app — do NOT use [ACTION:SCREEN] for these. +- [ACTION:QUIT_APP] AppName — quit an app. ALWAYS ask confirmation first: "Shall I quit Slack, sir?" +- [ACTION:HIDE_APP] AppName — hide an app (Cmd+H equivalent). +- [ACTION:MINIMIZE_WINDOW] — minimise the front window. +- [ACTION:MAXIMIZE_WINDOW] — maximise / enter full-screen the front window. +- [ACTION:MOVE_WINDOW] left|right — snap the front window to the left or right half of the screen. + +KEYBOARD & EDITING: +- [ACTION:COPY] — copy the current selection (Cmd+C). +- [ACTION:PASTE] — paste from clipboard (Cmd+V). +- [ACTION:UNDO] — undo last action (Cmd+Z). 
+- [ACTION:REDO] — redo (Cmd+Shift+Z). +- [ACTION:SELECT_ALL] — select all (Cmd+A). +- [ACTION:SAVE] — save the current document (Cmd+S). +- [ACTION:SCREENSHOT] — capture the full screen to ~/Desktop. +- [ACTION:SCROLL] up|down — scroll the front window. + +VOLUME: +- [ACTION:SET_VOLUME] 0-100 — set system volume. e.g. "set volume to 50" → [ACTION:SET_VOLUME] 50 +- [ACTION:MUTE] — mute system audio. +- [ACTION:UNMUTE] — unmute system audio. +- [ACTION:GET_VOLUME] — report current volume level. + +FINDER / FILES: +- [ACTION:OPEN_FOLDER] /path/to/folder — open a folder in Finder. +- [ACTION:TRASH_FILE] /path/to/file — move a file to Trash (NOT permanent). ALWAYS ask confirmation: "Move that to Trash, shall I, sir?" +- [ACTION:REVEAL_FILE] /path/to/file — reveal a file in Finder. + +SAFETY RULES FOR DESTRUCTIVE ACTIONS: +- QUIT_APP, CLOSE_WINDOW, TRASH_FILE are DESTRUCTIVE. Always say what you're about to do and ask "shall I proceed, sir?" in your spoken response WITHOUT the action tag. Only include the action tag when the user has confirmed. +- Never use TRASH_FILE for system files, app bundles, or anything outside ~/Desktop, ~/Documents, ~/Downloads. +- Never use QUIT_APP for critical system processes. You use Claude Code as your tool to build, research, and write code — but YOU are the one doing the work. Never say "Claude Code did X" or "Claude Code is asking" — say "I built X", "I'm checking on that", "I found X". You ARE the intelligence. Claude Code is just your hands. @@ -264,6 +347,28 @@ async def fetch_weather() -> str: return "Weather data unavailable." +# --------------------------------------------------------------------------- +# Utilities +# --------------------------------------------------------------------------- + +def _escape_applescript_string(s: str) -> str: + """Properly escape a string for embedding inside an AppleScript double-quoted string. + + AppleScript's `do script` receives a shell command. 
The escaping has two + layers: first make the string safe for the AppleScript string literal, then + the result is passed to the shell — but `do script` handles shell quoting + for us, so we only need to escape backslashes and double-quotes here. + Newlines and tabs are replaced with spaces so the shell command stays on + one line (multi-line `do script` arguments behave unpredictably). + """ + s = s.replace("\\", "\\\\") # backslash must come first + s = s.replace('"', '\\"') # double-quote + s = s.replace("\n", " ") # newline → space + s = s.replace("\r", " ") # carriage return → space + s = s.replace("\t", " ") # tab → space + return s + + # --------------------------------------------------------------------------- # Data Models # --------------------------------------------------------------------------- @@ -392,10 +497,11 @@ async def _run_task(self, task: ClaudeTask): prompt_file.write_text(task.prompt) # Open Terminal.app with claude running in the project directory + escaped_work_dir = _escape_applescript_string(work_dir) applescript = f''' tell application "Terminal" activate - set newTab to do script "cd {work_dir} && cat .jarvis_prompt.md | claude -p --dangerously-skip-permissions | tee .jarvis_output.txt; echo '\\n--- JARVIS TASK COMPLETE ---'" + set newTab to do script "cd \\"{escaped_work_dir}\\" && cat .jarvis_prompt.md | claude -p --dangerously-skip-permissions | tee .jarvis_output.txt; echo '\\n--- JARVIS TASK COMPLETE ---'" end tell ''' @@ -564,42 +670,45 @@ def get_active_tasks_summary(self) -> str: # --------------------------------------------------------------------------- async def scan_projects() -> list[dict]: - """Quick scan of ~/Desktop for git repos (depth 1).""" + """Scan ~/Desktop and ~/ (home) for git repos (depth 1).""" projects = [] - desktop = DESKTOP_PATH + seen = set() + scan_roots = [DESKTOP_PATH, Path.home()] - if not desktop.exists(): - return projects - - try: - for entry in sorted(desktop.iterdir()): - if not entry.is_dir() 
or entry.name.startswith("."): - continue - git_dir = entry / ".git" - if git_dir.exists(): - branch = "unknown" - head_file = git_dir / "HEAD" - try: - head_content = head_file.read_text().strip() - if head_content.startswith("ref: refs/heads/"): - branch = head_content.replace("ref: refs/heads/", "") - except Exception: - pass - - projects.append({ - "name": entry.name, - "path": str(entry), - "branch": branch, - }) - except PermissionError: - pass + for root in scan_roots: + if not root.exists(): + continue + try: + for entry in sorted(root.iterdir()): + if not entry.is_dir() or entry.name.startswith("."): + continue + if str(entry) in seen: + continue + git_dir = entry / ".git" + if git_dir.exists(): + seen.add(str(entry)) + branch = "unknown" + head_file = git_dir / "HEAD" + try: + head_content = head_file.read_text().strip() + if head_content.startswith("ref: refs/heads/"): + branch = head_content.replace("ref: refs/heads/", "") + except Exception: + pass + projects.append({ + "name": entry.name, + "path": str(entry), + "branch": branch, + }) + except PermissionError: + pass return projects def format_projects_for_prompt(projects: list[dict]) -> str: if not projects: - return "No projects found on Desktop." + return "No projects found." lines = [] for p in projects: lines.append(f"- {p['name']} ({p['branch']}) @ {p['path']}") @@ -643,7 +752,7 @@ async def classify_intent(text: str, client: anthropic.AsyncAnthropic) -> dict: """ try: response = await client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=100, system=( "Classify this voice command. The user is talking to JARVIS, an AI assistant that can:\n" @@ -738,7 +847,12 @@ def extract_action(response: str) -> tuple[str, dict | None]: Returns (clean_text_for_tts, action_dict_or_none). 
""" match = _action_re.search( - r'\[ACTION:(BUILD|BROWSE|RESEARCH|OPEN_TERMINAL|PROMPT_PROJECT|ADD_TASK|ADD_NOTE|COMPLETE_TASK|REMEMBER|CREATE_NOTE|READ_NOTE|SCREEN)\]\s*(.*?)$', + r'\[ACTION:(BUILD|BROWSE|RESEARCH|OPEN_TERMINAL|PROMPT_PROJECT|ADD_TASK|ADD_NOTE|COMPLETE_TASK|REMEMBER|CREATE_NOTE|READ_NOTE|SCREEN|CLOSE_TAB|CLICK' + r'|OPEN_TAB|CLOSE_WINDOW|BROWSER_BACK|BROWSER_FORWARD|RELOAD|GET_TAB' + r'|SWITCH_APP|QUIT_APP|HIDE_APP|MINIMIZE_WINDOW|MAXIMIZE_WINDOW|MOVE_WINDOW' + r'|COPY|PASTE|UNDO|REDO|SELECT_ALL|SAVE|SCREENSHOT|SCROLL' + r'|SET_VOLUME|MUTE|UNMUTE|GET_VOLUME' + r'|OPEN_FOLDER|TRASH_FILE|REVEAL_FILE)\]\s*(.*?)$', response, _action_re.DOTALL, ) if match: @@ -821,11 +935,13 @@ async def _execute_research(target: str, ws=None): try: notify_text = f"Research is complete, sir. Report is open in your browser." audio = await synthesize_speech(notify_text) + await ws.send_json({"type": "status", "state": "speaking"}) if audio: - await ws.send_json({"type": "status", "state": "speaking"}) await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": notify_text}) - await ws.send_json({"type": "status", "state": "idle"}) - log.info(f"JARVIS: {notify_text}") + # Root-cause fix: no premature "idle" — frontend handles it on playback end. + else: + await ws.send_json({"type": "text", "text": notify_text}) + log.info(f"JARVIS: {notify_text}") except Exception: pass # WebSocket might be gone @@ -833,9 +949,12 @@ async def _execute_research(target: str, ws=None): log.error("Research timed out after 5 minutes") if ws: try: - audio = await synthesize_speech("Research timed out, sir. It was taking too long.") + timeout_text = "Research timed out, sir. It was taking too long." 
+ audio = await synthesize_speech(timeout_text) if audio: - await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": "Research timed out, sir."}) + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": timeout_text}) + else: + await ws.send_json({"type": "text", "text": timeout_text}) except Exception: pass except Exception as e: @@ -875,15 +994,36 @@ async def _execute_open_terminal(): log.error(f"Open terminal failed: {e}") +async def _execute_close_tab(): + """Execute a close-tab action from an LLM-embedded [ACTION:CLOSE_TAB] tag.""" + try: + result = await close_tab() + log.info(f"close_tab: {result}") + except Exception as e: + log.error(f"close_tab execution failed: {e}") + + +async def _execute_click(selector: str): + """Execute a click action from an LLM-embedded [ACTION:CLICK] tag.""" + try: + result = await click_element(selector) + log.info(f"click_element '{selector}': {result}") + except Exception as e: + log.error(f"click_element execution failed: {e}") + + def _find_project_dir(project_name: str) -> str | None: - """Find a project directory by name from cached projects or Desktop.""" + """Find a project directory by name from cached projects, ~/Desktop, or ~/.""" for p in cached_projects: if project_name.lower() in p.get("name", "").lower(): return p.get("path") - desktop = Path.home() / "Desktop" - for d in desktop.iterdir(): - if d.is_dir() and project_name.lower() in d.name.lower(): - return str(d) + for root in [Path.home() / "Desktop", Path.home()]: + try: + for d in root.iterdir(): + if d.is_dir() and project_name.lower() in d.name.lower(): + return str(d) + except Exception: + pass return None @@ -903,10 +1043,13 @@ async def _execute_prompt_project(project_name: str, prompt: str, work_session: if not project_dir: msg = f"Couldn't find the {project_name} project directory, sir." 
audio = await synthesize_speech(msg) - if audio and ws: + if ws: try: await ws.send_json({"type": "status", "state": "speaking"}) - await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": msg}) + if audio: + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": msg}) + else: + await ws.send_json({"type": "text", "text": msg}) except Exception: pass return @@ -921,18 +1064,24 @@ async def _execute_prompt_project(project_name: str, prompt: str, work_session: log.info(f"Dispatching to {project_name} in {project_dir}: {prompt[:80]}") dispatch_registry.update_status(dispatch_id, "building") + # Inject relevant JARVIS memories so Claude Code has user context + memory_ctx = build_memory_context(prompt) + # Run claude -p in background - full_response = await dispatch.send(prompt) + full_response = await dispatch.send(prompt, memory_context=memory_ctx) await dispatch.stop() - # Auto-open any localhost URLs from response + # Auto-open any localhost URLs from response. + # Prefer the explicit RUNNING_AT= marker; fall back to first localhost URL. + # Always use group(0) to avoid IndexError when the fallback regex has no capture group. 
import re as _re - # Check for the explicit RUNNING_AT marker first running_match = _re.search(r'RUNNING_AT=(https?://localhost:\d+)', full_response or "") - if not running_match: - running_match = _re.search(r'https?://localhost:\d+', full_response or "") if running_match: - url = running_match.group(1) if running_match.lastindex else running_match.group(0) + url = running_match.group(1) # captured group inside RUNNING_AT= + else: + fb_match = _re.search(r'https?://localhost:\d+', full_response or "") + url = fb_match.group(0) if fb_match else None + if url: asyncio.create_task(_execute_browse(url)) log.info(f"Auto-opening {url}") # Store URL in dispatch @@ -948,7 +1097,7 @@ async def _execute_prompt_project(project_name: str, prompt: str, work_session: if anthropic_client: try: summary = await anthropic_client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=150, system=( "You are JARVIS reporting back on what you found or built in a project. " @@ -999,9 +1148,12 @@ async def _execute_prompt_project(project_name: str, prompt: str, work_session: try: msg = f"Had trouble connecting to {project_name}, sir." 
audio = await synthesize_speech(msg) - if audio and ws: + if ws: await ws.send_json({"type": "status", "state": "speaking"}) - await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": msg}) + if audio: + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": msg}) + else: + await ws.send_json({"type": "text", "text": msg}) except Exception: pass @@ -1009,14 +1161,15 @@ async def _execute_prompt_project(project_name: str, prompt: str, work_session: async def self_work_and_notify(session: WorkSession, prompt: str, ws): """Run claude -p in background and notify via voice when done.""" try: - full_response = await session.send(prompt) + memory_ctx = build_memory_context(prompt) + full_response = await session.send(prompt, memory_context=memory_ctx) log.info(f"Background work complete ({len(full_response)} chars)") # Summarize and speak if anthropic_client and full_response: try: summary = await anthropic_client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=100, system="You are JARVIS. Summarize what you just completed in 1 sentence. First person — 'I built', 'I set up'. No markdown. Never say 'Claude Code'.", messages=[{"role": "user", "content": f"Claude Code completed:\n{full_response[:2000]}"}], @@ -1027,11 +1180,13 @@ async def self_work_and_notify(session: WorkSession, prompt: str, ws): try: audio = await synthesize_speech(msg) + await ws.send_json({"type": "status", "state": "speaking"}) if audio: - await ws.send_json({"type": "status", "state": "speaking"}) await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": msg}) - await ws.send_json({"type": "status", "state": "idle"}) - log.info(f"JARVIS: {msg}") + # Root-cause fix: no premature "idle" — frontend handles it on playback end. 
+ else: + await ws.send_json({"type": "text", "text": msg}) + log.info(f"JARVIS: {msg}") except Exception: pass except Exception as e: @@ -1043,41 +1198,208 @@ async def self_work_and_notify(session: WorkSession, prompt: str, ws): # --------------------------------------------------------------------------- -# TTS (Fish Audio) +# TTS (Edge TTS — free, no API key needed) # --------------------------------------------------------------------------- async def synthesize_speech(text: str) -> Optional[bytes]: - """Generate speech audio from text using Fish Audio TTS.""" - if not FISH_API_KEY: - log.warning("FISH_API_KEY not set, skipping TTS") - return None - + """Generate speech audio from text using Microsoft Edge TTS (free).""" + import io try: - async with httpx.AsyncClient(timeout=15.0) as http: - response = await http.post( - FISH_API_URL, - headers={ - "Authorization": f"Bearer {FISH_API_KEY}", - "Content-Type": "application/json", - }, - json={ - "text": text, - "reference_id": FISH_VOICE_ID, - "format": "mp3", - }, - ) - if response.status_code == 200: - _session_tokens["tts_calls"] += 1 - _append_usage_entry(0, 0, "tts") - return response.content - else: - log.error(f"TTS error: {response.status_code}") - return None + communicate = edge_tts.Communicate(text, EDGE_TTS_VOICE) + audio_data = io.BytesIO() + async for chunk in communicate.stream(): + if chunk["type"] == "audio": + audio_data.write(chunk["data"]) + audio_data.seek(0) + result = audio_data.read() + if result: + _session_tokens["tts_calls"] += 1 + _append_usage_entry(0, 0, "tts") + return result + return None except Exception as e: log.error(f"TTS error: {e}") return None +# --------------------------------------------------------------------------- +# Local LLM (LM Studio) +# --------------------------------------------------------------------------- + +_LOCAL_SYSTEM_TEMPLATE = """\ +You are JARVIS — Tony Stark's AI assistant. British, dry, precise. Address the user as "sir". 
+ +PERSONALITY: Economy of language. Never filler. Never "Absolutely", "Great question", "Of course". +Say: "Will do, sir." / "Right away, sir." / "Done, sir." / "Understood." + +RESPONSE LENGTH — CRITICAL: +- Commands / simple requests: ONE sentence max. +- Discussion / planning: up to four sentences. +- No markdown, no bullet points. Voice only. + +TIME: {current_time} +{weather_info} + +SCREEN: {screen_context} +SCHEDULE: {calendar_context} +TASKS: {active_tasks} + +ACTION TAGS — append ONE at end of response when needed: +[ACTION:SWITCH_APP] AppName — open/focus an app ("open Firefox" → [ACTION:SWITCH_APP] Firefox) +[ACTION:SCREEN] — ONLY when user explicitly asks to see/describe the screen +[ACTION:BROWSE] url — open a URL or search in Chrome +[ACTION:BUILD] description — build a project with Claude Code +[ACTION:PROMPT_PROJECT] name ||| prompt — work on an existing project +[ACTION:ADD_TASK] priority ||| title ||| desc ||| due — create a task +[ACTION:REMEMBER] content — store a fact +[ACTION:MINIMIZE_WINDOW] / [ACTION:MAXIMIZE_WINDOW] / [ACTION:SCREENSHOT] +[ACTION:SET_VOLUME] 0-100 / [ACTION:MUTE] / [ACTION:UNMUTE] +[ACTION:SWITCH_APP] / [ACTION:HIDE_APP] / [ACTION:QUIT_APP] AppName +[ACTION:SCROLL] up|down / [ACTION:COPY] / [ACTION:PASTE] / [ACTION:SAVE] +NEVER use [ACTION:SCREEN] for app switching. NEVER emit action tags in casual chat. 
+ +{recent_responses}""" + + +def _build_local_system( + current_time: str, + weather_info: str, + screen_context: str, + calendar_context: str, + active_tasks: str, + recent_responses: str, +) -> str: + recent_block = "" + if recent_responses: + recent_block = f"YOUR RECENT RESPONSES (do NOT repeat):\n{recent_responses}" + return _LOCAL_SYSTEM_TEMPLATE.format( + current_time=current_time, + weather_info=weather_info, + screen_context=screen_context or "Not checked yet.", + calendar_context=calendar_context, + active_tasks=active_tasks, + recent_responses=recent_block, + ) + + +_local_http_client: Optional[httpx.AsyncClient] = None + + +def _get_local_http_client() -> httpx.AsyncClient: + global _local_http_client + if _local_http_client is None or _local_http_client.is_closed: + _local_http_client = httpx.AsyncClient(timeout=10.0) + return _local_http_client + + +async def _local_generate(system: str, messages: list[dict], max_tokens: int = 150) -> str | None: + """Call LM Studio's OpenAI-compatible API. Returns None if disabled or unreachable.""" + if not LOCAL_LLM_MODEL: + return None + try: + payload = { + "model": LOCAL_LLM_MODEL, + "max_tokens": max_tokens, + "temperature": 0.7, + "messages": [{"role": "system", "content": system}] + messages, + } + r = await _get_local_http_client().post(f"{LOCAL_LLM_URL}/chat/completions", json=payload) + if r.status_code == 200: + text = r.json()["choices"][0]["message"]["content"] + log.debug(f"[local-llm] response: {text[:80]}") + return text + log.warning(f"[local-llm] HTTP {r.status_code}: {r.text[:120]}") + except Exception as e: + log.debug(f"[local-llm] unavailable: {e}") + return None + + +import re as _re_tts + +_SENTENCE_END = _re_tts.compile(r'(?<=[.!?])\s+(?=[A-Z\[])|(?<=\.)$') +_ACTION_TAG = _re_tts.compile(r'\[ACTION:') + + +async def _local_stream_sentences(system: str, messages: list[dict], max_tokens: int = 150): + """Stream local LLM and yield speakable sentences as they complete. 
+ + Yields (sentence, is_last) tuples. Stops yielding spoken text once an + [ACTION:...] tag starts — the caller handles the tag separately. + Full response is available as the final yield with is_last=True and + sentence='' (just signals end, full_text returned separately). + """ + if not LOCAL_LLM_MODEL: + return + payload = { + "model": LOCAL_LLM_MODEL, + "max_tokens": max_tokens, + "temperature": 0.7, + "stream": True, + "messages": [{"role": "system", "content": system}] + messages, + } + full_text = "" + buf = "" + try: + async with _get_local_http_client().stream( + "POST", f"{LOCAL_LLM_URL}/chat/completions", json=payload, timeout=10.0 + ) as resp: + async for line in resp.aiter_lines(): + if not line.startswith("data: "): + continue + data = line[6:] + if data.strip() == "[DONE]": + break + try: + delta = json.loads(data)["choices"][0]["delta"].get("content", "") + except Exception: + continue + full_text += delta + buf += delta + + # Stop speaking once an action tag begins + if _ACTION_TAG.search(buf): + speak_part = buf[:_ACTION_TAG.search(buf).start()].strip() + if speak_part: + yield speak_part, False + buf = "" + # Drain the rest silently + async for rest_line in resp.aiter_lines(): + if not rest_line.startswith("data: "): + continue + rest_data = rest_line[6:] + if rest_data.strip() == "[DONE]": + break + try: + full_text += json.loads(rest_data)["choices"][0]["delta"].get("content", "") + except Exception: + pass + break + + # Emit complete sentences + parts = _SENTENCE_END.split(buf) + if len(parts) > 1: + for sentence in parts[:-1]: + sentence = sentence.strip() + if sentence: + yield sentence, False + buf = parts[-1] + + # Emit any remaining text + remainder = buf.strip() + if remainder and not _ACTION_TAG.search(remainder): + yield remainder, False + + except Exception as e: + log.debug(f"[local-llm stream] error: {e}") + + yield "", True # sentinel — signals stream done, caller reads full_text via closure + # Stash full_text so caller can 
extract action tags + _local_stream_sentences._last_full_text = full_text + + +_local_stream_sentences._last_full_text = "" + + # --------------------------------------------------------------------------- # LLM Response # --------------------------------------------------------------------------- @@ -1090,6 +1412,7 @@ async def generate_response( conversation_history: list[dict], last_response: str = "", session_summary: str = "", + prior_context: str = "", ) -> str: """Generate a JARVIS response using Anthropic API.""" now = datetime.now() @@ -1126,25 +1449,51 @@ async def generate_response( if memory_ctx: system += f"\n\nJARVIS MEMORY:\n{memory_ctx}" + # Cross-session persistent memory — what happened in previous sessions + if prior_context: + system += f"\n\nLONG-TERM MEMORY:\n{prior_context}" + # Three-tier memory — inject rolling summary of earlier conversation if session_summary: system += f"\n\nSESSION CONTEXT (earlier in this conversation):\n{session_summary}" - # Self-awareness — remind JARVIS of last response to avoid repetition - if last_response: - system += f'\n\nYOUR LAST RESPONSE (do not repeat this):\n"{last_response[:150]}"' + # Self-awareness — include recent JARVIS responses to prevent repetition + recent_assistant = [m["content"] for m in conversation_history[-10:] if m.get("role") == "assistant"] + if recent_assistant: + system += "\n\nYOUR RECENT RESPONSES (do NOT repeat or paraphrase these — vary your wording and do not re-open with the same greeting):\n" + for r in recent_assistant[-3:]: + system += f'- "{r[:200]}"\n' # Use conversation history — keep the last 20 messages for context # (older conversation is captured in session_summary) messages = conversation_history[-20:] + # API requires messages start with user role — drop any leading assistant turns + while messages and messages[0].get("role") == "assistant": + messages = messages[1:] # If the last message isn't the current user text, add it if not messages or 
messages[-1].get("content") != text: messages = messages + [{"role": "user", "content": text}] + # Try local LLM first — slim system prompt keeps prefill fast + recent_assistant = [m["content"] for m in conversation_history[-10:] if m.get("role") == "assistant"] + recent_str = "".join(f'- "{r[:150]}"\n' for r in recent_assistant[-3:]) + local_system = _build_local_system( + current_time=current_time, + weather_info=weather_info, + screen_context=screen_ctx or "", + calendar_context=calendar_ctx, + active_tasks=task_mgr.get_active_tasks_summary(), + recent_responses=recent_str, + ) + local_result = await _local_generate(local_system, messages[-6:], max_tokens=150) + if local_result is not None: + return local_result + + # Fall back to Anthropic Haiku try: response = await client.messages.create( - model="claude-haiku-4-5-20251001", - max_tokens=250, # Extra room for [ACTION:X] tags + model=FAST_MODEL, + max_tokens=400, # Room for ideation/discussion responses system=system, messages=messages, ) @@ -1319,11 +1668,11 @@ def _worker(): # Weather — refresh every loop (30s is fine, API is fast) try: import urllib.request, json as _json - url = "https://api.open-meteo.com/v1/forecast?latitude=27.77&longitude=-82.64&current=temperature_2m,weathercode&temperature_unit=fahrenheit" + url = "https://api.open-meteo.com/v1/forecast?latitude=52.64&longitude=-1.14&current=temperature_2m,weathercode&temperature_unit=celsius" with urllib.request.urlopen(url, timeout=3) as resp: d = _json.loads(resp.read()).get("current", {}) temp = d.get("temperature_2m", "?") - _ctx_cache["weather"] = f"Current weather in St.
Petersburg, FL: {temp}°F" + _ctx_cache["weather"] = f"Current weather in Leicester, UK: {temp}°C" except Exception: pass @@ -1437,15 +1786,18 @@ async def api_list_projects(): # -- Fast Action Detection (no LLM call) ----------------------------------- def _scan_projects_sync() -> list[dict]: - """Synchronous Desktop scan — runs in executor.""" + """Synchronous scan of ~/Desktop and ~/ — runs in executor.""" projects = [] - desktop = Path.home() / "Desktop" - try: - for entry in desktop.iterdir(): - if entry.is_dir() and not entry.name.startswith("."): - projects.append({"name": entry.name, "path": str(entry), "branch": ""}) - except Exception: - pass + seen = set() + for root in [Path.home() / "Desktop", Path.home()]: + try: + for entry in root.iterdir(): + if entry.is_dir() and not entry.name.startswith(".") and str(entry) not in seen: + if (entry / ".git").exists(): + seen.add(str(entry)) + projects.append({"name": entry.name, "path": str(entry), "branch": ""}) + except Exception: + pass return projects @@ -1468,10 +1820,101 @@ def detect_action_fast(text: str) -> dict | None: "what's running on my", "whats running on my", "check my screen"]): return {"action": "describe_screen"} + # Chrome — close tab + if any(p in t for p in ["close this tab", "close the tab", "close tab", "shut this tab", + "close current tab", "close active tab"]): + return {"action": "close_tab"} + + # Chrome — close window + if any(p in t for p in ["close chrome window", "close this window", "close the window", + "shut chrome", "close browser window"]): + return {"action": "close_window"} + + # Chrome — navigation + if any(p in t for p in ["go back", "browser back", "previous page"]): + return {"action": "browser_back"} + if any(p in t for p in ["go forward", "browser forward", "next page"]): + return {"action": "browser_forward"} + if any(p in t for p in ["reload", "refresh page", "refresh this", "refresh the page"]): + return {"action": "reload"} + if any(p in t for p in ["what tab is 
this", "what page is this", "current tab", "what site is this", + "what url is this", "get tab info"]): + return {"action": "get_tab"} + + # Window control + if any(p in t for p in ["minimize", "minimise", "minimize window", "minimise window"]): + return {"action": "minimize_window"} + if any(p in t for p in ["maximize", "maximise", "full screen", "fullscreen"]): + return {"action": "maximize_window"} + if any(p in t for p in ["snap left", "move window left", "window left", "left half"]): + return {"action": "move_window", "target": "left"} + if any(p in t for p in ["snap right", "move window right", "window right", "right half"]): + return {"action": "move_window", "target": "right"} + + # Keyboard shortcuts + if any(p in t for p in ["take a screenshot", "screenshot", "capture screen"]): + return {"action": "screenshot"} + if any(p in t for p in ["save this", "save the file", "save document", "save that"]): + return {"action": "save"} + if any(p in t for p in ["scroll down", "scroll up"]): + direction = "down" if "down" in t else "up" + return {"action": "scroll", "target": direction} + + # Volume + if any(p in t for p in ["mute", "mute audio", "silence", "quiet"]): + return {"action": "mute"} + if any(p in t for p in ["unmute", "unmute audio", "un mute"]): + return {"action": "unmute"} + if any(p in t for p in ["volume up", "louder", "turn it up", "increase volume"]): + return {"action": "set_volume", "target": "70"} + if any(p in t for p in ["volume down", "quieter", "turn it down", "lower volume", "turn down the volume"]): + return {"action": "set_volume", "target": "30"} + if any(p in t for p in ["what's the volume", "whats the volume", "volume level", + "how loud", "current volume"]): + return {"action": "get_volume"} + # Terminal / Claude Code — explicit open requests if any(w in t for w in ["open claude", "start claude", "launch claude", "run claude"]): return {"action": "open_terminal"} + # "pull up" = universal Firefox command. 
Anything after it is a web search/URL. + # Exception: bare 1-2 word app names with no articles/prepositions → app switch instead. + if t.startswith("pull up "): + _query = t[len("pull up "):].strip() + _words = _query.split() + _articles = {"the", "a", "an", "for", "in", "of", "from", "to", "my", "this", "that", "some", "how", "what", "where", "who", "why"} + _looks_like_app = ( + _query + and len(_words) <= 2 + and not (_articles & set(w.lower() for w in _words)) + and not re.search(r'\.(com|io|org|net|co|ai|app|dev)(\s|$)', _query) + ) + if _looks_like_app: + return {"action": "switch_app", "target": _query.title()} + from urllib.parse import quote as _quote + return {"action": "browse", "target": f"https://www.google.com/search?q={_quote(_query)}"} + + # App switching — "open Firefox", "switch to Slack", "launch Spotify", etc. + # Fast-path these so the LLM never confuses them with screen/browse requests. + _SWITCH_VERBS = ("open ", "switch to ", "launch ", "bring up ", "focus ", "show me ") + for _verb in _SWITCH_VERBS: + if t.startswith(_verb): + _app_candidate = t[len(_verb):].strip() + # Skip URLs, search phrases, and already-handled Claude/terminal commands + if ( + _app_candidate + and len(_app_candidate.split()) <= 3 + and not re.search(r'\.(com|io|org|net|co|ai|app|dev)(\s|$)', _app_candidate) + and "claude" not in _app_candidate + and "terminal" not in _app_candidate + and "tab" not in _app_candidate + and "window" not in _app_candidate + and "website" not in _app_candidate + and "browser" not in _app_candidate + ): + return {"action": "switch_app", "target": _app_candidate.title()} + break + # Show recent build if any(w in t for w in ["show me what you built", "pull up what you made", "open what you built"]): return {"action": "show_recent"} @@ -1520,6 +1963,43 @@ def detect_action_fast(text: str) -> dict | None: # -- Action Handlers ------------------------------------------------------- +def _write_project_claude_md(path: str, target: str, plan=None, 
memory_ctx: str = "") -> None: + """Write a rich CLAUDE.md to a new project directory.""" + lines = ["# JARVIS Project Brief\n\n"] + + lines.append("## Task\n\n") + lines.append(target.strip() + "\n") + + if plan: + bp = plan.to_dict() + answers = bp.get("answers", {}) + if answers.get("tech_stack"): + lines.append(f"\n## Tech Stack\n\n{answers['tech_stack']}\n") + if answers.get("design"): + lines.append(f"\n## Design\n\n{answers['design']}\n") + if answers.get("details"): + lines.append(f"\n## Requirements\n\n{answers['details']}\n") + lines.append(f"\n## Blueprint\n\n```json\n{json.dumps(bp, indent=2)}\n```\n") + + if memory_ctx.strip(): + lines.append(f"\n## User Context\n\n{memory_ctx.strip()}\n") + + lines.append( + "\n## Build Instructions\n\n" + "- BUILD THIS NOW. Do not ask clarifying questions.\n" + "- Use your best judgment for any design/architecture decisions.\n" + "- Write complete, working code files — not plans or specs.\n" + "- If it's a web app: use React + Vite + Tailwind unless tech stack is specified above.\n" + "- Make it look polished and professional. Modern UI, clean layout.\n" + "- Use realistic mock data, not placeholder Lorem Ipsum.\n" + "- Ensure it runs with a single command (npm run dev or similar).\n" + "- After building, start the dev server and verify the app loads without errors.\n" + "- IMPORTANT: Your LAST line of output MUST be exactly: RUNNING_AT=http://localhost:PORT\n" + ) + + Path(path, "CLAUDE.md").write_text("".join(lines)) + + async def handle_open_terminal() -> str: result = await open_terminal("claude --dangerously-skip-permissions") return result["confirmation"] @@ -1530,26 +2010,30 @@ async def handle_build(target: str) -> str: path = str(Path.home() / "Desktop" / name) os.makedirs(path, exist_ok=True) - # Write CLAUDE.md with clear instructions - claude_md = Path(path) / "CLAUDE.md" - claude_md.write_text(f"# Task\n\n{target}\n\nBuild this completely. 
If web app, make index.html work standalone.\n") + memory_ctx = build_memory_context(target) + _write_project_claude_md(path, target, plan=None, memory_ctx=memory_ctx) # Write prompt to a file, then pipe it to claude -p # This avoids all shell escaping issues prompt_file = Path(path) / ".jarvis_prompt.txt" prompt_file.write_text(target) + escaped_path = _escape_applescript_string(path) script = ( 'tell application "Terminal"\n' " activate\n" - f' do script "cd {path} && cat .jarvis_prompt.txt | claude -p --dangerously-skip-permissions"\n' + f' do script "cd \\"{escaped_path}\\" && cat .jarvis_prompt.txt | claude -p --dangerously-skip-permissions"\n' "end tell" ) - await asyncio.create_subprocess_exec( + proc = await asyncio.create_subprocess_exec( "osascript", "-e", script, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) + _, stderr = await proc.communicate() + if proc.returncode != 0: + log.error(f"handle_build: Terminal spawn failed: {stderr.decode()[:200]}") + return f"Had trouble opening Terminal for {name}, sir." recently_built.append({"name": name, "path": path, "time": time.time()}) return f"On it, sir. Claude Code is working in {name}." 
@@ -1610,9 +2094,12 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict _active_lookups[lookup_id]["status"] = "done" - # Speak the result — skip audio if user spoke recently to avoid collision - if voice_state and time.time() - voice_state["last_user_time"] < 3: - log.info(f"Skipping lookup audio for {lookup_type} — user spoke recently") + # Speak the result — skip audio if user or JARVIS spoke very recently + _now = time.time() + _user_gap = _now - voice_state.get("last_user_time", 0) if voice_state else 99 + _jarvis_gap = _now - voice_state.get("last_jarvis_time", 0) if voice_state else 99 + if _user_gap < 3 or _jarvis_gap < 2: + log.info(f"Skipping lookup audio for {lookup_type} — collision guard (user_gap={_user_gap:.1f}s, jarvis_gap={_jarvis_gap:.1f}s)") # Result is still stored in history below else: tts = strip_markdown_for_tts(result_text) @@ -1620,10 +2107,11 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict try: await ws.send_json({"type": "status", "state": "speaking"}) if audio: - await ws.send_json({"type": "audio", "data": audio, "text": result_text}) + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": result_text}) + # Root-cause fix: do NOT send "idle" — frontend audioPlayer.onFinished handles it. else: await ws.send_json({"type": "text", "text": result_text}) - await ws.send_json({"type": "status", "state": "idle"}) + # Text fallback: frontend utterance.onend handles idle transition. 
except Exception: pass @@ -1640,8 +2128,10 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict audio = await synthesize_speech(fallback) await ws.send_json({"type": "status", "state": "speaking"}) if audio: - await ws.send_json({"type": "audio", "data": audio, "text": fallback}) - await ws.send_json({"type": "status", "state": "idle"}) + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": fallback}) + # Root-cause fix: no premature "idle" — frontend handles it on playback end. + else: + await ws.send_json({"type": "text", "text": fallback}) except Exception: pass except Exception as e: @@ -1725,7 +2215,7 @@ async def handle_browse(text: str, target: str) -> str: import re from urllib.parse import quote - browser = "firefox" if "firefox" in text.lower() else "chrome" + browser = "chrome" if "chrome" in text.lower() else "firefox" combined = text.lower() # 1. Try to find a URL or domain in the text @@ -1777,7 +2267,7 @@ async def handle_research(text: str, target: str, client: anthropic.AsyncAnthrop """Deep research with Opus — write results to HTML, open in browser.""" try: research_response = await client.messages.create( - model="claude-opus-4-6", + model=SMART_MODEL, max_tokens=2000, system=f"You are JARVIS, researching a topic for {USER_NAME}. Be thorough, organized, and cite sources where possible.", messages=[{"role": "user", "content": f"Research this thoroughly:\n\n{target}"}], @@ -1813,7 +2303,7 @@ async def handle_research(text: str, target: str, client: anthropic.AsyncAnthrop # Short voice summary via Haiku summary = await client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=80, system="Summarize this research in ONE sentence for voice. 
No markdown.", messages=[{"role": "user", "content": research_text[:2000]}], @@ -1846,7 +2336,7 @@ async def _update_session_summary( try: response = await client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=200, messages=[{"role": "user", "content": prompt}], ) @@ -1882,8 +2372,12 @@ async def voice_handler(ws: WebSocket): _current_response_id = 0 _cancel_response = False - # Audio collision prevention — track when user last spoke - voice_state = {"last_user_time": 0.0} + # Audio collision prevention — track when user last spoke and when JARVIS last spoke + voice_state = {"last_user_time": 0.0, "last_jarvis_time": 0.0} + + # Transcript deduplication — ignore identical consecutive transcripts within 5s + last_transcript_text = "" + last_transcript_time = 0.0 # Self-awareness — track last spoken response to avoid repetition last_jarvis_response = "" @@ -1894,6 +2388,16 @@ async def voice_handler(ws: WebSocket): summary_update_pending: bool = False messages_since_last_summary: int = 0 + # Persistent session identity — survives across server restarts + session_id = datetime.now().strftime("%Y%m%d_%H%M%S") + start_session(session_id) + + # Load cross-session memory once on connect — injected into every response + prior_context = build_session_context() + + # Prune conversation log older than 60 days (runs quickly, fire-and-forget) + prune_old_conversations(days=60) + log.info("Voice WebSocket connected") try: @@ -1916,13 +2420,21 @@ async def voice_handler(ws: WebSocket): async def _send_greeting(): try: audio_bytes = await synthesize_speech(greeting) + await ws.send_json({"type": "status", "state": "speaking"}) if audio_bytes: encoded = base64.b64encode(audio_bytes).decode() - await ws.send_json({"type": "status", "state": "speaking"}) await ws.send_json({"type": "audio", "data": encoded, "text": greeting}) - history.append({"role": "assistant", "content": greeting}) - log.info(f"JARVIS: {greeting}") - await 
ws.send_json({"type": "status", "state": "idle"}) + # Root-cause fix: do NOT send "idle" here. The frontend's + # audioPlayer.onFinished handler transitions to idle once the + # audio buffer actually finishes playing. Sending "idle" early + # was resuming the mic before the greeting audio had even decoded, + # which let the microphone pick up the greeting itself and start + # the feedback loop. + else: + await ws.send_json({"type": "text", "text": greeting}) + # Text-only fallback: the frontend's utterance.onend handles idle. + history.append({"role": "assistant", "content": greeting}) + log.info(f"JARVIS: {greeting}") except Exception as e: log.warning(f"Greeting failed: {e}") @@ -1949,7 +2461,7 @@ async def _send_greeting(): await ws.send_json({"type": "status", "state": "speaking"}) audio = await synthesize_speech(tts) if audio: - await ws.send_json({"type": "audio", "data": audio, "text": response_text}) + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": response_text}) else: await ws.send_json({"type": "text", "text": response_text}) continue @@ -1961,6 +2473,14 @@ async def _send_greeting(): if not user_text: continue + # Deduplicate — skip if this is the same transcript we just processed + now_ts = time.time() + if user_text == last_transcript_text and (now_ts - last_transcript_time) < 5.0: + log.info(f"Skipping duplicate transcript: {user_text[:60]}") + continue + last_transcript_text = user_text + last_transcript_time = now_ts + # Cancel any in-flight response _current_response_id += 1 my_response_id = _current_response_id @@ -2000,12 +2520,21 @@ async def _send_greeting(): for q in plan.pending_questions[plan.current_question_index:]: if q.get("default") is not None and q["key"] not in plan.answers: plan.answers[q["key"]] = q["default"] - prompt = await planner.build_prompt() + memory_ctx = build_memory_context(user_text) + prompt = await planner.build_prompt(memory_context=memory_ctx) name = 
_generate_project_name(prompt) path = str(Path.home() / "Desktop" / name) os.makedirs(path, exist_ok=True) - Path(path, "CLAUDE.md").write_text(prompt) + _write_project_claude_md( + path, + planner.active_plan.original_request if planner.active_plan else user_text, + plan=planner.active_plan, + memory_ctx=memory_ctx, + ) + planner.write_blueprint(path) did = dispatch_registry.register(name, path, prompt[:200]) + if planner.active_plan: + dispatch_registry.update_blueprint(did, planner.active_plan.to_dict()) asyncio.create_task(_execute_prompt_project(name, prompt, work_session, ws, dispatch_id=did, history=history, voice_state=voice_state)) planner.reset() response_text = "Building it now, sir." @@ -2013,12 +2542,21 @@ async def _send_greeting(): # Confirmation phase result = await planner.handle_confirmation(user_text) if result["confirmed"]: - prompt = await planner.build_prompt() + memory_ctx = build_memory_context(user_text) + prompt = await planner.build_prompt(memory_context=memory_ctx) name = _generate_project_name(prompt) path = str(Path.home() / "Desktop" / name) os.makedirs(path, exist_ok=True) - Path(path, "CLAUDE.md").write_text(prompt) + _write_project_claude_md( + path, + planner.active_plan.original_request if planner.active_plan else user_text, + plan=planner.active_plan, + memory_ctx=memory_ctx, + ) + planner.write_blueprint(path) did = dispatch_registry.register(name, path, prompt[:200]) + if planner.active_plan: + dispatch_registry.update_blueprint(did, planner.active_plan.to_dict()) asyncio.create_task(_execute_prompt_project(name, prompt, work_session, ws, dispatch_id=did, history=history, voice_state=voice_state)) planner.reset() response_text = "On it, sir." 
@@ -2050,6 +2588,7 @@ async def _send_greeting(): cached_projects, history, last_response=last_jarvis_response, session_summary=session_summary, + prior_context=prior_context, ) else: # Send to claude -p (full power) @@ -2086,7 +2625,7 @@ async def _send_greeting(): if full_response and anthropic_client: try: summary = await anthropic_client.messages.create( - model="claude-haiku-4-5-20251001", + model=FAST_MODEL, max_tokens=100, system=( f"You are JARVIS reporting to the user ({USER_NAME}). Summarize what happened in 1-2 sentences. " @@ -2143,18 +2682,74 @@ async def _send_greeting(): response_text = format_tasks_for_voice(tasks) elif action["action"] == "check_usage": response_text = get_usage_summary() + elif action["action"] == "close_tab": + result = await close_tab() + response_text = result["confirmation"] + # ── NEW SYSTEM CONTROL ACTIONS (fast path) ── + elif action["action"] in ( + "close_window", "browser_back", "browser_forward", "reload", + "get_tab", "minimize_window", "maximize_window", "screenshot", + "save", "mute", "unmute", "get_volume", + ): + result = await system_control.dispatch(action["action"], "") + response_text = result["confirmation"] + elif action["action"] in ("move_window", "scroll", "set_volume"): + tgt = action.get("target", "") + result = await system_control.dispatch(action["action"], tgt) + response_text = result["confirmation"] else: response_text = "Understood, sir." else: if not anthropic_client: response_text = "API key not configured." 
else: - response_text = await generate_response( - user_text, anthropic_client, task_manager, - cached_projects, history, - last_response=last_jarvis_response, - session_summary=session_summary, - ) + # ── Streaming local LLM path (low latency) ── + _streaming_spoken = False + if LOCAL_LLM_MODEL: + try: + _now = datetime.now() + _ct = _now.strftime("%A, %B %d, %Y at %I:%M %p") + _recent = [m["content"] for m in history[-10:] if m.get("role") == "assistant"] + _recent_str = "".join(f'- "{r[:150]}"\n' for r in _recent[-3:]) + _lsys = _build_local_system( + current_time=_ct, + weather_info=_ctx_cache.get("weather", ""), + screen_context=_ctx_cache["screen"], + calendar_context=_ctx_cache["calendar"], + active_tasks=task_manager.get_active_tasks_summary(), + recent_responses=_recent_str, + ) + _lmsgs = history[-6:] + while _lmsgs and _lmsgs[0].get("role") == "assistant": + _lmsgs = _lmsgs[1:] + if not _lmsgs or _lmsgs[-1].get("content") != user_text: + _lmsgs = _lmsgs + [{"role": "user", "content": user_text}] + + _spoken_parts: list[str] = [] + async for _sentence, _is_last in _local_stream_sentences(_lsys, _lmsgs): + if _is_last: + response_text = _local_stream_sentences._last_full_text or " ".join(_spoken_parts) + break + _tts_text = strip_markdown_for_tts(_sentence) + if _tts_text: + _audio = await synthesize_speech(_tts_text) + if _audio: + await ws.send_json({"type": "status", "state": "speaking"}) + await ws.send_json({"type": "audio", "data": base64.b64encode(_audio).decode(), "text": _sentence}) + _streaming_spoken = True + _spoken_parts.append(_sentence) + except Exception as _e: + log.warning(f"[local-llm stream] fell back to Anthropic: {_e}") + _streaming_spoken = False + + if not _streaming_spoken: + response_text = await generate_response( + user_text, anthropic_client, task_manager, + cached_projects, history, + last_response=last_jarvis_response, + session_summary=session_summary, + prior_context=prior_context, + ) # Check for action tags embedded in 
LLM response clean_response, embedded_action = extract_action(response_text) @@ -2171,6 +2766,10 @@ async def _send_greeting(): response_text = "On it, sir." elif action_type == "research": response_text = "Looking into that now, sir." + elif action_type == "close_tab": + response_text = "Closing that tab, sir." + elif action_type == "click": + response_text = "Done, sir." else: response_text = "Right away, sir." @@ -2280,13 +2879,40 @@ async def _read_and_report(search_term, _ws): else: msg = f"Couldn't find a note matching '{search_term}', sir." audio = await synthesize_speech(strip_markdown_for_tts(msg)) - if audio and _ws: + if _ws: try: await _ws.send_json({"type": "status", "state": "speaking"}) - await _ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": msg}) + if audio: + await _ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": msg}) + else: + await _ws.send_json({"type": "text", "text": msg}) except Exception: pass asyncio.create_task(_read_and_report(embedded_action["target"].strip(), ws)) + elif embedded_action["action"] == "close_tab": + result = await close_tab() + log.info(f"close_tab: {result}") + if not response_text.strip(): + response_text = result["confirmation"] + elif embedded_action["action"] == "click": + selector = embedded_action["target"].strip() + result = await click_element(selector) + log.info(f"click '{selector}': {result}") + elif embedded_action["action"] in ( + # ── System Control actions via LLM tag ── + "open_tab", "close_window", "browser_back", "browser_forward", + "reload", "get_tab", "switch_app", "quit_app", "hide_app", + "minimize_window", "maximize_window", "move_window", + "copy", "paste", "undo", "redo", "select_all", "save", + "screenshot", "scroll", "set_volume", "mute", "unmute", + "get_volume", "open_folder", "trash_file", "reveal_file", + ): + sc_action = embedded_action["action"] + sc_target = embedded_action["target"].strip() + result = await 
system_control.dispatch(sc_action, sc_target) + log.info(f"system_control {sc_action}: {result}") + if not response_text.strip(): + response_text = result["confirmation"] # Update history history.append({"role": "user", "content": user_text}) @@ -2296,6 +2922,10 @@ async def _read_and_report(search_term, _ws): session_buffer.append({"role": "user", "content": user_text}) session_buffer.append({"role": "assistant", "content": response_text}) + # Persist every exchange to the conversation log + log_message(session_id, "user", user_text) + log_message(session_id, "jarvis", response_text) + # Check if rolling summary needs updating messages_since_last_summary += 1 if messages_since_last_summary >= 5 and len(history) > 20 and not summary_update_pending: @@ -2318,17 +2948,21 @@ async def _do_summary(): if anthropic_client and len(user_text) > 15: asyncio.create_task(extract_memories(user_text, response_text, anthropic_client)) - # TTS - tts = strip_markdown_for_tts(response_text) - await ws.send_json({"type": "status", "state": "speaking"}) - audio = await synthesize_speech(tts) - if audio: - await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": response_text}) - else: - await ws.send_json({"type": "text", "text": response_text}) - await ws.send_json({"type": "status", "state": "idle"}) + # TTS — skip if streaming already sent audio sentence-by-sentence + _already_spoken = locals().get("_streaming_spoken", False) + if not _already_spoken: + tts = strip_markdown_for_tts(response_text) + await ws.send_json({"type": "status", "state": "speaking"}) + audio = await synthesize_speech(tts) + if audio: + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": response_text}) + else: + await ws.send_json({"type": "text", "text": response_text}) + # Do NOT send "idle" status here either — it would beat the speech + # synthesis start and open the mic while the browser is still speaking. 
log.info(f"JARVIS: {response_text}") last_jarvis_response = response_text + voice_state["last_jarvis_time"] = time.time() except Exception as e: log.error(f"Error: {e}", exc_info=True) @@ -2350,6 +2984,35 @@ async def _do_summary(): finally: task_manager.unregister_websocket(ws) + # Persist session summary so JARVIS remembers this conversation next time + msg_count = len(session_buffer) // 2 + if msg_count > 0 and anthropic_client: + try: + # Summarise the session using Haiku + recent = session_buffer[-30:] # Cap at last 30 messages + convo_text = "\n".join( + f"{'User' if m['role'] == 'user' else 'JARVIS'}: {m['content'][:200]}" + for m in recent + ) + summary_resp = await anthropic_client.messages.create( + model=FAST_MODEL, + max_tokens=250, + system=( + "Summarise this JARVIS conversation in 2-3 sentences. " + "Cover: main topics discussed, decisions made, tasks or projects started, " + "anything personal the user shared. Be specific — names, numbers, and details matter. " + "Write in third person past tense." 
+ ), + messages=[{"role": "user", "content": convo_text}], + ) + final_summary = summary_resp.content[0].text.strip() + except Exception as e: + log.warning(f"Session summary generation failed: {e}") + final_summary = session_summary # Fall back to rolling summary + end_session(session_id, final_summary, msg_count) + elif msg_count > 0: + end_session(session_id, session_summary, msg_count) + # --------------------------------------------------------------------------- # Settings / Configuration endpoints @@ -2413,7 +3076,7 @@ class PreferencesUpdate(BaseModel): @app.post("/api/settings/keys") async def api_settings_keys(body: KeyUpdate): - allowed = {"ANTHROPIC_API_KEY", "FISH_API_KEY", "FISH_VOICE_ID", "USER_NAME", "HONORIFIC", "CALENDAR_ACCOUNTS"} + allowed = {"ANTHROPIC_API_KEY", "EDGE_TTS_VOICE", "USER_NAME", "HONORIFIC", "CALENDAR_ACCOUNTS"} if body.key_name not in allowed: return JSONResponse({"success": False, "error": "Invalid key name"}, status_code=400) _write_env_key(body.key_name, body.key_value) @@ -2426,29 +3089,17 @@ async def api_test_anthropic(body: KeyTest): return {"valid": False, "error": "No key provided"} try: client = anthropic.AsyncAnthropic(api_key=key) - await client.messages.create(model="claude-haiku-4-5-20251001", max_tokens=10, messages=[{"role": "user", "content": "Hi"}]) + await client.messages.create(model=FAST_MODEL, max_tokens=10, messages=[{"role": "user", "content": "Hi"}]) return {"valid": True} except Exception as e: return {"valid": False, "error": str(e)[:200]} -@app.post("/api/settings/test-fish") -async def api_test_fish(body: KeyTest): - key = body.key_value or os.getenv("FISH_API_KEY", "") - if not key: - return {"valid": False, "error": "No key provided"} +@app.post("/api/settings/test-tts") +async def api_test_tts(): + """Test Edge TTS by synthesizing a short phrase.""" try: - async with httpx.AsyncClient(timeout=10.0) as client: - resp = await client.post( - "https://api.fish.audio/v1/tts", - headers={"Authorization": 
f"Bearer {key}", "Content-Type": "application/json"}, - json={"text": "test", "reference_id": FISH_VOICE_ID}, - ) - if resp.status_code in (200, 201): - return {"valid": True} - elif resp.status_code == 401: - return {"valid": False, "error": "Invalid API key"} - else: - return {"valid": False, "error": f"HTTP {resp.status_code}"} + result = await synthesize_speech("JARVIS online.") + return {"valid": bool(result)} except Exception as e: return {"valid": False, "error": str(e)[:200]} @@ -2480,8 +3131,7 @@ async def api_settings_status(): "uptime_seconds": int(time.time() - _session_start), "env_keys_set": { "anthropic": bool(env_dict.get("ANTHROPIC_API_KEY", "").strip() and env_dict.get("ANTHROPIC_API_KEY", "") != "your-anthropic-api-key-here"), - "fish_audio": bool(env_dict.get("FISH_API_KEY", "").strip() and env_dict.get("FISH_API_KEY", "") != "your-fish-audio-api-key-here"), - "fish_voice_id": bool(env_dict.get("FISH_VOICE_ID", "").strip()), + "edge_tts_voice": env_dict.get("EDGE_TTS_VOICE", EDGE_TTS_VOICE), "user_name": env_dict.get("USER_NAME", ""), }, } @@ -2524,17 +3174,19 @@ async def api_fix_self(): jarvis_dir = str(Path(__file__).parent) # The work_session is per-WebSocket, so we set a flag that the handler picks up # For now, also open Terminal so user can see + escaped_jarvis_dir = jarvis_dir.replace('"', '\\"') script = ( 'tell application "Terminal"\n' ' activate\n' - f' do script "cd {jarvis_dir} && claude --dangerously-skip-permissions"\n' + f' do script "cd \\"{escaped_jarvis_dir}\\" && claude --dangerously-skip-permissions"\n' 'end tell' ) - await asyncio.create_subprocess_exec( + proc = await asyncio.create_subprocess_exec( "osascript", "-e", script, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) + await proc.communicate() log.info("Work mode: JARVIS repo opened for self-improvement") return {"status": "work_mode_active", "path": jarvis_dir} diff --git a/setup_permissions.sh b/setup_permissions.sh new file mode 100755 index 
0000000..5a3834a --- /dev/null +++ b/setup_permissions.sh @@ -0,0 +1,245 @@ +#!/usr/bin/env bash +# setup_permissions.sh — JARVIS macOS Permissions Setup Guide +# Run this script to check and configure all required system permissions. + +set -euo pipefail + +BOLD="\033[1m" +GREEN="\033[0;32m" +YELLOW="\033[1;33m" +RED="\033[0;31m" +CYAN="\033[0;36m" +RESET="\033[0m" + +print_header() { + echo "" + echo -e "${BOLD}${CYAN}╔══════════════════════════════════════════════════════╗${RESET}" + echo -e "${BOLD}${CYAN}║ JARVIS — macOS Permissions Setup Guide ║${RESET}" + echo -e "${BOLD}${CYAN}╚══════════════════════════════════════════════════════╝${RESET}" + echo "" +} + +print_section() { + echo "" + echo -e "${BOLD}${YELLOW}── $1 ──${RESET}" +} + +check_mark() { echo -e " ${GREEN}✓${RESET} $1"; } +warn_mark() { echo -e " ${YELLOW}⚠${RESET} $1"; } +fail_mark() { echo -e " ${RED}✗${RESET} $1"; } +info_mark() { echo -e " ${CYAN}ℹ${RESET} $1"; } + +open_pane() { + # Open a System Settings pane by URL scheme + open "$1" 2>/dev/null || true +} + +# --------------------------------------------------------------------------- +# 1. Accessibility +# --------------------------------------------------------------------------- +check_accessibility() { + print_section "1. Accessibility" + echo " WHY: JARVIS uses Accessibility to send keystrokes (copy, paste, undo," + echo " type text) and manipulate windows via System Events. Without this," + echo " keyboard shortcuts and window control will fail." + echo "" + + # Try a no-op keystroke — if it fails, Accessibility is denied + local result + result=$(osascript -e 'tell application "System Events" to keystroke ""' 2>&1) || true + + if echo "$result" | grep -qi "not allowed\|accessibility\|1002\|osascript is not allowed"; then + fail_mark "Accessibility: NOT granted" + warn_mark "Opening System Settings → Privacy & Security → Accessibility..." 
+ open_pane "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility" + echo "" + echo " ACTION REQUIRED:" + echo " 1. In the list, find 'Terminal' (or 'iTerm2' / your terminal app)" + echo " 2. Toggle it ON" + echo " 3. If Python or 'jarvis' appears, toggle that ON too" + echo " 4. Re-run this script to verify" + else + check_mark "Accessibility: granted" + fi +} + +# --------------------------------------------------------------------------- +# 2. Automation +# --------------------------------------------------------------------------- +check_automation() { + print_section "2. Automation" + echo " WHY: JARVIS controls other apps (Chrome, Finder, VS Code, etc.) via" + echo " AppleScript 'tell application' commands. Automation permission is" + echo " required per-app — Terminal must be allowed to control each one." + echo "" + + # Try to get the name of frontmost app — requires Automation for System Events + local result + result=$(osascript -e 'tell application "System Events" to get name of first application process whose frontmost is true' 2>&1) || true + + if echo "$result" | grep -qi "not authorized\|not allowed\|automation"; then + fail_mark "Automation: NOT granted for System Events" + warn_mark "Opening System Settings → Privacy & Security → Automation..." + open_pane "x-apple.systempreferences:com.apple.preference.security?Privacy_Automation" + echo "" + echo " ACTION REQUIRED:" + echo " 1. Find 'Terminal' (or your terminal/Python process) in the list" + echo " 2. Enable 'System Events' under it" + echo " 3. Enable 'Google Chrome', 'Finder', and any other apps JARVIS should" + echo " control (VS Code, Safari, etc.)" + echo " 4. Re-run this script to verify" + else + check_mark "Automation (System Events): granted" + info_mark "If controlling a specific app (e.g. Chrome) fails, go to" + info_mark "Privacy & Security → Automation and enable it there." 
+ fi +} + +# --------------------------------------------------------------------------- +# 3. Screen Recording +# --------------------------------------------------------------------------- +check_screen_recording() { + print_section "3. Screen Recording" + echo " WHY: JARVIS uses 'screencapture' to take screenshots when you ask it to" + echo " capture your screen. Without this, screenshots will be blank or fail." + echo "" + + # screencapture -x writes a file; if Screen Recording is denied the file is + # produced but contains a black/blank frame. We test by checking the exit code + # of a quick attempt — not perfect but avoids writing a real file. + local tmp + tmp=$(mktemp /tmp/jarvis_sc_test_XXXXXX.png) + local result=0 + screencapture -x "$tmp" 2>/dev/null || result=$? + rm -f "$tmp" + + if [[ $result -ne 0 ]]; then + fail_mark "Screen Recording: likely NOT granted (screencapture exited $result)" + warn_mark "Opening System Settings → Privacy & Security → Screen Recording..." + open_pane "x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture" + echo "" + echo " ACTION REQUIRED:" + echo " 1. Find 'Terminal' (or your terminal app) in the list" + echo " 2. Toggle it ON" + echo " 3. You may need to restart Terminal after granting" + else + check_mark "Screen Recording: granted (or not yet denied)" + info_mark "If screenshots appear black/blank, revoke & re-grant in" + info_mark "Privacy & Security → Screen Recording." + fi +} + +# --------------------------------------------------------------------------- +# 4. Microphone +# --------------------------------------------------------------------------- +check_microphone() { + print_section "4. Microphone" + echo " WHY: JARVIS listens to your voice via the browser's Web Speech API." + echo " The browser (e.g. Chrome) must have Microphone access granted." + echo " The Python server itself does NOT need microphone access." 
+ echo "" + + # We can't reliably check browser microphone permission from a shell script. + # Instead, remind the user to check in the browser and System Settings. + info_mark "Browser-level check (cannot be automated from shell):" + echo "" + echo " TO GRANT:" + echo " System Settings → Privacy & Security → Microphone" + echo " → Ensure your browser (Chrome / Safari / Firefox) is toggled ON" + echo "" + echo " In Chrome: visit chrome://settings/content/microphone" + echo " → JARVIS runs on localhost — make sure it is not in the 'Blocked' list" + echo " → Or open http://localhost:8000, click the lock icon → allow microphone" + echo "" + warn_mark "Opening System Settings → Privacy & Security → Microphone..." + open_pane "x-apple.systempreferences:com.apple.preference.security?Privacy_Microphone" +} + +# --------------------------------------------------------------------------- +# 5. Full Disk Access (optional) +# --------------------------------------------------------------------------- +check_full_disk_access() { + print_section "5. Full Disk Access (optional)" + echo " WHY: Required only if you want JARVIS to open, move, or reveal files" + echo " in protected directories (Desktop, Documents, Downloads, iCloud)." + echo " Without it, Finder operations on those paths will fail with a" + echo " 'permission denied' or 'not allowed' error." + echo "" + echo " This is OPTIONAL — JARVIS works without it for most use cases." + echo "" + + # Test access to ~/Documents (will fail if Full Disk Access is missing for Terminal) + if ls ~/Documents/ &>/dev/null; then + check_mark "Full Disk Access: appears granted (~/Documents readable)" + else + warn_mark "Full Disk Access: NOT granted (cannot read ~/Documents)" + warn_mark "Opening System Settings → Privacy & Security → Full Disk Access..." + open_pane "x-apple.systempreferences:com.apple.preference.security?Privacy_AllFiles" + echo "" + echo " ACTION REQUIRED (if you want Finder file operations):" + echo " 1. 
Find 'Terminal' in the list and toggle it ON" + echo " 2. Restart Terminal after granting" + fi +} + +# --------------------------------------------------------------------------- +# 6. Rectangle (optional — window snapping) +# --------------------------------------------------------------------------- +check_rectangle() { + print_section "6. Rectangle App (optional — window snapping)" + echo " WHY: JARVIS uses Rectangle keyboard shortcuts (Ctrl+Opt+Arrow) to snap" + echo " windows to left/right halves of the screen. Without Rectangle," + echo " JARVIS falls back to AppleScript-based resizing (less precise)." + echo "" + + if [[ -d "/Applications/Rectangle.app" ]]; then + check_mark "Rectangle: installed at /Applications/Rectangle.app" + info_mark "Make sure Rectangle is running and has Accessibility permission." + else + warn_mark "Rectangle: NOT installed" + info_mark "Download free from https://rectangleapp.com" + info_mark "JARVIS will use AppleScript window resizing as fallback." + fi +} + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +print_summary() { + echo "" + echo -e "${BOLD}${CYAN}╔══════════════════════════════════════════════════════╗${RESET}" + echo -e "${BOLD}${CYAN}║ Summary ║${RESET}" + echo -e "${BOLD}${CYAN}╚══════════════════════════════════════════════════════╝${RESET}" + echo "" + echo " REQUIRED (JARVIS won't work without these):" + echo " • Accessibility — keyboard shortcuts, window control" + echo " • Automation — controlling Chrome, Finder, other apps" + echo " • Microphone — voice input in your browser" + echo "" + echo " RECOMMENDED:" + echo " • Screen Recording — screenshot capability" + echo "" + echo " OPTIONAL:" + echo " • Full Disk Access — Finder ops on protected folders" + echo " • Rectangle App — precise window snapping" + echo "" + echo " After granting any permission, restart JARVIS:" + echo " cd $(dirname 
"$0") && python server.py" + echo "" +} + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +main() { + print_header + check_accessibility + check_automation + check_screen_recording + check_microphone + check_full_disk_access + check_rectangle + print_summary +} + +main diff --git a/system_control.py b/system_control.py new file mode 100644 index 0000000..35fa9ef --- /dev/null +++ b/system_control.py @@ -0,0 +1,584 @@ +""" +JARVIS System Control — AppleScript-based macOS system actions. + +Every function: + - Uses AppleScript as the primary method + - Falls back to subprocess/pyautogui where noted + - Returns {"success": bool, "confirmation": str} + - Logs every action to data/jarvis_actions.log (no file contents logged) + +Requires: + - Accessibility access for the process running JARVIS (Terminal / python) + - Automation access for Chrome, Finder, System Events + - Microphone already handled by the frontend +""" + +import asyncio +import logging +import time +from datetime import datetime +from pathlib import Path + +log = logging.getLogger("jarvis.system_control") + +_ACTION_LOG = Path(__file__).parent / "data" / "jarvis_actions.log" + +_PERM_HINTS = { + "accessibility": ( + "JARVIS needs Accessibility access. " + "Open System Settings → Privacy & Security → Accessibility " + "and enable Terminal (or the app running JARVIS)." + ), + "automation": ( + "JARVIS needs Automation access. " + "Open System Settings → Privacy & Security → Automation " + "and allow the required app targets." + ), + "screen_recording": ( + "JARVIS needs Screen Recording access for screenshots. " + "Open System Settings → Privacy & Security → Screen Recording " + "and enable Terminal (or the app running JARVIS)." 
+    ),
+}
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _log_action(action: str, result: str) -> None:
+    """Append one line to the action log (best effort).
+
+    Never logs file contents or personal data. A failure to write the log is
+    deliberately ignored so that logging can never break the action itself.
+
+    Args:
+        action: Short action name, e.g. "open_new_tab".
+        result: Outcome text; internal whitespace/newlines are flattened.
+    """
+    try:
+        _ACTION_LOG.parent.mkdir(parents=True, exist_ok=True)
+        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        # stderr snippets can span several lines; flatten them so every
+        # action occupies exactly one log line.
+        flat = " ".join(result.split())
+        with open(_ACTION_LOG, "a", encoding="utf-8") as f:
+            f.write(f"{ts} | {action} | {flat}\n")
+    except Exception:
+        # Best effort by design: the action log is diagnostic only.
+        pass
+
+
+async def _run_script(script: str, timeout: int = 10) -> tuple[bool, str, str]:
+    """Execute an AppleScript via osascript.
+
+    Args:
+        script: AppleScript source passed to ``osascript -e``.
+        timeout: Seconds to wait before giving up on the script.
+
+    Returns:
+        (success, stdout_stripped, stderr_stripped). On timeout the tuple is
+        (False, "", "timeout"); on any other failure to run the process it is
+        (False, "", str(exception)).
+    """
+    proc = None
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            "osascript", "-e", script,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
+    except asyncio.TimeoutError:
+        # wait_for() only cancels communicate(); the osascript child keeps
+        # running (e.g. stuck on a permission dialog) unless we kill and reap it.
+        if proc is not None and proc.returncode is None:
+            try:
+                proc.kill()
+                await proc.wait()
+            except ProcessLookupError:
+                pass  # child exited between the timeout and the kill
+        return False, "", "timeout"
+    except Exception as e:
+        return False, "", str(e)
+    # errors="replace": undecodable bytes must not turn a finished script
+    # into a reported failure.
+    return (
+        proc.returncode == 0,
+        stdout.decode(errors="replace").strip(),
+        stderr.decode(errors="replace").strip(),
+    )
+
+
+def _perm_error(stderr: str) -> str | None:
+    """If stderr signals a missing permission, return a helpful hint.
+
+    Returns the matching entry of ``_PERM_HINTS`` or ``None`` when stderr does
+    not look like a permission problem.
+    """
+    s = (stderr or "").lower()
+    if not s:
+        return None
+    # Accessibility denials mention assistive access or synthetic keystrokes
+    # ("... is not allowed assistive access", "... not allowed to send keystrokes").
+    if "assistive" in s or "accessibility" in s or "send keystrokes" in s:
+        return _PERM_HINTS["accessibility"]
+    # Automation (Apple events) denials read "Not authorized to send Apple
+    # events to <app>"; these were previously misreported as Accessibility.
+    if (
+        "not authorized" in s
+        or "not authorised" in s
+        or "apple events" in s
+        or "automation" in s
+        or "not allowed" in s
+    ):
+        return _PERM_HINTS["automation"]
+    if "screen recording" in s or "screencapture" in s:
+        return _PERM_HINTS["screen_recording"]
+    return None
+
+
+def _result(action: str, success: bool, msg: str, stderr: str = "") -> dict:
+    """Build the standard return dict and write the action log.
+
+    Args:
+        action: Action name recorded in the log.
+        success: Whether the action succeeded.
+        msg: User-facing confirmation text.
+        stderr: Error output of the failed command; used to derive a
+            permission hint and logged truncated to 80 characters.
+
+    Returns:
+        {"success": bool, "confirmation": str}
+    """
+    hint = None if success else _perm_error(stderr)
+    confirmation = f"{msg} ({hint})" if hint else msg
+    _log_action(action, "OK" if success else f"FAIL: {stderr[:80]}")
+    return {"success": success, "confirmation": confirmation}
+
+
+# 
---------------------------------------------------------------------------
+# Browser Control (Google Chrome via AppleScript)
+# ---------------------------------------------------------------------------
+
+async def open_new_tab(url: str = "") -> dict:
+    """Open a new Chrome tab, optionally navigating to *url*.
+
+    Args:
+        url: Address to load; empty opens a blank tab.
+
+    Returns:
+        The standard {"success": bool, "confirmation": str} dict.
+    """
+    # The URL is interpolated into an AppleScript string literal: a double
+    # quote would end the literal and a backslash would escape the closing
+    # quote, so both are dropped.
+    safe_url = url.replace('"', "").replace("\\", "")
+    if safe_url:
+        script = f'''
+tell application "Google Chrome"
+    activate
+    tell front window to make new tab with properties {{URL:"{safe_url}"}}
+end tell'''
+    else:
+        script = '''
+tell application "Google Chrome"
+    activate
+    tell front window to make new tab
+end tell'''
+    ok, _, err = await _run_script(script)
+    if not ok and err != "timeout":
+        # "front window" cannot be resolved when Chrome has no window, and the
+        # error text for that case is not guaranteed to contain "no windows".
+        # Retry once with `open location`, which does not need an existing window.
+        location = safe_url or "about:newtab"
+        ok, _, err = await _run_script(f'''
+tell application "Google Chrome"
+    activate
+    open location "{location}"
+end tell''')
+    if not ok:
+        msg = "Couldn't open a new tab, sir."
+    elif safe_url:
+        msg = "Navigated to that in a new tab, sir."
+    else:
+        msg = "Opened a new tab, sir."
+    return _result("open_new_tab", ok, msg, err)
+
+
+async def close_chrome_window() -> dict:
+    """Close the front Chrome window (all its tabs).
+
+    Reports failure without closing anything when Chrome has no window.
+    """
+    script = '''
+tell application "Google Chrome"
+    if (count of windows) = 0 then return "NO_WINDOW"
+    close front window
+    return "OK"
+end tell'''
+    ok, out, err = await _run_script(script)
+    if out == "NO_WINDOW":
+        return _result("close_chrome_window", False, "No Chrome window to close, sir.", "")
+    msg = "Chrome window closed, sir." if ok else "Couldn't close that window, sir."
+    return _result("close_chrome_window", ok, msg, err)
+
+
+async def browser_back() -> dict:
+    """Go back in the front Chrome tab."""
+    script = '''
+tell application "Google Chrome"
+    tell active tab of front window to go back
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("browser_back", ok, "Going back, sir."
if ok else "Couldn't go back, sir.", err)
+
+
+async def browser_forward() -> dict:
+    """Go forward in the front Chrome tab."""
+    script = '''
+tell application "Google Chrome"
+    tell active tab of front window to go forward
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("browser_forward", ok, "Going forward, sir." if ok else "Couldn't go forward, sir.", err)
+
+
+async def reload_page() -> dict:
+    """Reload the front Chrome tab."""
+    script = '''
+tell application "Google Chrome"
+    tell active tab of front window to reload
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("reload_page", ok, "Page reloaded, sir." if ok else "Couldn't reload, sir.", err)
+
+
+async def get_tab_info() -> dict:
+    """Return title and URL of the front Chrome tab.
+
+    Returns:
+        On success: {"success": True, "confirmation": str, "title": str,
+        "url": str}. On failure: the standard {"success", "confirmation"} dict.
+    """
+    # Title and URL are separated by a linefeed rather than "|": page titles
+    # routinely contain "|" ("Article | Site"), which made the old split put
+    # part of the title into the URL.
+    script = '''
+tell application "Google Chrome"
+    if (count of windows) = 0 then return "NO_WINDOW"
+    set t to active tab of front window
+    return (title of t) & linefeed & (URL of t)
+end tell'''
+    ok, out, err = await _run_script(script)
+    if not ok or out == "NO_WINDOW":
+        return _result("get_tab_info", False, "No Chrome tab available, sir.", err)
+    raw_title, _, raw_url = out.partition("\n")
+    title = raw_title.strip()
+    url = raw_url.strip()
+    # The action log must not hold personal data, and a tab title can be an
+    # e-mail subject or document name, so only the outcome is recorded.
+    _log_action("get_tab_info", "OK")
+    return {"success": True, "confirmation": f"Active tab: {title}", "title": title, "url": url}
+
+
+# ---------------------------------------------------------------------------
+# App / Window Control
+# ---------------------------------------------------------------------------
+
+async def switch_to_app(app_name: str) -> dict:
+    """Bring an application to the foreground."""
+    # Double quotes are removed because the name is placed inside an
+    # AppleScript string literal.
+    safe = app_name.replace('"', "")
+    script = f'tell application "{safe}" to activate'
+    ok, _, err = await _run_script(script)
+    msg = f"Switched to {safe}, sir." if ok else f"Couldn't find or switch to {safe}, sir."
+    return _result("switch_to_app", ok, msg, err)
+
+
+async def quit_app(app_name: str) -> dict:
+    """Quit an application gracefully.
+
+    Sends the application an AppleScript ``quit``; if that script fails, the
+    app is activated and Cmd+Q is sent through System Events instead.
+    """
+    # Double quotes are removed because the name is placed inside an
+    # AppleScript string literal.
+    safe = app_name.replace('"', "")
+    script = f'''
+tell application "{safe}"
+    quit
+end tell'''
+    ok, _, err = await _run_script(script)
+    # Fallback: use System Events keystroke
+    if not ok:
+        fallback = f'''
+tell application "{safe}" to activate
+delay 0.3
+tell application "System Events"
+    keystroke "q" using command down
+end tell'''
+        ok, _, err = await _run_script(fallback)
+    msg = f"Quitting {safe}, sir." if ok else f"Couldn't quit {safe}, sir."
+    return _result("quit_app", ok, msg, err)
+
+
+async def hide_app(app_name: str) -> dict:
+    """Hide (Cmd+H) the named application.
+
+    The app is activated first so that the keystroke is delivered to it.
+    """
+    safe = app_name.replace('"', "")
+    script = f'''
+tell application "{safe}" to activate
+delay 0.2
+tell application "System Events"
+    keystroke "h" using command down
+end tell'''
+    ok, _, err = await _run_script(script)
+    msg = f"Hidden {safe}, sir." if ok else f"Couldn't hide {safe}, sir."
+    return _result("hide_app", ok, msg, err)
+
+
+async def minimize_window() -> dict:
+    """Minimize the front window of the frontmost app.
+
+    Asks the frontmost application to miniaturize its front window and, if
+    that script fails, sends Cmd+M through System Events.
+    """
+    script = '''
+tell application "System Events"
+    set frontApp to name of first application process whose frontmost is true
+end tell
+tell application frontApp
+    set miniaturized of front window to true
+end tell'''
+    ok, _, err = await _run_script(script)
+    # Fallback: Cmd+M keystroke
+    if not ok:
+        script2 = '''
+tell application "System Events"
+    keystroke "m" using command down
+end tell'''
+        ok, _, err = await _run_script(script2)
+    return _result("minimize_window", ok, "Minimised, sir."
if ok else "Couldn't minimise that window, sir.", err)
+
+
+async def maximize_window() -> dict:
+    """Toggle full screen for the front window by sending Cmd+Ctrl+F.
+
+    Only the keystroke is sent; nothing here clicks the green zoom button.
+    """
+    script = '''
+tell application "System Events"
+    keystroke "f" using {command down, control down}
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("maximize_window", ok, "Maximised, sir." if ok else "Couldn't maximise, sir.", err)
+
+
+async def move_window_to_half(side: str) -> dict:
+    """Snap the front window to the left or right half of the screen via Rectangle shortcuts.
+
+    Rectangle's default shortcuts:
+        Left half:  Ctrl+Opt+Left
+        Right half: Ctrl+Opt+Right
+    If sending the shortcut fails, a second script asks the frontmost app to
+    set its front window's bounds.
+
+    side: 'left' | 'right' (case-insensitive); anything else is rejected.
+    """
+    side = side.lower().strip()
+    if side not in ("left", "right"):
+        return _result("move_window", False, f"I don't recognise '{side}' — say left or right, sir.", "")
+
+    key_code = "123" if side == "left" else "124"  # left/right arrow key codes
+    # Try Rectangle shortcut first (most reliable)
+    # NOTE(review): `ok` only reflects whether osascript ran the key code; it
+    # presumably stays True when Rectangle is not running — confirm.
+    script = f'''
+tell application "System Events"
+    key code {key_code} using {{control down, option down}}
+end tell'''
+    ok, _, err = await _run_script(script)
+
+    if not ok:
+        # Fallback: use built-in macOS window tiling (macOS 15+)
+        # NOTE(review): "tile to left/right" does not look like a valid value
+        # for `bounds` in AppleScript — verify this fallback compiles at all.
+        tile_side = "left" if side == "left" else "right"
+        script2 = f'''
+tell application "System Events"
+    set frontApp to name of first application process whose frontmost is true
+end tell
+tell application frontApp
+    set bounds of front window to tile to {tile_side}
+end tell'''
+        ok, _, err = await _run_script(script2)
+
+    msg = f"Moved window to the {side} half, sir." if ok else f"Couldn't snap window to {side}, sir. Rectangle may not be installed."
+    return _result("move_window", ok, msg, err)
+
+
+# ---------------------------------------------------------------------------
+# Keyboard Shortcuts (via System Events)
+# ---------------------------------------------------------------------------
+
+# Catalogue of supported shortcuts. The wrapper functions below pass their
+# key/modifier values to _send_keystroke directly rather than reading this
+# table. "new_window" and "new_file" share the same Cmd+N binding.
+_KBD_ACTIONS: dict[str, tuple[str, str, str]] = {
+    # action_name: (key, modifier_string, friendly_name)
+    "copy": ("c", "command down", "Copied"),
+    "paste": ("v", "command down", "Pasted"),
+    "undo": ("z", "command down", "Undone"),
+    "redo": ("z", "{command down, shift down}", "Redone"),
+    "select_all": ("a", "command down", "Selected all"),
+    "save": ("s", "command down", "Saved"),
+    "new_tab": ("t", "command down", "New tab opened"),
+    "new_window": ("n", "command down", "New window opened"),
+    "new_file": ("n", "command down", "New file opened"),
+}
+
+
+async def _send_keystroke(key: str, modifiers: str, action_name: str, friendly: str) -> dict:
+    """Send one keystroke with modifiers through System Events.
+
+    key: character to type, inserted into an AppleScript string literal.
+    modifiers: AppleScript modifier expression, e.g. "command down".
+    action_name: name used for the action log and the failure message.
+    friendly: past-tense word(s) used in the success message.
+    """
+    script = f'''
+tell application "System Events"
+    keystroke "{key}" using {modifiers}
+end tell'''
+    ok, _, err = await _run_script(script)
+    msg = f"{friendly}, sir." if ok else f"Keystroke failed for {action_name}, sir."
+    return _result(action_name, ok, msg, err)
+
+
+async def copy_selection() -> dict:
+    """Send Cmd+C to the frontmost app."""
+    return await _send_keystroke("c", "command down", "copy", "Copied")
+
+
+async def paste_clipboard() -> dict:
+    """Send Cmd+V to the frontmost app."""
+    return await _send_keystroke("v", "command down", "paste", "Pasted")
+
+
+async def undo_last() -> dict:
+    """Send Cmd+Z to the frontmost app."""
+    return await _send_keystroke("z", "command down", "undo", "Undone")
+
+
+async def redo_last() -> dict:
+    """Send Cmd+Shift+Z to the frontmost app."""
+    script = '''
+tell application "System Events"
+    keystroke "z" using {command down, shift down}
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("redo", ok, "Redone, sir."
if ok else "Redo failed, sir.", err)
+
+
+async def select_all() -> dict:
+    """Send Cmd+A to the frontmost app."""
+    return await _send_keystroke("a", "command down", "select_all", "Selected all")
+
+
+async def save_document() -> dict:
+    """Send Cmd+S to the frontmost app."""
+    return await _send_keystroke("s", "command down", "save", "Saved")
+
+
+async def take_screenshot() -> dict:
+    """Capture the full screen to ~/Desktop using screencapture.
+
+    The file is named screenshot_<YYYY-MM-DD_HH-MM-SS>.png. Success requires
+    both a zero exit status and the file actually existing afterwards.
+    """
+    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    path = Path.home() / "Desktop" / f"screenshot_{ts}.png"
+    proc = None
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            "screencapture", "-x", str(path),
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        _, err_bytes = await asyncio.wait_for(proc.communicate(), timeout=15)
+    except asyncio.TimeoutError:
+        # wait_for() only cancels communicate(); kill and reap the child so a
+        # hung screencapture does not linger after we report the timeout.
+        if proc is not None and proc.returncode is None:
+            try:
+                proc.kill()
+                await proc.wait()
+            except ProcessLookupError:
+                pass  # child exited between the timeout and the kill
+        return _result("screenshot", False, "Screenshot timed out, sir.", "timeout")
+    except Exception as e:
+        return _result("screenshot", False, "Screenshot failed, sir.", str(e))
+    ok = proc.returncode == 0 and path.exists()
+    # errors="replace": undecodable stderr must not mask the real outcome.
+    err = err_bytes.decode(errors="replace").strip()
+    msg = f"Screenshot saved to Desktop as {path.name}, sir." if ok \
+        else "Screenshot failed, sir. Screen Recording permission may be needed."
+    return _result("screenshot", ok, msg, err)
+
+
+# ---------------------------------------------------------------------------
+# Scroll
+# ---------------------------------------------------------------------------
+
+async def scroll(direction: str, amount: int = 3) -> dict:
+    """Scroll the frontmost window up or down.
+
+    Uses Page Up / Page Down key codes via System Events.
+    direction: 'up' | 'down'
+    amount: number of page steps; clamped to the range 1–5
+    """
+    direction = direction.lower().strip()
+    if direction not in ("up", "down"):
+        return _result("scroll", False, "Say 'up' or 'down', sir.", "")
+
+    # Clamp on both sides: without a lower bound, zero or a negative amount
+    # sent no key presses yet still reported a successful scroll.
+    steps = max(1, min(int(amount), 5))
+    # Page Down = key code 121, Page Up = key code 116
+    key_code = "121" if direction == "down" else "116"
+    script = f'''
+tell application "System Events"
+    repeat {steps} times
+        key code {key_code}
+    end repeat
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("scroll", ok, f"Scrolled {direction}, sir." if ok else "Couldn't scroll, sir.", err)
+
+
+# ---------------------------------------------------------------------------
+# Volume & Audio
+# ---------------------------------------------------------------------------
+
+async def set_volume(level: int) -> dict:
+    """Set system output volume to 0–100.
+
+    Values outside 0–100 are clamped. Raises ValueError/TypeError if *level*
+    cannot be converted with int().
+    """
+    level = max(0, min(100, int(level)))
+    script = f"set volume output volume {level}"
+    ok, _, err = await _run_script(script)
+    msg = f"Volume set to {level}, sir." if ok else "Couldn't set volume, sir."
+    return _result("set_volume", ok, msg, err)
+
+
+async def mute_audio() -> dict:
+    """Mute system audio."""
+    ok, _, err = await _run_script("set volume with output muted")
+    return _result("mute", ok, "Muted, sir." if ok else "Couldn't mute, sir.", err)
+
+
+async def unmute_audio() -> dict:
+    """Unmute system audio."""
+    ok, _, err = await _run_script("set volume without output muted")
+    return _result("unmute", ok, "Unmuted, sir."
if ok else "Couldn't unmute, sir.", err)
+
+
+async def get_volume() -> dict:
+    """Return current volume level and mute state.
+
+    Returns:
+        On success: {"success": True, "confirmation": str, "level": str,
+        "muted": bool}; "level" is "unknown" when no usable value is reported.
+        On failure: the standard {"success", "confirmation"} dict.
+    """
+    ok, out, err = await _run_script("get volume settings")
+    if not ok:
+        return _result("get_volume", False, "Couldn't read volume, sir.", err)
+    # out looks like: "output volume:57, input volume:75, alert volume:100, output muted:false"
+    level = "unknown"
+    muted = False
+    for part in out.split(","):
+        name, _, value = part.strip().partition(":")
+        value = value.strip()
+        if name == "output volume":
+            # AppleScript prints "missing value" when the output device has
+            # no adjustable volume; don't announce that as a level.
+            level = value if value and value != "missing value" else "unknown"
+        elif name == "output muted":
+            muted = value.lower() == "true"
+    mute_str = " (muted)" if muted else ""
+    _log_action("get_volume", f"level={level} muted={muted}")
+    return {"success": True, "confirmation": f"Volume is at {level}{mute_str}, sir.",
+            "level": level, "muted": muted}
+
+
+# ---------------------------------------------------------------------------
+# Finder / File System
+# ---------------------------------------------------------------------------
+
+async def open_folder(path: str) -> dict:
+    """Open a folder in Finder.
+
+    path: POSIX path; a leading "~" or "~user" is expanded to the home directory.
+    """
+    # Quotes are dropped and backslashes turned into "/" because the path is
+    # placed inside an AppleScript string literal.
+    safe = path.replace('"', "").replace("\\", "/")
+    # AppleScript doesn't expand "~". expanduser() also handles "~user",
+    # which the previous manual splice turned into "<home>user/...".
+    try:
+        safe = str(Path(safe).expanduser())
+    except RuntimeError:
+        # Home directory of the named user could not be determined.
+        return _result("open_folder", False, "That path doesn't appear to exist, sir.", "")
+    if not Path(safe).exists():
+        return _result("open_folder", False, "That path doesn't appear to exist, sir.", "")
+    script = f'''
+tell application "Finder"
+    activate
+    open folder POSIX file "{safe}"
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("open_folder", ok,
+                   "Opened in Finder, sir."
if ok else "Couldn't open that folder, sir.", err)
+
+
+async def trash_file(path: str) -> dict:
+    """Move a file to the Trash (NOT permanent delete).
+
+    path: POSIX path; a leading "~" or "~user" is expanded to the home directory.
+    """
+    # Quotes are dropped and backslashes turned into "/" because the path is
+    # placed inside an AppleScript string literal.
+    safe = path.replace('"', "").replace("\\", "/")
+    # expanduser() handles both "~" and "~user"; the previous manual splice
+    # turned "~user/x" into "<home>user/x" and could target the wrong file.
+    try:
+        p = Path(safe).expanduser()
+    except RuntimeError:
+        # Home directory of the named user could not be determined.
+        return _result("trash_file", False, "That file doesn't appear to exist, sir.", "")
+    if not p.exists():
+        return _result("trash_file", False, "That file doesn't appear to exist, sir.", "")
+    script = f'''
+tell application "Finder"
+    move POSIX file "{p}" to trash
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("trash_file", ok,
+                   f"Moved {p.name} to Trash, sir." if ok else "Couldn't move that to Trash, sir.", err)
+
+
+async def reveal_in_finder(path: str) -> dict:
+    """Reveal a file or folder in Finder (select it).
+
+    path: POSIX path; a leading "~" or "~user" is expanded to the home directory.
+    """
+    safe = path.replace('"', "").replace("\\", "/")
+    # Same expansion rule as trash_file: let pathlib resolve "~" and "~user".
+    try:
+        target = Path(safe).expanduser()
+    except RuntimeError:
+        return _result("reveal_file", False, "That path doesn't appear to exist, sir.", "")
+    if not target.exists():
+        return _result("reveal_file", False, "That path doesn't appear to exist, sir.", "")
+    script = f'''
+tell application "Finder"
+    activate
+    reveal POSIX file "{target}"
+end tell'''
+    ok, _, err = await _run_script(script)
+    return _result("reveal_file", ok,
+                   "Revealed in Finder, sir." if ok else "Couldn't reveal that file, sir.", err)
+
+
+# ---------------------------------------------------------------------------
+# Dispatch table — maps action tag string to coroutine
+# ---------------------------------------------------------------------------
+
+async def dispatch(action: str, target: str) -> dict:
+    """Route an action tag to the right handler.
+
+    action: lowercase action name (e.g.
"open_tab", "set_volume") + target: argument string from the tag (may be empty) + """ + t = target.strip() + + if action == "open_tab": + return await open_new_tab(t) + elif action == "close_window": + return await close_chrome_window() + elif action == "browser_back": + return await browser_back() + elif action == "browser_forward": + return await browser_forward() + elif action == "reload": + return await reload_page() + elif action == "get_tab": + return await get_tab_info() + elif action == "switch_app": + return await switch_to_app(t) if t else {"success": False, "confirmation": "Which app, sir?"} + elif action == "quit_app": + return await quit_app(t) if t else {"success": False, "confirmation": "Which app shall I quit, sir?"} + elif action == "hide_app": + return await hide_app(t) if t else {"success": False, "confirmation": "Which app shall I hide, sir?"} + elif action == "minimize_window": + return await minimize_window() + elif action == "maximize_window": + return await maximize_window() + elif action == "move_window": + return await move_window_to_half(t or "left") + elif action == "copy": + return await copy_selection() + elif action == "paste": + return await paste_clipboard() + elif action == "undo": + return await undo_last() + elif action == "redo": + return await redo_last() + elif action == "select_all": + return await select_all() + elif action == "save": + return await save_document() + elif action == "screenshot": + return await take_screenshot() + elif action == "scroll": + return await scroll(t or "down") + elif action == "set_volume": + try: + return await set_volume(int(t)) + except (ValueError, TypeError): + return {"success": False, "confirmation": "Please give a volume level from 0 to 100, sir."} + elif action == "mute": + return await mute_audio() + elif action == "unmute": + return await unmute_audio() + elif action == "get_volume": + return await get_volume() + elif action == "open_folder": + return await open_folder(t) if t else 
{"success": False, "confirmation": "Which folder, sir?"} + elif action == "trash_file": + return await trash_file(t) if t else {"success": False, "confirmation": "Which file, sir?"} + elif action == "reveal_file": + return await reveal_in_finder(t) if t else {"success": False, "confirmation": "Which file, sir?"} + else: + return {"success": False, "confirmation": f"Unknown action '{action}', sir."} diff --git a/work_mode.py b/work_mode.py index 09747b7..dea741d 100644 --- a/work_mode.py +++ b/work_mode.py @@ -53,13 +53,18 @@ async def start(self, working_dir: str, project_name: str = None): self._active = True self._message_count = 0 self._status = "idle" + self._save_session() log.info(f"Work mode started: {self._project_name} ({working_dir})") - async def send(self, user_text: str) -> str: + async def send(self, user_text: str, memory_context: str = "") -> str: """Send a message to claude -p and get the full response. First message in a session: fresh claude -p Subsequent messages: claude -p --continue (resumes last session in dir) + + Args: + user_text: The prompt/instruction to send. + memory_context: Optional memory/context preamble injected before the prompt. """ claude_path = shutil.which("claude") if not claude_path: @@ -77,6 +82,18 @@ async def send(self, user_text: str) -> str: self._status = "working" + # Prepend memory context if provided, separated clearly so Claude Code + # treats it as background knowledge, not an instruction to repeat. + if memory_context: + full_prompt = ( + "\n" + f"{memory_context}\n" + "\n\n" + f"{user_text}" + ) + else: + full_prompt = user_text + try: process = await asyncio.create_subprocess_exec( *cmd, @@ -86,18 +103,21 @@ async def send(self, user_text: str) -> str: cwd=self._working_dir, ) + # Increased from 300s: complex builds (npm install + webpack + tests) + # routinely exceed 5 minutes. 15 minutes covers the vast majority. 
stdout, stderr = await asyncio.wait_for( - process.communicate(input=user_text.encode()), - timeout=300, + process.communicate(input=full_prompt.encode("utf-8", errors="replace")), + timeout=900, ) - response = stdout.decode().strip() + response = stdout.decode("utf-8", errors="replace").strip() self._message_count += 1 + self._save_session() self._status = "done" if process.returncode != 0: - error = stderr.decode().strip()[:200] - log.error(f"claude -p error: {error}") + error = stderr.decode("utf-8", errors="replace").strip()[:200] + log.error(f"claude -p error (rc={process.returncode}): {error}") self._status = "error" return f"Hit a problem, sir: {error}" @@ -105,11 +125,11 @@ async def send(self, user_text: str) -> str: return response except asyncio.TimeoutError: - log.error("claude -p timed out after 300s") + log.error("claude -p timed out after 900s") self._status = "timeout" return "That's taking longer than expected, sir. The operation timed out." except Exception as e: - log.error(f"Work mode error: {e}") + log.error(f"Work mode error: {e}", exc_info=True) self._status = "error" return f"Something went wrong, sir: {str(e)[:100]}" @@ -121,6 +141,7 @@ async def stop(self): self._project_name = None self._message_count = 0 self._status = "idle" + self._clear_session() log.info(f"Work mode ended for {project}") def _save_session(self): From 552bc6cdeb1d5f9526eea1cd6c761a7aa935dd5d Mon Sep 17 00:00:00 2001 From: Najeem Laaroussi Date: Thu, 23 Apr 2026 19:08:22 +0100 Subject: [PATCH 2/2] Add signal handler for session persistence on hard kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - SIGTERM/SIGINT handlers registered in lifespan — on pkill or Ctrl+C, all active WebSocket sessions generate a Haiku summary and persist to SQLite before the process exits. Previously a hard kill discarded the final session summary. 
- Global _active_sessions registry tracks live sessions with buffer, client, and rolling summary refs. Deregistered on clean disconnect so the signal handler only touches sessions that didn't close gracefully. - Rolling summary synced to registry after each mid-session compaction. Co-Authored-By: Claude Sonnet 4.6 --- server.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/server.py b/server.py index 0b86156..9ad8ca9 100644 --- a/server.py +++ b/server.py @@ -13,6 +13,7 @@ import json import logging import os +import signal import sys import time from pathlib import Path @@ -1683,6 +1684,48 @@ def _worker(): log.info("Context refresh thread started") +# Global registry of active WebSocket sessions — used by signal handler for clean shutdown +_active_sessions: dict[str, dict] = {} + + +async def _save_session(sid: str) -> None: + """Generate and persist a session summary. Called on clean disconnect and signal shutdown.""" + data = _active_sessions.get(sid) + if not data: + return + buf = data.get("buffer", []) + client = data.get("client") + rolling_summary = data.get("summary", "") + msg_count = len(buf) // 2 + if msg_count == 0: + return + final_summary = rolling_summary + if client: + try: + recent = buf[-30:] + convo_text = "\n".join( + f"{'User' if m['role'] == 'user' else 'JARVIS'}: {m['content'][:200]}" + for m in recent + ) + resp = await client.messages.create( + model=FAST_MODEL, + max_tokens=250, + system=( + "Summarise this JARVIS conversation in 2-3 sentences. " + "Cover: main topics discussed, decisions made, tasks or projects started, " + "anything personal the user shared. Be specific — names, numbers, and details matter. " + "Write in third person past tense." 
+ ), + messages=[{"role": "user", "content": convo_text}], + ) + final_summary = resp.content[0].text.strip() + except Exception as e: + log.warning(f"Signal shutdown summary failed: {e}") + end_session(sid, final_summary, msg_count) + _active_sessions.pop(sid, None) + log.info(f"Session {sid} saved ({msg_count} exchanges)") + + @asynccontextmanager async def lifespan(application: FastAPI): global anthropic_client, cached_projects @@ -1692,6 +1735,18 @@ async def lifespan(application: FastAPI): log.warning("ANTHROPIC_API_KEY not set — LLM features disabled") cached_projects = [] + # Signal handlers — save all active sessions before process exits + loop = asyncio.get_event_loop() + + async def _graceful_shutdown(signame: str) -> None: + log.info(f"Received {signame} — saving {len(_active_sessions)} active session(s)") + await asyncio.gather(*[_save_session(sid) for sid in list(_active_sessions)], return_exceptions=True) + log.info("All sessions saved. Shutting down.") + loop.stop() + + for _sig in (signal.SIGTERM, signal.SIGINT): + loop.add_signal_handler(_sig, lambda s=_sig.name: asyncio.create_task(_graceful_shutdown(s))) + # Start context refresh in a separate thread (never touches event loop) _refresh_context_sync() log.info("JARVIS server starting") @@ -2392,6 +2447,13 @@ async def voice_handler(ws: WebSocket): session_id = datetime.now().strftime("%Y%m%d_%H%M%S") start_session(session_id) + # Register with global session registry so signal handlers can save on hard kill + _active_sessions[session_id] = { + "buffer": session_buffer, + "client": anthropic_client, + "summary": session_summary, + } + # Load cross-session memory once on connect — injected into every response prior_context = build_session_context() @@ -2940,6 +3002,9 @@ async def _do_summary(): session_summary, rotated, anthropic_client ) summary_update_pending = False + # Keep registry in sync so signal handler has latest summary + if session_id in _active_sessions: + 
_active_sessions[session_id]["summary"] = session_summary asyncio.create_task(_do_summary()) else: summary_update_pending = False @@ -3013,6 +3078,9 @@ async def _do_summary(): elif msg_count > 0: end_session(session_id, session_summary, msg_count) + # Deregister from global registry — session is saved, no longer needs signal protection + _active_sessions.pop(session_id, None) + # --------------------------------------------------------------------------- # Settings / Configuration endpoints