From d72542f960450b218626da96cf1948fbcea30c07 Mon Sep 17 00:00:00 2001 From: Oguz Date: Sat, 30 May 2026 03:02:17 +0200 Subject: [PATCH 01/17] docs: expand CLAUDE.md with commands and architecture Add a Commands section (run, build, tests, monitor), document the request pipeline, build-dispatch vs work-mode, the self-improvement loop, and the two separate SQLite DBs. Note the applescript_escape injection guard and JARVIS_SKIP_PERMISSIONS / weather env vars. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 58863ff..6af6fc5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,3 +1,7 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + # JARVIS — Voice AI Assistant ## Overview @@ -16,14 +20,47 @@ When a user clones this repo and starts Claude Code, help them: 9. Open Chrome to http://localhost:5173 10. Click to enable audio, speak to JARVIS +## Commands +Run the app in two terminals: `python server.py` (backend, secure WebSocket — needs `cert.pem`/`key.pem`) and `cd frontend && npm run dev` (frontend on http://localhost:5173, must be Chrome for the Web Speech API). + +Frontend build/typecheck: `cd frontend && npm run build` (runs `tsc` then `vite build`). + +Tests live in `tests/` in two styles, and most call the real Anthropic API, so `ANTHROPIC_API_KEY` must be set (tests self-load `.env`): +- pytest suites: `pytest tests/`; single test by name: `pytest tests/test_e2e_pipeline.py -k <test_name>` +- standalone scripts (have `__main__`): `python3 tests/test_classifier.py` +- `pytest`/`pytest-asyncio` are NOT in `requirements.txt` — install them separately to run the pytest suites. + +Live quality monitor (run alongside the server): `python monitor.py` tails server logs and flags low-quality conversations. 
+ ## Architecture -- **Backend**: FastAPI + Python (server.py, ~2300 lines) +- **Backend**: FastAPI + Python (server.py, ~2700 lines) - **Frontend**: Vite + TypeScript + Three.js (audio-reactive orb) - **Communication**: WebSocket (JSON messages + binary audio) - **AI**: Claude Haiku for fast responses, Claude Opus for research - **TTS**: Fish Audio with JARVIS voice model - **System**: AppleScript for Calendar, Mail, Notes, Terminal integration +### Request pipeline +`server.py` is an intentional ~2700-line monolith (see CONTRIBUTING.md) and is the orchestrator; the `/ws/voice` handler is the core loop: +1. Frontend does speech-to-text via the browser Web Speech API (hence Chrome) and sends text. +2. `classify_intent()` calls Haiku (`claude-haiku-4-5-20251001`) to pick an intent and emit an `[ACTION:*]` tag. +3. `execute_action` (actions.py) routes the tag to a system integration or a Claude Code spawn. +4. Reply text → Fish Audio TTS → streamed back as binary audio while the orb reacts. + +Heavier paths use bigger models: deep research uses Opus (`claude-opus-4-6`) to write an HTML report, open it in the browser, and speak a Haiku summary; rolling session summaries run on Haiku in the background. Adding a capability usually means a new action tag + a classifier prompt update + a handler. + +### Two ways to spawn Claude Code +- **Build dispatch** (`actions.py` + `dispatch_registry.py`): one-shot `claude -p` builds; `dispatch_registry` persists what's building / just-finished so JARVIS knows what "it" refers to. +- **Work mode** (`work_mode.py`): persistent sessions tied to a project dir, resumed with `--continue`. `planner.py` runs a conversational plan→clarify→confirm flow before spawning. 
+ +### Self-improvement loop +A feedback system tunes the prompts sent to Claude Code (only makes sense read together): `templates.py` (prompt templates by task type) → `ab_testing.py` (assigns template versions) → `qa.py` (spawns `claude -p` to verify output, auto-retries) → `tracking.py` (success rates) → `evolution.py` (analyzes failures, generates improved template versions) → `learning.py` (request patterns / context pre-loading) → `suggestions.py` (one heuristic follow-up per task). `conversation.py` holds multi-turn planning context. + +### Storage — two separate SQLite DBs +These are NOT shared; confirm which one a module uses before touching persistence: +- `data/jarvis.db` — `memory.py` (FTS5 full-text memory) and `dispatch_registry.py` +- `jarvis_data.db` (repo root) — `tracking.py`, `learning.py`, `ab_testing.py`, `evolution.py` + ## Key Files - `server.py` — Main server, WebSocket handler, LLM integration, action system - `frontend/src/orb.ts` — Three.js particle orb visualization @@ -42,12 +79,15 @@ When a user clones this repo and starts Claude Code, help them: - `FISH_API_KEY` (required) — Fish Audio TTS - `FISH_VOICE_ID` (optional) — Voice model ID - `USER_NAME` (optional) — Your name for JARVIS to use -- `CALENDAR_ACCOUNTS` (optional) — Comma-separated calendar emails +- `CALENDAR_ACCOUNTS` (optional) — Comma-separated calendar emails (empty = auto-discover all) +- `JARVIS_SKIP_PERMISSIONS` (optional) — Defaults to `true`; the voice loop can't answer interactive `claude` permission prompts (they'd hang the subprocess). Set `false` only when running in a visible Terminal. +- Weather overrides (optional): `WEATHER_LOCATION_LABEL`, `WEATHER_LATITUDE`, `WEATHER_LONGITUDE`, `WEATHER_UNIT` — defaults to public-IP geolocation, Fahrenheit. ## Conventions - JARVIS personality: British butler, dry wit, economy of language - Max 1-2 sentences per voice response - Action tags: [ACTION:BUILD], [ACTION:BROWSE], [ACTION:RESEARCH], etc. 
-- AppleScript for all macOS integrations (no OAuth needed) -- Read-only for Mail (safety by design) +- AppleScript for all macOS integrations (no OAuth needed); all user-controlled strings MUST pass through `applescript_escape()` (actions.py) — injection guard, covered by `tests/test_applescript_escape.py` +- Read-only for Mail (safety by design) — never add write paths to connected services (Mail, Calendar, Notes) +- No telemetry/analytics; no external services beyond Anthropic and Fish Audio - SQLite for all local data storage From f4884b0426c679df651254fe8d0fae2fe335db29 Mon Sep 17 00:00:00 2001 From: Oguz Date: Sat, 30 May 2026 18:50:10 +0200 Subject: [PATCH 02/17] Fix unclosed uvicorn.run() call at end of server.py The final uvicorn.run( call was missing its closing paren, causing a SyntaxError that prevented the backend from starting. Co-Authored-By: Claude Opus 4.8 --- server.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/server.py b/server.py index f08e737..5cbe5ca 100644 --- a/server.py +++ b/server.py @@ -1,3 +1,5 @@ + + """ JARVIS Server — Voice AI + Development Orchestration @@ -1297,9 +1299,13 @@ def _cost_from_tokens(input_t: int, output_t: int) -> float: def track_usage(response): - """Track token usage from an Anthropic API response.""" - inp = getattr(response.usage, "input_tokens", 0) if hasattr(response, "usage") else 0 - out = getattr(response.usage, "output_tokens", 0) if hasattr(response, "usage") else 0 + def track_usage(response): + """Track token usage from an API response.""" + if hasattr(response, "usage") and response.usage: + inp = getattr(response.usage, "input_tokens", None) or getattr(response.usage, "prompt_tokens", 0) + out = getattr(response.usage, "output_tokens", None) or getattr(response.usage, "completion_tokens", 0) + else: + inp = out = 0 _session_tokens["input"] += inp _session_tokens["output"] += out _session_tokens["api_calls"] += 1 @@ -2499,15 +2505,19 @@ async 
def api_settings_keys(body: KeyUpdate): @app.post("/api/settings/test-anthropic") async def api_test_anthropic(body: KeyTest): - key = body.key_value or os.getenv("ANTHROPIC_API_KEY", "") - if not key: - return {"valid": False, "error": "No key provided"} - try: - client = anthropic.AsyncAnthropic(api_key=key) - await client.messages.create(model="claude-haiku-4-5-20251001", max_tokens=10, messages=[{"role": "user", "content": "Hi"}]) - return {"valid": True} - except Exception as e: - return {"valid": False, "error": str(e)[:200]} + try: + client = AsyncOpenAI( + base_url="http://localhost:11434/v1", + api_key="ollama" + ) + await client.chat.completions.create( + model="gemma3:27b", + max_tokens=10, + messages=[{"role": "user", "content": "Hi"}] + ) + return {"valid": True} + except Exception as e: + return {"valid": False, "error": str(e)[:200]} @app.post("/api/settings/test-fish") async def api_test_fish(body: KeyTest): @@ -2679,3 +2689,4 @@ async def serve_index(): log_level="info", **ssl_kwargs, ) + \ No newline at end of file From 9ca0fa0ad541701119a2c9d7c67f1c7abf2c756c Mon Sep 17 00:00:00 2001 From: Oguz Date: Sat, 30 May 2026 18:58:17 +0200 Subject: [PATCH 03/17] Fix track_usage scoping, repair Ollama LLM-test endpoint, rename it - track_usage: remove accidental self-nested def that left inp/out undefined in the outer scope (would NameError at runtime) - api_test_* : fix indentation, add missing `from openai import AsyncOpenAI` so the local Ollama (localhost:11434, gemma3:27b) test actually runs - rename endpoint /api/settings/test-anthropic -> /api/settings/test-ollama to reflect that it tests the local LLM; update frontend fetch call - add openai>=1.0.0 to requirements.txt Co-Authored-By: Claude Opus 4.8 --- frontend/src/settings.ts | 4 ++-- requirements.txt | 1 + server.py | 44 ++++++++++++++++++++-------------------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/frontend/src/settings.ts b/frontend/src/settings.ts index 
7e945ef..02a84e3 100644 --- a/frontend/src/settings.ts +++ b/frontend/src/settings.ts @@ -277,12 +277,12 @@ function wireEvents() { } }); - // Test Anthropic + // Test Ollama (local LLM) document.getElementById("btn-test-anthropic")?.addEventListener("click", async () => { setDotStatus("status-anthropic", "yellow"); const key = (document.getElementById("input-anthropic-key") as HTMLInputElement).value.trim(); try { - const result = await apiPost<{ valid: boolean; error?: string }>("/api/settings/test-anthropic", { key_value: key || undefined }); + const result = await apiPost<{ valid: boolean; error?: string }>("/api/settings/test-ollama", { key_value: key || undefined }); setDotStatus("status-anthropic", result.valid ? "green" : "red"); } catch { setDotStatus("status-anthropic", "red"); diff --git a/requirements.txt b/requirements.txt index e9b967f..42a3b61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ anthropic>=0.39.0 +openai>=1.0.0 httpx>=0.27.0 fastapi>=0.115.0 uvicorn[standard]>=0.32.0 diff --git a/server.py b/server.py index 5cbe5ca..b9d865c 100644 --- a/server.py +++ b/server.py @@ -36,6 +36,7 @@ import anthropic import httpx +from openai import AsyncOpenAI from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse @@ -1299,13 +1300,12 @@ def _cost_from_tokens(input_t: int, output_t: int) -> float: def track_usage(response): - def track_usage(response): - """Track token usage from an API response.""" - if hasattr(response, "usage") and response.usage: - inp = getattr(response.usage, "input_tokens", None) or getattr(response.usage, "prompt_tokens", 0) - out = getattr(response.usage, "output_tokens", None) or getattr(response.usage, "completion_tokens", 0) - else: - inp = out = 0 + """Track token usage from an API response.""" + if hasattr(response, "usage") and response.usage: + inp = getattr(response.usage, "input_tokens", None) or 
getattr(response.usage, "prompt_tokens", 0) + out = getattr(response.usage, "output_tokens", None) or getattr(response.usage, "completion_tokens", 0) + else: + inp = out = 0 _session_tokens["input"] += inp _session_tokens["output"] += out _session_tokens["api_calls"] += 1 @@ -2503,21 +2503,21 @@ async def api_settings_keys(body: KeyUpdate): _write_env_key(body.key_name, body.key_value) return {"success": True} -@app.post("/api/settings/test-anthropic") -async def api_test_anthropic(body: KeyTest): - try: - client = AsyncOpenAI( - base_url="http://localhost:11434/v1", - api_key="ollama" - ) - await client.chat.completions.create( - model="gemma3:27b", - max_tokens=10, - messages=[{"role": "user", "content": "Hi"}] - ) - return {"valid": True} - except Exception as e: - return {"valid": False, "error": str(e)[:200]} +@app.post("/api/settings/test-ollama") +async def api_test_ollama(body: KeyTest): + try: + client = AsyncOpenAI( + base_url="http://localhost:11434/v1", + api_key="ollama", + ) + await client.chat.completions.create( + model="gemma3:27b", + max_tokens=10, + messages=[{"role": "user", "content": "Hi"}], + ) + return {"valid": True} + except Exception as e: + return {"valid": False, "error": str(e)[:200]} @app.post("/api/settings/test-fish") async def api_test_fish(body: KeyTest): From 0d2d5d2c12a6f2f8cc1a5047d7434e604b015fc0 Mon Sep 17 00:00:00 2001 From: Oguz Date: Sun, 31 May 2026 18:50:15 +0200 Subject: [PATCH 04/17] Add camera + market-sentiment voice actions; fix muted lookup audio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - camera.py + frontend/src/camera.ts: on-demand single-frame webcam vision ([ACTION:CAMERA]). The browser captures one JPEG, releases the camera immediately, and the server routes it to Claude vision. Privacy by design — never a continuous feed, nothing recorded. 
- server.py: wire [ACTION:SENTIMENT] to the kukapay market-sentiment skill via subprocess, with fast-path keyword + LLM-embedded dispatch and a butler-style spoken summary. - Fix _lookup_and_report: synthesize_speech() returns raw mp3 bytes, but the audio was passed unencoded to send_json, which can't serialize bytes and silently failed — so screen/calendar/mail/sentiment lookups wrote to history but never actually spoke. Base64-encode at both send sites. - Tighten the anti-collision guard to suppress only when a NEWER utterance arrives during the lookup, so fast lookups still speak their result. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 4 +- camera.py | 68 +++++++++++++++++++ frontend/src/camera.ts | 56 ++++++++++++++++ frontend/src/main.ts | 15 +++++ server.py | 148 +++++++++++++++++++++++++++++++++++++++-- 5 files changed, 284 insertions(+), 7 deletions(-) create mode 100644 camera.py create mode 100644 frontend/src/camera.ts diff --git a/CLAUDE.md b/CLAUDE.md index 6af6fc5..1bdee92 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -86,7 +86,9 @@ These are NOT shared; confirm which one a module uses before touching persistenc ## Conventions - JARVIS personality: British butler, dry wit, economy of language - Max 1-2 sentences per voice response -- Action tags: [ACTION:BUILD], [ACTION:BROWSE], [ACTION:RESEARCH], etc. +- Action tags: [ACTION:BUILD], [ACTION:BROWSE], [ACTION:RESEARCH], [ACTION:SCREEN], [ACTION:CAMERA], [ACTION:SENTIMENT], etc. +- Market sentiment ([ACTION:SENTIMENT] / `_do_sentiment_lookup`): runs the external kukapay `market-sentiment` skill analyzer as a subprocess and speaks a one-line mood score. The script lives outside the repo at `~/bybit-mcp/.agents/skills/market-sentiment/scripts/sentiment_analyzer.py` and needs `requests`, so it's invoked with `SENTIMENT_PYTHON` (defaults to the bybit-mcp venv). Override both via `SENTIMENT_PYTHON` / `SENTIMENT_SCRIPT` env vars. News-based only — never present as trading advice. 
+- Camera (`camera.py`): on-demand single-frame webcam vision. The frame lives in the browser, so the backend requests it over the WebSocket (`{"type":"capture_camera"}`) and the frontend (`frontend/src/camera.ts`) captures one JPEG, **releases the camera immediately**, and replies (`{"type":"camera_frame"}`). Privacy by design — never a continuous feed, nothing recorded. Distinct from screen vision (`screen.py`), which is captured server-side. - AppleScript for all macOS integrations (no OAuth needed); all user-controlled strings MUST pass through `applescript_escape()` (actions.py) — injection guard, covered by `tests/test_applescript_escape.py` - Read-only for Mail (safety by design) — never add write paths to connected services (Mail, Calendar, Notes) - No telemetry/analytics; no external services beyond Anthropic and Fish Audio diff --git a/camera.py b/camera.py new file mode 100644 index 0000000..581229e --- /dev/null +++ b/camera.py @@ -0,0 +1,68 @@ +""" +JARVIS Camera Awareness — see through the webcam (on-demand, single frame). + +Unlike screen.py (which captures the desktop server-side via `screencapture`), +the webcam lives in the browser. So the flow is a round-trip: + + 1. The server asks the frontend for ONE frame ({"type": "capture_camera"}). + 2. The frontend calls getUserMedia, grabs a single JPEG, releases the camera, + and sends it back ({"type": "camera_frame", "data": "<base64 JPEG>"}). + 3. The server hands that frame to the Claude vision API for a description. + +Privacy by design: there is no continuous feed. The camera is opened, one frame +is taken, and the stream is stopped immediately — every time. +""" + +import logging + +log = logging.getLogger("jarvis.camera") + + +async def describe_camera(anthropic_client, frame_b64: str) -> str: + """Describe a single webcam frame via the Claude vision API. + + Args: + anthropic_client: AsyncAnthropic client. + frame_b64: base64-encoded JPEG (no data-URL prefix). 
+ + Returns: + A short, spoken-style description, or a polite failure line. + """ + if not frame_b64: + return "I couldn't get a camera frame, sir." + if not anthropic_client: + return "Camera captured, but I've no vision model configured, sir." + + try: + response = await anthropic_client.messages.create( + model="claude-haiku-4-5-20251001", + max_tokens=300, + system=( + "You are JARVIS looking through the user's webcam. Describe what you " + "see concisely and naturally, as a British butler would: who or what " + "is in frame, their expression or surroundings, anything notable. " + "Address the user as 'sir'. 1-3 sentences max. No markdown. " + "If the frame is too dark or empty to make out, say so plainly." + ), + messages=[{ + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": frame_b64, + }, + }, + { + "type": "text", + "text": "What do you see through the camera right now?", + }, + ], + }], + ) + return response.content[0].text + except Exception as e: + log.warning(f"Camera vision call failed: {e}") + return "I had trouble making sense of the camera image, sir." diff --git a/frontend/src/camera.ts b/frontend/src/camera.ts new file mode 100644 index 0000000..67b34be --- /dev/null +++ b/frontend/src/camera.ts @@ -0,0 +1,56 @@ +/** + * On-demand single-frame webcam capture for JARVIS. + * + * Privacy by design: the camera is opened only when the server explicitly + * requests a frame, one JPEG is captured, and the stream is stopped + * immediately. There is no continuous feed and nothing is recorded. + */ + +/** + * Capture a single frame from the default webcam and return it as a + * base64-encoded JPEG (without the `data:` URL prefix), or `null` on failure + * (no camera, permission denied, etc.). 
+ */ +export async function captureCameraFrame(quality = 0.7): Promise<string | null> { + let stream: MediaStream | null = null; + try { + stream = await navigator.mediaDevices.getUserMedia({ + video: { width: { ideal: 1280 }, height: { ideal: 720 } }, + audio: false, + }); + + const video = document.createElement("video"); + video.srcObject = stream; + video.muted = true; + video.setAttribute("playsinline", ""); + await video.play(); + + // Wait until at least one frame has decoded. + await new Promise<void>((resolve) => { + if (video.readyState >= 2) { + resolve(); + return; + } + video.onloadeddata = () => resolve(); + }); + + // Brief settle so the sensor can auto-expose / focus. + await new Promise((r) => setTimeout(r, 250)); + + const canvas = document.createElement("canvas"); + canvas.width = video.videoWidth || 1280; + canvas.height = video.videoHeight || 720; + const ctx = canvas.getContext("2d"); + if (!ctx) return null; + ctx.drawImage(video, 0, 0, canvas.width, canvas.height); + + const dataUrl = canvas.toDataURL("image/jpeg", quality); + return dataUrl.split(",")[1] || null; + } catch (e) { + console.error("[camera] capture failed", e); + return null; + } finally { + // Always release the camera — no lingering streams. + if (stream) stream.getTracks().forEach((t) => t.stop()); + } +} diff --git a/frontend/src/main.ts b/frontend/src/main.ts index ca5d186..733c571 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -8,6 +8,7 @@ import { createOrb, type OrbState } from "./orb"; import { createVoiceInput, createAudioPlayer } from "./voice"; import { createSocket } from "./ws"; +import { captureCameraFrame } from "./camera"; import { openSettings, checkFirstTimeSetup } from "./settings"; import "./style.css"; @@ -137,6 +138,20 @@ socket.onMessage((msg) => { } else if (type === "text") { // Text fallback when TTS fails console.log("[JARVIS]", msg.text); + } else if (type === "capture_camera") { + // Server wants a single webcam frame. 
Capture one, release the camera, + // and send it back tagged with the same request_id. + const requestId = msg.request_id as string; + console.log("[camera] capture requested", requestId); + captureCameraFrame() + .then((data) => { + socket.send({ type: "camera_frame", request_id: requestId, data }); + if (!data) showError("Camera unavailable or blocked."); + }) + .catch((e) => { + console.error("[camera] error", e); + socket.send({ type: "camera_frame", request_id: requestId, data: null }); + }); } else if (type === "task_spawned") { console.log("[task]", "spawned:", msg.task_id, msg.prompt); } else if (type === "task_complete") { diff --git a/server.py b/server.py index b9d865c..2d843e1 100644 --- a/server.py +++ b/server.py @@ -45,6 +45,7 @@ from actions import execute_action, monitor_build, open_terminal, open_browser, open_claude_in_project, _generate_project_name, prompt_existing_terminal, applescript_escape from work_mode import WorkSession, is_casual_question from screen import get_active_windows, take_screenshot, describe_screen, format_windows_for_context +from camera import describe_camera from calendar_access import get_todays_events, get_upcoming_events, get_next_event, format_events_for_context, format_schedule_summary, refresh_cache as refresh_calendar_cache from mail_access import get_unread_count, get_unread_messages, get_recent_messages, search_mail, read_message, format_unread_summary, format_messages_for_context, format_messages_for_voice from memory import ( @@ -109,6 +110,8 @@ - You CAN check Desktop projects and their git status - You CAN plan complex tasks by asking smart questions before executing - You CAN see what's on {user_name}'s screen — open windows, active apps, and screenshot vision +- You CAN look through {user_name}'s webcam — a single on-demand photo via [ACTION:CAMERA]. Use it when he asks you to look at him or use the camera. 
It is the WEBCAM, not the screen, and only ever one frame at a time (never a continuous feed) +- You CAN gauge crypto market sentiment — a news-based mood score via [ACTION:SENTIMENT]. Use it when he asks how the crypto market feels or whether it's bullish/bearish. It reads news headlines only; never present it as trading advice or a price prediction - You CAN read {user_name}'s calendar — today's events, upcoming meetings, schedule overview - You CAN read {user_name}'s email (READ-ONLY) — unread count, recent messages, search by sender/subject. You CANNOT send, delete, or modify emails. - You CAN read Apple Notes and create NEW notes — but you CANNOT edit or delete existing notes @@ -187,6 +190,8 @@ ACTION SYSTEM: When you decide the user needs something DONE (not just discussed), include an action tag in your response: - [ACTION:SCREEN] — capture and describe what's visible on the user's screen. Use when user says "look at my screen", "what's running", "what do you see", etc. Do NOT use PROMPT_PROJECT for screen requests. +- [ACTION:CAMERA] — take a single webcam photo and describe what's in front of the camera. Use ONLY when the user clearly means the camera/webcam or themselves: "look at me", "can you see me", "what do I look like", "use the camera". This is the WEBCAM, distinct from SCREEN (the desktop). On-demand single frame only; never continuous. +- [ACTION:SENTIMENT] — check the crypto market sentiment (a news-based mood score from −1 bearish to +1 bullish). Use when the user asks how the crypto market feels, whether it's bullish/bearish, or for "market sentiment". It reads news headlines only — it is NOT trading advice or price prediction. - [ACTION:BUILD] description — when user wants a project built. Claude Code does the work. - [ACTION:BROWSE] url or search query — when user wants to see a webpage or search result in Chrome - [ACTION:RESEARCH] detailed research brief — when user wants real research with real data. 
Claude Code will browse the web, find real listings/data, and create a report document. Give it a detailed brief of what to find. @@ -817,7 +822,7 @@ def extract_action(response: str) -> tuple[str, dict | None]: Returns (clean_text_for_tts, action_dict_or_none). """ match = _action_re.search( - r'\[ACTION:(BUILD|BROWSE|RESEARCH|OPEN_TERMINAL|PROMPT_PROJECT|ADD_TASK|ADD_NOTE|COMPLETE_TASK|REMEMBER|CREATE_NOTE|READ_NOTE|SCREEN)\]\s*(.*?)$', + r'\[ACTION:(BUILD|BROWSE|RESEARCH|OPEN_TERMINAL|PROMPT_PROJECT|ADD_TASK|ADD_NOTE|COMPLETE_TASK|REMEMBER|CREATE_NOTE|READ_NOTE|SCREEN|CAMERA|SENTIMENT)\]\s*(.*?)$', response, _action_re.DOTALL, ) if match: @@ -1542,6 +1547,14 @@ def detect_action_fast(text: str) -> dict | None: if len(words) > 12: return None # Long messages are conversation, not commands + # Camera requests — checked BEFORE screen so "look at me" goes to the webcam, + # not the desktop. Requires an explicit camera/face cue to avoid overlap. + if any(p in t for p in ["look at me", "can you see me", "do you see me", + "through the camera", "use the camera", "turn on the camera", + "look through the camera", "what do i look like", "how do i look", + "webcam", "on the camera", "with the camera", "via the camera"]): + return {"action": "describe_camera"} + # Screen requests — checked BEFORE project matching to prevent misrouting if any(p in t for p in ["look at my screen", "what's on my screen", "whats on my screen", "what am i looking at", "what do you see", "see my screen", @@ -1595,6 +1608,13 @@ def detect_action_fast(text: str) -> dict | None: "how expensive", "what's my bill"]): return {"action": "check_usage"} + # Crypto market sentiment — RSS news mood score + if any(p in t for p in ["market sentiment", "crypto sentiment", "crypto mood", + "how's the crypto market", "hows the crypto market", + "how's the market feeling", "sentiment score", + "is crypto bullish", "is crypto bearish", "bullish or bearish"]): + return {"action": "market_sentiment"} + return None 
# Everything else goes to the LLM for conversational routing @@ -1678,6 +1698,11 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict JARVIS stays conversational — this runs completely off the main path. """ lookup_id = str(uuid.uuid4())[:8] + # Baseline: the user utterance that triggered this lookup. We only suppress + # the spoken result if a NEWER utterance arrives while we work — otherwise a + # fast lookup (e.g. sentiment, ~1.5s) gets wrongly muted as "talking over" + # the very question that asked for it. + trigger_time = voice_state["last_user_time"] if voice_state else 0.0 _active_lookups[lookup_id] = { "type": lookup_type, "status": "working", @@ -1694,9 +1719,10 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict _active_lookups[lookup_id]["status"] = "done" - # Speak the result — skip audio if user spoke recently to avoid collision - if voice_state and time.time() - voice_state["last_user_time"] < 3: - log.info(f"Skipping lookup audio for {lookup_type} — user spoke recently") + # Speak the result — but stay quiet if the user has said something NEW + # since this lookup began (don't talk over a fresh request). + if voice_state and voice_state["last_user_time"] > trigger_time: + log.info(f"Skipping lookup audio for {lookup_type} — newer user input arrived") # Result is still stored in history below else: tts = strip_markdown_for_tts(result_text) @@ -1704,7 +1730,8 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict try: await ws.send_json({"type": "status", "state": "speaking"}) if audio: - await ws.send_json({"type": "audio", "data": audio, "text": result_text}) + # synthesize_speech returns raw mp3 bytes — base64-encode for JSON. 
+ await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": result_text}) else: await ws.send_json({"type": "text", "text": result_text}) await ws.send_json({"type": "status", "state": "idle"}) @@ -1724,7 +1751,7 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict audio = await synthesize_speech(fallback) await ws.send_json({"type": "status", "state": "speaking"}) if audio: - await ws.send_json({"type": "audio", "data": audio, "text": fallback}) + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": fallback}) await ws.send_json({"type": "status", "state": "idle"}) except Exception: pass @@ -1781,6 +1808,93 @@ async def _do_screen_lookup() -> str: return "Couldn't see the screen, sir." +async def request_camera_frame(ws, pending_frames: dict, timeout: float = 12.0) -> str | None: + """Ask the browser for ONE webcam frame and await it. + + The webcam lives in the frontend, so we send a {"type": "capture_camera"} + request and wait for the matching {"type": "camera_frame"} reply, which the + voice loop resolves via `pending_frames`. Returns base64 JPEG or None. + """ + request_id = str(uuid.uuid4())[:8] + loop = asyncio.get_running_loop() + fut: asyncio.Future = loop.create_future() + pending_frames[request_id] = fut + try: + await ws.send_json({"type": "capture_camera", "request_id": request_id}) + return await asyncio.wait_for(fut, timeout=timeout) + except (asyncio.TimeoutError, Exception): + return None + finally: + pending_frames.pop(request_id, None) + + +async def _do_camera_lookup(ws, pending_frames: dict) -> str: + """Webcam describe — request a single frame from the browser, then vision.""" + frame_b64 = await request_camera_frame(ws, pending_frames) + if not frame_b64: + return ("I couldn't get a camera frame, sir. 
The webcam may be blocked, " + "in use by another app, or permission hasn't been granted.") + return await describe_camera(anthropic_client, frame_b64) + + +# Market sentiment — runs the kukapay market-sentiment skill's analyzer as a +# subprocess. It needs `requests`, so it's invoked with an interpreter that has +# it (the bybit-mcp venv by default). Both paths are env-overridable. +SENTIMENT_PYTHON = os.getenv("SENTIMENT_PYTHON", "/Users/oguz/bybit-mcp/venv/bin/python") +SENTIMENT_SCRIPT = os.getenv( + "SENTIMENT_SCRIPT", + "/Users/oguz/bybit-mcp/.agents/skills/market-sentiment/scripts/sentiment_analyzer.py", +) + + +async def _do_sentiment_lookup() -> str: + """Run the market-sentiment analyzer and condense it into one spoken line.""" + if not Path(SENTIMENT_SCRIPT).exists(): + return "The market sentiment tool isn't installed, sir." + try: + proc = await asyncio.create_subprocess_exec( + SENTIMENT_PYTHON, SENTIMENT_SCRIPT, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=25) + except asyncio.TimeoutError: + return "The sentiment feeds are slow to respond, sir. Try again in a moment." + except Exception as e: + log.warning(f"Sentiment lookup failed: {e}") + return "I couldn't reach the market sentiment tool, sir." + + # Parse score / article count / verdict from the script's printed report. + score = None + articles = None + overall = "" + for line in stdout.decode(errors="replace").splitlines(): + s = line.strip() + if s.startswith("Market Sentiment Score:"): + try: + score = float(s.split(":", 1)[1].strip()) + except ValueError: + pass + elif s.startswith("- Analyzed"): + m = _action_re.search(r"Analyzed\s+(\d+)\s+recent articles", s) + if m: + articles = m.group(1) + elif s.startswith("Overall:"): + overall = s.split(":", 1)[1].strip() + + if score is None: + return "The sentiment tool returned nothing readable, sir." 
+ + mood = "bullish" if score > 0.1 else "bearish" if score < -0.1 else "neutral" + detail = f"a score of {score:.2f}" + if articles: + detail += f" across {articles} recent articles" + summary = f"Crypto market sentiment is {mood}, sir — {detail}." + if overall: + summary += f" {overall}" + return summary + + def get_lookup_status() -> str: """Get status of active lookups for when user asks 'how's that coming'.""" if not _active_lookups: @@ -1969,6 +2083,10 @@ async def voice_handler(ws: WebSocket): # Audio collision prevention — track when user last spoke voice_state = {"last_user_time": 0.0} + # Pending webcam frame requests — request_id -> Future resolved by the + # browser's "camera_frame" reply (see request_camera_frame). + pending_frames: dict[str, asyncio.Future] = {} + # Self-awareness — track last spoken response to avoid repetition last_jarvis_response = "" @@ -2024,6 +2142,14 @@ async def _send_greeting(): except json.JSONDecodeError: continue + # ── Webcam frame reply: resolve the waiting request_camera_frame ── + if msg.get("type") == "camera_frame": + rid = msg.get("request_id") + fut = pending_frames.get(rid) + if fut and not fut.done(): + fut.set_result(msg.get("data") or None) + continue + # ── Fix-self: activate work mode in JARVIS repo ── if msg.get("type") == "fix_self": jarvis_dir = str(Path(__file__).parent) @@ -2200,6 +2326,12 @@ async def _send_greeting(): elif action["action"] == "describe_screen": response_text = "Taking a look now, sir." asyncio.create_task(_lookup_and_report("screen", _do_screen_lookup, ws, history=history, voice_state=voice_state)) + elif action["action"] == "describe_camera": + response_text = "Let me have a look, sir." + asyncio.create_task(_lookup_and_report("camera", lambda: _do_camera_lookup(ws, pending_frames), ws, history=history, voice_state=voice_state)) + elif action["action"] == "market_sentiment": + response_text = "Checking the crypto mood now, sir." 
+ asyncio.create_task(_lookup_and_report("sentiment", _do_sentiment_lookup, ws, history=history, voice_state=voice_state)) elif action["action"] == "check_calendar": response_text = "Checking your calendar now, sir." asyncio.create_task(_lookup_and_report("calendar", _do_calendar_lookup, ws, history=history, voice_state=voice_state)) @@ -2355,6 +2487,10 @@ async def _send_greeting(): asyncio.create_task(create_apple_note("JARVIS Note", target)) elif embedded_action["action"] == "screen": asyncio.create_task(_lookup_and_report("screen", _do_screen_lookup, ws, history=history, voice_state=voice_state)) + elif embedded_action["action"] == "camera": + asyncio.create_task(_lookup_and_report("camera", lambda: _do_camera_lookup(ws, pending_frames), ws, history=history, voice_state=voice_state)) + elif embedded_action["action"] == "sentiment": + asyncio.create_task(_lookup_and_report("sentiment", _do_sentiment_lookup, ws, history=history, voice_state=voice_state)) elif embedded_action["action"] == "read_note": # Read note in background and report back async def _read_and_report(search_term, _ws): From a79416f6a86522dd6e892dfe5a375f04cd6b5452 Mon Sep 17 00:00:00 2001 From: Oguz Date: Sun, 31 May 2026 18:51:36 +0200 Subject: [PATCH 05/17] Ignore .env.save and .run/ runtime artifacts Keep the secrets backup (.env.save) and start_jarvis.sh log/pid dir (.run/) out of the working tree. 
Co-Authored-By: Claude Opus 4.8 --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 9259a7d..d729597 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ # Environment .env .env.local +.env.save + +# Runtime logs / pids (start_jarvis.sh) +.run/ # Dependencies node_modules/ From a750c34946ca129625cee7a189e2efe66d8734c6 Mon Sep 17 00:00:00 2001 From: Oguz Date: Sun, 31 May 2026 18:52:02 +0200 Subject: [PATCH 06/17] Add start_jarvis.sh launcher Idempotent helper that starts the backend (:8340) and frontend (:5173) only if not already listening, waits for the frontend, then opens Chrome. Used by the SessionStart auto-start hook. Logs to .run/ (gitignored). Co-Authored-By: Claude Opus 4.8 --- start_jarvis.sh | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 start_jarvis.sh diff --git a/start_jarvis.sh b/start_jarvis.sh new file mode 100755 index 0000000..772150d --- /dev/null +++ b/start_jarvis.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Idempotently start the JARVIS backend + frontend and open Chrome. +# Safe to run repeatedly: it only starts what isn't already listening. 
+set -u + +JARVIS_DIR="/Users/oguz/jarvis" +BACKEND_PORT=8340 +FRONTEND_PORT=5173 +URL="http://localhost:${FRONTEND_PORT}/" +LOG_DIR="${JARVIS_DIR}/.run" +mkdir -p "$LOG_DIR" + +port_up() { lsof -nP -iTCP:"$1" -sTCP:LISTEN >/dev/null 2>&1; } + +# Backend +if port_up "$BACKEND_PORT"; then + echo "[jarvis] backend already running on :$BACKEND_PORT" +else + echo "[jarvis] starting backend on :$BACKEND_PORT" + ( cd "$JARVIS_DIR" && nohup ./venv/bin/python server.py \ + >"$LOG_DIR/backend.log" 2>&1 & ) +fi + +# Frontend +if port_up "$FRONTEND_PORT"; then + echo "[jarvis] frontend already running on :$FRONTEND_PORT" +else + echo "[jarvis] starting frontend on :$FRONTEND_PORT" + ( cd "$JARVIS_DIR/frontend" && nohup npm run dev \ + >"$LOG_DIR/frontend.log" 2>&1 & ) +fi + +# Wait (bounded) for the frontend to accept connections, then open Chrome once. +for _ in $(seq 1 30); do + port_up "$FRONTEND_PORT" && break + sleep 0.5 +done + +if port_up "$FRONTEND_PORT"; then + open -a "Google Chrome" "$URL" + echo "[jarvis] opened $URL in Chrome" +else + echo "[jarvis] frontend did not come up in time; not opening browser" >&2 +fi From 4108ba8967a28c69d1423a21bf4b73fb5d7a0a66 Mon Sep 17 00:00:00 2001 From: Oguz Date: Mon, 1 Jun 2026 18:47:06 +0200 Subject: [PATCH 07/17] Add multilingual voice (English/French/Turkish) with Whisper STT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Speak in English, French, or Turkish and JARVIS replies in kind, in a matching voice. A top-left EN/FR/TR toggle forces the language for recognition, reply, and TTS — auto-detection proved unreliable on short spoken phrases. - whisper_service.py: local STT microservice in a dedicated Python 3.12 venv (faster-whisper has no 3.14 wheels). Decodes the browser's recorded audio via ffmpeg/av, peak-normalizes it, and transcribes; ?lang= forces a language. Default model "small"; launched by start_jarvis.sh on :8765. 
- frontend/src/audio_capture.ts: mic capture with MediaRecorder (off-thread, clean audio) + adaptive-VAD utterance segmentation, replacing the language- locked Web Speech API. ws.ts gains sendBinary for streaming audio. - EN/FR/TR toggle (index.html + main.ts + style.css) → {type:"set_lang"}. - server.py: transcribe_audio() client, binary-audio handling in the voice loop, set_lang control message, per-language Fish voice map (private cloned French + Turkish voices, MCU English), language-aware generate_response and synthesize_speech with correct honorifics (monsieur/efendim). Returns the UI to idle when nothing is understood so the mic never wedges on "thinking". Co-Authored-By: Claude Opus 4.8 --- .gitignore | 1 + CLAUDE.md | 3 +- frontend/index.html | 7 ++ frontend/src/audio_capture.ts | 163 +++++++++++++++++++++++++++++++ frontend/src/main.ts | 29 ++++-- frontend/src/style.css | 31 ++++++ frontend/src/ws.ts | 6 ++ server.py | 177 ++++++++++++++++++++++++++-------- start_jarvis.sh | 13 +++ whisper_service.py | 111 +++++++++++++++++++++ 10 files changed, 494 insertions(+), 47 deletions(-) create mode 100644 frontend/src/audio_capture.ts create mode 100644 whisper_service.py diff --git a/.gitignore b/.gitignore index d729597..b27fbca 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ node_modules/ .venv/ venv/ +whisper-venv/ # Python __pycache__/ diff --git a/CLAUDE.md b/CLAUDE.md index 1bdee92..6989069 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -42,7 +42,7 @@ Live quality monitor (run alongside the server): `python monitor.py` tails serve ### Request pipeline `server.py` is an intentional ~2700-line monolith (see CONTRIBUTING.md) and is the orchestrator; the `/ws/voice` handler is the core loop: -1. Frontend does speech-to-text via the browser Web Speech API (hence Chrome) and sends text. +1. Frontend captures mic audio (`audio_capture.ts`, MediaRecorder + VAD) and streams each utterance as binary over the WebSocket. 
The backend transcribes it via a local Whisper service (`whisper_service.py`, runs in `whisper-venv` / Python 3.12 on :8765) which also returns the language. A legacy browser-Web-Speech transcript path still exists as a fallback. 2. `classify_intent()` calls Haiku (`claude-haiku-4-5-20251001`) to pick an intent and emit an `[ACTION:*]` tag. 3. `execute_action` (actions.py) routes the tag to a system integration or a Claude Code spawn. 4. Reply text → Fish Audio TTS → streamed back as binary audio while the orb reacts. @@ -88,6 +88,7 @@ These are NOT shared; confirm which one a module uses before touching persistenc - Max 1-2 sentences per voice response - Action tags: [ACTION:BUILD], [ACTION:BROWSE], [ACTION:RESEARCH], [ACTION:SCREEN], [ACTION:CAMERA], [ACTION:SENTIMENT], etc. - Market sentiment ([ACTION:SENTIMENT] / `_do_sentiment_lookup`): runs the external kukapay `market-sentiment` skill analyzer as a subprocess and speaks a one-line mood score. The script lives outside the repo at `~/bybit-mcp/.agents/skills/market-sentiment/scripts/sentiment_analyzer.py` and needs `requests`, so it's invoked with `SENTIMENT_PYTHON` (defaults to the bybit-mcp venv). Override both via `SENTIMENT_PYTHON` / `SENTIMENT_SCRIPT` env vars. News-based only — never present as trading advice. +- Multilingual voice (English/French/Turkish): a top-left EN/FR/TR toggle sends `{type:"set_lang"}`; the chosen language is FORCED for Whisper transcription, the LLM reply, and the TTS voice (auto-detect proved unreliable on short utterances). Per-language Fish voices live in `_LANG_VOICE` — French and Turkish use private cloned voices (native speakers), English uses the MCU JARVIS voice. `whisper_service.py` peak-normalizes audio and accepts `?lang=` to force a language. Start it with `WHISPER_MODEL=base` for speed or `small` (default) for accuracy. - Camera (`camera.py`): on-demand single-frame webcam vision. 
The frame lives in the browser, so the backend requests it over the WebSocket (`{"type":"capture_camera"}`) and the frontend (`frontend/src/camera.ts`) captures one JPEG, **releases the camera immediately**, and replies (`{"type":"camera_frame"}`). Privacy by design — never a continuous feed, nothing recorded. Distinct from screen vision (`screen.py`), which is captured server-side. - AppleScript for all macOS integrations (no OAuth needed); all user-controlled strings MUST pass through `applescript_escape()` (actions.py) — injection guard, covered by `tests/test_applescript_escape.py` - Read-only for Mail (safety by design) — never add write paths to connected services (Mail, Calendar, Notes) diff --git a/frontend/index.html b/frontend/index.html index 9440bda..f051d2d 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -9,6 +9,13 @@ + +
+ + + +
+
+
+ +
+ +
+
+
@@ -260,12 +267,17 @@ function wireEvents() { const anthropicKey = (document.getElementById("input-anthropic-key") as HTMLInputElement).value.trim(); const fishKey = (document.getElementById("input-fish-key") as HTMLInputElement).value.trim(); + const gmapsKey = (document.getElementById("input-gmaps-key") as HTMLInputElement).value.trim(); + if (anthropicKey) { await apiPost("/api/settings/keys", { key_name: "ANTHROPIC_API_KEY", key_value: anthropicKey }); } if (fishKey) { await apiPost("/api/settings/keys", { key_name: "FISH_API_KEY", key_value: fishKey }); } + if (gmapsKey) { + await apiPost("/api/settings/keys", { key_name: "GOOGLE_MAPS_API_KEY", key_value: gmapsKey }); + } await loadStatus(); }); diff --git a/gmail_access.py b/gmail_access.py new file mode 100644 index 0000000..c44cb82 --- /dev/null +++ b/gmail_access.py @@ -0,0 +1,105 @@ +""" +JARVIS Gmail — READ-ONLY access via the Gmail API (for the morning briefing). + +Scope is gmail.readonly only: JARVIS can read but never send, delete or modify, +consistent with the project's read-only-mail rule. OAuth credentials live in +gmail_credentials.json; the user token is cached in gmail_token.json (both +gitignored). First use runs a one-time browser consent. 
+""" + +import asyncio +import logging +from pathlib import Path + +from google.auth.transport.requests import Request +from google.oauth2.credentials import Credentials +from google_auth_oauthlib.flow import InstalledAppFlow +from googleapiclient.discovery import build + +log = logging.getLogger("jarvis.gmail") + +SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"] +BASE = Path(__file__).resolve().parent +CREDS_FILE = BASE / "gmail_credentials.json" +TOKEN_FILE = BASE / "gmail_token.json" + + +def _load_creds(interactive: bool = False) -> Credentials: + creds = None + if TOKEN_FILE.exists(): + creds = Credentials.from_authorized_user_file(str(TOKEN_FILE), SCOPES) + if creds and creds.valid: + return creds + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + TOKEN_FILE.write_text(creds.to_json()) + return creds + if not interactive: + raise RuntimeError("Gmail not authorized yet (run the one-time auth).") + # One-time interactive consent (opens a browser). + flow = InstalledAppFlow.from_client_secrets_file(str(CREDS_FILE), SCOPES) + creds = flow.run_local_server(port=0) + TOKEN_FILE.write_text(creds.to_json()) + return creds + + +def _service(interactive: bool = False): + return build("gmail", "v1", credentials=_load_creds(interactive), cache_discovery=False) + + +def _header(headers, name): + for h in headers: + if h.get("name", "").lower() == name: + return h.get("value", "") + return "" + + +def _short_sender(value: str) -> str: + # "Jane Doe " -> "Jane Doe"; else the address local part. + if "<" in value: + return value.split("<")[0].strip().strip('"') or value + if "@" in value: + return value.split("@")[0] + return value + + +def _fetch_briefing() -> dict: + svc = _service() + # Exact inbox unread count. 
+ label = svc.users().labels().get(userId="me", id="INBOX").execute() + unread_total = label.get("messagesUnread", 0) + # "Important to reply to" = unread in the Primary category (real correspondence, + # not promotions/social/updates). + res = svc.users().messages().list( + userId="me", q="is:unread in:inbox category:primary", maxResults=5 + ).execute() + important = [] + for item in res.get("messages", []): + msg = svc.users().messages().get( + userId="me", id=item["id"], format="metadata", + metadataHeaders=["From", "Subject"], + ).execute() + headers = msg.get("payload", {}).get("headers", []) + important.append({ + "from": _short_sender(_header(headers, "from")), + "subject": _header(headers, "subject") or "(no subject)", + "snippet": (msg.get("snippet", "") or "")[:140], + }) + return {"ok": True, "unread_total": unread_total, "primary_unread": len(important), "important": important} + + +async def get_briefing_mail() -> dict: + """Async wrapper used by the morning briefing.""" + try: + return await asyncio.to_thread(_fetch_briefing) + except Exception as e: + log.warning(f"Gmail briefing fetch failed: {e}") + return {"ok": False, "reason": str(e)} + + +if __name__ == "__main__": + # One-time authorization: opens a browser for consent, caches the token. + _load_creds(interactive=True) + print("Gmail authorized — token saved to gmail_token.json") + import json + print(json.dumps(_fetch_briefing(), indent=2)[:800]) diff --git a/mail_access.py b/mail_access.py index 03a88fe..775e764 100644 --- a/mail_access.py +++ b/mail_access.py @@ -94,21 +94,17 @@ async def get_unread_count() -> dict: Returns: {"total": int, "accounts": {"Google": 5, "Work": 3, ...}} """ + # Single unified-inbox count only — the per-account loop made this >20s on a + # large inbox (the user has ~850 unread). One call is ~11s. 
script = """ tell application "Mail" - set totalUnread to unread count of inbox - set output to "total:" & totalUnread & linefeed - repeat with acct in every account - set acctName to name of acct - try - set acctUnread to unread count of mailbox "INBOX" of acct - set output to output & acctName & ":" & acctUnread & linefeed - end try - end repeat - return output + return "total:" & (unread count of inbox) end tell """ - raw = await _run_mail_script(script) + raw = await _run_mail_script(script, timeout=18) + if not raw: + # Empty output means the script failed/timed out — NOT an empty inbox. + return {"total": None, "accounts": {}, "error": "unavailable"} result = {"total": 0, "accounts": {}} for line in raw.split("\n"): line = line.strip() @@ -125,6 +121,37 @@ async def get_unread_count() -> dict: return result +async def get_recent_headers(count: int = 4) -> list[dict]: + """Fast: the N most recent inbox messages — sender/subject/read only, no body. + + Avoids the slow `whose read status is false` filter + content fetch, so it + stays quick even on a large inbox. Returns [{sender, subject, read}]. + """ + script = f""" +tell application "Mail" + set out to "" + repeat with i from 1 to {count} + try + set m to message i of inbox + set out to out & (sender of m) & "|||" & (subject of m) & "|||" & (read status of m) & linefeed + end try + end repeat + return out +end tell +""" + raw = await _run_mail_script(script, timeout=12) + msgs = [] + for line in raw.split("\n"): + parts = line.strip().split("|||") + if len(parts) >= 3: + msgs.append({ + "sender": parts[0].strip(), + "subject": parts[1].strip(), + "read": parts[2].strip().lower() == "true", + }) + return msgs + + async def get_recent_messages(count: int = 10) -> list[dict]: """Get most recent messages from unified inbox. 
diff --git a/requirements.txt b/requirements.txt index 42a3b61..d63e2fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,6 @@ pydantic>=2.0.0 websockets>=13.0 playwright>=1.40.0 pyyaml>=6.0 +google-api-python-client +google-auth +google-auth-oauthlib diff --git a/server.py b/server.py index 92169d7..ad7f731 100644 --- a/server.py +++ b/server.py @@ -46,8 +46,10 @@ from work_mode import WorkSession, is_casual_question from screen import get_active_windows, take_screenshot, describe_screen, format_windows_for_context from camera import describe_camera +import briefing +import gmail_access from calendar_access import get_todays_events, get_upcoming_events, get_next_event, format_events_for_context, format_schedule_summary, refresh_cache as refresh_calendar_cache -from mail_access import get_unread_count, get_unread_messages, get_recent_messages, search_mail, read_message, format_unread_summary, format_messages_for_context, format_messages_for_voice +from mail_access import get_unread_count, get_unread_messages, get_recent_messages, get_recent_headers, search_mail, read_message, format_unread_summary, format_messages_for_context, format_messages_for_voice from memory import ( remember, recall, get_open_tasks, create_task, complete_task, search_tasks, create_note, search_notes, get_tasks_for_date, build_memory_context, @@ -1670,6 +1672,11 @@ def detect_action_fast(text: str) -> dict | None: "how expensive", "what's my bill"]): return {"action": "check_usage"} + # Morning briefing — full daily rundown + if any(p in t for p in ["morning briefing", "brief me", "my briefing", "daily briefing", + "give me my briefing", "good morning jarvis", "start my day"]): + return {"action": "briefing"} + # Crypto market sentiment — RSS news mood score if any(p in t for p in ["market sentiment", "crypto sentiment", "crypto mood", "how's the crypto market", "hows the crypto market", @@ -1839,16 +1846,19 @@ async def _do_mail_lookup() -> str: """Slow mail fetch — runs in 
thread.""" unread_info = await get_unread_count() if isinstance(unread_info, dict): + if unread_info.get("error") or unread_info.get("total") is None: + return "I couldn't reach Mail just now, sir — it may still be syncing." _ctx_cache["mail"] = format_unread_summary(unread_info) if unread_info["total"] == 0: return "Inbox is clear, sir. No unread messages." - unread_msgs = await get_unread_messages(count=5) summary = format_unread_summary(unread_info) - if unread_msgs: - top = unread_msgs[:3] + # Fast recent headers (no slow read-status filter / body fetch). + recent = await get_recent_headers(count=5) + if recent: details = ". ".join( f"{_short_sender(m['sender'])} regarding {m['subject']}" - for m in top + + ("" if m["read"] else " (unread)") + for m in recent[:4] ) return f"{summary} Most recent: {details}." return summary @@ -1957,6 +1967,142 @@ async def _do_sentiment_lookup() -> str: return summary +# --------------------------------------------------------------------------- +# Morning briefing — runs after the startup sequence +# --------------------------------------------------------------------------- + +async def _prepare_briefing(lang: str) -> tuple[str, Optional[bytes]]: + """Gather all sources, compose the briefing text, and synthesize the audio. + + Returns (text, mp3_bytes). This is the slow part (~20s) and is safe to run + during the boot screen so the result is ready the instant the boot ends. + """ + # Gather everything concurrently, each bounded so one slow source (e.g. a + # laggy feed) can't stall the whole briefing. 
+ async def _timed(coro, t): + try: + return await asyncio.wait_for(coro, timeout=t) + except Exception: + return None + + traffic, weather, portfolio, gmail, cal_txt, senti_txt = await asyncio.gather( + _timed(briefing.get_traffic(), 12), + _timed(briefing.get_weather(), 12), + _timed(briefing.get_portfolio(), 25), + _timed(gmail_access.get_briefing_mail(), 15), + _timed(_do_calendar_lookup(), 12), + _timed(_do_sentiment_lookup(), 22), + ) + + def _safe(v, default="unavailable"): + return default if (v is None or isinstance(v, Exception)) else v + + traffic = _safe(traffic, {}); weather = _safe(weather, {}); portfolio = _safe(portfolio, {}) + gmail = _safe(gmail, {}); cal_txt = _safe(cal_txt); senti_txt = _safe(senti_txt) + + # Build a plain-facts block for the LLM to turn into a spoken briefing. + facts = [] + if isinstance(traffic, dict) and traffic.get("ok"): + facts.append(f"COMMUTE: {traffic['condition']}, about {traffic['eta_min']} minutes to the office " + f"({traffic['distance']} via {traffic['route']}). Usual time {traffic['normal_min']} min.") + else: + facts.append("COMMUTE: traffic data unavailable.") + if isinstance(weather, dict) and weather.get("ok"): + facts.append(f"WEATHER (today, home area): {weather['conditions']}, currently {weather['current_c']}°C, " + f"high {weather['high_c']}°C, low {weather['low_c']}°C, {weather['rain_chance']}% chance of rain. " + f"Give a brief clothing suggestion based on this.") + else: + facts.append("WEATHER: unavailable.") + if isinstance(gmail, dict) and gmail.get("ok"): + lines = [f"EMAIL (Gmail): {gmail['unread_total']} total unread; " + f"{gmail['primary_unread']} unread in the Primary category (real correspondence)."] + for m in gmail.get("important", []): + lines.append(f" - from {m['from']}: {m['subject']}") + lines.append("Judge which, if any, genuinely look like they need a reply; " + "ignore receipts, notifications and automated mail. 
If none need action, say so briefly.") + facts.append("\n".join(lines)) + else: + facts.append("EMAIL: Gmail unavailable.") + facts.append(f"AGENDA: {cal_txt}") + if isinstance(portfolio, dict) and portfolio.get("ok"): + best = portfolio.get("best"); worst = portfolio.get("worst") + line = f"PORTFOLIO: total value ${portfolio['total_value']}, {portfolio['total_gain_pct']:+.1f}% today." + if best: line += f" Best {best['ticker']} {best['gain_pct']:+.1f}%." + if worst: line += f" Worst {worst['ticker']} {worst['gain_pct']:+.1f}%." + facts.append(line) + else: + facts.append("PORTFOLIO: unavailable.") + facts.append(f"CRYPTO MOOD: {senti_txt}") + + _names = {"fr": ("French", "monsieur"), "tr": ("Turkish", "efendim")} + name, honorific = _names.get(lang, ("English", "sir")) + lang_rule = (f"Respond ONLY in natural {name}, addressing the user as '{honorific}'." + if lang in _names else "Address the user as 'sir'.") + + system = ( + f"You are JARVIS delivering {USER_NAME}'s morning briefing as a refined British butler. " + f"{lang_rule} From the facts below compose ONE flowing, spoken briefing covering, in order: " + "a brief good-morning, the commute (traffic and ETA to the office), the weather with a short " + "clothing suggestion, any important emails, today's agenda, the portfolio with the key numbers, " + "and the crypto market mood. Natural and warm, no markdown, no lists, dry wit welcome but concise " + "— aim for 7 to 10 sentences. Do not invent facts; if something says unavailable, mention it briefly or skip." + ) + + response_text = None + if anthropic_client: + try: + resp = await anthropic_client.messages.create( + model="claude-haiku-4-5-20251001", + max_tokens=600, + system=system, + messages=[{"role": "user", "content": "FACTS:\n" + "\n".join(facts)}], + ) + response_text = resp.content[0].text.strip() + except Exception as e: + log.warning(f"briefing compose failed: {e}") + if not response_text: + response_text = "Good morning, sir. 
I'm afraid I couldn't assemble the full briefing just now." + + # Synthesize the audio here too, so the boot prefetch hides this latency. + audio = await synthesize_speech(strip_markdown_for_tts(response_text), lang=lang) + return response_text, audio + + +async def morning_briefing(ws, history: list[dict] = None, voice_state: dict = None): + """Deliver the briefing — using the result prefetched during the boot screen + if available, otherwise preparing it now — then open the dashboard window.""" + lang = "en" + task = None + if voice_state: + lang = voice_state.get("forced_lang") or voice_state.get("lang") or "en" + task = voice_state.pop("briefing_task", None) + await ws.send_json({"type": "status", "state": "thinking"}) + try: + if task is not None: + response_text, audio = await task # prepared during the boot screen + else: + response_text, audio = await _prepare_briefing(lang) + except Exception as e: + log.warning(f"briefing failed: {e}") + response_text, audio = ("Good morning, sir. I couldn't assemble the briefing just now.", None) + + # Open the portfolio dashboard window alongside the spoken briefing. 
+ asyncio.create_task(briefing.open_dashboard_window()) + + try: + await ws.send_json({"type": "status", "state": "speaking"}) + if audio: + await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": response_text}) + else: + await ws.send_json({"type": "text", "text": response_text}) + await ws.send_json({"type": "status", "state": "idle"}) + except Exception: + pass + if history is not None: + history.append({"role": "assistant", "content": f"[morning briefing]: {response_text}"}) + log.info(f"Briefing delivered ({lang}): {response_text[:80]}") + + def get_lookup_status() -> str: """Get status of active lookups for when user asks 'how's that coming'.""" if not _active_lookups: @@ -2239,6 +2385,19 @@ async def _send_greeting(): log.info(f"Forced language set to: {voice_state.get('forced_lang')}") continue + # ── Briefing prefetch: start gathering DURING the boot screen so + # the briefing is ready the instant the boot finishes. ── + if msg.get("type") == "briefing_prefetch": + pf_lang = voice_state.get("forced_lang") or voice_state.get("lang") or "en" + voice_state["briefing_task"] = asyncio.create_task(_prepare_briefing(pf_lang)) + log.info(f"Briefing prefetch started ({pf_lang})") + continue + + # ── Morning briefing: triggered by the frontend after startup ── + if msg.get("type") == "briefing": + asyncio.create_task(morning_briefing(ws, history=history, voice_state=voice_state)) + continue + # ── Fix-self: activate work mode in JARVIS repo ── if msg.get("type") == "fix_self": jarvis_dir = str(Path(__file__).parent) @@ -2428,6 +2587,9 @@ async def _send_greeting(): elif action["action"] == "market_sentiment": response_text = "Checking the crypto mood now, sir." asyncio.create_task(_lookup_and_report("sentiment", _do_sentiment_lookup, ws, history=history, voice_state=voice_state)) + elif action["action"] == "briefing": + response_text = "Preparing your morning briefing, sir." 
+ asyncio.create_task(morning_briefing(ws, history=history, voice_state=voice_state)) elif action["action"] == "check_calendar": response_text = "Checking your calendar now, sir." asyncio.create_task(_lookup_and_report("calendar", _do_calendar_lookup, ws, history=history, voice_state=voice_state)) @@ -2730,7 +2892,7 @@ class PreferencesUpdate(BaseModel): @app.post("/api/settings/keys") async def api_settings_keys(body: KeyUpdate): - allowed = {"ANTHROPIC_API_KEY", "FISH_API_KEY", "FISH_VOICE_ID", "USER_NAME", "HONORIFIC", "CALENDAR_ACCOUNTS"} + allowed = {"ANTHROPIC_API_KEY", "FISH_API_KEY", "FISH_VOICE_ID", "USER_NAME", "HONORIFIC", "CALENDAR_ACCOUNTS", "GOOGLE_MAPS_API_KEY"} if body.key_name not in allowed: return JSONResponse({"success": False, "error": "Invalid key name"}, status_code=400) _write_env_key(body.key_name, body.key_value) From 33f3b51335424f6f47f390763186902e1aec817f Mon Sep 17 00:00:00 2001 From: Oguz Date: Mon, 1 Jun 2026 21:35:37 +0200 Subject: [PATCH 10/17] Size the portfolio dashboard window to fit all columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The boot/briefing dashboard window was 500px wide — too narrow for the 8-column table and long position names. Open it at ~1040x760 (clamped to the screen) so all the numbers are readable. Co-Authored-By: Claude Opus 4.8 --- briefing.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/briefing.py b/briefing.py index 49b8532..92cb2b4 100644 --- a/briefing.py +++ b/briefing.py @@ -178,11 +178,22 @@ async def open_dashboard_window() -> None: if not dash.exists(): return url = f"file://{dash}" + # Wide enough for the 8-column table + long position names, tall enough for + # all rows + totals + footer. Clamped to the main screen so it never exceeds it. 
script = f''' +tell application "Finder" to set sb to bounds of window of desktop +set screenW to item 3 of sb +set screenH to item 4 of sb +set winW to 1040 +set winH to 760 +if winW > (screenW - 40) then set winW to (screenW - 40) +if winH > (screenH - 80) then set winH to (screenH - 80) +set x1 to 40 +set y1 to 60 tell application "Google Chrome" make new window set URL of active tab of front window to "{url}" - set bounds of front window to {{60, 80, 560, 720}} + set bounds of front window to {{x1, y1, x1 + winW, y1 + winH}} end tell ''' try: From def12533f4f6d3e784c585853142c9ab74824514 Mon Sep 17 00:00:00 2001 From: Oguz Date: Mon, 1 Jun 2026 21:49:23 +0200 Subject: [PATCH 11/17] Make the morning briefing play instantly after boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two latency fixes so the briefing speaks the moment the boot ends: - briefing.get_sentiment(): fetch the 6 crypto RSS feeds CONCURRENTLY (~1s vs the ~20s sequential subprocess) and score inline; replaces _do_sentiment_lookup in the briefing path. - _prepare_briefing: split the composed briefing into chunks and synthesize the TTS segments CONCURRENTLY (~7s vs ~24s for one long call), delivered as ordered audio the player queues seamlessly. Combined with the boot-time prefetch, the whole briefing (gather + compose + TTS) now completes in ~7s, well inside the ~28s boot — first audio plays ~0.02s after the boot finishes. 
Co-Authored-By: Claude Opus 4.8 --- briefing.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ server.py | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 76 insertions(+), 12 deletions(-) diff --git a/briefing.py b/briefing.py index 92cb2b4..8dd04e3 100644 --- a/briefing.py +++ b/briefing.py @@ -172,6 +172,53 @@ async def get_portfolio() -> dict: } +# ---- Crypto sentiment (fast, concurrent) --------------------------------- + +_POS = ["adoption", "launch", "partnership", "etf", "rally", "breakthrough", + "growth", "approval", "bullish", "surge", "adopts", "soar", "gains"] +_NEG = ["crash", "exploit", "hack", "delay", "liquidation", "depeg", "bearish", + "decline", "setback", "breach", "drop", "plunge", "selloff", "lawsuit"] +_FEEDS = [ + "https://www.coindesk.com/arc/outboundfeeds/rss/?outputType=xml", + "https://cointelegraph.com/rss", + "https://cryptopotato.com/feed/", + "https://bitcoinist.com/feed/", + "https://www.newsbtc.com/feed/", + "https://cryptonews.com/news/feed/", +] + + +def _fetch_feed(url: str) -> list[str]: + import xml.etree.ElementTree as ET + try: + root = ET.fromstring(_get(url, timeout=8)) + out = [] + for it in root.findall(".//item"): + title = it.findtext("title") or "" + desc = it.findtext("description") or "" + out.append((title + " " + desc).lower()) + return out + except Exception: + return [] + + +async def get_sentiment() -> dict: + """Crypto news sentiment — fetches all feeds concurrently (~4s vs ~20s).""" + results = await asyncio.gather(*[asyncio.to_thread(_fetch_feed, u) for u in _FEEDS]) + texts = [t for sub in results for t in sub] + if not texts: + return {"ok": False} + total = pos = neg = 0 + for txt in texts: + p = sum(1 for w in _POS if w in txt) + n = sum(1 for w in _NEG if w in txt) + total += 1 if p > n else -1 if n > p else 0 + count = len(texts) + score = total / count if count else 0.0 + mood = "bullish" if score > 0.1 else "bearish" if score < -0.1 else "neutral" + return {"ok": True, "score": 
round(score, 2), "mood": mood, "articles": count} + + async def open_dashboard_window() -> None: """Open the portfolio dashboard in a small Chrome app window.""" dash = PORTFOLIO_DIR / "dashboard.html" diff --git a/server.py b/server.py index ad7f731..fa67deb 100644 --- a/server.py +++ b/server.py @@ -1985,20 +1985,20 @@ async def _timed(coro, t): except Exception: return None - traffic, weather, portfolio, gmail, cal_txt, senti_txt = await asyncio.gather( + traffic, weather, portfolio, gmail, cal_txt, senti = await asyncio.gather( _timed(briefing.get_traffic(), 12), _timed(briefing.get_weather(), 12), _timed(briefing.get_portfolio(), 25), _timed(gmail_access.get_briefing_mail(), 15), _timed(_do_calendar_lookup(), 12), - _timed(_do_sentiment_lookup(), 22), + _timed(briefing.get_sentiment(), 12), ) def _safe(v, default="unavailable"): return default if (v is None or isinstance(v, Exception)) else v traffic = _safe(traffic, {}); weather = _safe(weather, {}); portfolio = _safe(portfolio, {}) - gmail = _safe(gmail, {}); cal_txt = _safe(cal_txt); senti_txt = _safe(senti_txt) + gmail = _safe(gmail, {}); cal_txt = _safe(cal_txt); senti = _safe(senti, {}) # Build a plain-facts block for the LLM to turn into a spoken briefing. facts = [] @@ -2032,7 +2032,11 @@ def _safe(v, default="unavailable"): facts.append(line) else: facts.append("PORTFOLIO: unavailable.") - facts.append(f"CRYPTO MOOD: {senti_txt}") + if isinstance(senti, dict) and senti.get("ok"): + facts.append(f"CRYPTO MOOD: {senti['mood']} (score {senti['score']:+.2f} " + f"across {senti['articles']} crypto news articles).") + else: + facts.append("CRYPTO MOOD: unavailable.") _names = {"fr": ("French", "monsieur"), "tr": ("Turkish", "efendim")} name, honorific = _names.get(lang, ("English", "sir")) @@ -2063,9 +2067,18 @@ def _safe(v, default="unavailable"): if not response_text: response_text = "Good morning, sir. I'm afraid I couldn't assemble the full briefing just now." 
- # Synthesize the audio here too, so the boot prefetch hides this latency. - audio = await synthesize_speech(strip_markdown_for_tts(response_text), lang=lang) - return response_text, audio + # A long briefing is ~24s of TTS in one call. Split into chunks and + # synthesize them CONCURRENTLY (~8s), returned as ordered audio segments the + # player queues — fits inside the boot prefetch window so it plays instantly. + sentences = _action_re.split(r"(?<=[.!?])\s+", response_text.strip()) + n = 3 + size = max(1, -(-len(sentences) // n)) + chunks = [" ".join(sentences[i:i + size]) for i in range(0, len(sentences), size)] or [response_text] + audios = await asyncio.gather(*[ + synthesize_speech(strip_markdown_for_tts(c), lang=lang) for c in chunks + ]) + audios = [a for a in audios if a] + return response_text, audios async def morning_briefing(ws, history: list[dict] = None, voice_state: dict = None): @@ -2076,23 +2089,27 @@ async def morning_briefing(ws, history: list[dict] = None, voice_state: dict = N if voice_state: lang = voice_state.get("forced_lang") or voice_state.get("lang") or "en" task = voice_state.pop("briefing_task", None) + log.info(f"morning_briefing ({lang}); prefetched={task is not None}") await ws.send_json({"type": "status", "state": "thinking"}) try: if task is not None: - response_text, audio = await task # prepared during the boot screen + response_text, audios = await task # prepared during the boot screen else: - response_text, audio = await _prepare_briefing(lang) + response_text, audios = await _prepare_briefing(lang) except Exception as e: log.warning(f"briefing failed: {e}") - response_text, audio = ("Good morning, sir. I couldn't assemble the briefing just now.", None) + response_text, audios = ("Good morning, sir. I couldn't assemble the briefing just now.", []) # Open the portfolio dashboard window alongside the spoken briefing. 
asyncio.create_task(briefing.open_dashboard_window()) try: await ws.send_json({"type": "status", "state": "speaking"}) - if audio: - await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": response_text}) + if audios: + # Send the segments in order; the player queues them seamlessly. + for i, a in enumerate(audios): + await ws.send_json({"type": "audio", "data": base64.b64encode(a).decode(), + "text": response_text if i == 0 else ""}) else: await ws.send_json({"type": "text", "text": response_text}) await ws.send_json({"type": "status", "state": "idle"}) From 30d87cd796487599741acd84e89165171cc53658 Mon Sep 17 00:00:00 2001 From: Oguz Date: Mon, 1 Jun 2026 22:10:16 +0200 Subject: [PATCH 12/17] Time-aware briefing greeting + keep mic off during the briefing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Briefing now greets by time of day: morning ("Good morning, I hope you slept well"), daytime ("Hello, welcome back"), evening ("Good evening, I hope you had a great day") — in the active language. - Fix the briefing interrupting itself: the mic was started at boot end, so it transcribed JARVIS's own briefing voice as user input and cut it off. Now the mic stays off through the whole briefing and starts only once it finishes speaking (with a 60s safety fallback). 
Co-Authored-By: Claude Opus 4.8 --- frontend/src/main.ts | 24 +++++++++++++++++++++--- server.py | 15 ++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/frontend/src/main.ts b/frontend/src/main.ts index dcb35b3..5b0abcd 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -22,6 +22,7 @@ let currentState: State = "idle"; let isMuted = false; let bootActive = true; // during the startup boot video — suppress greeting + mic let currentLang = "en"; // active language (drives boot audio + recognition) +let awaitingBriefing = false; // mic stays off until the post-boot briefing finishes const statusEl = document.getElementById("status-text")!; const errorEl = document.getElementById("error-text")!; @@ -102,6 +103,14 @@ const voiceInput = createAudioCapture( // --------------------------------------------------------------------------- audioPlayer.onFinished(() => { + // After the post-boot briefing finishes speaking, NOW start the mic — keeping + // it off during the briefing so JARVIS never transcribes its own voice. + if (awaitingBriefing) { + awaitingBriefing = false; + voiceInput.start(); + transition("listening"); + return; + } transition("idle"); }); @@ -194,10 +203,19 @@ function endBoot() { try { bootVideo.pause(); bootAudio.pause(); } catch {} bootOverlay.classList.add("done"); setTimeout(() => { bootOverlay.style.display = "none"; }, 1500); - // Hand off to the live assistant, then deliver the morning briefing. - voiceInput.start(); - transition("listening"); + // Deliver the briefing with the mic OFF so JARVIS can't hear (and transcribe) + // its own voice. The mic starts only when the briefing finishes (onFinished). + awaitingBriefing = true; + transition("thinking"); setTimeout(() => socket.send({ type: "briefing" }), 600); + // Safety net: if the briefing never produces audio, start the mic anyway. 
+ setTimeout(() => { + if (awaitingBriefing) { + awaitingBriefing = false; + voiceInput.start(); + transition("listening"); + } + }, 60000); } function startBoot() { diff --git a/server.py b/server.py index fa67deb..f88af49 100644 --- a/server.py +++ b/server.py @@ -2043,10 +2043,19 @@ def _safe(v, default="unavailable"): lang_rule = (f"Respond ONLY in natural {name}, addressing the user as '{honorific}'." if lang in _names else "Address the user as 'sir'.") + # Time-aware opening greeting. + hour = datetime.now().hour + if 5 <= hour < 12: + greet_rule = "Open with a warm 'Good morning' and that you hope he slept well." + elif 12 <= hour < 18: + greet_rule = "It is daytime (NOT morning): open simply with 'Hello, welcome back' — do NOT say good morning." + else: + greet_rule = "It is the evening: open with 'Good evening' and that you hope he had a great day — do NOT say good morning." + system = ( - f"You are JARVIS delivering {USER_NAME}'s morning briefing as a refined British butler. " - f"{lang_rule} From the facts below compose ONE flowing, spoken briefing covering, in order: " - "a brief good-morning, the commute (traffic and ETA to the office), the weather with a short " + f"You are JARVIS delivering {USER_NAME}'s briefing as a refined British butler. " + f"{lang_rule} {greet_rule} From the facts below compose ONE flowing, spoken briefing covering, in order: " + "the time-appropriate greeting above, the commute (traffic and ETA to the office), the weather with a short " "clothing suggestion, any important emails, today's agenda, the portfolio with the key numbers, " "and the crypto market mood. Natural and warm, no markdown, no lists, dry wit welcome but concise " "— aim for 7 to 10 sentences. Do not invent facts; if something says unavailable, mention it briefly or skip." 
From 583d2b613a07eb53513b73fc1d910b9a00533809 Mon Sep 17 00:00:00 2001 From: Oguz Date: Tue, 2 Jun 2026 00:51:15 +0200 Subject: [PATCH 13/17] Fix briefing composing in English for French/Turkish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The time-aware greeting prompt contained literal English greetings ("Good evening", "Hello, welcome back"), which made the model write the whole briefing in English even with lang=fr/tr (then spoken by the cloned voice — English with a French accent). Describe the greeting semantically (no English words) and force the entire briefing into the target language. Co-Authored-By: Claude Opus 4.8 --- server.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/server.py b/server.py index f88af49..39e13b4 100644 --- a/server.py +++ b/server.py @@ -2040,24 +2040,26 @@ def _safe(v, default="unavailable"): _names = {"fr": ("French", "monsieur"), "tr": ("Turkish", "efendim")} name, honorific = _names.get(lang, ("English", "sir")) - lang_rule = (f"Respond ONLY in natural {name}, addressing the user as '{honorific}'." - if lang in _names else "Address the user as 'sir'.") - # Time-aware opening greeting. + # Time-aware greeting — described SEMANTICALLY with no literal English words, + # otherwise the model copies them and writes the whole briefing in English. hour = datetime.now().hour if 5 <= hour < 12: - greet_rule = "Open with a warm 'Good morning' and that you hope he slept well." + greet_rule = "It is the morning: greet him for the morning and say you hope he slept well." elif 12 <= hour < 18: - greet_rule = "It is daytime (NOT morning): open simply with 'Hello, welcome back' — do NOT say good morning." + greet_rule = "It is the middle of the day, NOT morning: greet him simply — a hello and welcome back." else: - greet_rule = "It is the evening: open with 'Good evening' and that you hope he had a great day — do NOT say good morning." 
+ greet_rule = "It is the evening, NOT morning: greet him for the evening and say you hope he had a great day." + only = "" if lang not in _names else f" Use absolutely no English — every word must be in {name}." system = ( f"You are JARVIS delivering {USER_NAME}'s briefing as a refined British butler. " - f"{lang_rule} {greet_rule} From the facts below compose ONE flowing, spoken briefing covering, in order: " - "the time-appropriate greeting above, the commute (traffic and ETA to the office), the weather with a short " - "clothing suggestion, any important emails, today's agenda, the portfolio with the key numbers, " - "and the crypto market mood. Natural and warm, no markdown, no lists, dry wit welcome but concise " + f"IMPORTANT: write the ENTIRE briefing — every word, including the greeting — in {name}, " + f"addressing the user as '{honorific}'.{only} " + f"{greet_rule} Compose ONE flowing, spoken briefing covering, in order: the time-appropriate " + "greeting, the commute (traffic and ETA to the office), the weather with a short clothing " + "suggestion, any important emails, today's agenda, the portfolio with the key numbers, and the " + "crypto market mood. Natural and warm, no markdown, no lists, dry wit welcome but concise " "— aim for 7 to 10 sentences. Do not invent facts; if something says unavailable, mention it briefly or skip." ) From 0733b88e993dd6ef101e2258191c7387546a5678 Mon Sep 17 00:00:00 2001 From: Oguz Date: Tue, 2 Jun 2026 01:02:29 +0200 Subject: [PATCH 14/17] Fix briefing cutting off the last (crypto) segment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The briefing was sent as 3 separate audio messages; if the final chunk decoded just after the previous finished playing, the player's queue briefly emptied, fired "finished", started the mic, and the mic then cut the last (crypto) segment. 
Concatenate the parallel-synthesized mp3 chunks into ONE audio blob so playback is a single buffer — no inter-segment race, the whole briefing plays. Co-Authored-By: Claude Opus 4.8 --- server.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/server.py b/server.py index 39e13b4..753fbce 100644 --- a/server.py +++ b/server.py @@ -2117,10 +2117,12 @@ async def morning_briefing(ws, history: list[dict] = None, voice_state: dict = N try: await ws.send_json({"type": "status", "state": "speaking"}) if audios: - # Send the segments in order; the player queues them seamlessly. - for i, a in enumerate(audios): - await ws.send_json({"type": "audio", "data": base64.b64encode(a).decode(), - "text": response_text if i == 0 else ""}) + # Concatenate the parallel-synthesized mp3 chunks into ONE blob — a + # single audio buffer avoids the multi-segment playback race that was + # cutting off the last (crypto) segment. + combined = b"".join(audios) + await ws.send_json({"type": "audio", "data": base64.b64encode(combined).decode(), + "text": response_text}) else: await ws.send_json({"type": "text", "text": response_text}) await ws.send_json({"type": "status", "state": "idle"}) From 905b546837e241d55ab65685a54f00f206c47427 Mon Sep 17 00:00:00 2001 From: Oguz Date: Tue, 2 Jun 2026 01:07:38 +0200 Subject: [PATCH 15/17] Stop the mic safety-timeout from cutting off long briefings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-boot mic fallback fired at 60s, but a full briefing (esp. French) can run ~60s, so it started the mic mid-briefing and cut the final (crypto) segment. Push the fallback to 180s — well beyond any real briefing — so onFinished stays the normal trigger and the briefing always plays to the end. 
Co-Authored-By: Claude Opus 4.8 --- frontend/src/main.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/frontend/src/main.ts b/frontend/src/main.ts index 5b0abcd..f258bad 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -208,14 +208,16 @@ function endBoot() { awaitingBriefing = true; transition("thinking"); setTimeout(() => socket.send({ type: "briefing" }), 600); - // Safety net: if the briefing never produces audio, start the mic anyway. + // Safety net for the rare case the briefing produces NO audio at all. Kept + // well beyond any real briefing length (the briefing can run ~60s) so it never + // fires mid-briefing and cuts the end off — onFinished is the normal trigger. setTimeout(() => { if (awaitingBriefing) { awaitingBriefing = false; voiceInput.start(); transition("listening"); } - }, 60000); + }, 180000); } function startBoot() { From be04c3d65a04ce16c90c16fce3da046d247094db Mon Sep 17 00:00:00 2001 From: Oguz Date: Tue, 2 Jun 2026 01:19:55 +0200 Subject: [PATCH 16/17] Keep camera/screen lookups + replies in the active language MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In French/Turkish mode, "what am I wearing"/"look at my screen" triggered the camera/screen vision lookups, whose prompts were hardcoded English — so JARVIS answered in English. Thread the active language into describe_camera and describe_screen (and their server lookups) so they reply in FR/TR. Also harden generate_response's language rule (reply ONLY in the target language, never English/Spanish/Italian, even on garbled transcripts). 
Co-Authored-By: Claude Opus 4.8 --- camera.py | 13 ++++++++++--- screen.py | 11 ++++++++--- server.py | 25 +++++++++++++------------ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/camera.py b/camera.py index 581229e..2f6b78d 100644 --- a/camera.py +++ b/camera.py @@ -18,12 +18,16 @@ log = logging.getLogger("jarvis.camera") -async def describe_camera(anthropic_client, frame_b64: str) -> str: +_LANG = {"fr": ("French", "monsieur"), "tr": ("Turkish", "efendim")} + + +async def describe_camera(anthropic_client, frame_b64: str, lang: str = "en") -> str: """Describe a single webcam frame via the Claude vision API. Args: anthropic_client: AsyncAnthropic client. frame_b64: base64-encoded JPEG (no data-URL prefix). + lang: 'fr'/'tr' to reply in that language; otherwise English. Returns: A short, spoken-style description, or a polite failure line. @@ -33,6 +37,9 @@ async def describe_camera(anthropic_client, frame_b64: str) -> str: if not anthropic_client: return "Camera captured, but I've no vision model configured, sir." + name, honorific = _LANG.get(lang, ("English", "sir")) + lang_line = (f" Reply ONLY in {name}, addressing the user as '{honorific}'." + if lang in _LANG else " Address the user as 'sir'.") try: response = await anthropic_client.messages.create( model="claude-haiku-4-5-20251001", @@ -41,8 +48,8 @@ async def describe_camera(anthropic_client, frame_b64: str) -> str: "You are JARVIS looking through the user's webcam. Describe what you " "see concisely and naturally, as a British butler would: who or what " "is in frame, their expression or surroundings, anything notable. " - "Address the user as 'sir'. 1-3 sentences max. No markdown. " - "If the frame is too dark or empty to make out, say so plainly." + "1-3 sentences max. No markdown. " + "If the frame is too dark or empty to make out, say so plainly." 
+ lang_line ), messages=[{ "role": "user", diff --git a/screen.py b/screen.py index 343dda4..0634454 100644 --- a/screen.py +++ b/screen.py @@ -151,11 +151,16 @@ async def take_screenshot(display_only: bool = True) -> str | None: pass -async def describe_screen(anthropic_client) -> str: +_LANG = {"fr": "French", "tr": "Turkish"} + + +async def describe_screen(anthropic_client, lang: str = "en") -> str: """Describe what's on the user's screen. Tries screenshot + vision first. Falls back to window list + LLM summary. + lang: 'fr'/'tr' to reply in that language; otherwise English. """ + lang_line = f" Reply ONLY in {_LANG[lang]}." if lang in _LANG else "" # Try screenshot + vision screenshot_b64 = await take_screenshot() if screenshot_b64 and anthropic_client: @@ -168,7 +173,7 @@ async def describe_screen(anthropic_client) -> str: "Describe what you see concisely: which apps are open, what the user " "appears to be working on, any notable content visible. " "Be specific about app names, file names, URLs, code, or documents visible. " - "2-4 sentences max. No markdown." + "2-4 sentences max. No markdown." + lang_line ), messages=[{ "role": "user", @@ -219,7 +224,7 @@ async def describe_screen(anthropic_client) -> str: max_tokens=100, system=( "You are JARVIS. Given the user's open windows and apps, summarize " - "what they appear to be working on in 1-2 sentences. Natural voice, no markdown." + "what they appear to be working on in 1-2 sentences. Natural voice, no markdown." + lang_line ), messages=[{"role": "user", "content": "Open windows:\n" + "\n".join(context_parts)}], ) diff --git a/server.py b/server.py index 753fbce..c0779ee 100644 --- a/server.py +++ b/server.py @@ -1278,10 +1278,11 @@ async def generate_response( if lang in _lang_names: name, honorific = _lang_names[lang] system += ( - f"\n\nLANGUAGE: The user is speaking {name}. Respond ONLY in natural, " - f"fluent {name}, keeping the same butler personality. 
Do NOT use English " - f"and do NOT mix languages. Address the user as '{honorific}' (never 'sir' " - f"or another language's honorific). [ACTION:X] tags (if any) stay in English " + f"\n\nLANGUAGE (critical): You MUST reply ONLY in {name}. Never English, " + f"Spanish, Italian, Portuguese or any other language — reply in {name} even " + f"if the transcribed input looks garbled or like another language. Keep the " + f"butler personality and address the user as '{honorific}' (never 'sir' or " + f"another language's honorific). [ACTION:X] tags (if any) stay in English " f"exactly as specified, but every spoken word must be {name}." ) @@ -1865,10 +1866,10 @@ async def _do_mail_lookup() -> str: return "Couldn't reach Mail at the moment, sir." -async def _do_screen_lookup() -> str: +async def _do_screen_lookup(lang: str = "en") -> str: """Screen describe — runs in thread.""" if anthropic_client: - return await describe_screen(anthropic_client) + return await describe_screen(anthropic_client, lang=lang) windows = await get_active_windows() if windows: apps = set(w["app"] for w in windows) @@ -1900,13 +1901,13 @@ async def request_camera_frame(ws, pending_frames: dict, timeout: float = 12.0) pending_frames.pop(request_id, None) -async def _do_camera_lookup(ws, pending_frames: dict) -> str: +async def _do_camera_lookup(ws, pending_frames: dict, lang: str = "en") -> str: """Webcam describe — request a single frame from the browser, then vision.""" frame_b64 = await request_camera_frame(ws, pending_frames) if not frame_b64: return ("I couldn't get a camera frame, sir. 
The webcam may be blocked, " "in use by another app, or permission hasn't been granted.") - return await describe_camera(anthropic_client, frame_b64) + return await describe_camera(anthropic_client, frame_b64, lang=lang) # Market sentiment — runs the kukapay market-sentiment skill's analyzer as a @@ -2610,10 +2611,10 @@ async def _send_greeting(): response_text = await handle_show_recent() elif action["action"] == "describe_screen": response_text = "Taking a look now, sir." - asyncio.create_task(_lookup_and_report("screen", _do_screen_lookup, ws, history=history, voice_state=voice_state)) + asyncio.create_task(_lookup_and_report("screen", lambda: _do_screen_lookup(voice_state.get("lang", "en")), ws, history=history, voice_state=voice_state)) elif action["action"] == "describe_camera": response_text = "Let me have a look, sir." - asyncio.create_task(_lookup_and_report("camera", lambda: _do_camera_lookup(ws, pending_frames), ws, history=history, voice_state=voice_state)) + asyncio.create_task(_lookup_and_report("camera", lambda: _do_camera_lookup(ws, pending_frames, voice_state.get("lang", "en")), ws, history=history, voice_state=voice_state)) elif action["action"] == "market_sentiment": response_text = "Checking the crypto mood now, sir." 
asyncio.create_task(_lookup_and_report("sentiment", _do_sentiment_lookup, ws, history=history, voice_state=voice_state)) @@ -2775,9 +2776,9 @@ async def _send_greeting(): else: asyncio.create_task(create_apple_note("JARVIS Note", target)) elif embedded_action["action"] == "screen": - asyncio.create_task(_lookup_and_report("screen", _do_screen_lookup, ws, history=history, voice_state=voice_state)) + asyncio.create_task(_lookup_and_report("screen", lambda: _do_screen_lookup(voice_state.get("lang", "en")), ws, history=history, voice_state=voice_state)) elif embedded_action["action"] == "camera": - asyncio.create_task(_lookup_and_report("camera", lambda: _do_camera_lookup(ws, pending_frames), ws, history=history, voice_state=voice_state)) + asyncio.create_task(_lookup_and_report("camera", lambda: _do_camera_lookup(ws, pending_frames, voice_state.get("lang", "en")), ws, history=history, voice_state=voice_state)) elif embedded_action["action"] == "sentiment": asyncio.create_task(_lookup_and_report("sentiment", _do_sentiment_lookup, ws, history=history, voice_state=voice_state)) elif embedded_action["action"] == "read_note": From d9e95e98ddf9e345fa317d1c645af3c6a7c7f47e Mon Sep 17 00:00:00 2001 From: Oguz Date: Tue, 2 Jun 2026 09:26:34 +0200 Subject: [PATCH 17/17] Localize lookup acks + speak lookup results in the active voice Two fixes for French/Turkish follow-ups that trigger an action (e.g. camera "what am I wearing"): - The action ack ("Right away, sir") was hardcoded English; now localized (Tout de suite, monsieur / Hemen, efendim, etc.). - _lookup_and_report spoke its result with the default English voice; now it uses the active language so the description is read by the cloned FR/TR voice, not the MCU voice. 
Co-Authored-By: Claude Opus 4.8 --- server.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/server.py b/server.py index c0779ee..5c00ce5 100644 --- a/server.py +++ b/server.py @@ -1796,7 +1796,7 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict # Result is still stored in history below else: tts = strip_markdown_for_tts(result_text) - audio = await synthesize_speech(tts) + audio = await synthesize_speech(tts, lang=voice_state.get("lang", "en") if voice_state else "en") try: await ws.send_json({"type": "status", "state": "speaking"}) if audio: @@ -1818,7 +1818,7 @@ async def _lookup_and_report(lookup_type: str, lookup_fn, ws, history: list[dict _active_lookups[lookup_id]["status"] = "timeout" try: fallback = f"That {lookup_type} check is taking too long, sir. The data may still be syncing." - audio = await synthesize_speech(fallback) + audio = await synthesize_speech(fallback, lang=voice_state.get("lang", "en") if voice_state else "en") await ws.send_json({"type": "status", "state": "speaking"}) if audio: await ws.send_json({"type": "audio", "data": base64.b64encode(audio).decode(), "text": fallback}) @@ -2670,15 +2670,21 @@ async def _send_greeting(): # Ensure there's always something to speak if not response_text.strip(): action_type = embedded_action["action"] + _lg = voice_state.get("lang", "en") if action_type == "prompt_project": proj = embedded_action["target"].split("|||")[0].strip() - response_text = f"Connecting to {proj} now, sir." + response_text = ({"fr": f"Connexion à {proj}, monsieur.", + "tr": f"{proj} bağlanıyorum, efendim."} + .get(_lg, f"Connecting to {proj} now, sir.")) elif action_type == "build": - response_text = "On it, sir." + response_text = {"fr": "Je m'en occupe, monsieur.", + "tr": "Hallediyorum, efendim."}.get(_lg, "On it, sir.") elif action_type == "research": - response_text = "Looking into that now, sir." 
+ response_text = {"fr": "Je me renseigne, monsieur.", + "tr": "Araştırıyorum, efendim."}.get(_lg, "Looking into that now, sir.") else: - response_text = "Right away, sir." + response_text = {"fr": "Tout de suite, monsieur.", + "tr": "Hemen, efendim."}.get(_lg, "Right away, sir.") if embedded_action["action"] == "build": # Build in background — JARVIS stays conversational