diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000..83813f99fd --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,10 @@ + +## Retro-Discovered Patterns + +- Happy CLI local→remote mode switch: do NOT pass `signal: opts.abort` (AbortSignal) when switching from local to remote mode. Let the local process finish and fully clean up before starting the remote process — don't use opts.abort to force-kill it. + +**Why:** Aborting the local Claude Code process mid-cleanup leaves the terminal in a corrupted state: two cursor positions, garbled keystrokes that never recover until the entire happy+Claude Code combo is restarted. Root-caused in session 3cd3dc8a when a Mac Terminal (not iTerm2/Warp) reliably reproduced it. The fix is to sequence the transitions — let local complete cleanup then start remote. + +**How to apply:** In `claudeLocalLauncher.ts` and mode-switch logic, remove AbortSignal handoff between local→remote transitions. Wait for full process cleanup before spawning the successor. + + diff --git a/docs/brainstorms/2026-03-16-openai-realtime-migration-brainstorm.md b/docs/brainstorms/2026-03-16-openai-realtime-migration-brainstorm.md new file mode 100644 index 0000000000..330a928670 --- /dev/null +++ b/docs/brainstorms/2026-03-16-openai-realtime-migration-brainstorm.md @@ -0,0 +1,160 @@ +--- +date: 2026-03-16 +topic: openai-realtime-migration +status: active +origin: brainstorming session +--- + +# Migrate Voice Layer: ElevenLabs → OpenAI Realtime + +## What We're Building + +Swap the voice backend from ElevenLabs Conversational AI to OpenAI's Realtime API +(`gpt-realtime` GA model). The migration is surgical: the `VoiceSession` interface +is already provider-agnostic, so only the implementation layer changes. 
+ +The driving use case is unchanged: hands-free multi-session Claude Code management +while driving — voice agent monitors N parallel sessions, routes messages, handles +permission requests, and stays silent unless it has something to say. + +## Why This Approach + +Three approaches were evaluated: + +**A — WebRTC (chosen):** `react-native-webrtc` is already installed in the project +as a config plugin. OpenAI recommends WebRTC for mobile clients. Audio quality is +best-in-class (Opus, echo cancellation, WebRTC congestion control). No manual +PCM16 encoding needed. thorwebdev demo provides a reference implementation. + +**B — Raw WebSocket:** Simpler transport but requires manual audio encoding (PCM16 +at 24kHz), manual echo cancellation, and TCP fragility on mobile networks. + +**C — Server-proxied:** Cleanest key management but requires running self-hosted +happy-server. Deferred — not needed for personal use. + +## Key Decisions + +- **Transport:** WebRTC via `react-native-webrtc` (already in project) +- **Model:** `gpt-realtime` GA (not `-preview`; those are deprecated Sept 2025) +- **Auth (dev):** `EXPO_PUBLIC_OPENAI_API_KEY` baked at Metro bundle time (same + pattern as current `EXPO_PUBLIC_ELEVENLABS_AGENT_ID_DEV`) +- **Auth (prod):** Ephemeral keys via `POST /v1/realtime/client_secrets` before + each session — API key stays server-side +- **VAD:** `semantic_vad` with `eagerness: low` — won't cut off mid-sentence; + no silence-filling per system prompt +- **Tools:** Same `realtimeClientTools` object — zero changes. OpenAI function + calling uses the same zod-validated async callback pattern. +- **Context injection:** `conversation.item.create { role: "system" }` replaces + `sendContextualUpdate()` — direct equivalent +- **Session names:** Same `{{initialConversationContext}}` pattern — inject on + connect via `session.update` instructions field +- **Scope:** Only 3 files change (RealtimeVoiceSession.tsx, .web.tsx, + RealtimeProvider.tsx). 
Everything else is untouched. + +## Architecture: What Changes vs What Stays + +### Files that change +| File | Change | +|---|---| +| `RealtimeVoiceSession.tsx` | Replace ElevenLabs `useConversation` hook with WebRTC client | +| `RealtimeVoiceSession.web.tsx` | Same for web — browser WebRTC APIs | +| `RealtimeProvider.tsx` | Remove `ElevenLabsProvider` wrapper | +| `package.json` | Remove `@elevenlabs/*` packages | +| env config | Add `EXPO_PUBLIC_OPENAI_API_KEY` | + +### Files that stay identical +- `types.ts` — VoiceSession interface (provider-agnostic) +- `RealtimeSession.ts` — session control singleton +- `realtimeClientTools.ts` — all 3 tools (messageClaudeCode, processPermissionRequest, switchSession) +- `hooks/voiceHooks.ts` — event routing +- `hooks/contextFormatters.ts` — message formatting +- `voiceConfig.ts` — feature flags + +## Data Flow + +``` +Tap mic + → requestMicrophonePermission() + → fetch ephemeral key: POST /v1/realtime/client_secrets (prod) + OR use EXPO_PUBLIC_OPENAI_API_KEY directly (dev) + → RTCPeerConnection to api.openai.com/v1/realtime/calls + → RTCDataChannel "oai-events" for all signaling + → Audio track: mic stream → WebRTC → OpenAI + OpenAI → WebRTC → speaker (auto-routed) + +On connect: + → session.update: { model, voice, instructions (system prompt + session roster), + tools: [messageClaudeCode, processPermissionRequest, switchSession], + turn_detection: { type: "semantic_vad", eagerness: "low" } } + +On tool call: + → response.function_call_arguments.done event received + → realtimeClientTools[name](params) called (unchanged) + → conversation.item.create { type: "function_call_output" } + → response.create to resume + +On sendContextualUpdate(text): + → conversation.item.create { role: "system", content: [{ type: "text", text }] } + → response.create (only if no response in-flight) +``` + +## Tool Schema (OpenAI format) + +Tools are registered in `session.update` as standard OpenAI function calling JSON. 
+No ElevenLabs-specific format needed: + +```json +{ + "type": "function", + "name": "messageClaudeCode", + "description": "Send a message to a Claude Code session...", + "parameters": { + "type": "object", + "properties": { + "message": { "type": "string" }, + "session": { "type": "string" } + }, + "required": ["message", "session"], + "additionalProperties": false + }, + "strict": true +} +``` + +## Pricing Impact + +At moderate usage (~120 min/month): +- ElevenLabs Creator: $11/month flat +- OpenAI `gpt-realtime` GA: ~120 min × $0.057/min ≈ $6.84/month + +At heavy use (500 min/month): +- ElevenLabs Pro: $99/month +- OpenAI: ~$28.50/month + +OpenAI is cheaper at scale. The old preview models were 3-4× more expensive; GA pricing +made this favorable. + +## Resolved Questions + +- **react-native-webrtc already in project?** Yes — `@config-plugins/react-native-webrtc: ^12.0.0` + and `@livekit/react-native-webrtc: ^137.0.0` both present. +- **Audio handling in RN?** WebRTC handles it natively via mic stream; no manual PCM16 encoding. +- **Official OpenAI SDK in RN?** Not supported. We write a minimal custom WebRTC client class + (~200 LOC) that conforms to `VoiceSession`. No SDK dependency needed. +- **Language codes?** OpenAI uses BCP-47 (e.g. "en-US") — simpler than ElevenLabs codes. + Replace `getElevenLabsCodeFromPreference()` with a trivial BCP-47 mapper. +- **System prompt / session roster?** Same approach — pass as `instructions` in `session.update` + on connect. `{{initialConversationContext}}` variable pattern replaced by actual string injection. + +## Open Questions + +_(none — all resolved above)_ + +## Next Steps + +> Run `/workflow:write-plan` to produce the TDD implementation plan. 
+ +Estimated scope: ~2-3 days of implementation work +- Day 1: WebRTC client class + session.update + connect/disconnect +- Day 2: Tool calling event loop + context injection +- Day 3: Testing, ElevenLabs cleanup, env config docs diff --git a/docs/elevenlabs-agent-setup.md b/docs/elevenlabs-agent-setup.md new file mode 100644 index 0000000000..8bcba00bde --- /dev/null +++ b/docs/elevenlabs-agent-setup.md @@ -0,0 +1,292 @@ +# ElevenLabs Agent Setup for Happy Coder + +## Overview + +Happy Coder uses ElevenLabs Conversational AI agents for voice interaction. +The voice agent can send messages to Claude Code sessions and respond to +permission requests — it's not just TTS, it's an interactive AI voice layer. + +## Step 1: Create an ElevenLabs Account + +Sign up at https://elevenlabs.io — the free tier includes ~15 minutes of +Conversational AI per month, enough to validate the integration. + +## Step 2: Create a Conversational AI Agent + +1. Go to ElevenLabs dashboard → **Agents** (or Conversational AI) +2. Create a new agent +3. Note the **Agent ID** (you'll need this for the app build) + +## Step 3: Configure the Agent + +### System Prompt + +The agent acts as a voice interface to Claude Code sessions. Use this as the +system prompt: + +``` +You are Happy Voice, a proactive voice assistant that helps users manage +MULTIPLE Claude Code sessions from their phone while driving or away from +their keyboard. + +You act as an aggregating project manager across all active sessions. You will +receive context updates from multiple sessions simultaneously. + +ACTIVE SESSIONS (injected at voice start): +{{initialConversationContext}} + +YOUR RESPONSIBILITIES: +1. Proactively inform the user when any session finishes work, encounters an + error, or needs permission — don't wait to be asked. +2. Route messages to the correct session based on the user's intent. 
If they + say "on the trading bot, add error handling", match "trading bot" to the + session folder name and use the messageClaudeCode tool with the session + parameter. +3. When permission requests come in, tell the user which project needs it and + what it wants to do. Keep it brief: "Trading bot wants to run npm install. + Approve?" +4. When the user says "approve" or "deny" without specifying a session, apply + it to whichever session has a pending request. +5. If the user asks for a status update, summarize all active sessions briefly. + +VOICE STYLE: +- Keep it SHORT — 1-2 sentences per update. The user is driving. +- Use project folder names to identify sessions, not IDs. +- Summarize technical details — never read code, file paths, or JSON. +- Be proactive: when a session finishes or needs attention, speak up immediately. + +SILENCE BEHAVIOR (CRITICAL): +- Do NOT fill silence. The user is driving and thinking. +- NEVER ask "is there anything else I can help with?" or similar filler. +- NEVER prompt the user to speak when there is a pause. +- Only speak when YOU have something to report (session update, permission + request, error) or when the USER speaks to you first. +- Silence is normal. Wait quietly. The user will talk when they need you. + +TOOLS: +- messageClaudeCode: Send a message to a session. You MUST always specify the + "session" parameter with the folder name. If the user doesn't name a session, + ask which one before calling the tool. This also auto-switches the screen. +- processPermissionRequest: Approve or deny. You MUST always specify the + "session" parameter. When reporting a permission request, always name the + session so the user's response is unambiguous. +- switchSession: Switch the app screen to show a specific session. Use this + when the user wants to see a session's output, or when context makes it clear + which session should be visible. You MUST specify the "session" parameter. 
+``` + +### Client Tools + +The app registers three client-side tools. Configure these in your ElevenLabs +agent with matching names and schemas: + +#### Tool 1: messageClaudeCode + +Sends a text message to a Claude Code session. Supports multi-session routing +via the required `session` parameter (matched against folder names). + +```json +{ + "type": "client", + "name": "messageClaudeCode", + "description": "Send a message to Claude Code. You MUST specify the 'session' parameter with the project folder name (e.g. 'trading-bot', 'family-journal'). Always ask the user to clarify which session if unclear.", + "expects_response": false, + "response_timeout_secs": 1, + "parameters": [ + { + "id": "message", + "type": "string", + "description": "The message to send to Claude Code", + "dynamic_variable": "", + "required": true, + "constant_value": "", + "value_type": "llm_prompt" + }, + { + "id": "session", + "type": "string", + "description": "Target session name (folder name like 'trading-bot'). Always required.", + "dynamic_variable": "", + "required": true, + "constant_value": "", + "value_type": "llm_prompt" + } + ], + "dynamic_variables": { + "dynamic_variable_placeholders": {} + }, + "assignments": [], + "disable_interruptions": false, + "force_pre_tool_speech": "auto", + "tool_call_sound": null, + "tool_call_sound_behavior": "auto", + "execution_mode": "immediate" +} +``` + +#### Tool 2: processPermissionRequest + +Approves or denies a pending permission request. Supports multi-session routing. + +```json +{ + "type": "client", + "name": "processPermissionRequest", + "description": "Approve or deny a permission request from Claude Code. You MUST specify the 'session' parameter with the project folder name. Always confirm which session with the user if unclear.", + "expects_response": false, + "response_timeout_secs": 1, + "parameters": [ + { + "id": "decision", + "type": "string", + "description": "Whether to allow or deny the permission request. 
Must be 'allow' or 'deny'.", + "dynamic_variable": "", + "required": true, + "constant_value": "", + "value_type": "llm_prompt" + }, + { + "id": "session", + "type": "string", + "description": "Target session name (folder name). Always required.", + "dynamic_variable": "", + "required": true, + "constant_value": "", + "value_type": "llm_prompt" + } + ], + "dynamic_variables": { + "dynamic_variable_placeholders": {} + }, + "assignments": [], + "disable_interruptions": false, + "force_pre_tool_speech": "auto", + "tool_call_sound": null, + "tool_call_sound_behavior": "auto", + "execution_mode": "immediate" +} +``` + +#### Tool 3: switchSession + +Switches the app screen to show a specific session. Also called automatically +when sending a message, but can be used standalone (e.g. "show me the trading bot"). + +```json +{ + "type": "client", + "name": "switchSession", + "description": "Switch the app screen to display a specific session. Use when the user asks to see a session, or when context makes it clear they want to view a different project. Always specify the session name.", + "expects_response": false, + "response_timeout_secs": 1, + "parameters": [ + { + "id": "session", + "type": "string", + "description": "Target session name (folder name like 'trading-bot'). Always required.", + "dynamic_variable": "", + "required": true, + "constant_value": "", + "value_type": "llm_prompt" + } + ], + "dynamic_variables": { + "dynamic_variable_placeholders": {} + }, + "assignments": [], + "disable_interruptions": false, + "force_pre_tool_speech": "auto", + "tool_call_sound": null, + "tool_call_sound_behavior": "auto", + "execution_mode": "immediate" +} +``` + +### Dynamic Variables + +The system prompt uses `{{initialConversationContext}}` — this is a dynamic +variable that the app fills with the full list of active sessions when voice +starts. In ElevenLabs dashboard: + +1. Go to your agent → **System Prompt** section +2. 
When you type `{{initialConversationContext}}` it should auto-register as + a dynamic variable +3. If it doesn't, go to **Dynamic Variables** and add one named + `initialConversationContext` with an empty default value + +The app also sends `sessionId` as a dynamic variable (the session the user +was viewing when they tapped the mic button). + +### Agent Settings + +| Setting | Recommended Value | +|---------|------------------| +| **Access** | Public (unauthenticated) — needed for direct agentId connection | +| **Voice** | Pick any ElevenLabs voice you like | +| **Language** | The app sends user's preferred language, but default to English | +| **LLM** | Use the default (ElevenLabs absorbs LLM costs for now) | +| **Max duration** | 10 minutes (or whatever your plan allows) | + +### Silence / End-of-Turn Settings + +In the ElevenLabs agent dashboard, look for these settings and adjust: + +| Setting | Recommended Value | +|---------|------------------| +| **Inactivity timeout** | Maximum allowed (or disable if possible) | +| **End call on silence** | Disabled (user is driving, long silences are normal) | + +The system prompt already instructs the agent not to fill silence, but these +platform-level settings reinforce that behavior. + +**Important:** The agent MUST have public/unauthenticated access enabled for the +direct-connect path (experiments=false). If you want token-based auth instead, +you'd need to self-host the Happy server with your `ELEVENLABS_API_KEY`. + +## Step 4: Build the App + +```bash +cd /Users/cr/Scripts/AI-Dev/happy + +# Install dependencies +yarn install + +# Build for iOS simulator +cd packages/happy-app +EXPO_PUBLIC_ELEVENLABS_AGENT_ID_DEV=<your-agent-id> yarn prebuild +EXPO_PUBLIC_ELEVENLABS_AGENT_ID_DEV=<your-agent-id> yarn ios:dev +``` + +## Step 5: Test Voice + +1. Open the app in the simulator +2. Authenticate with the CLI (`happy` command on your Mac) +3. Open a Claude Code session +4. Tap the microphone button in the session view +5. 
Grant microphone permission +6. Speak — the agent should respond and can relay messages to Claude Code + +## Context Updates the Agent Receives + +The app automatically sends contextual updates to the voice agent: + +| Event | What the agent sees | +|-------|-------------------| +| Session focus | Which session the user is looking at | +| New messages | Claude Code's responses, tool calls, user messages | +| Permission requests | Tool name, arguments, request ID | +| Session online/offline | Connection status changes | +| Ready event | "Claude Code done working" notification | + +These arrive as contextual updates (not user messages), so the agent can +proactively inform the user about what's happening. + +## Cost Estimate + +| Usage | Minutes/month | Cost (Creator plan, $11/mo) | +|-------|---------------|----------------------------| +| Light (quick checks) | ~30 min | Included in 250 min | +| Moderate (daily use) | ~120 min | Included in 250 min | +| Heavy (constant voice) | ~500 min | Need Pro plan ($99/mo) | + +Free tier: 15 min/month for testing. diff --git a/docs/plans/2026-03-16-migration-openai-realtime-plan.md b/docs/plans/2026-03-16-migration-openai-realtime-plan.md new file mode 100644 index 0000000000..621e16bb1d --- /dev/null +++ b/docs/plans/2026-03-16-migration-openai-realtime-plan.md @@ -0,0 +1,572 @@ +--- +title: Migrate Voice Layer from ElevenLabs to OpenAI Realtime API +type: migration +status: draft +created: 2026-03-16 +brainstorm: docs/brainstorms/2026-03-16-openai-realtime-migration-brainstorm.md +--- + +# Migrate Voice Layer: ElevenLabs → OpenAI Realtime (WebRTC) + +## Summary + +Replace the ElevenLabs Conversational AI SDK with a custom WebRTC client +connecting to OpenAI's Realtime API (`gpt-realtime` GA model). The existing +`VoiceSession` interface is the abstraction boundary — everything above it +(voiceHooks, contextFormatters, realtimeClientTools, UI) stays unchanged. 
+ +**Why:** OpenAI Realtime GA is ~40% cheaper at moderate usage, removes the +ElevenLabs dependency, gives us direct control over VAD/silence/tools, and +the `semantic_vad` with `eagerness: low` is ideal for the driving use case. + +**Scope:** Voice transport layer only. No UI changes. No new features. + +## Architecture + +``` +┌─────────────────────────────────────────────────┐ +│ voiceHooks.ts / contextFormatters.ts │ UNCHANGED +│ realtimeClientTools.ts (3 tools) │ UNCHANGED (format adapter in client) +├─────────────────────────────────────────────────┤ +│ VoiceSession interface (types.ts) │ MINOR CHANGE (add clientSecret field) +├─────────────────────────────────────────────────┤ +│ RealtimeVoiceSession.tsx (native) │ REPLACED → OpenAIRealtimeClient +│ RealtimeVoiceSession.web.tsx (web) │ REPLACED → same client (browser WebRTC) +│ RealtimeProvider.tsx │ SIMPLIFIED (remove ElevenLabsProvider) +├─────────────────────────────────────────────────┤ +│ RealtimeSession.ts │ UPDATED (ephemeral key flow) +│ apiVoice.ts │ UPDATED (new endpoint for key) +└─────────────────────────────────────────────────┘ +``` + +## Key Decisions + +(see brainstorm: docs/brainstorms/2026-03-16-openai-realtime-migration-brainstorm.md) + +1. **WebRTC transport** — not WebSocket. WebRTC handles echo cancellation, + noise suppression, and codec negotiation automatically. Critical for driving. +2. **Custom client class** (~200 LOC) — no official RN SDK exists. We use + `@livekit/react-native-webrtc@137.x` which already supports `RTCDataChannel`. +3. **Ephemeral keys** — for production, proxy through Happy server. For dev, + direct API key is acceptable (already exposed in bundle anyway). +4. **`semantic_vad` with `eagerness: low`** — lets the user pause and think + without triggering premature turn-ends. Critical for driving UX. +5. 
**Keep `realtimeClientTools.ts` unchanged** — the OpenAI client class + translates tool definitions to OpenAI format and routes `function_call` + events back to the existing tool handlers. + +## Event Mapping: ElevenLabs → OpenAI + +| ElevenLabs Hook | OpenAI Equivalent | Implementation | +|-----------------|-------------------|----------------| +| `onConnect` | `datachannel.onopen` + `session.created` event | Fire status='connected' | +| `onDisconnect` | `pc.oniceconnectionstatechange → 'closed'` | Fire status='disconnected' | +| `onModeChange('speaking')` | `response.audio.delta` event | Set mode='speaking' | +| `onModeChange('listening')` | `response.audio.done` / `response.done` event | Set mode='idle' | +| `onError` | `error` event on data channel | Set status='disconnected' | +| `sendContextualUpdate(text)` | `conversation.item.create` (role=user, invisible) + NO `response.create` | Silent context injection | +| `sendUserMessage(text)` | `conversation.item.create` (role=user) + `response.create` | Triggers model speech | +| `clientTools[name](params)` | `response.function_call_arguments.done` → execute → `conversation.item.create` (function_call_output) + `response.create` | Same tool functions | + +### Critical: `sendContextualUpdate` vs `sendTextMessage` + +ElevenLabs distinguishes these natively. OpenAI does not. Our mapping: + +- **`sendContextualUpdate`** → `conversation.item.create` with role `user` but + do NOT follow with `response.create`. The model sees the context but doesn't + speak about it. Add prefix: `[CONTEXT UPDATE - do not respond to this]`. +- **`sendTextMessage`** → `conversation.item.create` with role `user` then + `response.create` to trigger a spoken response. + +## Tasks + +### Task 0: Spike — Verify DataChannel + OpenAI SDP Handshake + +**Goal:** Prove the WebRTC connection works end-to-end in React Native before +writing any production code. + +**Files:** None (throwaway test) + +**Steps:** +1. 
Create a minimal test script that: + - Fetches an ephemeral key from OpenAI (`POST /v1/realtime/client_secrets`) + - Creates `RTCPeerConnection` using `@livekit/react-native-webrtc` + - Creates data channel `oai-events` + - Creates SDP offer, POSTs to OpenAI, sets SDP answer + - Logs data channel `onopen` and first `session.created` event + - Sends a `session.update` with a simple instruction + - Adds mic track and verifies audio output +2. Run on physical iPhone to verify audio routing +3. If this fails, we stop and reassess + +**Why first:** If DataChannel or SDP handshake doesn't work with the LiveKit +fork, the entire plan is blocked. Find out in 30 minutes, not 2 days. + +--- + +### Task 1: Update `VoiceSessionConfig` Interface + +**File:** `sources/realtime/types.ts` + +**Change:** +```typescript +export interface VoiceSessionConfig { + sessionId: string; + initialContext?: string; + token?: string; // ElevenLabs token (keep for backward compat during migration) + agentId?: string; // ElevenLabs agent ID (keep for backward compat) + clientSecret?: string; // OpenAI ephemeral key + apiKey?: string; // OpenAI API key (dev only) + provider?: 'elevenlabs' | 'openai'; // Which backend to use +} +``` + +**Test:** `yarn typecheck` passes with no new errors. + +--- + +### Task 2: Create `OpenAIRealtimeClient` Class + +**New file:** `sources/realtime/openai/OpenAIRealtimeClient.ts` + +This is the core of the migration — a ~200 LOC class that: + +1. **Manages WebRTC lifecycle** — peer connection, data channel, audio tracks +2. **Handles OpenAI protocol** — session.update, conversation.item.create, response events +3. **Translates tools** — converts `realtimeClientTools` format to OpenAI function-calling schema +4. 
**Fires callbacks** — onConnect, onDisconnect, onModeChange, onError (matching ElevenLabs shape) + +**Public API:** +```typescript +interface OpenAIRealtimeCallbacks { + onConnect: () => void; + onDisconnect: () => void; + onModeChange: (mode: 'speaking' | 'idle') => void; + onError: (error: Error) => void; +} + +class OpenAIRealtimeClient { + constructor(callbacks: OpenAIRealtimeCallbacks); + + async connect(config: { + clientSecret?: string; + apiKey?: string; + model?: string; + instructions: string; + tools: OpenAIToolDef[]; + voice?: string; + vadConfig?: VADConfig; + }): Promise<void>; + + disconnect(): void; + + // Maps to sendContextualUpdate (no response triggered) + injectContext(text: string): void; + + // Maps to sendTextMessage (triggers response) + sendMessage(text: string): void; +} +``` + +**Internal flow:** +1. `connect()`: + - Create `RTCPeerConnection` with `@livekit/react-native-webrtc` + - Create data channel `oai-events` + - Get mic stream via `navigator.mediaDevices.getUserMedia` or RN equivalent + - Add audio track to peer connection + - Create SDP offer + - POST offer to `https://api.openai.com/v1/realtime?model=gpt-realtime` + with `Authorization: Bearer ${clientSecret}` and `Content-Type: application/sdp` + - Set remote SDP answer + - Wait for `datachannel.onopen` + - Send `session.update` with instructions, tools, VAD config + - Fire `onConnect` + +2. Data channel event handler: + - `session.created` → log, no action needed + - `response.audio.delta` → set speaking mode (debounced) + - `response.audio.done` / `response.done` → set idle mode + - `response.function_call_arguments.done` → execute tool, return result + - `error` → fire onError + - `input_audio_buffer.speech_started` → (optional) set 'listening' state + - `session.ended` → fire onDisconnect + +3. ICE state handling: + - `disconnected` → attempt ICE restart once + - `failed` → fire onDisconnect, clean up + - `closed` → fire onDisconnect + +4. 
Cleanup (`disconnect()`): + - Close data channel + - Stop all media tracks + - Close peer connection + - Fire onDisconnect + +**Edge cases to handle:** +- Buffer events until data channel is open (queue `session.update` etc.) +- Guard against double-connect (AsyncLock) +- Handle `response.cancelled` (user interrupts model) — reset mode to idle +- Tool timeout: OpenAI expects response within 15s. If tool takes longer, + send an optimistic ack. +- Memory cleanup in disconnect: nullify all refs, remove all listeners + +**Test:** Unit test with mocked RTCPeerConnection verifying: +- SDP offer/answer exchange +- Data channel event routing +- Tool call → execute → result cycle +- Mode state transitions +- Cleanup on disconnect + +--- + +### Task 3: Create Tool Definition Translator + +**New file:** `sources/realtime/openai/toolTranslator.ts` + +Converts the existing `realtimeClientTools` format to OpenAI's function-calling schema. + +```typescript +export function translateToolsForOpenAI( + clientTools: Record<string, (params: any) => Promise<any>> +): OpenAIToolDef[] { + // Maps: + // messageClaudeCode → { type: "function", name: "messageClaudeCode", ... } + // processPermissionRequest → ... + // switchSession → ... + // Uses hardcoded schema definitions matching the Zod schemas in realtimeClientTools +} +``` + +**Why separate file:** Keeps `realtimeClientTools.ts` unchanged. The translator +knows the parameter schemas and maps them to OpenAI JSON Schema format. + +**Test:** Snapshot test that the output matches expected OpenAI tool schema. + +--- + +### Task 4: Build System Prompt for OpenAI + +**New file:** `sources/realtime/openai/systemPrompt.ts` + +Generates the OpenAI session instructions from the same content currently in +the ElevenLabs agent dashboard. This is now code-controlled instead of +dashboard-configured. + +```typescript +export function buildSystemPrompt(initialContext: string): string { + return `You are Happy Voice, a proactive voice assistant... 
+ +ACTIVE SESSIONS: +${initialContext} + +YOUR RESPONSIBILITIES: +... + +SILENCE BEHAVIOR (CRITICAL): +... + +TOOLS: +...`; +} +``` + +**Advantage:** No more manual dashboard updates. System prompt lives in code, +versioned in git, deployed with the app. + +**Test:** Verify prompt includes all required sections; verify initialContext injection. + +--- + +### Task 5: Replace `RealtimeVoiceSession.tsx` (Native) + +**File:** `sources/realtime/RealtimeVoiceSession.tsx` + +Replace the ElevenLabs `useConversation` hook with `OpenAIRealtimeClient`. + +**Before:** React component using `useConversation` hook from `@elevenlabs/react-native` +**After:** React component that instantiates `OpenAIRealtimeClient` in a `useRef` + +```typescript +import { OpenAIRealtimeClient } from './openai/OpenAIRealtimeClient'; + +let clientInstance: OpenAIRealtimeClient | null = null; + +class OpenAIVoiceSessionImpl implements VoiceSession { + async startSession(config: VoiceSessionConfig): Promise<void> { + if (!clientInstance) return; + + const tools = translateToolsForOpenAI(realtimeClientTools); + const instructions = buildSystemPrompt(config.initialContext || ''); + + await clientInstance.connect({ + clientSecret: config.clientSecret, + apiKey: config.apiKey, + instructions, + tools, + voice: 'alloy', // or user preference + vadConfig: { type: 'semantic_vad', eagerness: 'low' } + }); + } + + async endSession(): Promise<void> { + clientInstance?.disconnect(); + } + + sendTextMessage(message: string): void { + clientInstance?.sendMessage(message); + } + + sendContextualUpdate(update: string): void { + clientInstance?.injectContext(update); + } +} + +export const RealtimeVoiceSession: React.FC = () => { + useEffect(() => { + clientInstance = new OpenAIRealtimeClient({ + onConnect: () => { + storage.getState().setRealtimeStatus('connected'); + storage.getState().setRealtimeMode('idle'); + // Send session roster (same as current code) + }, + onDisconnect: () => { + 
storage.getState().setRealtimeStatus('disconnected'); + storage.getState().setRealtimeMode('idle', true); + storage.getState().clearRealtimeModeDebounce(); + }, + onModeChange: (mode) => { + storage.getState().setRealtimeMode(mode === 'speaking' ? 'speaking' : 'idle'); + }, + onError: (error) => { + console.warn('OpenAI Realtime error:', error); + storage.getState().setRealtimeStatus('disconnected'); + storage.getState().setRealtimeMode('idle', true); + } + }); + + registerVoiceSession(new OpenAIVoiceSessionImpl()); + + return () => { clientInstance = null; }; + }, []); + + return null; +}; +``` + +**Test:** Integration test verifying VoiceSession interface contract is met. + +--- + +### Task 6: Replace `RealtimeVoiceSession.web.tsx` (Web) + +**File:** `sources/realtime/RealtimeVoiceSession.web.tsx` + +Same approach as Task 5 but using browser-native `RTCPeerConnection` instead +of `@livekit/react-native-webrtc`. The `OpenAIRealtimeClient` should accept +a WebRTC factory to support both environments: + +```typescript +// In OpenAIRealtimeClient constructor: +constructor(callbacks, options?: { RTCPeerConnection?: typeof RTCPeerConnection }) +``` + +Native passes the LiveKit import, web uses the browser global. + +**Test:** Same interface contract test as Task 5. + +--- + +### Task 7: Simplify `RealtimeProvider.tsx` + +**File:** `sources/realtime/RealtimeProvider.tsx` + +Remove `ElevenLabsProvider` wrapper. The component becomes: + +```typescript +export const RealtimeProvider = ({ children }: { children: React.ReactNode }) => { + return ( + <> + <RealtimeVoiceSession /> + {children} + </> + ); +}; +``` + +(This already matches `RealtimeProvider.web.tsx` — they can now be unified.) + +**Test:** `yarn typecheck` passes. 
+
+---
+
+### Task 8: Update `RealtimeSession.ts` — Ephemeral Key Flow
+
+**File:** `sources/realtime/RealtimeSession.ts`
+
+Replace the ElevenLabs token/agentId flow:
+
+```typescript
+// OLD (ElevenLabs):
+await voiceSession.startSession({ sessionId, initialContext, agentId });
+
+// NEW (OpenAI):
+const secret = await fetchEphemeralKey(); // or use API key in dev
+await voiceSession.startSession({
+  sessionId,
+  initialContext,
+  clientSecret: secret,
+  provider: 'openai'
+});
+```
+
+**For dev mode:** Use `config.openAiApiKey` directly (from `EXPO_PUBLIC_OPENAI_API_KEY`).
+**For experiments/production:** Call Happy server endpoint to mint ephemeral key.
+
+The `experimentsEnabled` branch stays but calls a new server endpoint instead
+of `fetchVoiceToken`. Paywall check remains unchanged.
+
+**Test:** Verify both dev (direct key) and production (server proxy) paths.
+
+---
+
+### Task 9: Add OpenAI Environment Variables
+
+**Files:**
+- `app.config.js` — add to `extra.app`
+- `sources/sync/appConfig.ts` — add to `AppConfig` interface and loader
+- `CLAUDE.local.md` — document new env vars
+
+```bash
+EXPO_PUBLIC_OPENAI_API_KEY=sk-... # Dev only, direct connection
+EXPO_PUBLIC_OPENAI_REALTIME_MODEL=gpt-realtime # Optional override
+EXPO_PUBLIC_OPENAI_REALTIME_VOICE=alloy # Voice selection
+```
+
+**Test:** `yarn typecheck`; verify config loads correctly.
+
+---
+
+### Task 10: Audio Output Routing
+
+**File:** `sources/utils/audioRouting.ts` (new)
+
+Ensure audio plays through speaker (not earpiece) for the driving use case.
+
+```typescript
+import { Audio } from 'expo-audio';
+
+export async function configureAudioForVoiceSession(): Promise<void> {
+  await Audio.setAudioModeAsync({
+    playsInSilentModeIOS: true,
+    allowsRecordingIOS: true,
+    staysActiveInBackground: true,
+    // Route to speaker, not earpiece
+  });
+}
+```
+
+Called before `startSession` in `RealtimeSession.ts`.
+ +**Note:** Also need `UIBackgroundModes: audio` in `app.config.js` for iOS +background audio (user switches to Maps while driving). + +**Test:** Manual test on physical device — audio plays from speaker. + +--- + +### Task 11: Session Duration Handling + +The OpenAI Realtime API has a 30-minute max session duration. For driving, +this needs handling. + +**In `OpenAIRealtimeClient`:** +- Track session start time +- At 25 minutes, fire a callback `onSessionExpiring` +- On `session.ended` event, fire `onDisconnect` with a reason +- In `RealtimeSession.ts`, auto-reconnect with new ephemeral key if + session expires (preserve no conversation state — just reconnect fresh) + +**Test:** Simulate session expiry event; verify reconnection. + +--- + +### Task 12: ICE Restart / Network Recovery + +**In `OpenAIRealtimeClient`:** +- Listen to `pc.oniceconnectionstatechange` +- On `disconnected`: wait 2s, attempt ICE restart +- On `failed` after restart: full disconnect + reconnect +- On network type change (WiFi→cellular): proactive ICE restart + +This is critical for the driving use case — tunnels, dead zones, cell handoffs. + +**Test:** Simulate ICE state transitions; verify restart behavior. + +--- + +### Task 13: Remove ElevenLabs Dependencies + +**After all tasks verified working:** + +1. Remove from `package.json`: + - `@elevenlabs/react` + - `@elevenlabs/react-native` +2. Remove language mapping: `sources/constants/Languages.ts` (ElevenLabs codes) +3. Remove `fetchVoiceToken` from `sources/sync/apiVoice.ts` +4. Run `yarn install && yarn typecheck` + +**Test:** Full build succeeds with no ElevenLabs references. 
+ +--- + +## Task Dependency Graph + +``` +Task 0 (spike) ─── GATE ───┐ + │ +Task 1 (types) ─────────────┤ +Task 3 (tool translator) ───┤ +Task 4 (system prompt) ─────┤ +Task 9 (env vars) ──────────┤ +Task 10 (audio routing) ────┤ + │ + ├──→ Task 2 (client class) + │ │ + │ ├──→ Task 5 (native session) + │ ├──→ Task 6 (web session) + │ ├──→ Task 8 (session lifecycle) + │ │ + │ ├──→ Task 7 (simplify provider) + │ ├──→ Task 11 (session duration) + │ └──→ Task 12 (ICE recovery) + │ + └──→ Task 13 (remove ElevenLabs) ← LAST +``` + +**Parallelizable wave 1:** Tasks 1, 3, 4, 9, 10 (all independent) +**Parallelizable wave 2:** Tasks 5, 6 (after Task 2) +**Sequential:** Task 0 gates everything. Task 13 is last. + +## Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|------------| +| LiveKit WebRTC fork has DataChannel bugs | Low | Blocker | Task 0 spike proves this early | +| OpenAI SDP handshake fails from RN | Low | Blocker | Task 0 spike | +| `sendContextualUpdate` mapping pollutes conversation | Medium | Degraded UX | Prefix with "[CONTEXT UPDATE]" instruction | +| 30-min session limit during long drives | Certain | Interruption | Task 11 auto-reconnect | +| Audio routes to earpiece instead of speaker | Medium | Unusable for driving | Task 10 explicit routing | +| Network drops in tunnels | Certain | Session drops | Task 12 ICE restart | + +## Success Criteria + +- [ ] Voice connects and audio flows bidirectionally on physical iPhone +- [ ] All 3 tools (messageClaudeCode, processPermissionRequest, switchSession) work +- [ ] Context updates reach the model without triggering speech +- [ ] Text messages trigger model speech +- [ ] Speaking/idle mode transitions drive UI animation +- [ ] Session survives a WiFi→cellular handoff +- [ ] Session auto-reconnects after 30-min expiry +- [ ] Audio plays through speaker, not earpiece +- [ ] `yarn typecheck` passes +- [ ] No ElevenLabs imports remain (after Task 13) diff --git 
a/docs/solutions/integration-issues/react-native-webrtc-getusermedia-silent-failure.md b/docs/solutions/integration-issues/react-native-webrtc-getusermedia-silent-failure.md new file mode 100644 index 0000000000..cc46cf5671 --- /dev/null +++ b/docs/solutions/integration-issues/react-native-webrtc-getusermedia-silent-failure.md @@ -0,0 +1,113 @@ +--- +module: Voice Layer (OpenAI Realtime API Integration) +date: 2026-03-16 +problem_type: integration_issue +symptoms: + - Speakerphone activates but no audio flows in either direction + - Microphone capture fails silently with no crash or visible error + - Model speech output not received despite connected status + - WebRTC connection appears fully established (SDP handshake, session.created event) +root_cause: OpenAIRealtimeClient used browser API navigator.mediaDevices.getUserMedia() which does not exist in React Native; requires mediaDevices imported from @livekit/react-native-webrtc +resolution_type: code_fix +severity: critical +tags: + - react-native-webrtc + - openai-realtime-api + - cross-platform-api + - dependency-injection + - voice-integration + - browser-api-incompatibility +--- + +# React Native WebRTC: `navigator.mediaDevices` Silently Fails — Complete Audio Silence + +## Problem + +When using `@livekit/react-native-webrtc` for the OpenAI Realtime API voice integration, calling `navigator?.mediaDevices?.getUserMedia()` inside a shared WebRTC client class silently fails in React Native. The WebRTC connection completes successfully (SDP handshake, data channel open, `session.created` event received) but no mic track is ever added to the peer connection and no remote audio plays — total silence in both directions. 
+ +## Symptoms + +- Voice button tap activates the speakerphone (iOS audio session changes) — looks connected +- `[OpenAIRealtime] Connected` log appears, UI shows "connected" state +- No mic input detected by the model (model never responds) +- No model audio output despite `response.audio.delta` events potentially arriving +- Metro logs show: `[OpenAIRealtime] Mic not available: ...` (if not swallowed) or nothing at all + +## What Didn't Work + +**Assuming the WebRTC connection itself was broken:** SDP handshake logs showed success and `session.created` arrived on the data channel. The signaling plane was fine — the issue was in the media plane. + +**Using `navigator?.mediaDevices?.getUserMedia?.({ audio: true })`:** `navigator.mediaDevices` is `undefined` in React Native (no browser host). Optional chaining prevented a hard crash but silently swallowed the failure. Execution continued without a mic track, and the try/catch logged only a warning — the session appeared live while audio was completely broken. + +## Solution + +Inject `mediaDevices` as a constructor dependency on the WebRTC client class. The native entry point passes the RN-specific implementation; the web entry point falls back to the browser global. + +**Before (broken):** +```typescript +// OpenAIRealtimeClient.ts +constructor(callbacks, options?: { RTCPeerConnection?: any }) { + this.RTCPeerConnectionCtor = options?.RTCPeerConnection ?? globalThis.RTCPeerConnection; +} + +// Inside connect(): +this.localStream = await (navigator?.mediaDevices?.getUserMedia?.({ audio: true }) as any); +``` + +**After (fixed):** +```typescript +// OpenAIRealtimeClient.ts +constructor( + callbacks: OpenAIRealtimeCallbacks, + options?: { RTCPeerConnection?: any; mediaDevices?: any } +) { + this.RTCPeerConnectionCtor = options?.RTCPeerConnection ?? globalThis.RTCPeerConnection; + this.mediaDevicesImpl = options?.mediaDevices ?? 
navigator?.mediaDevices; +} + +// Inside connect(): +if (this.mediaDevicesImpl?.getUserMedia) { + this.localStream = await this.mediaDevicesImpl.getUserMedia({ audio: true }); + // ...add track to peer connection +} +``` + +**Native session — RealtimeVoiceSession.tsx:** +```typescript +import { RTCPeerConnection, mediaDevices as rnMediaDevices } from '@livekit/react-native-webrtc'; + +clientInstance = new OpenAIRealtimeClient(callbacks, { + RTCPeerConnection, + mediaDevices: rnMediaDevices, // ← RN-specific implementation +}); +``` + +**Web session — RealtimeVoiceSession.web.tsx:** +```typescript +// No override needed — falls back to navigator.mediaDevices (browser native) +clientInstance = new OpenAIRealtimeClient(callbacks); +``` + +## Why This Works + +React Native runs JavaScript on Hermes (or JSC) — not a browser. `navigator.mediaDevices` is a W3C Media Capture spec API that only exists because browser vendors implement it as part of their platform. In React Native there is no browser host, so the property is `undefined`. `@livekit/react-native-webrtc` ships its own `mediaDevices` export that bridges to the native iOS/Android media subsystem — nearly identical API surface to the browser version, which is exactly what makes the bug invisible until runtime. + +Dependency injection solves this structurally: the client class never reaches for a platform global. The caller (native or web session file) supplies the correct implementation at construction time. An incorrect implementation fails loudly at the wiring point; a missing one is immediately obvious. The bug becomes impossible to introduce silently. + +The real danger with optional chaining (`?.`) is that it turns "this global doesn't exist" into "silently do nothing" — which in audio code means silent operation, the hardest possible failure mode to debug. + +## Prevention + +1. 
**Never access `navigator.*` or `window.*` directly inside shared/cross-platform classes.** Any file that is imported by both a `.tsx` and `.web.tsx` must not reference browser globals. Add an ESLint `no-restricted-globals` rule for `navigator` and `window` in `sources/realtime/` and similar shared directories. + +2. **Treat swallowed catch blocks as defects when the fallback is broken.** A `catch` that logs a warning and continues without the mic is not defensive — it's a silent failure. If the resource is required for the feature to work, the error must propagate or result in an explicit `error` status. + +3. **Apply the platform injection pattern to all RN/Web split capabilities:** `mediaDevices`, `RTCPeerConnection`, `AudioContext`, `localStorage`, clipboard, biometrics. Accept them as constructor arguments. Wire them in the platform-specific entry files (`.tsx` vs `.web.tsx`). + +4. **Verify the media plane separately from the signaling plane.** SDP handshake success and `session.created` on the data channel only prove signaling works. A test or smoke check should assert at least one audio track was added to the peer connection before claiming "connected." + +5. **When porting Web API code to React Native, grep for:** `navigator.`, `window.`, `document.`, `globalThis.`, `AudioContext`, `localStorage` — treat every hit as a porting defect until confirmed otherwise. + +## Related Issues + +None identified. diff --git a/docs/solutions/patterns/critical-patterns.md b/docs/solutions/patterns/critical-patterns.md new file mode 100644 index 0000000000..53dd4a0da2 --- /dev/null +++ b/docs/solutions/patterns/critical-patterns.md @@ -0,0 +1,40 @@ +# Critical Patterns — Required Reading + +These patterns MUST be followed. All subagents check this file before +code generation. Violations of these patterns cause real bugs. + +--- + +## 1. 
React Native WebRTC: Never Use `navigator.mediaDevices` in Shared Classes + +### WRONG (silent audio failure — connection appears live but no audio) +```typescript +// Inside a shared WebRTC client class: +this.localStream = await (navigator?.mediaDevices?.getUserMedia?.({ audio: true }) as any); +``` + +### CORRECT +```typescript +// Accept mediaDevices as a constructor dependency: +constructor(callbacks, options?: { RTCPeerConnection?: any; mediaDevices?: any }) { + this.mediaDevicesImpl = options?.mediaDevices ?? navigator?.mediaDevices; +} + +// In connect(): +if (this.mediaDevicesImpl?.getUserMedia) { + this.localStream = await this.mediaDevicesImpl.getUserMedia({ audio: true }); +} + +// Native entry point (RealtimeVoiceSession.tsx): +import { RTCPeerConnection, mediaDevices as rnMediaDevices } from '@livekit/react-native-webrtc'; +new OpenAIRealtimeClient(callbacks, { RTCPeerConnection, mediaDevices: rnMediaDevices }); + +// Web entry point (RealtimeVoiceSession.web.tsx): +new OpenAIRealtimeClient(callbacks); // falls back to navigator.mediaDevices +``` + +**Why:** `navigator.mediaDevices` is a browser Web API — it is `undefined` in React Native. Optional chaining (`?.`) prevents a crash but silently returns nothing, so execution continues without a mic track. The WebRTC signaling plane (SDP handshake, data channel) completes successfully, making the bug invisible until you notice total audio silence. + +**Placement/Context:** Any class that handles WebRTC audio in a codebase that targets both React Native and web. Also applies to `RTCPeerConnection`, `AudioContext`, and other browser media globals. 
+ +**Documented in:** `docs/solutions/integration-issues/react-native-webrtc-getusermedia-silent-failure.md` diff --git a/packages/happy-app/index.ts b/packages/happy-app/index.ts index cf9ee28c1b..e3a2affa22 100644 --- a/packages/happy-app/index.ts +++ b/packages/happy-app/index.ts @@ -1,2 +1,4 @@ +import { LogBox } from 'react-native'; +LogBox.ignoreAllLogs(true); import './sources/unistyles'; import 'expo-router/entry'; \ No newline at end of file diff --git a/packages/happy-app/sources/realtime/RealtimeProvider.tsx b/packages/happy-app/sources/realtime/RealtimeProvider.tsx index 685897aae2..c5d62d2c30 100644 --- a/packages/happy-app/sources/realtime/RealtimeProvider.tsx +++ b/packages/happy-app/sources/realtime/RealtimeProvider.tsx @@ -1,12 +1,11 @@ import React from 'react'; -import { ElevenLabsProvider } from "@elevenlabs/react-native"; import { RealtimeVoiceSession } from './RealtimeVoiceSession'; export const RealtimeProvider = ({ children }: { children: React.ReactNode }) => { return ( - + <> {children} - + ); -}; \ No newline at end of file +}; diff --git a/packages/happy-app/sources/realtime/RealtimeSession.ts b/packages/happy-app/sources/realtime/RealtimeSession.ts index 93ab973186..c6c0f2fb12 100644 --- a/packages/happy-app/sources/realtime/RealtimeSession.ts +++ b/packages/happy-app/sources/realtime/RealtimeSession.ts @@ -1,9 +1,6 @@ import type { VoiceSession } from './types'; -import { fetchVoiceToken } from '@/sync/apiVoice'; import { storage } from '@/sync/storage'; -import { sync } from '@/sync/sync'; import { Modal } from '@/modal'; -import { TokenStorage } from '@/auth/tokenStorage'; import { t } from '@/text'; import { config } from '@/config'; import { requestMicrophonePermission, showMicrophonePermissionDeniedAlert } from '@/utils/microphonePermissions'; @@ -26,66 +23,22 @@ export async function startRealtimeSession(sessionId: string, initialContext?: s return; } - const experimentsEnabled = storage.getState().settings.experiments; - const 
agentId = __DEV__ ? config.elevenLabsAgentIdDev : config.elevenLabsAgentIdProd; - - if (!agentId) { - console.error('Agent ID not configured'); + const apiKey = config.openAiApiKey; + if (!apiKey) { + console.error('OpenAI API key not configured'); return; } - - try { - // Simple path: No experiments = no auth needed - if (!experimentsEnabled) { - currentSessionId = sessionId; - voiceSessionStarted = true; - await voiceSession.startSession({ - sessionId, - initialContext, - agentId // Use agentId directly, no token - }); - return; - } - - // Experiments enabled = full auth flow - const credentials = await TokenStorage.getCredentials(); - if (!credentials) { - Modal.alert(t('common.error'), t('errors.authenticationFailed')); - return; - } - - const response = await fetchVoiceToken(credentials, sessionId); - console.log('[Voice] fetchVoiceToken response:', response); - - if (!response.allowed) { - console.log('[Voice] Not allowed, presenting paywall...'); - const result = await sync.presentPaywall(); - console.log('[Voice] Paywall result:', result); - if (result.purchased) { - await startRealtimeSession(sessionId, initialContext); - } - return; - } + try { currentSessionId = sessionId; voiceSessionStarted = true; - if (response.token) { - // Use token from backend - await voiceSession.startSession({ - sessionId, - initialContext, - token: response.token, - agentId: response.agentId - }); - } else { - // No token (e.g. 
server not deployed yet) - use agentId directly - await voiceSession.startSession({ - sessionId, - initialContext, - agentId - }); - } + // OpenAI Realtime: pass API key, the client handles ephemeral key exchange + await voiceSession.startSession({ + sessionId, + initialContext, + apiKey, + }); } catch (error) { console.error('Failed to start realtime session:', error); currentSessionId = null; @@ -98,7 +51,7 @@ export async function stopRealtimeSession() { if (!voiceSession) { return; } - + try { await voiceSession.endSession(); currentSessionId = null; @@ -125,4 +78,4 @@ export function getVoiceSession(): VoiceSession | null { export function getCurrentRealtimeSessionId(): string | null { return currentSessionId; -} \ No newline at end of file +} diff --git a/packages/happy-app/sources/realtime/RealtimeVoiceSession.tsx b/packages/happy-app/sources/realtime/RealtimeVoiceSession.tsx index da558e1ec3..2a59d1992a 100644 --- a/packages/happy-app/sources/realtime/RealtimeVoiceSession.tsx +++ b/packages/happy-app/sources/realtime/RealtimeVoiceSession.tsx @@ -1,160 +1,123 @@ import React, { useEffect, useRef } from 'react'; -import { useConversation } from '@elevenlabs/react-native'; +import { + RTCPeerConnection, + mediaDevices as rnMediaDevices, +} from '@livekit/react-native-webrtc'; import { registerVoiceSession } from './RealtimeSession'; import { storage } from '@/sync/storage'; -import { realtimeClientTools } from './realtimeClientTools'; -import { getElevenLabsCodeFromPreference } from '@/constants/Languages'; +import { OpenAIRealtimeClient } from './openai/OpenAIRealtimeClient'; +import { buildSystemPrompt } from './openai/systemPrompt'; +import { OPENAI_TOOL_DEFINITIONS } from './openai/toolTranslator'; +import { getSessionLabel } from './hooks/contextFormatters'; +import { config } from '@/config'; import type { VoiceSession, VoiceSessionConfig } from './types'; -// Static reference to the conversation hook instance -let conversationInstance: ReturnType | null 
= null; +// Global client reference accessible from the VoiceSession implementation +let clientInstance: OpenAIRealtimeClient | null = null; -// Global voice session implementation -class RealtimeVoiceSessionImpl implements VoiceSession { - - async startSession(config: VoiceSessionConfig): Promise { - if (!conversationInstance) { - console.warn('Realtime voice session not initialized'); +class OpenAIVoiceSessionImpl implements VoiceSession { + + async startSession(sessionConfig: VoiceSessionConfig): Promise { + if (!clientInstance) { + console.warn('[OpenAIVoiceSession] Client not initialized'); return; } try { storage.getState().setRealtimeStatus('connecting'); - - // Get user's preferred language for voice assistant - const userLanguagePreference = storage.getState().settings.voiceAssistantLanguage; - const elevenLabsLanguage = getElevenLabsCodeFromPreference(userLanguagePreference); - - if (!config.token && !config.agentId) { - throw new Error('Neither token nor agentId provided'); - } - - const sessionConfig: any = { - dynamicVariables: { - sessionId: config.sessionId, - initialConversationContext: config.initialContext || '' - }, - overrides: { - agent: { - language: elevenLabsLanguage - } - }, - ...(config.token ? 
{ conversationToken: config.token } : { agentId: config.agentId }) - }; - - await conversationInstance.startSession(sessionConfig); + + const instructions = buildSystemPrompt(sessionConfig.initialContext || ''); + const voice = config.openAiRealtimeVoice || 'alloy'; + const model = config.openAiRealtimeModel || undefined; + + await clientInstance.connect({ + clientSecret: sessionConfig.clientSecret, + apiKey: sessionConfig.apiKey || config.openAiApiKey, + model, + instructions, + tools: OPENAI_TOOL_DEFINITIONS, + voice, + vadType: 'semantic_vad', + vadEagerness: 'low', + }); } catch (error) { - console.error('Failed to start realtime session:', error); + console.error('[OpenAIVoiceSession] Failed to start:', error); storage.getState().setRealtimeStatus('error'); } } async endSession(): Promise { - if (!conversationInstance) { - return; - } - - try { - await conversationInstance.endSession(); - storage.getState().setRealtimeStatus('disconnected'); - } catch (error) { - console.error('Failed to end realtime session:', error); - } + clientInstance?.disconnect(); + storage.getState().setRealtimeStatus('disconnected'); } sendTextMessage(message: string): void { - if (!conversationInstance) { - console.warn('Realtime voice session not initialized'); - return; - } - - try { - conversationInstance.sendUserMessage(message); - } catch (error) { - console.error('Failed to send text message:', error); - } + clientInstance?.sendMessage(message); } sendContextualUpdate(update: string): void { - if (!conversationInstance) { - console.warn('Realtime voice session not initialized'); - return; - } - - try { - conversationInstance.sendContextualUpdate(update); - } catch (error) { - console.error('Failed to send contextual update:', error); - } + clientInstance?.injectContext(update); } } export const RealtimeVoiceSession: React.FC = () => { - const conversation = useConversation({ - clientTools: realtimeClientTools, - onConnect: (data) => { - console.log('Realtime session 
connected:', data); - storage.getState().setRealtimeStatus('connected'); - storage.getState().setRealtimeMode('idle'); - }, - onDisconnect: () => { - console.log('Realtime session disconnected'); - storage.getState().setRealtimeStatus('disconnected'); - storage.getState().setRealtimeMode('idle', true); // immediate mode change - storage.getState().clearRealtimeModeDebounce(); - }, - onMessage: (data) => { - console.log('Realtime message:', data); - }, - onError: (error) => { - // Log but don't block app - voice features will be unavailable - // This prevents initialization errors from showing "Terminals error" on startup - console.warn('Realtime voice not available:', error); - // Don't set error status during initialization - just set disconnected - // This allows the app to continue working without voice features - storage.getState().setRealtimeStatus('disconnected'); - storage.getState().setRealtimeMode('idle', true); // immediate mode change - }, - onStatusChange: (data) => { - console.log('Realtime status change:', data); - }, - onModeChange: (data) => { - console.log('Realtime mode change:', data); - - // Only animate when speaking - const mode = data.mode as string; - const isSpeaking = mode === 'speaking'; - - // Use centralized debounce logic from storage - storage.getState().setRealtimeMode(isSpeaking ? 
'speaking' : 'idle'); - }, - onDebug: (message) => { - console.debug('Realtime debug:', message); - } - }); - const hasRegistered = useRef(false); useEffect(() => { - // Store the conversation instance globally - conversationInstance = conversation; + // Create OpenAI Realtime client with RN WebRTC + clientInstance = new OpenAIRealtimeClient( + { + onConnect: () => { + console.log('[OpenAIVoiceSession] Connected'); + storage.getState().setRealtimeStatus('connected'); + storage.getState().setRealtimeMode('idle'); + + // Send session roster after connect + try { + const active = storage.getState().getActiveSessions(); + if (active.length > 0 && clientInstance) { + const roster = active.map(s => { + const label = getSessionLabel(s); + const summary = s.metadata?.summary?.text || 'no summary yet'; + const status = s.active ? 'online' : 'offline'; + return `- "${label}" (${status}): ${summary}`; + }).join('\n'); + clientInstance.injectContext( + `ACTIVE SESSIONS:\n${roster}\n\nUse these session names when calling tools.` + ); + } + } catch (error) { + console.warn('[OpenAIVoiceSession] Failed to send roster:', error); + } + }, + onDisconnect: (reason) => { + console.log('[OpenAIVoiceSession] Disconnected:', reason); + storage.getState().setRealtimeStatus('disconnected'); + storage.getState().setRealtimeMode('idle', true); + storage.getState().clearRealtimeModeDebounce(); + }, + onModeChange: (mode) => { + storage.getState().setRealtimeMode(mode === 'speaking' ? 
'speaking' : 'idle'); + }, + onError: (error) => { + console.warn('[OpenAIVoiceSession] Error:', error); + storage.getState().setRealtimeStatus('disconnected'); + storage.getState().setRealtimeMode('idle', true); + }, + }, + { RTCPeerConnection, mediaDevices: rnMediaDevices } + ); - // Register the voice session once if (!hasRegistered.current) { - try { - registerVoiceSession(new RealtimeVoiceSessionImpl()); - hasRegistered.current = true; - } catch (error) { - console.error('Failed to register voice session:', error); - } + registerVoiceSession(new OpenAIVoiceSessionImpl()); + hasRegistered.current = true; } return () => { - // Clean up on unmount - conversationInstance = null; + clientInstance?.disconnect(); + clientInstance = null; }; - }, [conversation]); + }, []); - // This component doesn't render anything visible return null; -}; \ No newline at end of file +}; diff --git a/packages/happy-app/sources/realtime/RealtimeVoiceSession.web.tsx b/packages/happy-app/sources/realtime/RealtimeVoiceSession.web.tsx index 54edb46727..46204d6929 100644 --- a/packages/happy-app/sources/realtime/RealtimeVoiceSession.web.tsx +++ b/packages/happy-app/sources/realtime/RealtimeVoiceSession.web.tsx @@ -1,168 +1,127 @@ import React, { useEffect, useRef } from 'react'; -import { useConversation } from '@elevenlabs/react'; import { registerVoiceSession } from './RealtimeSession'; import { storage } from '@/sync/storage'; -import { realtimeClientTools } from './realtimeClientTools'; -import { getElevenLabsCodeFromPreference } from '@/constants/Languages'; +import { OpenAIRealtimeClient } from './openai/OpenAIRealtimeClient'; +import { buildSystemPrompt } from './openai/systemPrompt'; +import { OPENAI_TOOL_DEFINITIONS } from './openai/toolTranslator'; +import { getSessionLabel } from './hooks/contextFormatters'; +import { config } from '@/config'; import type { VoiceSession, VoiceSessionConfig } from './types'; -// Static reference to the conversation hook instance -let 
conversationInstance: ReturnType | null = null; +// Global client reference +let clientInstance: OpenAIRealtimeClient | null = null; -// Global voice session implementation -class RealtimeVoiceSessionImpl implements VoiceSession { +class OpenAIVoiceSessionImpl implements VoiceSession { - async startSession(config: VoiceSessionConfig): Promise { - console.log('[RealtimeVoiceSessionImpl] conversationInstance:', conversationInstance); - if (!conversationInstance) { - console.warn('Realtime voice session not initialized - conversationInstance is null'); + async startSession(sessionConfig: VoiceSessionConfig): Promise { + if (!clientInstance) { + console.warn('[OpenAIVoiceSession.web] Client not initialized'); return; } try { storage.getState().setRealtimeStatus('connecting'); - // Request microphone permission first + // Request mic permission on web try { await navigator.mediaDevices.getUserMedia({ audio: true }); } catch (error) { - console.error('Failed to get microphone permission:', error); + console.error('[OpenAIVoiceSession.web] Mic permission denied:', error); storage.getState().setRealtimeStatus('error'); return; } - // Get user's preferred language for voice assistant - const userLanguagePreference = storage.getState().settings.voiceAssistantLanguage; - const elevenLabsLanguage = getElevenLabsCodeFromPreference(userLanguagePreference); - - if (!config.token && !config.agentId) { - throw new Error('Neither token nor agentId provided'); - } - - const sessionConfig: any = { - connectionType: 'webrtc', - dynamicVariables: { - sessionId: config.sessionId, - initialConversationContext: config.initialContext || '' - }, - overrides: { - agent: { - language: elevenLabsLanguage - } - }, - ...(config.token ? 
{ conversationToken: config.token } : { agentId: config.agentId }) - }; - - const conversationId = await conversationInstance.startSession(sessionConfig); - - console.log('Started conversation with ID:', conversationId); + const instructions = buildSystemPrompt(sessionConfig.initialContext || ''); + const voice = config.openAiRealtimeVoice || 'alloy'; + const model = config.openAiRealtimeModel || undefined; + + await clientInstance.connect({ + clientSecret: sessionConfig.clientSecret, + apiKey: sessionConfig.apiKey || config.openAiApiKey, + model, + instructions, + tools: OPENAI_TOOL_DEFINITIONS, + voice, + vadType: 'semantic_vad', + vadEagerness: 'low', + }); } catch (error) { - console.error('Failed to start realtime session:', error); + console.error('[OpenAIVoiceSession.web] Failed to start:', error); storage.getState().setRealtimeStatus('error'); } } async endSession(): Promise { - if (!conversationInstance) { - return; - } - - try { - await conversationInstance.endSession(); - storage.getState().setRealtimeStatus('disconnected'); - } catch (error) { - console.error('Failed to end realtime session:', error); - } + clientInstance?.disconnect(); + storage.getState().setRealtimeStatus('disconnected'); } sendTextMessage(message: string): void { - if (!conversationInstance) { - console.warn('Realtime voice session not initialized'); - return; - } - - conversationInstance.sendUserMessage(message); + clientInstance?.sendMessage(message); } sendContextualUpdate(update: string): void { - if (!conversationInstance) { - console.warn('Realtime voice session not initialized'); - return; - } - - conversationInstance.sendContextualUpdate(update); + clientInstance?.injectContext(update); } } export const RealtimeVoiceSession: React.FC = () => { - const conversation = useConversation({ - clientTools: realtimeClientTools, - onConnect: () => { - console.log('Realtime session connected'); - storage.getState().setRealtimeStatus('connected'); - 
storage.getState().setRealtimeMode('idle'); - }, - onDisconnect: () => { - console.log('Realtime session disconnected'); - storage.getState().setRealtimeStatus('disconnected'); - storage.getState().setRealtimeMode('idle', true); // immediate mode change - storage.getState().clearRealtimeModeDebounce(); - }, - onMessage: (data) => { - console.log('Realtime message:', data); - }, - onError: (error) => { - // Log but don't block app - voice features will be unavailable - // This prevents initialization errors from showing "Terminals error" on startup - console.warn('Realtime voice not available:', error); - // Don't set error status during initialization - just set disconnected - // This allows the app to continue working without voice features - storage.getState().setRealtimeStatus('disconnected'); - storage.getState().setRealtimeMode('idle', true); // immediate mode change - }, - onStatusChange: (data) => { - console.log('Realtime status change:', data); - }, - onModeChange: (data) => { - console.log('Realtime mode change:', data); - - // Only animate when speaking - const mode = data.mode as string; - const isSpeaking = mode === 'speaking'; - - // Use centralized debounce logic from storage - storage.getState().setRealtimeMode(isSpeaking ? 
'speaking' : 'idle'); - }, - onDebug: (message) => { - console.debug('Realtime debug:', message); - } - }); - const hasRegistered = useRef(false); useEffect(() => { - // Store the conversation instance globally - console.log('[RealtimeVoiceSession] Setting conversationInstance:', conversation); - conversationInstance = conversation; + // Web uses browser-native RTCPeerConnection — no override needed + clientInstance = new OpenAIRealtimeClient( + { + onConnect: () => { + console.log('[OpenAIVoiceSession.web] Connected'); + storage.getState().setRealtimeStatus('connected'); + storage.getState().setRealtimeMode('idle'); + + // Send session roster after connect + try { + const active = storage.getState().getActiveSessions(); + if (active.length > 0 && clientInstance) { + const roster = active.map(s => { + const label = getSessionLabel(s); + const summary = s.metadata?.summary?.text || 'no summary yet'; + const status = s.active ? 'online' : 'offline'; + return `- "${label}" (${status}): ${summary}`; + }).join('\n'); + clientInstance.injectContext( + `ACTIVE SESSIONS:\n${roster}\n\nUse these session names when calling tools.` + ); + } + } catch (error) { + console.warn('[OpenAIVoiceSession.web] Failed to send roster:', error); + } + }, + onDisconnect: (reason) => { + console.log('[OpenAIVoiceSession.web] Disconnected:', reason); + storage.getState().setRealtimeStatus('disconnected'); + storage.getState().setRealtimeMode('idle', true); + storage.getState().clearRealtimeModeDebounce(); + }, + onModeChange: (mode) => { + storage.getState().setRealtimeMode(mode === 'speaking' ? 
'speaking' : 'idle'); + }, + onError: (error) => { + console.warn('[OpenAIVoiceSession.web] Error:', error); + storage.getState().setRealtimeStatus('disconnected'); + storage.getState().setRealtimeMode('idle', true); + }, + } + ); - // Register the voice session once if (!hasRegistered.current) { - try { - console.log('[RealtimeVoiceSession] Registering voice session'); - registerVoiceSession(new RealtimeVoiceSessionImpl()); - hasRegistered.current = true; - console.log('[RealtimeVoiceSession] Voice session registered successfully'); - } catch (error) { - console.error('Failed to register voice session:', error); - } + registerVoiceSession(new OpenAIVoiceSessionImpl()); + hasRegistered.current = true; } return () => { - // Clean up on unmount - conversationInstance = null; + clientInstance?.disconnect(); + clientInstance = null; }; - }, [conversation]); + }, []); - // This component doesn't render anything visible return null; -}; \ No newline at end of file +}; diff --git a/packages/happy-app/sources/realtime/hooks/contextFormatters.ts b/packages/happy-app/sources/realtime/hooks/contextFormatters.ts index 817e49e68e..288b52d132 100644 --- a/packages/happy-app/sources/realtime/hooks/contextFormatters.ts +++ b/packages/happy-app/sources/realtime/hooks/contextFormatters.ts @@ -2,6 +2,8 @@ import { Session } from "@/sync/storageTypes"; import { Message } from "@/sync/typesMessage"; import { trimIdent } from "@/utils/trimIdent"; import { VOICE_CONFIG } from "../voiceConfig"; +import { getSessionName } from "@/utils/sessionUtils"; +import { storage } from "@/sync/storage"; interface SessionMetadata { summary?: { text?: string }; @@ -11,6 +13,18 @@ interface SessionMetadata { [key: string]: any; } +/** + * Get a short, voice-friendly label for a session. + * Prefers the folder name (short, stable) over the summary (long, changes). 
+ */ +export function getSessionLabel(session: Session): string { + if (session.metadata?.path) { + const segments = session.metadata.path.split('/').filter(Boolean); + return segments.pop() || session.id.slice(0, 8); + } + return getSessionName(session); +} + /** * Format a permission request for natural language context @@ -21,9 +35,12 @@ export function formatPermissionRequest( toolName: string, toolArgs: any ): string { + const session = storage.getState().sessions[sessionId]; + const label = session ? getSessionLabel(session) : sessionId.slice(0, 8); return trimIdent(` - Claude Code is requesting permission to use ${toolName} (session ${sessionId}): + Claude Code in "${label}" is requesting permission to use ${toolName}: ${requestId} + ${label} ${toolName} ${JSON.stringify(toolArgs)} `); @@ -62,7 +79,9 @@ export function formatNewSingleMessage(sessionId: string, message: Message): str if (!formatted) { return null; } - return 'New message in session: ' + sessionId + '\n\n' + formatted; + const session = storage.getState().sessions[sessionId]; + const label = session ? getSessionLabel(session) : sessionId.slice(0, 8); + return `New message in "${label}":\n\n` + formatted; } export function formatNewMessages(sessionId: string, messages: Message[]): string | null { @@ -70,7 +89,9 @@ export function formatNewMessages(sessionId: string, messages: Message[]): strin if (formatted.length === 0) { return null; } - return 'New messages in session: ' + sessionId + '\n\n' + formatted.join('\n\n'); + const session = storage.getState().sessions[sessionId]; + const label = session ? 
getSessionLabel(session) : sessionId.slice(0, 8); + return `New messages in "${label}":\n\n` + formatted.join('\n\n'); } export function formatHistory(sessionId: string, messages: Message[]): string { @@ -86,42 +107,51 @@ export function formatHistory(sessionId: string, messages: Message[]): string { // export function formatSessionFull(session: Session, messages: Message[]): string { + const label = getSessionLabel(session); const sessionName = session.metadata?.summary?.text; const sessionPath = session.metadata?.path; const lines: string[] = []; - // Add session context - lines.push(`# Session ID: ${session.id}`); + // Add session context with voice-friendly label + lines.push(`# Session "${label}" (ID: ${session.id})`); lines.push(`# Project path: ${sessionPath}`); - lines.push(`# Session summary:\n${sessionName}`); - - // Add session metadata if available - if (session.metadata?.summary?.text) { - lines.push('## Session Summary'); - lines.push(session.metadata.summary.text); - lines.push(''); + if (sessionName) { + lines.push(`# Summary: ${sessionName}`); } // Add history - lines.push('## Our interaction history so far'); + lines.push('## Interaction history'); lines.push(''); lines.push(formatHistory(session.id, messages)); return lines.join('\n\n'); } +function labelFromMetadata(sessionId: string, metadata?: SessionMetadata): string { + if (metadata?.path) { + const segments = metadata.path.split('/').filter(Boolean); + return segments.pop() || sessionId.slice(0, 8); + } + if (metadata?.summary?.text) { + return metadata.summary.text.slice(0, 40); + } + return sessionId.slice(0, 8); +} + export function formatSessionOffline(sessionId: string, metadata?: SessionMetadata): string { - return `Session went offline: ${sessionId}`; + return `Session "${labelFromMetadata(sessionId, metadata)}" went offline.`; } export function formatSessionOnline(sessionId: string, metadata?: SessionMetadata): string { - return `Session came online: ${sessionId}`; + return `Session 
"${labelFromMetadata(sessionId, metadata)}" came online.`; } export function formatSessionFocus(sessionId: string, metadata?: SessionMetadata): string { - return `Session became focused: ${sessionId}`; + return `User is now looking at session "${labelFromMetadata(sessionId, metadata)}".`; } export function formatReadyEvent(sessionId: string): string { - return `Claude Code done working in session: ${sessionId}. The previous message(s) are the summary of the work done. Report this to the human immediately.`; + const session = storage.getState().sessions[sessionId]; + const label = session ? getSessionLabel(session) : sessionId.slice(0, 8); + return `Claude Code finished working in "${label}". The previous message(s) are the summary of the work done. Report this to the human immediately.`; } \ No newline at end of file diff --git a/packages/happy-app/sources/realtime/hooks/voiceHooks.ts b/packages/happy-app/sources/realtime/hooks/voiceHooks.ts index bb91d9b9da..5fc9f2ee35 100644 --- a/packages/happy-app/sources/realtime/hooks/voiceHooks.ts +++ b/packages/happy-app/sources/realtime/hooks/voiceHooks.ts @@ -121,7 +121,9 @@ export const voiceHooks = { }, /** - * Called when voice session starts + * Called when voice session starts. + * Builds initial context covering ALL active sessions so the voice agent + * can act as an aggregating project manager across sessions. */ onVoiceStarted(sessionId: string): string { if (VOICE_CONFIG.ENABLE_DEBUG_LOGGING) { @@ -129,13 +131,26 @@ export const voiceHooks = { } shownSessions.clear(); let prompt = ''; - prompt += 'THIS IS AN ACTIVE SESSION: \n\n' + formatSessionFull(storage.getState().sessions[sessionId], storage.getState().sessionMessages[sessionId]?.messages ?? 
[]); - shownSessions.add(sessionId); - // prompt += 'Another active sessions: \n\n'; - // for (let s of storage.getState().getActiveSessions()) { - // if (s.id === sessionId) continue; - // prompt += formatSessionFull(s, storage.getState().sessionMessages[s.id]?.messages ?? []); - // } + + // Primary session the user launched voice from + const primarySession = storage.getState().sessions[sessionId]; + if (primarySession) { + prompt += 'PRIMARY SESSION (user is currently viewing this one):\n\n'; + prompt += formatSessionFull(primarySession, storage.getState().sessionMessages[sessionId]?.messages ?? []); + shownSessions.add(sessionId); + } + + // All other active sessions + const otherActive = storage.getState().getActiveSessions().filter(s => s.id !== sessionId); + if (otherActive.length > 0) { + prompt += '\n\nOTHER ACTIVE SESSIONS:\n\n'; + for (const s of otherActive) { + prompt += formatSessionFull(s, storage.getState().sessionMessages[s.id]?.messages ?? []); + prompt += '\n\n---\n\n'; + shownSessions.add(s.id); + } + } + return prompt; }, diff --git a/packages/happy-app/sources/realtime/openai/OpenAIRealtimeClient.ts b/packages/happy-app/sources/realtime/openai/OpenAIRealtimeClient.ts new file mode 100644 index 0000000000..a0ba520c8d --- /dev/null +++ b/packages/happy-app/sources/realtime/openai/OpenAIRealtimeClient.ts @@ -0,0 +1,448 @@ +/** + * OpenAI Realtime API client over WebRTC. + * + * Manages the peer connection, data channel, audio tracks, and + * translates OpenAI events into the same callback shape that + * RealtimeVoiceSession expects. + * + * Works in both React Native (via @livekit/react-native-webrtc) + * and browser (native WebRTC APIs) — caller passes the RTCPeerConnection + * constructor if needed. 
+ */
+
+import { OPENAI_TOOL_DEFINITIONS, type OpenAIToolDef } from './toolTranslator';
+import { realtimeClientTools } from '../realtimeClientTools';
+
+// ────────────────────────────────────────────────────────────────
+// Types
+// ────────────────────────────────────────────────────────────────
+
+export interface OpenAIRealtimeCallbacks {
+  onConnect: () => void;
+  onDisconnect: (reason?: string) => void;
+  onModeChange: (mode: 'speaking' | 'idle') => void;
+  onError: (error: Error) => void;
+}
+
+export interface OpenAIRealtimeConfig {
+  clientSecret?: string;
+  apiKey?: string;
+  model?: string;
+  instructions: string;
+  tools?: OpenAIToolDef[];
+  voice?: string;
+  vadType?: 'semantic_vad' | 'server_vad';
+  vadEagerness?: 'low' | 'medium' | 'high' | 'auto';
+}
+
+interface PendingEvent {
+  type: string;
+  [key: string]: any;
+}
+
+// ────────────────────────────────────────────────────────────────
+// Constants
+// ────────────────────────────────────────────────────────────────
+
+const OPENAI_REALTIME_URL = 'https://api.openai.com/v1/realtime';
+const OPENAI_SESSIONS_URL = 'https://api.openai.com/v1/realtime/sessions';
+// GA model name per migration decision doc — the '-preview' variants are
+// deprecated and no 'gpt-realtime-1.5' model exists in the Realtime API.
+const DEFAULT_MODEL = 'gpt-realtime';
+const DEFAULT_VOICE = 'alloy';
+const SESSION_MAX_MS = 29 * 60 * 1000; // warn at 29 min (max is 30)
+
+// ────────────────────────────────────────────────────────────────
+// Client
+// ────────────────────────────────────────────────────────────────
+
+export class OpenAIRealtimeClient {
+  private callbacks: OpenAIRealtimeCallbacks;
+  private pc: RTCPeerConnection | null = null;
+  private dc: RTCDataChannel | null = null;
+  private localStream: MediaStream | null = null;
+  private connected = false;
+  private connecting = false;
+  private pendingEvents: PendingEvent[] = [];
+  private sessionTimer: ReturnType<typeof setTimeout> | null = null;
+  private isSpeaking = false;
+  private speakingTimeout: ReturnType<typeof setTimeout> | null = null;
+
+  // Caller can override WebRTC constructors (for RN vs browser)
+  private 
RTCPeerConnectionCtor: any; + private mediaDevicesImpl: any; + + constructor( + callbacks: OpenAIRealtimeCallbacks, + options?: { RTCPeerConnection?: any; mediaDevices?: any } + ) { + this.callbacks = callbacks; + this.RTCPeerConnectionCtor = options?.RTCPeerConnection ?? globalThis.RTCPeerConnection; + this.mediaDevicesImpl = options?.mediaDevices ?? navigator?.mediaDevices; + } + + // ──────────────────────────────────────────────────────────── + // Public API + // ──────────────────────────────────────────────────────────── + + async connect(config: OpenAIRealtimeConfig): Promise { + if (this.connecting || this.connected) { + console.warn('[OpenAIRealtime] Already connected or connecting'); + return; + } + this.connecting = true; + + const model = config.model || DEFAULT_MODEL; + const voice = config.voice || DEFAULT_VOICE; + + try { + // 1. Get ephemeral key (or use API key directly for dev) + let bearerToken: string; + if (config.clientSecret) { + bearerToken = config.clientSecret; + } else if (config.apiKey) { + bearerToken = await this.fetchEphemeralKey(config.apiKey, model, voice); + } else { + throw new Error('No clientSecret or apiKey provided'); + } + + // 2. Create peer connection + this.pc = new this.RTCPeerConnectionCtor() as any; + + // ICE state monitoring + (this.pc as any).addEventListener('iceconnectionstatechange', () => { + const state = (this.pc as any)?.iceConnectionState; + console.log('[OpenAIRealtime] ICE state:', state); + if (state === 'failed' || state === 'closed') { + this.handleDisconnect('ICE ' + state); + } + }); + + // Remote audio track (model speaking) + (this.pc as any).addEventListener('track', (event: any) => { + console.log('[OpenAIRealtime] Remote track:', event.track?.kind); + }); + + // 3. Create data channel + this.dc = (this.pc as any).createDataChannel('oai-events'); + this.setupDataChannel(); + + // 4. 
Get mic audio + try { + if (this.mediaDevicesImpl?.getUserMedia) { + this.localStream = await this.mediaDevicesImpl.getUserMedia({ audio: true }); + if (this.localStream) { + const track = this.localStream.getTracks()[0]; + if (track) (this.pc as any).addTrack(track, this.localStream); + console.log('[OpenAIRealtime] Mic track added'); + } + } else { + console.warn('[OpenAIRealtime] No mediaDevices available'); + } + } catch (micErr) { + console.warn('[OpenAIRealtime] Mic not available:', micErr); + // Continue — data channel still works + } + + // 5. SDP offer/answer + const offer = await (this.pc as any).createOffer({ offerToReceiveAudio: true }); + await (this.pc as any).setLocalDescription(offer); + + const sdpResponse = await fetch(`${OPENAI_REALTIME_URL}?model=${model}`, { + method: 'POST', + body: offer.sdp, + headers: { + 'Authorization': `Bearer ${bearerToken}`, + 'Content-Type': 'application/sdp', + }, + }); + + if (!sdpResponse.ok) { + const errText = await sdpResponse.text(); + throw new Error(`SDP exchange failed: ${sdpResponse.status} ${errText}`); + } + + const answerSdp = await sdpResponse.text(); + await (this.pc as any).setRemoteDescription({ type: 'answer', sdp: answerSdp }); + + // 6. Wait for data channel open + await this.waitForDataChannelOpen(10000); + + // 7. 
Send session.update + this.sendEvent({ + type: 'session.update', + session: { + instructions: config.instructions, + tools: config.tools || OPENAI_TOOL_DEFINITIONS, + tool_choice: 'auto', + turn_detection: { + type: config.vadType || 'semantic_vad', + eagerness: config.vadEagerness || 'low', + }, + voice, + input_audio_noise_reduction: { type: 'near_field' }, + }, + }); + + this.connected = true; + this.connecting = false; + + // Session expiry timer + this.sessionTimer = setTimeout(() => { + console.warn('[OpenAIRealtime] Session approaching 30-min limit'); + this.callbacks.onDisconnect('session_expiring'); + }, SESSION_MAX_MS); + + this.callbacks.onConnect(); + + } catch (err: any) { + this.connecting = false; + this.cleanup(); + this.callbacks.onError(err instanceof Error ? err : new Error(String(err))); + } + } + + disconnect(): void { + this.handleDisconnect('user_requested'); + } + + /** + * Inject context without triggering a model response. + * Maps to ElevenLabs sendContextualUpdate. + */ + injectContext(text: string): void { + if (!this.connected || !this.dc) return; + this.sendEvent({ + type: 'conversation.item.create', + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: `[CONTEXT UPDATE - do not respond to this] ${text}` }], + }, + }); + // No response.create — model sees the context but doesn't speak + } + + /** + * Send a user-facing message that triggers a spoken model response. + * Maps to ElevenLabs sendUserMessage. 
+ */ + sendMessage(text: string): void { + if (!this.connected || !this.dc) return; + this.sendEvent({ + type: 'conversation.item.create', + item: { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text }], + }, + }); + this.sendEvent({ type: 'response.create' }); + } + + // ──────────────────────────────────────────────────────────── + // Private + // ──────────────────────────────────────────────────────────── + + private async fetchEphemeralKey(apiKey: string, model: string, voice: string): Promise { + console.log('[OpenAIRealtime] Fetching ephemeral key...'); + const res = await fetch(OPENAI_SESSIONS_URL, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ model, voice }), + }); + if (!res.ok) { + const text = await res.text(); + throw new Error(`Ephemeral key fetch failed: ${res.status} ${text}`); + } + const data = await res.json(); + return data.client_secret.value; + } + + private setupDataChannel(): void { + if (!this.dc) return; + const dc = this.dc as any; + + dc.addEventListener('message', (e: any) => { + try { + const data = JSON.parse(e.data); + this.handleServerEvent(data); + } catch { + console.warn('[OpenAIRealtime] Unparseable event:', e.data?.slice?.(0, 100)); + } + }); + + dc.addEventListener('close', () => { + console.log('[OpenAIRealtime] DataChannel closed'); + this.handleDisconnect('datachannel_closed'); + }); + + dc.addEventListener('error', (e: any) => { + console.error('[OpenAIRealtime] DataChannel error:', e); + this.callbacks.onError(new Error('DataChannel error')); + }); + } + + private handleServerEvent(event: any): void { + switch (event.type) { + case 'session.created': + console.log('[OpenAIRealtime] Session created:', event.session?.id); + break; + + case 'response.audio.delta': + this.setSpeaking(true); + break; + + case 'response.audio.done': + case 'response.done': + this.setSpeaking(false); + break; + + case 
'response.cancelled': + this.setSpeaking(false); + break; + + case 'input_audio_buffer.speech_started': + // User started talking — model should stop + this.setSpeaking(false); + break; + + case 'response.function_call_arguments.done': + this.handleToolCall(event); + break; + + case 'error': + console.error('[OpenAIRealtime] Server error:', event.error); + this.callbacks.onError(new Error(event.error?.message || 'OpenAI server error')); + break; + + case 'session.ended': + console.log('[OpenAIRealtime] Session ended by server'); + this.handleDisconnect('session_ended'); + break; + } + } + + private async handleToolCall(event: any): Promise { + const { name, arguments: argsJson, call_id } = event; + console.log(`[OpenAIRealtime] Tool call: ${name}`, argsJson); + + const tool = (realtimeClientTools as any)[name]; + if (!tool) { + console.error(`[OpenAIRealtime] Unknown tool: ${name}`); + this.sendToolResult(call_id, `error: unknown tool "${name}"`); + return; + } + + try { + const args = JSON.parse(argsJson); + const result = await tool(args); + this.sendToolResult(call_id, typeof result === 'string' ? 
result : JSON.stringify(result)); + } catch (err: any) { + console.error(`[OpenAIRealtime] Tool "${name}" failed:`, err); + this.sendToolResult(call_id, `error: ${err.message}`); + } + } + + private sendToolResult(callId: string, output: string): void { + this.sendEvent({ + type: 'conversation.item.create', + item: { + type: 'function_call_output', + call_id: callId, + output, + }, + }); + // Trigger the model to continue speaking with the result + this.sendEvent({ type: 'response.create' }); + } + + private setSpeaking(speaking: boolean): void { + if (this.speakingTimeout) { + clearTimeout(this.speakingTimeout); + this.speakingTimeout = null; + } + + if (speaking && !this.isSpeaking) { + this.isSpeaking = true; + this.callbacks.onModeChange('speaking'); + } else if (!speaking && this.isSpeaking) { + // Debounce idle transition to avoid flicker + this.speakingTimeout = setTimeout(() => { + this.isSpeaking = false; + this.callbacks.onModeChange('idle'); + }, 300); + } + } + + private sendEvent(event: PendingEvent): void { + if (!this.dc || (this.dc as any).readyState !== 'open') { + this.pendingEvents.push(event); + return; + } + try { + (this.dc as any).send(JSON.stringify(event)); + } catch (err) { + console.error('[OpenAIRealtime] Failed to send event:', err); + } + } + + private flushPendingEvents(): void { + while (this.pendingEvents.length > 0) { + const event = this.pendingEvents.shift()!; + this.sendEvent(event); + } + } + + private waitForDataChannelOpen(timeoutMs: number): Promise { + return new Promise((resolve, reject) => { + if (!this.dc) return reject(new Error('No data channel')); + if ((this.dc as any).readyState === 'open') { + this.flushPendingEvents(); + return resolve(); + } + const timeout = setTimeout(() => reject(new Error('DataChannel open timeout')), timeoutMs); + (this.dc as any).addEventListener('open', () => { + clearTimeout(timeout); + this.flushPendingEvents(); + resolve(); + }); + }); + } + + private handleDisconnect(reason: string): 
void { + if (!this.connected && !this.connecting) return; + console.log('[OpenAIRealtime] Disconnecting:', reason); + this.cleanup(); + this.callbacks.onDisconnect(reason); + } + + private cleanup(): void { + this.connected = false; + this.connecting = false; + this.isSpeaking = false; + this.pendingEvents = []; + + if (this.speakingTimeout) { + clearTimeout(this.speakingTimeout); + this.speakingTimeout = null; + } + if (this.sessionTimer) { + clearTimeout(this.sessionTimer); + this.sessionTimer = null; + } + if (this.dc) { + try { (this.dc as any).close(); } catch {} + this.dc = null; + } + if (this.localStream) { + try { this.localStream.getTracks().forEach(t => t.stop()); } catch {} + this.localStream = null; + } + if (this.pc) { + try { (this.pc as any).close(); } catch {} + this.pc = null; + } + } +} diff --git a/packages/happy-app/sources/realtime/openai/systemPrompt.ts b/packages/happy-app/sources/realtime/openai/systemPrompt.ts new file mode 100644 index 0000000000..83a3c74904 --- /dev/null +++ b/packages/happy-app/sources/realtime/openai/systemPrompt.ts @@ -0,0 +1,57 @@ +/** + * Builds the OpenAI Realtime session instructions. + * This replaces the ElevenLabs dashboard system prompt — now versioned in code. + */ + +export function buildSystemPrompt(initialContext: string): string { + return `You are Happy Voice, a proactive voice assistant that helps users manage \ +MULTIPLE Claude Code sessions from their phone while driving or away from \ +their keyboard. + +You act as an aggregating project manager across all active sessions. You will \ +receive context updates from multiple sessions simultaneously. + +ACTIVE SESSIONS: +${initialContext || 'No sessions reported yet. Sessions will appear as context updates.'} + +YOUR RESPONSIBILITIES: +1. Proactively inform the user when any session finishes work, encounters an \ +error, or needs permission — don't wait to be asked. +2. Route messages to the correct session based on the user's intent. 
If they \ +say "on the trading bot, add error handling", match "trading bot" to the \ +session folder name and use the messageClaudeCode tool with the session parameter. +3. When permission requests come in, tell the user which project needs it and \ +what it wants to do. Keep it brief: "Trading bot wants to run npm install. Approve?" +4. When the user says "approve" or "deny" without specifying a session, apply \ +it to whichever session has a pending request. +5. If the user asks for a status update, summarize all active sessions briefly. + +VOICE STYLE: +- Use project folder names to identify sessions, not IDs. +- Be proactive: when a session finishes or needs attention, speak up immediately. +- When Claude sends an explanation, analysis, or detailed response — relay it fully and clearly. Do NOT reduce it to a one-liner. The user needs the full context to make decisions. +- For status updates and short notifications (e.g. "session finished", "permission request"), keep it brief. +- Never read raw code blocks, file paths, or JSON verbatim. Describe what the code does instead. +- Adjust length to the content: short for events, full for explanations. + +SILENCE BEHAVIOR (CRITICAL): +- Do NOT fill silence. The user is driving and thinking. +- NEVER ask "is there anything else I can help with?" or similar filler. +- NEVER prompt the user to speak when there is a pause. +- Only speak when YOU have something to report (session update, permission \ +request, error) or when the USER speaks to you first. +- Silence is normal. Wait quietly. The user will talk when they need you. + +CONTEXT UPDATES: +- You will receive context updates prefixed with [CONTEXT UPDATE]. These are \ +informational — do NOT respond to them verbally unless they require user attention \ +(like a permission request or an error). +- Only speak about context updates when they are actionable for the user. + +TOOLS: +- messageClaudeCode: Send a message to a session. 
You MUST always specify the \ +"session" parameter with the folder name. +- processPermissionRequest: Approve or deny. You MUST always specify the \ +"session" parameter. +- switchSession: Switch the app screen to show a specific session.`; +} diff --git a/packages/happy-app/sources/realtime/openai/toolTranslator.ts b/packages/happy-app/sources/realtime/openai/toolTranslator.ts new file mode 100644 index 0000000000..651c4072c3 --- /dev/null +++ b/packages/happy-app/sources/realtime/openai/toolTranslator.ts @@ -0,0 +1,61 @@ +/** + * Translates the realtimeClientTools definitions into OpenAI Realtime + * function-calling format. The tool *handlers* stay in realtimeClientTools.ts + * unchanged — this only produces the schema the model sees. + */ + +export interface OpenAIToolDef { + type: 'function'; + name: string; + description: string; + parameters: { + type: 'object'; + properties: Record; + required: string[]; + }; +} + +/** + * Static tool schemas matching the Zod schemas in realtimeClientTools.ts. + * If a tool is added/changed there, update here too. + */ +export const OPENAI_TOOL_DEFINITIONS: OpenAIToolDef[] = [ + { + type: 'function', + name: 'messageClaudeCode', + description: "Send a message to Claude Code. You MUST specify the 'session' parameter with the project folder name (e.g. 'trading-bot', 'family-journal'). Always ask the user to clarify which session if unclear.", + parameters: { + type: 'object', + properties: { + message: { type: 'string', description: 'The message to send to Claude Code' }, + session: { type: 'string', description: "Target session name (folder name like 'trading-bot'). Always required." }, + }, + required: ['message', 'session'], + }, + }, + { + type: 'function', + name: 'processPermissionRequest', + description: "Approve or deny a permission request from Claude Code. You MUST specify the 'session' parameter with the project folder name. 
Always confirm which session with the user if unclear.", + parameters: { + type: 'object', + properties: { + decision: { type: 'string', description: "Whether to allow or deny the permission request. Must be 'allow' or 'deny'." }, + session: { type: 'string', description: 'Target session name (folder name). Always required.' }, + }, + required: ['decision', 'session'], + }, + }, + { + type: 'function', + name: 'switchSession', + description: 'Switch the app screen to display a specific session. Use when the user asks to see a session, or when context makes it clear they want to view a different project. Always specify the session name.', + parameters: { + type: 'object', + properties: { + session: { type: 'string', description: "Target session name (folder name like 'trading-bot'). Always required." }, + }, + required: ['session'], + }, + }, +]; diff --git a/packages/happy-app/sources/realtime/realtimeClientTools.ts b/packages/happy-app/sources/realtime/realtimeClientTools.ts index 091e29cb19..6c011a9ff2 100644 --- a/packages/happy-app/sources/realtime/realtimeClientTools.ts +++ b/packages/happy-app/sources/realtime/realtimeClientTools.ts @@ -4,76 +4,155 @@ import { sessionAllow, sessionDeny } from '@/sync/ops'; import { storage } from '@/sync/storage'; import { trackPermissionResponse } from '@/track'; import { getCurrentRealtimeSessionId } from './RealtimeSession'; +import { getSessionLabel } from './hooks/contextFormatters'; +import { router } from 'expo-router'; + +/** + * Resolve a session ID from a user-provided session name/hint. + * Matches against folder name, summary text, or raw session ID. + * Falls back to the currently focused session if no hint given. 
+ */ +function resolveSessionId(sessionHint?: string): string | null { + // No hint — use focused session + if (!sessionHint || sessionHint.trim() === '') { + return getCurrentRealtimeSessionId(); + } + + const hint = sessionHint.toLowerCase().trim(); + const sessions = storage.getState().getActiveSessions(); + + // Try exact folder name match first + for (const s of sessions) { + if (s.metadata?.path) { + const folder = s.metadata.path.split('/').filter(Boolean).pop()?.toLowerCase(); + if (folder === hint) return s.id; + } + } + + // Try partial folder name match + for (const s of sessions) { + if (s.metadata?.path) { + const folder = s.metadata.path.split('/').filter(Boolean).pop()?.toLowerCase() || ''; + if (folder.includes(hint) || hint.includes(folder)) return s.id; + } + } + + // Try summary text match + for (const s of sessions) { + const summary = s.metadata?.summary?.text?.toLowerCase() || ''; + if (summary.includes(hint)) return s.id; + } + + // Try session ID prefix + for (const s of sessions) { + if (s.id.toLowerCase().startsWith(hint)) return s.id; + } + + return null; +} + +/** + * Navigate to a session screen using expo-router's imperative API. + * Safe to call from outside React components. + */ +function navigateToSessionImperative(sessionId: string) { + try { + router.navigate(`/session/${sessionId}`, { + dangerouslySingular(name, params) { + return 'session'; + }, + }); + } catch (error) { + console.error('Failed to navigate to session:', error); + } +} /** * Static client tools for the realtime voice interface. - * These tools allow the voice assistant to interact with Claude Code. + * These tools allow the voice assistant to interact with Claude Code + * across multiple sessions. */ export const realtimeClientTools = { /** - * Send a message to Claude Code + * Send a message to Claude Code. + * Supports multi-session routing via optional session parameter. 
*/ messageClaudeCode: async (parameters: unknown) => { - // Parse and validate the message parameter using Zod const messageSchema = z.object({ - message: z.string().min(1, 'Message cannot be empty') + message: z.string().min(1, 'Message cannot be empty'), + session: z.string().min(1, 'Session name is required') }); - const parsedMessage = messageSchema.safeParse(parameters); + const parsed = messageSchema.safeParse(parameters); - if (!parsedMessage.success) { - console.error('❌ Invalid message parameter:', parsedMessage.error); - return "error (invalid message parameter)"; + if (!parsed.success) { + console.error('❌ Invalid parameters:', parsed.error); + const active = storage.getState().getActiveSessions(); + const names = active.map(s => `"${getSessionLabel(s)}"`).join(', '); + return `error (both message and session are required). Available sessions: ${names}`; } - const message = parsedMessage.data.message; - const sessionId = getCurrentRealtimeSessionId(); - + const { message, session: sessionHint } = parsed.data; + const sessionId = resolveSessionId(sessionHint); + if (!sessionId) { - console.error('❌ No active session'); - return "error (no active session)"; + // List available sessions to help the agent + const active = storage.getState().getActiveSessions(); + const names = active.map(s => `"${getSessionLabel(s)}"`).join(', '); + return `error (could not find session matching "${sessionHint}"). Available sessions: ${names}`; } - - console.log('🔍 messageClaudeCode called with:', message); - console.log('📤 Sending message to session:', sessionId); + + const sessionObj = storage.getState().sessions[sessionId]; + const label = sessionObj ? 
getSessionLabel(sessionObj) : sessionId.slice(0, 8); + + console.log(`📤 Sending message to "${label}" (${sessionId}):`, message); sync.sendMessage(sessionId, message); - return "sent [DO NOT say anything else, simply say 'sent']"; + return `sent to "${label}" [DO NOT say anything else, simply say 'sent to ${label}']`; }, /** - * Process a permission request from Claude Code + * Process a permission request from Claude Code. + * Supports multi-session routing via optional session parameter. */ processPermissionRequest: async (parameters: unknown) => { const messageSchema = z.object({ - decision: z.enum(['allow', 'deny']) + decision: z.enum(['allow', 'deny']), + session: z.string().min(1, 'Session name is required') }); - const parsedMessage = messageSchema.safeParse(parameters); + const parsed = messageSchema.safeParse(parameters); - if (!parsedMessage.success) { - console.error('❌ Invalid decision parameter:', parsedMessage.error); - return "error (invalid decision parameter, expected 'allow' or 'deny')"; + if (!parsed.success) { + console.error('❌ Invalid parameters:', parsed.error); + // List sessions with pending requests to help the agent + const active = storage.getState().getActiveSessions(); + const withRequests = active.filter(s => { + const reqs = s.agentState?.requests; + return reqs && Object.keys(reqs).length > 0; + }); + if (withRequests.length > 0) { + const names = withRequests.map(s => `"${getSessionLabel(s)}"`).join(', '); + return `error (both decision and session are required). 
Sessions with pending requests: ${names}`; + } + return "error (both decision and session are required, expected 'allow'/'deny' and a session name)"; } - const decision = parsedMessage.data.decision; - const sessionId = getCurrentRealtimeSessionId(); - + const { decision, session: sessionHint } = parsed.data; + const sessionId = resolveSessionId(sessionHint); + if (!sessionId) { - console.error('❌ No active session'); - return "error (no active session)"; + return "error (no active session found)"; } - - console.log('🔍 processPermissionRequest called with:', decision); - - // Get the current session to check for permission requests + const session = storage.getState().sessions[sessionId]; const requests = session?.agentState?.requests; - + if (!requests || Object.keys(requests).length === 0) { - console.error('❌ No active permission request'); - return "error (no active permission request)"; + const label = session ? getSessionLabel(session) : sessionId.slice(0, 8); + return `error (no pending permission request in "${label}")`; } - + const requestId = Object.keys(requests)[0]; - + const label = session ? getSessionLabel(session) : sessionId.slice(0, 8); + try { if (decision === 'allow') { await sessionAllow(sessionId, requestId); @@ -82,10 +161,41 @@ export const realtimeClientTools = { await sessionDeny(sessionId, requestId); trackPermissionResponse(false); } - return "done [DO NOT say anything else, simply say 'done']"; + return `done [DO NOT say anything else, simply say '${decision === 'allow' ? 'approved' : 'denied'} for ${label}']`; } catch (error) { console.error('❌ Failed to process permission:', error); - return `error (failed to ${decision} permission)`; + return `error (failed to ${decision} permission for "${label}")`; + } + }, + + /** + * Switch the app screen to show a specific session. 
+ */ + switchSession: async (parameters: unknown) => { + const schema = z.object({ + session: z.string().min(1, 'Session name is required') + }); + const parsed = schema.safeParse(parameters); + + if (!parsed.success) { + const active = storage.getState().getActiveSessions(); + const names = active.map(s => `"${getSessionLabel(s)}"`).join(', '); + return `error (session name is required). Available sessions: ${names}`; } + + const sessionId = resolveSessionId(parsed.data.session); + + if (!sessionId) { + const active = storage.getState().getActiveSessions(); + const names = active.map(s => `"${getSessionLabel(s)}"`).join(', '); + return `error (could not find session "${parsed.data.session}"). Available sessions: ${names}`; + } + + const sessionObj = storage.getState().sessions[sessionId]; + const label = sessionObj ? getSessionLabel(sessionObj) : sessionId.slice(0, 8); + + navigateToSessionImperative(sessionId); + + return `switched to "${label}" [DO NOT say anything else, simply say 'switched to ${label}']`; } -}; \ No newline at end of file +}; diff --git a/packages/happy-app/sources/realtime/types.ts b/packages/happy-app/sources/realtime/types.ts index 2059ffbd64..4d503ab313 100644 --- a/packages/happy-app/sources/realtime/types.ts +++ b/packages/happy-app/sources/realtime/types.ts @@ -3,6 +3,8 @@ export interface VoiceSessionConfig { initialContext?: string; token?: string; agentId?: string; + clientSecret?: string; + apiKey?: string; } export interface VoiceSession { @@ -13,4 +15,4 @@ export interface VoiceSession { } export type ConversationStatus = 'disconnected' | 'connecting' | 'connected'; -export type ConversationMode = 'speaking' | 'listening'; \ No newline at end of file +export type ConversationMode = 'speaking' | 'listening'; diff --git a/packages/happy-app/sources/sync/appConfig.ts b/packages/happy-app/sources/sync/appConfig.ts index 3c16daacbe..c4138dd431 100644 --- a/packages/happy-app/sources/sync/appConfig.ts +++ 
b/packages/happy-app/sources/sync/appConfig.ts @@ -8,6 +8,9 @@ export interface AppConfig { revenueCatStripeKey?: string; elevenLabsAgentIdDev?: string; elevenLabsAgentIdProd?: string; + openAiApiKey?: string; + openAiRealtimeModel?: string; + openAiRealtimeVoice?: string; serverUrl?: string; } @@ -96,6 +99,18 @@ export function loadAppConfig(): AppConfig { console.log('[loadAppConfig] Override serverUrl from EXPO_PUBLIC_SERVER_URL'); config.serverUrl = process.env.EXPO_PUBLIC_SERVER_URL; } + if (process.env.EXPO_PUBLIC_OPENAI_API_KEY && config.openAiApiKey !== process.env.EXPO_PUBLIC_OPENAI_API_KEY) { + console.log('[loadAppConfig] Override openAiApiKey from EXPO_PUBLIC_OPENAI_API_KEY'); + config.openAiApiKey = process.env.EXPO_PUBLIC_OPENAI_API_KEY; + } + if (process.env.EXPO_PUBLIC_OPENAI_REALTIME_MODEL && config.openAiRealtimeModel !== process.env.EXPO_PUBLIC_OPENAI_REALTIME_MODEL) { + console.log('[loadAppConfig] Override openAiRealtimeModel from EXPO_PUBLIC_OPENAI_REALTIME_MODEL'); + config.openAiRealtimeModel = process.env.EXPO_PUBLIC_OPENAI_REALTIME_MODEL; + } + if (process.env.EXPO_PUBLIC_OPENAI_REALTIME_VOICE && config.openAiRealtimeVoice !== process.env.EXPO_PUBLIC_OPENAI_REALTIME_VOICE) { + console.log('[loadAppConfig] Override openAiRealtimeVoice from EXPO_PUBLIC_OPENAI_REALTIME_VOICE'); + config.openAiRealtimeVoice = process.env.EXPO_PUBLIC_OPENAI_REALTIME_VOICE; + } return config as AppConfig; } \ No newline at end of file