diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index b2781d26c..17d2956b8 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -9,7 +9,7 @@ { "name": "omni", "description": "Full Omni platform control — multichannel messaging, automations, events, batch ops via the omni CLI", - "version": "2.260410.1", + "version": "2.260618.3", "author": { "name": "Automagik" }, diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock new file mode 100644 index 000000000..13216bb44 --- /dev/null +++ b/.claude/scheduled_tasks.lock @@ -0,0 +1 @@ +{"sessionId":"66ec30dc-2df2-4983-9917-28cf1f5a2136","pid":1548845,"acquiredAt":1776627038192} \ No newline at end of file diff --git a/.env.example b/.env.example index db4f3d37a..0ec8a353c 100644 --- a/.env.example +++ b/.env.example @@ -39,6 +39,15 @@ API_HOST=0.0.0.0 # Set to true to have PM2 manage the API server (local dev) API_MANAGED=true +# ----------------------------------------------------------------------------- +# A2A Protocol +# ----------------------------------------------------------------------------- +# Set true to expose A2A agent-card discovery and JSON-RPC endpoints. +# A2A_ENABLED=false + +# Max time SendMessage waits for a terminal task before returning current state. +# A2A_SEND_WAIT_MS=30000 + # ----------------------------------------------------------------------------- # Sentry (Error Tracking & Performance Monitoring) # DSN is hardcoded to the Omni project — override to redirect, set empty to disable @@ -47,8 +56,39 @@ API_MANAGED=true # SENTRY_TRACES_SAMPLE_RATE=0.1 # SENTRY_ENVIRONMENT=development +# ----------------------------------------------------------------------------- +# OpenTelemetry (optional) +# Point at your OTLP collector (e.g. SigNoz, Honeycomb, Jaeger). Set these in +# .env so PM2 picks them up via `set -a && . ./.env` instead of inheriting +# conflicting OTEL_* from the launching shell. 
+# ----------------------------------------------------------------------------- +# OTEL_EXPORTER_OTLP_ENDPOINT=http://<collector-host>:4318 +# OTEL_SERVICE_NAME=omni-api +# OTEL_RESOURCE_ATTRIBUTES=deployment.environment=development,service.namespace=omni,owner=<team> + # ----------------------------------------------------------------------------- # Secrets (add as needed) # ----------------------------------------------------------------------------- # OPENAI_API_KEY= # ANTHROPIC_API_KEY= + +# ----------------------------------------------------------------------------- +# WhatsApp outbound timing +# ----------------------------------------------------------------------------- +# Humanized delay before outgoing WhatsApp actions. Set ENABLED=false to disable. +# WHATSAPP_HUMAN_DELAY_ENABLED=true +# WHATSAPP_HUMAN_DELAY_MIN_MS=1500 +# WHATSAPP_HUMAN_DELAY_MAX_MS=3500 + +# Typing simulation before text/caption sends. Set ENABLED=false to disable. +# Delay is min(BASE_MS + text.length * PER_CHAR_MS, MAX_MS). +# WHATSAPP_TYPING_SIMULATION_ENABLED=true +# WHATSAPP_TYPING_DELAY_BASE_MS=800 +# WHATSAPP_TYPING_DELAY_PER_CHAR_MS=30 +# WHATSAPP_TYPING_DELAY_MAX_MS=4000 +# WHATSAPP_TYPING_DEFAULT_MS=3000 + +# Exponential backoff used after WhatsApp/Baileys rate-limit errors. 
+# WHATSAPP_RATE_LIMIT_INITIAL_BACKOFF_MS=1000 +# WHATSAPP_RATE_LIMIT_MAX_BACKOFF_MS=30000 +# WHATSAPP_RATE_LIMIT_JITTER_FACTOR=0.2 diff --git a/.genie/.gitignore b/.genie/.gitignore deleted file mode 100644 index 1503fc1bd..000000000 --- a/.genie/.gitignore +++ /dev/null @@ -1 +0,0 @@ -workers.json diff --git a/.genie/DREAM.md b/.genie/DREAM.md deleted file mode 100644 index 2926083e7..000000000 --- a/.genie/DREAM.md +++ /dev/null @@ -1,14 +0,0 @@ -# Dream Session — 2026-03-29 (Omni) - -## Wishes (6) - -| merge_order | slug | branch | wish_path | depends_on | GH Issue | -|-------------|------|--------|-----------|------------|----------| -| 1 | sdk-compliance-test-suite | feat/sdk-compliance-test-suite | .genie/wishes/sdk-compliance-test-suite/WISH.md | — | #82 | -| 1 | remove-baileys-logger-from-core | feat/remove-baileys-logger-from-core | .genie/wishes/remove-baileys-logger-from-core/WISH.md | — | #90 | -| 1 | remove-channel-leaks-from-core | feat/remove-channel-leaks-from-core | .genie/wishes/remove-channel-leaks-from-core/WISH.md | — | #88 | -| 1 | standardize-sendtyping | feat/standardize-sendtyping | .genie/wishes/standardize-sendtyping/WISH.md | — | #86 | -| 1 | channel-error-migration | feat/channel-error-migration | .genie/wishes/channel-error-migration/WISH.md | — | #81 | -| 1 | route-config-overrides | feat/route-config-overrides | .genie/wishes/route-config-overrides/WISH.md | — | — | - -All 6 wishes are independent — Layer 1, full parallel execution. 
diff --git a/.genie/agents.json b/.genie/agents.json deleted file mode 100644 index c948287e0..000000000 --- a/.genie/agents.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "entries": { - "omni-qa": { - "name": "omni-qa", - "dir": "/home/genie/agents/namastexlabs/omni/omni-qa-engineer", - "promptMode": "append", - "registeredAt": "2026-03-20T12:19:42.948Z" - } - }, - "lastUpdated": "2026-03-20T12:19:42.959Z" -} \ No newline at end of file diff --git a/.genie/brainstorm.md b/.genie/brainstorm.md deleted file mode 100644 index e3204e5cd..000000000 --- a/.genie/brainstorm.md +++ /dev/null @@ -1,21 +0,0 @@ -# Brainstorm Jar - -## Raw -- **session-observatory** — Agent session observability layer (#292). Scope unclear — needs decomposition. Issue closed as stale. - -## Ready -- **gupshup-channel-rewrite** — Full rewrite of channel-gupshup: Meta-format inbound, Custom Integration outbound, 10 msg types, 7 outbound types. DESIGN.md ready. - -## Simmering - -## Ready -- **gupshup-webhook-native-format** — Rewrite webhook handler: Meta/WA format → Gupshup native format. 7 types mapped, status events ignored. DESIGN.md ready. - -## Poured -- **cli-360-agents-update-events-get-verbose-logs** — CLI gaps from #360: `agents update`, `events get`, `logs --verbose/--json`. DESIGN.md ready (WRS 100). One wish, 3 groups (G1 agents-update → G2 events-get → G3 logs-verbose). LogEntry schema gets explicit `data?: Record<string, unknown>`. -- **gupshup-handoff-message** — Gupshup HANDOFF msg_type: POST /messages/send/handoff, agentPaused chain, follow-up disarm, extra_info field. DESIGN.md ready. -- **fix-omni-bugs-243-244** — API key chat scoping (#244) + event-driven media pipeline (#243). DESIGN.md ready. Council reviewed: APPROVE. -- **route-config-overrides** — Per-user/per-agent debounce/ack/split overrides on routes (#242). DESIGN.md ready. Kills the multi-Omni-installation hack. 
-- **omni-docs-cleanup** — Complete CLI reference (20 missing command groups, ~80 subcommands) + routing skill rewrite + multi-instance guide (#252, #240). DESIGN.md ready. -- **omni-agentic-cli** — Turn-based execution mode (third provider mode) + 9 multimodal verb commands + PG-backed context + provider-agnostic media (Gemini/ElevenLabs/Groq) + instance scoping + persons CLI (#259). 13 groups, 4 waves. SHIPPED: PR #349 merged 2026-04-05 -- **fix-person-deduplication** — Fix identity resolution pipeline: use resolvedSenderPhone for LID linking, cross-instance matching, sync-worker guard, data migration, orphan cleanup. 5 groups, 3 waves. SHIPPED: PR #348 merged 2026-04-05 diff --git a/.genie/brainstorms/_archived/fix-omni-bugs-243-244/DESIGN.md b/.genie/brainstorms/_archived/fix-omni-bugs-243-244/DESIGN.md deleted file mode 100644 index 4018b3075..000000000 --- a/.genie/brainstorms/_archived/fix-omni-bugs-243-244/DESIGN.md +++ /dev/null @@ -1,280 +0,0 @@ -# Design: Fix API Key Chat Scoping + Event-Driven Media Pipeline - -| Field | Value | -|-------|-------| -| **Slug** | `fix-omni-bugs-243-244` | -| **Date** | 2026-03-24 | -| **WRS** | 100/100 | - -## Problem -Two production bugs: (1) scoped API keys leak cross-instance chat data, and (2) the media processing pipeline uses DB polling with a hardcoded timeout instead of the NATS event system — dropping media results on batch messages. 
- -## Scope -### IN -- #244: Apply instance scoping to `GET /chats` list endpoint -- #243: Replace DB-polling `waitForMediaProcessing()` with event-driven await on `media.processed` -- Ensure media-processor publishes events on BOTH success AND failure paths -- Tests for both fixes - -### OUT -- No new DB migration (no `mediaWaitTimeoutMs` column — the timeout is deleted, not configured) -- Route-level config overrides (separate wish) -- Changes to media-processing service itself (Gemini/Whisper) - -## Fix 1: Chat List Instance Scoping (#244) - -### Root Cause -`chats.ts:159` — `GET /chats` calls `services.chats.list(query)` without passing API key instance restrictions. Every other list endpoint applies scoping. - -### Fix -1. Add `instanceIds?: string[]` to `ListChatsOptions` interface in `services/chats.ts` -2. Add `if (instanceIds?.length) conditions.push(inArray(chats.instanceId, instanceIds))` in `buildListConditions()` -3. In `routes/v2/chats.ts` GET `/`, extract `apiKey` from context and pass `instanceIds`: -```typescript -const apiKey = c.get('apiKey'); -const queryWithAccess = apiKey?.instanceIds - ? { ...query, instanceIds: apiKey.instanceIds } - : query; -const result = await services.chats.list(queryWithAccess); -``` - -### Files -- `packages/api/src/services/chats.ts` — add `instanceIds` to interface + condition -- `packages/api/src/routes/v2/chats.ts` — pass instanceIds from apiKey context - -## Fix 2: Event-Driven Media Pipeline (#243) - -### Root Cause — Architectural -The dispatcher and media-processor are two independent NATS subscribers of `message.received`. They coordinate through **DB column polling** instead of using the event system: - -``` -CURRENT (broken): - message.received ──┬── media-processor → process → write DB column → publish media.processed (NOBODY LISTENS) - └── agent-dispatcher → debounce → poll DB column 120× → timeout → "[unavailable]" -``` - -This is fundamentally wrong. 
We have NATS JetStream with durable consumers, dead letter queues, at-least-once delivery, and auto-retry — and we're polling a database 120 times instead. - -### Fix — Event-Driven Completion - -``` -NEW (correct): - message.received ──┬── media-processor → process → write DB → publish media.processed ─┐ - │ (OR on failure: media.failed) ─┐│ - └── agent-dispatcher → debounce ──────────────────────────────────────┤ - │ - media.processed ────── agent-dispatcher (durable subscriber) ← resolves pending await ──┘ - media.failed ────── agent-dispatcher (durable subscriber) ← resolves as error ───────┘ -``` - -### Implementation - -#### A. Media Processor — publish on ALL paths - -Currently the media-processor only publishes `media.processed` on success (line 367). On failure (line 338), it writes a DB error marker but publishes nothing. Fix: - -```typescript -// On failure path (line 338-353), ADD event publication: -if (!result.success) { - // ... existing error marker write ... - - // NEW: publish failure event so dispatcher doesn't wait forever - await ctx.eventBus.publish('media.processed', { - eventId: eventId ?? media.messageId, - mediaId: media.messageId, - processingType: result.processingType ?? inferProcessingType(content.type), - content: '', // empty = failed - error: result.errorMessage ?? 'unknown', - }); - return; -} -``` - -> Decision: Use `media.processed` with empty content + error field, not a new `media.failed` event type. Keeps the contract simple — one event type, dispatcher checks for content. - -Also: the outer try/catch (line 437-449) currently swallows errors silently. Add a `media.processed` publish there too for truly unexpected crashes. - -#### B. Agent Dispatcher — subscribe to `media.processed`, delete polling - -1. 
**New: Media completion registry** — in-memory Map that tracks pending and completed media: - -```typescript -// Pending media awaits, keyed by DB message UUID -const mediaCompletions = new Map<string, { - resolve: (result: { content: string; error?: string }) => void; - reject: (error: Error) => void; -}>(); - -// Cache of already-completed results (event arrived before dispatcher needed it) -const mediaResultCache = new Map<string, { content: string; error?: string }>(); -``` - -2. **New: Subscribe to `media.processed`** with durable consumer: - -```typescript -await eventBus.subscribe( - 'media.processed', - async (event) => { - const payload = event.payload as MediaProcessedPayload; - const { mediaId, content, error } = payload; - - // If dispatcher is already waiting → resolve the promise - const pending = mediaCompletions.get(mediaId); - if (pending) { - pending.resolve({ content, error }); - mediaCompletions.delete(mediaId); - return; - } - - // If dispatcher hasn't asked yet → cache the result (TTL 5min) - mediaResultCache.set(mediaId, { content, error }); - setTimeout(() => mediaResultCache.delete(mediaId), 300_000); - }, - { - durable: 'agent-dispatcher-media', - queue: 'agent-dispatcher-media', - startFrom: 'new', - }, -); -``` - -3. **Replace `waitForMediaProcessing()`** — delete the 30-line polling loop, replace with: - -```typescript -async function awaitMediaProcessing( - services: Services, - instanceId: string, - chatId: string, - externalId: string, - contentType: string, -): Promise<{ content: string | null; localPath: string | null }> { - const column = getProcessedColumn(contentType); - if (!column) return MEDIA_WAIT_NULL; - - const chat = await services.chats.findByExternalIdSmart(instanceId, chatId); - if (!chat) return MEDIA_WAIT_NULL; - - const msg = await services.messages.getByExternalId(chat.id, externalId); - if (!msg) return MEDIA_WAIT_NULL; - - // 1. Check if result already in DB (processing finished before debounce fired) - const existing = checkProcessedColumn(msg, column); - if (existing !== 'pending') { - return existing === 'error' ? 
MEDIA_WAIT_NULL : existing; - } - - // 2. Check event cache (event arrived before we asked) - const cached = mediaResultCache.get(msg.id); - if (cached) { - mediaResultCache.delete(msg.id); - if (!cached.content || cached.error) return MEDIA_WAIT_NULL; - const localPath = msg.mediaLocalPath ? resolve(join(MEDIA_BASE_PATH, msg.mediaLocalPath)) : null; - return { content: cached.content, localPath }; - } - - // 3. Await the event (NATS guarantees delivery) - const result = await new Promise<{ content: string; error?: string }>((resolve, reject) => { - mediaCompletions.set(msg.id, { resolve, reject }); - }); - - if (!result.content || result.error) return MEDIA_WAIT_NULL; - - // Re-read message for localPath (media storage may have updated it) - const updated = await services.messages.getByExternalId(chat.id, externalId); - const localPath = updated?.mediaLocalPath ? resolve(join(MEDIA_BASE_PATH, updated.mediaLocalPath)) : null; - return { content: result.content, localPath }; -} -``` - -**Zero polling. Zero timeouts. 
NATS guarantees delivery.** - -### Delivery Guarantees - -| Scenario | What Happens | -|----------|-------------| -| Processing succeeds | `media.processed` published → dispatcher resolves promise | -| Processing fails (API error) | `media.processed` with empty content published → dispatcher gets null | -| Processor crashes mid-processing | NATS redelivers `message.received` (maxRetries: 2) → processor retries | -| All retries exhausted | Dead letter queue → auto-retry 1h/6h/24h → manual intervention | -| Dispatcher restarts | Durable consumer replays unacked `media.processed` events | -| Event arrives before dispatcher needs it | Cached in `mediaResultCache` (5min TTL) | -| Event arrives after dispatcher asks | Resolves pending promise in `mediaCompletions` | -| NATS itself goes down | JetStream persistence on disk → events replayed on reconnect | - -**Nothing is lost.** Every media message produces exactly one `media.processed` event (success or failure). The dispatcher has a durable consumer that will receive it — if not now, then on retry, or on restart, or from the dead letter queue. - -### What Gets Deleted -- `waitForMediaProcessing()` — 30 lines of DB polling loop -- `sleep(500)` import usage in media wait -- The hardcoded `60_000` deadline -- 120 DB queries per media message - -### What Gets Added -- `media.processed` event publication on failure path (~5 lines) -- `media.processed` durable subscription in dispatcher (~20 lines) -- `awaitMediaProcessing()` — event-driven replacement (~25 lines) -- `mediaCompletions` Map + `mediaResultCache` Map (~10 lines) - -**Net delta: approximately zero lines.** We're replacing polling with events. 
- -### MediaProcessedPayload Extension - -The existing `MediaProcessedPayload` needs an optional `error` field: - -```typescript -export interface MediaProcessedPayload { - eventId: string; - mediaId: string; - processingType: 'transcription' | 'description' | 'extraction'; - content: string; - model?: string; - provider?: string; - tokensUsed?: number; - error?: string; // NEW: set when processing failed -} -``` - -## Decisions -| Decision | Rationale | -|----------|-----------| -| Event-driven, not configurable timeout | Timeouts are duct tape. NATS gives us guaranteed delivery. Use it. | -| No new DB migration | The `mediaWaitTimeoutMs` column is unnecessary — there's no timeout to configure. | -| Single `media.processed` event for success+failure | Simpler than two event types. Empty content + error field = failure. | -| Cache + Promise Map pattern | Handles both race conditions: event-before-ask and ask-before-event. Standard pattern. | -| Durable consumer for media events | Survives dispatcher restarts. JetStream replays missed events. | -| Query-level filtering for chats (#244) | More efficient than post-fetch. Matches messages.ts pattern. | - -## Risks & Assumptions -| Risk | Severity | Mitigation | -|------|----------|------------| -| Promise leak if event never arrives (catastrophic NATS failure) | Low | Add periodic cleanup: promises older than 10min get rejected with error. This is a circuit breaker, not a timeout. Log as `media_promise_leaked` for alerting. | -| mediaResultCache grows unbounded | Low | 5min TTL with `setTimeout` cleanup. Also periodic sweep. | -| Existing `MediaProcessedPayload` type change | Low | Adding optional `error` field is backward compatible. 
| - -## Execution Groups - -### Group 1: Chat scoping fix (#244) -**Files:** -- `packages/api/src/services/chats.ts` — add `instanceIds` to ListChatsOptions + buildListConditions -- `packages/api/src/routes/v2/chats.ts` — extract apiKey, pass instanceIds -- Test: verify scoped key only sees its instance's chats - -### Group 2: Event-driven media pipeline (#243) -**Files:** -- `packages/core/src/events/types.ts` — add `error?` to MediaProcessedPayload -- `packages/api/src/plugins/media-processor.ts` — publish event on failure path + catch block -- `packages/api/src/plugins/agent-dispatcher.ts` — subscribe to `media.processed`, replace polling with event-await - -### Group 3: Tests -- Unit test: `awaitMediaProcessing()` resolves on event, returns null on error event -- Unit test: cache hit when event arrives before ask -- Integration test: scoped API key chat filtering - -## Success Criteria -- [ ] `GET /chats` with scoped API key only returns chats from allowed instances -- [ ] `waitForMediaProcessing()` polling loop is deleted — zero DB polling for media -- [ ] Dispatcher subscribes to `media.processed` with durable consumer -- [ ] Media processor publishes `media.processed` on BOTH success and failure -- [ ] 8 photos sent in batch → all 8 described, zero `[media processing unavailable]` -- [ ] Existing tests pass -- [ ] Close GitHub issues #243 and #244 diff --git a/.genie/brainstorms/_archived/fix-omni-bugs-243-244/DRAFT.md b/.genie/brainstorms/_archived/fix-omni-bugs-243-244/DRAFT.md deleted file mode 100644 index 3ceaed57c..000000000 --- a/.genie/brainstorms/_archived/fix-omni-bugs-243-244/DRAFT.md +++ /dev/null @@ -1,64 +0,0 @@ -# Fix Omni Bugs: API Key Scoping + Media Timeout - -## Problem Statement - -Two bugs that affect production multi-agent deployments: - -### Bug 1: Scoped API key returns cross-instance chats (#244) -**Severity: HIGH — security gap** - -API keys created with `instanceIds` restriction correctly filter `instances list` but the `chats 
list` endpoint returns chats from ALL instances. This breaks multi-agent isolation where each agent (Sofia, ClaudIA) should only see its own chats. - -**Root cause:** `packages/api/src/routes/v2/chats.ts` line 159-163 — the `GET /chats` handler calls `services.chats.list(query)` without applying `filterByInstanceAccess()`. The helper exists in `auth.ts:169` but is simply not called. - -**Evidence:** -```typescript -// chats.ts:159 — NO instance filtering -chatsRoutes.get('/', zValidator('query', listQuerySchema), async (c) => { - const query = c.req.valid('query'); - const services = c.get('services'); - const result = await services.chats.list(query); // ← returns ALL chats - return c.json({ items: result.items, ... }); -}); -``` - -Compare with `messages.ts:461` which correctly passes `instanceIds`: -```typescript -const queryWithAccess = apiKey?.instanceIds ? { ...query, instanceIds: apiKey.instanceIds } : query; -``` - -### Bug 2: Media processing timeout hardcoded to 60s (#243) -**Severity: MEDIUM — breaks batch media** - -`waitForMediaProcessing()` in `agent-dispatcher.ts:572` has `const deadline = Date.now() + 60_000`. When users send 8+ photos, later messages time out because the media processor queues them sequentially (~7s each via Gemini Vision). - -The instance already has `agentWaitForMedia` (boolean) and `ackTimeoutMs` (integer) in the DB schema, but there's no `mediaWaitTimeoutMs` field. The 60s is hardcoded. - -## Proposed Solution - -### Fix 1: Apply instance scoping to chats list -- In `chats.ts` GET `/`, get `apiKey` from context -- Either filter results with `filterByInstanceAccess()` or pass `instanceIds` to the query (like messages.ts does) -- Also check: any other list endpoints missing this filter? (persons, journeys, etc.) 
- -### Fix 2: Make media timeout configurable -- Add `mediaWaitTimeoutMs` column to instances table (default 60000) -- Pass the timeout from instance settings into `waitForMediaProcessing()` -- Schema migration + update CLI `instances update` to expose the flag - -## Key Files -- `packages/api/src/routes/v2/chats.ts` — missing filterByInstanceAccess -- `packages/api/src/middleware/auth.ts` — has filterByInstanceAccess helper -- `packages/api/src/plugins/agent-dispatcher.ts:572` — hardcoded 60_000 -- `packages/db/src/schema.ts` — needs mediaWaitTimeoutMs column -- `packages/api/src/routes/v2/instances.ts` — expose new field in update - -## Questions for Brainstorm -1. Should we filter chats at query level (pass instanceIds to SQL) or post-fetch (filterByInstanceAccess)? Query level is more efficient. -2. For #243, should the timeout scale automatically with queue depth, or just be a flat configurable value? -3. Are there other list endpoints missing instance scoping? Should we audit all of them? -4. What's a good default for mediaWaitTimeoutMs? 60s works for 1-3 images but fails for 8+. 120s? 180s? - -## GitHub Issues -- https://github.com/automagik-dev/omni/issues/244 -- https://github.com/automagik-dev/omni/issues/243 diff --git a/.genie/brainstorms/_archived/generate-image-native/DRAFT.md b/.genie/brainstorms/_archived/generate-image-native/DRAFT.md deleted file mode 100644 index ea0f95afe..000000000 --- a/.genie/brainstorms/_archived/generate-image-native/DRAFT.md +++ /dev/null @@ -1,445 +0,0 @@ -# Brainstorm: Omni Agentic CLI — Multimodal Verbs + Conversation Context - -| Field | Value | -|-------|-------| -| **Slug** | `omni-agentic-cli` | -| **Date** | 2026-04-05 | -| **Issue** | #259 | -| **WRS** | 100/100 | - -## Problem -Omni's CLI requires explicit instance IDs, JIDs, and flags for every interaction — the opposite of how humans use messaging apps. 
Multimodal capabilities (image gen, TTS, STT, vision, video gen) live in scattered Python scripts that shell out to `omni send`. The platform needs native verb commands with IM-like conversation context, backed by multi-provider media services where every capability is provider-agnostic with configurable defaults and per-call overrides. - -## Scope -### IN -1. **Conversation context** — `omni open/use/where/close`, backed by PG (per API key), env var override for agents -2. **8 agentic verb commands** — `say`, `send`, `speak`, `react`, `imagine`, `film`, `listen`, `see` -3. **`--reply` modifier** — universal quote-reply on any verb (no separate reply command) -4. **Provider-agnostic media** — TTS, STT, image gen, video gen, vision all go through provider interfaces with configurable defaults + `--provider` override -5. **Gemini providers** — image gen (Nano Banana 2), TTS, STT, video gen (Veo 3.1), vision -6. **Existing provider integration** — ElevenLabs TTS + Groq Whisper STT as first-class providers alongside Gemini -7. **Instance scoping enforcement** — `api_keys.instanceIds` exists in schema, enforce server-side -8. **Auto-provisioned agent keys** — assigning agent to instance auto-creates scoped API key -9. **Persons CLI** — `omni persons merge/link/update` commands (API exists, CLI doesn't) -10. **Smarter auto-linking** — when @lid resolves to known phone, link identity to existing person instead of creating new one -11. **Full coexistence** — `omni send --text --instance --to` unchanged, verb commands are a layer on top - -### OUT -- Replacing ElevenLabs or Groq — they remain as providers -- Music generation (Lyria) -- Live/streaming modes -- Multi-turn image editing (Gemini 3 thought signatures) -- Non-Gemini image/video gen providers (DALL-E, etc.) 
-- Person deduplication UI/bulk tools (separate wish) -- Breaking changes to existing CLI commands - -## Models -| Capability | Model | Notes | -|-----------|-------|-------| -| Image gen (default) | `gemini-3.1-flash-image-preview` | Nano Banana 2, via generateContent | -| Image gen (pro) | `gemini-3-pro-image-preview` | Nano Banana Pro, thinking-enhanced | -| Image gen (fast) | `gemini-2.5-flash-image` | Nano Banana, speed-optimized | -| TTS (Gemini) | `gemini-2.5-flash-preview-tts` | 30 voices, multi-speaker | -| STT (Gemini) | `gemini-3-flash-preview` | Audio understanding, timestamps | -| Video gen | `veo-3.1-generate-preview` | Native audio, 720p-4K | -| Vision | `gemini-3.1-flash-lite-preview` | Default multimodal model | - ---- - -## Architecture - -### Provider Model — Every Verb is Provider-Agnostic - -``` -CLI verb (say, speak, listen, imagine, etc.) - → resolves context (flags → env → PG → config) - → calls API endpoint - → provider router (reads default from config, accepts --provider override) - → GeminiProvider / ElevenLabsProvider / GroqProvider / etc. 
-``` - -```bash -# Defaults set via config -omni config set tts.provider gemini # default TTS -omni config set stt.provider groq # default STT -omni config set imagegen.provider gemini # default image gen -omni config set videogen.provider gemini # default video gen -omni config set vision.provider gemini # default vision - -# Verbs use default, --provider overrides -omni speak "hello" # → default TTS provider -omni speak "hello" --provider elevenlabs # → ElevenLabs -omni speak "hello" --provider gemini # → Gemini - -omni listen audio.ogg # → default STT provider -omni listen audio.ogg --provider gemini # → Gemini -omni listen audio.ogg --provider groq # → Groq Whisper -``` - -Provider interfaces: -```typescript -interface ITtsProvider { - generate(text: string, options: TtsOptions): Promise; - listVoices(): Promise; -} - -interface ISttProvider { - transcribe(audio: Buffer, options: SttOptions): Promise; -} - -interface IImageGenProvider { - generate(prompt: string, options: ImageGenOptions): Promise; -} - -interface IVideoGenProvider { - generate(prompt: string, options: VideoGenOptions): Promise; -} - -interface IVisionProvider { - describe(media: Buffer, options: VisionOptions): Promise; -} -``` - ---- - -### Context Layer — PG-backed, Permission-Scoped - -**No files.** Context stored in PG on the API key. Env vars override for agents. - -#### Commands - -| Command | Who uses it | What it does | -|---------|------------|--------------| -| `omni use ` | Admins (multi-instance keys) | Set active instance | -| `omni open ` | Everyone | Open chat within accessible instances | -| `omni where` | Everyone | Show current context | -| `omni close` | Everyone | Clear context | - -#### `omni open` — resolution scoped by API key permissions - -```bash -omni open felipe # search chats on YOUR instances only -omni open nmstx # fuzzy match chat name -omni open 5512982298888 # phone number -omni open febc95ba # chat ID prefix -``` - -Resolution flow: -``` -1. 
Get accessible instances from api_keys.instanceIds (null = all) -2. If admin with multiple instances, filter by active instance (from "omni use") -3. Search chats + persons ONLY on accessible instances -4. One match → open -5. Multiple matches → pick most active, show alternatives -6. Zero matches → "contact not found on your instances" -``` - -**Scoped agents never see disambiguation.** They have one instance. `omni open felipe` finds Felipe on that instance or errors. No `@sofia`, no `@telegram` — their key only sees one world. - -**Admins use `omni use` to switch between instances:** -```bash -omni use sofia # set active instance -omni open felipe # → Felipe on Sofia -omni use telegram # switch -omni open felipe # → Felipe on Telegram -``` - -#### Context storage — PG on api_keys - -```sql -ALTER TABLE api_keys ADD COLUMN active_instance_id UUID REFERENCES instances(id); -- "omni use" -ALTER TABLE api_keys ADD COLUMN context_instance_id UUID REFERENCES instances(id); -- "omni open" resolved instance -ALTER TABLE api_keys ADD COLUMN context_chat_id TEXT; -ALTER TABLE api_keys ADD COLUMN context_message_id TEXT; -ALTER TABLE api_keys ADD COLUMN context_updated_at TIMESTAMP; -``` - -#### Resolution chain (first match wins): -``` -1. --to / --instance flags ← explicit, always wins -2. OMNI_INSTANCE / OMNI_CHAT env ← per-process (agent dispatcher) -3. PG context (per API key) ← persistent ("omni open" state) -4. omni config defaults ← fallback -5. error ← nothing -``` - ---- - -### Instance Scoping + Auto-Provisioned Agent Keys - -**`api_keys.instanceIds UUID[]` already exists.** Enforce it server-side: middleware validates `instanceId ∈ key.instanceIds` on every request. 
- -**Auto-provisioning:** When an agent is assigned to an instance, auto-create/update a scoped key: - -```bash -omni instances update sofia --agent my-agent -# → Auto-creates API key "agent:my-agent" scoped to [sofia] -# → Key stored on agent record -``` - -Agent assigned to second instance → key updated: `instanceIds = [sofia, claudia]`. -Agent removed from instance → key updated to remove that instance. - -Dispatcher uses the agent's auto-provisioned key: -```typescript -// agent-dispatcher.ts -const agentKey = agent.apiKey; // auto-provisioned, scoped -process.env.OMNI_API_KEY = agentKey; -process.env.OMNI_INSTANCE = instance.id; -process.env.OMNI_CHAT = payload.chatId; -process.env.OMNI_MESSAGE = payload.messageId; -``` - ---- - -### Verb Commands - -#### Communication (send to open chat) - -```bash -# say — text message -omni say "oi, tudo bem?" -omni say "concordo" --reply # quote-reply to trigger/last msg -omni say "concordo" --reply abc123 # quote-reply to specific msg - -# send — deliver file/media (auto-detects type from extension) -omni send foto.jpg # image -omni send recording.ogg # audio -omni send video.mp4 # video -omni send contract.pdf # document -omni send foto.jpg --caption "olha isso" # with caption -omni send foto.jpg --reply # file as quote-reply - -# speak — voice note via TTS (provider-agnostic) -omni speak "bom dia" # default provider -omni speak "escuta isso" --voice Kore # specific voice -omni speak "hello" --provider elevenlabs # override provider -omni speak "calma" --style "slow and calm" # Gemini style prompt - -# react — emoji reaction -omni react 👍 # react to last/trigger message -omni react ❤️ --msg abc123 # react to specific message -``` - -#### Generative (create content → send to chat) - -```bash -# imagine — generate image (provider-agnostic) -omni imagine "a cat wearing sunglasses" -omni imagine "pricing table" --aspect-ratio 16:9 --size 2048 -omni imagine "logo" --model nano-banana-pro -omni imagine "cat" --output cat.png # 
save locally, don't send -omni imagine "cat" --reply abc123 # generate + quote-reply -omni imagine "cats" --count 3 # 3 variations - -# film — generate video (provider-agnostic) -omni film "sunset over sao paulo" --duration 8 --resolution 1080p -omni film "product demo" --reference product.jpg -omni film "continue" --extend clip.mp4 -omni film "sunset" --output sunset.mp4 -``` - -Behavior: -- With context → generates + sends to chat -- With `--output` → saves locally, does NOT send -- With `--reply` → generates + sends as quote-reply -- No context + no output → saves to temp file, prints path - -#### Understanding (process content → stdout by default) - -```bash -# listen — transcribe audio (provider-agnostic) -omni listen voice.ogg # → stdout (agent uses internally) -omni listen voice.ogg --reply # transcribe + quote-reply with text -omni listen voice.ogg --provider gemini # force Gemini -omni listen voice.ogg --provider groq # force Groq -omni listen voice.ogg --timestamps # word-level timestamps (Gemini) -omni listen voice.ogg --format srt # SRT subtitles - -# see — describe image/video (provider-agnostic) -omni see photo.jpg # → stdout -omni see photo.jpg --reply # describe + quote-reply -omni see screenshot.png "what app is this?" # guided prompt -omni see video.mp4 # video understanding -``` - ---- - -### `--reply` — Universal Quote Modifier - -One mechanism. No separate `reply` verb. Composes with all 8 verbs. - -```bash -omni say "text" --reply # text quote-reply -omni send file.pdf --reply # file quote-reply -omni speak "text" --reply abc123 # voice quote-reply to specific msg -omni imagine "cat" --reply # generate + quote-reply -omni listen audio.ogg --reply # transcribe + quote-reply with result -``` - -Resolution for which message to quote: -1. `--reply ` — explicit message ID (always wins) -2. `OMNI_MESSAGE` env var — set by dispatcher (trigger message) -3. Last received message in open chat -4. 
Error if none available - ---- - -### Persons CLI — Expose Existing APIs - -```bash -omni persons merge [--reason "duplicate"] -omni persons link -omni persons unlink --reason "wrong link" -omni persons update --phone "+5512982298888" --email "felipe@x.com" -``` - -### Smarter Auto-Linking - -In `PersonService.findOrCreateIdentity()`: when a platform identity arrives with a platformUserId that maps to a known phone (via `chat_id_mappings` @lid → @s.whatsapp.net), check if a person with that phone already exists and link to them instead of creating a new person. - ---- - -### Shared Flags - -```bash -# Universal (all verbs) ---instance # override instance (bypass context) ---to # override chat (bypass context) ---reply [msg-id] # send as quote-reply ---msg # specify message (for react) ---provider # provider override ---model # model override ---output # save to file (generative verbs) - -# Per-verb ---caption # send: caption for media ---voice # speak: voice selection ---style # speak: style prompt (Gemini) ---aspect-ratio # imagine/film: 1:1, 16:9, 9:16, 4:3, 3:4 ---size # imagine: 512, 1024, 2048, 4096 ---count # imagine: number of images (1-4) ---duration # film: 5-8s ---resolution # film: 720p, 1080p, 4K ---reference # film: reference image ---extend