From 455ca32c66b28f7fbb2fde4d18091bb62a48f670 Mon Sep 17 00:00:00 2001 From: Genie Automagik Date: Tue, 2 Jun 2026 20:27:34 +0000 Subject: [PATCH] fix(api): repair voice notes and Veo requests --- .../api/src/providers/gemini/videogen.test.ts | 8 ++-- packages/api/src/providers/gemini/videogen.ts | 4 +- .../v2/__tests__/messages-send-media.test.ts | 40 +++++++++++++++++++ packages/api/src/routes/v2/messages.ts | 23 ++++++++--- 4 files changed, 64 insertions(+), 11 deletions(-) diff --git a/packages/api/src/providers/gemini/videogen.test.ts b/packages/api/src/providers/gemini/videogen.test.ts index 416de6cf..a6287d06 100644 --- a/packages/api/src/providers/gemini/videogen.test.ts +++ b/packages/api/src/providers/gemini/videogen.test.ts @@ -73,7 +73,7 @@ describe('GeminiVideoGenProvider', () => { expect(request.prompt).toContain('1. hero frame'); }); - it('preserves generateAudio for text-to-video requests', async () => { + it('omits generateAudio for text-to-video requests because current Veo 3.1 API rejects it', async () => { generateVideoCalls.length = 0; const provider = new GeminiVideoGenProvider({ getSecret: async () => 'test-gemini-key', @@ -84,10 +84,10 @@ describe('GeminiVideoGenProvider', () => { expect(generateVideoCalls).toHaveLength(1); const request = generateVideoCalls[0] as { config?: Record }; - expect(request.config?.generateAudio).toBe(true); + expect(request.config?.generateAudio).toBeUndefined(); }); - it('can disable generated audio for text-to-video requests', async () => { + it('treats --no-audio as a compatibility no-op while generateAudio is unsupported', async () => { generateVideoCalls.length = 0; const provider = new GeminiVideoGenProvider({ getSecret: async () => 'test-gemini-key', @@ -98,6 +98,6 @@ describe('GeminiVideoGenProvider', () => { expect(generateVideoCalls).toHaveLength(1); const request = generateVideoCalls[0] as { config?: Record }; - expect(request.config?.generateAudio).toBe(false); + expect(request.config?.generateAudio).toBeUndefined(); }); }); diff --git a/packages/api/src/providers/gemini/videogen.ts b/packages/api/src/providers/gemini/videogen.ts index 7c41d27e..c527e1d3 100644 --- a/packages/api/src/providers/gemini/videogen.ts +++ b/packages/api/src/providers/gemini/videogen.ts @@ -70,7 +70,9 @@ export class GeminiVideoGenProvider implements IVideoGenProvider { ...(options?.durationSec !== undefined ? { durationSeconds: options.durationSec } : {}), ...(options?.seed !== undefined ? { seed: options.seed } : {}), ...(options?.resolution !== undefined ? { resolution: options.resolution } : {}), - ...(!options?.imageBase64 ? { generateAudio: options?.audio !== false } : {}), + // The current Gemini Veo 3.1 API rejects generateAudio in this SDK path. + // Keep --no-audio as a no-op compatibility flag until the provider exposes + // a supported audio toggle again. }, }; if (options?.imageBase64) { diff --git a/packages/api/src/routes/v2/__tests__/messages-send-media.test.ts b/packages/api/src/routes/v2/__tests__/messages-send-media.test.ts index 2677100a..81c3cb1d 100644 --- a/packages/api/src/routes/v2/__tests__/messages-send-media.test.ts +++ b/packages/api/src/routes/v2/__tests__/messages-send-media.test.ts @@ -67,4 +67,44 @@ describe('POST /messages/send/media', () => { }, }); }); + + test('forwards WhatsApp voice-note audio as audioBuffer instead of base64', async () => { + const sendMessage = mock(async (_instanceId: string, _message: unknown) => ({ + success: true, + messageId: 'VOICE-MSG-ID', + timestamp: 123, + })); + const app = mountMessagesRoutes(sendMessage); + const audio = Buffer.from('ogg-opus-bytes'); + + const res = await app.request('/messages/send/media', { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ + instanceId: '11111111-1111-4111-8111-111111111111', + to: '5511999999999@s.whatsapp.net', + type: 'audio', + base64: audio.toString('base64'), + filename: 'voice.ogg', + voiceNote: true, + }), + }); + + expect(res.status).toBe(201); + expect(sendMessage).toHaveBeenCalledTimes(1); + const message = sendMessage.mock.calls[0]?.[1] as { metadata?: Record }; + expect(message).toMatchObject({ + content: { + type: 'audio', + filename: 'voice.ogg', + mimeType: 'audio/ogg; codecs=opus', + }, + metadata: { + ptt: true, + }, + }); + expect(message.metadata?.base64).toBeUndefined(); + expect(Buffer.isBuffer(message.metadata?.audioBuffer)).toBe(true); + expect((message.metadata?.audioBuffer as Buffer).equals(audio)).toBe(true); + }); }); diff --git a/packages/api/src/routes/v2/messages.ts b/packages/api/src/routes/v2/messages.ts index d35fd4f9..60c1e6e2 100644 --- a/packages/api/src/routes/v2/messages.ts +++ b/packages/api/src/routes/v2/messages.ts @@ -533,6 +533,21 @@ const sendMediaSchema = z.object({ threadId: z.string().optional().describe('Thread/topic ID (e.g. Telegram forum topic)'), }); +function normalizeSendMediaMimeType(data: z.infer): string { + const inferred = data.mimeType ?? inferMediaMimeType(data.type, data.filename); + if (data.type === 'audio' && data.voiceNote === true && inferred === 'audio/ogg') { + return 'audio/ogg; codecs=opus'; + } + return inferred; +} + +function buildSendMediaMetadata(data: z.infer): Record { + if (data.type === 'audio' && data.voiceNote === true && data.base64) { + return { audioBuffer: Buffer.from(data.base64, 'base64'), ptt: true }; + } + return { base64: data.base64, ptt: data.voiceNote }; +} + // Send reaction schema const sendReactionSchema = z.object({ instanceId: z.string().uuid().describe('Instance ID'), @@ -1147,7 +1162,7 @@ messagesRoutes.post('/send/media', zValidator('json', sendMediaSchema), async (c // Resolve recipient (handles person ID to platform ID resolution) const resolvedTo = await resolveRecipient(data.to, instance.channel, services); - const mediaMimeType = data.mimeType ?? inferMediaMimeType(data.type, data.filename); + const mediaMimeType = normalizeSendMediaMimeType(data); // Build outgoing message const outgoingMessage: OutgoingMessage = { @@ -1160,11 +1175,7 @@ messagesRoutes.post('/send/media', zValidator('json', sendMediaSchema), async (c filename: data.filename, mimeType: mediaMimeType, } as OutgoingContent, - metadata: { - base64: data.base64, - // WhatsApp uses 'ptt' (push-to-talk) flag for voice notes - ptt: data.voiceNote, - }, + metadata: buildSendMediaMetadata(data), }; // T8: API processed the send request