Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions packages/api/src/providers/gemini/videogen.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ describe('GeminiVideoGenProvider', () => {
expect(request.prompt).toContain('1. hero frame');
});

it('preserves generateAudio for text-to-video requests', async () => {
it('omits generateAudio for text-to-video requests because current Veo 3.1 API rejects it', async () => {
generateVideoCalls.length = 0;
const provider = new GeminiVideoGenProvider({
getSecret: async () => 'test-gemini-key',
Expand All @@ -84,10 +84,10 @@ describe('GeminiVideoGenProvider', () => {

expect(generateVideoCalls).toHaveLength(1);
const request = generateVideoCalls[0] as { config?: Record<string, unknown> };
expect(request.config?.generateAudio).toBe(true);
expect(request.config?.generateAudio).toBeUndefined();
});

it('can disable generated audio for text-to-video requests', async () => {
it('treats --no-audio as a compatibility no-op while generateAudio is unsupported', async () => {
generateVideoCalls.length = 0;
const provider = new GeminiVideoGenProvider({
getSecret: async () => 'test-gemini-key',
Expand All @@ -98,6 +98,6 @@ describe('GeminiVideoGenProvider', () => {

expect(generateVideoCalls).toHaveLength(1);
const request = generateVideoCalls[0] as { config?: Record<string, unknown> };
expect(request.config?.generateAudio).toBe(false);
expect(request.config?.generateAudio).toBeUndefined();
});
});
4 changes: 3 additions & 1 deletion packages/api/src/providers/gemini/videogen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ export class GeminiVideoGenProvider implements IVideoGenProvider {
...(options?.durationSec !== undefined ? { durationSeconds: options.durationSec } : {}),
...(options?.seed !== undefined ? { seed: options.seed } : {}),
...(options?.resolution !== undefined ? { resolution: options.resolution } : {}),
...(!options?.imageBase64 ? { generateAudio: options?.audio !== false } : {}),
// The current Gemini Veo 3.1 API rejects generateAudio in this SDK path.
// Keep --no-audio as a no-op compatibility flag until the provider exposes
// a supported audio toggle again.
},
};
if (options?.imageBase64) {
Expand Down
40 changes: 40 additions & 0 deletions packages/api/src/routes/v2/__tests__/messages-send-media.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,44 @@ describe('POST /messages/send/media', () => {
},
});
});

// Posts a base64-encoded audio payload flagged as a voice note and checks what
// the route hands to the sendMessage mock: decoded bytes under
// metadata.audioBuffer, ptt set, the Opus-qualified MIME type, and no
// metadata.base64.
test('forwards WhatsApp voice-note audio as audioBuffer instead of base64', async () => {
  const voiceBytes = Buffer.from('ogg-opus-bytes');
  const sendMessageMock = mock(async (_instanceId: string, _message: unknown) => ({
    success: true,
    messageId: 'VOICE-MSG-ID',
    timestamp: 123,
  }));
  const app = mountMessagesRoutes(sendMessageMock);

  const payload = {
    instanceId: '11111111-1111-4111-8111-111111111111',
    to: '5511999999999@s.whatsapp.net',
    type: 'audio',
    base64: voiceBytes.toString('base64'),
    filename: 'voice.ogg',
    voiceNote: true,
  };
  const response = await app.request('/messages/send/media', {
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify(payload),
  });

  expect(response.status).toBe(201);
  expect(sendMessageMock).toHaveBeenCalledTimes(1);

  // Second positional argument of the first call is the outgoing message.
  const forwarded = sendMessageMock.mock.calls[0]?.[1] as { metadata?: Record<string, unknown> };
  expect(forwarded).toMatchObject({
    content: {
      type: 'audio',
      filename: 'voice.ogg',
      mimeType: 'audio/ogg; codecs=opus',
    },
    metadata: {
      ptt: true,
    },
  });

  // The raw base64 string must be gone; the decoded bytes must round-trip.
  const forwardedMetadata = forwarded.metadata;
  expect(forwardedMetadata?.base64).toBeUndefined();
  expect(Buffer.isBuffer(forwardedMetadata?.audioBuffer)).toBe(true);
  expect((forwardedMetadata?.audioBuffer as Buffer).equals(voiceBytes)).toBe(true);
});
});
23 changes: 17 additions & 6 deletions packages/api/src/routes/v2/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,21 @@ const sendMediaSchema = z.object({
threadId: z.string().optional().describe('Thread/topic ID (e.g. Telegram forum topic)'),
});

/**
 * Resolves the MIME type to attach to an outgoing media message.
 *
 * An explicit `data.mimeType` takes precedence; otherwise the value comes from
 * `inferMediaMimeType(data.type, data.filename)`. When the request is an audio
 * voice note (`type === 'audio'` and `voiceNote === true`) and the resolved
 * type is exactly `audio/ogg`, the codec-qualified `audio/ogg; codecs=opus`
 * is returned instead.
 *
 * @param data - Validated `/send/media` request body.
 * @returns The MIME type string to place on the outgoing content.
 */
function normalizeSendMediaMimeType(data: z.infer<typeof sendMediaSchema>): string {
  const resolved = data.mimeType ?? inferMediaMimeType(data.type, data.filename);
  const isAudioVoiceNote = data.type === 'audio' && data.voiceNote === true;
  return isAudioVoiceNote && resolved === 'audio/ogg' ? 'audio/ogg; codecs=opus' : resolved;
}

function buildSendMediaMetadata(data: z.infer<typeof sendMediaSchema>): Record<string, unknown> {
if (data.type === 'audio' && data.voiceNote === true && data.base64) {
return { audioBuffer: Buffer.from(data.base64, 'base64'), ptt: true };
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve base64 for voice-note conversion

When callers send a base64 voice note that is not already OGG/Opus (for example `omni send --media clip.mp3 --voice`, which posts base64), this replaces `metadata.base64` with `audioBuffer`. The WhatsApp preprocessing path still only looks at `message.content.mediaUrl` or `message.metadata?.base64` before calling `convertBufferForVoiceNote` (`packages/channel-whatsapp/src/plugin.ts`, around `processAudioForVoiceNote`), so it now returns early and skips conversion, sending the original MP3/WAV buffer as `ptt` instead of a WhatsApp-compatible OGG/Opus voice note. Either keep the base64 through preprocessing or update the WhatsApp converter to consume `audioBuffer`.

Useful? React with 👍 / 👎.

}
return { base64: data.base64, ptt: data.voiceNote };
}

// Send reaction schema
const sendReactionSchema = z.object({
instanceId: z.string().uuid().describe('Instance ID'),
Expand Down Expand Up @@ -1147,7 +1162,7 @@ messagesRoutes.post('/send/media', zValidator('json', sendMediaSchema), async (c
// Resolve recipient (handles person ID to platform ID resolution)
const resolvedTo = await resolveRecipient(data.to, instance.channel, services);

const mediaMimeType = data.mimeType ?? inferMediaMimeType(data.type, data.filename);
const mediaMimeType = normalizeSendMediaMimeType(data);

// Build outgoing message
const outgoingMessage: OutgoingMessage = {
Expand All @@ -1160,11 +1175,7 @@ messagesRoutes.post('/send/media', zValidator('json', sendMediaSchema), async (c
filename: data.filename,
mimeType: mediaMimeType,
} as OutgoingContent,
metadata: {
base64: data.base64,
// WhatsApp uses 'ptt' (push-to-talk) flag for voice notes
ptt: data.voiceNote,
},
metadata: buildSendMediaMetadata(data),
};

// T8: API processed the send request
Expand Down
Loading