Skip to content

Commit d5980b4

Browse files
feat(llm): add video and audio media support to Gemini protocol (#31889)
Co-authored-by: Aiden Cline <63023139+rekram1-node@users.noreply.github.com>
1 parent 41d1279 commit d5980b4

2 files changed

Lines changed: 8 additions & 5 deletions

File tree

packages/llm/src/protocols/gemini.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ import { GeminiToolSchema } from "./utils/gemini-tool-schema"
2121
import { Lifecycle } from "./utils/lifecycle"
2222

2323
const ADAPTER = "gemini"
24-
const IMAGE_MIMES = new Set<string>(ProviderShared.IMAGE_MIMES)
24+
const MEDIA_MIMES = new Set<string>(ProviderShared.MEDIA_MIMES)
2525
export const DEFAULT_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
2626

2727
// =============================================================================
@@ -182,7 +182,7 @@ const lowerToolConfig = (toolChoice: NonNullable<LLMRequest["toolChoice"]>) =>
182182

183183
const lowerUserPart = Effect.fn("Gemini.lowerUserPart")(function* (part: TextPart | MediaPart) {
184184
if (part.type === "text") return { text: part.text }
185-
const media = yield* ProviderShared.validateMedia("Gemini", part, IMAGE_MIMES)
185+
const media = yield* ProviderShared.validateMedia("Gemini", part, MEDIA_MIMES)
186186
return { inlineData: { mimeType: media.mime, data: media.base64 } }
187187
})
188188

@@ -275,7 +275,7 @@ const lowerMessages = Effect.fn("Gemini.lowerMessages")(function* (request: LLMR
275275
})
276276
for (const item of content) {
277277
if (item.type === "text") continue
278-
const media = yield* ProviderShared.validateToolFile("Gemini", item, IMAGE_MIMES)
278+
const media = yield* ProviderShared.validateToolFile("Gemini", item, MEDIA_MIMES)
279279
parts.push({ inlineData: { mimeType: media.mime, data: media.base64 } })
280280
}
281281
}

packages/llm/src/protocols/shared.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,11 @@ export const parseToolInput = (route: string, name: string, raw: string) =>
188188
parseJson(route, raw || "{}", `Invalid JSON input for ${route} tool call ${name}`)
189189

190190
export const IMAGE_MIMES = ["image/png", "image/jpeg", "image/gif", "image/webp"] as const
191-
export const MAX_MEDIA_ENCODED_BYTES = 8 * 1024 * 1024
192-
export const MAX_MEDIA_DECODED_BYTES = 6 * 1024 * 1024
191+
export const VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"] as const
192+
export const AUDIO_MIMES = ["audio/wav", "audio/mp3", "audio/aiff", "audio/aac", "audio/ogg", "audio/flac"] as const
193+
export const MEDIA_MIMES = [...IMAGE_MIMES, ...VIDEO_MIMES, ...AUDIO_MIMES] as const
194+
export const MAX_MEDIA_ENCODED_BYTES = 28 * 1024 * 1024
195+
export const MAX_MEDIA_DECODED_BYTES = 20 * 1024 * 1024
193196

194197
const base64Pattern = /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/
195198

0 commit comments

Comments
 (0)