-
Notifications
You must be signed in to change notification settings - Fork 0
feat: Gemini agentic video analysis with Google Search grounding #47
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -150,38 +150,79 @@ export async function POST(request: Request) { | |||||
| try { | ||||||
| const { transcript, videoTitle, videoUrl } = await request.json(); | ||||||
|
|
||||||
| if (!transcript || typeof transcript !== 'string') { | ||||||
| // Accept either transcript text OR videoUrl for direct Gemini analysis | ||||||
| if ((!transcript || typeof transcript !== 'string') && !videoUrl) { | ||||||
| return NextResponse.json( | ||||||
| { error: 'transcript (string) is required' }, | ||||||
| { error: 'transcript (string) or videoUrl is required' }, | ||||||
| { status: 400 } | ||||||
| ); | ||||||
| } | ||||||
|
|
||||||
| const trimmed = transcript.slice(0, 8000); | ||||||
| let parsed; | ||||||
| let provider = 'openai'; | ||||||
|
|
||||||
| // Try OpenAI first, fall back to Gemini on quota/auth errors | ||||||
| if (process.env.OPENAI_API_KEY) { | ||||||
| try { | ||||||
| parsed = await extractWithOpenAI(trimmed, videoTitle, videoUrl); | ||||||
| } catch (err) { | ||||||
| const msg = err instanceof Error ? err.message : ''; | ||||||
| if ((msg.includes('429') || msg.includes('quota') || msg.includes('rate')) && process.env.GEMINI_API_KEY) { | ||||||
| console.warn('OpenAI quota hit, falling back to Gemini'); | ||||||
| parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); | ||||||
| provider = 'gemini'; | ||||||
| } else { | ||||||
| throw err; | ||||||
| // If we have transcript text, use the existing extraction logic | ||||||
| if (transcript && typeof transcript === 'string' && transcript.length > 50) { | ||||||
| const trimmed = transcript.slice(0, 8000); | ||||||
|
|
||||||
| if (process.env.OPENAI_API_KEY) { | ||||||
| try { | ||||||
| parsed = await extractWithOpenAI(trimmed, videoTitle, videoUrl); | ||||||
| } catch (err) { | ||||||
| const msg = err instanceof Error ? err.message : ''; | ||||||
| if ((msg.includes('429') || msg.includes('quota') || msg.includes('rate')) && process.env.GEMINI_API_KEY) { | ||||||
| console.warn('OpenAI quota hit, falling back to Gemini'); | ||||||
| parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); | ||||||
| provider = 'gemini'; | ||||||
| } else { | ||||||
| throw err; | ||||||
| } | ||||||
| } | ||||||
| } else if (process.env.GEMINI_API_KEY) { | ||||||
| parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); | ||||||
| provider = 'gemini'; | ||||||
| } | ||||||
| } else if (process.env.GEMINI_API_KEY) { | ||||||
| parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); | ||||||
| provider = 'gemini'; | ||||||
| } else { | ||||||
| } | ||||||
|
|
||||||
| // If no transcript but have videoUrl + Gemini, do direct video analysis via Google Search | ||||||
| if (!parsed && videoUrl && process.env.GEMINI_API_KEY) { | ||||||
| try { | ||||||
| const ai = getGemini(); | ||||||
| const response = await ai.models.generateContent({ | ||||||
| model: 'gemini-2.5-flash', | ||||||
| contents: `${SYSTEM_PROMPT}\n\nAnalyze this YouTube video and extract structured data. | ||||||
| Use your Google Search tool to find the video's transcript, description, and chapter content. | ||||||
|
|
||||||
| Video URL: ${videoUrl} | ||||||
| ${videoTitle ? `Video Title: ${videoTitle}` : ''} | ||||||
|
|
||||||
| Extract events, actions, summary, and topics from the actual video content found via search. | ||||||
| Respond with ONLY valid JSON matching this structure: | ||||||
| { | ||||||
| "events": [{"type": "action|topic|insight|tool|resource", "title": "...", "description": "...", "timestamp": "02:15" or null, "priority": "high|medium|low"}], | ||||||
| "actions": [{"title": "...", "description": "...", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": number or null}], | ||||||
| "summary": "2-3 sentence summary", | ||||||
| "topics": ["topic1", "topic2"] | ||||||
| }`, | ||||||
|
Comment on lines
+193
to
+206
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||||||
| config: { | ||||||
| temperature: 0.3, | ||||||
| responseMimeType: 'application/json', | ||||||
| responseSchema: geminiResponseSchema, | ||||||
|
Comment on lines
+209
to
+210
|
||||||
| responseMimeType: 'application/json', | |
| responseSchema: geminiResponseSchema, |
Copilot
AI
Feb 28, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error message says "Set GEMINI_API_KEY", but /api/extract-events still supports OPENAI_API_KEY as a working provider for the transcript-based path. A user who only has OPENAI_API_KEY configured and provides a videoUrl without a transcript will see this misleading error. The message should mention both API keys.
| error: 'No AI API key configured or all extraction attempts failed. Set GEMINI_API_KEY.', | |
| error: 'No AI API key configured or all extraction attempts failed. Set GEMINI_API_KEY and/or OPENAI_API_KEY.', |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,6 +1,7 @@ | ||||||
| import OpenAI from 'openai'; | ||||||
| import { GoogleGenAI } from '@google/genai'; | ||||||
| import { NextResponse } from 'next/server'; | ||||||
| import { fetchYouTubeMetadata, formatMetadataAsContext } from '@/lib/youtube-metadata'; | ||||||
|
||||||
|
|
||||||
| let _openai: OpenAI | null = null; | ||||||
| function getOpenAI() { | ||||||
|
|
@@ -93,59 +94,42 @@ export async function POST(request: Request) { | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Strategy 2: OpenAI Responses API with web_search | ||||||
| if (url && !audioUrl && process.env.OPENAI_API_KEY) { | ||||||
| // Fetch YouTube metadata (description, chapters, title) — used by strategies below | ||||||
| let metadata: Awaited<ReturnType<typeof fetchYouTubeMetadata>> = null; | ||||||
| if (url) { | ||||||
| try { | ||||||
| const response = await getOpenAI().responses.create({ | ||||||
| model: 'gpt-4o-mini', | ||||||
| instructions: `You are a video content transcription assistant. | ||||||
| Given a YouTube URL, use web search to find the video's transcript or detailed content. | ||||||
| Return the full transcript text if available, or a detailed content summary. | ||||||
| Be thorough — capture all key points, quotes, and technical details.`, | ||||||
| tools: [{ type: 'web_search' as const }], | ||||||
| input: `Find and return the full transcript or detailed content of this video: ${url}`, | ||||||
| }); | ||||||
|
|
||||||
| const text = response.output_text || ''; | ||||||
|
|
||||||
| if (text.length > 100) { | ||||||
| return NextResponse.json({ | ||||||
| success: true, | ||||||
| transcript: text, | ||||||
| source: 'openai-web-search', | ||||||
| wordCount: text.split(/\s+/).length, | ||||||
| }); | ||||||
| } | ||||||
| } catch (e) { | ||||||
| console.warn('OpenAI web_search transcript failed:', e); | ||||||
| metadata = await fetchYouTubeMetadata(url); | ||||||
| } catch { | ||||||
| console.log('YouTube metadata fetch failed, continuing without'); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Strategy 3: Gemini with direct YouTube URL processing + Google Search grounding | ||||||
| // Strategy 2: Gemini with Google Search grounding (PRIMARY for YouTube) | ||||||
| // Uses Google Search to find actual transcript content, descriptions, and chapters | ||||||
| if (url && !audioUrl && process.env.GEMINI_API_KEY) { | ||||||
| try { | ||||||
| const ai = getGemini(); | ||||||
| const metadataContext = metadata ? formatMetadataAsContext(metadata) : ''; | ||||||
|
|
||||||
| const result = await ai.models.generateContent({ | ||||||
| model: 'gemini-2.0-flash', | ||||||
| contents: [ | ||||||
| { | ||||||
| role: 'user', | ||||||
| parts: [ | ||||||
| { | ||||||
| fileData: { | ||||||
| mimeType: 'video/*', | ||||||
| fileUri: url, | ||||||
| }, | ||||||
| }, | ||||||
| { | ||||||
| text: 'Provide a complete, detailed transcript of this video. ' + | ||||||
| 'Include all spoken content verbatim. ' + | ||||||
| 'Include timestamps where possible in [MM:SS] format. ' + | ||||||
| 'Be thorough and comprehensive — capture every key point, quote, and technical detail.', | ||||||
| }, | ||||||
| ], | ||||||
| }, | ||||||
| ], | ||||||
| model: 'gemini-2.5-flash', | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The model name
Suggested change
|
||||||
| contents: `You are a video transcription assistant with access to Google Search. | ||||||
|
|
||||||
| For the following YouTube video, use your googleSearch tool to find the ACTUAL transcript, | ||||||
| description, and chapter content. The video creator often provides detailed descriptions | ||||||
| with chapter breakdowns — USE that metadata as high-quality structured content. | ||||||
|
|
||||||
| ${metadataContext ? `KNOWN VIDEO METADATA:\n${metadataContext}\n` : ''} | ||||||
| Video URL: ${url} | ||||||
|
|
||||||
| INSTRUCTIONS: | ||||||
| 1. Search for the video's transcript using Google Search. | ||||||
| 2. If a spoken transcript is available, return it verbatim. | ||||||
| 3. If not, reconstruct detailed content from the description, chapters, comments, | ||||||
| and related articles found via search. | ||||||
| 4. Be thorough — capture ALL key points, technical details, quotes, and actionable insights. | ||||||
| 5. Include timestamps in [MM:SS] format where possible. | ||||||
| 6. Do NOT return generic advice like "click Show Transcript" — return actual content.`, | ||||||
|
Comment on lines
+116
to
+132
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||||||
| config: { | ||||||
| temperature: 0.2, | ||||||
| tools: [{ googleSearch: {} }], | ||||||
|
|
@@ -157,40 +141,56 @@ Be thorough — capture all key points, quotes, and technical details.`, | |||||
| return NextResponse.json({ | ||||||
| success: true, | ||||||
| transcript: text, | ||||||
| source: 'gemini-video', | ||||||
| source: 'gemini-search', | ||||||
| wordCount: text.split(/\s+/).length, | ||||||
| metadata: metadata ? { | ||||||
| title: metadata.title, | ||||||
| channel: metadata.channel, | ||||||
| chapters: metadata.chapters, | ||||||
| } : undefined, | ||||||
| }); | ||||||
| } | ||||||
| } catch (e) { | ||||||
| console.warn('Gemini video URL processing failed, trying text fallback:', e); | ||||||
|
|
||||||
| // Fallback: text-based Gemini with Google Search grounding | ||||||
| try { | ||||||
| const ai = getGemini(); | ||||||
| const result = await ai.models.generateContent({ | ||||||
| model: 'gemini-2.0-flash', | ||||||
| contents: `You are a video content transcription assistant. ` + | ||||||
| `For the following YouTube video URL, provide a detailed transcript or content summary. ` + | ||||||
| `Include all key points, technical details, quotes, and actionable insights. ` + | ||||||
| `Be thorough and comprehensive.\n\nVideo URL: ${url}`, | ||||||
| config: { | ||||||
| temperature: 0.2, | ||||||
| tools: [{ googleSearch: {} }], | ||||||
| }, | ||||||
| }); | ||||||
| const text = result.text ?? ''; | ||||||
| console.warn('Gemini Google Search transcript failed:', e); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| if (text.length > 100) { | ||||||
| return NextResponse.json({ | ||||||
| success: true, | ||||||
| transcript: text, | ||||||
| source: 'gemini', | ||||||
| wordCount: text.split(/\s+/).length, | ||||||
| }); | ||||||
| } | ||||||
| } catch (e2) { | ||||||
| console.warn('Gemini text fallback also failed:', e2); | ||||||
| // Strategy 3: OpenAI Responses API with web_search (fallback) | ||||||
| if (url && !audioUrl && process.env.OPENAI_API_KEY) { | ||||||
| try { | ||||||
| const metadataContext = metadata ? formatMetadataAsContext(metadata) : ''; | ||||||
|
|
||||||
| const response = await getOpenAI().responses.create({ | ||||||
| model: 'gpt-4o-mini', | ||||||
| instructions: `You are a video content transcription assistant. | ||||||
| Given a YouTube URL, use web search to find the video's ACTUAL transcript or detailed content. | ||||||
| Return the full transcript text if available. If not, provide a comprehensive content summary | ||||||
| based on the video's description, chapters, and any available reviews or summaries. | ||||||
| Do NOT return instructions on how to find a transcript — return the actual content. | ||||||
| Be thorough — capture all key points, quotes, technical details, and chapter breakdowns.`, | ||||||
| tools: [{ type: 'web_search' as const }], | ||||||
| input: `Find and return the full transcript or detailed content of this video: ${url} | ||||||
| ${metadataContext ? `\nKNOWN METADATA:\n${metadataContext}` : ''}`, | ||||||
|
Comment on lines
+172
to
+173
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||||||
| }); | ||||||
|
|
||||||
| const text = response.output_text || ''; | ||||||
|
|
||||||
| // Reject results that are just instructions rather than actual content | ||||||
| const isGarbage = text.toLowerCase().includes('click show transcript') || | ||||||
| text.toLowerCase().includes('click on the three dots') || | ||||||
| text.toLowerCase().includes('steps to find') || | ||||||
| (text.length < 300 && text.includes('transcript')); | ||||||
|
Comment on lines
+179
to
+182
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

The logic to detect 'garbage' responses is a great addition for robustness. However, the current implementation is a bit difficult to read and maintain as a single long boolean expression. Refactoring this to use an array of substrings would make it cleaner and easier to update in the future.

```ts
const garbageSubstrings = [
  'click show transcript',
  'click on the three dots',
  'steps to find',
];
const lowerCaseText = text.toLowerCase();
const isGarbage = garbageSubstrings.some(s => lowerCaseText.includes(s)) ||
  (text.length < 300 && lowerCaseText.includes('transcript'));
```
||||||
|
|
||||||
| if (text.length > 100 && !isGarbage) { | ||||||
| return NextResponse.json({ | ||||||
| success: true, | ||||||
| transcript: text, | ||||||
| source: 'openai-web-search', | ||||||
| wordCount: text.split(/\s+/).length, | ||||||
| }); | ||||||
| } | ||||||
| } catch (e) { | ||||||
| console.warn('OpenAI web_search transcript failed:', e); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,5 +1,6 @@ | ||||||
| import { NextResponse } from 'next/server'; | ||||||
| import { publishEvent, EventTypes } from '@/lib/cloudevents'; | ||||||
| import { analyzeVideoWithGemini } from '@/lib/gemini-video-analyzer'; | ||||||
|
|
||||||
| // Backend URL with validation - skip if not a valid URL | ||||||
| const rawBackendUrl = process.env.BACKEND_URL || ''; | ||||||
|
|
@@ -112,18 +113,59 @@ export async function POST(request: Request) { | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| // ── Strategy 2: Frontend-only pipeline ── | ||||||
| // Works on Vercel without the Python backend by chaining the serverless | ||||||
| // /api/transcribe and /api/extract-events routes directly. | ||||||
| // ── Strategy 2: Gemini Agentic Analysis (primary frontend strategy) ── | ||||||
| // Uses Google Search grounding to retrieve transcripts, descriptions, | ||||||
| // and chapter data directly — no separate transcribe/extract steps needed. | ||||||
| if (process.env.GEMINI_API_KEY) { | ||||||
| try { | ||||||
| await publishEvent(EventTypes.TRANSCRIPT_STARTED, { url, strategy: 'gemini-agentic' }, url); | ||||||
| const startTime = Date.now(); | ||||||
| const analysis = await analyzeVideoWithGemini(url, process.env.GEMINI_API_KEY); | ||||||
|
Comment on lines
+119
to
+123
|
||||||
| const elapsed = Date.now() - startTime; | ||||||
|
|
||||||
| await publishEvent(EventTypes.PIPELINE_COMPLETED, { | ||||||
| strategy: 'gemini-agentic', | ||||||
| success: true, | ||||||
| transcriptSegments: analysis.transcript?.length || 0, | ||||||
| events: analysis.events?.length || 0, | ||||||
| }, url); | ||||||
|
|
||||||
| // Use trusted backend origin instead of deriving from potentially user-controlled request data | ||||||
| const origin = BACKEND_URL; | ||||||
| return NextResponse.json({ | ||||||
| id: `vid_${Date.now().toString(36)}`, | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using
Suggested change
|
||||||
| status: 'complete', | ||||||
| processing_time_ms: elapsed, | ||||||
| result: { | ||||||
| success: true, | ||||||
| insights: { | ||||||
| summary: analysis.summary, | ||||||
| actions: analysis.actions?.map((a) => a.title) || [], | ||||||
| topics: analysis.topics || [], | ||||||
| sentiment: 'Neutral', | ||||||
| }, | ||||||
| transcript_segments: analysis.transcript?.length || 0, | ||||||
| transcript_source: 'gemini-agentic', | ||||||
| agents_used: ['gemini-agentic-engine'], | ||||||
| errors: [], | ||||||
| raw_response: { | ||||||
| title: analysis.title, | ||||||
| transcript: analysis.transcript, | ||||||
| events: analysis.events, | ||||||
| actions: analysis.actions, | ||||||
| architectureCode: analysis.architectureCode, | ||||||
| ingestScript: analysis.ingestScript, | ||||||
| }, | ||||||
| }, | ||||||
| }); | ||||||
| } catch (e) { | ||||||
| console.warn('Gemini agentic analysis failed, falling back to transcribe chain:', e); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Step 1: Get transcript | ||||||
| // ── Strategy 3: Frontend-only transcribe → extract chain (fallback) ── | ||||||
| let transcript = ''; | ||||||
| let transcriptSource = 'none'; | ||||||
| try { | ||||||
| await publishEvent(EventTypes.TRANSCRIPT_STARTED, { url, strategy: 'frontend' }, url); | ||||||
| await publishEvent(EventTypes.TRANSCRIPT_STARTED, { url, strategy: 'frontend-chain' }, url); | ||||||
| const baseUrl = getBaseUrl(request); | ||||||
| const transcribeRes = await fetch(`${baseUrl}/api/transcribe`, { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||||||
| method: 'POST', | ||||||
|
|
@@ -140,7 +182,6 @@ export async function POST(request: Request) { | |||||
| console.error('Transcript extraction failed:', e); | ||||||
| } | ||||||
|
|
||||||
| // Step 2: Extract events + insights from transcript | ||||||
| let extraction: { events?: Array<{ type: string; title: string; description?: string; timestamp?: string; priority?: string }>; actions?: Array<{ title: string }>; summary?: string; topics?: string[] } = {}; | ||||||
| if (transcript) { | ||||||
| try { | ||||||
|
|
@@ -165,7 +206,7 @@ export async function POST(request: Request) { | |||||
|
|
||||||
| await publishEvent( | ||||||
| hasResults ? EventTypes.PIPELINE_COMPLETED : EventTypes.PIPELINE_FAILED, | ||||||
| { strategy: 'frontend', success: hasResults, transcriptSource }, | ||||||
| { strategy: 'frontend-chain', success: hasResults, transcriptSource }, | ||||||
| url, | ||||||
| ); | ||||||
|
|
||||||
|
|
@@ -176,15 +217,15 @@ export async function POST(request: Request) { | |||||
| result: { | ||||||
| success: hasResults, | ||||||
| insights: { | ||||||
| summary: extraction.summary || (hasResults ? 'Transcript extracted successfully' : 'Could not extract transcript — configure OPENAI_API_KEY or GEMINI_API_KEY'), | ||||||
| summary: extraction.summary || (hasResults ? 'Transcript extracted successfully' : 'Could not extract transcript — configure GEMINI_API_KEY'), | ||||||
|
||||||
| actions: extraction.actions?.map((a) => a.title) || [], | ||||||
| topics: extraction.topics || [], | ||||||
| sentiment: 'Neutral', | ||||||
| }, | ||||||
| transcript_segments: 0, | ||||||
| transcript_source: transcriptSource, | ||||||
| agents_used: ['frontend-pipeline'], | ||||||
| errors: hasResults ? [] : ['Backend unavailable and transcript extraction failed'], | ||||||
| errors: hasResults ? [] : ['All strategies failed — ensure GEMINI_API_KEY is set'], | ||||||
|
||||||
| raw_response: { | ||||||
| transcript: { text: transcript }, | ||||||
| extraction, | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The model name `gemini-2.5-flash` appears to be incorrect and will likely cause the API call to fail. The current flash model is named `gemini-1.5-flash-latest`. It's advisable to use a constant for model names to ensure consistency and avoid such errors, as this typo is present in multiple files.