diff --git a/apps/web/src/app/api/extract-events/route.ts b/apps/web/src/app/api/extract-events/route.ts index c85cfd288..007ae3965 100644 --- a/apps/web/src/app/api/extract-events/route.ts +++ b/apps/web/src/app/api/extract-events/route.ts @@ -150,38 +150,79 @@ export async function POST(request: Request) { try { const { transcript, videoTitle, videoUrl } = await request.json(); - if (!transcript || typeof transcript !== 'string') { + // Accept either transcript text OR videoUrl for direct Gemini analysis + if ((!transcript || typeof transcript !== 'string') && !videoUrl) { return NextResponse.json( - { error: 'transcript (string) is required' }, + { error: 'transcript (string) or videoUrl is required' }, { status: 400 } ); } - const trimmed = transcript.slice(0, 8000); let parsed; let provider = 'openai'; - // Try OpenAI first, fall back to Gemini on quota/auth errors - if (process.env.OPENAI_API_KEY) { - try { - parsed = await extractWithOpenAI(trimmed, videoTitle, videoUrl); - } catch (err) { - const msg = err instanceof Error ? err.message : ''; - if ((msg.includes('429') || msg.includes('quota') || msg.includes('rate')) && process.env.GEMINI_API_KEY) { - console.warn('OpenAI quota hit, falling back to Gemini'); - parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); - provider = 'gemini'; - } else { - throw err; + // If we have transcript text, use the existing extraction logic + if (transcript && typeof transcript === 'string' && transcript.length > 50) { + const trimmed = transcript.slice(0, 8000); + + if (process.env.OPENAI_API_KEY) { + try { + parsed = await extractWithOpenAI(trimmed, videoTitle, videoUrl); + } catch (err) { + const msg = err instanceof Error ? 
err.message : ''; + if ((msg.includes('429') || msg.includes('quota') || msg.includes('rate')) && process.env.GEMINI_API_KEY) { + console.warn('OpenAI quota hit, falling back to Gemini'); + parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); + provider = 'gemini'; + } else { + throw err; + } } + } else if (process.env.GEMINI_API_KEY) { + parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); + provider = 'gemini'; } - } else if (process.env.GEMINI_API_KEY) { - parsed = await extractWithGemini(trimmed, videoTitle, videoUrl); - provider = 'gemini'; - } else { + } + + // If no transcript but have videoUrl + Gemini, do direct video analysis via Google Search + if (!parsed && videoUrl && process.env.GEMINI_API_KEY) { + try { + const ai = getGemini(); + const response = await ai.models.generateContent({ + model: 'gemini-2.5-flash', + contents: `${SYSTEM_PROMPT}\n\nAnalyze this YouTube video and extract structured data. +Use your Google Search tool to find the video's transcript, description, and chapter content. + +Video URL: ${videoUrl} +${videoTitle ? `Video Title: ${videoTitle}` : ''} + +Extract events, actions, summary, and topics from the actual video content found via search. +Respond with ONLY valid JSON matching this structure: +{ + "events": [{"type": "action|topic|insight|tool|resource", "title": "...", "description": "...", "timestamp": "02:15" or null, "priority": "high|medium|low"}], + "actions": [{"title": "...", "description": "...", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": number or null}], + "summary": "2-3 sentence summary", + "topics": ["topic1", "topic2"] +}`, + config: { + temperature: 0.3, + responseMimeType: 'application/json', + responseSchema: geminiResponseSchema, + tools: [{ googleSearch: {} }], + }, + }); + const text = response.text ?? 
''; + parsed = JSON.parse(text); + provider = 'gemini-search'; + } catch (e) { + console.warn('Gemini direct video extraction failed:', e); + } + } + + if (!parsed) { return NextResponse.json({ success: false, - error: 'No AI API key configured. Set OPENAI_API_KEY or GEMINI_API_KEY.', + error: 'No AI API key configured or all extraction attempts failed. Set GEMINI_API_KEY.', data: { events: [], actions: [], summary: '', topics: [] }, }); } diff --git a/apps/web/src/app/api/transcribe/route.ts b/apps/web/src/app/api/transcribe/route.ts index ea23fbd90..ea8200a1d 100644 --- a/apps/web/src/app/api/transcribe/route.ts +++ b/apps/web/src/app/api/transcribe/route.ts @@ -1,6 +1,7 @@ import OpenAI from 'openai'; import { GoogleGenAI } from '@google/genai'; import { NextResponse } from 'next/server'; +import { fetchYouTubeMetadata, formatMetadataAsContext } from '@/lib/youtube-metadata'; let _openai: OpenAI | null = null; function getOpenAI() { @@ -93,59 +94,42 @@ export async function POST(request: Request) { } } - // Strategy 2: OpenAI Responses API with web_search - if (url && !audioUrl && process.env.OPENAI_API_KEY) { + // Fetch YouTube metadata (description, chapters, title) — used by strategies below + let metadata: Awaited<ReturnType<typeof fetchYouTubeMetadata>> = null; + if (url) { try { - const response = await getOpenAI().responses.create({ - model: 'gpt-4o-mini', - instructions: `You are a video content transcription assistant. -Given a YouTube URL, use web search to find the video's transcript or detailed content. -Return the full transcript text if available, or a detailed content summary. 
-Be thorough — capture all key points, quotes, and technical details.`, - tools: [{ type: 'web_search' as const }], - input: `Find and return the full transcript or detailed content of this video: ${url}`, - }); - - const text = response.output_text || ''; - - if (text.length > 100) { - return NextResponse.json({ - success: true, - transcript: text, - source: 'openai-web-search', - wordCount: text.split(/\s+/).length, - }); - } - } catch (e) { - console.warn('OpenAI web_search transcript failed:', e); + metadata = await fetchYouTubeMetadata(url); + } catch { + console.log('YouTube metadata fetch failed, continuing without'); } } - // Strategy 3: Gemini with direct YouTube URL processing + Google Search grounding + // Strategy 2: Gemini with Google Search grounding (PRIMARY for YouTube) + // Uses Google Search to find actual transcript content, descriptions, and chapters if (url && !audioUrl && process.env.GEMINI_API_KEY) { try { const ai = getGemini(); + const metadataContext = metadata ? formatMetadataAsContext(metadata) : ''; + const result = await ai.models.generateContent({ - model: 'gemini-2.0-flash', - contents: [ - { - role: 'user', - parts: [ - { - fileData: { - mimeType: 'video/*', - fileUri: url, - }, - }, - { - text: 'Provide a complete, detailed transcript of this video. ' + - 'Include all spoken content verbatim. ' + - 'Include timestamps where possible in [MM:SS] format. ' + - 'Be thorough and comprehensive — capture every key point, quote, and technical detail.', - }, - ], - }, - ], + model: 'gemini-2.5-flash', + contents: `You are a video transcription assistant with access to Google Search. + +For the following YouTube video, use your googleSearch tool to find the ACTUAL transcript, +description, and chapter content. The video creator often provides detailed descriptions +with chapter breakdowns — USE that metadata as high-quality structured content. + +${metadataContext ? 
`KNOWN VIDEO METADATA:\n${metadataContext}\n` : ''} +Video URL: ${url} + +INSTRUCTIONS: +1. Search for the video's transcript using Google Search. +2. If a spoken transcript is available, return it verbatim. +3. If not, reconstruct detailed content from the description, chapters, comments, + and related articles found via search. +4. Be thorough — capture ALL key points, technical details, quotes, and actionable insights. +5. Include timestamps in [MM:SS] format where possible. +6. Do NOT return generic advice like "click Show Transcript" — return actual content.`, config: { temperature: 0.2, tools: [{ googleSearch: {} }], @@ -157,40 +141,56 @@ Be thorough — capture all key points, quotes, and technical details.`, return NextResponse.json({ success: true, transcript: text, - source: 'gemini-video', + source: 'gemini-search', wordCount: text.split(/\s+/).length, + metadata: metadata ? { + title: metadata.title, + channel: metadata.channel, + chapters: metadata.chapters, + } : undefined, }); } } catch (e) { - console.warn('Gemini video URL processing failed, trying text fallback:', e); - - // Fallback: text-based Gemini with Google Search grounding - try { - const ai = getGemini(); - const result = await ai.models.generateContent({ - model: 'gemini-2.0-flash', - contents: `You are a video content transcription assistant. ` + - `For the following YouTube video URL, provide a detailed transcript or content summary. ` + - `Include all key points, technical details, quotes, and actionable insights. ` + - `Be thorough and comprehensive.\n\nVideo URL: ${url}`, - config: { - temperature: 0.2, - tools: [{ googleSearch: {} }], - }, - }); - const text = result.text ?? 
''; + console.warn('Gemini Google Search transcript failed:', e); + } + } - if (text.length > 100) { - return NextResponse.json({ - success: true, - transcript: text, - source: 'gemini', - wordCount: text.split(/\s+/).length, - }); - } - } catch (e2) { - console.warn('Gemini text fallback also failed:', e2); + // Strategy 3: OpenAI Responses API with web_search (fallback) + if (url && !audioUrl && process.env.OPENAI_API_KEY) { + try { + const metadataContext = metadata ? formatMetadataAsContext(metadata) : ''; + + const response = await getOpenAI().responses.create({ + model: 'gpt-4o-mini', + instructions: `You are a video content transcription assistant. +Given a YouTube URL, use web search to find the video's ACTUAL transcript or detailed content. +Return the full transcript text if available. If not, provide a comprehensive content summary +based on the video's description, chapters, and any available reviews or summaries. +Do NOT return instructions on how to find a transcript — return the actual content. +Be thorough — capture all key points, quotes, technical details, and chapter breakdowns.`, + tools: [{ type: 'web_search' as const }], + input: `Find and return the full transcript or detailed content of this video: ${url} +${metadataContext ? 
`\nKNOWN METADATA:\n${metadataContext}` : ''}`, + }); + + const text = response.output_text || ''; + + // Reject results that are just instructions rather than actual content + const isGarbage = text.toLowerCase().includes('click show transcript') || + text.toLowerCase().includes('click on the three dots') || + text.toLowerCase().includes('steps to find') || + (text.length < 300 && text.includes('transcript')); + + if (text.length > 100 && !isGarbage) { + return NextResponse.json({ + success: true, + transcript: text, + source: 'openai-web-search', + wordCount: text.split(/\s+/).length, + }); } + } catch (e) { + console.warn('OpenAI web_search transcript failed:', e); } } diff --git a/apps/web/src/app/api/video/route.ts b/apps/web/src/app/api/video/route.ts index 42cf8ad62..b8bc5ed88 100644 --- a/apps/web/src/app/api/video/route.ts +++ b/apps/web/src/app/api/video/route.ts @@ -1,5 +1,6 @@ import { NextResponse } from 'next/server'; import { publishEvent, EventTypes } from '@/lib/cloudevents'; +import { analyzeVideoWithGemini } from '@/lib/gemini-video-analyzer'; // Backend URL with validation - skip if not a valid URL const rawBackendUrl = process.env.BACKEND_URL || ''; @@ -112,18 +113,59 @@ export async function POST(request: Request) { } } - // ── Strategy 2: Frontend-only pipeline ── - // Works on Vercel without the Python backend by chaining the serverless - // /api/transcribe and /api/extract-events routes directly. + // ── Strategy 2: Gemini Agentic Analysis (primary frontend strategy) ── + // Uses Google Search grounding to retrieve transcripts, descriptions, + // and chapter data directly — no separate transcribe/extract steps needed. 
+ if (process.env.GEMINI_API_KEY) { + try { + await publishEvent(EventTypes.TRANSCRIPT_STARTED, { url, strategy: 'gemini-agentic' }, url); + const startTime = Date.now(); + const analysis = await analyzeVideoWithGemini(url, process.env.GEMINI_API_KEY); + const elapsed = Date.now() - startTime; + + await publishEvent(EventTypes.PIPELINE_COMPLETED, { + strategy: 'gemini-agentic', + success: true, + transcriptSegments: analysis.transcript?.length || 0, + events: analysis.events?.length || 0, + }, url); - // Use trusted backend origin instead of deriving from potentially user-controlled request data - const origin = BACKEND_URL; + return NextResponse.json({ + id: `vid_${Date.now().toString(36)}`, + status: 'complete', + processing_time_ms: elapsed, + result: { + success: true, + insights: { + summary: analysis.summary, + actions: analysis.actions?.map((a) => a.title) || [], + topics: analysis.topics || [], + sentiment: 'Neutral', + }, + transcript_segments: analysis.transcript?.length || 0, + transcript_source: 'gemini-agentic', + agents_used: ['gemini-agentic-engine'], + errors: [], + raw_response: { + title: analysis.title, + transcript: analysis.transcript, + events: analysis.events, + actions: analysis.actions, + architectureCode: analysis.architectureCode, + ingestScript: analysis.ingestScript, + }, + }, + }); + } catch (e) { + console.warn('Gemini agentic analysis failed, falling back to transcribe chain:', e); + } + } - // Step 1: Get transcript + // ── Strategy 3: Frontend-only transcribe → extract chain (fallback) ── let transcript = ''; let transcriptSource = 'none'; try { - await publishEvent(EventTypes.TRANSCRIPT_STARTED, { url, strategy: 'frontend' }, url); + await publishEvent(EventTypes.TRANSCRIPT_STARTED, { url, strategy: 'frontend-chain' }, url); const baseUrl = getBaseUrl(request); const transcribeRes = await fetch(`${baseUrl}/api/transcribe`, { method: 'POST', @@ -140,7 +182,6 @@ export async function POST(request: Request) { 
console.error('Transcript extraction failed:', e); } - // Step 2: Extract events + insights from transcript let extraction: { events?: Array<{ type: string; title: string; description?: string; timestamp?: string; priority?: string }>; actions?: Array<{ title: string }>; summary?: string; topics?: string[] } = {}; if (transcript) { try { @@ -165,7 +206,7 @@ export async function POST(request: Request) { await publishEvent( hasResults ? EventTypes.PIPELINE_COMPLETED : EventTypes.PIPELINE_FAILED, - { strategy: 'frontend', success: hasResults, transcriptSource }, + { strategy: 'frontend-chain', success: hasResults, transcriptSource }, url, ); @@ -176,7 +217,7 @@ export async function POST(request: Request) { result: { success: hasResults, insights: { - summary: extraction.summary || (hasResults ? 'Transcript extracted successfully' : 'Could not extract transcript — configure OPENAI_API_KEY or GEMINI_API_KEY'), + summary: extraction.summary || (hasResults ? 'Transcript extracted successfully' : 'Could not extract transcript — configure GEMINI_API_KEY'), actions: extraction.actions?.map((a) => a.title) || [], topics: extraction.topics || [], sentiment: 'Neutral', @@ -184,7 +225,7 @@ export async function POST(request: Request) { transcript_segments: 0, transcript_source: transcriptSource, agents_used: ['frontend-pipeline'], - errors: hasResults ? [] : ['Backend unavailable and transcript extraction failed'], + errors: hasResults ? [] : ['All strategies failed — ensure GEMINI_API_KEY is set'], raw_response: { transcript: { text: transcript }, extraction, diff --git a/apps/web/src/lib/gemini-video-analyzer.ts b/apps/web/src/lib/gemini-video-analyzer.ts new file mode 100644 index 000000000..4a3715ce2 --- /dev/null +++ b/apps/web/src/lib/gemini-video-analyzer.ts @@ -0,0 +1,173 @@ +/** + * Agentic Video Intelligence Engine — Gemini + Google Search grounding. 
+ * + * Uses the googleSearch tool as the PRIMARY mechanism to retrieve real-time + * transcripts, descriptions, chapters, and metadata from YouTube videos. + * Based on the UVAI PK=998 implementation pattern. + */ + +import { GoogleGenAI, Type } from '@google/genai'; + +export interface VideoAnalysisResult { + title: string; + summary: string; + transcript: { start: number; duration: number; text: string }[]; + events: { + timestamp: number; + label: string; + description: string; + codeMapping: string; + cloudService: string; + }[]; + actions: { + title: string; + description: string; + category: string; + estimatedMinutes: number | null; + }[]; + topics: string[]; + architectureCode: string; + ingestScript: string; +} + +/** + * Gemini response schema using the @google/genai Type system. + * Matches the UVAI structured output requirements. + */ +const responseSchema = { + type: Type.OBJECT, + properties: { + title: { type: Type.STRING }, + summary: { type: Type.STRING }, + transcript: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + start: { type: Type.NUMBER, description: 'Seconds from video start' }, + duration: { type: Type.NUMBER }, + text: { type: Type.STRING }, + }, + required: ['start', 'duration', 'text'] as const, + }, + }, + events: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + timestamp: { type: Type.NUMBER }, + label: { type: Type.STRING }, + description: { type: Type.STRING }, + codeMapping: { + type: Type.STRING, + description: 'One-line code implementation of the action', + }, + cloudService: { type: Type.STRING }, + }, + required: ['timestamp', 'label', 'description', 'codeMapping', 'cloudService'] as const, + }, + }, + actions: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + title: { type: Type.STRING }, + description: { type: Type.STRING }, + category: { + type: Type.STRING, + enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'], + }, + estimatedMinutes: { 
type: Type.NUMBER, nullable: true }, + }, + required: ['title', 'description', 'category'] as const, + }, + }, + topics: { type: Type.ARRAY, items: { type: Type.STRING } }, + architectureCode: { type: Type.STRING }, + ingestScript: { type: Type.STRING }, + }, + required: [ + 'title', + 'summary', + 'transcript', + 'events', + 'actions', + 'topics', + 'architectureCode', + 'ingestScript', + ] as const, +}; + +/** + * Build the agentic system instruction for the Gemini model. + * Implements the Think → Act → Observe → Map loop from PK=998. + */ +function buildSystemInstruction(videoUrl: string): string { + return ` +You are the Agentic Video Intelligence Engine. + +MISSION: +1. WATCH the video at ${videoUrl} by searching for its transcript, technical documentation, + channel description, and chapter markers using your googleSearch tool. +2. THINK: Analyze the sequence of technical events described in the transcript and description. + Pay special attention to chapter markers — they indicate the video creator's own breakdown + of the content structure. +3. ACT: Reconstruct the timeline and generate actionable tasks that mirror the video content. +4. OBSERVE & MAP: Extract specific "Action Events" from the video and provide a direct + code mapping for each. + +DATA STRUCTURE REQUIREMENTS: +- title: Accurate video title from search results. +- summary: A high-level technical executive summary (2-3 sentences). +- transcript: An array of {start, duration, text} reconstructed from grounding. + Use chapter timestamps and description content if a full transcript is unavailable. + Each entry should cover a meaningful segment (30-120 seconds). +- events: 3-8 key technical milestones with timestamp, label, description, and codeMapping. +- actions: 3-8 concrete tasks a developer/learner should DO after watching. +- topics: Key topics and technologies covered. 
+- architectureCode: A markdown-formatted architecture overview if technical content is discussed, + or empty string if not applicable. +- ingestScript: A Python script that processes or replicates the video's key workflow, + or empty string if not applicable. + +IMPORTANT RULES: +- Use your googleSearch tool to find the ACTUAL content. Search for the video URL, + the video title, and related terms. +- The video creator often provides detailed descriptions with chapter breakdowns. + USE that metadata — it is high-quality structured content. +- If a spoken transcript is not available, reconstruct content from the description, + chapters, comments, and related articles found via search. +- NO MOCK DATA. Only use what is found via search grounding. +- Be thorough — capture every key point, technical detail, and actionable insight. +`; +} + +/** + * Executes a deep agentic analysis of a YouTube video using Gemini + Google Search. + * This is a single API call that handles both transcription AND extraction. + */ +export async function analyzeVideoWithGemini( + videoUrl: string, + apiKey: string, +): Promise<VideoAnalysisResult> { + const ai = new GoogleGenAI({ apiKey }); + + const systemInstruction = buildSystemInstruction(videoUrl); + + const response = await ai.models.generateContent({ + model: 'gemini-2.5-flash', + contents: `Perform Agentic Grounding for Video: ${videoUrl}`, + config: { + systemInstruction, + responseMimeType: 'application/json', + responseSchema, + tools: [{ googleSearch: {} }], + temperature: 0.3, + }, + }); + + const resultText = response.text || '{}'; + return JSON.parse(resultText) as VideoAnalysisResult; +} diff --git a/apps/web/src/lib/youtube-metadata.ts b/apps/web/src/lib/youtube-metadata.ts new file mode 100644 index 000000000..fdfc95b39 --- /dev/null +++ b/apps/web/src/lib/youtube-metadata.ts @@ -0,0 +1,125 @@ +/** + * YouTube metadata fetcher — extracts title, description, chapters, + * and channel info from YouTube videos without requiring an API key. 
+ * + * Scrapes the YouTube page for `og:` meta tags and the embedded + * `shortDescription` JSON field, then parses chapter timestamps + * from the description text. + */ + +export interface YouTubeMetadata { + videoId: string; + title: string; + channel: string; + description: string; + chapters: { time: string; title: string }[]; +} + +/** + * Extract YouTube video ID from various URL formats. + */ +export function extractVideoId(url: string): string | null { + const patterns = [ + /(?:youtube\.com\/watch\?v=)([a-zA-Z0-9_-]{11})/, + /(?:youtu\.be\/)([a-zA-Z0-9_-]{11})/, + /(?:youtube\.com\/embed\/)([a-zA-Z0-9_-]{11})/, + /(?:youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/, + ]; + for (const p of patterns) { + const m = url.match(p); + if (m) return m[1]; + } + return null; +} + +/** + * Parse chapter timestamps from a YouTube description. + * Chapters appear as lines like "0:00 Introduction" or "1:23:45 Deep Dive". + */ +function parseChapters(description: string): { time: string; title: string }[] { + const lines = description.split('\n'); + const chapters: { time: string; title: string }[] = []; + const chapterPattern = /^(\d{1,2}:\d{2}(?::\d{2})?)\s+(.+)$/; + + for (const line of lines) { + const m = line.trim().match(chapterPattern); + if (m) { + chapters.push({ time: m[1], title: m[2].trim() }); + } + } + return chapters; +} + +/** + * Fetch YouTube video metadata by scraping the watch page. + * No API key required. 
+ */ +export async function fetchYouTubeMetadata(url: string): Promise<YouTubeMetadata | null> { + const videoId = extractVideoId(url); + if (!videoId) return null; + + try { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 10_000); + + const response = await fetch(`https://www.youtube.com/watch?v=${videoId}`, { + headers: { + 'User-Agent': 'Mozilla/5.0 (compatible; EventRelay/2.0)', + 'Accept-Language': 'en-US,en;q=0.9', + }, + signal: controller.signal, + }).finally(() => clearTimeout(timeout)); + + if (!response.ok) return null; + + const html = await response.text(); + + // Extract title from og:title + const titleMatch = html.match(/<meta property="og:title" content="([^"]*)"/); + const title = titleMatch ? titleMatch[1] : ''; + + // Extract channel and description from the embedded player-response JSON + const channelMatch = html.match(/"ownerChannelName":"((?:[^"\\]|\\.)*)"/); + const channel = channelMatch ? JSON.parse('"' + channelMatch[1] + '"') : ''; + + const descMatch = html.match(/"shortDescription":"((?:[^"\\]|\\.)*)"/); + const description = descMatch ? JSON.parse('"' + descMatch[1] + '"') : ''; + + return { + videoId, + title, + channel, + description, + chapters: parseChapters(description), + }; + } catch { + return null; + } +} + +/** + * Format fetched metadata as a plain-text context block for LLM prompts. + * NOTE(review): the span from the og:title regex through this function's opening + * was reconstructed from a garbled source — verify against the original file. + */ +export function formatMetadataAsContext(meta: YouTubeMetadata): string { + let context = `TITLE: ${meta.title}\n`; + if (meta.channel) { + context += `CHANNEL: ${meta.channel}\n`; + } + if (meta.description) { + context += `\nDESCRIPTION:\n${meta.description}\n`; + } + if (meta.chapters.length > 0) { + context += `\nCHAPTERS:\n`; + for (const ch of meta.chapters) { + context += ` ${ch.time} — ${ch.title}\n`; + } + } + + return context; +}