diff --git a/apps/web/src/app/api/extract-events/route.ts b/apps/web/src/app/api/extract-events/route.ts index 7f414e60..a3d2cd92 100644 --- a/apps/web/src/app/api/extract-events/route.ts +++ b/apps/web/src/app/api/extract-events/route.ts @@ -1,5 +1,5 @@ import OpenAI from 'openai'; - +import { Type } from '@google/genai'; import { NextResponse } from 'next/server'; import { getGeminiClient, hasGeminiKey } from '@/lib/gemini-client'; @@ -49,6 +49,43 @@ const extractionSchema = { additionalProperties: false, }; +// Gemini responseSchema using @google/genai Type system +const geminiResponseSchema = { + type: Type.OBJECT, + properties: { + events: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + type: { type: Type.STRING, enum: ['action', 'topic', 'insight', 'tool', 'resource'] }, + title: { type: Type.STRING }, + description: { type: Type.STRING }, + timestamp: { type: Type.STRING, nullable: true }, + priority: { type: Type.STRING, enum: ['high', 'medium', 'low'] }, + }, + required: ['type', 'title', 'description', 'priority'], + }, + }, + actions: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + title: { type: Type.STRING }, + description: { type: Type.STRING }, + category: { type: Type.STRING, enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'] }, + estimatedMinutes: { type: Type.NUMBER, nullable: true }, + }, + required: ['title', 'description', 'category'], + }, + }, + summary: { type: Type.STRING }, + topics: { type: Type.ARRAY, items: { type: Type.STRING } }, + }, + required: ['events', 'actions', 'summary', 'topics'], +}; + const SYSTEM_PROMPT = `You are an expert content analyst. Extract structured data from video transcripts. Be specific and practical — no vague or generic items. For events: classify type (action/topic/insight/tool/resource) and priority (high/medium/low). @@ -91,16 +128,17 @@ async function extractWithOpenAI(trimmed: string, videoTitle?: string, videoUrl? async function extractWithGemini(trimmed: string, videoTitle?: string, videoUrl?: string) { const ai = getGeminiClient(); const response = await ai.models.generateContent({ - model: 'gemini-2.0-flash', + model: 'gemini-3-pro-preview', contents: `${SYSTEM_PROMPT}\n\n${buildUserPrompt(trimmed, videoTitle, videoUrl)}`, config: { temperature: 0.3, + responseMimeType: 'application/json', + responseSchema: geminiResponseSchema, tools: [{ googleSearch: {} }], }, }); - const text = (response.text ?? '').trim(); - const cleaned = text.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, ''); - return JSON.parse(cleaned); + const text = response.text ?? ''; + return JSON.parse(text); } export async function POST(request: Request) { @@ -146,29 +184,23 @@ export async function POST(request: Request) { try { const ai = getGeminiClient(); const response = await ai.models.generateContent({ - model: 'gemini-2.5-flash', + model: 'gemini-3-pro-preview', contents: `${SYSTEM_PROMPT}\n\nAnalyze this YouTube video and extract structured data. Use your Google Search tool to find the video's transcript, description, and chapter content. Video URL: ${videoUrl} ${videoTitle ? `Video Title: ${videoTitle}` : ''} -Extract events, actions, summary, and topics from the actual video content found via search. -Respond with ONLY valid JSON matching this structure: -{ - "events": [{"type": "action|topic|insight|tool|resource", "title": "...", "description": "...", "timestamp": "02:15" or null, "priority": "high|medium|low"}], - "actions": [{"title": "...", "description": "...", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": number or null}], - "summary": "2-3 sentence summary", - "topics": ["topic1", "topic2"] -}`, +Extract events, actions, summary, and topics from the actual video content found via search.`, config: { temperature: 0.3, + responseMimeType: 'application/json', + responseSchema: geminiResponseSchema, tools: [{ googleSearch: {} }], }, }); - const text = (response.text ?? '').trim(); - const cleaned = text.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, ''); - parsed = JSON.parse(cleaned); + const text = response.text ?? ''; + parsed = JSON.parse(text); provider = 'gemini-search'; } catch (e) { console.warn('Gemini direct video extraction failed:', e); diff --git a/apps/web/src/app/api/transcribe/route.ts b/apps/web/src/app/api/transcribe/route.ts index adc20733..73149708 100644 --- a/apps/web/src/app/api/transcribe/route.ts +++ b/apps/web/src/app/api/transcribe/route.ts @@ -106,7 +106,7 @@ export async function POST(request: Request) { const metadataContext = metadata ? formatMetadataAsContext(metadata) : ''; const result = await ai.models.generateContent({ - model: 'gemini-2.5-flash', + model: 'gemini-3-pro-preview', contents: `You are a video transcription assistant with access to Google Search. For the following YouTube video, use your googleSearch tool to find the ACTUAL transcript, diff --git a/apps/web/src/lib/gemini-video-analyzer.ts b/apps/web/src/lib/gemini-video-analyzer.ts index c9404dfb..63ffa392 100644 --- a/apps/web/src/lib/gemini-video-analyzer.ts +++ b/apps/web/src/lib/gemini-video-analyzer.ts @@ -5,10 +5,11 @@ * transcripts, descriptions, chapters, and metadata from YouTube videos. * Based on the UVAI PK=998 implementation pattern. * - * NOTE: Vertex AI does NOT support responseSchema (controlled generation) - * combined with googleSearch tool. JSON structure is enforced via prompt. + * Uses gemini-3-pro-preview which supports responseSchema + googleSearch + * together (older models like gemini-2.5-flash do not). */ +import { Type } from '@google/genai'; import { getGeminiClient } from './gemini-client'; export interface VideoAnalysisResult { @@ -31,56 +32,140 @@ export interface VideoAnalysisResult { topics: string[]; architectureCode: string; ingestScript: string; + e22Snippets: { + title: string; + description: string; + code: string; + language: string; + }[]; } +/** + * Gemini response schema using the @google/genai Type system. + * Matches the UVAI PK=998 structured output requirements exactly. + */ +const responseSchema = { + type: Type.OBJECT, + properties: { + title: { type: Type.STRING }, + summary: { type: Type.STRING }, + transcript: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + start: { type: Type.NUMBER, description: 'Seconds from video start' }, + duration: { type: Type.NUMBER }, + text: { type: Type.STRING }, + }, + required: ['start', 'duration', 'text'] as const, + }, + }, + events: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + timestamp: { type: Type.NUMBER }, + label: { type: Type.STRING }, + description: { type: Type.STRING }, + codeMapping: { + type: Type.STRING, + description: 'One-line code implementation of the action', + }, + cloudService: { type: Type.STRING }, + }, + required: ['timestamp', 'label', 'description', 'codeMapping', 'cloudService'] as const, + }, + }, + actions: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + title: { type: Type.STRING }, + description: { type: Type.STRING }, + category: { + type: Type.STRING, + enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'], + }, + estimatedMinutes: { type: Type.NUMBER, nullable: true }, + }, + required: ['title', 'description', 'category'] as const, + }, + }, + topics: { type: Type.ARRAY, items: { type: Type.STRING } }, + architectureCode: { type: Type.STRING }, + ingestScript: { type: Type.STRING }, + e22Snippets: { + type: Type.ARRAY, + items: { + type: Type.OBJECT, + properties: { + title: { type: Type.STRING }, + description: { type: Type.STRING }, + code: { type: Type.STRING }, + language: { type: Type.STRING }, + }, + required: ['title', 'description', 'code', 'language'] as const, + }, + }, + }, + required: [ + 'title', + 'summary', + 'transcript', + 'events', + 'actions', + 'topics', + 'architectureCode', + 'ingestScript', + 'e22Snippets', + ] as const, +}; + /** * Build the agentic system instruction for the Gemini model. * Implements the Think → Act → Observe → Map loop from PK=998. */ function buildSystemInstruction(videoUrl: string): string { + const videoId = videoUrl.match(/[?&]v=([^&]+)/)?.[1] || videoUrl; return `You are the Agentic Video Intelligence Engine. MISSION: -1. WATCH the video at ${videoUrl} by searching for its transcript, technical documentation, - channel description, and chapter markers using your googleSearch tool. -2. THINK: Analyze the sequence of technical events described in the transcript and description. - Pay special attention to chapter markers — they indicate the video creator's own breakdown - of the content structure. -3. ACT: Reconstruct the timeline and generate actionable tasks that mirror the video content. +1. WATCH the video (Video ID: ${videoId}) by searching for its transcript, technical documentation, + and chapter markers using your googleSearch tool. +2. THINK: Analyze the sequence of technical events described in the transcript. +3. ACT: Reconstruct the timeline and generate Python 'ingest.py' logic that mimics + the data patterns discussed in the video. 4. OBSERVE & MAP: Extract specific "Action Events" from the video and provide a direct - code mapping for each. + "E22 Mapping" (code logic) for each. + +DATA STRUCTURE REQUIREMENTS: +- title: Accurate video title from search results. +- summary: A high-level technical executive summary. +- transcript: An array of {start, duration, text} reconstructed from grounding. + Use chapter timestamps and description content if a full transcript is unavailable. + Each entry should cover a meaningful segment (30-120 seconds). +- events: 3-5 key technical milestones with timestamp, label, description, and codeMapping. +- actions: 3-8 concrete tasks a developer/learner should DO after watching. +- topics: Key topics and technologies covered. +- architectureCode: A Markdown-formatted cloud architecture blueprint. +- ingestScript: A robust, modular Python script using Playwright for high-density ingestion. +- e22Snippets: 3-5 production-ready code snippets for E22 cloud solutions. -IMPORTANT RULES: -- Use your googleSearch tool to find the ACTUAL content. Search for the video URL, - the video title, and related terms. +STRICT RULE: NO MOCK DATA. Only use what is found via search grounding. +- Use your googleSearch tool to find the ACTUAL content. - The video creator often provides detailed descriptions with chapter breakdowns. USE that metadata — it is high-quality structured content. - If a spoken transcript is not available, reconstruct content from the description, chapters, comments, and related articles found via search. -- NO MOCK DATA. Only use what is found via search grounding. -- Be thorough — capture every key point, technical detail, and actionable insight. - -You MUST respond with ONLY valid JSON (no markdown fences, no extra text) matching this exact structure: -{ - "title": "Accurate video title", - "summary": "2-3 sentence technical executive summary", - "transcript": [ - {"start": 0, "duration": 60, "text": "segment text covering 30-120 seconds each"} - ], - "events": [ - {"timestamp": 0, "label": "Event Name", "description": "What happened", "codeMapping": "one-line code", "cloudService": "relevant service"} - ], - "actions": [ - {"title": "Task title", "description": "What to do", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": 15} - ], - "topics": ["topic1", "topic2"], - "architectureCode": "markdown architecture overview or empty string", - "ingestScript": "Python script or empty string" -}`; +- Be thorough — capture every key point, technical detail, and actionable insight.`; } /** * Executes a deep agentic analysis of a YouTube video using Gemini + Google Search. + * Uses gemini-3-pro-preview with responseSchema + googleSearch (PK=998 pattern). * This is a single API call that handles both transcription AND extraction. */ export async function analyzeVideoWithGemini( @@ -91,17 +176,16 @@ export async function analyzeVideoWithGemini( const systemInstruction = buildSystemInstruction(videoUrl); const response = await ai.models.generateContent({ - model: 'gemini-2.5-flash', + model: 'gemini-3-pro-preview', contents: `Perform Agentic Grounding for Video: ${videoUrl}`, config: { systemInstruction, + responseMimeType: 'application/json', + responseSchema, tools: [{ googleSearch: {} }], - temperature: 0.3, }, }); - const resultText = (response.text || '').trim(); - // Strip markdown code fences if present - const cleaned = resultText.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, ''); - return JSON.parse(cleaned) as VideoAnalysisResult; + const resultText = response.text || '{}'; + return JSON.parse(resultText) as VideoAnalysisResult; }