diff --git a/apps/web/src/app/api/extract-events/route.ts b/apps/web/src/app/api/extract-events/route.ts index 83862a2c..7f414e60 100644 --- a/apps/web/src/app/api/extract-events/route.ts +++ b/apps/web/src/app/api/extract-events/route.ts @@ -1,5 +1,5 @@ import OpenAI from 'openai'; -import { Type } from '@google/genai'; + import { NextResponse } from 'next/server'; import { getGeminiClient, hasGeminiKey } from '@/lib/gemini-client'; @@ -49,43 +49,6 @@ const extractionSchema = { additionalProperties: false, }; -// Gemini responseSchema using @google/genai Type system -const geminiResponseSchema = { - type: Type.OBJECT, - properties: { - events: { - type: Type.ARRAY, - items: { - type: Type.OBJECT, - properties: { - type: { type: Type.STRING, enum: ['action', 'topic', 'insight', 'tool', 'resource'] }, - title: { type: Type.STRING }, - description: { type: Type.STRING }, - timestamp: { type: Type.STRING, nullable: true }, - priority: { type: Type.STRING, enum: ['high', 'medium', 'low'] }, - }, - required: ['type', 'title', 'description', 'priority'], - }, - }, - actions: { - type: Type.ARRAY, - items: { - type: Type.OBJECT, - properties: { - title: { type: Type.STRING }, - description: { type: Type.STRING }, - category: { type: Type.STRING, enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'] }, - estimatedMinutes: { type: Type.NUMBER, nullable: true }, - }, - required: ['title', 'description', 'category'], - }, - }, - summary: { type: Type.STRING }, - topics: { type: Type.ARRAY, items: { type: Type.STRING } }, - }, - required: ['events', 'actions', 'summary', 'topics'], -}; - const SYSTEM_PROMPT = `You are an expert content analyst. Extract structured data from video transcripts. Be specific and practical — no vague or generic items. For events: classify type (action/topic/insight/tool/resource) and priority (high/medium/low). @@ -132,13 +95,12 @@ async function extractWithGemini(trimmed: string, videoTitle?: string, videoUrl? contents: `${SYSTEM_PROMPT}\n\n${buildUserPrompt(trimmed, videoTitle, videoUrl)}`, config: { temperature: 0.3, - responseMimeType: 'application/json', - responseSchema: geminiResponseSchema, tools: [{ googleSearch: {} }], }, }); - const text = response.text ?? ''; - return JSON.parse(text); + const text = (response.text ?? '').trim(); + const cleaned = text.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, ''); + return JSON.parse(cleaned); } export async function POST(request: Request) { @@ -201,13 +163,12 @@ Respond with ONLY valid JSON matching this structure: }`, config: { temperature: 0.3, - responseMimeType: 'application/json', - responseSchema: geminiResponseSchema, tools: [{ googleSearch: {} }], }, }); - const text = response.text ?? ''; - parsed = JSON.parse(text); + const text = (response.text ?? '').trim(); + const cleaned = text.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, ''); + parsed = JSON.parse(cleaned); provider = 'gemini-search'; } catch (e) { console.warn('Gemini direct video extraction failed:', e); diff --git a/apps/web/src/lib/gemini-client.ts b/apps/web/src/lib/gemini-client.ts index 42d61776..108a14ae 100644 --- a/apps/web/src/lib/gemini-client.ts +++ b/apps/web/src/lib/gemini-client.ts @@ -3,26 +3,22 @@ * * Supports two authentication modes: * 1. Gemini API: uses GEMINI_API_KEY or GOOGLE_API_KEY - * 2. Vertex AI: uses Vertex_AI_API_KEY with project/location - * (Express Mode — API key instead of service account) + * 2. Vertex AI Express Mode: uses Vertex_AI_API_KEY + * (apiKey + vertexai: true — no project/location needed) * - * Env vars for Vertex AI: - * - Vertex_AI_API_KEY: the Vertex AI API key - * - GOOGLE_CLOUD_PROJECT: GCP project ID (default: uvai-730bb) - * - GOOGLE_CLOUD_LOCATION: GCP location (default: us-central1) + * See: https://docs.cloud.google.com/vertex-ai/generative-ai/docs/start/express-mode/vertex-ai-express-mode-api-reference */ import { GoogleGenAI } from '@google/genai'; /** * Resolve the best available Google/Gemini API key. - * Returns the first non-empty key found, or empty string. */ export function resolveGeminiApiKey(): string { return ( + process.env.Vertex_AI_API_KEY || process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY || - process.env.Vertex_AI_API_KEY || '' ); } @@ -35,14 +31,10 @@ export function hasGeminiKey(): boolean { } /** - * Determine if we should use Vertex AI mode. - * True when the only available key is Vertex_AI_API_KEY, - * or when GOOGLE_CLOUD_PROJECT is explicitly set. + * Determine if we should use Vertex AI Express Mode. */ function shouldUseVertexAI(): boolean { - // If standard Gemini keys are set, use Gemini API - if (process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY) return false; - // If Vertex AI key is available, use Vertex AI Express Mode + if (process.env.GOOGLE_GENAI_USE_VERTEXAI === 'true') return true; if (process.env.Vertex_AI_API_KEY) return true; return false; } @@ -52,7 +44,7 @@ let _lastKey = ''; let _lastMode = ''; /** - * Get a shared GoogleGenAI instance, creating one if needed. + * Get a shared GoogleGenAI instance. * Automatically selects Gemini API or Vertex AI Express Mode. */ export function getGeminiClient(): GoogleGenAI { @@ -61,11 +53,10 @@ export function getGeminiClient(): GoogleGenAI { if (!_gemini || _lastKey !== key || _lastMode !== mode) { if (mode === 'vertex') { + // Vertex AI Express Mode: apiKey + vertexai only _gemini = new GoogleGenAI({ vertexai: true, apiKey: key, - project: process.env.GOOGLE_CLOUD_PROJECT || 'uvai-730bb', - location: process.env.GOOGLE_CLOUD_LOCATION || 'us-central1', }); } else { _gemini = new GoogleGenAI({ apiKey: key }); diff --git a/apps/web/src/lib/gemini-video-analyzer.ts b/apps/web/src/lib/gemini-video-analyzer.ts index 53ce227f..c9404dfb 100644 --- a/apps/web/src/lib/gemini-video-analyzer.ts +++ b/apps/web/src/lib/gemini-video-analyzer.ts @@ -4,9 +4,11 @@ * Uses the googleSearch tool as the PRIMARY mechanism to retrieve real-time * transcripts, descriptions, chapters, and metadata from YouTube videos. * Based on the UVAI PK=998 implementation pattern. + * + * NOTE: Vertex AI does NOT support responseSchema (controlled generation) + * combined with googleSearch tool. JSON structure is enforced via prompt. */ -import { Type } from '@google/genai'; import { getGeminiClient } from './gemini-client'; export interface VideoAnalysisResult { @@ -31,83 +33,12 @@ export interface VideoAnalysisResult { ingestScript: string; } -/** - * Gemini response schema using the @google/genai Type system. - * Matches the UVAI structured output requirements. - */ -const responseSchema = { - type: Type.OBJECT, - properties: { - title: { type: Type.STRING }, - summary: { type: Type.STRING }, - transcript: { - type: Type.ARRAY, - items: { - type: Type.OBJECT, - properties: { - start: { type: Type.NUMBER, description: 'Seconds from video start' }, - duration: { type: Type.NUMBER }, - text: { type: Type.STRING }, - }, - required: ['start', 'duration', 'text'] as const, - }, - }, - events: { - type: Type.ARRAY, - items: { - type: Type.OBJECT, - properties: { - timestamp: { type: Type.NUMBER }, - label: { type: Type.STRING }, - description: { type: Type.STRING }, - codeMapping: { - type: Type.STRING, - description: 'One-line code implementation of the action', - }, - cloudService: { type: Type.STRING }, - }, - required: ['timestamp', 'label', 'description', 'codeMapping', 'cloudService'] as const, - }, - }, - actions: { - type: Type.ARRAY, - items: { - type: Type.OBJECT, - properties: { - title: { type: Type.STRING }, - description: { type: Type.STRING }, - category: { - type: Type.STRING, - enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'], - }, - estimatedMinutes: { type: Type.NUMBER, nullable: true }, - }, - required: ['title', 'description', 'category'] as const, - }, - }, - topics: { type: Type.ARRAY, items: { type: Type.STRING } }, - architectureCode: { type: Type.STRING }, - ingestScript: { type: Type.STRING }, - }, - required: [ - 'title', - 'summary', - 'transcript', - 'events', - 'actions', - 'topics', - 'architectureCode', - 'ingestScript', - ] as const, -}; - /** * Build the agentic system instruction for the Gemini model. * Implements the Think → Act → Observe → Map loop from PK=998. */ function buildSystemInstruction(videoUrl: string): string { - return ` -You are the Agentic Video Intelligence Engine. + return `You are the Agentic Video Intelligence Engine. MISSION: 1. WATCH the video at ${videoUrl} by searching for its transcript, technical documentation, @@ -119,20 +50,6 @@ MISSION: 4. OBSERVE & MAP: Extract specific "Action Events" from the video and provide a direct code mapping for each. -DATA STRUCTURE REQUIREMENTS: -- title: Accurate video title from search results. -- summary: A high-level technical executive summary (2-3 sentences). -- transcript: An array of {start, duration, text} reconstructed from grounding. - Use chapter timestamps and description content if a full transcript is unavailable. - Each entry should cover a meaningful segment (30-120 seconds). -- events: 3-8 key technical milestones with timestamp, label, description, and codeMapping. -- actions: 3-8 concrete tasks a developer/learner should DO after watching. -- topics: Key topics and technologies covered. -- architectureCode: A markdown-formatted architecture overview if technical content is discussed, - or empty string if not applicable. -- ingestScript: A Python script that processes or replicates the video's key workflow, - or empty string if not applicable. - IMPORTANT RULES: - Use your googleSearch tool to find the ACTUAL content. Search for the video URL, the video title, and related terms. @@ -142,7 +59,24 @@ IMPORTANT RULES: chapters, comments, and related articles found via search. - NO MOCK DATA. Only use what is found via search grounding. - Be thorough — capture every key point, technical detail, and actionable insight. -`; + +You MUST respond with ONLY valid JSON (no markdown fences, no extra text) matching this exact structure: +{ + "title": "Accurate video title", + "summary": "2-3 sentence technical executive summary", + "transcript": [ + {"start": 0, "duration": 60, "text": "segment text covering 30-120 seconds each"} + ], + "events": [ + {"timestamp": 0, "label": "Event Name", "description": "What happened", "codeMapping": "one-line code", "cloudService": "relevant service"} + ], + "actions": [ + {"title": "Task title", "description": "What to do", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": 15} + ], + "topics": ["topic1", "topic2"], + "architectureCode": "markdown architecture overview or empty string", + "ingestScript": "Python script or empty string" +}`; } /** @@ -161,13 +95,13 @@ export async function analyzeVideoWithGemini( contents: `Perform Agentic Grounding for Video: ${videoUrl}`, config: { systemInstruction, - responseMimeType: 'application/json', - responseSchema, tools: [{ googleSearch: {} }], temperature: 0.3, }, }); - const resultText = response.text || '{}'; - return JSON.parse(resultText) as VideoAnalysisResult; + const resultText = (response.text || '').trim(); + // Strip markdown code fences if present + const cleaned = resultText.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, ''); + return JSON.parse(cleaned) as VideoAnalysisResult; }