fix: restore full PK=998 pattern — responseSchema + googleSearch + gemini-3-pro-preview (#49)

groupthinking · Copilot · web-flow · commit f38b34c09a61 · 2026-02-28T16:37:18.000-06:00
The previous fix (PR #48) was a shortcut: it removed responseSchema, when the real issue was that gemini-2.5-flash does not support responseSchema and googleSearch together on Vertex AI. gemini-3-pro-preview does support the combination.

This commit restores the exact PK=998 pattern:
- gemini-video-analyzer.ts: restored responseSchema (built with the Type system), responseMimeType, and the e22Snippets field; model → gemini-3-pro-preview
- extract-events/route.ts: restored geminiResponseSchema, the Type import, and responseMimeType; model → gemini-3-pro-preview
- transcribe/route.ts: model → gemini-3-pro-preview

Tested with a Vertex AI Express Mode key on two YouTube videos. Both return structured JSON with events, transcript, actions, codeMapping, cloudService, e22Snippets, architectureCode, and ingestScript.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/apps/web/src/app/api/extract-events/route.ts b/apps/web/src/app/api/extract-events/route.ts
@@ -1,5 +1,5 @@
 import OpenAI from 'openai';
-
+import { Type } from '@google/genai';
 import { NextResponse } from 'next/server';
 import { getGeminiClient, hasGeminiKey } from '@/lib/gemini-client';
 
@@ -49,6 +49,43 @@ const extractionSchema = {
   additionalProperties: false,
 };
 
+// Gemini responseSchema using @google/genai Type system
+const geminiResponseSchema = {
+  type: Type.OBJECT,
+  properties: {
+    events: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          type: { type: Type.STRING, enum: ['action', 'topic', 'insight', 'tool', 'resource'] },
+          title: { type: Type.STRING },
+          description: { type: Type.STRING },
+          timestamp: { type: Type.STRING, nullable: true },
+          priority: { type: Type.STRING, enum: ['high', 'medium', 'low'] },
+        },
+        required: ['type', 'title', 'description', 'priority'],
+      },
+    },
+    actions: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          title: { type: Type.STRING },
+          description: { type: Type.STRING },
+          category: { type: Type.STRING, enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'] },
+          estimatedMinutes: { type: Type.NUMBER, nullable: true },
+        },
+        required: ['title', 'description', 'category'],
+      },
+    },
+    summary: { type: Type.STRING },
+    topics: { type: Type.ARRAY, items: { type: Type.STRING } },
+  },
+  required: ['events', 'actions', 'summary', 'topics'],
+};
+
 const SYSTEM_PROMPT = `You are an expert content analyst. Extract structured data from video transcripts.
 Be specific and practical — no vague or generic items.
 For events: classify type (action/topic/insight/tool/resource) and priority (high/medium/low).
@@ -91,16 +128,17 @@ async function extractWithOpenAI(trimmed: string, videoTitle?: string, videoUrl?
 async function extractWithGemini(trimmed: string, videoTitle?: string, videoUrl?: string) {
   const ai = getGeminiClient();
   const response = await ai.models.generateContent({
-    model: 'gemini-2.0-flash',
+    model: 'gemini-3-pro-preview',
     contents: `${SYSTEM_PROMPT}\n\n${buildUserPrompt(trimmed, videoTitle, videoUrl)}`,
     config: {
       temperature: 0.3,
+      responseMimeType: 'application/json',
+      responseSchema: geminiResponseSchema,
       tools: [{ googleSearch: {} }],
     },
   });
-  const text = (response.text ?? '').trim();
-  const cleaned = text.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, '');
-  return JSON.parse(cleaned);
+  const text = response.text ?? '';
+  return JSON.parse(text);
 }
 
 export async function POST(request: Request) {
@@ -146,29 +184,23 @@ export async function POST(request: Request) {
       try {
         const ai = getGeminiClient();
         const response = await ai.models.generateContent({
-          model: 'gemini-2.5-flash',
+          model: 'gemini-3-pro-preview',
           contents: `${SYSTEM_PROMPT}\n\nAnalyze this YouTube video and extract structured data.
 Use your Google Search tool to find the video's transcript, description, and chapter content.
 
 Video URL: ${videoUrl}
 ${videoTitle ? `Video Title: ${videoTitle}` : ''}
 
-Extract events, actions, summary, and topics from the actual video content found via search.
-Respond with ONLY valid JSON matching this structure:
-{
-  "events": [{"type": "action|topic|insight|tool|resource", "title": "...", "description": "...", "timestamp": "02:15" or null, "priority": "high|medium|low"}],
-  "actions": [{"title": "...", "description": "...", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": number or null}],
-  "summary": "2-3 sentence summary",
-  "topics": ["topic1", "topic2"]
-}`,
+Extract events, actions, summary, and topics from the actual video content found via search.`,
           config: {
             temperature: 0.3,
+            responseMimeType: 'application/json',
+            responseSchema: geminiResponseSchema,
             tools: [{ googleSearch: {} }],
           },
         });
-        const text = (response.text ?? '').trim();
-        const cleaned = text.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, '');
-        parsed = JSON.parse(cleaned);
+        const text = response.text ?? '';
+        parsed = JSON.parse(text);
         provider = 'gemini-search';
       } catch (e) {
         console.warn('Gemini direct video extraction failed:', e);
diff --git a/apps/web/src/app/api/transcribe/route.ts b/apps/web/src/app/api/transcribe/route.ts
@@ -106,7 +106,7 @@ export async function POST(request: Request) {
         const metadataContext = metadata ? formatMetadataAsContext(metadata) : '';
 
         const result = await ai.models.generateContent({
-          model: 'gemini-2.5-flash',
+          model: 'gemini-3-pro-preview',
           contents: `You are a video transcription assistant with access to Google Search.
 
 For the following YouTube video, use your googleSearch tool to find the ACTUAL transcript,
diff --git a/apps/web/src/lib/gemini-video-analyzer.ts b/apps/web/src/lib/gemini-video-analyzer.ts
@@ -5,10 +5,11 @@
  * transcripts, descriptions, chapters, and metadata from YouTube videos.
  * Based on the UVAI PK=998 implementation pattern.
  *
- * NOTE: Vertex AI does NOT support responseSchema (controlled generation)
- * combined with googleSearch tool. JSON structure is enforced via prompt.
+ * Uses gemini-3-pro-preview which supports responseSchema + googleSearch
+ * together (older models like gemini-2.5-flash do not).
  */
 
+import { Type } from '@google/genai';
 import { getGeminiClient } from './gemini-client';
 
 export interface VideoAnalysisResult {
@@ -31,56 +32,140 @@ export interface VideoAnalysisResult {
   topics: string[];
   architectureCode: string;
   ingestScript: string;
+  e22Snippets: {
+    title: string;
+    description: string;
+    code: string;
+    language: string;
+  }[];
 }
 
+/**
+ * Gemini response schema using the @google/genai Type system.
+ * Matches the UVAI PK=998 structured output requirements exactly.
+ */
+const responseSchema = {
+  type: Type.OBJECT,
+  properties: {
+    title: { type: Type.STRING },
+    summary: { type: Type.STRING },
+    transcript: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          start: { type: Type.NUMBER, description: 'Seconds from video start' },
+          duration: { type: Type.NUMBER },
+          text: { type: Type.STRING },
+        },
+        required: ['start', 'duration', 'text'] as const,
+      },
+    },
+    events: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          timestamp: { type: Type.NUMBER },
+          label: { type: Type.STRING },
+          description: { type: Type.STRING },
+          codeMapping: {
+            type: Type.STRING,
+            description: 'One-line code implementation of the action',
+          },
+          cloudService: { type: Type.STRING },
+        },
+        required: ['timestamp', 'label', 'description', 'codeMapping', 'cloudService'] as const,
+      },
+    },
+    actions: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          title: { type: Type.STRING },
+          description: { type: Type.STRING },
+          category: {
+            type: Type.STRING,
+            enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'],
+          },
+          estimatedMinutes: { type: Type.NUMBER, nullable: true },
+        },
+        required: ['title', 'description', 'category'] as const,
+      },
+    },
+    topics: { type: Type.ARRAY, items: { type: Type.STRING } },
+    architectureCode: { type: Type.STRING },
+    ingestScript: { type: Type.STRING },
+    e22Snippets: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          title: { type: Type.STRING },
+          description: { type: Type.STRING },
+          code: { type: Type.STRING },
+          language: { type: Type.STRING },
+        },
+        required: ['title', 'description', 'code', 'language'] as const,
+      },
+    },
+  },
+  required: [
+    'title',
+    'summary',
+    'transcript',
+    'events',
+    'actions',
+    'topics',
+    'architectureCode',
+    'ingestScript',
+    'e22Snippets',
+  ] as const,
+};
+
 /**
  * Build the agentic system instruction for the Gemini model.
  * Implements the Think → Act → Observe → Map loop from PK=998.
  */
 function buildSystemInstruction(videoUrl: string): string {
+  const videoId = videoUrl.match(/[?&]v=([^&]+)/)?.[1] || videoUrl;
   return `You are the Agentic Video Intelligence Engine.
 
 MISSION:
-1. WATCH the video at ${videoUrl} by searching for its transcript, technical documentation,
-   channel description, and chapter markers using your googleSearch tool.
-2. THINK: Analyze the sequence of technical events described in the transcript and description.
-   Pay special attention to chapter markers — they indicate the video creator's own breakdown
-   of the content structure.
-3. ACT: Reconstruct the timeline and generate actionable tasks that mirror the video content.
+1. WATCH the video (Video ID: ${videoId}) by searching for its transcript, technical documentation,
+   and chapter markers using your googleSearch tool.
+2. THINK: Analyze the sequence of technical events described in the transcript.
+3. ACT: Reconstruct the timeline and generate Python 'ingest.py' logic that mimics
+   the data patterns discussed in the video.
 4. OBSERVE & MAP: Extract specific "Action Events" from the video and provide a direct
-   code mapping for each.
+   "E22 Mapping" (code logic) for each.
+
+DATA STRUCTURE REQUIREMENTS:
+- title: Accurate video title from search results.
+- summary: A high-level technical executive summary.
+- transcript: An array of {start, duration, text} reconstructed from grounding.
+  Use chapter timestamps and description content if a full transcript is unavailable.
+  Each entry should cover a meaningful segment (30-120 seconds).
+- events: 3-5 key technical milestones with timestamp, label, description, and codeMapping.
+- actions: 3-8 concrete tasks a developer/learner should DO after watching.
+- topics: Key topics and technologies covered.
+- architectureCode: A Markdown-formatted cloud architecture blueprint.
+- ingestScript: A robust, modular Python script using Playwright for high-density ingestion.
+- e22Snippets: 3-5 production-ready code snippets for E22 cloud solutions.
 
-IMPORTANT RULES:
-- Use your googleSearch tool to find the ACTUAL content. Search for the video URL,
-  the video title, and related terms.
+STRICT RULE: NO MOCK DATA. Only use what is found via search grounding.
+- Use your googleSearch tool to find the ACTUAL content.
 - The video creator often provides detailed descriptions with chapter breakdowns.
   USE that metadata — it is high-quality structured content.
 - If a spoken transcript is not available, reconstruct content from the description,
   chapters, comments, and related articles found via search.
-- NO MOCK DATA. Only use what is found via search grounding.
-- Be thorough — capture every key point, technical detail, and actionable insight.
-
-You MUST respond with ONLY valid JSON (no markdown fences, no extra text) matching this exact structure:
-{
-  "title": "Accurate video title",
-  "summary": "2-3 sentence technical executive summary",
-  "transcript": [
-    {"start": 0, "duration": 60, "text": "segment text covering 30-120 seconds each"}
-  ],
-  "events": [
-    {"timestamp": 0, "label": "Event Name", "description": "What happened", "codeMapping": "one-line code", "cloudService": "relevant service"}
-  ],
-  "actions": [
-    {"title": "Task title", "description": "What to do", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": 15}
-  ],
-  "topics": ["topic1", "topic2"],
-  "architectureCode": "markdown architecture overview or empty string",
-  "ingestScript": "Python script or empty string"
-}`;
+- Be thorough — capture every key point, technical detail, and actionable insight.`;
 }
 
 /**
  * Executes a deep agentic analysis of a YouTube video using Gemini + Google Search.
+ * Uses gemini-3-pro-preview with responseSchema + googleSearch (PK=998 pattern).
  * This is a single API call that handles both transcription AND extraction.
  */
 export async function analyzeVideoWithGemini(
@@ -91,17 +176,16 @@ export async function analyzeVideoWithGemini(
   const systemInstruction = buildSystemInstruction(videoUrl);
 
   const response = await ai.models.generateContent({
-    model: 'gemini-2.5-flash',
+    model: 'gemini-3-pro-preview',
     contents: `Perform Agentic Grounding for Video: ${videoUrl}`,
     config: {
       systemInstruction,
+      responseMimeType: 'application/json',
+      responseSchema,
       tools: [{ googleSearch: {} }],
-      temperature: 0.3,
     },
   });
 
-  const resultText = (response.text || '').trim();
-  // Strip markdown code fences if present
-  const cleaned = resultText.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, '');
-  return JSON.parse(cleaned) as VideoAnalysisResult;
+  const resultText = response.text || '{}';
+  return JSON.parse(resultText) as VideoAnalysisResult;
 }