Skip to content

Commit f38b34c

Browse files
fix: restore full PK=998 pattern — responseSchema + googleSearch + gemini-3-pro-preview (#49)
The previous fix (PR #48) was a shortcut: it removed responseSchema, when the real problem was the model. gemini-2.5-flash does not support responseSchema together with googleSearch on Vertex AI; gemini-3-pro-preview does support the combination.

This commit restores the exact PK=998 pattern:
- gemini-video-analyzer.ts: restored responseSchema (using the Type system), responseMimeType, and the e22Snippets field; model → gemini-3-pro-preview
- extract-events/route.ts: restored geminiResponseSchema, the Type import, and responseMimeType; model → gemini-3-pro-preview
- transcribe/route.ts: model → gemini-3-pro-preview

Tested with a Vertex AI Express Mode key on two YouTube videos. Both return structured JSON with events, transcript, actions, codeMapping, cloudService, e22Snippets, architectureCode, and ingestScript.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 924afa2 commit f38b34c

3 files changed

Lines changed: 172 additions & 56 deletions

File tree

apps/web/src/app/api/extract-events/route.ts

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import OpenAI from 'openai';
2-
2+
import { Type } from '@google/genai';
33
import { NextResponse } from 'next/server';
44
import { getGeminiClient, hasGeminiKey } from '@/lib/gemini-client';
55

@@ -49,6 +49,43 @@ const extractionSchema = {
4949
additionalProperties: false,
5050
};
5151

52+
// Structured-output schema handed to Gemini as `config.responseSchema`,
// written with the `Type` enum from @google/genai.

// One entry of the `events` array.
const geminiEventItemSchema = {
  type: Type.OBJECT,
  properties: {
    type: {
      type: Type.STRING,
      enum: ['action', 'topic', 'insight', 'tool', 'resource'],
    },
    title: { type: Type.STRING },
    description: { type: Type.STRING },
    // Nullable and deliberately absent from `required`.
    timestamp: { type: Type.STRING, nullable: true },
    priority: {
      type: Type.STRING,
      enum: ['high', 'medium', 'low'],
    },
  },
  required: ['type', 'title', 'description', 'priority'],
};

// One entry of the `actions` array.
const geminiActionItemSchema = {
  type: Type.OBJECT,
  properties: {
    title: { type: Type.STRING },
    description: { type: Type.STRING },
    category: {
      type: Type.STRING,
      enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'],
    },
    // Nullable and deliberately absent from `required`.
    estimatedMinutes: { type: Type.NUMBER, nullable: true },
  },
  required: ['title', 'description', 'category'],
};

// Top-level response object: events, actions, summary and topics are all required.
const geminiResponseSchema = {
  type: Type.OBJECT,
  properties: {
    events: { type: Type.ARRAY, items: geminiEventItemSchema },
    actions: { type: Type.ARRAY, items: geminiActionItemSchema },
    summary: { type: Type.STRING },
    topics: { type: Type.ARRAY, items: { type: Type.STRING } },
  },
  required: ['events', 'actions', 'summary', 'topics'],
};
88+
5289
const SYSTEM_PROMPT = `You are an expert content analyst. Extract structured data from video transcripts.
5390
Be specific and practical — no vague or generic items.
5491
For events: classify type (action/topic/insight/tool/resource) and priority (high/medium/low).
@@ -91,16 +128,17 @@ async function extractWithOpenAI(trimmed: string, videoTitle?: string, videoUrl?
91128
/**
 * Extracts structured data from a transcript with Gemini.
 *
 * Sends `SYSTEM_PROMPT` followed by the user prompt to `gemini-3-pro-preview`,
 * requesting JSON output constrained by `geminiResponseSchema` with the
 * `googleSearch` tool enabled, and returns the parsed response body.
 *
 * @param trimmed - Transcript text forwarded to `buildUserPrompt`.
 * @param videoTitle - Optional title forwarded to `buildUserPrompt`.
 * @param videoUrl - Optional URL forwarded to `buildUserPrompt`.
 * @returns The value produced by `JSON.parse` on the model's text output.
 * @throws Error if the model returns no text at all.
 * @throws SyntaxError if the returned text is not valid JSON.
 */
async function extractWithGemini(trimmed: string, videoTitle?: string, videoUrl?: string) {
  const ai = getGeminiClient();
  const response = await ai.models.generateContent({
    model: 'gemini-3-pro-preview',
    contents: `${SYSTEM_PROMPT}\n\n${buildUserPrompt(trimmed, videoTitle, videoUrl)}`,
    config: {
      temperature: 0.3,
      responseMimeType: 'application/json',
      responseSchema: geminiResponseSchema,
      tools: [{ googleSearch: {} }],
    },
  });

  // `response.text` may be undefined; JSON.parse('') would otherwise fail with
  // an opaque "Unexpected end of JSON input", so report the real cause instead.
  const text = (response.text ?? '').trim();
  if (text === '') {
    throw new Error('Gemini returned an empty response; expected a JSON document');
  }
  return JSON.parse(text);
}
105143

106144
export async function POST(request: Request) {
@@ -146,29 +184,23 @@ export async function POST(request: Request) {
146184
try {
147185
const ai = getGeminiClient();
148186
const response = await ai.models.generateContent({
149-
model: 'gemini-2.5-flash',
187+
model: 'gemini-3-pro-preview',
150188
contents: `${SYSTEM_PROMPT}\n\nAnalyze this YouTube video and extract structured data.
151189
Use your Google Search tool to find the video's transcript, description, and chapter content.
152190
153191
Video URL: ${videoUrl}
154192
${videoTitle ? `Video Title: ${videoTitle}` : ''}
155193
156-
Extract events, actions, summary, and topics from the actual video content found via search.
157-
Respond with ONLY valid JSON matching this structure:
158-
{
159-
"events": [{"type": "action|topic|insight|tool|resource", "title": "...", "description": "...", "timestamp": "02:15" or null, "priority": "high|medium|low"}],
160-
"actions": [{"title": "...", "description": "...", "category": "setup|build|deploy|learn|research|configure", "estimatedMinutes": number or null}],
161-
"summary": "2-3 sentence summary",
162-
"topics": ["topic1", "topic2"]
163-
}`,
194+
Extract events, actions, summary, and topics from the actual video content found via search.`,
164195
config: {
165196
temperature: 0.3,
197+
responseMimeType: 'application/json',
198+
responseSchema: geminiResponseSchema,
166199
tools: [{ googleSearch: {} }],
167200
},
168201
});
169-
const text = (response.text ?? '').trim();
170-
const cleaned = text.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, '');
171-
parsed = JSON.parse(cleaned);
202+
const text = response.text ?? '';
203+
parsed = JSON.parse(text);
172204
provider = 'gemini-search';
173205
} catch (e) {
174206
console.warn('Gemini direct video extraction failed:', e);

apps/web/src/app/api/transcribe/route.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ export async function POST(request: Request) {
106106
const metadataContext = metadata ? formatMetadataAsContext(metadata) : '';
107107

108108
const result = await ai.models.generateContent({
109-
model: 'gemini-2.5-flash',
109+
model: 'gemini-3-pro-preview',
110110
contents: `You are a video transcription assistant with access to Google Search.
111111
112112
For the following YouTube video, use your googleSearch tool to find the ACTUAL transcript,

apps/web/src/lib/gemini-video-analyzer.ts

Lines changed: 122 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
* transcripts, descriptions, chapters, and metadata from YouTube videos.
66
* Based on the UVAI PK=998 implementation pattern.
77
*
8-
* NOTE: Vertex AI does NOT support responseSchema (controlled generation)
9-
* combined with googleSearch tool. JSON structure is enforced via prompt.
8+
* Uses gemini-3-pro-preview which supports responseSchema + googleSearch
9+
* together (older models like gemini-2.5-flash do not).
1010
*/
1111

12+
import { Type } from '@google/genai';
1213
import { getGeminiClient } from './gemini-client';
1314

1415
export interface VideoAnalysisResult {
@@ -31,56 +32,140 @@ export interface VideoAnalysisResult {
3132
topics: string[];
3233
architectureCode: string;
3334
ingestScript: string;
35+
e22Snippets: {
36+
title: string;
37+
description: string;
38+
code: string;
39+
language: string;
40+
}[];
3441
}
3542

43+
/**
 * Structured-output schema for the video analysis call, written with the
 * `Type` enum from @google/genai. Per the original note it mirrors the
 * UVAI PK=998 structured output requirements. The nested item schemas are
 * named separately so the top-level shape can be read at a glance.
 */

// One reconstructed transcript segment.
const transcriptSegmentSchema = {
  type: Type.OBJECT,
  properties: {
    start: { type: Type.NUMBER, description: 'Seconds from video start' },
    duration: { type: Type.NUMBER },
    text: { type: Type.STRING },
  },
  required: ['start', 'duration', 'text'] as const,
};

// One key event together with its code mapping and cloud service.
const analysisEventSchema = {
  type: Type.OBJECT,
  properties: {
    timestamp: { type: Type.NUMBER },
    label: { type: Type.STRING },
    description: { type: Type.STRING },
    codeMapping: {
      type: Type.STRING,
      description: 'One-line code implementation of the action',
    },
    cloudService: { type: Type.STRING },
  },
  required: ['timestamp', 'label', 'description', 'codeMapping', 'cloudService'] as const,
};

// One follow-up task; `estimatedMinutes` is nullable and not required.
const analysisActionSchema = {
  type: Type.OBJECT,
  properties: {
    title: { type: Type.STRING },
    description: { type: Type.STRING },
    category: {
      type: Type.STRING,
      enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'],
    },
    estimatedMinutes: { type: Type.NUMBER, nullable: true },
  },
  required: ['title', 'description', 'category'] as const,
};

// One code snippet entry of `e22Snippets`.
const e22SnippetSchema = {
  type: Type.OBJECT,
  properties: {
    title: { type: Type.STRING },
    description: { type: Type.STRING },
    code: { type: Type.STRING },
    language: { type: Type.STRING },
  },
  required: ['title', 'description', 'code', 'language'] as const,
};

// Top-level response object; every property is required.
const responseSchema = {
  type: Type.OBJECT,
  properties: {
    title: { type: Type.STRING },
    summary: { type: Type.STRING },
    transcript: { type: Type.ARRAY, items: transcriptSegmentSchema },
    events: { type: Type.ARRAY, items: analysisEventSchema },
    actions: { type: Type.ARRAY, items: analysisActionSchema },
    topics: { type: Type.ARRAY, items: { type: Type.STRING } },
    architectureCode: { type: Type.STRING },
    ingestScript: { type: Type.STRING },
    e22Snippets: { type: Type.ARRAY, items: e22SnippetSchema },
  },
  required: [
    'title',
    'summary',
    'transcript',
    'events',
    'actions',
    'topics',
    'architectureCode',
    'ingestScript',
    'e22Snippets',
  ] as const,
};
126+
36127
/**
 * Build the agentic system instruction for the Gemini model.
 * Implements the Think → Act → Observe → Map loop from PK=998.
 *
 * The prompt refers to the video by ID. The ID is taken from the `v=` query
 * parameter, or from the path of `youtu.be/<id>`, `/shorts/<id>`,
 * `/embed/<id>` and `/live/<id>` links. A trailing `#fragment` is never part
 * of the ID. If no ID can be recognised, the input is used verbatim.
 *
 * @param videoUrl - YouTube URL (or bare video ID) of the video to analyse.
 * @returns The complete system instruction text.
 */
function buildSystemInstruction(videoUrl: string): string {
  const videoId =
    videoUrl.match(/[?&]v=([^&#]+)/)?.[1] ||
    videoUrl.match(/(?:youtu\.be\/|\/(?:shorts|embed|live)\/)([^/?&#]+)/)?.[1] ||
    videoUrl;

  return `You are the Agentic Video Intelligence Engine.

MISSION:
1. WATCH the video (Video ID: ${videoId}) by searching for its transcript, technical documentation,
and chapter markers using your googleSearch tool.
2. THINK: Analyze the sequence of technical events described in the transcript.
3. ACT: Reconstruct the timeline and generate Python 'ingest.py' logic that mimics
the data patterns discussed in the video.
4. OBSERVE & MAP: Extract specific "Action Events" from the video and provide a direct
"E22 Mapping" (code logic) for each.

DATA STRUCTURE REQUIREMENTS:
- title: Accurate video title from search results.
- summary: A high-level technical executive summary.
- transcript: An array of {start, duration, text} reconstructed from grounding.
Use chapter timestamps and description content if a full transcript is unavailable.
Each entry should cover a meaningful segment (30-120 seconds).
- events: 3-5 key technical milestones with timestamp, label, description, and codeMapping.
- actions: 3-8 concrete tasks a developer/learner should DO after watching.
- topics: Key topics and technologies covered.
- architectureCode: A Markdown-formatted cloud architecture blueprint.
- ingestScript: A robust, modular Python script using Playwright for high-density ingestion.
- e22Snippets: 3-5 production-ready code snippets for E22 cloud solutions.

STRICT RULE: NO MOCK DATA. Only use what is found via search grounding.
- Use your googleSearch tool to find the ACTUAL content.
- The video creator often provides detailed descriptions with chapter breakdowns.
USE that metadata — it is high-quality structured content.
- If a spoken transcript is not available, reconstruct content from the description,
chapters, comments, and related articles found via search.
- Be thorough — capture every key point, technical detail, and actionable insight.`;
}
81165

82166
/**
83167
* Executes a deep agentic analysis of a YouTube video using Gemini + Google Search.
168+
* Uses gemini-3-pro-preview with responseSchema + googleSearch (PK=998 pattern).
84169
* This is a single API call that handles both transcription AND extraction.
85170
*/
86171
export async function analyzeVideoWithGemini(
@@ -91,17 +176,16 @@ export async function analyzeVideoWithGemini(
91176
const systemInstruction = buildSystemInstruction(videoUrl);
92177

93178
const response = await ai.models.generateContent({
94-
model: 'gemini-2.5-flash',
179+
model: 'gemini-3-pro-preview',
95180
contents: `Perform Agentic Grounding for Video: ${videoUrl}`,
96181
config: {
97182
systemInstruction,
183+
responseMimeType: 'application/json',
184+
responseSchema,
98185
tools: [{ googleSearch: {} }],
99-
temperature: 0.3,
100186
},
101187
});
102188

103-
const resultText = (response.text || '').trim();
104-
// Strip markdown code fences if present
105-
const cleaned = resultText.replace(/^```(?:json)?\s*\n?/i, '').replace(/\n?```\s*$/i, '');
106-
return JSON.parse(cleaned) as VideoAnalysisResult;
189+
const resultText = response.text || '{}';
190+
return JSON.parse(resultText) as VideoAnalysisResult;
107191
}

0 commit comments

Comments
 (0)