55 * transcripts, descriptions, chapters, and metadata from YouTube videos.
66 * Based on the UVAI PK=998 implementation pattern.
77 *
8- * NOTE: Vertex AI does NOT support responseSchema (controlled generation)
9- * combined with googleSearch tool. JSON structure is enforced via prompt .
8+ * Uses gemini-3-pro-preview which supports responseSchema + googleSearch
9+ * together (older models like gemini-2.5-flash do not) .
1010 */
1111
12+ import { Type } from '@google/genai' ;
1213import { getGeminiClient } from './gemini-client' ;
1314
1415export interface VideoAnalysisResult {
@@ -31,56 +32,140 @@ export interface VideoAnalysisResult {
3132 topics : string [ ] ;
3233 architectureCode : string ;
3334 ingestScript : string ;
35+ e22Snippets : {
36+ title : string ;
37+ description : string ;
38+ code : string ;
39+ language : string ;
40+ } [ ] ;
3441}
3542
/** Item schema for each entry of the `transcript` array. */
const transcriptSegmentSchema = {
  type: Type.OBJECT,
  properties: {
    start: { type: Type.NUMBER, description: 'Seconds from video start' },
    duration: { type: Type.NUMBER },
    text: { type: Type.STRING },
  },
  required: ['start', 'duration', 'text'] as const,
};

/** Item schema for each entry of the `events` array. */
const eventSchema = {
  type: Type.OBJECT,
  properties: {
    timestamp: { type: Type.NUMBER },
    label: { type: Type.STRING },
    description: { type: Type.STRING },
    codeMapping: {
      type: Type.STRING,
      description: 'One-line code implementation of the action',
    },
    cloudService: { type: Type.STRING },
  },
  required: ['timestamp', 'label', 'description', 'codeMapping', 'cloudService'] as const,
};

/**
 * Item schema for each entry of the `actions` array.
 * `estimatedMinutes` is the only property that is optional and nullable.
 */
const actionSchema = {
  type: Type.OBJECT,
  properties: {
    title: { type: Type.STRING },
    description: { type: Type.STRING },
    category: {
      type: Type.STRING,
      enum: ['setup', 'build', 'deploy', 'learn', 'research', 'configure'],
    },
    estimatedMinutes: { type: Type.NUMBER, nullable: true },
  },
  required: ['title', 'description', 'category'] as const,
};

/** Item schema for each entry of the `e22Snippets` array. */
const e22SnippetSchema = {
  type: Type.OBJECT,
  properties: {
    title: { type: Type.STRING },
    description: { type: Type.STRING },
    code: { type: Type.STRING },
    language: { type: Type.STRING },
  },
  required: ['title', 'description', 'code', 'language'] as const,
};

/**
 * Structured-output schema for the Gemini response, expressed with the
 * `Type` enum from `@google/genai`.
 *
 * Every top-level property is listed in `required`. The array item shapes
 * are declared as separate constants above so that each one can be read on
 * its own.
 */
const responseSchema = {
  type: Type.OBJECT,
  properties: {
    title: { type: Type.STRING },
    summary: { type: Type.STRING },
    transcript: { type: Type.ARRAY, items: transcriptSegmentSchema },
    events: { type: Type.ARRAY, items: eventSchema },
    actions: { type: Type.ARRAY, items: actionSchema },
    topics: { type: Type.ARRAY, items: { type: Type.STRING } },
    architectureCode: { type: Type.STRING },
    ingestScript: { type: Type.STRING },
    e22Snippets: { type: Type.ARRAY, items: e22SnippetSchema },
  },
  required: [
    'title',
    'summary',
    'transcript',
    'events',
    'actions',
    'topics',
    'architectureCode',
    'ingestScript',
    'e22Snippets',
  ] as const,
};
126+
/**
 * URL shapes from which a YouTube video ID can be read, tried in order.
 * Each pattern captures the ID in group 1 and stops at `?`, `&`, `#` or `/`
 * so that trailing query parameters and fragments never leak into the ID.
 */
const VIDEO_ID_PATTERNS: readonly RegExp[] = [
  // https://www.youtube.com/watch?v=<id>&t=42s
  /[?&]v=([^&#]+)/,
  // https://youtu.be/<id>?si=...
  /youtu\.be\/([^?&#/]+)/,
  // https://www.youtube.com/embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
  /youtube(?:-nocookie)?\.com\/(?:embed|shorts|live|v)\/([^?&#/]+)/,
];

/**
 * Pull the video ID out of a YouTube URL.
 *
 * @param videoUrl - A YouTube URL in any of the supported shapes, or a bare ID.
 * @returns The captured ID, or `videoUrl` unchanged when no pattern matches
 *          (so a bare ID passed by the caller is used as-is).
 */
function extractVideoId(videoUrl: string): string {
  for (const pattern of VIDEO_ID_PATTERNS) {
    const id = pattern.exec(videoUrl)?.[1];
    if (id) {
      return id;
    }
  }
  return videoUrl;
}

/**
 * Build the agentic system instruction for the Gemini model.
 * Implements the Think → Act → Observe → Map loop from PK=998.
 *
 * The prompt refers to the video by its ID rather than the full URL. The
 * expected output fields are described in prose only; no JSON skeleton is
 * embedded in the prompt.
 *
 * @param videoUrl - YouTube URL (watch, youtu.be, embed, shorts, live) or bare video ID.
 * @returns The system instruction text.
 */
function buildSystemInstruction(videoUrl: string): string {
  const videoId = extractVideoId(videoUrl);
  return `You are the Agentic Video Intelligence Engine.

MISSION:
1. WATCH the video (Video ID: ${videoId}) by searching for its transcript, technical documentation,
   and chapter markers using your googleSearch tool.
2. THINK: Analyze the sequence of technical events described in the transcript.
3. ACT: Reconstruct the timeline and generate Python 'ingest.py' logic that mimics
   the data patterns discussed in the video.
4. OBSERVE & MAP: Extract specific "Action Events" from the video and provide a direct
   "E22 Mapping" (code logic) for each.

DATA STRUCTURE REQUIREMENTS:
- title: Accurate video title from search results.
- summary: A high-level technical executive summary.
- transcript: An array of {start, duration, text} reconstructed from grounding.
  Use chapter timestamps and description content if a full transcript is unavailable.
  Each entry should cover a meaningful segment (30-120 seconds).
- events: 3-5 key technical milestones with timestamp, label, description, and codeMapping.
- actions: 3-8 concrete tasks a developer/learner should DO after watching.
- topics: Key topics and technologies covered.
- architectureCode: A Markdown-formatted cloud architecture blueprint.
- ingestScript: A robust, modular Python script using Playwright for high-density ingestion.
- e22Snippets: 3-5 production-ready code snippets for E22 cloud solutions.

STRICT RULE: NO MOCK DATA. Only use what is found via search grounding.
- Use your googleSearch tool to find the ACTUAL content.
- The video creator often provides detailed descriptions with chapter breakdowns.
  USE that metadata — it is high-quality structured content.
- If a spoken transcript is not available, reconstruct content from the description,
  chapters, comments, and related articles found via search.
- Be thorough — capture every key point, technical detail, and actionable insight.`;
}
81165
82166/**
83167 * Executes a deep agentic analysis of a YouTube video using Gemini + Google Search.
168+ * Uses gemini-3-pro-preview with responseSchema + googleSearch (PK=998 pattern).
84169 * This is a single API call that handles both transcription AND extraction.
85170 */
86171export async function analyzeVideoWithGemini (
@@ -91,17 +176,16 @@ export async function analyzeVideoWithGemini(
91176 const systemInstruction = buildSystemInstruction ( videoUrl ) ;
92177
93178 const response = await ai . models . generateContent ( {
94- model : 'gemini-2.5-flash ' ,
179+ model : 'gemini-3-pro-preview ' ,
95180 contents : `Perform Agentic Grounding for Video: ${ videoUrl } ` ,
96181 config : {
97182 systemInstruction,
183+ responseMimeType : 'application/json' ,
184+ responseSchema,
98185 tools : [ { googleSearch : { } } ] ,
99- temperature : 0.3 ,
100186 } ,
101187 } ) ;
102188
103- const resultText = ( response . text || '' ) . trim ( ) ;
104- // Strip markdown code fences if present
105- const cleaned = resultText . replace ( / ^ ` ` ` (?: j s o n ) ? \s * \n ? / i, '' ) . replace ( / \n ? ` ` ` \s * $ / i, '' ) ;
106- return JSON . parse ( cleaned ) as VideoAnalysisResult ;
189+ const resultText = response . text || '{}' ;
190+ return JSON . parse ( resultText ) as VideoAnalysisResult ;
107191}
0 commit comments