-
Notifications
You must be signed in to change notification settings - Fork 0
Add frontend-only video analysis pipeline with multi-strategy transcription #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
db6541f
c095d73
410ce1a
2288b6c
ff6d5e4
8792227
86a4208
0b66974
f925e30
ebf0327
d95c7f7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,23 +1,29 @@ | ||
| import OpenAI from 'openai'; | ||
| import { GoogleGenerativeAI } from '@google/generative-ai'; | ||
| import { NextResponse } from 'next/server'; | ||
|
|
||
| let _client: OpenAI | null = null; | ||
| function getClient() { | ||
| if (!_client) _client = new OpenAI(); | ||
| return _client; | ||
| let _openai: OpenAI | null = null; | ||
| function getOpenAI() { | ||
| if (!_openai) _openai = new OpenAI(); | ||
| return _openai; | ||
| } | ||
|
|
||
| let _gemini: GoogleGenerativeAI | null = null; | ||
| function getGemini() { | ||
| if (!_gemini) _gemini = new GoogleGenerativeAI(process.env.GEMINI_API_KEY || ''); | ||
| return _gemini; | ||
| } | ||
|
|
||
| const BACKEND_URL = process.env.BACKEND_URL || 'http://localhost:8000'; | ||
|
|
||
| /** | ||
| * OpenAI STT fallback — used when YouTube's auto-caption API fails or | ||
| * returns low-quality transcripts. Uses gpt-4o-mini-transcribe for | ||
| * cost-effective, high-quality transcription. | ||
| * | ||
| * POST /api/transcribe | ||
| * { url: string } — YouTube URL (tries YouTube API first, falls back to STT) | ||
| * { audioUrl: string } — Direct audio URL (goes straight to STT) | ||
| * | ||
| * Returns { success, transcript, source: 'youtube' | 'openai-stt' } | ||
| * Multi-strategy transcript extraction: | ||
| * 1. YouTube captions via backend (fast + free) | ||
| * 2. OpenAI Responses API with web_search (finds transcripts online) | ||
| * 3. Gemini fallback (if OpenAI unavailable) | ||
| * 4. Direct audio STT via OpenAI Whisper | ||
| */ | ||
| export async function POST(request: Request) { | ||
| try { | ||
|
|
@@ -30,18 +36,24 @@ export async function POST(request: Request) { | |
| ); | ||
| } | ||
|
|
||
| // Strategy 1: Try YouTube transcript API via backend first (fast + free) | ||
| // Strategy 1: Try YouTube transcript API via backend (fast + free) | ||
| if (url && !audioUrl) { | ||
| try { | ||
| const controller = new AbortController(); | ||
| const timeout = setTimeout(() => controller.abort(), 8_000); | ||
vercel[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| const ytResponse = await fetch(`${BACKEND_URL}/api/v1/transcript-action`, { | ||
| method: 'POST', | ||
| headers: { 'Content-Type': 'application/json' }, | ||
| body: JSON.stringify({ video_url: url, language }), | ||
| }); | ||
| signal: controller.signal, | ||
| }).finally(() => clearTimeout(timeout)); | ||
|
|
||
|
Comment on lines
+42
to
51
|
||
| if (ytResponse.ok) { | ||
| const result = await ytResponse.json(); | ||
| const segments = result.transcript || []; | ||
|
|
||
| // Handle transcript as segments array | ||
| const segments = Array.isArray(result.transcript) ? result.transcript : []; | ||
| if (segments.length > 0) { | ||
| const fullText = segments | ||
| .map((s: { text?: string }) => s.text || '') | ||
|
|
@@ -58,42 +70,85 @@ export async function POST(request: Request) { | |
| }); | ||
| } | ||
| } | ||
|
|
||
| // Handle transcript as { text: string } | ||
| const transcriptText = | ||
| typeof result.transcript === 'string' | ||
| ? result.transcript | ||
| : result.transcript?.text; | ||
| if (typeof transcriptText === 'string' && transcriptText.length > 50) { | ||
| return NextResponse.json({ | ||
| success: true, | ||
| transcript: transcriptText, | ||
| source: 'youtube', | ||
| wordCount: transcriptText.split(/\s+/).length, | ||
| }); | ||
| } | ||
| } | ||
| } catch { | ||
| // YouTube API failed — fall through to OpenAI STT | ||
| console.log('YouTube transcript unavailable, falling back to OpenAI STT'); | ||
| console.log('YouTube transcript unavailable, falling back to AI providers'); | ||
| } | ||
| } | ||
|
|
||
| // Strategy 2: OpenAI Speech-to-Text via Responses API | ||
| // For YouTube URLs without direct audio, use the Responses API with | ||
| // web_search to find and analyze the content | ||
| if (url && !audioUrl) { | ||
| // Use Responses API to transcribe/summarize the video content | ||
| const response = await getClient().responses.create({ | ||
| model: 'gpt-4o-mini', | ||
| instructions: `You are a video content transcription assistant. | ||
| // Strategy 2: OpenAI Responses API with web_search | ||
| if (url && !audioUrl && process.env.OPENAI_API_KEY) { | ||
| try { | ||
| const response = await getOpenAI().responses.create({ | ||
| model: 'gpt-4o-mini', | ||
| instructions: `You are a video content transcription assistant. | ||
| Given a YouTube URL, use web search to find the video's transcript or detailed content. | ||
| Return the full transcript text if available, or a detailed content summary. | ||
| Be thorough — capture all key points, quotes, and technical details.`, | ||
| tools: [{ type: 'web_search' as const }], | ||
| input: `Find and return the full transcript or detailed content of this video: ${url}`, | ||
| }); | ||
| tools: [{ type: 'web_search' as const }], | ||
| input: `Find and return the full transcript or detailed content of this video: ${url}`, | ||
| }); | ||
|
|
||
| const text = response.output_text || ''; | ||
|
|
||
| const text = response.output_text || ''; | ||
| if (text.length > 100) { | ||
| return NextResponse.json({ | ||
| success: true, | ||
| transcript: text, | ||
| source: 'openai-web-search', | ||
| wordCount: text.split(/\s+/).length, | ||
| }); | ||
| } | ||
| } catch (e) { | ||
| console.warn('OpenAI web_search transcript failed:', e); | ||
| } | ||
| } | ||
|
|
||
| if (text.length > 100) { | ||
| return NextResponse.json({ | ||
| success: true, | ||
| transcript: text, | ||
| source: 'openai-web-search', | ||
| wordCount: text.split(/\s+/).length, | ||
| // Strategy 3: Gemini fallback (when OpenAI unavailable) | ||
| if (url && !audioUrl && process.env.GEMINI_API_KEY) { | ||
| try { | ||
| const model = getGemini().getGenerativeModel({ | ||
| model: 'gemini-2.0-flash', | ||
| generationConfig: { temperature: 0.2 }, | ||
| }); | ||
|
|
||
| const result = await model.generateContent( | ||
| `You are a video content transcription assistant. ` + | ||
| `For the following YouTube video URL, provide a detailed transcript or content summary. ` + | ||
| `Include all key points, technical details, quotes, and actionable insights. ` + | ||
| `Be thorough and comprehensive.\n\nVideo URL: ${url}` | ||
| ); | ||
| const text = result.response.text(); | ||
|
|
||
| if (text.length > 100) { | ||
| return NextResponse.json({ | ||
| success: true, | ||
| transcript: text, | ||
| source: 'gemini', | ||
| wordCount: text.split(/\s+/).length, | ||
| }); | ||
| } | ||
| } catch (e) { | ||
| console.warn('Gemini transcript fallback failed:', e); | ||
| } | ||
| } | ||
|
|
||
| // Strategy 3: Direct audio file transcription via OpenAI Whisper/STT | ||
| if (audioUrl) { | ||
| // Strategy 4: Direct audio file transcription via OpenAI Whisper | ||
| if (audioUrl && process.env.OPENAI_API_KEY) { | ||
| const audioResponse = await fetch(audioUrl); | ||
| if (!audioResponse.ok) { | ||
| return NextResponse.json( | ||
|
|
@@ -105,7 +160,7 @@ Be thorough — capture all key points, quotes, and technical details.`, | |
| const audioBlob = await audioResponse.blob(); | ||
| const audioFile = new File([audioBlob], 'audio.mp3', { type: 'audio/mpeg' }); | ||
|
|
||
| const transcription = await getClient().audio.transcriptions.create({ | ||
| const transcription = await getOpenAI().audio.transcriptions.create({ | ||
| model: 'gpt-4o-mini-transcribe', | ||
| file: audioFile, | ||
| language, | ||
|
|
@@ -119,9 +174,13 @@ Be thorough — capture all key points, quotes, and technical details.`, | |
| }); | ||
| } | ||
|
|
||
| // No strategy succeeded | ||
| const hasKeys = !!(process.env.OPENAI_API_KEY || process.env.GEMINI_API_KEY); | ||
| return NextResponse.json({ | ||
| success: false, | ||
| error: 'Could not transcribe video — YouTube API and OpenAI STT both failed', | ||
| error: hasKeys | ||
| ? 'Could not transcribe video — all strategies failed' | ||
| : 'No AI API key configured. Set OPENAI_API_KEY or GEMINI_API_KEY in Vercel environment variables.', | ||
| transcript: '', | ||
| }); | ||
| } catch (error) { | ||
|
|
@@ -131,7 +190,7 @@ Be thorough — capture all key points, quotes, and technical details.`, | |
| return NextResponse.json({ | ||
| success: false, | ||
| error: message.includes('API key') | ||
| ? 'OpenAI API key not configured. Set OPENAI_API_KEY in your environment.' | ||
| ? 'AI API key not configured. Set OPENAI_API_KEY or GEMINI_API_KEY.' | ||
| : message, | ||
| transcript: '', | ||
| }); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Gemini client initialization with an empty-string fallback silently creates an invalid client when the API key is missing.