Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 98 additions & 39 deletions apps/web/src/app/api/transcribe/route.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
import OpenAI from 'openai';
import { GoogleGenerativeAI } from '@google/generative-ai';
import { NextResponse } from 'next/server';

let _client: OpenAI | null = null;
function getClient() {
if (!_client) _client = new OpenAI();
return _client;
let _openai: OpenAI | null = null;
function getOpenAI() {
if (!_openai) _openai = new OpenAI();
return _openai;
}

let _gemini: GoogleGenerativeAI | null = null;
function getGemini() {
Copy link
Contributor

@vercel vercel bot Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gemini client initialization falls back to an empty string, which silently creates an invalid client when the API key is missing.

Fix on Vercel

if (!_gemini) _gemini = new GoogleGenerativeAI(process.env.GEMINI_API_KEY || '');
return _gemini;
}

const BACKEND_URL = process.env.BACKEND_URL || 'http://localhost:8000';

/**
* OpenAI STT fallback — used when YouTube's auto-caption API fails or
* returns low-quality transcripts. Uses gpt-4o-mini-transcribe for
* cost-effective, high-quality transcription.
*
* POST /api/transcribe
* { url: string } — YouTube URL (tries YouTube API first, falls back to STT)
* { audioUrl: string } — Direct audio URL (goes straight to STT)
*
* Returns { success, transcript, source: 'youtube' | 'openai-stt' }
* Multi-strategy transcript extraction:
* 1. YouTube captions via backend (fast + free)
* 2. OpenAI Responses API with web_search (finds transcripts online)
* 3. Gemini fallback (if OpenAI is unavailable, fails, or returns too little text)
* 4. Direct audio STT via OpenAI (gpt-4o-mini-transcribe)
*/
export async function POST(request: Request) {
try {
Expand All @@ -30,18 +36,24 @@ export async function POST(request: Request) {
);
}

// Strategy 1: Try YouTube transcript API via backend first (fast + free)
// Strategy 1: Try YouTube transcript API via backend (fast + free)
if (url && !audioUrl) {
try {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), 8_000);

const ytResponse = await fetch(`${BACKEND_URL}/api/v1/transcript-action`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ video_url: url, language }),
});
signal: controller.signal,
}).finally(() => clearTimeout(timeout));

Comment on lines +42 to 51
Copy link

Copilot AI Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The backend transcript fetch uses an AbortController timeout, but clearTimeout(timeout) only runs after a successful fetch. If the request aborts or throws, the timeout isn’t cleared. Wrap the fetch in try { ... } finally { clearTimeout(timeout) } to avoid leaving timers pending in the error path.

Copilot uses AI. Check for mistakes.
if (ytResponse.ok) {
const result = await ytResponse.json();
const segments = result.transcript || [];

// Handle transcript as segments array
const segments = Array.isArray(result.transcript) ? result.transcript : [];
if (segments.length > 0) {
const fullText = segments
.map((s: { text?: string }) => s.text || '')
Expand All @@ -58,42 +70,85 @@ export async function POST(request: Request) {
});
}
}

// Handle transcript as { text: string }
const transcriptText =
typeof result.transcript === 'string'
? result.transcript
: result.transcript?.text;
if (typeof transcriptText === 'string' && transcriptText.length > 50) {
return NextResponse.json({
success: true,
transcript: transcriptText,
source: 'youtube',
wordCount: transcriptText.split(/\s+/).length,
});
}
}
} catch {
// YouTube API failed — fall through to OpenAI STT
console.log('YouTube transcript unavailable, falling back to OpenAI STT');
console.log('YouTube transcript unavailable, falling back to AI providers');
}
}

// Strategy 2: OpenAI Speech-to-Text via Responses API
// For YouTube URLs without direct audio, use the Responses API with
// web_search to find and analyze the content
if (url && !audioUrl) {
// Use Responses API to transcribe/summarize the video content
const response = await getClient().responses.create({
model: 'gpt-4o-mini',
instructions: `You are a video content transcription assistant.
// Strategy 2: OpenAI Responses API with web_search
if (url && !audioUrl && process.env.OPENAI_API_KEY) {
try {
const response = await getOpenAI().responses.create({
model: 'gpt-4o-mini',
instructions: `You are a video content transcription assistant.
Given a YouTube URL, use web search to find the video's transcript or detailed content.
Return the full transcript text if available, or a detailed content summary.
Be thorough — capture all key points, quotes, and technical details.`,
tools: [{ type: 'web_search' as const }],
input: `Find and return the full transcript or detailed content of this video: ${url}`,
});
tools: [{ type: 'web_search' as const }],
input: `Find and return the full transcript or detailed content of this video: ${url}`,
});

const text = response.output_text || '';

const text = response.output_text || '';
if (text.length > 100) {
return NextResponse.json({
success: true,
transcript: text,
source: 'openai-web-search',
wordCount: text.split(/\s+/).length,
});
}
} catch (e) {
console.warn('OpenAI web_search transcript failed:', e);
}
}

if (text.length > 100) {
return NextResponse.json({
success: true,
transcript: text,
source: 'openai-web-search',
wordCount: text.split(/\s+/).length,
// Strategy 3: Gemini fallback (when OpenAI unavailable)
if (url && !audioUrl && process.env.GEMINI_API_KEY) {
try {
const model = getGemini().getGenerativeModel({
model: 'gemini-2.0-flash',
generationConfig: { temperature: 0.2 },
});

const result = await model.generateContent(
`You are a video content transcription assistant. ` +
`For the following YouTube video URL, provide a detailed transcript or content summary. ` +
`Include all key points, technical details, quotes, and actionable insights. ` +
`Be thorough and comprehensive.\n\nVideo URL: ${url}`
);
const text = result.response.text();

if (text.length > 100) {
return NextResponse.json({
success: true,
transcript: text,
source: 'gemini',
wordCount: text.split(/\s+/).length,
});
}
} catch (e) {
console.warn('Gemini transcript fallback failed:', e);
}
}

// Strategy 3: Direct audio file transcription via OpenAI Whisper/STT
if (audioUrl) {
// Strategy 4: Direct audio file transcription via OpenAI Whisper
if (audioUrl && process.env.OPENAI_API_KEY) {
const audioResponse = await fetch(audioUrl);
if (!audioResponse.ok) {
return NextResponse.json(
Expand All @@ -105,7 +160,7 @@ Be thorough — capture all key points, quotes, and technical details.`,
const audioBlob = await audioResponse.blob();
const audioFile = new File([audioBlob], 'audio.mp3', { type: 'audio/mpeg' });

const transcription = await getClient().audio.transcriptions.create({
const transcription = await getOpenAI().audio.transcriptions.create({
model: 'gpt-4o-mini-transcribe',
file: audioFile,
language,
Expand All @@ -119,9 +174,13 @@ Be thorough — capture all key points, quotes, and technical details.`,
});
}

// No strategy succeeded
const hasKeys = !!(process.env.OPENAI_API_KEY || process.env.GEMINI_API_KEY);
return NextResponse.json({
success: false,
error: 'Could not transcribe video — YouTube API and OpenAI STT both failed',
error: hasKeys
? 'Could not transcribe video — all strategies failed'
: 'No AI API key configured. Set OPENAI_API_KEY or GEMINI_API_KEY in Vercel environment variables.',
transcript: '',
});
} catch (error) {
Expand All @@ -131,7 +190,7 @@ Be thorough — capture all key points, quotes, and technical details.`,
return NextResponse.json({
success: false,
error: message.includes('API key')
? 'OpenAI API key not configured. Set OPENAI_API_KEY in your environment.'
? 'AI API key not configured. Set OPENAI_API_KEY or GEMINI_API_KEY.'
: message,
transcript: '',
});
Expand Down
Loading
Loading