def resolve_source_for_clip(clip, state):
    """Map a clip on the merged timeline to its source file and file-local times.

    Multi-file projects store every segment/word on one merged timeline, where
    each entry of ``state["source_files"]`` contributes ``duration`` seconds
    starting at ``offset``.  ffmpeg, however, needs timestamps relative to the
    individual file, so the file's ``offset`` is subtracted here.

    Resolution order:
      1. Legacy project (no ``source_files``): return ``state["source_file"]``
         with the clip's timestamps unchanged.
      2. ``clip["source_index"]``, when present and a valid index into
         ``source_files``.  This stays correct even when a file's ``duration``
         was recorded as 0, in which case a timestamp-range test can never
         match that file.
      3. The first file whose ``[offset, offset + duration)`` range contains
         ``clip["start"]``.
      4. Fallback: the last file that has no ``"error"`` recorded (or simply
         the last file if all of them have one).

    Args:
        clip: dict with numeric ``"start"`` and ``"end"`` on the merged
            timeline; may optionally carry ``"source_index"``.
        state: project state dict.

    Returns:
        ``(path, real_start, real_end)`` where the times are relative to the
        beginning of the file at ``path``.
    """
    source_files = state.get("source_files", [])
    if not source_files:
        # Legacy single-file project: timestamps are already file-local.
        return state.get("source_file", ""), clip["start"], clip["end"]

    start, end = clip["start"], clip["end"]

    def _localize(sf):
        offset = sf.get("offset", 0)
        return sf["path"], start - offset, end - offset

    # An explicit index is authoritative (bool is excluded: it is an int subclass).
    idx = clip.get("source_index")
    if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < len(source_files):
        return _localize(source_files[idx])

    # Otherwise locate the file by where the clip starts on the merged timeline.
    for sf in source_files:
        offset = sf.get("offset", 0)
        if offset <= start < offset + sf.get("duration", 0):
            return _localize(sf)

    # Nothing matched (e.g. start is past the last recorded duration).  Files
    # that failed transcription have no transcript, so no clip can come from
    # them; prefer the last file without an error.
    usable = [sf for sf in source_files if not sf.get("error")] or source_files
    return _localize(usable[-1])
def transcribe_single_file(filepath, whisper_lang, diarize, file_index, total_files):
    """Transcribe a single audio file and return ``(segments, words, duration)``.

    Steps: probe the duration with ffprobe, re-encode to a low-bitrate mono MP3
    when the file is larger than 25 MB, send it to the Whisper API, build
    passage-level segments via ``merge_segments`` and, if requested,
    run speaker diarization.

    Updates the global ``progress`` dict with per-file status messages.

    Args:
        filepath: path of the uploaded audio file.
        whisper_lang: language code passed to Whisper; falsy means auto-detect.
        diarize: when true (and ``HUGGINGFACE_TOKEN`` is set) run
            ``assign_speakers`` on the segments.
        file_index: zero-based position of this file in the upload batch.
        total_files: batch size; only used to prefix progress messages.

    Returns:
        ``(segments, words, duration)``; timestamps are relative to the start
        of this file.  ``duration`` is in seconds.

    Raises:
        Whatever ffmpeg (``CalledProcessError``), file I/O or the Whisper
        client raise, so the caller can handle partial failures.
    """
    prefix = f"file {file_index + 1}/{total_files}: " if total_files > 1 else ""

    # ── Get audio duration via ffprobe (best effort) ──
    duration = 0
    try:
        dur_result = subprocess.run(
            ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of", "csv=p=0", filepath],
            capture_output=True, text=True)
        duration = float(dur_result.stdout.strip())
    except (OSError, ValueError):
        # ffprobe missing or output not numeric; recovered from the transcript below.
        duration = 0

    # ── Compress if file > 25 MB ──
    upload_path = filepath
    if os.path.getsize(filepath) > 25 * 1024 * 1024:
        progress.update(message=f"{prefix}compressing audio…")
        # Keep the full original name (extension included) in the output name:
        # files are processed in parallel, and "a.wav" / "a.m4a" must not
        # both write to "a_compressed.mp3".  Appending also avoids cutting the
        # path at a dot that belongs to a directory name.
        compressed = f"{filepath}_compressed.mp3"
        # Pick a bitrate that keeps the output under ~24 MB for this duration.
        target_bits = 24 * 1024 * 1024 * 8
        bitrate_kbps = max(8, min(64, int(target_bits / (duration or 1) / 1000)))
        subprocess.run([
            "ffmpeg", "-y", "-i", filepath,
            "-ac", "1", "-ar", "16000", "-b:a", f"{bitrate_kbps}k",
            compressed
        ], capture_output=True, check=True)
        upload_path = compressed

    # ── Send to Whisper API ──
    progress.update(message=f"{prefix}sending to Whisper…")
    whisper_kwargs = {
        "model": "whisper-1",
        "response_format": "verbose_json",
        "timestamp_granularities": ["word", "segment"],
    }
    if whisper_lang:
        whisper_kwargs["language"] = whisper_lang

    with open(upload_path, "rb") as audio_file:
        whisper_kwargs["file"] = audio_file
        result = client.audio.transcriptions.create(**whisper_kwargs)

    # ── Extract words ──
    words = [
        {"word": w.word.strip(), "start": w.start, "end": w.end}
        for w in (getattr(result, "words", None) or [])
    ]

    # ── Extract and merge segments ──
    raw = [
        {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
        for seg in (getattr(result, "segments", None) or [])
    ]
    passages = merge_segments(raw)
    segments = [
        {
            "id": i,
            "start": p["start"],
            "end": p["end"],
            "text": p["text"],
            "speaker": "S1",
        }
        for i, p in enumerate(passages)
    ]

    # The caller uses ``duration`` as this file's span on the merged timeline.
    # A zero (or NaN) duration would make the next file's transcript overlap
    # this one, so fall back to the last timestamp Whisper reported.
    if not duration > 0:
        ends = [s["end"] for s in segments] + [w["end"] for w in words]
        duration = max(ends, default=0)

    # ── Diarization (optional) ──
    if diarize and os.environ.get("HUGGINGFACE_TOKEN"):
        progress.update(message=f"{prefix}detecting speakers…")
        segments = assign_speakers(segments, filepath)

    return segments, words, duration
capture_output=True, check=True) - upload_path = compressed - progress.update(current=1, message="sending to Whisper…") - else: - progress.update(phase="transcribe", current=0, total=2, message="sending to Whisper…") + total_files = len(saved_files) + progress.update(phase="transcribe", current=0, total=total_files + 1, + message="starting transcription…") - whisper_kwargs = { - "model": "whisper-1", - "response_format": "verbose_json", - "timestamp_granularities": ["word", "segment"], - } - if whisper_lang: - whisper_kwargs["language"] = whisper_lang + # Transcribe files in parallel + results = [None] * total_files # indexed by position + errors = [None] * total_files + warnings = [] - with open(upload_path, "rb") as audio_file: - whisper_kwargs["file"] = audio_file - result = client.audio.transcriptions.create(**whisper_kwargs) + def _transcribe_one(idx): + filename, filepath = saved_files[idx] + try: + return idx, transcribe_single_file(filepath, whisper_lang, diarize, idx, total_files) + except Exception as e: + return idx, e - # Store word-level timestamps - words = [] - if hasattr(result, 'words') and result.words: - for w in result.words: - words.append({"word": w.word.strip(), "start": w.start, "end": w.end}) + max_workers = min(total_files, 4) + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(_transcribe_one, i) for i in range(total_files)] + for future in concurrent.futures.as_completed(futures): + idx, result = future.result() + if isinstance(result, Exception): + errors[idx] = result + warnings.append(f"{saved_files[idx][0]}: {friendly_error(result)}") + else: + results[idx] = result # (segments, words, duration) + # Update progress count + done_count = sum(1 for r in results if r is not None) + sum(1 for e in errors if e is not None) + progress.update(current=done_count, message=f"transcribed {done_count}/{total_files} files…") + + # Check if ALL files failed + if all(r is None for r 
in results): + err_msg = "; ".join(warnings) if warnings else "all files failed" + raise Exception(err_msg) + + # ── Merge results with cumulative time offsets ── + progress.update(message="merging transcripts…") + merged_segments = [] + merged_words = [] + source_files_info = [] + cumulative_offset = 0.0 + global_seg_id = 0 + speaker_offset = 0 + + for idx in range(total_files): + filename, filepath = saved_files[idx] + if results[idx] is None: + # This file failed — skip it but record in source_files + source_files_info.append({ + "filename": filename, + "path": filepath, + "offset": cumulative_offset, + "duration": 0, + "error": friendly_error(errors[idx]) if errors[idx] else "unknown error", + }) + continue - # Store segment-level (merged into passages) for paragraph grouping - raw = [{"start": seg.start, "end": seg.end, "text": seg.text.strip()} for seg in result.segments] - passages = merge_segments(raw) + segments, words, duration = results[idx] - segments = [] - for i, p in enumerate(passages): - segments.append({ - "id": i, - "start": p["start"], - "end": p["end"], - "text": p["text"], - "speaker": "S1", + source_files_info.append({ + "filename": filename, + "path": filepath, + "offset": cumulative_offset, + "duration": duration, }) - progress.update(current=progress["total"] - 1, message="processing segments…") - - # Diarization (optional — only if user opted in and HUGGINGFACE_TOKEN is set) - if diarize and os.environ.get("HUGGINGFACE_TOKEN"): - progress.update(message="detecting speakers…") - segments = assign_speakers(segments, filepath) - - # Build speaker_names map from unique speakers in transcript + # Build speaker ID mapping: offset speakers so each file gets unique IDs + file_speakers = set() + for seg in segments: + file_speakers.add(seg.get("speaker", "S1")) + # Sort to get deterministic mapping + file_speakers = sorted(file_speakers, key=lambda s: int(s[1:]) if s[1:].isdigit() else 0) + speaker_map = {} + for spk in file_speakers: + old_num = 
int(spk[1:]) if spk[1:].isdigit() else 1 + speaker_map[spk] = f"S{old_num + speaker_offset}" + + # Shift timestamps, remap speakers, and add source_index + for seg in segments: + seg["id"] = global_seg_id + seg["start"] += cumulative_offset + seg["end"] += cumulative_offset + seg["source_index"] = idx + seg["speaker"] = speaker_map.get(seg.get("speaker", "S1"), seg.get("speaker", "S1")) + merged_segments.append(seg) + global_seg_id += 1 + + for w in words: + w["start"] += cumulative_offset + w["end"] += cumulative_offset + w["source_index"] = idx + merged_words.append(w) + + speaker_offset += len(file_speakers) + cumulative_offset += duration + + # Build speaker_names from merged segments seen = [] - for seg in segments: + for seg in merged_segments: spk = seg.get("speaker", "S1") if spk not in seen: seen.append(spk) speaker_names = {spk: spk for spk in seen} - state["transcript"] = segments - state["words"] = words + # Save state + state["transcript"] = merged_segments + state["words"] = merged_words state["text_clips"] = [] state["clips"] = [] state["status"] = "transcribed" - state["filename"] = filename + state["source_file"] = saved_files[0][1] # backward compat + state["source_files"] = source_files_info + state["filename"] = ", ".join(fn for fn, _ in saved_files) state["transcription_language"] = whisper_lang or "auto" state["speaker_names"] = speaker_names + if warnings: + state["transcription_warnings"] = warnings save_state(state) progress.update(current=progress["total"], message="done", phase=None) @@ -515,9 +662,15 @@ def remove_clip(): @app.route("/cut_clips", methods=["POST"]) def cut_clips(): state = load_state() + source_files = state.get("source_files", []) source = state.get("source_file") - if not source or not os.path.exists(source): + # Validate we have at least one source file available + if source_files: + has_valid = any(os.path.exists(sf["path"]) for sf in source_files) + if not has_valid: + return jsonify({"error": "Source audio file not 
@app.route("/waveform_multi")
def waveform_multi():
    """Combined waveform from all source files for multi-file projects.

    Query parameters:
        points: approximate number of waveform points to return (default
            1000).  Values below 1 are clamped to 1.

    Each source file is decoded by ffmpeg to mono signed 16-bit PCM at 100 Hz,
    the samples of all files are concatenated in ``source_files`` order, and
    the result is reduced to per-chunk RMS values normalised to ``[0, 1]``.

    Returns:
        JSON ``{"points": [...], "duration": seconds}``; 400 when ``points``
        is not an integer, 404 when the project has no source file, 500 when
        no file could be decoded.
    """
    try:
        n_points = int(request.args.get("points", 1000))
    except (TypeError, ValueError):
        return jsonify({"error": "points must be an integer"}), 400
    # Guard the integer division below against points=0 / negative values.
    n_points = max(1, n_points)

    # The project state must be loaded explicitly; there is no module-level
    # ``state`` visible here, so omitting this fails with NameError.
    state = load_state()

    source_files = state.get("source_files", [])
    if not source_files:
        # Legacy single-file fallback
        sf = state.get("source_file", "")
        if not sf:
            return jsonify({"error": "No source files"}), 404
        source_files = [{"path": sf}]

    all_samples = []
    total_duration = 0.0

    # NOTE(review): files skipped below contribute no samples, so later files
    # sit earlier in this waveform than their ``offset`` in ``source_files``
    # says.  Confirm the frontend tolerates that.
    for sf in source_files:
        filepath = sf.get("path", "")
        if not filepath:
            continue
        # safe_project_path is defined elsewhere; presumably it confines the
        # path to the project directory.  Verify against its definition.
        filepath = safe_project_path(filepath)
        if not os.path.exists(filepath):
            continue

        cmd = [
            "ffmpeg", "-i", filepath,
            "-ac", "1", "-filter:a", "aresample=100",
            "-map_metadata", "-1",
            "-f", "s16le", "-acodec", "pcm_s16le", "pipe:1"
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0 or not result.stdout:
            continue

        raw = result.stdout
        n_samples = len(raw) // 2
        if not n_samples:
            continue
        # struct.unpack needs exactly 2 * n_samples bytes; drop a dangling
        # odd byte instead of raising struct.error.
        samples = struct.unpack(f"<{n_samples}h", raw[:n_samples * 2])
        all_samples.extend(samples)
        total_duration += n_samples / 100.0  # 100 samples per second

    if not all_samples:
        return jsonify({"error": "No audio data"}), 500

    n_total = len(all_samples)
    chunk = max(1, n_total // n_points)
    rms_list = []
    for i in range(0, n_total, chunk):
        seg = all_samples[i:i + chunk]
        rms = (sum(s * s for s in seg) / len(seg)) ** 0.5
        rms_list.append(rms)

    # Normalise to the loudest chunk; pure silence would otherwise divide by 0.
    max_rms = max(rms_list) if rms_list else 1.0
    if max_rms == 0:
        max_rms = 1.0
    points = [round(r / max_rms, 4) for r in rms_list]

    return jsonify({"points": points, "duration": round(total_duration, 2)})
Currently they must merge files externally before uploading, which is slow and adds a manual step. Transcribing one large merged file is also slower than transcribing smaller files in parallel. + +## Design + +### Approach: Multi-file single endpoint with parallel transcription + +The upload zone accepts multiple files. All files are sent to `/transcribe` in one request. The backend spawns a thread per file, transcribes in parallel via Whisper, then merges results into one unified transcript with artificial time offsets so segments don't overlap. + +### State changes + +`source_file` (string) becomes `source_files` (array): + +```json +{ + "source_files": [ + { "filename": "interview_a.wav", "path": "projects/x/uploads/interview_a.wav", "offset": 0, "duration": 300.5 }, + { "filename": "interview_b.wav", "path": "projects/x/uploads/interview_b.wav", "offset": 300.5, "duration": 245.8 } + ] +} +``` + +Backward compatible: old projects with `source_file` (string) still load fine. + +### Transcript merging + +- Each file is transcribed independently (compression + Whisper call per file) +- After all complete, transcripts are merged in upload order +- File B's timestamps are shifted by the sum of all preceding files' durations +- Each segment and word gets a `source_index` field pointing to its entry in `source_files` + +### Clip cutting + +When cutting a clip, the system: +1. Looks up `source_index` on the segment/word +2. Subtracts the file's `offset` to get the real timestamp within that file +3. Runs ffmpeg against the correct source file + +### Progress UX + +- Progress bar shows overall status: "Transcribing file 2 of 3..." 
+- Individual file compression/upload steps tracked +- Errors on one file don't block others; failed files are reported at the end + +### UI changes + +- Upload zone `<input type="file">` gets `multiple` attribute +- File info bar shows count: "3 files selected" instead of single filename +- Transcript view unchanged — segments appear with speaker colors/labels as before +- A subtle divider or label between file boundaries (optional, low priority) + +### Backend changes + +- `/transcribe` accepts multiple files in `request.files.getlist("file")` +- Each file gets its own thread for compress + Whisper +- Results collected and merged after all threads complete +- `source_files` array written to state + +### What stays the same + +- Speaker diarization (runs per-file, results merged) +- Clip selection UI (word clicking) +- Assembly timeline +- All export routes +- Narration workflow diff --git a/docs/plans/2026-04-11-multi-file-upload.md b/docs/plans/2026-04-11-multi-file-upload.md new file mode 100644 index 0000000..ca1fb48 --- /dev/null +++ b/docs/plans/2026-04-11-multi-file-upload.md @@ -0,0 +1,463 @@ +# Multi-File Upload Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Let users upload multiple interview files at once, transcribe them in parallel, and work with one unified transcript. + +**Architecture:** The `/transcribe` endpoint accepts multiple files via `getlist("file")`. A thread per file handles compression + Whisper. After all threads complete, transcripts merge with time offsets into one unified state. Clip cutting resolves which source file to use via `source_index`. 
+ +**Tech Stack:** Python/Flask, threading, ffmpeg, OpenAI Whisper API, vanilla JS + +--- + +### Task 1: Backend — `source_files` state + backward compat + +**Files:** +- Modify: `app.py:232-234` (initial state shape) +- Modify: `app.py:516-564` (cut_clips — source file lookup) + +**Step 1: Update initial state shape** + +In `load_state()` return dict (line 232), add `source_files` and keep `source_file` for backward compat: + +```python +return {"transcript": [], "words": [], "clips": [], "text_clips": [], + "narration_transcript": [], "narration_words": [], "narr_text_clips": [], + "narration": [], "assembly": [], "source_file": None, "source_files": [], "phase": 1} +``` + +**Step 2: Add helper to resolve source file for a clip** + +Add this function after `get_clip_speaker()` (after line 108): + +```python +def resolve_source_for_clip(clip, state): + """Given a clip with start/end times, find the correct source file and real timestamps.""" + source_files = state.get("source_files", []) + if not source_files: + # Legacy single-file project + sf = state.get("source_file", "") + return sf, clip["start"], clip["end"] + + # Find which source file this clip belongs to by checking source_index on words/segments + # or by timestamp range + for i, sf in enumerate(source_files): + offset = sf["offset"] + end_time = offset + sf["duration"] + if clip["start"] >= offset and clip["start"] < end_time: + real_start = clip["start"] - offset + real_end = clip["end"] - offset + return sf["path"], real_start, real_end + # Fallback to last file + sf = source_files[-1] + return sf["path"], clip["start"] - sf["offset"], clip["end"] - sf["offset"] +``` + +**Step 3: Update `cut_clips` to use resolver** + +In `do_cut()` (line 530), replace the single `source_path` lookup with per-clip resolution: + +```python +def do_cut(): + try: + st = load_state() + # Validate we have source files + source_files = st.get("source_files", []) + source_file = st.get("source_file", "") + if not 
source_files and not source_file: + st["status"] = "error: no source file found" + save_state(st) + progress.update(phase=None, message="") + return + # For legacy single-file: check it exists + if not source_files and source_file and not os.path.exists(source_file): + st["status"] = f"error: source file not found — {source_file}" + save_state(st) + progress.update(phase=None, message="") + return + + clips = st.get("text_clips", []) + progress.update(phase="cut", current=0, total=len(clips), message=f"cutting 0/{len(clips)} clips…") + + cut_files = [] + for i, clip in enumerate(clips): + progress.update(current=i, message=f"cutting {clip['id']}… ({i+1}/{len(clips)})") + source_path, real_start, real_end = resolve_source_for_clip(clip, st) + if not os.path.exists(source_path): + continue + out_path = os.path.join(pdir("clips"), f"{clip['id']}.wav") + duration = real_end - real_start + cmd = [ + "ffmpeg", "-y", + "-i", source_path, + "-ss", str(real_start), + "-t", str(duration), + "-c:a", "pcm_s16le", "-ar", "44100", "-ac", "1", + out_path + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + cut_files.append({ + "id": clip["id"], + "path": out_path, + "start": clip["start"], + "end": clip["end"], + "duration": round(duration, 2) + }) +``` + +**Step 4: Commit** + +```bash +git add app.py +git commit -m "feat: add source_files state and clip source resolver" +``` + +--- + +### Task 2: Backend — multi-file `/transcribe` endpoint + +**Files:** +- Modify: `app.py:279-393` (transcribe route + do_transcribe) + +**Step 1: Extract single-file transcription into a reusable function** + +Add this function before the `/transcribe` route: + +```python +def transcribe_single_file(filepath, whisper_lang, diarize, file_index, total_files): + """Transcribe a single file. 
Returns (segments, words, duration) or raises.""" + # Get duration + try: + dur_result = subprocess.run( + ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of", "csv=p=0", filepath], + capture_output=True, text=True) + duration = float(dur_result.stdout.strip()) + except Exception: + duration = 0 + + # Compress if needed + upload_path = filepath + if os.path.getsize(filepath) > 25 * 1024 * 1024: + progress.update(phase="transcribe", current=file_index, + total=total_files, message=f"compressing file {file_index+1}/{total_files}…") + compressed = filepath.rsplit(".", 1)[0] + "_compressed.mp3" + target_bits = 24 * 1024 * 1024 * 8 + bitrate_kbps = max(8, min(64, int(target_bits / (duration or 1) / 1000))) + subprocess.run([ + "ffmpeg", "-y", "-i", filepath, + "-ac", "1", "-ar", "16000", "-b:a", f"{bitrate_kbps}k", + compressed + ], capture_output=True, check=True) + upload_path = compressed + + progress.update(message=f"sending file {file_index+1}/{total_files} to Whisper…") + + whisper_kwargs = { + "model": "whisper-1", + "response_format": "verbose_json", + "timestamp_granularities": ["word", "segment"], + } + if whisper_lang: + whisper_kwargs["language"] = whisper_lang + + with open(upload_path, "rb") as audio_file: + whisper_kwargs["file"] = audio_file + result = client.audio.transcriptions.create(**whisper_kwargs) + + words = [] + if hasattr(result, 'words') and result.words: + for w in result.words: + words.append({"word": w.word.strip(), "start": w.start, "end": w.end}) + + raw = [{"start": seg.start, "end": seg.end, "text": seg.text.strip()} for seg in result.segments] + passages = merge_segments(raw) + segments = [] + for i, p in enumerate(passages): + segments.append({ + "id": i, "start": p["start"], "end": p["end"], + "text": p["text"], "speaker": "S1", + }) + + if diarize and os.environ.get("HUGGINGFACE_TOKEN"): + segments = assign_speakers(segments, filepath) + + return segments, words, duration +``` + +**Step 2: Rewrite `/transcribe` to 
handle multiple files** + +```python +@app.route("/transcribe", methods=["POST"]) +def transcribe(): + if not client: + return jsonify({"error": "OPENAI_API_KEY not set"}), 500 + + files = request.files.getlist("file") + if not files: + return jsonify({"error": "No file uploaded"}), 400 + + # Save all uploaded files + file_infos = [] + for f in files: + filepath = os.path.join(pdir("uploads"), f.filename) + f.save(filepath) + file_infos.append({"filename": f.filename, "path": filepath}) + + whisper_lang = request.form.get("language", "he") + diarize = request.form.get("diarize") == "1" + if whisper_lang == "auto": + whisper_lang = None + + def do_transcribe(): + state = load_state() + state["status"] = "transcribing" + # Set source_file to first file for backward compat + state["source_file"] = file_infos[0]["path"] + save_state(state) + + total_files = len(file_infos) + progress.update(phase="transcribe", current=0, total=total_files, + message=f"transcribing 0/{total_files} files…") + + # Transcribe files in parallel + from concurrent.futures import ThreadPoolExecutor, as_completed + results = [None] * total_files + errors = [] + + def transcribe_file(idx): + info = file_infos[idx] + return idx, transcribe_single_file(info["path"], whisper_lang, diarize, idx, total_files) + + with ThreadPoolExecutor(max_workers=min(total_files, 4)) as executor: + futures = {executor.submit(transcribe_file, i): i for i in range(total_files)} + for future in as_completed(futures): + try: + idx, (segments, words, duration) = future.result() + results[idx] = {"segments": segments, "words": words, "duration": duration} + progress.update(current=sum(1 for r in results if r is not None), + message=f"transcribed {sum(1 for r in results if r is not None)}/{total_files} files…") + except Exception as e: + idx = futures[future] + errors.append(f"{file_infos[idx]['filename']}: {friendly_error(e)}") + + if all(r is None for r in results): + state["status"] = f"error: all files failed — {'; 
'.join(errors)}" + save_state(state) + progress.update(phase=None, current=0, total=0, message="") + return + + # Merge results with time offsets + merged_segments = [] + merged_words = [] + source_files = [] + offset = 0.0 + seg_id = 0 + + for idx, r in enumerate(results): + if r is None: + continue + info = file_infos[idx] + source_files.append({ + "filename": info["filename"], + "path": info["path"], + "offset": offset, + "duration": r["duration"] + }) + for seg in r["segments"]: + merged_segments.append({ + "id": seg_id, + "start": seg["start"] + offset, + "end": seg["end"] + offset, + "text": seg["text"], + "speaker": seg.get("speaker", "S1"), + "source_index": len(source_files) - 1 + }) + seg_id += 1 + for w in r["words"]: + merged_words.append({ + "word": w["word"], + "start": w["start"] + offset, + "end": w["end"] + offset, + "source_index": len(source_files) - 1 + }) + offset += r["duration"] + + progress.update(message="processing segments…") + + # Build speaker_names + seen = [] + for seg in merged_segments: + spk = seg.get("speaker", "S1") + if spk not in seen: + seen.append(spk) + speaker_names = {spk: spk for spk in seen} + + state["transcript"] = merged_segments + state["words"] = merged_words + state["text_clips"] = [] + state["clips"] = [] + state["status"] = "transcribed" + state["filename"] = ", ".join(info["filename"] for info in file_infos) + state["source_files"] = source_files + state["source_file"] = file_infos[0]["path"] # backward compat + state["transcription_language"] = whisper_lang or "auto" + state["speaker_names"] = speaker_names + if errors: + state["transcription_warnings"] = errors + save_state(state) + progress.update(phase="transcribe", current=total_files, total=total_files, message="done") + progress["audio_duration"] = offset + + threading.Thread(target=do_transcribe).start() + return jsonify({"message": "Transcription started"}) +``` + +**Step 3: Commit** + +```bash +git add app.py +git commit -m "feat: multi-file parallel 
transcription endpoint" +``` + +--- + +### Task 3: Frontend — multi-file upload UI + +**Files:** +- Modify: `templates/index.html:456` (file input) +- Modify: `templates/index.html:773-794` (uploadAndTranscribe function) + +**Step 1: Add `multiple` attribute to file input** + +Line 456, change: +```html + +``` +to: +```html + +``` + +**Step 2: Update `uploadAndTranscribe` to send multiple files** + +```javascript +async function uploadAndTranscribe(event) { + const files = Array.from(event.target.files); + if (!files.length) return; + document.getElementById('upload-zone').style.display = 'none'; + document.getElementById('file-info').textContent = + files.length === 1 ? files[0].name : `${files.length} files selected`; + setStatus('working', t('transcribing')); + const whisperLang = document.getElementById('whisper-lang').value; + const diarize = document.getElementById('diarize-toggle').checked; + const formData = new FormData(); + for (const file of files) { + formData.append('file', file); + } + formData.append('language', whisperLang); + if (diarize) formData.append('diarize', '1'); + const res = await fetch('/transcribe', { method: 'POST', body: formData }); + if (!res.ok) { + const data = await res.json().catch(() => ({ error: t('server_error') })); + setStatus('error', data.error || t('transcription_failed')); + document.getElementById('upload-zone').style.display = ''; + return; + } + startProgressPolling(); + startPolling(); +} +``` + +**Step 3: Commit** + +```bash +git add templates/index.html +git commit -m "feat: multi-file upload UI" +``` + +--- + +### Task 4: Update remaining `source_file` references for compat + +**Files:** +- Modify: `app.py` — all routes that read `source_file` + +**Step 1: Audit and update all `source_file` references** + +Search for all `source_file` usages and ensure they fall back correctly. 
Key places: + +- `load_demo` route: should set both `source_file` and `source_files` +- `save_project` / `load_project`: `source_files` paths need fixing like `source_file` +- `/state` route: already returns full state, no change needed +- `reset` route: no change needed (returns fresh state which now includes `source_files`) + +Update `load_state()` to auto-populate `source_files` from legacy `source_file`: + +```python +def load_state(): + sf = state_file() + if os.path.exists(sf): + with open(sf) as f: + state = json.load(f) + # Backward compat: populate source_files from legacy source_file + if state.get("source_file") and not state.get("source_files"): + path = state["source_file"] + if os.path.exists(path): + try: + dur_result = subprocess.run( + ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of", "csv=p=0", path], + capture_output=True, text=True) + duration = float(dur_result.stdout.strip()) + except Exception: + duration = 0 + state["source_files"] = [{ + "filename": state.get("filename", os.path.basename(path)), + "path": path, "offset": 0, "duration": duration + }] + # ... existing phase detection ... +``` + +**Step 2: Update path-fixing in `load_project`** + +Where `source_file` path is fixed (line ~1328), also fix `source_files` paths: + +```python +state["source_file"] = fix(state.get("source_file")) +for sf in state.get("source_files", []): + sf["path"] = fix(sf.get("path", "")) +``` + +**Step 3: Commit** + +```bash +git add app.py +git commit -m "feat: backward compat for source_files in all routes" +``` + +--- + +### Task 5: Manual integration test + +**Step 1: Restart the server** + +```bash +kill -9 $(lsof -ti:5555) 2>/dev/null +source venv/bin/activate && python app.py +``` + +**Step 2: Test single file upload (backward compat)** + +Upload one file, verify transcription works as before. 
+ +**Step 3: Test multi-file upload** + +Upload 2+ files, verify: +- Progress shows file count +- Transcript merges correctly with all segments +- Clips can be marked across file boundaries +- Clip cutting produces correct audio from correct source file + +**Step 4: Test loading old project** + +Load an existing saved project, verify `source_files` auto-populates from `source_file`. diff --git a/templates/index.html b/templates/index.html index 84ae027..420699f 100644 --- a/templates/index.html +++ b/templates/index.html @@ -90,6 +90,22 @@ .pane-label { font-family: var(--sans); font-size: 14px; font-weight: 600; color: var(--text2); margin-bottom: 12px; display: flex; align-items: center; gap: 8px; text-wrap: balance; } .pane-label .label-count { color: var(--text2); } + /* FILE DIVIDER */ + .file-divider { + display: flex; + align-items: center; + gap: 12px; + margin: 16px 0; + color: var(--text2); + font-size: 11px; + font-family: var(--mono); + } + .file-divider::before, .file-divider::after { + content: ''; + flex: 1; + border-top: 1px solid var(--border); + } + /* WORD-LEVEL TRANSCRIPT */ .paragraph { margin-bottom: 16px; line-height: 1.85; position: relative; font-size: 15px; } .para-ts { font-family: var(--mono); font-size: 11px; color: var(--text3); margin-left: 8px; cursor: pointer; user-select: none; font-variant-numeric: tabular-nums; } @@ -453,7 +469,7 @@

- +