-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranscribe.py
More file actions
198 lines (162 loc) · 6.42 KB
/
Copy pathtranscribe.py
File metadata and controls
198 lines (162 loc) · 6.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python3
"""
AudioScribe: Transcribe .m4a audio files to .md and .txt
Usage:
python transcribe.py recording.m4a # OpenAI Whisper API
python transcribe.py recording.m4a --local # Local Whisper (offline)
python transcribe.py recording.m4a --local --model medium
python transcribe.py recording.m4a -o ./output/
"""
import argparse
import os
import sys
from pathlib import Path
from datetime import datetime
def load_env_file(env_path: Path) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    An optional leading ``export `` on the key and one pair of matching
    surrounding quotes on the value are removed, so ``KEY="value"`` and
    ``export KEY=value`` both behave as they would in a shell-style .env.
    Variables that are already set in the environment are never overwritten.

    Args:
        env_path: Path of the file to read.
    """
    # utf-8-sig transparently drops a BOM, which would otherwise end up
    # glued to the first variable name.
    for raw_line in env_path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip().removeprefix("export ").strip()
        value = value.strip()
        # Strip one pair of matching quotes; otherwise the quotes would
        # become part of the value (e.g. an unusable API key).
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        # An empty variable name is rejected by the OS, so skip "=value".
        if key:
            os.environ.setdefault(key, value)


# Load .env file from the same directory as this script
_env_path = Path(__file__).parent / ".env"
if _env_path.is_file():
    load_env_file(_env_path)
def split_audio(audio_path: Path, chunk_minutes: int = 10) -> list[Path]:
    """Split an audio file into fixed-length chunks using ffmpeg.

    The audio stream is copied without re-encoding (``-c copy``, video
    dropped with ``-vn``) into a fresh temporary directory. The caller owns
    the returned files and their parent directory and must delete them.

    Args:
        audio_path: Audio file to split.
        chunk_minutes: Target length of each chunk, in minutes.

    Returns:
        Chunk paths sorted by name, i.e. in playback order. The list is
        empty if ffmpeg produced no output; the temporary directory has
        already been removed in that case.

    Exits the process with status 1 if ffmpeg is not on PATH or if the
    ffmpeg run fails.
    """
    import shutil
    import subprocess
    import tempfile

    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        print("Error: ffmpeg not found. Install with: brew install ffmpeg")
        sys.exit(1)

    # Stream copy only works when the chunk container can hold the source
    # codec, so keep the source container for known audio types and fall
    # back to .m4a for anything else.
    suffix = audio_path.suffix.lower()
    if suffix not in (".m4a", ".mp3", ".mp4", ".wav", ".ogg", ".flac"):
        suffix = ".m4a"

    tmp_dir = Path(tempfile.mkdtemp(prefix="audioscribe_"))
    chunk_seconds = chunk_minutes * 60
    chunk_pattern = str(tmp_dir / f"chunk_%03d{suffix}")

    # Argument list instead of a shell string: filenames containing quotes,
    # "$" or backticks are passed through verbatim and cannot inject commands.
    cmd = [
        ffmpeg,
        "-y",
        "-loglevel", "error",
        "-i", str(audio_path),
        "-f", "segment",
        "-segment_time", str(chunk_seconds),
        "-c", "copy",
        "-vn",
        chunk_pattern,
    ]
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        # Do not leave a half-written temp directory behind on failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        print("Error: ffmpeg failed to split the audio file.")
        sys.exit(1)

    chunks = sorted(tmp_dir.glob(f"chunk_*{suffix}"))
    if not chunks:
        # Nothing for the caller to clean up, so remove the empty directory.
        shutil.rmtree(tmp_dir, ignore_errors=True)
    return chunks
def _transcribe_file(client, path: Path) -> str:
    """Send one audio file to the Whisper API and return the stripped text."""
    with open(path, "rb") as f:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="text",
        )
    return response.strip()


def transcribe_with_openai(audio_path: Path) -> str:
    """Transcribe using OpenAI Whisper API (requires OPENAI_API_KEY).

    Files up to 25 MiB are uploaded in a single request. Larger files are
    first split with ``split_audio``; each chunk is transcribed in order and
    the partial transcripts are joined with single spaces.

    Args:
        audio_path: Audio file to transcribe.

    Returns:
        The transcription text with surrounding whitespace removed.

    Exits the process with status 1 if the ``openai`` package is missing or
    ``OPENAI_API_KEY`` is not set.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("Error: openai package not installed. Run: pip install openai")
        sys.exit(1)

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set.")
        print("Set it with: export OPENAI_API_KEY=sk-...")
        sys.exit(1)

    client = OpenAI(api_key=api_key)
    max_bytes = 25 * 1024 * 1024
    file_size = audio_path.stat().st_size

    if file_size <= max_bytes:
        print(f"Transcribing '{audio_path.name}' via OpenAI Whisper API...")
        return _transcribe_file(client, audio_path)

    # File too large — split into chunks
    print(f"File is {file_size / 1024 / 1024:.1f}MB — splitting into chunks for API...")
    chunks = split_audio(audio_path)
    print(f" Split into {len(chunks)} chunks. Transcribing each...")

    parts = []
    try:
        for i, chunk in enumerate(chunks, 1):
            print(f" Chunk {i}/{len(chunks)}: {chunk.name}")
            parts.append(_transcribe_file(client, chunk))
            chunk.unlink(missing_ok=True)  # free disk space as we go
    finally:
        # Runs on success and on API errors alike, so a failed request does
        # not leave the remaining chunks and their temp directory behind.
        for chunk in chunks:
            chunk.unlink(missing_ok=True)
        if chunks:
            try:
                chunks[0].parent.rmdir()
            except OSError:
                # Best-effort: a leftover temp directory must not mask the
                # transcription result or the original exception.
                pass
    return " ".join(parts)
def transcribe_with_local_whisper(audio_path: Path, model_size: str) -> str:
    """Run a locally loaded Whisper model over *audio_path*.

    Needs neither network access nor an API key once the model is cached.

    Args:
        audio_path: Audio file to transcribe.
        model_size: Name passed to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the transcription result, stripped.

    Exits the process with status 1 if the ``whisper`` module cannot be
    imported.
    """
    try:
        import whisper
    except ImportError:
        for hint in (
            "Error: openai-whisper package not installed.",
            "Run: pip install openai-whisper",
            "Note: ffmpeg must also be installed (brew install ffmpeg on Mac).",
        ):
            print(hint)
        sys.exit(1)

    print(f"Loading local Whisper model '{model_size}' (downloads on first use)...")
    engine = whisper.load_model(model_size)

    print(f"Transcribing '{audio_path.name}'... (this may take a while)")
    transcript = engine.transcribe(str(audio_path), verbose=False)["text"]
    return transcript.strip()
def write_outputs(text: str, audio_path: Path, output_dir: Path) -> None:
    """Write the transcription to ``<stem>.txt`` and ``<stem>.md``.

    The text file holds the transcription followed by a newline. The
    Markdown file adds a title, the source file name and the local time of
    transcription above the text. Existing files are overwritten.

    Args:
        text: Transcription text.
        audio_path: Source audio file; its stem names the output files.
        output_dir: Existing directory that receives both files.
    """
    base = audio_path.stem
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Plain text
    txt_path = output_dir / f"{base}.txt"
    txt_path.write_text(text + "\n", encoding="utf-8")
    print(f" Saved TXT: {txt_path}")

    # Markdown with metadata header
    md_lines = [
        f"# Transcription: {base}",
        "",
        # The two trailing spaces are a Markdown hard line break; without
        # them "Source" and "Transcribed" render on the same line.
        f"**Source:** `{audio_path.name}`  ",
        f"**Transcribed:** {timestamp}",
        "",
        "---",
        "",
        text,
        "",
    ]
    md_path = output_dir / f"{base}.md"
    md_path.write_text("\n".join(md_lines), encoding="utf-8")
    print(f" Saved MD: {md_path}")
def main():
    """Command-line entry point: parse arguments, transcribe, write outputs.

    Validates the input path, creates the output directory if needed,
    transcribes with the local Whisper model (``--local``) or the OpenAI
    API, and writes the ``.txt`` and ``.md`` files. An empty transcription
    only produces a warning; the files are still written.

    Exits the process with status 1 if the input path does not exist or is
    not a regular file.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe .m4a audio files to .txt and .md",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("audio_file", help="Path to the .m4a audio file")
    parser.add_argument(
        "--local",
        action="store_true",
        help="Use a local Whisper model instead of the OpenAI API (works offline)",
    )
    parser.add_argument(
        "--model",
        default="base",
        choices=["tiny", "base", "small", "medium", "large"],
        help="Local Whisper model size (default: base). Larger = more accurate but slower.",
    )
    parser.add_argument(
        "--output-dir", "-o",
        default=None,
        help="Directory to write output files (default: same directory as input)",
    )
    args = parser.parse_args()

    audio_path = Path(args.audio_file).expanduser().resolve()
    if not audio_path.exists():
        print(f"Error: File not found: {audio_path}")
        sys.exit(1)
    # A directory passes exists() but would only fail later, deep inside the
    # transcription backend, with a far less helpful error.
    if not audio_path.is_file():
        print(f"Error: Not a file: {audio_path}")
        sys.exit(1)
    if audio_path.suffix.lower() not in (".m4a", ".mp3", ".mp4", ".wav", ".ogg", ".flac"):
        print(f"Warning: Unexpected file extension '{audio_path.suffix}' — proceeding anyway.")

    output_dir = Path(args.output_dir).expanduser().resolve() if args.output_dir else audio_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.local:
        text = transcribe_with_local_whisper(audio_path, args.model)
    else:
        text = transcribe_with_openai(audio_path)

    if not text:
        print("Warning: Transcription returned empty text.")

    write_outputs(text, audio_path, output_dir)
    print("Done.")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()