AudioScribe/transcribe.py at main · pitboss19/AudioScribe · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python3
"""
AudioScribe: Transcribe .m4a audio files to .md and .txt

Usage:
    python transcribe.py recording.m4a                  # OpenAI Whisper API
    python transcribe.py recording.m4a --local          # Local Whisper (offline)
    python transcribe.py recording.m4a --local --model medium
    python transcribe.py recording.m4a -o ./output/
"""

import argparse
import os
import sys
from pathlib import Path
from datetime import datetime

# Load KEY=VALUE pairs from a .env file located next to this script.
# Variables that are already set in the process environment win over the
# file (setdefault), so an exported value is never overridden.
_env_path = Path(__file__).parent / ".env"
if _env_path.exists():
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's default locale encoding.
    for _line in _env_path.read_text(encoding="utf-8").splitlines():
        _line = _line.strip()
        # Skip blank lines, comments, and anything that is not KEY=VALUE.
        if not _line or _line.startswith("#") or "=" not in _line:
            continue
        _k, _v = _line.split("=", 1)
        _k, _v = _k.strip(), _v.strip()
        # Drop one pair of matching surrounding quotes so that KEY="value"
        # and KEY='value' both yield value rather than a quoted string.
        if len(_v) >= 2 and _v[0] == _v[-1] and _v[0] in "\"'":
            _v = _v[1:-1]
        # An empty name (a line such as "=value") is not a legal environment
        # variable and would make os.environ raise; ignore such lines.
        if _k:
            os.environ.setdefault(_k, _v)


def split_audio(audio_path: Path, chunk_minutes: int = 10) -> list[Path]:
    """Split an audio file into fixed-length chunks using ffmpeg.

    Chunks are written as ``chunk_NNN.m4a`` into a newly created temporary
    directory (prefix ``audioscribe_``). The audio stream is copied, not
    re-encoded (``-c copy``), and video streams are dropped (``-vn``).
    The caller is responsible for deleting the returned files and their
    parent directory.

    Args:
        audio_path: Audio file to split.
        chunk_minutes: Target duration of each chunk, in minutes.

    Returns:
        The chunk paths sorted by name. Empty if ffmpeg produced no chunks,
        in which case the temporary directory has already been removed.

    Exits the process with status 1 if ffmpeg is not on PATH or if the
    ffmpeg invocation fails.
    """
    import shutil
    import subprocess
    import tempfile

    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        print("Error: ffmpeg not found. Install with: brew install ffmpeg")
        sys.exit(1)

    tmp_dir = Path(tempfile.mkdtemp(prefix="audioscribe_"))
    chunk_seconds = chunk_minutes * 60

    # Pass arguments as a list (no shell) so file names containing quotes,
    # spaces, `$` or backticks are handed to ffmpeg verbatim instead of being
    # interpreted by a shell.
    cmd = [
        ffmpeg,
        "-y",
        "-loglevel", "error",
        "-i", str(audio_path),
        "-f", "segment",
        "-segment_time", str(chunk_seconds),
        "-c", "copy",
        "-vn",
        str(tmp_dir / "chunk_%03d.m4a"),
    ]
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        # Do not leave a half-written temp directory behind on failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        print("Error: ffmpeg failed to split the audio file.")
        sys.exit(1)

    chunks = sorted(tmp_dir.glob("chunk_*.m4a"))
    if not chunks:
        # Nothing was produced, so the caller has no path from which to find
        # (and remove) the directory; clean it up here.
        shutil.rmtree(tmp_dir, ignore_errors=True)
    return chunks


def _request_transcription(client, audio_file: Path) -> str:
    """Upload one audio file to the ``whisper-1`` model and return its text, stripped."""
    with open(audio_file, "rb") as f:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="text",
        )
    return response.strip()


def transcribe_with_openai(audio_path: Path) -> str:
    """Transcribe using OpenAI Whisper API (requires OPENAI_API_KEY).

    Files of at most 25 MiB are uploaded in a single request. Larger files
    are first split with ``split_audio``; each chunk is transcribed in order
    and the chunk texts are joined with a single space.

    Args:
        audio_path: Audio file to transcribe.

    Returns:
        The transcribed text with surrounding whitespace removed.

    Exits the process with status 1 if the ``openai`` package is not
    installed or ``OPENAI_API_KEY`` is not set. Exceptions raised by the
    API client propagate to the caller; temporary chunk files are removed
    before they do.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("Error: openai package not installed. Run: pip install openai")
        sys.exit(1)

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set.")
        print("Set it with: export OPENAI_API_KEY=sk-...")
        sys.exit(1)

    client = OpenAI(api_key=api_key)

    # Files above this size are split before upload (presumably the API's
    # per-request upload limit; confirm against the current OpenAI docs).
    max_bytes = 25 * 1024 * 1024
    file_size = audio_path.stat().st_size

    if file_size <= max_bytes:
        print(f"Transcribing '{audio_path.name}' via OpenAI Whisper API...")
        return _request_transcription(client, audio_path)

    # File too large — split into chunks
    print(f"File is {file_size / 1024 / 1024:.1f}MB — splitting into chunks for API...")
    chunks = split_audio(audio_path)
    print(f"  Split into {len(chunks)} chunks. Transcribing each...")

    parts = []
    try:
        for i, chunk in enumerate(chunks, 1):
            print(f"  Chunk {i}/{len(chunks)}: {chunk.name}")
            parts.append(_request_transcription(client, chunk))
            chunk.unlink()  # free disk space as soon as a chunk is done
    finally:
        # Runs on success and on failure: remove any chunks that were not
        # reached (e.g. because a request raised) and then the temp directory.
        for chunk in chunks:
            chunk.unlink(missing_ok=True)
        if chunks:
            tmp_dir = chunks[0].parent
            try:
                tmp_dir.rmdir()
            except OSError as err:
                print(f"Warning: could not remove temporary directory {tmp_dir}: {err}")

    return " ".join(parts)


def transcribe_with_local_whisper(audio_path: Path, model_size: str) -> str:
    """Transcribe an audio file with a locally loaded Whisper model.

    Args:
        audio_path: Audio file to transcribe.
        model_size: Model name handed to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the transcription result, stripped of
        surrounding whitespace.

    Exits the process with status 1 if the ``whisper`` module cannot be
    imported.
    """
    try:
        import whisper
    except ImportError:
        install_hints = (
            "Error: openai-whisper package not installed.",
            "Run: pip install openai-whisper",
            "Note: ffmpeg must also be installed (brew install ffmpeg on Mac).",
        )
        for hint in install_hints:
            print(hint)
        sys.exit(1)

    print(f"Loading local Whisper model '{model_size}' (downloads on first use)...")
    whisper_model = whisper.load_model(model_size)

    print(f"Transcribing '{audio_path.name}'... (this may take a while)")
    # The model API is given the path as a plain string.
    transcription = whisper_model.transcribe(str(audio_path), verbose=False)
    raw_text = transcription["text"]
    return raw_text.strip()


def write_outputs(text: str, audio_path: Path, output_dir: Path):
    """Save a transcription as ``<stem>.txt`` and ``<stem>.md`` in output_dir.

    Args:
        text: Transcribed text.
        audio_path: Source audio file; its stem names the output files and
            its file name is recorded in the Markdown header.
        output_dir: Directory that receives both files.

    The text file holds the text followed by a newline. The Markdown file
    prefixes the text with a title, the source file name, and the current
    local time. Both are written as UTF-8 and replace existing files.
    """
    stem = audio_path.stem
    transcribed_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Plain-text variant.
    plain_file = output_dir / f"{stem}.txt"
    plain_file.write_text(text + "\n", encoding="utf-8")
    print(f"  Saved TXT: {plain_file}")

    # Markdown variant: metadata header, horizontal rule, then the text.
    # The two trailing spaces after the source line force a Markdown line break.
    header = (
        f"# Transcription: {stem}\n"
        "\n"
        f"**Source:** `{audio_path.name}`  \n"
        f"**Transcribed:** {transcribed_at}\n"
        "\n"
        "---\n"
        "\n"
    )
    markdown_file = output_dir / f"{stem}.md"
    markdown_file.write_text(f"{header}{text}\n", encoding="utf-8")
    print(f"  Saved MD:  {markdown_file}")


def main():
    """Command-line entry point.

    Parses the arguments, validates the input path, runs either the local
    Whisper model (``--local``) or the OpenAI API, and writes the result as
    ``.txt`` and ``.md`` files into the output directory (by default the
    directory containing the input file).

    Exits the process with status 1 if the input path does not exist or is
    not a regular file.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe .m4a audio files to .txt and .md",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("audio_file", help="Path to the .m4a audio file")
    parser.add_argument(
        "--local",
        action="store_true",
        help="Use a local Whisper model instead of the OpenAI API (works offline)",
    )
    parser.add_argument(
        "--model",
        default="base",
        choices=["tiny", "base", "small", "medium", "large"],
        help="Local Whisper model size (default: base). Larger = more accurate but slower.",
    )
    parser.add_argument(
        "--output-dir", "-o",
        default=None,
        help="Directory to write output files (default: same directory as input)",
    )

    args = parser.parse_args()

    audio_path = Path(args.audio_file).expanduser().resolve()
    if not audio_path.exists():
        print(f"Error: File not found: {audio_path}")
        sys.exit(1)
    # A directory also "exists"; reject it here with a clear message instead
    # of failing later with a traceback when the path is opened as a file.
    if not audio_path.is_file():
        print(f"Error: Not a file: {audio_path}")
        sys.exit(1)
    # An unknown extension is only a warning; the file is still processed.
    if audio_path.suffix.lower() not in (".m4a", ".mp3", ".mp4", ".wav", ".ogg", ".flac"):
        print(f"Warning: Unexpected file extension '{audio_path.suffix}' — proceeding anyway.")

    output_dir = Path(args.output_dir).expanduser().resolve() if args.output_dir else audio_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.local:
        text = transcribe_with_local_whisper(audio_path, args.model)
    else:
        text = transcribe_with_openai(audio_path)

    # Empty output is reported but the (empty) files are still written.
    if not text:
        print("Warning: Transcription returned empty text.")

    write_outputs(text, audio_path, output_dir)
    print("Done.")


# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()