Merge pull request #94 from JoshRosen/fix-wav-header-in-streaming-res…

…ponses
remsky · Jan 26, 2025 · 0de22ad · 0de22ad
2 parents 55ce88b + b8d5920
commit 0de22ad
Showing 1 changed file with 25 additions and 8 deletions.
diff --git a/api/src/services/audio.py b/api/src/services/audio.py
@@ -1,6 +1,7 @@
 """Audio conversion service"""
 
 from io import BytesIO
+import struct
 
 import numpy as np
 import scipy.io.wavfile as wavfile
@@ -107,14 +108,30 @@ def convert_audio(
                 # Raw 16-bit PCM samples, no header
                 buffer.write(normalized_audio.tobytes())
             elif output_format == "wav":
-                # WAV format with headers
-                sf.write(
-                    buffer,
-                    normalized_audio,
-                    sample_rate,
-                    format="WAV",
-                    subtype="PCM_16",
-                )
+                # Write the WAV header ourselves so that we can specify a "fake" data size.
+                # This is necessary for streaming responses to work properly: if we simply
+                # concatenated individual WAV files then the initial chunk's header length
+                # would be shorter than the full file length and subsequent chunks' RIFF
+                # headers would appear in the middle of the audio data.
+                if is_first_chunk:
+                    # Modified from Python stdlib's wave.py module:
+                    buffer.write(b'RIFF')
+                    buffer.write(struct.pack('<L4s4sLHHLLHH4s',
+                        0xFFFFFFFF,  # total size (set to max)
+                        b'WAVE',
+                        b'fmt ',
+                        16,
+                        1,  # PCM format
+                        1,  # channels
+                        sample_rate,
+                        sample_rate * 2,  # byte rate
+                        2,  # block align
+                        16,  # bits per sample
+                        b'data'
+                    ))
+                    buffer.write(struct.pack('<L', 0xFFFFFFFF))  # data size (set to max)
+                # write raw PCM data
+                buffer.write(normalized_audio.tobytes())
             elif output_format == "mp3":
                 # MP3 format with proper framing
                 settings = format_settings.get("mp3", {}) if format_settings else {}