facebeak/audio.py at main · EdgeCaser/facebeak · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import os
import librosa
import numpy as np
import logging
import soundfile as sf
import warnings
import subprocess
from pathlib import Path
import tempfile

# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)


def extract_audio_features(audio_path, sr=16000, n_fft=512, hop_length=256, n_mels=128):
    """
    Compute normalized mel-spectrogram and chroma features for an audio file.

    The file is read with soundfile first; if that raises for any reason the
    function falls back to ``librosa.load``. The signal is resampled when the
    loaded rate differs from ``sr`` and zero-padded when it is shorter than
    ``n_fft``.

    Args:
        audio_path: Path to audio file
        sr: Target sample rate
        n_fft: FFT window size
        hop_length: Number of samples between successive frames
        n_mels: Number of mel bands

    Returns:
        tuple: (mel_spectrogram, chroma_features), both float32 numpy arrays
        min-max scaled to [0, 1]. The mel spectrogram is converted to dB
        before scaling.

    Raises:
        Any exception raised while processing is logged and re-raised;
        FileNotFoundError gets its own log message.
    """
    def _rescale(values):
        # Min-max scaling; the small epsilon guards against a zero range.
        floor = values.min()
        return (values - floor) / (values.max() - floor + 1e-8)

    try:
        # Primary reader is soundfile; any failure in this inner block
        # (including the channel averaging) switches to librosa.
        try:
            signal, native_sr = sf.read(audio_path)
            if signal.ndim > 1:
                # More than one dimension: average over axis 1 to get mono.
                signal = signal.mean(axis=1)
        except Exception:
            logger.warning("PySoundFile failed. Trying audioread instead.")
            signal, native_sr = librosa.load(audio_path, sr=sr, mono=True)

        # Bring the signal to the requested rate when the loaded rate differs.
        if native_sr != sr:
            signal = librosa.resample(y=signal, orig_sr=native_sr, target_sr=sr)

        # A signal shorter than one FFT window is zero-padded at the end.
        shortfall = n_fft - len(signal)
        if shortfall > 0:
            warnings.warn(f"Audio length {len(signal)} is shorter than n_fft {n_fft}. Padding with zeros.")
            signal = np.pad(signal, (0, shortfall))

        # Both feature extractors share the same framing arguments.
        frame_args = {"y": signal, "sr": sr, "n_fft": n_fft, "hop_length": hop_length}

        # Mel power spectrogram, then dB relative to its own maximum.
        mel_power = librosa.feature.melspectrogram(
            n_mels=n_mels,
            fmin=20,
            fmax=sr/2,
            **frame_args
        )
        mel_db = librosa.power_to_db(mel_power, ref=np.max)

        chroma_raw = librosa.feature.chroma_stft(**frame_args)

        # Scale each feature to [0, 1] and store as float32.
        mel_out = _rescale(mel_db).astype(np.float32)
        chroma_out = _rescale(chroma_raw).astype(np.float32)

        logger.info(f"Audio file {audio_path} processed. Computed mel spectrogram (shape {mel_out.shape}) and chroma (shape {chroma_out.shape}).")

        return mel_out, chroma_out

    except FileNotFoundError:
        logger.error(f"Audio file not found: {audio_path}.")
        raise
    except Exception as e:
        logger.error(f"Error processing audio file {audio_path}: {e}")
        raise

def extract_and_save_crow_audio(video_path, frame_time_seconds, fps, crow_id, frame_num, audio_dir, duration=2.0):
    """
    Extract an audio segment from a video around the moment a crow is detected.

    ffmpeg writes the clip as 16 kHz mono 16-bit PCM WAV to
    ``<audio_dir>/<crow_id>/frame_<frame_num>_<start>s.wav``. The window is
    centered on ``frame_time_seconds`` and clamped so it never starts before
    0. Failures are logged and reported by returning None rather than raising.

    Args:
        video_path: Path to the video file
        frame_time_seconds: Time in video when crow was detected (seconds)
        fps: Frames per second of the video (not used by this function; kept
            so existing callers keep working)
        crow_id: ID of the detected crow; converted with str() to name the
            per-crow subdirectory, so non-string IDs (e.g. ints) are accepted
        frame_num: Frame number where crow was detected (formatted as a
            zero-padded integer in the file name)
        audio_dir: Directory to save audio files
        duration: Duration of audio segment to extract (seconds)

    Returns:
        str: Path to saved audio file, or None if extraction failed
    """
    try:
        if not video_path or not os.path.exists(video_path):
            logger.warning(f"Video file not found: {video_path}")
            return None

        # Path's "/" operator only accepts str/PathLike operands, so a
        # non-string crow_id must be converted before joining.
        crow_audio_dir = Path(audio_dir) / str(crow_id)
        crow_audio_dir.mkdir(parents=True, exist_ok=True)

        # Center the window on the detection, but never start before 0.
        start_time = max(0, frame_time_seconds - duration / 2)

        audio_filename = f"frame_{frame_num:06d}_{start_time:.2f}s.wav"
        output_path = crow_audio_dir / audio_filename

        # -ss placed before -i makes ffmpeg seek in the input instead of
        # decoding and discarding everything up to start_time, which keeps
        # extractions late in long videos within the timeout. The audio is
        # transcoded, so the seek remains accurate.
        cmd = [
            'ffmpeg',
            '-ss', str(start_time),
            '-i', str(video_path),
            '-t', str(duration),
            '-acodec', 'pcm_s16le',
            '-ar', '16000',
            '-ac', '1',  # Mono
            '-y',  # Overwrite output file
            str(output_path)
        ]

        try:
            # Capture ffmpeg's output so it does not reach the console.
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        except subprocess.TimeoutExpired:
            logger.warning(f"Audio extraction timed out for {video_path}")
            return None
        except FileNotFoundError:
            # Raised by subprocess when the ffmpeg executable is missing.
            logger.warning("ffmpeg not found. Audio extraction disabled.")
            return None
        except Exception as e:
            logger.warning(f"Error running ffmpeg: {e}")
            return None

        # Require both a clean exit and an output file on disk.
        if result.returncode == 0 and output_path.exists():
            logger.debug(f"Extracted audio segment: {output_path}")
            return str(output_path)

        logger.warning(f"ffmpeg failed to extract audio: {result.stderr}")
        return None

    except Exception as e:
        logger.error(f"Error extracting audio from {video_path}: {e}")
        return None