diff --git a/manim_voiceover/modify_audio.py b/manim_voiceover/modify_audio.py
index e0912c6..be57925 100644
--- a/manim_voiceover/modify_audio.py
+++ b/manim_voiceover/modify_audio.py
@@ -2,6 +2,7 @@
 import sox
 import uuid
 from mutagen.mp3 import MP3
+from mutagen.wave import WAVE
 
 
 def adjust_speed(input_path: str, output_path: str, tempo: float) -> None:
@@ -19,6 +20,9 @@ def adjust_speed(input_path: str, output_path: str, tempo: float) -> None:
 
 
 def get_duration(path: str) -> float:
-    audio = MP3(path)
-    return audio.info.length
+    """Return the duration in seconds of the WAV or MP3 file at ``path``."""
+    # Compare case-insensitively so "clip.WAV" is not handed to the MP3 parser.
+    if str(path).lower().endswith(".wav"):
+        return WAVE(path).info.length
+    return MP3(path).info.length
     # return sox.file_info.duration(path)
diff --git a/manim_voiceover/services/gemini.py b/manim_voiceover/services/gemini.py
new file mode 100644
index 0000000..3bd5046
--- /dev/null
+++ b/manim_voiceover/services/gemini.py
@@ -0,0 +1,167 @@
+import os
+import sys
+import wave
+from pathlib import Path
+from dotenv import find_dotenv, load_dotenv
+from manim import logger
+
+from manim_voiceover.helper import (
+    create_dotenv_file,
+    remove_bookmarks,
+)
+from manim_voiceover.services.base import SpeechService
+
+try:
+    from google import genai
+    from google.genai import types
+except ImportError:
+    logger.error(
+        "Missing packages. "
+        'Run `pip install google-genai` to use GeminiTTSService.'
+    )
+
+load_dotenv(find_dotenv(usecwd=True))
+
+# Gemini TTS returns headerless PCM; these values describe it for the WAV header.
+PCM_CHANNELS = 1
+PCM_SAMPLE_WIDTH = 2  # bytes per sample (16-bit)
+PCM_SAMPLE_RATE = 24000  # Hz
+
+
+def create_dotenv_gemini():
+    """Create a .env entry for GOOGLE_API_KEY, then exit so Manim can be restarted.
+
+    Raises:
+        ValueError: If ``create_dotenv_file`` reports that the key was not set.
+    """
+    logger.info("You need a Gemini API key from https://makersuite.google.com/app/apikey")
+    if not create_dotenv_file(["GOOGLE_API_KEY"]):
+        raise ValueError(
+            "The environment variable GOOGLE_API_KEY is not set. "
+            "Please add it to your .env file."
+        )
+    logger.info("The .env file has been created. Please restart Manim.")
+    sys.exit()
+
+
+class GeminiTTSService(SpeechService):
+    """Speech service backed by Gemini text-to-speech models (``google-genai`` SDK).
+
+    Reads the API key from the ``GOOGLE_API_KEY`` environment variable and
+    stores the synthesized speech as WAV files in the cache directory.
+    """
+
+    def __init__(
+        self,
+        model: str = "gemini-2.5-flash-preview-tts",
+        voice_name: str = "Kore",
+        **kwargs
+    ):
+        """
+        Args:
+            model: Name of the Gemini TTS model to call.
+            voice_name: Name of the prebuilt Gemini voice to use.
+            **kwargs: Forwarded to ``SpeechService.__init__``.
+                ``transcription_model`` defaults to ``None`` when omitted.
+        """
+        self.model = model
+        self.voice_name = voice_name
+
+        api_key = os.getenv("GOOGLE_API_KEY")
+        if not api_key:
+            # Exits the process (or raises) -- never returns.
+            create_dotenv_gemini()
+
+        self.client = genai.Client(api_key=api_key)
+
+        # setdefault (instead of a hard-coded keyword) lets callers pass their
+        # own transcription_model without a duplicate-keyword TypeError.
+        kwargs.setdefault("transcription_model", None)
+        super().__init__(**kwargs)
+
+    def generate_from_text(
+        self,
+        text: str,
+        cache_dir: str = None,
+        path: str = None,
+        **kwargs
+    ) -> dict:
+        """Synthesize ``text`` with Gemini and save it as a WAV file.
+
+        Args:
+            text: Text to speak; bookmarks are stripped before synthesis.
+            cache_dir: Directory for the audio file. Defaults to
+                ``self.cache_dir``.
+            path: File name relative to ``cache_dir``. Defaults to the
+                cache basename of the request plus ``.wav``.
+
+        Returns:
+            The cached result if one exists, otherwise a dict with the keys
+            ``input_text``, ``input_data`` and ``original_audio``.
+
+        Raises:
+            RuntimeError: If the response contains no audio data.
+        """
+        if cache_dir is None:
+            cache_dir = self.cache_dir
+
+        clean_text = remove_bookmarks(text)
+
+        input_data = {
+            "input_text": clean_text,
+            "service": "gemini",
+            "config": {
+                "model": self.model,
+                "voice": self.voice_name,
+            },
+        }
+
+        cached = self.get_cached_result(input_data, cache_dir)
+        if cached:
+            return cached
+
+        try:
+            response = self.client.models.generate_content(
+                model=self.model,
+                contents=clean_text,
+                config=types.GenerateContentConfig(
+                    response_modalities=["AUDIO"],
+                    speech_config=types.SpeechConfig(
+                        voice_config=types.VoiceConfig(
+                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                                voice_name=self.voice_name,
+                            )
+                        )
+                    ),
+                ),
+            )
+        except Exception as e:
+            logger.error(f"Gemini TTS generation failed: {e}")
+            raise
+
+        # A blocked or empty response has no candidates/parts/inline data;
+        # fail loudly instead of caching a broken or zero-length WAV file.
+        try:
+            audio_data = response.candidates[0].content.parts[0].inline_data.data
+        except (AttributeError, IndexError, TypeError) as e:
+            raise RuntimeError(
+                "Gemini TTS response did not contain audio data."
+            ) from e
+        if not audio_data:
+            raise RuntimeError("Gemini TTS response did not contain audio data.")
+
+        audio_path = path or self.get_audio_basename(input_data) + ".wav"
+        full_path = Path(cache_dir) / audio_path
+
+        # Wrap the raw PCM samples in a WAV container.
+        with wave.open(str(full_path), "wb") as wf:
+            wf.setnchannels(PCM_CHANNELS)
+            wf.setsampwidth(PCM_SAMPLE_WIDTH)
+            wf.setframerate(PCM_SAMPLE_RATE)
+            wf.writeframes(audio_data)
+
+        return {
+            "input_text": text,
+            "input_data": input_data,
+            "original_audio": audio_path,
+        }