Commits
24 commits
96507c6
messy, started adding speechbrain implementation
RobertAgee Jun 13, 2025
aa3c0b6
Create generated.mp3
RobertAgee Jun 13, 2025
7ccf4ea
upload readme audio files
RobertAgee Jun 13, 2025
a4912e8
Update and rename generated.mp3 to .gitkeep
RobertAgee Jun 13, 2025
ad8c9bb
add speechbrain to fitness_scorer for greater accuracy
RobertAgee Jun 14, 2025
2fbc497
GPU optimization in process
RobertAgee Jun 14, 2025
f2eede1
GPU optimization, tensor cleanup, memory use tracking
RobertAgee Jun 15, 2025
52dc9b6
Further GPU optimization, memory use tracking
RobertAgee Jun 15, 2025
81f1ea2
Full GPU optimization, memory and timing logs, config settings
RobertAgee Jun 16, 2025
4675774
messy, started adding speechbrain implementation
RobertAgee Jun 13, 2025
63910d3
add speechbrain to fitness_scorer for greater accuracy
RobertAgee Jun 14, 2025
c7e6b6d
GPU optimization in process
RobertAgee Jun 14, 2025
d9fcc8e
GPU optimization, tensor cleanup, memory use tracking
RobertAgee Jun 15, 2025
67f5f98
Further GPU optimization, memory use tracking
RobertAgee Jun 15, 2025
4e67736
Full GPU optimization, memory and timing logs, config settings
RobertAgee Jun 16, 2025
60aee79
Merge branch 'RobViren:main' into similarity-checker
RobertAgee Jun 16, 2025
b834cf1
Merge pull request #1 from RobertAgee/similarity-checker
RobertAgee Jun 16, 2025
4535210
small lints
RobertAgee Jun 17, 2025
5de2b40
Merge remote-tracking branch 'origin/main'
RobertAgee Jun 17, 2025
e607c0e
Add packages to toml, update config settings
RobertAgee Jun 17, 2025
66b2688
correct spelling in pyproject.toml
RobertAgee Jun 17, 2025
cbc75f1
correct spelling in pyproject.toml
RobertAgee Jun 17, 2025
f3cbf03
small bug fix for interpolate_start
RobertAgee Jun 17, 2025
30c1884
improve pt saving by moving to cpu first
RobertAgee Jun 17, 2025
1 change: 1 addition & 0 deletions example/readme/.gitkeep
@@ -0,0 +1 @@

Binary file added example/readme/baseline.wav
Binary file not shown.
Binary file added example/readme/generated.wav
Binary file not shown.
Binary file added example/readme/original-clip.wav
Binary file not shown.
45 changes: 36 additions & 9 deletions main.py
@@ -1,17 +1,35 @@
import argparse
import os
import traceback
from pathlib import Path

import numpy as np
import soundfile as sf
import torch

from utilities.audio_processor import Transcriber, convert_to_wav_mono_24k
from utilities.kvoicewalk import KVoiceWalk
from utilities.kvw_informer import KVW_Informer
from utilities.pytorch_sanitizer import load_multiple_voices
from utilities.speech_generator import SpeechGenerator


def main():
# import config settings
kvw_informer = KVW_Informer()
kvw_settings = kvw_informer.settings
log_view = kvw_settings["preprocessing_logs"]
use_cached = kvw_settings["use_cached"]
cap_memory = kvw_settings["cap_memory"]
cap_memory_frac = kvw_settings["cap_memory_frac"]
# After initial download, recommend use cached copies of models == faster load times
if use_cached:
os.environ['HF_HUB_OFFLINE'] = '1' # Force offline mode
os.environ['TRANSFORMERS_OFFLINE'] = '1'
# True: Limits excess memory overhead reservation, benchmarked at ~0.70GB throughout operation, no spikes
# Cap_memory_frac = 0.2, can be set 0-1, but recommend no lower than 0.15
if cap_memory: torch.cuda.set_per_process_memory_fraction(cap_memory_frac)

parser = argparse.ArgumentParser(description="A random walk Kokoro voice cloner.")

# Common required arguments
@@ -78,6 +96,7 @@ def main():

# Handle target_audio input - convert to mono wav 24K automatically
if args.target_audio:
if log_view is True: kvw_informer.log_gpu_memory("Preprocessing target audio file", log_view)
try:
target_audio_path = Path(args.target_audio)
if target_audio_path.is_file():
@@ -89,6 +108,7 @@

# Transcribe (Start Mode)
if args.transcribe_start:
if log_view is True: kvw_informer.log_gpu_memory("Transcribing target audio file", log_view)
try:
target_path = Path(args.target_audio)

@@ -165,7 +185,8 @@ def main():
if not args.target_text:
parser.error("--target_text is required when using --test_voice")

speech_generator = SpeechGenerator()
speech_generator = SpeechGenerator(kvw_informer=kvw_informer, target_text=args.target_text,
other_text=args.other_text)
audio = speech_generator.generate_audio(args.target_text, args.test_voice)
sf.write(args.output_name, audio, 24000)
else:
@@ -175,15 +196,21 @@
if not args.target_text:
parser.error("--target_text is required for random walk mode")

if log_view is True: kvw_informer.log_gpu_memory("Initializing KVoicewalk", log_view)
ktb = KVoiceWalk(args.target_audio,
args.target_text,
args.other_text,
args.voice_folder,
args.interpolate_start,
args.population_limit,
args.target_text,
args.other_text,
args.voice_folder,
args.interpolate_start,
args.population_limit,
args.starting_voice,
args.output_name)
ktb.random_walk(args.step_limit)

args.output_name, kvw_informer)
try:
ktb.random_walk(args.step_limit)
except Exception as e:
print("FULL TRACEBACK:")
traceback.print_exc()
print(f"\nERROR: {e}")
print(f"ERROR TYPE: {type(e)}")
if __name__ == "__main__":
main()
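
Note on the config-driven setup added to main.py above: the snippet below is a minimal standalone sketch (not the project's exact code) of the same two ideas, forcing Hugging Face libraries to load models from the local cache and capping the fraction of GPU memory this process may reserve. The `settings` dict here is a simplified stand-in for the `KVW_Informer` settings used in main.py.

```python
# Minimal sketch: cap CUDA memory and use cached models (simplified stand-in for KVW_Informer settings).
import os
import torch

settings = {
    "use_cached": True,       # after the first download, load models from the local HF cache
    "cap_memory": True,       # limit how much of the GPU this process may reserve
    "cap_memory_frac": 0.2,   # fraction of total GPU memory (0-1); ~0.15 is a practical floor
}

if settings["use_cached"]:
    os.environ["HF_HUB_OFFLINE"] = "1"        # huggingface_hub stops hitting the network
    os.environ["TRANSFORMERS_OFFLINE"] = "1"  # transformers does the same

if settings["cap_memory"] and torch.cuda.is_available():
    # Applies to the current device; allocations beyond the fraction raise an out-of-memory
    # error instead of letting the caching allocator grow unbounded.
    torch.cuda.set_per_process_memory_fraction(settings["cap_memory_frac"])
```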
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "kvoicewalk"
version = "0.1.0"
description = "Add your description here"
description = "A randomwalk through Kokoro latent space"
readme = "README.md"
requires-python = ">=3.10,<3.13"
dependencies = [
@@ -12,4 +12,7 @@ dependencies = [
"soundfile>=0.13.1",
"torch>=2.7.0",
"tqdm>=4.67.1",
"faster-whisper>=1.1.1",
"speechbrain>=1.0.3",
"torchaudio>=2.7.0",
]
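
The new `speechbrain` and `torchaudio` dependencies support the speaker-similarity scoring referenced in the commit history ("add speechbrain to fitness_scorer for greater accuracy"). The fitness scorer itself is outside this diff, so the following is only a hedged sketch of how SpeechBrain's pretrained ECAPA-TDNN verification model can score how close a generated clip sounds to a target clip; the file names are placeholders.

```python
# Sketch only: speaker similarity between a target clip and a generated clip
# using SpeechBrain's pretrained ECAPA-TDNN verification model.
from speechbrain.inference.speaker import SpeakerRecognition  # speechbrain >= 1.0

verifier = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)

# verify_files returns (score, prediction): score is a cosine-similarity tensor,
# prediction is a boolean tensor for "same speaker" at the default threshold.
score, prediction = verifier.verify_files("target.wav", "generated.wav")
print(f"similarity: {score.item():.3f}  same speaker: {bool(prediction)}")
```

In a fitness scorer, a score like this could be combined with other metrics to decide whether a random-walk step moved the cloned voice closer to the target.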
26 changes: 12 additions & 14 deletions utilities/audio_processor.py
@@ -10,10 +10,10 @@


def convert_to_wav_mono_24k(audio_path: Path) -> Path:
print(f"Converting {audio_path.name} to Mono Wav 24K...")
try:
with sf.SoundFile(audio_path, 'r') as f:
if f.format != 'WAV' or f.samplerate != 24000 or f.channels != 1:
print(f"Converting {audio_path.name} to Mono Wav 24K...")
# Create output filename with proper audio format
converted_audio_file = Path(CONVERTED_DIR / str(audio_path.stem + ".wav"))

@@ -23,7 +23,7 @@ def convert_to_wav_mono_24k(audio_path: Path) -> Path:
# Convert to mono if needed
if f.channels > 1:
converted_audio_data = np.mean(audio_data, axis=1)
print("Cenverted to Mono...")
# print("Cenverted to Mono...")
else:
converted_audio_data = audio_data

@@ -34,14 +34,14 @@ def convert_to_wav_mono_24k(audio_path: Path) -> Path:
orig_sr=f.samplerate,
target_sr=24000
)
print("Resampled to 24K...")
# print("Resampled to 24K...")

# Save converted audio
sf.write(converted_audio_file, converted_audio_data, samplerate=24000, format='WAV')
print(f"{audio_path.name} successfully converted to Mono WAV 24K format: {converted_audio_file}")
print(f"{audio_path.name} converted to Mono WAV 24K format: {converted_audio_file}")
return converted_audio_file
else:
print(f"{audio_path.name} matches Mono WAV 24K format")
# print(f"{audio_path.name} matches Mono WAV 24K format")
return audio_path

except Exception as e:
@@ -51,7 +51,7 @@ def convert_to_wav_mono_24k(audio_path: Path) -> Path:
class Transcriber:
def __init__(self):
model_size = "large-v3"
print('Starting Transcriber...')
# print('Starting Transcriber...')
# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")

@@ -66,26 +66,24 @@ def transcribe(self, audio_path: Path):
start_time = datetime.datetime.now()

try:
print(f'Loading {audio_file.name}...')
# print(f'Loading {audio_file.name}...')
segments, info = self.model.transcribe(str(audio_file), beam_size=5)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
print(f'Transcribing {audio_file.name}...')

transcription = ''
transcription = ""
for segment in segments:
transcription += segment.text
# print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) # Optional timestamps if parsing longer audio clips
transcription += " " + segment.text.strip()

transcription_output = Path(TEXTS_DIR / str(f"{audio_file.stem}.txt"))
with open(str(transcription_output), "w") as file:
file.write(f"{transcription}")
file.write(f"{transcription[1:]}")

end_time = datetime.datetime.now()
print(f"Transcription completed in {(end_time - start_time).total_seconds()} seconds")
print(f"Transcription available at ./texts/{audio_file.name[:-4]}.txt")
print(f"{audio_file.name} Transcription:\n{transcription}")
return transcription
print(f"{audio_file.name} Transcription:\n{transcription[1:]}")
return transcription[1:]

except Exception as e:
print(f"Transcription failed for {audio_file.name} - Error: {e}")
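
The `kvw_informer.log_gpu_memory(...)` calls added to main.py point at a logging helper in `utilities/kvw_informer.py`, which is not shown in this diff. Purely to illustrate the kind of memory tracking the commit messages describe, a hypothetical logger could look like the sketch below; the class and method names mirror the calls in main.py, but the body is an assumption, not the project's implementation.

```python
# Hypothetical sketch of a GPU memory logger; the real KVW_Informer is not part of this diff.
import torch

class KVW_Informer:
    def log_gpu_memory(self, label: str, enabled: bool = True) -> None:
        if not enabled or not torch.cuda.is_available():
            return
        allocated = torch.cuda.memory_allocated() / 1024**3  # memory held by live tensors, in GB
        reserved = torch.cuda.memory_reserved() / 1024**3    # memory reserved by the caching allocator, in GB
        print(f"[GPU] {label}: allocated={allocated:.2f} GB, reserved={reserved:.2f} GB")
```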