diff --git a/DESCRIPTION b/DESCRIPTION index 1e3d157..34bfb9b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: chatterbox Title: Text-to-Speech Using Chatterbox TTS Engine -Version: 0.1.0.6 +Version: 0.1.0.7 Authors@R: c(person("Troy", "Hernandez", role = c("aut", "cre"), email = "troy@cornball.ai", diff --git a/NAMESPACE b/NAMESPACE index d8e04e8..cdfa788 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,6 +41,7 @@ export(text_to_tokens) export(tts_chunked) export(tts_to_file) export(turbo_models_available) +export(voice_convert) export(write_audio) S3method(print,chatterbox) diff --git a/NEWS.md b/NEWS.md index 01376ef..40356bd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# chatterbox 0.1.0.7 (development) + +- New `voice_convert()`: speech-to-speech voice conversion (port of + Python ChatterboxVC); re-renders source speech in a target voice, + preserving the source timing. + # chatterbox 0.1.0.6 (development) - `generate(skip_vocoder = TRUE)` returns the mel spectrogram instead of diff --git a/R/tts.R b/R/tts.R index 470ac8b..ab0d61d 100644 --- a/R/tts.R +++ b/R/tts.R @@ -615,7 +615,7 @@ generate <- function(model, text, voice, exaggeration = 0.5, tts_to_file <- function(model, text, voice, output_path, ...) { if (isTRUE(list(...)$skip_vocoder)) { stop("skip_vocoder makes no sense here: there is no audio to ", - "write. Use generate() to get the mel.") + "write. Use generate() to get the mel.") } result <- generate(model, text, voice, ...) write_audio(result$audio, result$sample_rate, output_path) diff --git a/R/vc.R b/R/vc.R new file mode 100644 index 0000000..8fb5df4 --- /dev/null +++ b/R/vc.R @@ -0,0 +1,86 @@ +# Voice conversion (port of Python chatterbox's vc.py). +# Speech-to-speech: re-synthesize source speech in the target voice. +# Skips T3 entirely - the source audio is tokenized by the S3 tokenizer +# and S3Gen regenerates it conditioned on the target speaker. 
+
+#' Convert speech to a target voice
+#'
+#' Re-synthesizes \code{audio} so the same words and prosody come out in
+#' the target voice (Python chatterbox's \code{ChatterboxVC}). No text
+#' or T3 generation is involved: the source speech is tokenized directly
+#' (25 tokens/s) and S3Gen renders the tokens with the target speaker's
+#' conditioning, so the result follows the source's timing.
+#'
+#' @param model Loaded chatterbox model (standard, not turbo)
+#' @param audio Source speech (file path, numeric vector, or torch
+#' tensor)
+#' @param voice Target voice: a voice_embedding from
+#' \code{\link{create_voice_embedding}} (or
+#' \code{\link{load_voice_embedding}}), or a path to reference audio
+#' @param sample_rate Sample rate of \code{audio} (if not a file)
+#' @return List with \code{audio} (numeric vector), \code{sample_rate}
+#' (24000), and \code{audio_sec}, like \code{\link{generate}}
+#' @export
+voice_convert <- function(model, audio, voice, sample_rate = NULL) {
+    if (!is_loaded(model)) {
+        stop("Model not loaded. Call load_chatterbox() first.")
+    }
+    if (isTRUE(model$turbo)) {
+        stop("Voice conversion uses the standard S3Gen decoder; ",
+             "load a standard (non-turbo) model.")
+    }
+
+    # Target voice conditioning (only ref_dict is used; Python VC's
+    # embed_ref caps the reference at 10 s, as create_voice_embedding
+    # already does)
+    if (is.character(voice)) {
+        voice <- create_voice_embedding(model, voice)
+    }
+    if (!inherits(voice, "voice_embedding")) {
+        stop("voice must be a voice_embedding object or path to ",
+             "reference audio")
+    }
+
+    # Source speech at 16 kHz for the S3 tokenizer
+    is_tensor <- inherits(audio, "torch_tensor")
+    if (is.character(audio)) {
+        audio_data <- read_audio(audio)
+        samples <- audio_data$samples
+        sample_rate <- audio_data$sr
+    } else if (is.numeric(audio) || is_tensor) {
+        if (is.null(sample_rate)) {
+            stop("sample_rate must be provided for ",
+                 if (is_tensor) "tensor" else "numeric", " audio input")
+        }
+        samples <- if (is_tensor) as.numeric(audio$cpu()) else audio
+    } else {
+        stop("audio must be a file path, numeric vector, or torch tensor")
+    }
+
+    # Fail early and clearly rather than inside resampling or the tokenizer
+    if (!is.numeric(sample_rate) || length(sample_rate) != 1L ||
+        !is.finite(sample_rate) || sample_rate <= 0) {
+        stop("sample_rate must be a single positive number")
+    }
+    if (length(samples) == 0L) {
+        stop("audio contains no samples")
+    }
+    if (sample_rate != S3_SR) {
+        samples <- resample_audio(samples, sample_rate, S3_SR)
+    }
+    device <- model$device
+    audio_16k <- torch::torch_tensor(samples,
+        dtype = torch::torch_float32())$unsqueeze(1)$to(device = device)
+
+    torch::with_no_grad({
+        # Full-length tokenization: VC keeps the source's timing
+        tok <- model$s3gen$tokenizer$forward(audio_16k)
+        result <- model$s3gen$inference(
+            speech_tokens = tok$tokens$to(device = device),
+            ref_dict = voice$ref_dict, finalize = TRUE)
+    })
+
+    audio_samples <- as.numeric(result[[1]]$squeeze()$cpu())
+    list(audio = audio_samples, sample_rate = S3GEN_SR,
+         audio_sec = length(audio_samples) / S3GEN_SR)
+}
diff --git a/inst/tinytest/test_vc.R b/inst/tinytest/test_vc.R
new file mode 100644
index 0000000..6d8e5f9
--- /dev/null
+++ b/inst/tinytest/test_vc.R
@@ -0,0 +1,23 @@
+# voice_convert error 
paths (no weights needed)
+
+if (requireNamespace("torch", quietly = TRUE) && torch::torch_is_installed()) {
+    # Lightweight stand-ins: a bare classed list is enough to reach each
+    # check, so none of these cases loads weights
+    stub_model <- function(turbo) {
+        structure(list(loaded = TRUE, turbo = turbo), class = "chatterbox")
+    }
+    target <- structure(list(ref_dict = list()), class = "voice_embedding")
+    ready <- stub_model(turbo = FALSE)
+    convert <- chatterbox::voice_convert
+
+    # Model state: not yet loaded, then loaded but turbo
+    expect_error(convert(chatterbox::chatterbox("cpu"), "x.wav", "y.wav"),
+                 "not loaded")
+    expect_error(convert(stub_model(turbo = TRUE), "x.wav", "y.wav"),
+                 "non-turbo")
+
+    # Argument validation on a loaded, non-turbo model
+    expect_error(convert(ready, TRUE, target), "file path, numeric")
+    expect_error(convert(ready, c(0, 0.1), target), "sample_rate")
+    expect_error(convert(ready, c(0, 0.1), list()), "voice_embedding")
+}
diff --git a/man/voice_convert.Rd b/man/voice_convert.Rd
new file mode 100644
index 0000000..7f9d3aa
--- /dev/null
+++ b/man/voice_convert.Rd
@@ -0,0 +1,30 @@
+% tinyrox says don't edit this manually, but it can't stop you!
+\name{voice_convert}
+\alias{voice_convert}
+\title{Convert speech to a target voice}
+\usage{
+voice_convert(model, audio, voice, sample_rate = NULL)
+}
+\arguments{
+\item{model}{Loaded chatterbox model (standard, not turbo)}
+
+\item{audio}{Source speech (file path, numeric vector, or torch
+tensor)}
+
+\item{voice}{Target voice: a voice_embedding from
+\code{\link{create_voice_embedding}} (or
+\code{\link{load_voice_embedding}}), or a path to reference audio}
+
+\item{sample_rate}{Sample rate of \code{audio} (if not a file)}
+}
+\value{
+List with \code{audio} (numeric vector), \code{sample_rate}
+ (24000), and \code{audio_sec}, like \code{\link{generate}}
+}
+\description{
+Re-synthesizes \code{audio} so the same words and prosody come out in
+the target voice (Python chatterbox's \code{ChatterboxVC}). 
No text
+or T3 generation is involved: the source speech is tokenized directly
+(25 tokens/s) and S3Gen renders the tokens with the target speaker's
+conditioning, so the result follows the source's timing.
+}
diff --git a/scripts/vc_reference.py b/scripts/vc_reference.py
new file mode 100644
index 0000000..cead29a
--- /dev/null
+++ b/scripts/vc_reference.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""Run Python ChatterboxVC for R voice_convert comparison.
+
+Converts the JFK sample to the reference voice with the upstream Python
+implementation and writes a 24 kHz WAV for comparison with the R output.
+"""
+
+import glob
+
+import soundfile as sf
+import torch
+from chatterbox.vc import ChatterboxVC
+
+SNAPSHOT_GLOB = (
+    "/root/.cache/huggingface/hub/models--ResembleAI--chatterbox/snapshots/*"
+)
+SAMPLE_RATE = 24000
+
+# Sorted so the pick is deterministic when several snapshots are cached;
+# an empty cache gets a readable message instead of a bare IndexError.
+snapshots = sorted(glob.glob(SNAPSHOT_GLOB))
+if not snapshots:
+    raise SystemExit(f"No chatterbox snapshot matches {SNAPSHOT_GLOB}")
+
+# Fall back to CPU so the reference can still be produced without a GPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+vc = ChatterboxVC.from_local(snapshots[0], device)
+wav = vc.generate(
+    "/pkg/inst/audio/jfk.wav",
+    target_voice_path="/pkg/scripts/reference.wav",
+)
+# detach()/cpu() are harmless on a plain CPU tensor and keep .numpy() from
+# failing should generate() return a GPU or grad-tracking tensor.
+wav = wav.squeeze(0).detach().cpu().numpy()
+print(f"py vc: {len(wav)/SAMPLE_RATE:.2f}s, std={wav.std():.4f}")
+sf.write("/outputs/vc_py_jfk_to_reference.wav", wav, SAMPLE_RATE)