From 7e6a369d87ebf7c82fd29f7371cde3e6cb58e4a0 Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 12 Jun 2026 13:42:08 -0500 Subject: [PATCH 1/3] Voice conversion: voice_convert() ports Python's ChatterboxVC Source speech -> S3 tokenizer (full length, keeps source timing) -> S3Gen with the target voice's ref_dict. Validated against the 0.1.7 container on the same source/target: durations match to 0.01 s (7.56 s), amplitude in family (std 0.048 R vs 0.051 Python; CFM noise draws differ by construction). --- NAMESPACE | 1 + R/vc.R | 86 +++++++++++++++++++++++++++++++++++++++++ inst/tinytest/test_vc.R | 23 +++++++++++ man/voice_convert.Rd | 30 ++++++++++++++ scripts/vc_reference.py | 14 +++++++ 5 files changed, 154 insertions(+) create mode 100644 R/vc.R create mode 100644 inst/tinytest/test_vc.R create mode 100644 man/voice_convert.Rd create mode 100644 scripts/vc_reference.py diff --git a/NAMESPACE b/NAMESPACE index d8e04e8..cdfa788 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,6 +41,7 @@ export(text_to_tokens) export(tts_chunked) export(tts_to_file) export(turbo_models_available) +export(voice_convert) export(write_audio) S3method(print,chatterbox) diff --git a/R/vc.R b/R/vc.R new file mode 100644 index 0000000..71b346c --- /dev/null +++ b/R/vc.R @@ -0,0 +1,86 @@ +# Voice conversion (port of Python chatterbox's vc.py). +# Speech-to-speech: re-synthesize source speech in the target voice. +# Skips T3 entirely - the source audio is tokenized by the S3 tokenizer +# and S3Gen regenerates it conditioned on the target speaker. + +#' Convert speech to a target voice +#' +#' Re-synthesizes \code{audio} so the same words and prosody come out in +#' the target voice (Python chatterbox's \code{ChatterboxVC}). No text +#' or T3 generation is involved: the source speech is tokenized directly +#' (25 tokens/s) and S3Gen renders the tokens with the target speaker's +#' conditioning, so the result follows the source's timing. +#' +#' @param model Loaded chatterbox model (standard, not turbo) +#' @param audio Source speech (file path, numeric vector, or torch +#' tensor) +#' @param voice Target voice: a voice_embedding from +#' \code{\link{create_voice_embedding}} (or +#' \code{\link{load_voice_embedding}}), or a path to reference audio +#' @param sample_rate Sample rate of \code{audio} (if not a file) +#' @return List with \code{audio} (numeric vector), \code{sample_rate} +#' (24000), and \code{audio_sec}, like \code{\link{generate}} +#' @export +voice_convert <- function (model, audio, voice, sample_rate = NULL) { + if (!is_loaded(model)) { + stop("Model not loaded. Call load_chatterbox() first.") + } + if (isTRUE(model$turbo)) { + stop("Voice conversion uses the standard S3Gen decoder; ", + "load a standard (non-turbo) model.") + } + + # Target voice conditioning (only ref_dict is used; Python VC's + # embed_ref caps the reference at 10 s, as create_voice_embedding + # already does) + if (is.character(voice)) { + voice <- create_voice_embedding(model, voice) + } + if (!inherits(voice, "voice_embedding")) { + stop("voice must be a voice_embedding object or path to ", + "reference audio") + } + + # Source speech at 16 kHz for the S3 tokenizer + if (is.character(audio)) { + audio_data <- read_audio(audio) + samples <- audio_data$samples + sample_rate <- audio_data$sr + } else if (is.numeric(audio)) { + if (is.null(sample_rate)) { + stop("sample_rate must be provided for numeric audio input") + } + samples <- audio + } else if (inherits(audio, "torch_tensor")) { + if (is.null(sample_rate)) { + stop("sample_rate must be provided for tensor audio input") + } + samples <- as.numeric(audio$cpu()) + } else { + stop("audio must be a file path, numeric vector, or torch tensor") + } + if (sample_rate != S3_SR) { + samples <- resample_audio(samples, sample_rate, S3_SR) + } + + device <- model$device + audio_16k <- torch::torch_tensor(samples, + dtype = torch::torch_float32())$unsqueeze(1)$to(device = device) + + torch::with_no_grad({ + # Full-length tokenization: VC keeps the source's timing + tok <- model$s3gen$tokenizer$forward(audio_16k) + result <- model$s3gen$inference( + speech_tokens = tok$tokens$to(device = device), + ref_dict = voice$ref_dict, + finalize = TRUE + ) + }) + + audio_samples <- as.numeric(result[[1]]$squeeze()$cpu()) + list( + audio = audio_samples, + sample_rate = S3GEN_SR, + audio_sec = length(audio_samples) / S3GEN_SR + ) +} diff --git a/inst/tinytest/test_vc.R b/inst/tinytest/test_vc.R new file mode 100644 index 0000000..6d8e5f9 --- /dev/null +++ b/inst/tinytest/test_vc.R @@ -0,0 +1,23 @@ +# voice_convert error paths (no weights needed) + +if (requireNamespace("torch", quietly = TRUE) && torch::torch_is_installed()) { + unloaded <- chatterbox::chatterbox("cpu") + expect_error(chatterbox::voice_convert(unloaded, "x.wav", "y.wav"), + "not loaded") + + fake_turbo <- structure(list(loaded = TRUE, turbo = TRUE), + class = "chatterbox") + expect_error(chatterbox::voice_convert(fake_turbo, "x.wav", "y.wav"), + "non-turbo") + + fake <- structure(list(loaded = TRUE, turbo = FALSE), + class = "chatterbox") + fake_voice <- structure(list(ref_dict = list()), + class = "voice_embedding") + expect_error(chatterbox::voice_convert(fake, TRUE, fake_voice), + "file path, numeric") + expect_error(chatterbox::voice_convert(fake, c(0, 0.1), fake_voice), + "sample_rate") + expect_error(chatterbox::voice_convert(fake, c(0, 0.1), list()), + "voice_embedding") +} diff --git a/man/voice_convert.Rd b/man/voice_convert.Rd new file mode 100644 index 0000000..7f9d3aa --- /dev/null +++ b/man/voice_convert.Rd @@ -0,0 +1,30 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{voice_convert} +\alias{voice_convert} +\title{Convert speech to a target voice} +\usage{ +voice_convert(model, audio, voice, sample_rate = NULL) +} +\arguments{ +\item{model}{Loaded chatterbox model (standard, not turbo)} + +\item{audio}{Source speech (file path, numeric vector, or torch +tensor)} + +\item{voice}{Target voice: a voice_embedding from +\code{\link{create_voice_embedding}} (or +\code{\link{load_voice_embedding}}), or a path to reference audio} + +\item{sample_rate}{Sample rate of \code{audio} (if not a file)} +} +\value{ +List with \code{audio} (numeric vector), \code{sample_rate} + (24000), and \code{audio_sec}, like \code{\link{generate}} +} +\description{ +Re-synthesizes \code{audio} so the same words and prosody come out in +the target voice (Python chatterbox's \code{ChatterboxVC}). No text +or T3 generation is involved: the source speech is tokenized directly +(25 tokens/s) and S3Gen renders the tokens with the target speaker's +conditioning, so the result follows the source's timing. +} diff --git a/scripts/vc_reference.py b/scripts/vc_reference.py new file mode 100644 index 0000000..cead29a --- /dev/null +++ b/scripts/vc_reference.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +"""Run Python ChatterboxVC for R voice_convert comparison.""" + +import glob +import torch +import soundfile as sf +from chatterbox.vc import ChatterboxVC + +snap = glob.glob("/root/.cache/huggingface/hub/models--ResembleAI--chatterbox/snapshots/*")[0] +vc = ChatterboxVC.from_local(snap, "cuda") +wav = vc.generate("/pkg/inst/audio/jfk.wav", target_voice_path="/pkg/scripts/reference.wav") +wav = wav.squeeze(0).numpy() +print(f"py vc: {len(wav)/24000:.2f}s, std={wav.std():.4f}") +sf.write("/outputs/vc_py_jfk_to_reference.wav", wav, 24000) From 12f7725d9111283456b4d76cd542118f397f3efb Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 12 Jun 2026 13:42:31 -0500 Subject: [PATCH 2/3] rformat + document --- R/tts.R | 2 +- R/vc.R | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/tts.R b/R/tts.R index 470ac8b..ab0d61d 100644 --- a/R/tts.R +++ b/R/tts.R @@ -615,7 +615,7 @@ generate <- function(model, text, voice, exaggeration = 0.5, tts_to_file <- function(model, text, voice, output_path, ...) { if (isTRUE(list(...)$skip_vocoder)) { stop("skip_vocoder makes no sense here: there is no audio to ", - "write. Use generate() to get the mel.") + "write. Use generate() to get the mel.") } result <- generate(model, text, voice, ...) write_audio(result$audio, result$sample_rate, output_path) diff --git a/R/vc.R b/R/vc.R index 71b346c..8fb5df4 100644 --- a/R/vc.R +++ b/R/vc.R @@ -21,13 +21,13 @@ #' @return List with \code{audio} (numeric vector), \code{sample_rate} #' (24000), and \code{audio_sec}, like \code{\link{generate}} #' @export -voice_convert <- function (model, audio, voice, sample_rate = NULL) { +voice_convert <- function(model, audio, voice, sample_rate = NULL) { if (!is_loaded(model)) { stop("Model not loaded. Call load_chatterbox() first.") } if (isTRUE(model$turbo)) { stop("Voice conversion uses the standard S3Gen decoder; ", - "load a standard (non-turbo) model.") + "load a standard (non-turbo) model.") } # Target voice conditioning (only ref_dict is used; Python VC's @@ -38,7 +38,7 @@ voice_convert <- function (model, audio, voice, sample_rate = NULL) { } if (!inherits(voice, "voice_embedding")) { stop("voice must be a voice_embedding object or path to ", - "reference audio") + "reference audio") } # Source speech at 16 kHz for the S3 tokenizer @@ -65,22 +65,22 @@ voice_convert <- function (model, audio, voice, sample_rate = NULL) { device <- model$device audio_16k <- torch::torch_tensor(samples, - dtype = torch::torch_float32())$unsqueeze(1)$to(device = device) + dtype = torch::torch_float32())$unsqueeze(1)$to(device = device) torch::with_no_grad({ # Full-length tokenization: VC keeps the source's timing tok <- model$s3gen$tokenizer$forward(audio_16k) result <- model$s3gen$inference( - speech_tokens = tok$tokens$to(device = device), - ref_dict = voice$ref_dict, - finalize = TRUE + speech_tokens = tok$tokens$to(device = device), + ref_dict = voice$ref_dict, + finalize = TRUE ) }) audio_samples <- as.numeric(result[[1]]$squeeze()$cpu()) list( - audio = audio_samples, - sample_rate = S3GEN_SR, - audio_sec = length(audio_samples) / S3GEN_SR + audio = audio_samples, + sample_rate = S3GEN_SR, + audio_sec = length(audio_samples) / S3GEN_SR ) } From 2d6f1af894c1fecc7297ab0a41e4f2de0f9502d9 Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 12 Jun 2026 13:42:31 -0500 Subject: [PATCH 3/3] Bump version to 0.1.0.7 --- DESCRIPTION | 2 +- NEWS.md | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1e3d157..34bfb9b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: chatterbox Title: Text-to-Speech Using Chatterbox TTS Engine -Version: 0.1.0.6 +Version: 0.1.0.7 Authors@R: c(person("Troy", "Hernandez", role = c("aut", "cre"), email = "troy@cornball.ai", diff --git a/NEWS.md b/NEWS.md index 01376ef..40356bd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# chatterbox 0.1.0.7 (development) + +- New `voice_convert()`: speech-to-speech voice conversion (port of + Python ChatterboxVC); re-renders source speech in a target voice, + preserving the source timing. + # chatterbox 0.1.0.6 (development) - `generate(skip_vocoder = TRUE)` returns the mel spectrogram instead of