cornball-ai · TroyHernandez · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chatterbox
 Title: Text-to-Speech Using Chatterbox TTS Engine
-Version: 0.1.0.6
+Version: 0.1.0.7
 Authors@R:
     c(person("Troy", "Hernandez", role = c("aut", "cre"),
              email = "troy@cornball.ai",

diff --git a/NAMESPACE b/NAMESPACE
@@ -41,6 +41,7 @@ export(text_to_tokens)
 export(tts_chunked)
 export(tts_to_file)
 export(turbo_models_available)
+export(voice_convert)
 export(write_audio)
 
 S3method(print,chatterbox)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# chatterbox 0.1.0.7 (development)
+
+- New `voice_convert()`: speech-to-speech voice conversion (port of
+  Python ChatterboxVC); re-renders source speech in a target voice,
+  preserving the source timing.
+
 # chatterbox 0.1.0.6 (development)
 
 - `generate(skip_vocoder = TRUE)` returns the mel spectrogram instead of

diff --git a/R/tts.R b/R/tts.R
@@ -615,7 +615,7 @@ generate <- function(model, text, voice, exaggeration = 0.5,
 tts_to_file <- function(model, text, voice, output_path, ...) {
     if (isTRUE(list(...)$skip_vocoder)) {
         stop("skip_vocoder makes no sense here: there is no audio to ",
-            "write. Use generate() to get the mel.")
+             "write. Use generate() to get the mel.")
     }
     result <- generate(model, text, voice, ...)
     write_audio(result$audio, result$sample_rate, output_path)

diff --git a/R/vc.R b/R/vc.R
@@ -0,0 +1,86 @@
+# Voice conversion (port of Python chatterbox's vc.py).
+# Speech-to-speech: re-synthesize source speech in the target voice.
+# Skips T3 entirely - the source audio is tokenized by the S3 tokenizer
+# and S3Gen regenerates it conditioned on the target speaker.
+
+#' Convert speech to a target voice
+#'
+#' Re-synthesizes \code{audio} so the same words and prosody come out in
+#' the target voice (Python chatterbox's \code{ChatterboxVC}). No text
+#' or T3 generation is involved: the source speech is tokenized directly
+#' (25 tokens/s) and S3Gen renders the tokens with the target speaker's
+#' conditioning, so the result follows the source's timing.
+#'
+#' @param model Loaded chatterbox model (standard, not turbo)
+#' @param audio Source speech (file path, numeric vector, or torch
+#'   tensor)
+#' @param voice Target voice: a voice_embedding from
+#'   \code{\link{create_voice_embedding}} (or
+#'   \code{\link{load_voice_embedding}}), or a path to reference audio
+#' @param sample_rate Sample rate of \code{audio} in Hz (required if not a file)
+#' @return List with \code{audio} (numeric vector), \code{sample_rate}
+#'   (24000), and \code{audio_sec}, like \code{\link{generate}}
+#' @export
+voice_convert <- function(model, audio, voice, sample_rate = NULL) {
+    if (!is_loaded(model)) {
+        stop("Model not loaded. Call load_chatterbox() first.")
+    }
+    if (isTRUE(model$turbo)) {
+        stop("Voice conversion uses the standard S3Gen decoder; ",
+             "load a standard (non-turbo) model.")
+    }
+
+    # Target voice: only ref_dict is used. Python VC's embed_ref caps the
+    # reference at 10 s, as create_voice_embedding already does.
+    if (is.character(voice)) {
+        voice <- create_voice_embedding(model, voice)
+    }
+    if (!inherits(voice, "voice_embedding")) {
+        stop("voice must be a voice_embedding object or path to ",
+             "reference audio")
+    }
+
+    # Source speech (16 kHz for the S3 tokenizer); a file sets its own rate
+    if (is.character(audio)) {
+        audio_data <- read_audio(audio)
+        samples <- audio_data$samples
+        sample_rate <- audio_data$sr
+    } else if (is.numeric(audio)) {
+        if (is.null(sample_rate)) {
+            stop("sample_rate must be provided for numeric audio input")
+        }
+        samples <- audio
+    } else if (inherits(audio, "torch_tensor")) {
+        if (is.null(sample_rate)) {
+            stop("sample_rate must be provided for tensor audio input")
+        }
+        samples <- as.numeric(audio$cpu())
+    } else {
+        stop("audio must be a file path, numeric vector, or torch tensor")
+    }
+    # Reject a bad rate or empty audio here, not deep in resample/tokenize
+    if (!is.numeric(sample_rate) || length(sample_rate) != 1L ||
+        !is.finite(sample_rate) || sample_rate <= 0) {
+        stop("sample_rate must be a single positive number")
+    }
+    if (length(samples) == 0L) {
+        stop("audio is empty")
+    }
+    if (sample_rate != S3_SR) {
+        samples <- resample_audio(samples, sample_rate, S3_SR)
+    }
+
+    audio_16k <- torch::torch_tensor(samples, dtype = torch::torch_float32())
+    audio_16k <- audio_16k$unsqueeze(1)$to(device = model$device)
+    torch::with_no_grad({
+        # Full-length tokenization: VC keeps the source's timing
+        tok <- model$s3gen$tokenizer$forward(audio_16k)
+        result <- model$s3gen$inference(
+            speech_tokens = tok$tokens$to(device = model$device),
+            ref_dict = voice$ref_dict, finalize = TRUE)
+    })
+
+    audio_samples <- as.numeric(result[[1]]$squeeze()$cpu())
+    list(audio = audio_samples, sample_rate = S3GEN_SR,
+         audio_sec = length(audio_samples) / S3GEN_SR)
+}
diff --git a/inst/tinytest/test_vc.R b/inst/tinytest/test_vc.R
@@ -0,0 +1,23 @@
+# voice_convert error paths (no weights needed)
+
+if (requireNamespace("torch", quietly = TRUE) && torch::torch_is_installed()) {
+    # Stubs standing in for a loaded model and a voice (no weights needed)
+    stub_model <- function(turbo) {
+        structure(list(loaded = TRUE, turbo = turbo), class = "chatterbox")
+    }
+    stub_voice <- structure(list(ref_dict = list()), class = "voice_embedding")
+    # Model guards: an unloaded model and a turbo model are both rejected
+    expect_error(chatterbox::voice_convert(chatterbox::chatterbox("cpu"),
+            "x.wav", "y.wav"), "not loaded")
+    expect_error(chatterbox::voice_convert(stub_model(TRUE), "x.wav", "y.wav"),
+        "non-turbo")
+
+    # Argument validation against a standard (non-turbo) stub
+    std <- stub_model(FALSE)
+    expect_error(chatterbox::voice_convert(std, TRUE, stub_voice),
+        "file path, numeric")
+    expect_error(chatterbox::voice_convert(std, c(0, 0.1), stub_voice),
+        "sample_rate")
+    expect_error(chatterbox::voice_convert(std, c(0, 0.1), list()),
+        "voice_embedding")
+}
diff --git a/man/voice_convert.Rd b/man/voice_convert.Rd
@@ -0,0 +1,30 @@
+% tinyrox says don't edit this manually, but it can't stop you!
+\name{voice_convert}
+\alias{voice_convert}
+\title{Convert speech to a target voice}
+\usage{
+voice_convert(model, audio, voice, sample_rate = NULL)
+}
+\arguments{
+\item{model}{Loaded chatterbox model (standard, not turbo)}
+
+\item{audio}{Source speech (file path, numeric vector, or torch
+tensor)}
+
+\item{voice}{Target voice: a voice_embedding from
+\code{\link{create_voice_embedding}} (or
+\code{\link{load_voice_embedding}}), or a path to reference audio}
+
+\item{sample_rate}{Sample rate of \code{audio} (if not a file)}
+}
+\value{
+List with \code{audio} (numeric vector), \code{sample_rate}
+  (24000), and \code{audio_sec}, like \code{\link{generate}}
+}
+\description{
+Re-synthesizes \code{audio} so the same words and prosody come out in
+the target voice (Python chatterbox's \code{ChatterboxVC}). No text
+or T3 generation is involved: the source speech is tokenized directly
+(25 tokens/s) and S3Gen renders the tokens with the target speaker's
+conditioning, so the result follows the source's timing.
+}
diff --git a/scripts/vc_reference.py b/scripts/vc_reference.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+"""Run Python ChatterboxVC for R voice_convert comparison."""
+
+import glob
+import torch
+import soundfile as sf
+from chatterbox.vc import ChatterboxVC
+
+snap = glob.glob("/root/.cache/huggingface/hub/models--ResembleAI--chatterbox/snapshots/*")[0]
+vc = ChatterboxVC.from_local(snap, "cuda")
+wav = vc.generate("/pkg/inst/audio/jfk.wav", target_voice_path="/pkg/scripts/reference.wav")
+wav = wav.squeeze(0).numpy()
+print(f"py vc: {len(wav)/24000:.2f}s, std={wav.std():.4f}")
+sf.write("/outputs/vc_py_jfk_to_reference.wav", wav, 24000)