From 7e6a369d87ebf7c82fd29f7371cde3e6cb58e4a0 Mon Sep 17 00:00:00 2001
From: TroyHernandez <troy@cornball.ai>
Date: Fri, 12 Jun 2026 13:42:08 -0500
Subject: [PATCH 1/3] Voice conversion: voice_convert() ports Python's
 ChatterboxVC

Source speech -> S3 tokenizer (full length, keeps source timing) ->
S3Gen with the target voice's ref_dict. Validated against the 0.1.7
container on the same source/target: durations match to within 0.01 s
(7.56 s) and output amplitude is comparable (std 0.048 R vs 0.051
Python; an exact match is not expected because the CFM noise draws
differ by construction).
---
 NAMESPACE               |  1 +
 R/vc.R                  | 86 +++++++++++++++++++++++++++++++++++++++++
 inst/tinytest/test_vc.R | 23 +++++++++++
 man/voice_convert.Rd    | 30 ++++++++++++++
 scripts/vc_reference.py | 14 +++++++
 5 files changed, 154 insertions(+)
 create mode 100644 R/vc.R
 create mode 100644 inst/tinytest/test_vc.R
 create mode 100644 man/voice_convert.Rd
 create mode 100644 scripts/vc_reference.py

diff --git a/NAMESPACE b/NAMESPACE
index d8e04e8..cdfa788 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -41,6 +41,7 @@ export(text_to_tokens)
 export(tts_chunked)
 export(tts_to_file)
 export(turbo_models_available)
+export(voice_convert)
 export(write_audio)
 
 S3method(print,chatterbox)
diff --git a/R/vc.R b/R/vc.R
new file mode 100644
index 0000000..71b346c
--- /dev/null
+++ b/R/vc.R
@@ -0,0 +1,86 @@
+# Voice conversion (port of Python chatterbox's vc.py).
+# Speech-to-speech: re-synthesize source speech in the target voice.
+# Skips T3 entirely - the source audio is tokenized by the S3 tokenizer
+# and S3Gen regenerates it conditioned on the target speaker.
+
+#' Convert speech to a target voice
+#'
+#' Re-synthesizes \code{audio} so the same words and prosody come out in
+#' the target voice (Python chatterbox's \code{ChatterboxVC}). No text
+#' or T3 generation is involved: the source speech is tokenized directly
+#' (25 tokens/s) and S3Gen renders the tokens with the target speaker's
+#' conditioning, so the result follows the source's timing.
+#'
+#' @param model Loaded chatterbox model (standard, not turbo)
+#' @param audio Source speech (file path, numeric vector, or torch
+#'   tensor)
+#' @param voice Target voice: a voice_embedding from
+#'   \code{\link{create_voice_embedding}} (or
+#'   \code{\link{load_voice_embedding}}), or a path to reference audio
+#' @param sample_rate Sample rate of \code{audio} (if not a file)
+#' @return List with \code{audio} (numeric vector), \code{sample_rate}
+#'   (24000), and \code{audio_sec}, like \code{\link{generate}}
+#' @export
+voice_convert <- function (model, audio, voice, sample_rate = NULL) {
+    if (!is_loaded(model)) {
+        stop("Model not loaded. Call load_chatterbox() first.")
+    }
+    if (isTRUE(model$turbo)) {
+        stop("Voice conversion uses the standard S3Gen decoder; ",
+            "load a standard (non-turbo) model.")
+    }
+
+    # Target voice conditioning (only ref_dict is used; Python VC's
+    # embed_ref caps the reference at 10 s, as create_voice_embedding
+    # already does)
+    if (is.character(voice)) {
+        voice <- create_voice_embedding(model, voice)
+    }
+    if (!inherits(voice, "voice_embedding")) {
+        stop("voice must be a voice_embedding object or path to ",
+            "reference audio")
+    }
+
+    # Source speech at 16 kHz for the S3 tokenizer
+    if (is.character(audio)) {
+        audio_data <- read_audio(audio)
+        samples <- audio_data$samples
+        sample_rate <- audio_data$sr
+    } else if (is.numeric(audio)) {
+        if (is.null(sample_rate)) {
+            stop("sample_rate must be provided for numeric audio input")
+        }
+        samples <- audio
+    } else if (inherits(audio, "torch_tensor")) {
+        if (is.null(sample_rate)) {
+            stop("sample_rate must be provided for tensor audio input")
+        }
+        samples <- as.numeric(audio$cpu())
+    } else {
+        stop("audio must be a file path, numeric vector, or torch tensor")
+    }
+    if (sample_rate != S3_SR) {
+        samples <- resample_audio(samples, sample_rate, S3_SR)
+    }
+
+    device <- model$device
+    audio_16k <- torch::torch_tensor(samples,
+        dtype = torch::torch_float32())$unsqueeze(1)$to(device = device)
+
+    torch::with_no_grad({
+        # Full-length tokenization: VC keeps the source's timing
+        tok <- model$s3gen$tokenizer$forward(audio_16k)
+        result <- model$s3gen$inference(
+            speech_tokens = tok$tokens$to(device = device),
+            ref_dict = voice$ref_dict,
+            finalize = TRUE
+        )
+    })
+
+    audio_samples <- as.numeric(result[[1]]$squeeze()$cpu())
+    list(
+        audio = audio_samples,
+        sample_rate = S3GEN_SR,
+        audio_sec = length(audio_samples) / S3GEN_SR
+    )
+}
diff --git a/inst/tinytest/test_vc.R b/inst/tinytest/test_vc.R
new file mode 100644
index 0000000..6d8e5f9
--- /dev/null
+++ b/inst/tinytest/test_vc.R
@@ -0,0 +1,23 @@
+# voice_convert error paths (no weights needed)
+
+if (requireNamespace("torch", quietly = TRUE) && torch::torch_is_installed()) {
+    unloaded <- chatterbox::chatterbox("cpu")
+    expect_error(chatterbox::voice_convert(unloaded, "x.wav", "y.wav"),
+        "not loaded")
+
+    fake_turbo <- structure(list(loaded = TRUE, turbo = TRUE),
+        class = "chatterbox")
+    expect_error(chatterbox::voice_convert(fake_turbo, "x.wav", "y.wav"),
+        "non-turbo")
+
+    fake <- structure(list(loaded = TRUE, turbo = FALSE),
+        class = "chatterbox")
+    fake_voice <- structure(list(ref_dict = list()),
+        class = "voice_embedding")
+    expect_error(chatterbox::voice_convert(fake, TRUE, fake_voice),
+        "file path, numeric")
+    expect_error(chatterbox::voice_convert(fake, c(0, 0.1), fake_voice),
+        "sample_rate")
+    expect_error(chatterbox::voice_convert(fake, c(0, 0.1), list()),
+        "voice_embedding")
+}
diff --git a/man/voice_convert.Rd b/man/voice_convert.Rd
new file mode 100644
index 0000000..7f9d3aa
--- /dev/null
+++ b/man/voice_convert.Rd
@@ -0,0 +1,30 @@
+% tinyrox says don't edit this manually, but it can't stop you!
+\name{voice_convert}
+\alias{voice_convert}
+\title{Convert speech to a target voice}
+\usage{
+voice_convert(model, audio, voice, sample_rate = NULL)
+}
+\arguments{
+\item{model}{Loaded chatterbox model (standard, not turbo)}
+
+\item{audio}{Source speech (file path, numeric vector, or torch
+tensor)}
+
+\item{voice}{Target voice: a voice_embedding from
+\code{\link{create_voice_embedding}} (or
+\code{\link{load_voice_embedding}}), or a path to reference audio}
+
+\item{sample_rate}{Sample rate of \code{audio} (if not a file)}
+}
+\value{
+List with \code{audio} (numeric vector), \code{sample_rate}
+  (24000), and \code{audio_sec}, like \code{\link{generate}}
+}
+\description{
+Re-synthesizes \code{audio} so the same words and prosody come out in
+the target voice (Python chatterbox's \code{ChatterboxVC}). No text
+or T3 generation is involved: the source speech is tokenized directly
+(25 tokens/s) and S3Gen renders the tokens with the target speaker's
+conditioning, so the result follows the source's timing.
+}
diff --git a/scripts/vc_reference.py b/scripts/vc_reference.py
new file mode 100644
index 0000000..cead29a
--- /dev/null
+++ b/scripts/vc_reference.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+"""Run Python ChatterboxVC for R voice_convert comparison."""
+
+import glob
+import torch
+import soundfile as sf
+from chatterbox.vc import ChatterboxVC
+
+snap = glob.glob("/root/.cache/huggingface/hub/models--ResembleAI--chatterbox/snapshots/*")[0]
+vc = ChatterboxVC.from_local(snap, "cuda")
+wav = vc.generate("/pkg/inst/audio/jfk.wav", target_voice_path="/pkg/scripts/reference.wav")
+wav = wav.squeeze(0).numpy()
+print(f"py vc: {len(wav)/24000:.2f}s, std={wav.std():.4f}")
+sf.write("/outputs/vc_py_jfk_to_reference.wav", wav, 24000)

From 12f7725d9111283456b4d76cd542118f397f3efb Mon Sep 17 00:00:00 2001
From: TroyHernandez <troy@cornball.ai>
Date: Fri, 12 Jun 2026 13:42:31 -0500
Subject: [PATCH 2/3] rformat + document (whitespace-only reformatting)

---
 R/tts.R |  2 +-
 R/vc.R  | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/R/tts.R b/R/tts.R
index 470ac8b..ab0d61d 100644
--- a/R/tts.R
+++ b/R/tts.R
@@ -615,7 +615,7 @@ generate <- function(model, text, voice, exaggeration = 0.5,
 tts_to_file <- function(model, text, voice, output_path, ...) {
     if (isTRUE(list(...)$skip_vocoder)) {
         stop("skip_vocoder makes no sense here: there is no audio to ",
-            "write. Use generate() to get the mel.")
+             "write. Use generate() to get the mel.")
     }
     result <- generate(model, text, voice, ...)
     write_audio(result$audio, result$sample_rate, output_path)
diff --git a/R/vc.R b/R/vc.R
index 71b346c..8fb5df4 100644
--- a/R/vc.R
+++ b/R/vc.R
@@ -21,13 +21,13 @@
 #' @return List with \code{audio} (numeric vector), \code{sample_rate}
 #'   (24000), and \code{audio_sec}, like \code{\link{generate}}
 #' @export
-voice_convert <- function (model, audio, voice, sample_rate = NULL) {
+voice_convert <- function(model, audio, voice, sample_rate = NULL) {
     if (!is_loaded(model)) {
         stop("Model not loaded. Call load_chatterbox() first.")
     }
     if (isTRUE(model$turbo)) {
         stop("Voice conversion uses the standard S3Gen decoder; ",
-            "load a standard (non-turbo) model.")
+             "load a standard (non-turbo) model.")
     }
 
     # Target voice conditioning (only ref_dict is used; Python VC's
@@ -38,7 +38,7 @@ voice_convert <- function (model, audio, voice, sample_rate = NULL) {
     }
     if (!inherits(voice, "voice_embedding")) {
         stop("voice must be a voice_embedding object or path to ",
-            "reference audio")
+             "reference audio")
     }
 
     # Source speech at 16 kHz for the S3 tokenizer
@@ -65,22 +65,22 @@ voice_convert <- function (model, audio, voice, sample_rate = NULL) {
 
     device <- model$device
     audio_16k <- torch::torch_tensor(samples,
-        dtype = torch::torch_float32())$unsqueeze(1)$to(device = device)
+                                     dtype = torch::torch_float32())$unsqueeze(1)$to(device = device)
 
     torch::with_no_grad({
         # Full-length tokenization: VC keeps the source's timing
         tok <- model$s3gen$tokenizer$forward(audio_16k)
         result <- model$s3gen$inference(
-            speech_tokens = tok$tokens$to(device = device),
-            ref_dict = voice$ref_dict,
-            finalize = TRUE
+                                        speech_tokens = tok$tokens$to(device = device),
+                                        ref_dict = voice$ref_dict,
+                                        finalize = TRUE
         )
     })
 
     audio_samples <- as.numeric(result[[1]]$squeeze()$cpu())
     list(
-        audio = audio_samples,
-        sample_rate = S3GEN_SR,
-        audio_sec = length(audio_samples) / S3GEN_SR
+         audio = audio_samples,
+         sample_rate = S3GEN_SR,
+         audio_sec = length(audio_samples) / S3GEN_SR
     )
 }

From 2d6f1af894c1fecc7297ab0a41e4f2de0f9502d9 Mon Sep 17 00:00:00 2001
From: TroyHernandez <troy@cornball.ai>
Date: Fri, 12 Jun 2026 13:42:31 -0500
Subject: [PATCH 3/3] Bump version to 0.1.0.7

---
 DESCRIPTION | 2 +-
 NEWS.md     | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1e3d157..34bfb9b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chatterbox
 Title: Text-to-Speech Using Chatterbox TTS Engine
-Version: 0.1.0.6
+Version: 0.1.0.7
 Authors@R:
     c(person("Troy", "Hernandez", role = c("aut", "cre"),
              email = "troy@cornball.ai",
diff --git a/NEWS.md b/NEWS.md
index 01376ef..40356bd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,9 @@
+# chatterbox 0.1.0.7 (development)
+
+- New `voice_convert()`: speech-to-speech voice conversion (port of
+  Python ChatterboxVC); re-renders source speech in a target voice,
+  preserving the source timing.
+
 # chatterbox 0.1.0.6 (development)
 
 - `generate(skip_vocoder = TRUE)` returns the mel spectrogram instead of