Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: chatterbox
Title: Text-to-Speech Using Chatterbox TTS Engine
Version: 0.1.0.6
Version: 0.1.0.7
Authors@R:
c(person("Troy", "Hernandez", role = c("aut", "cre"),
email = "troy@cornball.ai",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ export(text_to_tokens)
export(tts_chunked)
export(tts_to_file)
export(turbo_models_available)
export(voice_convert)
export(write_audio)

S3method(print,chatterbox)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# chatterbox 0.1.0.7 (development)

- New `voice_convert()`: speech-to-speech voice conversion (port of
Python ChatterboxVC); re-renders source speech in a target voice,
preserving the source timing.

# chatterbox 0.1.0.6 (development)

- `generate(skip_vocoder = TRUE)` returns the mel spectrogram instead of
Expand Down
2 changes: 1 addition & 1 deletion R/tts.R
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ generate <- function(model, text, voice, exaggeration = 0.5,
tts_to_file <- function(model, text, voice, output_path, ...) {
if (isTRUE(list(...)$skip_vocoder)) {
stop("skip_vocoder makes no sense here: there is no audio to ",
"write. Use generate() to get the mel.")
"write. Use generate() to get the mel.")
}
result <- generate(model, text, voice, ...)
write_audio(result$audio, result$sample_rate, output_path)
Expand Down
86 changes: 86 additions & 0 deletions R/vc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Voice conversion (port of Python chatterbox's vc.py).
# Speech-to-speech: re-synthesize source speech in the target voice.
# Skips T3 entirely - the source audio is tokenized by the S3 tokenizer
# and S3Gen regenerates it conditioned on the target speaker.

#' Convert speech to a target voice
#'
#' Re-synthesizes \code{audio} so the same words and prosody come out in
#' the target voice (Python chatterbox's \code{ChatterboxVC}). No text
#' or T3 generation is involved: the source speech is tokenized directly
#' (25 tokens/s) and S3Gen renders the tokens with the target speaker's
#' conditioning, so the result follows the source's timing.
#'
#' @param model Loaded chatterbox model (standard, not turbo)
#' @param audio Source speech (file path, numeric vector, or torch
#' tensor); must contain at least one sample
#' @param voice Target voice: a voice_embedding from
#' \code{\link{create_voice_embedding}} (or
#' \code{\link{load_voice_embedding}}), or a path to reference audio
#' @param sample_rate Sample rate of \code{audio}: a single positive
#' number, required when \code{audio} is a numeric vector or tensor.
#' Ignored when \code{audio} is a file path (the file's own rate is
#' used).
#' @return List with \code{audio} (numeric vector), \code{sample_rate}
#' (24000), and \code{audio_sec}, like \code{\link{generate}}
#' @export
voice_convert <- function(model, audio, voice, sample_rate = NULL) {
  if (!is_loaded(model)) {
    stop("Model not loaded. Call load_chatterbox() first.")
  }
  if (isTRUE(model$turbo)) {
    stop("Voice conversion uses the standard S3Gen decoder; ",
         "load a standard (non-turbo) model.")
  }

  # Target voice conditioning (only ref_dict is used; Python VC's
  # embed_ref caps the reference at 10 s, as create_voice_embedding
  # already does)
  if (is.character(voice)) {
    voice <- create_voice_embedding(model, voice)
  }
  if (!inherits(voice, "voice_embedding")) {
    stop("voice must be a voice_embedding object or path to ",
         "reference audio")
  }

  # Source speech at 16 kHz for the S3 tokenizer
  if (is.character(audio)) {
    # File input: the rate stored with the file replaces any sample_rate
    # argument the caller passed
    audio_data <- read_audio(audio)
    samples <- audio_data$samples
    sample_rate <- audio_data$sr
  } else {
    if (is.numeric(audio)) {
      input_kind <- "numeric"
    } else if (inherits(audio, "torch_tensor")) {
      input_kind <- "tensor"
    } else {
      stop("audio must be a file path, numeric vector, or torch tensor")
    }
    if (is.null(sample_rate)) {
      stop("sample_rate must be provided for ", input_kind, " audio input")
    }
    # Reject vectors, NA/Inf and non-positive rates here; otherwise the
    # scalar comparison against S3_SR below fails with an unrelated
    # "condition" error (or a nonsensical rate reaches the resampler)
    if (!is.numeric(sample_rate) || length(sample_rate) != 1L ||
        !is.finite(sample_rate) || sample_rate <= 0) {
      stop("sample_rate must be a single positive number")
    }
    samples <- if (input_kind == "tensor") as.numeric(audio$cpu()) else audio
  }

  # Nothing to tokenize: fail with a clear message instead of deep inside
  # the tokenizer
  if (length(samples) == 0L) {
    stop("audio contains no samples")
  }

  if (sample_rate != S3_SR) {
    samples <- resample_audio(samples, sample_rate, S3_SR)
  }

  device <- model$device
  audio_16k <- torch::torch_tensor(samples,
    dtype = torch::torch_float32())$unsqueeze(1)$to(device = device)

  torch::with_no_grad({
    # Full-length tokenization: VC keeps the source's timing
    tok <- model$s3gen$tokenizer$forward(audio_16k)
    result <- model$s3gen$inference(
      speech_tokens = tok$tokens$to(device = device),
      ref_dict = voice$ref_dict,
      finalize = TRUE
    )
  })

  # First element of the inference result holds the waveform tensor
  audio_samples <- as.numeric(result[[1]]$squeeze()$cpu())
  list(
    audio = audio_samples,
    sample_rate = S3GEN_SR,
    audio_sec = length(audio_samples) / S3GEN_SR
  )
}
23 changes: 23 additions & 0 deletions inst/tinytest/test_vc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# voice_convert error paths (no weights needed)

# Only run when torch is installed and its backend is available.
torch_ready <- requireNamespace("torch", quietly = TRUE) &&
  torch::torch_is_installed()

if (torch_ready) {
  # Build a bare list tagged as a chatterbox model that reports itself as
  # loaded; only the turbo flag varies between cases.
  stub_model <- function(turbo) {
    structure(list(loaded = TRUE, turbo = turbo), class = "chatterbox")
  }

  # A freshly constructed model object is rejected as not loaded
  fresh_model <- chatterbox::chatterbox("cpu")
  expect_error(
    chatterbox::voice_convert(fresh_model, "x.wav", "y.wav"),
    "not loaded"
  )

  # Turbo models are rejected
  turbo_model <- stub_model(TRUE)
  expect_error(
    chatterbox::voice_convert(turbo_model, "x.wav", "y.wav"),
    "non-turbo"
  )

  # Argument validation on a standard (non-turbo) stub
  std_model <- stub_model(FALSE)
  stub_voice <- structure(list(ref_dict = list()),
                          class = "voice_embedding")

  # Unsupported audio type
  expect_error(
    chatterbox::voice_convert(std_model, TRUE, stub_voice),
    "file path, numeric"
  )
  # Numeric audio without a sample rate
  expect_error(
    chatterbox::voice_convert(std_model, c(0, 0.1), stub_voice),
    "sample_rate"
  )
  # Voice that is neither a path nor a voice_embedding
  expect_error(
    chatterbox::voice_convert(std_model, c(0, 0.1), list()),
    "voice_embedding"
  )
}
30 changes: 30 additions & 0 deletions man/voice_convert.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
% tinyrox says don't edit this manually, but it can't stop you!
\name{voice_convert}
\alias{voice_convert}
\title{Convert speech to a target voice}
\usage{
voice_convert(model, audio, voice, sample_rate = NULL)
}
\arguments{
\item{model}{Loaded chatterbox model (standard, not turbo)}

\item{audio}{Source speech (file path, numeric vector, or torch
tensor)}

\item{voice}{Target voice: a voice_embedding from
\code{\link{create_voice_embedding}} (or
\code{\link{load_voice_embedding}}), or a path to reference audio}

\item{sample_rate}{Sample rate of \code{audio} (if not a file)}
}
\value{
List with \code{audio} (numeric vector), \code{sample_rate}
(24000), and \code{audio_sec}, like \code{\link{generate}}
}
\description{
Re-synthesizes \code{audio} so the same words and prosody come out in
the target voice (Python chatterbox's \code{ChatterboxVC}). No text
or T3 generation is involved: the source speech is tokenized directly
(25 tokens/s) and S3Gen renders the tokens with the target speaker's
conditioning, so the result follows the source's timing.
}
14 changes: 14 additions & 0 deletions scripts/vc_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env python3
"""Run Python ChatterboxVC for R voice_convert comparison."""

import glob
import torch
import soundfile as sf
from chatterbox.vc import ChatterboxVC

# Use the first locally cached ResembleAI/chatterbox snapshot directory.
snapshot_pattern = "/root/.cache/huggingface/hub/models--ResembleAI--chatterbox/snapshots/*"
snap = glob.glob(snapshot_pattern)[0]

# Load the voice-conversion model from that snapshot on the GPU.
converter = ChatterboxVC.from_local(snap, "cuda")

# Convert the JFK sample into the reference speaker's voice.
converted = converter.generate(
    "/pkg/inst/audio/jfk.wav",
    target_voice_path="/pkg/scripts/reference.wav",
)

# Drop the leading dimension and hand the samples to numpy for stats/IO.
wav = converted.squeeze(0).numpy()

# Report duration and spread, then save for comparison with the R output.
print(f"py vc: {len(wav)/24000:.2f}s, std={wav.std():.4f}")
sf.write("/outputs/vc_py_jfk_to_reference.wav", wav, 24000)
Loading