From becbba7e0eec2f45fdcaab86f1de30be21d00801 Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 12 Jun 2026 16:59:52 -0500 Subject: [PATCH 1/4] chatterbox_defaults(): hardware-adaptive setup + 6GB jit validation results - New chatterbox_defaults(vram_gb = NULL): detects the GPU (CPU when absent or under 4GB) and returns GC options, backend, max_new_tokens, and chunking threshold as a classed object; print method emits a ready-to-paste setup snippet. Measured tiers labeled vs projected. - 6GB validation results (GTX 1660 Ti, June 2026) folded into gc_options 6GB note, the performance vignette, and CLAUDE.md: jit 35-38 ms/token (4.7GB) vs container 30 - jit is fastest on every measured card; traced additionally truncates long-form at its 350-position cap. The old 'traced wins on 6GB' guidance predated the jit backend. - 12GB projected row added to the tier table. --- CLAUDE.md | 14 +++- NAMESPACE | 2 + R/defaults.R | 137 +++++++++++++++++++++++++++++++ R/gc_options.R | 11 ++- inst/tinytest/test_defaults.R | 37 +++++++++ man/chatterbox_defaults.Rd | 37 +++++++++ man/print.chatterbox_defaults.Rd | 18 ++++ vignettes/performance.md | 34 +++++--- 8 files changed, 267 insertions(+), 23 deletions(-) create mode 100644 R/defaults.R create mode 100644 inst/tinytest/test_defaults.R create mode 100644 man/chatterbox_defaults.Rd create mode 100644 man/print.chatterbox_defaults.Rd diff --git a/CLAUDE.md b/CLAUDE.md index 8aa34cf..836e0ac 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -47,6 +47,7 @@ chatterbox | `generate(model, text, voice)` | Generate speech | | `create_voice_embedding(model, audio)` | Create speaker embedding | | `tts_chunked(model, text, voice)` | Long texts, sentence-chunked, gc per chunk | +| `chatterbox_defaults()` | Per-card setup: GC options + backend + chunking thresholds | | `chatterbox_gc_options()` | Print torch GC settings for this GPU (set before torch loads) | | `quick_tts(text, ref_audio, output)` | One-liner convenience (loads whole model per call) | @@ -694,8 +695,11 @@ See `vignettes/performance.md` for the full story. Two facts dominate: | lean eager R (ATen builtins, no nn_module) | 71 | proves the per-op R call is the cost, not wrapper style | End-to-end long-form (~20s audio): jit ~6s wall vs container ~6s - -container parity. On 6GB hardware (GTX 1660 Ti, rate 0.75): traced -88-94, pure R 300-360; jit not yet validated there. +container parity. On 6GB hardware (GTX 1660 Ti, rate 0.75, June 2026): +jit 35-38 ms/token (4.7GB peak) vs container 30 - jit wins there too; +traced 88-94 but its 350-position cache truncates long-form at ~120 +tokens; pure R 254-287. `chatterbox_defaults()` returns the per-card +setup (GC tier + backend + chunking). ### Architecture note: pure R package since June 2026 @@ -722,10 +726,12 @@ There is no `useDynLib` and no compiled code. ### When to Use What -- jit + tuned GC: default on any GPU. +- jit + tuned GC: default on any GPU (fastest on both measured cards). - Container: production deployments via tts.api/gpu.ctl. -- Traced: long-running sessions, short utterances. +- Traced: niche - short utterances only (350-position cache cap). - Pure R: debugging, CPU-only. +- `chatterbox_defaults()`: detects the card, returns GC options + + backend + chunking thresholds as one pasteable snippet. ## Related - Alternative to tts.api container backend for local TTS (no Docker required) diff --git a/NAMESPACE b/NAMESPACE index 2524c59..5fc9960 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # tinyrox says don't edit this manually, but it can't stop you! export(chatterbox) +export(chatterbox_defaults) export(chatterbox_gc_options) export(compute_mel_spectrogram) export(compute_mel_spectrogram_ve) @@ -46,5 +47,6 @@ export(voice_convert) export(write_audio) S3method(print,chatterbox) +S3method(print,chatterbox_defaults) S3method(print,chatterbox_gc_options) S3method(print,voice_embedding) diff --git a/R/defaults.R b/R/defaults.R new file mode 100644 index 0000000..ab9d5dc --- /dev/null +++ b/R/defaults.R @@ -0,0 +1,137 @@ +# Hardware-adaptive defaults: GC settings, backend, and chunking +# thresholds per detected GPU/CPU. Measured tiers: 16 GB (RTX 5060 Ti) +# and 6 GB (GTX 1660 Ti); 8/12 GB projected from the tier rule. + +#' Recommended chatterbox settings for this machine +#' +#' Detects the GPU (or its absence) and returns everything worth setting +#' for it: the torch GC options (which must be set BEFORE torch loads - +#' see \code{\link{chatterbox_gc_options}} for why), the fastest +#' validated backend, the per-call token budget, and when to switch to +#' \code{\link{tts_chunked}}. Printing the result shows a ready-to-paste +#' setup snippet. +#' +#' Measured tiers (long-form, tuned GC): 16 GB RTX 5060 Ti - jit +#' 11 ms/token, container parity; 6 GB GTX 1660 Ti - jit 35-38 ms/token +#' vs container 30, in 4.7 GB VRAM. The 8 and 12 GB tiers are projected +#' from the rule (the GC trigger line must clear the ~4.6 GB loaded +#' model) and marked as such when printed. +#' +#' @param vram_gb Total GPU memory in GB. Default: detected via +#' nvidia-smi; 0 (or detection failure) means CPU-only. +#' @return An object of class \code{"chatterbox_defaults"}: a list with +#' \code{device}, \code{vram_gb}, \code{options} (for +#' \code{do.call(options, ...)} before torch loads), \code{backend}, +#' \code{max_new_tokens}, \code{chunk_chars}, and \code{measured}. +#' @examples +#' chatterbox_defaults(vram_gb = 6) +#' chatterbox_defaults(vram_gb = 0) # CPU +#' @export +chatterbox_defaults <- function (vram_gb = NULL) { + if (is.null(vram_gb)) { + smi <- suppressWarnings(tryCatch( + system2("nvidia-smi", + c("--query-gpu=memory.total", "--format=csv,noheader,nounits"), + stdout = TRUE, stderr = FALSE), + error = function (e) character(0) + )) + vram_gb <- if (length(smi) >= 1 && nzchar(smi[1]) && + !is.na(suppressWarnings(as.numeric(smi[1])))) { + round(as.numeric(smi[1]) / 1024, 1) + } else { + 0 + } + } + + if (vram_gb < 4) { + # CPU (or a card too small for the ~4.6 GB loaded model). + # The CUDA allocator knobs are irrelevant; only the CPU + # allocation odometer exists, and it measured as minor. + out <- list( + device = "cpu", + vram_gb = vram_gb, + options = list(), + backend = "r", + max_new_tokens = 1000L, + chunk_chars = 200L, + measured = FALSE + ) + } else { + rate <- if (vram_gb <= 6.5) 0.75 else if (vram_gb <= 10) 0.6 else 0.5 + out <- list( + device = "cuda", + vram_gb = vram_gb, + options = list(torch.cuda_allocator_reserved_rate = rate), + backend = "jit", + max_new_tokens = 1000L, + chunk_chars = 200L, + measured = vram_gb <= 6.5 || vram_gb > 12 + ) + } + + if (isNamespaceLoaded("torch") && length(out$options) > 0) { + warning("torch is already initialized in this session; the GC ", + "options take effect only in a fresh R session that sets ", + "them before torch loads.", call. = FALSE) + } + + structure(out, class = "chatterbox_defaults") +} + +#' Print method for chatterbox_defaults +#' +#' @param x Object from \code{\link{chatterbox_defaults}} +#' @param ... Ignored +#' @return \code{x}, invisibly +#' @export +print.chatterbox_defaults <- function (x, ...) { + if (x$device == "cpu") { + cat("CPU-only setup (no usable GPU detected).\n\n", + " library(chatterbox)\n", + " model <- load_chatterbox(chatterbox(\"cpu\"))\n\n", + "Use backend = \"r\". Expect minutes per utterance; for\n", + "anything longer than a sentence or two, use tts_chunked()\n", + "so audio arrives incrementally.\n", sep = "") + return(invisible(x)) + } + + tier <- if (isTRUE(x$measured)) "measured" else "projected" + rate <- x$options$torch.cuda_allocator_reserved_rate + cat(sprintf("Recommended for a %s GB GPU (%s tier) - put the\n", + format(x$vram_gb), tier)) + cat("options() line in .Rprofile or at the top of your script,\n") + cat("BEFORE torch loads:\n\n") + cat(sprintf(" options(torch.cuda_allocator_reserved_rate = %.2f)\n", + rate)) + cat(" library(chatterbox)\n") + cat(" model <- load_chatterbox(chatterbox(\"cuda\"))\n") + cat(sprintf( + " result <- generate(model, text, voice, backend = \"%s\")\n\n", + x$backend)) + cat(sprintf( + "Per call, up to max_new_tokens = %d (~40 s of audio). For\n", + x$max_new_tokens)) + cat(sprintf( + "longer texts use tts_chunked() (sentence chunks, ~%d chars,\n", + x$chunk_chars)) + cat("one gc() per chunk). In your own batch loops, call gc() after\n") + cat("each generate().\n") + + if (x$vram_gb <= 6.5) { + cat("\nNote: on a ", format(x$vram_gb), " GB card the model floor", + " leaves little headroom,\nso the 0.8 backstop still fires", + " some collections. Measured on a\nGTX 1660 Ti: jit", + " 35-38 ms/token (~4.7 GB peak) vs container 30;\npure R", + " ~10x slower. Do NOT lower", + " torch.cuda_allocator_allocated_rate\nhere - 60% of a small", + " card sits below the model floor and recreates\nthe", + " constant-collection regime.\n", sep = "") + } else if (x$vram_gb >= 8) { + cat("\nOptional, to hold the VRAM plateau lower (e.g. shared", + " GPUs), at\nno speed cost:\n\n", + " options(torch.cuda_allocator_allocated_rate = 0.6)\n", + sep = "") + } + + invisible(x) +} diff --git a/R/gc_options.R b/R/gc_options.R index 11f9a9e..cc09a56 100644 --- a/R/gc_options.R +++ b/R/gc_options.R @@ -102,12 +102,11 @@ print.chatterbox_gc_options <- function(x, ...) { if (vram_gb <= 6.5) { cat("\nNote: on a ", vram_gb, " GB card the model floor leaves", " little headroom, so the\n0.8 backstop still fires some", - " collections: expect ~3-5x from tuning for\npure R, not", - " the ~10x larger cards see. traced = TRUE measured", - " fastest\non 6 GB hardware (88-94 ms/token, ~5 GB peak -", - " tight but it fits).\nDo NOT lower allocated_rate here -", - " 60% of a small card sits below\nthe model floor and", - " recreates the constant-collection regime.\n", sep = "") + " collections. backend = \"jit\" measured\nfastest on 6 GB", + " hardware (35-38 ms/token, ~4.7 GB peak, vs the\n", + "container's 30). Do NOT lower allocated_rate here - 60%", + " of a small\ncard sits below the model floor and recreates", + " the\nconstant-collection regime.\n", sep = "") } invisible(x) diff --git a/inst/tinytest/test_defaults.R b/inst/tinytest/test_defaults.R new file mode 100644 index 0000000..42b10d0 --- /dev/null +++ b/inst/tinytest/test_defaults.R @@ -0,0 +1,37 @@ +# chatterbox_defaults tier logic (no GPU or weights needed) + +d6 <- chatterbox::chatterbox_defaults(vram_gb = 6) +expect_inherits(d6, "chatterbox_defaults") +expect_identical(d6$device, "cuda") +expect_equal(d6$options$torch.cuda_allocator_reserved_rate, 0.75) +expect_identical(d6$backend, "jit") +expect_true(d6$measured) + +d8 <- chatterbox::chatterbox_defaults(vram_gb = 8) +expect_equal(d8$options$torch.cuda_allocator_reserved_rate, 0.6) +expect_false(d8$measured) + +d12 <- chatterbox::chatterbox_defaults(vram_gb = 12) +expect_equal(d12$options$torch.cuda_allocator_reserved_rate, 0.5) +expect_false(d12$measured) + +d16 <- chatterbox::chatterbox_defaults(vram_gb = 16) +expect_equal(d16$options$torch.cuda_allocator_reserved_rate, 0.5) +expect_true(d16$measured) + +dcpu <- chatterbox::chatterbox_defaults(vram_gb = 0) +expect_identical(dcpu$device, "cpu") +expect_identical(dcpu$backend, "r") +expect_identical(dcpu$options, list()) + +# a 2 GB card cannot hold the model: treated as CPU +expect_identical(chatterbox::chatterbox_defaults(vram_gb = 2)$device, "cpu") + +# print methods run and return invisibly +expect_stdout(print(d6), "jit") +expect_stdout(print(d6), "0.75") +expect_stdout(print(d8), "projected") +expect_stdout(print(dcpu), "CPU-only") + +# the GC option is applicable directly +expect_silent(do.call(options, d16$options)) diff --git a/man/chatterbox_defaults.Rd b/man/chatterbox_defaults.Rd new file mode 100644 index 0000000..feb969a --- /dev/null +++ b/man/chatterbox_defaults.Rd @@ -0,0 +1,37 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{chatterbox_defaults} +\alias{chatterbox_defaults} +\title{Recommended chatterbox settings for this machine} +\usage{ +chatterbox_defaults(vram_gb = NULL) +} +\arguments{ +\item{vram_gb}{Total GPU memory in GB. Default: detected via +nvidia-smi; 0 (or detection failure) means CPU-only.} +} +\value{ +An object of class \code{"chatterbox_defaults"}: a list with + \code{device}, \code{vram_gb}, \code{options} (for + \code{do.call(options, ...)} before torch loads), \code{backend}, + \code{max_new_tokens}, \code{chunk_chars}, and \code{measured}. +} +\description{ +Detects the GPU (or its absence) and returns everything worth setting +for it: the torch GC options (which must be set BEFORE torch loads - +see \code{\link{chatterbox_gc_options}} for why), the fastest +validated backend, the per-call token budget, and when to switch to +\code{\link{tts_chunked}}. Printing the result shows a ready-to-paste +setup snippet. +} +\details{ +Measured tiers (long-form, tuned GC): 16 GB RTX 5060 Ti - jit +11 ms/token, container parity; 6 GB GTX 1660 Ti - jit 35-38 ms/token +vs container 30, in 4.7 GB VRAM. The 8 and 12 GB tiers are projected +from the rule (the GC trigger line must clear the ~4.6 GB loaded +model) and marked as such when printed. + +} +\examples{ +chatterbox_defaults(vram_gb = 6) +chatterbox_defaults(vram_gb = 0) # CPU +} diff --git a/man/print.chatterbox_defaults.Rd b/man/print.chatterbox_defaults.Rd new file mode 100644 index 0000000..945fa07 --- /dev/null +++ b/man/print.chatterbox_defaults.Rd @@ -0,0 +1,18 @@ +% tinyrox says don't edit this manually, but it can't stop you! +\name{print.chatterbox_defaults} +\alias{print.chatterbox_defaults} +\title{Print method for chatterbox_defaults} +\usage{ +\method{print}{chatterbox_defaults}(x, ...) +} +\arguments{ +\item{x}{Object from \code{\link{chatterbox_defaults}}} + +\item{...}{Ignored} +} +\value{ +\code{x}, invisibly +} +\description{ +Print method for chatterbox_defaults +} diff --git a/vignettes/performance.md b/vignettes/performance.md index 7230cbf..b5617cd 100644 --- a/vignettes/performance.md +++ b/vignettes/performance.md @@ -22,16 +22,15 @@ settings that matter more than the backend choice. - Scripts: `scripts/bench_backends.R`, `scripts/tune_gc.R`, `scripts/profile_backends.R` -**Scope caveat: all numbers are from this one machine.** The GC +**Scope caveat: most numbers are from this one machine.** The GC mechanism is R/torch-side and applies on any CUDA GPU, and the cliff is ratio arithmetic - collections strangle inference whenever the model's reserved fraction of the card exceeds `reserved_rate`. So severity scales with card size: a 24 GB card's ~19% floor sits under the default 0.2 line and may never hit this at all, while small cards -live deep past it. Absolute ms/token figures do not travel, and the -cpp-vs-traced ranking is measured to FLIP between machines (cpp wins -on the 16 GB desktop, traced wins on a 6 GB laptop) - benchmark on -your own hardware before choosing. +live deep past it. Absolute ms/token figures do not travel; the 6 GB +section below has the second measured machine, and +`chatterbox_defaults()` encodes both tiers. ## Headline Numbers @@ -110,9 +109,13 @@ total VRAM, must clear what the loaded model reserves): | card | reserved_rate | status | |------|--------------|--------| | 16 GB | 0.50 | measured (RTX 5060 Ti) | +| 12 GB | 0.50 | projected from the rule; not yet validated | | 8 GB | 0.60 | projected from the rule; not yet validated | | 6 GB | 0.75 | measured (GTX 1660 Ti) | +`chatterbox_defaults()` returns the full per-card setup (this option +plus backend and chunking thresholds) as a ready-to-paste snippet. + To validate on new hardware, run `scripts/tune_gc.R` with a few values; the win is a cliff, so any rate that clears the floor gives full speed. Optionally add `torch.cuda_allocator_allocated_rate = 0.6` to hold the @@ -127,8 +130,9 @@ recreates the constant-collection regime. | default | 1032-1811 | 3.6 GB flat | | reserved_rate 0.75 | **300-360** | 4.4-5.4 GB oscillating | | 0.75 + allocated_rate 0.6 | 423-441 (worse) | 4.7-4.9 GB | -| 0.75, traced (warm) | **88-94** | 5.0 GB - tight, no OOM | -| 0.75, cpp | 150-163 | 4.4 GB stable | +| 0.75, jit, long-form (June 2026) | **35-38** | 4.7 GB | +| 0.75, traced (warm, short text) | 88-94 | 5.0 GB - tight, no OOM | +| 0.75, cpp (retired) | 150-163 | 4.4 GB stable | | 0.75, long text (~20-23s audio) | 351-392, completes | 4.4 GB stable | | 0.9 + 0.9 backstop, short text | 302-305, steadier | 5.3 GB flat | | 0.9 + 0.9 backstop, long text | **OOM, both runs** | - | @@ -140,12 +144,16 @@ visible as the oscillating VRAM). The allocated_rate=0.6 row is the floor rule above demonstrated: 60% of this card sits below the model floor. -Note the backend ranking **flips** on this card: traced beats cpp here, -the reverse of the 16 GB machine. cpp's per-token cost is hundreds of -small host-side dispatches (a laptop CPU pays full price); traced's few -fused launches hold up on weak hosts. Both compiled paths beat pure R -everywhere measured - which one wins is machine-dependent, so benchmark -on your hardware (`scripts/bench_backends.R`). +The jit row (validated June 2026, after the cpp retirement) settles the +backend question on small cards too: 35-38 ms/token long-form against +the container's 30 on the same box - within ~25% of Python, 2.6x faster +than traced, ~8x faster than pure R, in 4.7 GB. Traced is additionally +disqualified for long-form here: its 350-position cache cap truncates +this test's text at 120 tokens (~5 s of audio) without an EOS. The +earlier "traced wins on 6 GB" finding was an artifact of jit not yet +existing when those rows were measured. **backend = "jit" is the +recommendation on every measured card**; `chatterbox_defaults()` +returns it along with the GC tier. The 0.9 rows are why the backstop stays at its default. Pushing both lines to 90% runs steadier on short utterances (the card parks at a From 688b4f810dd3816f2145b8b25a61a65c92d45a26 Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 12 Jun 2026 17:00:17 -0500 Subject: [PATCH 2/4] rformat + document --- R/defaults.R | 69 ++++++++++++++++++++++++++++------------------------ R/s3gen.R | 2 +- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/R/defaults.R b/R/defaults.R index ab9d5dc..a279ba2 100644 --- a/R/defaults.R +++ b/R/defaults.R @@ -27,14 +27,15 @@ #' chatterbox_defaults(vram_gb = 6) #' chatterbox_defaults(vram_gb = 0) # CPU #' @export -chatterbox_defaults <- function (vram_gb = NULL) { +chatterbox_defaults <- function(vram_gb = NULL) { if (is.null(vram_gb)) { smi <- suppressWarnings(tryCatch( - system2("nvidia-smi", - c("--query-gpu=memory.total", "--format=csv,noheader,nounits"), - stdout = TRUE, stderr = FALSE), - error = function (e) character(0) - )) + system2("nvidia-smi", + c("--query-gpu=memory.total", + "--format=csv,noheader,nounits"), + stdout = TRUE, stderr = FALSE), + error = function(e) character(0) + )) vram_gb <- if (length(smi) >= 1 && nzchar(smi[1]) && !is.na(suppressWarnings(as.numeric(smi[1])))) { round(as.numeric(smi[1]) / 1024, 1) @@ -48,31 +49,31 @@ chatterbox_defaults <- function (vram_gb = NULL) { # The CUDA allocator knobs are irrelevant; only the CPU # allocation odometer exists, and it measured as minor. out <- list( - device = "cpu", - vram_gb = vram_gb, - options = list(), - backend = "r", - max_new_tokens = 1000L, - chunk_chars = 200L, - measured = FALSE + device = "cpu", + vram_gb = vram_gb, + options = list(), + backend = "r", + max_new_tokens = 1000L, + chunk_chars = 200L, + measured = FALSE ) } else { rate <- if (vram_gb <= 6.5) 0.75 else if (vram_gb <= 10) 0.6 else 0.5 out <- list( - device = "cuda", - vram_gb = vram_gb, - options = list(torch.cuda_allocator_reserved_rate = rate), - backend = "jit", - max_new_tokens = 1000L, - chunk_chars = 200L, - measured = vram_gb <= 6.5 || vram_gb > 12 + device = "cuda", + vram_gb = vram_gb, + options = list(torch.cuda_allocator_reserved_rate = rate), + backend = "jit", + max_new_tokens = 1000L, + chunk_chars = 200L, + measured = vram_gb <= 6.5 || vram_gb > 12 ) } if (isNamespaceLoaded("torch") && length(out$options) > 0) { warning("torch is already initialized in this session; the GC ", - "options take effect only in a fresh R session that sets ", - "them before torch loads.", call. = FALSE) + "options take effect only in a fresh R session that sets ", + "them before torch loads.", call. = FALSE) } structure(out, class = "chatterbox_defaults") @@ -84,7 +85,7 @@ chatterbox_defaults <- function (vram_gb = NULL) { #' @param ... Ignored #' @return \code{x}, invisibly #' @export -print.chatterbox_defaults <- function (x, ...) { +print.chatterbox_defaults <- function(x, ...) { if (x$device == "cpu") { cat("CPU-only setup (no usable GPU detected).\n\n", " library(chatterbox)\n", @@ -95,25 +96,29 @@ print.chatterbox_defaults <- function (x, ...) { return(invisible(x)) } - tier <- if (isTRUE(x$measured)) "measured" else "projected" + if (isTRUE(x$measured)) { + tier <- "measured" + } else { + tier <- "projected" + } rate <- x$options$torch.cuda_allocator_reserved_rate cat(sprintf("Recommended for a %s GB GPU (%s tier) - put the\n", - format(x$vram_gb), tier)) + format(x$vram_gb), tier)) cat("options() line in .Rprofile or at the top of your script,\n") cat("BEFORE torch loads:\n\n") cat(sprintf(" options(torch.cuda_allocator_reserved_rate = %.2f)\n", - rate)) + rate)) cat(" library(chatterbox)\n") cat(" model <- load_chatterbox(chatterbox(\"cuda\"))\n") cat(sprintf( - " result <- generate(model, text, voice, backend = \"%s\")\n\n", - x$backend)) + " result <- generate(model, text, voice, backend = \"%s\")\n\n", + x$backend)) cat(sprintf( - "Per call, up to max_new_tokens = %d (~40 s of audio). For\n", - x$max_new_tokens)) + "Per call, up to max_new_tokens = %d (~40 s of audio). For\n", + x$max_new_tokens)) cat(sprintf( - "longer texts use tts_chunked() (sentence chunks, ~%d chars,\n", - x$chunk_chars)) + "longer texts use tts_chunked() (sentence chunks, ~%d chars,\n", + x$chunk_chars)) cat("one gc() per chunk). In your own batch loops, call gc() after\n") cat("each generate().\n") diff --git a/R/s3gen.R b/R/s3gen.R index a848e6f..be1e8c5 100644 --- a/R/s3gen.R +++ b/R/s3gen.R @@ -1122,7 +1122,7 @@ s3gen <- torch::nn_module( if (!is.null(speech_token_lens)) { gen_mel_lens <- (speech_token_len * 2L)$to(dtype = torch::torch_long()) gen_mask <- (!make_pad_mask(gen_mel_lens, - max_len = output_mels$size(3)))$unsqueeze(2)$to( + max_len = output_mels$size(3)))$unsqueeze(2)$to( dtype = output_mels$dtype, device = output_mels$device) output_mels <- output_mels * gen_mask } From 5d16209c2f5c0d11f25306f5ab26314c7f02e3db Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 12 Jun 2026 17:00:17 -0500 Subject: [PATCH 3/4] Bump version to 0.1.0.9 --- DESCRIPTION | 2 +- NEWS.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6966fee..53ac374 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: chatterbox Title: Text-to-Speech Using Chatterbox TTS Engine -Version: 0.1.0.8 +Version: 0.1.0.9 Authors@R: c(person("Troy", "Hernandez", role = c("aut", "cre"), email = "troy@cornball.ai", diff --git a/NEWS.md b/NEWS.md index 8b20134..b33476e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# chatterbox 0.1.0.9 (development) + +- New `chatterbox_defaults()`: detects the GPU and returns the full + recommended setup (GC options, backend, token budget, chunking + threshold) as a pasteable snippet. +- 6GB hardware validation: jit measures 35-38 ms/token vs container 30; + per-card guidance updated (jit is fastest on every measured card). + # chatterbox 0.1.0.8 (development) - New `generate_batch()`: several texts, one batched S3Gen synthesis From 11db02ef0e653282085aaf529bcda7f3fcd7ec14 Mon Sep 17 00:00:00 2001 From: TroyHernandez Date: Fri, 12 Jun 2026 17:15:08 -0500 Subject: [PATCH 4/4] Review fixes: stale vignette sentence, native-backend wording, small-card threshold - Vignette no longer claims jit is un-revalidated on 6GB (it was, this PR). - 'jit wins/fastest' tightened to 'fastest native path/backend' - it does not beat the container. - Cards under 5GB are treated as CPU (model floor ~4.6GB, measured 6GB peak 4.7GB); measured flag no longer sweeps 4-5.5GB or 13GB into measured tiers. --- CLAUDE.md | 6 ++++-- NEWS.md | 3 ++- R/defaults.R | 19 +++++++++++++------ inst/tinytest/test_defaults.R | 12 +++++++++++- man/chatterbox_defaults.Rd | 3 ++- vignettes/performance.md | 3 ++- 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 836e0ac..fb48c0b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -696,7 +696,8 @@ See `vignettes/performance.md` for the full story. Two facts dominate: End-to-end long-form (~20s audio): jit ~6s wall vs container ~6s - container parity. On 6GB hardware (GTX 1660 Ti, rate 0.75, June 2026): -jit 35-38 ms/token (4.7GB peak) vs container 30 - jit wins there too; +jit 35-38 ms/token (4.7GB peak) vs container 30 - the fastest native +path there too; traced 88-94 but its 350-position cache truncates long-form at ~120 tokens; pure R 254-287. `chatterbox_defaults()` returns the per-card setup (GC tier + backend + chunking). @@ -726,7 +727,8 @@ There is no `useDynLib` and no compiled code. ### When to Use What -- jit + tuned GC: default on any GPU (fastest on both measured cards). +- jit + tuned GC: default on any GPU (fastest native path on both + measured cards). - Container: production deployments via tts.api/gpu.ctl. - Traced: niche - short utterances only (350-position cache cap). - Pure R: debugging, CPU-only. diff --git a/NEWS.md b/NEWS.md index b33476e..fbc590b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,8 @@ recommended setup (GC options, backend, token budget, chunking threshold) as a pasteable snippet. - 6GB hardware validation: jit measures 35-38 ms/token vs container 30; - per-card guidance updated (jit is fastest on every measured card). + per-card guidance updated (jit is the fastest native backend on every + measured card). # chatterbox 0.1.0.8 (development) diff --git a/R/defaults.R b/R/defaults.R index a279ba2..cb518da 100644 --- a/R/defaults.R +++ b/R/defaults.R @@ -18,7 +18,8 @@ #' model) and marked as such when printed. #' #' @param vram_gb Total GPU memory in GB. Default: detected via -#' nvidia-smi; 0 (or detection failure) means CPU-only. +#' nvidia-smi; 0 (or detection failure) means CPU-only. Cards under +#' 5 GB are treated as CPU: the loaded model alone needs ~4.6 GB. #' @return An object of class \code{"chatterbox_defaults"}: a list with #' \code{device}, \code{vram_gb}, \code{options} (for #' \code{do.call(options, ...)} before torch loads), \code{backend}, @@ -44,10 +45,12 @@ chatterbox_defaults <- function(vram_gb = NULL) { } } - if (vram_gb < 4) { - # CPU (or a card too small for the ~4.6 GB loaded model). - # The CUDA allocator knobs are irrelevant; only the CPU - # allocation odometer exists, and it measured as minor. + if (vram_gb < 5) { + # CPU, or a card too small to be supported: the loaded model + # floor is ~4.6 GB and the measured 6 GB peak was 4.7 GB, so + # anything under 5 GB cannot run the CUDA path. The CUDA + # allocator knobs are irrelevant; only the CPU allocation + # odometer exists, and it measured as minor. out <- list( device = "cpu", vram_gb = vram_gb, @@ -66,7 +69,11 @@ chatterbox_defaults <- function(vram_gb = NULL) { backend = "jit", max_new_tokens = 1000L, chunk_chars = 200L, - measured = vram_gb <= 6.5 || vram_gb > 12 + # Measured tiers: a 6 GB card (GTX 1660 Ti) and a + # 16 GB card (RTX 5060 Ti); near-miss sizes (5-5.5, + # 7-13) are projections of those measurements + measured = (vram_gb > 5.5 && vram_gb <= 6.5) || + vram_gb >= 14 ) } diff --git a/inst/tinytest/test_defaults.R b/inst/tinytest/test_defaults.R index 42b10d0..0f0fed4 100644 --- a/inst/tinytest/test_defaults.R +++ b/inst/tinytest/test_defaults.R @@ -24,8 +24,18 @@ expect_identical(dcpu$device, "cpu") expect_identical(dcpu$backend, "r") expect_identical(dcpu$options, list()) -# a 2 GB card cannot hold the model: treated as CPU +# cards under 5 GB cannot hold the ~4.6 GB model: treated as CPU expect_identical(chatterbox::chatterbox_defaults(vram_gb = 2)$device, "cpu") +expect_identical(chatterbox::chatterbox_defaults(vram_gb = 4)$device, "cpu") +expect_identical(chatterbox::chatterbox_defaults(vram_gb = 4.9)$device, "cpu") + +# 5-5.5 GB runs CUDA but is a projection, not the measured 6 GB tier +d5 <- chatterbox::chatterbox_defaults(vram_gb = 5) +expect_identical(d5$device, "cuda") +expect_false(d5$measured) + +# 13 GB sits between measured tiers: projected +expect_false(chatterbox::chatterbox_defaults(vram_gb = 13)$measured) # print methods run and return invisibly expect_stdout(print(d6), "jit") diff --git a/man/chatterbox_defaults.Rd b/man/chatterbox_defaults.Rd index feb969a..4a0330e 100644 --- a/man/chatterbox_defaults.Rd +++ b/man/chatterbox_defaults.Rd @@ -7,7 +7,8 @@ chatterbox_defaults(vram_gb = NULL) } \arguments{ \item{vram_gb}{Total GPU memory in GB. Default: detected via -nvidia-smi; 0 (or detection failure) means CPU-only.} +nvidia-smi; 0 (or detection failure) means CPU-only. Cards under +5 GB are treated as CPU: the loaded model alone needs ~4.6 GB.} } \value{ An object of class \code{"chatterbox_defaults"}: a list with diff --git a/vignettes/performance.md b/vignettes/performance.md index b5617cd..785dfb8 100644 --- a/vignettes/performance.md +++ b/vignettes/performance.md @@ -202,7 +202,8 @@ install order, was permanently dead in any CRAN-built binary, and could go stale on torch upgrades. The TorchScript route shares traced mode's deprecation caveat but none of those failure modes. The 6 GB rows below labelled cpp are historical measurements of the retired -backend; jit has not yet been re-validated on that hardware. +backend; jit was validated on that hardware in June 2026 (35-38 +ms/token - see the 6 GB section). ### Where the speed actually comes from