From becbba7e0eec2f45fdcaab86f1de30be21d00801 Mon Sep 17 00:00:00 2001
From: TroyHernandez <troy@cornball.ai>
Date: Fri, 12 Jun 2026 16:59:52 -0500
Subject: [PATCH 1/4] chatterbox_defaults(): hardware-adaptive setup + 6GB jit
 validation results

- New chatterbox_defaults(vram_gb = NULL): detects the GPU (CPU when
  absent or under 4GB) and returns GC options, backend, max_new_tokens,
  and chunking threshold as a classed object; print method emits a
  ready-to-paste setup snippet. Measured tiers labeled vs projected.
- 6GB validation results (GTX 1660 Ti, June 2026) folded into
  gc_options 6GB note, the performance vignette, and CLAUDE.md:
  jit 35-38 ms/token (4.7GB) vs container 30 - jit is fastest on every
  measured card; traced additionally truncates long-form at its
  350-position cap. The old 'traced wins on 6GB' guidance predated the
  jit backend.
- 12GB projected row added to the tier table.
---
 CLAUDE.md                        |  14 +++-
 NAMESPACE                        |   2 +
 R/defaults.R                     | 137 +++++++++++++++++++++++++++++++
 R/gc_options.R                   |  11 ++-
 inst/tinytest/test_defaults.R    |  37 +++++++++
 man/chatterbox_defaults.Rd       |  37 +++++++++
 man/print.chatterbox_defaults.Rd |  18 ++++
 vignettes/performance.md         |  34 +++++---
 8 files changed, 267 insertions(+), 23 deletions(-)
 create mode 100644 R/defaults.R
 create mode 100644 inst/tinytest/test_defaults.R
 create mode 100644 man/chatterbox_defaults.Rd
 create mode 100644 man/print.chatterbox_defaults.Rd

diff --git a/CLAUDE.md b/CLAUDE.md
index 8aa34cf..836e0ac 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -47,6 +47,7 @@ chatterbox
 | `generate(model, text, voice)` | Generate speech |
 | `create_voice_embedding(model, audio)` | Create speaker embedding |
 | `tts_chunked(model, text, voice)` | Long texts, sentence-chunked, gc per chunk |
+| `chatterbox_defaults()` | Per-card setup: GC options + backend + chunking thresholds |
 | `chatterbox_gc_options()` | Print torch GC settings for this GPU (set before torch loads) |
 | `quick_tts(text, ref_audio, output)` | One-liner convenience (loads whole model per call) |
 
@@ -694,8 +695,11 @@ See `vignettes/performance.md` for the full story. Two facts dominate:
 | lean eager R (ATen builtins, no nn_module) | 71 | proves the per-op R call is the cost, not wrapper style |
 
 End-to-end long-form (~20s audio): jit ~6s wall vs container ~6s -
-container parity. On 6GB hardware (GTX 1660 Ti, rate 0.75): traced
-88-94, pure R 300-360; jit not yet validated there.
+container parity. On 6GB hardware (GTX 1660 Ti, rate 0.75, June 2026):
+jit 35-38 ms/token (4.7GB peak) vs container 30 - jit wins there too;
+traced 88-94 but its 350-position cache truncates long-form at ~120
+tokens; pure R 254-287. `chatterbox_defaults()` returns the per-card
+setup (GC tier + backend + chunking).
 
 ### Architecture note: pure R package since June 2026
 
@@ -722,10 +726,12 @@ There is no `useDynLib` and no compiled code.
 
 ### When to Use What
 
-- jit + tuned GC: default on any GPU.
+- jit + tuned GC: default on any GPU (fastest on both measured cards).
 - Container: production deployments via tts.api/gpu.ctl.
-- Traced: long-running sessions, short utterances.
+- Traced: niche - short utterances only (350-position cache cap).
 - Pure R: debugging, CPU-only.
+- `chatterbox_defaults()`: detects the card, returns GC options +
+  backend + chunking thresholds as one pasteable snippet.
 ## Related
 
 - Alternative to tts.api container backend for local TTS (no Docker required)
diff --git a/NAMESPACE b/NAMESPACE
index 2524c59..5fc9960 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # tinyrox says don't edit this manually, but it can't stop you!
 
 export(chatterbox)
+export(chatterbox_defaults)
 export(chatterbox_gc_options)
 export(compute_mel_spectrogram)
 export(compute_mel_spectrogram_ve)
@@ -46,5 +47,6 @@ export(voice_convert)
 export(write_audio)
 
 S3method(print,chatterbox)
+S3method(print,chatterbox_defaults)
 S3method(print,chatterbox_gc_options)
 S3method(print,voice_embedding)
diff --git a/R/defaults.R b/R/defaults.R
new file mode 100644
index 0000000..ab9d5dc
--- /dev/null
+++ b/R/defaults.R
@@ -0,0 +1,137 @@
+# Hardware-adaptive defaults: GC settings, backend, and chunking
+# thresholds per detected GPU/CPU. Measured tiers: 16 GB (RTX 5060 Ti)
+# and 6 GB (GTX 1660 Ti); 8/12 GB projected from the tier rule.
+
+#' Recommended chatterbox settings for this machine
+#'
+#' Detects the GPU (or its absence) and returns everything worth setting
+#' for it: the torch GC options (which must be set BEFORE torch loads -
+#' see \code{\link{chatterbox_gc_options}} for why), the fastest
+#' validated backend, the per-call token budget, and when to switch to
+#' \code{\link{tts_chunked}}. Printing the result shows a ready-to-paste
+#' setup snippet.
+#'
+#' Measured tiers (long-form, tuned GC): 16 GB RTX 5060 Ti - jit
+#' 11 ms/token, container parity; 6 GB GTX 1660 Ti - jit 35-38 ms/token
+#' vs container 30, in 4.7 GB VRAM. The 8 and 12 GB tiers are projected
+#' from the rule (the GC trigger line must clear the ~4.6 GB loaded
+#' model) and marked as such when printed.
+#'
+#' @param vram_gb Total GPU memory in GB. Default: detected via
+#'   nvidia-smi; 0 (or detection failure) means CPU-only.
+#' @return An object of class \code{"chatterbox_defaults"}: a list with
+#'   \code{device}, \code{vram_gb}, \code{options} (for
+#'   \code{do.call(options, ...)} before torch loads), \code{backend},
+#'   \code{max_new_tokens}, \code{chunk_chars}, and \code{measured}.
+#' @examples
+#' chatterbox_defaults(vram_gb = 6)
+#' chatterbox_defaults(vram_gb = 0) # CPU
+#' @export
+chatterbox_defaults <- function (vram_gb = NULL) {
+    if (is.null(vram_gb)) {
+        smi <- suppressWarnings(tryCatch(
+            system2("nvidia-smi",
+                c("--query-gpu=memory.total", "--format=csv,noheader,nounits"),
+                stdout = TRUE, stderr = FALSE),
+            error = function (e) character(0)
+        ))
+        vram_gb <- if (length(smi) >= 1 && nzchar(smi[1]) &&
+            !is.na(suppressWarnings(as.numeric(smi[1])))) {
+            round(as.numeric(smi[1]) / 1024, 1)
+        } else {
+            0
+        }
+    }
+
+    if (vram_gb < 4) {
+        # CPU (or a card too small for the ~4.6 GB loaded model).
+        # The CUDA allocator knobs are irrelevant; only the CPU
+        # allocation odometer exists, and it measured as minor.
+        out <- list(
+            device = "cpu",
+            vram_gb = vram_gb,
+            options = list(),
+            backend = "r",
+            max_new_tokens = 1000L,
+            chunk_chars = 200L,
+            measured = FALSE
+        )
+    } else {
+        rate <- if (vram_gb <= 6.5) 0.75 else if (vram_gb <= 10) 0.6 else 0.5
+        out <- list(
+            device = "cuda",
+            vram_gb = vram_gb,
+            options = list(torch.cuda_allocator_reserved_rate = rate),
+            backend = "jit",
+            max_new_tokens = 1000L,
+            chunk_chars = 200L,
+            measured = vram_gb <= 6.5 || vram_gb > 12
+        )
+    }
+
+    if (isNamespaceLoaded("torch") && length(out$options) > 0) {
+        warning("torch is already initialized in this session; the GC ",
+            "options take effect only in a fresh R session that sets ",
+            "them before torch loads.", call. = FALSE)
+    }
+
+    structure(out, class = "chatterbox_defaults")
+}
+
+#' Print method for chatterbox_defaults
+#'
+#' @param x Object from \code{\link{chatterbox_defaults}}
+#' @param ... Ignored
+#' @return \code{x}, invisibly
+#' @export
+print.chatterbox_defaults <- function (x, ...) {
+    if (x$device == "cpu") {
+        cat("CPU-only setup (no usable GPU detected).\n\n",
+            "    library(chatterbox)\n",
+            "    model <- load_chatterbox(chatterbox(\"cpu\"))\n\n",
+            "Use backend = \"r\". Expect minutes per utterance; for\n",
+            "anything longer than a sentence or two, use tts_chunked()\n",
+            "so audio arrives incrementally.\n", sep = "")
+        return(invisible(x))
+    }
+
+    tier <- if (isTRUE(x$measured)) "measured" else "projected"
+    rate <- x$options$torch.cuda_allocator_reserved_rate
+    cat(sprintf("Recommended for a %s GB GPU (%s tier) - put the\n",
+        format(x$vram_gb), tier))
+    cat("options() line in .Rprofile or at the top of your script,\n")
+    cat("BEFORE torch loads:\n\n")
+    cat(sprintf("    options(torch.cuda_allocator_reserved_rate = %.2f)\n",
+        rate))
+    cat("    library(chatterbox)\n")
+    cat("    model <- load_chatterbox(chatterbox(\"cuda\"))\n")
+    cat(sprintf(
+        "    result <- generate(model, text, voice, backend = \"%s\")\n\n",
+        x$backend))
+    cat(sprintf(
+        "Per call, up to max_new_tokens = %d (~40 s of audio). For\n",
+        x$max_new_tokens))
+    cat(sprintf(
+        "longer texts use tts_chunked() (sentence chunks, ~%d chars,\n",
+        x$chunk_chars))
+    cat("one gc() per chunk). In your own batch loops, call gc() after\n")
+    cat("each generate().\n")
+
+    if (x$vram_gb <= 6.5) {
+        cat("\nNote: on a ", format(x$vram_gb), " GB card the model floor",
+            " leaves little headroom,\nso the 0.8 backstop still fires",
+            " some collections. Measured on a\nGTX 1660 Ti: jit",
+            " 35-38 ms/token (~4.7 GB peak) vs container 30;\npure R",
+            " ~10x slower. Do NOT lower",
+            " torch.cuda_allocator_allocated_rate\nhere - 60% of a small",
+            " card sits below the model floor and recreates\nthe",
+            " constant-collection regime.\n", sep = "")
+    } else if (x$vram_gb >= 8) {
+        cat("\nOptional, to hold the VRAM plateau lower (e.g. shared",
+            " GPUs), at\nno speed cost:\n\n",
+            "    options(torch.cuda_allocator_allocated_rate = 0.6)\n",
+            sep = "")
+    }
+
+    invisible(x)
+}
diff --git a/R/gc_options.R b/R/gc_options.R
index 11f9a9e..cc09a56 100644
--- a/R/gc_options.R
+++ b/R/gc_options.R
@@ -102,12 +102,11 @@ print.chatterbox_gc_options <- function(x, ...) {
     if (vram_gb <= 6.5) {
         cat("\nNote: on a ", vram_gb, " GB card the model floor leaves",
             " little headroom, so the\n0.8 backstop still fires some",
-            " collections: expect ~3-5x from tuning for\npure R, not",
-            " the ~10x larger cards see. traced = TRUE measured",
-            " fastest\non 6 GB hardware (88-94 ms/token, ~5 GB peak -",
-            " tight but it fits).\nDo NOT lower allocated_rate here -",
-            " 60% of a small card sits below\nthe model floor and",
-            " recreates the constant-collection regime.\n", sep = "")
+            " collections. backend = \"jit\" measured\nfastest on 6 GB",
+            " hardware (35-38 ms/token, ~4.7 GB peak, vs the\n",
+            "container's 30). Do NOT lower allocated_rate here - 60%",
+            " of a small\ncard sits below the model floor and recreates",
+            " the\nconstant-collection regime.\n", sep = "")
     }
 
     invisible(x)
diff --git a/inst/tinytest/test_defaults.R b/inst/tinytest/test_defaults.R
new file mode 100644
index 0000000..42b10d0
--- /dev/null
+++ b/inst/tinytest/test_defaults.R
@@ -0,0 +1,37 @@
+# chatterbox_defaults tier logic (no GPU or weights needed)
+
+d6 <- chatterbox::chatterbox_defaults(vram_gb = 6)
+expect_inherits(d6, "chatterbox_defaults")
+expect_identical(d6$device, "cuda")
+expect_equal(d6$options$torch.cuda_allocator_reserved_rate, 0.75)
+expect_identical(d6$backend, "jit")
+expect_true(d6$measured)
+
+d8 <- chatterbox::chatterbox_defaults(vram_gb = 8)
+expect_equal(d8$options$torch.cuda_allocator_reserved_rate, 0.6)
+expect_false(d8$measured)
+
+d12 <- chatterbox::chatterbox_defaults(vram_gb = 12)
+expect_equal(d12$options$torch.cuda_allocator_reserved_rate, 0.5)
+expect_false(d12$measured)
+
+d16 <- chatterbox::chatterbox_defaults(vram_gb = 16)
+expect_equal(d16$options$torch.cuda_allocator_reserved_rate, 0.5)
+expect_true(d16$measured)
+
+dcpu <- chatterbox::chatterbox_defaults(vram_gb = 0)
+expect_identical(dcpu$device, "cpu")
+expect_identical(dcpu$backend, "r")
+expect_identical(dcpu$options, list())
+
+# a 2 GB card cannot hold the model: treated as CPU
+expect_identical(chatterbox::chatterbox_defaults(vram_gb = 2)$device, "cpu")
+
+# print methods run and return invisibly
+expect_stdout(print(d6), "jit")
+expect_stdout(print(d6), "0.75")
+expect_stdout(print(d8), "projected")
+expect_stdout(print(dcpu), "CPU-only")
+
+# the GC option is applicable directly
+expect_silent(do.call(options, d16$options))
diff --git a/man/chatterbox_defaults.Rd b/man/chatterbox_defaults.Rd
new file mode 100644
index 0000000..feb969a
--- /dev/null
+++ b/man/chatterbox_defaults.Rd
@@ -0,0 +1,37 @@
+% tinyrox says don't edit this manually, but it can't stop you!
+\name{chatterbox_defaults}
+\alias{chatterbox_defaults}
+\title{Recommended chatterbox settings for this machine}
+\usage{
+chatterbox_defaults(vram_gb = NULL)
+}
+\arguments{
+\item{vram_gb}{Total GPU memory in GB. Default: detected via
+nvidia-smi; 0 (or detection failure) means CPU-only.}
+}
+\value{
+An object of class \code{"chatterbox_defaults"}: a list with
+  \code{device}, \code{vram_gb}, \code{options} (for
+  \code{do.call(options, ...)} before torch loads), \code{backend},
+  \code{max_new_tokens}, \code{chunk_chars}, and \code{measured}.
+}
+\description{
+Detects the GPU (or its absence) and returns everything worth setting
+for it: the torch GC options (which must be set BEFORE torch loads -
+see \code{\link{chatterbox_gc_options}} for why), the fastest
+validated backend, the per-call token budget, and when to switch to
+\code{\link{tts_chunked}}. Printing the result shows a ready-to-paste
+setup snippet.
+}
+\details{
+Measured tiers (long-form, tuned GC): 16 GB RTX 5060 Ti - jit
+11 ms/token, container parity; 6 GB GTX 1660 Ti - jit 35-38 ms/token
+vs container 30, in 4.7 GB VRAM. The 8 and 12 GB tiers are projected
+from the rule (the GC trigger line must clear the ~4.6 GB loaded
+model) and marked as such when printed.
+
+}
+\examples{
+chatterbox_defaults(vram_gb = 6)
+chatterbox_defaults(vram_gb = 0) # CPU
+}
diff --git a/man/print.chatterbox_defaults.Rd b/man/print.chatterbox_defaults.Rd
new file mode 100644
index 0000000..945fa07
--- /dev/null
+++ b/man/print.chatterbox_defaults.Rd
@@ -0,0 +1,18 @@
+% tinyrox says don't edit this manually, but it can't stop you!
+\name{print.chatterbox_defaults}
+\alias{print.chatterbox_defaults}
+\title{Print method for chatterbox_defaults}
+\usage{
+\method{print}{chatterbox_defaults}(x, ...)
+}
+\arguments{
+\item{x}{Object from \code{\link{chatterbox_defaults}}}
+
+\item{...}{Ignored}
+}
+\value{
+\code{x}, invisibly
+}
+\description{
+Print method for chatterbox_defaults
+}
diff --git a/vignettes/performance.md b/vignettes/performance.md
index 7230cbf..b5617cd 100644
--- a/vignettes/performance.md
+++ b/vignettes/performance.md
@@ -22,16 +22,15 @@ settings that matter more than the backend choice.
 - Scripts: `scripts/bench_backends.R`, `scripts/tune_gc.R`,
   `scripts/profile_backends.R`
 
-**Scope caveat: all numbers are from this one machine.** The GC
+**Scope caveat: most numbers are from this one machine.** The GC
 mechanism is R/torch-side and applies on any CUDA GPU, and the cliff
 is ratio arithmetic - collections strangle inference whenever the
 model's reserved fraction of the card exceeds `reserved_rate`. So
 severity scales with card size: a 24 GB card's ~19% floor sits under
 the default 0.2 line and may never hit this at all, while small cards
-live deep past it. Absolute ms/token figures do not travel, and the
-cpp-vs-traced ranking is measured to FLIP between machines (cpp wins
-on the 16 GB desktop, traced wins on a 6 GB laptop) - benchmark on
-your own hardware before choosing.
+live deep past it. Absolute ms/token figures do not travel; the 6 GB
+section below has the second measured machine, and
+`chatterbox_defaults()` encodes both tiers.
 
 ## Headline Numbers
 
@@ -110,9 +109,13 @@ total VRAM, must clear what the loaded model reserves):
 | card | reserved_rate | status |
 |------|--------------|--------|
 | 16 GB | 0.50 | measured (RTX 5060 Ti) |
+| 12 GB | 0.50 | projected from the rule; not yet validated |
 | 8 GB | 0.60 | projected from the rule; not yet validated |
 | 6 GB | 0.75 | measured (GTX 1660 Ti) |
 
+`chatterbox_defaults()` returns the full per-card setup (this option
+plus backend and chunking thresholds) as a ready-to-paste snippet.
+
 To validate on new hardware, run `scripts/tune_gc.R` with a few values;
 the win is a cliff, so any rate that clears the floor gives full speed.
 Optionally add `torch.cuda_allocator_allocated_rate = 0.6` to hold the
@@ -127,8 +130,9 @@ recreates the constant-collection regime.
 | default | 1032-1811 | 3.6 GB flat |
 | reserved_rate 0.75 | **300-360** | 4.4-5.4 GB oscillating |
 | 0.75 + allocated_rate 0.6 | 423-441 (worse) | 4.7-4.9 GB |
-| 0.75, traced (warm) | **88-94** | 5.0 GB - tight, no OOM |
-| 0.75, cpp | 150-163 | 4.4 GB stable |
+| 0.75, jit, long-form (June 2026) | **35-38** | 4.7 GB |
+| 0.75, traced (warm, short text) | 88-94 | 5.0 GB - tight, no OOM |
+| 0.75, cpp (retired) | 150-163 | 4.4 GB stable |
 | 0.75, long text (~20-23s audio) | 351-392, completes | 4.4 GB stable |
 | 0.9 + 0.9 backstop, short text | 302-305, steadier | 5.3 GB flat |
 | 0.9 + 0.9 backstop, long text | **OOM, both runs** | - |
@@ -140,12 +144,16 @@ visible as the oscillating VRAM). The allocated_rate=0.6 row is the
 floor rule above demonstrated: 60% of this card sits below the model
 floor.
 
-Note the backend ranking **flips** on this card: traced beats cpp here,
-the reverse of the 16 GB machine. cpp's per-token cost is hundreds of
-small host-side dispatches (a laptop CPU pays full price); traced's few
-fused launches hold up on weak hosts. Both compiled paths beat pure R
-everywhere measured - which one wins is machine-dependent, so benchmark
-on your hardware (`scripts/bench_backends.R`).
+The jit row (validated June 2026, after the cpp retirement) settles the
+backend question on small cards too: 35-38 ms/token long-form against
+the container's 30 on the same box - within ~25% of Python, 2.6x faster
+than traced, ~8x faster than pure R, in 4.7 GB. Traced is additionally
+disqualified for long-form here: its 350-position cache cap truncates
+this test's text at 120 tokens (~5 s of audio) without an EOS. The
+earlier "traced wins on 6 GB" finding was an artifact of jit not yet
+existing when those rows were measured. **backend = "jit" is the
+recommendation on every measured card**; `chatterbox_defaults()`
+returns it along with the GC tier.
 
 The 0.9 rows are why the backstop stays at its default. Pushing both
 lines to 90% runs steadier on short utterances (the card parks at a

From 688b4f810dd3816f2145b8b25a61a65c92d45a26 Mon Sep 17 00:00:00 2001
From: TroyHernandez <troy@cornball.ai>
Date: Fri, 12 Jun 2026 17:00:17 -0500
Subject: [PATCH 2/4] rformat + document

---
 R/defaults.R | 69 ++++++++++++++++++++++++++++------------------------
 R/s3gen.R    |  2 +-
 2 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/R/defaults.R b/R/defaults.R
index ab9d5dc..a279ba2 100644
--- a/R/defaults.R
+++ b/R/defaults.R
@@ -27,14 +27,15 @@
 #' chatterbox_defaults(vram_gb = 6)
 #' chatterbox_defaults(vram_gb = 0) # CPU
 #' @export
-chatterbox_defaults <- function (vram_gb = NULL) {
+chatterbox_defaults <- function(vram_gb = NULL) {
     if (is.null(vram_gb)) {
         smi <- suppressWarnings(tryCatch(
-            system2("nvidia-smi",
-                c("--query-gpu=memory.total", "--format=csv,noheader,nounits"),
-                stdout = TRUE, stderr = FALSE),
-            error = function (e) character(0)
-        ))
+                system2("nvidia-smi",
+                        c("--query-gpu=memory.total",
+                          "--format=csv,noheader,nounits"),
+                        stdout = TRUE, stderr = FALSE),
+                error = function(e) character(0)
+            ))
         vram_gb <- if (length(smi) >= 1 && nzchar(smi[1]) &&
             !is.na(suppressWarnings(as.numeric(smi[1])))) {
             round(as.numeric(smi[1]) / 1024, 1)
@@ -48,31 +49,31 @@ chatterbox_defaults <- function (vram_gb = NULL) {
         # The CUDA allocator knobs are irrelevant; only the CPU
         # allocation odometer exists, and it measured as minor.
         out <- list(
-            device = "cpu",
-            vram_gb = vram_gb,
-            options = list(),
-            backend = "r",
-            max_new_tokens = 1000L,
-            chunk_chars = 200L,
-            measured = FALSE
+                    device = "cpu",
+                    vram_gb = vram_gb,
+                    options = list(),
+                    backend = "r",
+                    max_new_tokens = 1000L,
+                    chunk_chars = 200L,
+                    measured = FALSE
         )
     } else {
         rate <- if (vram_gb <= 6.5) 0.75 else if (vram_gb <= 10) 0.6 else 0.5
         out <- list(
-            device = "cuda",
-            vram_gb = vram_gb,
-            options = list(torch.cuda_allocator_reserved_rate = rate),
-            backend = "jit",
-            max_new_tokens = 1000L,
-            chunk_chars = 200L,
-            measured = vram_gb <= 6.5 || vram_gb > 12
+                    device = "cuda",
+                    vram_gb = vram_gb,
+                    options = list(torch.cuda_allocator_reserved_rate = rate),
+                    backend = "jit",
+                    max_new_tokens = 1000L,
+                    chunk_chars = 200L,
+                    measured = vram_gb <= 6.5 || vram_gb > 12
         )
     }
 
     if (isNamespaceLoaded("torch") && length(out$options) > 0) {
         warning("torch is already initialized in this session; the GC ",
-            "options take effect only in a fresh R session that sets ",
-            "them before torch loads.", call. = FALSE)
+                "options take effect only in a fresh R session that sets ",
+                "them before torch loads.", call. = FALSE)
     }
 
     structure(out, class = "chatterbox_defaults")
@@ -84,7 +85,7 @@ chatterbox_defaults <- function (vram_gb = NULL) {
 #' @param ... Ignored
 #' @return \code{x}, invisibly
 #' @export
-print.chatterbox_defaults <- function (x, ...) {
+print.chatterbox_defaults <- function(x, ...) {
     if (x$device == "cpu") {
         cat("CPU-only setup (no usable GPU detected).\n\n",
             "    library(chatterbox)\n",
@@ -95,25 +96,29 @@ print.chatterbox_defaults <- function (x, ...) {
         return(invisible(x))
     }
 
-    tier <- if (isTRUE(x$measured)) "measured" else "projected"
+    if (isTRUE(x$measured)) {
+        tier <- "measured"
+    } else {
+        tier <- "projected"
+    }
     rate <- x$options$torch.cuda_allocator_reserved_rate
     cat(sprintf("Recommended for a %s GB GPU (%s tier) - put the\n",
-        format(x$vram_gb), tier))
+                format(x$vram_gb), tier))
     cat("options() line in .Rprofile or at the top of your script,\n")
     cat("BEFORE torch loads:\n\n")
     cat(sprintf("    options(torch.cuda_allocator_reserved_rate = %.2f)\n",
-        rate))
+                rate))
     cat("    library(chatterbox)\n")
     cat("    model <- load_chatterbox(chatterbox(\"cuda\"))\n")
     cat(sprintf(
-        "    result <- generate(model, text, voice, backend = \"%s\")\n\n",
-        x$backend))
+                "    result <- generate(model, text, voice, backend = \"%s\")\n\n",
+                x$backend))
     cat(sprintf(
-        "Per call, up to max_new_tokens = %d (~40 s of audio). For\n",
-        x$max_new_tokens))
+                "Per call, up to max_new_tokens = %d (~40 s of audio). For\n",
+                x$max_new_tokens))
     cat(sprintf(
-        "longer texts use tts_chunked() (sentence chunks, ~%d chars,\n",
-        x$chunk_chars))
+                "longer texts use tts_chunked() (sentence chunks, ~%d chars,\n",
+                x$chunk_chars))
     cat("one gc() per chunk). In your own batch loops, call gc() after\n")
     cat("each generate().\n")
 
diff --git a/R/s3gen.R b/R/s3gen.R
index a848e6f..be1e8c5 100644
--- a/R/s3gen.R
+++ b/R/s3gen.R
@@ -1122,7 +1122,7 @@ s3gen <- torch::nn_module(
     if (!is.null(speech_token_lens)) {
         gen_mel_lens <- (speech_token_len * 2L)$to(dtype = torch::torch_long())
         gen_mask <- (!make_pad_mask(gen_mel_lens,
-            max_len = output_mels$size(3)))$unsqueeze(2)$to(
+                                    max_len = output_mels$size(3)))$unsqueeze(2)$to(
             dtype = output_mels$dtype, device = output_mels$device)
         output_mels <- output_mels * gen_mask
     }

From 5d16209c2f5c0d11f25306f5ab26314c7f02e3db Mon Sep 17 00:00:00 2001
From: TroyHernandez <troy@cornball.ai>
Date: Fri, 12 Jun 2026 17:00:17 -0500
Subject: [PATCH 3/4] Bump version to 0.1.0.9

---
 DESCRIPTION | 2 +-
 NEWS.md     | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 6966fee..53ac374 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chatterbox
 Title: Text-to-Speech Using Chatterbox TTS Engine
-Version: 0.1.0.8
+Version: 0.1.0.9
 Authors@R:
     c(person("Troy", "Hernandez", role = c("aut", "cre"),
              email = "troy@cornball.ai",
diff --git a/NEWS.md b/NEWS.md
index 8b20134..b33476e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,11 @@
+# chatterbox 0.1.0.9 (development)
+
+- New `chatterbox_defaults()`: detects the GPU and returns the full
+  recommended setup (GC options, backend, token budget, chunking
+  threshold) as a pasteable snippet.
+- 6GB hardware validation: jit measures 35-38 ms/token vs container 30;
+  per-card guidance updated (jit is fastest on every measured card).
+
 # chatterbox 0.1.0.8 (development)
 
 - New `generate_batch()`: several texts, one batched S3Gen synthesis

From 11db02ef0e653282085aaf529bcda7f3fcd7ec14 Mon Sep 17 00:00:00 2001
From: TroyHernandez <troy@cornball.ai>
Date: Fri, 12 Jun 2026 17:15:08 -0500
Subject: [PATCH 4/4] Review fixes: stale vignette sentence, native-backend
 wording, small-card threshold

- Vignette no longer claims jit is un-revalidated on 6GB (it was, this PR).
- 'jit wins/fastest' tightened to 'fastest native path/backend' - it
  does not beat the container.
- Cards under 5GB are treated as CPU (model floor ~4.6GB, measured 6GB
  peak 4.7GB); measured flag no longer sweeps 4-5.5GB or 13GB into
  measured tiers.
---
 CLAUDE.md                     |  6 ++++--
 NEWS.md                       |  3 ++-
 R/defaults.R                  | 19 +++++++++++++------
 inst/tinytest/test_defaults.R | 12 +++++++++++-
 man/chatterbox_defaults.Rd    |  3 ++-
 vignettes/performance.md      |  3 ++-
 6 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 836e0ac..fb48c0b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -696,7 +696,8 @@ See `vignettes/performance.md` for the full story. Two facts dominate:
 
 End-to-end long-form (~20s audio): jit ~6s wall vs container ~6s -
 container parity. On 6GB hardware (GTX 1660 Ti, rate 0.75, June 2026):
-jit 35-38 ms/token (4.7GB peak) vs container 30 - jit wins there too;
+jit 35-38 ms/token (4.7GB peak) vs container 30 - the fastest native
+path there too;
 traced 88-94 but its 350-position cache truncates long-form at ~120
 tokens; pure R 254-287. `chatterbox_defaults()` returns the per-card
 setup (GC tier + backend + chunking).
@@ -726,7 +727,8 @@ There is no `useDynLib` and no compiled code.
 
 ### When to Use What
 
-- jit + tuned GC: default on any GPU (fastest on both measured cards).
+- jit + tuned GC: default on any GPU (fastest native path on both
+  measured cards).
 - Container: production deployments via tts.api/gpu.ctl.
 - Traced: niche - short utterances only (350-position cache cap).
 - Pure R: debugging, CPU-only.
diff --git a/NEWS.md b/NEWS.md
index b33476e..fbc590b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,7 +4,8 @@
   recommended setup (GC options, backend, token budget, chunking
   threshold) as a pasteable snippet.
 - 6GB hardware validation: jit measures 35-38 ms/token vs container 30;
-  per-card guidance updated (jit is fastest on every measured card).
+  per-card guidance updated (jit is the fastest native backend on every
+  measured card).
 
 # chatterbox 0.1.0.8 (development)
 
diff --git a/R/defaults.R b/R/defaults.R
index a279ba2..cb518da 100644
--- a/R/defaults.R
+++ b/R/defaults.R
@@ -18,7 +18,8 @@
 #' model) and marked as such when printed.
 #'
 #' @param vram_gb Total GPU memory in GB. Default: detected via
-#'   nvidia-smi; 0 (or detection failure) means CPU-only.
+#'   nvidia-smi; 0 (or detection failure) means CPU-only. Cards under
+#'   5 GB are treated as CPU: the loaded model alone needs ~4.6 GB.
 #' @return An object of class \code{"chatterbox_defaults"}: a list with
 #'   \code{device}, \code{vram_gb}, \code{options} (for
 #'   \code{do.call(options, ...)} before torch loads), \code{backend},
@@ -44,10 +45,12 @@ chatterbox_defaults <- function(vram_gb = NULL) {
         }
     }
 
-    if (vram_gb < 4) {
-        # CPU (or a card too small for the ~4.6 GB loaded model).
-        # The CUDA allocator knobs are irrelevant; only the CPU
-        # allocation odometer exists, and it measured as minor.
+    if (vram_gb < 5) {
+        # CPU, or a card too small to be supported: the loaded model
+        # floor is ~4.6 GB and the measured 6 GB peak was 4.7 GB, so
+        # anything under 5 GB cannot run the CUDA path. The CUDA
+        # allocator knobs are irrelevant; only the CPU allocation
+        # odometer exists, and it measured as minor.
         out <- list(
                     device = "cpu",
                     vram_gb = vram_gb,
@@ -66,7 +69,11 @@ chatterbox_defaults <- function(vram_gb = NULL) {
                     backend = "jit",
                     max_new_tokens = 1000L,
                     chunk_chars = 200L,
-                    measured = vram_gb <= 6.5 || vram_gb > 12
+                    # Measured tiers: a 6 GB card (GTX 1660 Ti) and a
+                    # 16 GB card (RTX 5060 Ti); any size outside
+                    # (5.5, 6.5] and below 14 GB is a projection
+                    measured = (vram_gb > 5.5 && vram_gb <= 6.5) ||
+                        vram_gb >= 14
         )
     }
 
diff --git a/inst/tinytest/test_defaults.R b/inst/tinytest/test_defaults.R
index 42b10d0..0f0fed4 100644
--- a/inst/tinytest/test_defaults.R
+++ b/inst/tinytest/test_defaults.R
@@ -24,8 +24,18 @@ expect_identical(dcpu$device, "cpu")
 expect_identical(dcpu$backend, "r")
 expect_identical(dcpu$options, list())
 
-# a 2 GB card cannot hold the model: treated as CPU
+# cards under 5 GB cannot hold the ~4.6 GB model: treated as CPU
 expect_identical(chatterbox::chatterbox_defaults(vram_gb = 2)$device, "cpu")
+expect_identical(chatterbox::chatterbox_defaults(vram_gb = 4)$device, "cpu")
+expect_identical(chatterbox::chatterbox_defaults(vram_gb = 4.9)$device, "cpu")
+
+# 5-5.5 GB runs CUDA but is a projection, not the measured 6 GB tier
+d5 <- chatterbox::chatterbox_defaults(vram_gb = 5)
+expect_identical(d5$device, "cuda")
+expect_false(d5$measured)
+
+# 13 GB sits between measured tiers: projected
+expect_false(chatterbox::chatterbox_defaults(vram_gb = 13)$measured)
 
 # print methods run and return invisibly
 expect_stdout(print(d6), "jit")
diff --git a/man/chatterbox_defaults.Rd b/man/chatterbox_defaults.Rd
index feb969a..4a0330e 100644
--- a/man/chatterbox_defaults.Rd
+++ b/man/chatterbox_defaults.Rd
@@ -7,7 +7,8 @@ chatterbox_defaults(vram_gb = NULL)
 }
 \arguments{
 \item{vram_gb}{Total GPU memory in GB. Default: detected via
-nvidia-smi; 0 (or detection failure) means CPU-only.}
+nvidia-smi; 0 (or detection failure) means CPU-only. Cards under
+5 GB are treated as CPU: the loaded model alone needs ~4.6 GB.}
 }
 \value{
 An object of class \code{"chatterbox_defaults"}: a list with
diff --git a/vignettes/performance.md b/vignettes/performance.md
index b5617cd..785dfb8 100644
--- a/vignettes/performance.md
+++ b/vignettes/performance.md
@@ -202,7 +202,8 @@ install order, was permanently dead in any CRAN-built binary, and
 could go stale on torch upgrades. The TorchScript route shares traced
 mode's deprecation caveat but none of those failure modes. The 6 GB
 rows below labelled cpp are historical measurements of the retired
-backend; jit has not yet been re-validated on that hardware.
+backend; jit was validated on that hardware in June 2026 (35-38
+ms/token - see the 6 GB section).
 
 ### Where the speed actually comes from