Merged
4 changes: 4 additions & 0 deletions .cargo/config.toml
@@ -0,0 +1,4 @@
+[alias]
+b = "build --features metal"
+r = "run --features metal"
+t = "test --features metal"
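These aliases bake the `metal` feature into everyday cargo invocations. An illustrative transcript of the expansions (the `"hi"` argument is just an example):

```
cargo b            # ≡ cargo build --features metal
cargo r -- "hi"    # ≡ cargo run --features metal -- "hi"
cargo t            # ≡ cargo test --features metal
```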
13 changes: 13 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -19,7 +19,7 @@ anyhow = "1"
rusqlite = { version = "0.32", features = ["bundled"] }
dirs = "6"
reqwest = { version = "0.12", features = ["blocking", "json", "stream"] }
-tokio = { version = "1", features = ["rt", "net", "io-util"] }
+tokio = { version = "1", features = ["rt", "rt-multi-thread", "net", "io-util", "macros", "signal", "time"] }
futures-util = "0.3"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
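The widened tokio feature list tracks what daemon-style code typically needs. A commented sketch of the same dependency line; the per-feature rationale is an inference from this PR, not documented in the diff:

```toml
tokio = { version = "1", features = [
    "rt",              # core async runtime
    "rt-multi-thread", # multi-threaded scheduler (default for #[tokio::main])
    "net",             # TCP/Unix listeners for the daemon
    "io-util",         # AsyncReadExt / AsyncWriteExt helpers
    "macros",          # #[tokio::main], tokio::select!
    "signal",          # ctrl_c() for graceful shutdown
    "time",            # sleep / timeout
] }
```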
22 changes: 14 additions & 8 deletions README.md
@@ -27,19 +27,25 @@ Cross-platform TTS CLI with five backends and MCP server for AI assistants.

### Benchmark — single sentence (~50 chars)

-Real-world measurements. Cold start = first run (includes model loading). Warm = model cached on disk.
+All times measured end-to-end (model loading + inference + audio playback). Cold = first CLI call.

-| Backend | M2 Pro (CPU) | RTX 4070 Ti SUPER (CUDA) | Voice cloning | Quality |
+| Backend | M2 Pro (CPU) | RTX 4070 Ti SUPER | Voice cloning | Quality |
|---------|-------------:|-------------------------:|:---:|---------|
| **`say`** | **3s** | macOS only | No | System voices |
| **`kokoro`** | **10s** | ~10s | No | Good |
-| **`voxtream`** | **68s** / 8s warm | **44s** / **22s** warm | Yes (zero-shot) | Excellent |
-| **`qwen-native`** | **11m33s** / 3s warm | ~30s / ~2s warm | Yes | Excellent |
-| **`qwen`** | ~15s / 2s warm | macOS only | Yes | Excellent |
+| **`voxtream`** (VoXtream2, 0.5B) | **68s** / 40s warm | **23s** / **19s** warm | Yes (zero-shot) | Excellent |
+| **`qwen-native`** (Qwen3-TTS, 0.6B) | **11m33s** / 3s warm | **48s** (CPU) | Yes | Excellent |
+| **`qwen`** (MLX-Audio) | ~15s / 2s warm | macOS only | Yes | Excellent |

-> `voxtream` cold start includes model download (~500MB) on first run. Subsequent "warm" runs reuse the cached model.
-> `qwen-native` benefits massively from `--features metal` (macOS) or `--features cuda` (Linux).
-> For lowest latency: `say` (macOS) or `kokoro` (all platforms). For best quality + cloning: `voxtream` on GPU.
+**With daemon** (`vox daemon start` — keeps the model server warm):
+
+| Backend | M2 Pro (CPU) | Notes |
+|---------|-------------:|-------|
+| **`voxtream`** | **32s** | Inference is CPU-bound (~25s). On CUDA, the paper reports 74 ms to first packet |
+| **`qwen-native`** | **~3s** | Model stays in RAM via a global Mutex |
+
+> All CUDA benchmarks measured on an RTX 4070 Ti SUPER (16 GB). qwen-native CUDA is not yet supported (requires a cudarc update for CUDA 13.2).
+> For lowest latency: `say` (macOS) or `kokoro`. For best quality + cloning: `voxtream` on CUDA with the daemon.
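The `qwen-native` daemon row says the model stays in RAM via a global Mutex. The keep-warm pattern behind that note can be sketched with `std::sync::OnceLock`; the `Model` type and its `load()` below are stand-ins, not the crate's real API:

```rust
use std::sync::{Mutex, OnceLock};

// Stand-in for an expensive-to-load TTS model.
struct Model {
    name: String,
}

impl Model {
    fn load() -> Model {
        // The expensive load runs only once, on first access.
        Model { name: "qwen3-tts".into() }
    }
}

// Process-global slot: the daemon keeps the model warm between requests.
static MODEL: OnceLock<Mutex<Model>> = OnceLock::new();

fn with_model<R>(f: impl FnOnce(&Model) -> R) -> R {
    let guard = MODEL
        .get_or_init(|| Mutex::new(Model::load()))
        .lock()
        .unwrap();
    f(&guard)
}

fn main() {
    let first = with_model(|m| m.name.clone()); // loads on first access
    let second = with_model(|m| m.name.clone()); // reuses the warm model
    assert_eq!(first, second);
    println!("{first}");
}
```

Every request after the first skips `load()` entirely, which is why the warm daemon path drops to ~3s.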

## Install

4 changes: 2 additions & 2 deletions src/backend/voxtream.rs
@@ -16,7 +16,7 @@ use crate::config;
/// Default prompt audio for voxtream when no voice clone is provided.
/// Generated on first use via macOS `say` or a bundled fallback.
/// Stored in /tmp to avoid paths with spaces (torchaudio PosixPath bug).
-fn default_prompt_audio() -> Result<PathBuf> {
+pub fn default_prompt_audio() -> Result<PathBuf> {
    let path = PathBuf::from("/tmp/vox_voxtream_default_prompt.wav");
    if path.exists() {
        return Ok(path);
@@ -53,7 +53,7 @@ fn default_prompt_audio() -> Result<PathBuf> {
pub struct VoxtreamBackend;

/// Find the voxtream binary — check PATH first, then common venv locations.
-fn find_voxtream() -> Option<PathBuf> {
+pub fn find_voxtream() -> Option<PathBuf> {
    // Check PATH
    if let Ok(status) = Command::new("voxtream")
        .arg("--help")
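Making `find_voxtream` (and `default_prompt_audio`) `pub` lets other modules, such as the daemon, reuse the same lookup. The PATH-then-fallback pattern it implements can be sketched standalone; the function name, `--help` probe, and fallback paths here are illustrative, not the crate's exact code:

```rust
use std::path::PathBuf;
use std::process::Command;

/// Sketch of the lookup pattern: try invoking the binary via PATH,
/// then fall back to a list of known locations (e.g. a venv's bin dir).
fn find_binary(name: &str, fallbacks: &[&str]) -> Option<PathBuf> {
    // PATH check: the binary is usable if `--help` runs successfully.
    let on_path = Command::new(name)
        .arg("--help")
        .output()
        .map(|out| out.status.success())
        .unwrap_or(false);
    if on_path {
        return Some(PathBuf::from(name));
    }
    // Fallbacks: the first candidate that exists on disk wins.
    fallbacks.iter().map(PathBuf::from).find(|p| p.exists())
}

fn main() {
    // `echo` is on PATH everywhere, so this resolves via the PATH branch.
    println!("{:?}", find_binary("echo", &["/opt/venv/bin/echo"]));
}
```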