315 changes: 301 additions & 14 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -26,6 +26,8 @@
serde_json = "1"
tempfile = "3"
rodio = "0.20"
qwen3-tts = { git = "https://github.com/TrevorS/qwen3-tts-rs", features = ["hub"], default-features = false }
ratatui = "0.29"
crossterm = "0.28"

[dev-dependencies]
assert_cmd = "2"
116 changes: 85 additions & 31 deletions README.md
@@ -1,21 +1,46 @@
# vox

Cross-platform TTS CLI with five backends and an MCP server for AI assistants.

```
vox
|
+--------+--------+----+----+--------+-----------+
| | | | | |
say qwen qwen-native kokoro voxtream (TUI)
(macOS) (MLX/Py) (Rust/candle) (ONNX) (zero-shot) vox setup
native Apple Si. CPU/Metal CPU/GPU CUDA/MPS
/CUDA
|
rodio (audio playback)
```

## Backends

| Backend | Engine | Voice cloning | Latency (cold) | Latency (warm) | GPU | Platform |
|---------|--------|:---:|---:|---:|:---:|----------|
| `say` | macOS native | No | **3s** | **3s** | No | macOS |
| `kokoro` | ONNX via Python | No | **10s** | **10s** | No | All |
| `qwen-native` | Candle (Rust) | Yes | **11m33s** | ~3s | Metal/CUDA | All |
| `voxtream` | PyTorch 0.5B | Yes | **68s** | ~8s | CUDA/MPS | All |
| `qwen` | MLX-Audio (Python) | Yes | ~15s | ~2s | Neural Engine | macOS |

### Benchmark — single sentence (~50 chars)

Real-world measurements. Cold start = first run (includes model loading). Warm = model cached on disk.

| Backend | M2 Pro (CPU) | RTX 4070 Ti SUPER (CUDA) | Voice cloning | Quality |
|---------|-------------:|-------------------------:|:---:|---------|
| **`say`** | **3s** | macOS only | No | System voices |
| **`kokoro`** | **10s** | ~10s | No | Good |
| **`voxtream`** | **68s** / 8s warm | **44s** / **22s** warm | Yes (zero-shot) | Excellent |
| **`qwen-native`** | **11m33s** / 3s warm | ~30s / ~2s warm | Yes | Excellent |
| **`qwen`** | ~15s / 2s warm | macOS only | Yes | Excellent |

> `voxtream` cold start includes model download (~500MB) on first run. Subsequent "warm" runs reuse cached model.
> `qwen-native` benefits massively from `--features metal` (macOS) or `--features cuda` (Linux).
> For lowest latency: `say` (macOS) or `kokoro` (all platforms). For best quality + cloning: `voxtream` on GPU.
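
The decision rule in the notes above can be sketched as a tiny hypothetical helper (not part of vox, just the table translated into code):

```rust
// Hypothetical helper (not in vox): pick a backend per the benchmark table.
// `say` and `qwen` rows only apply on macOS; `voxtream` wins on a GPU box
// when cloning matters; `kokoro` is the low-latency cross-platform default.
fn pick_backend(want_cloning: bool, has_gpu: bool) -> &'static str {
    let macos = cfg!(target_os = "macos");
    match (want_cloning, has_gpu, macos) {
        (true, true, _) => "voxtream",     // best quality + zero-shot cloning on GPU
        (true, false, true) => "qwen",     // cloning without a discrete GPU (macOS)
        (true, false, false) => "qwen-native",
        (false, _, true) => "say",         // lowest latency on macOS
        (false, _, false) => "kokoro",     // lowest latency elsewhere
    }
}
```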

## Install

```bash
@@ -30,6 +55,17 @@
cargo install --path . --features metal # macOS Apple Silicon
cargo install --path . --features cuda # Linux NVIDIA
```

### VoXtream backend (optional)

```bash
brew install espeak-ng # macOS (or apt install espeak-ng on Linux)
uv venv ~/.local/venvs/voxtream --python 3.11
uv pip install --python ~/.local/venvs/voxtream/bin/python "voxtream>=0.2"
# Copy config files
git clone --depth 1 https://github.com/herimor/voxtream.git /tmp/voxtream-repo
mkdir -p ~/.config/vox/voxtream
cp /tmp/voxtream-repo/configs/*.json ~/.config/vox/voxtream/
```

| Platform | Default backend | GPU |
|----------|----------------|-----|
| macOS | `say` | `--features metal` |
@@ -41,10 +77,34 @@
Linux requires `sudo apt install libasound2-dev`.

```bash
vox "Hello, world." # Speak with default backend
vox -b voxtream "Zero-shot TTS." # VoXtream2 (fastest neural)
vox -b kokoro -l fr "Bonjour" # Kokoro with language
echo "Piped text" | vox # Read from stdin
vox --list-voices # List available voices
vox setup # Interactive TUI configuration
```
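
The precedence between the positional argument and piped stdin can be sketched like this (an assumption inferred from the examples above, not vox's actual implementation):

```rust
use std::io::Read;

// If text is passed as an argument, use it; otherwise fall back to stdin
// (the `echo "Piped text" | vox` case above).
fn resolve_text(arg: Option<String>) -> std::io::Result<String> {
    match arg {
        Some(text) => Ok(text),
        None => {
            let mut buf = String::new();
            std::io::stdin().read_to_string(&mut buf)?;
            Ok(buf.trim_end().to_string())
        }
    }
}
```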

## Interactive setup (TUI)

For humans — choose backend, voice, language, and style interactively:

```bash
vox setup
```

```
┌ Backend ──┐┌ Voice ─────┐┌ Lang ┐┌ Style ────┐┌ Config ──────┐
│> say ││> Samantha ││> en ││> (default)││ Backend: say │
│ kokoro ││ Thomas ││ fr ││ calm ││ Voice: ... │
│ qwen-nat ││ Amelie ││ es ││ warm ││ Lang: en │
│ voxtream ││ ││ de ││ cheerful ││ │
│ qwen ││ ││ ja ││ ││ [T]est [S]ave│
└───────────┘└────────────┘└──────┘└──────────┘└──────────────┘
```

Navigate with arrow keys / hjkl, Tab to switch panel, T to test, S to save, Q to quit.
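
A stdlib-only sketch of the keymap listed above (the real TUI dispatches crossterm key events; the names here are hypothetical, and arrow keys would be handled as separate event variants):

```rust
#[derive(Debug, PartialEq)]
enum Action {
    Next,        // j
    Prev,        // k
    SwitchPanel, // Tab
    Test,        // T
    Save,        // S
    Quit,        // Q
    Ignore,      // anything else
}

fn keymap(key: char) -> Action {
    match key {
        'j' => Action::Next,
        'k' => Action::Prev,
        '\t' => Action::SwitchPanel,
        't' | 'T' => Action::Test,
        's' | 'S' => Action::Save,
        'q' | 'Q' => Action::Quit,
        _ => Action::Ignore,
    }
}
```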

AI agents use CLI flags instead: `vox -b voxtream -l fr "text"`

## AI assistant integration

@@ -63,15 +123,6 @@
Running `vox init` again is safe — it skips files that are already configured.

**CLI mode is recommended** for AI coding agents. Benchmarks show CLI tools are [10-32x cheaper and 100% reliable vs 72% for MCP](https://mariozechner.at/posts/2025-08-15-mcp-vs-cli/) due to MCP's TCP timeout overhead and JSON schema cost per call.


| Mode | Reliability | Token cost | Best for |
|------|------------|------------|----------|
| **CLI** (`vox init -m cli`) | 100% | Low (Bash call) | Claude Code, Codex, terminal agents |
@@ -87,11 +138,13 @@
vox clone list
vox clone remove patrick
```

Works with `qwen`, `qwen-native`, and `voxtream` backends. VoXtream2 uses zero-shot cloning (3-10s audio prompt, no training needed).
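
A pre-flight check for the 3-10s prompt window could look like this (hypothetical helper; vox may validate differently or not at all):

```rust
// Zero-shot prompts should be 3-10 seconds long, per the note above.
fn prompt_duration_ok(seconds: f32) -> bool {
    (3.0..=10.0).contains(&seconds)
}
```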

## Preferences

```bash
vox config show
vox config set backend kokoro
vox config set backend voxtream
vox config set lang fr
vox config set voice Chelsie
vox config set gender feminine
@@ -118,13 +171,14 @@
vox hear -l fr # Speech-to-text only

## Data

All state is stored locally — no data sent to external servers (except `vox chat` which uses Claude API).

```
~/.config/vox/ # or ~/Library/Application Support/vox/ on macOS
vox.db # SQLite: preferences, voice clones, usage logs
clones/ # Audio files for voice clones
packs/ # Installed sound packs
voxtream/ # VoXtream2 config files
```
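
The per-OS split in the tree above can be sketched as follows (assumption: vox resolves this via a crate like `dirs`; this stdlib-only version takes the home directory as a parameter instead):

```rust
use std::path::{Path, PathBuf};

// ~/Library/Application Support/vox on macOS, ~/.config/vox elsewhere.
fn vox_data_dir(home: &Path) -> PathBuf {
    if cfg!(target_os = "macos") {
        home.join("Library/Application Support/vox")
    } else {
        home.join(".config/vox")
    }
}
```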

| Env var | Description |
@@ -136,9 +190,9 @@

| Document | Description |
|----------|-------------|
| [Architecture](docs/ARCHITECTURE.md) | Technical architecture, backends, DB schema, MCP protocol, security |
| [Features](docs/FEATURES.md) | All commands and features documented |
| [Guide](docs/GUIDE.md) | Installation, quick start, troubleshooting |

## License

4 changes: 3 additions & 1 deletion src/backend/mod.rs
@@ -1,14 +1,15 @@
//! TTS backend abstraction layer.
//!
//! Each backend implements `TtsBackend` and is selected at runtime via `get_backend()`.
//! Platform-gated: `say` and `qwen` are macOS-only; `kokoro`, `qwen-native`, and `voxtream` are cross-platform.

pub mod kokoro;
#[cfg(target_os = "macos")]
pub mod qwen;
pub mod qwen_native;
#[cfg(target_os = "macos")]
pub mod say;
pub mod voxtream;

use anyhow::Result;

@@ -39,6 +40,7 @@
pub fn get_backend(name: &str) -> Result<Box<dyn TtsBackend>> {
        #[cfg(target_os = "macos")]
        "qwen" => Ok(Box::new(qwen::QwenBackend)),
        "qwen-native" => Ok(Box::new(qwen_native::QwenNativeBackend)),
        "voxtream" => Ok(Box::new(voxtream::VoxtreamBackend)),
        #[cfg(not(target_os = "macos"))]
        "say" | "qwen" => {
            anyhow::bail!("Backend '{name}' is only available on macOS. Use 'qwen-native' instead.")
163 changes: 163 additions & 0 deletions src/backend/voxtream.rs
@@ -0,0 +1,163 @@
//! VoXtream2 TTS backend — zero-shot streaming TTS with dynamic speaking rate control.
//!
//! 0.5B param model, 74ms first-packet latency, 4x real-time on consumer GPU.
//! Supports zero-shot voice cloning via audio prompt (3-10s).
//! Requires: `pip install "voxtream>=0.2"` and `espeak-ng`.

use std::path::PathBuf;
use std::process::{Command, Stdio};

use anyhow::{Context, Result};

use super::{SpeakOptions, TtsBackend};
use crate::audio;
use crate::config;

/// Default prompt audio for voxtream when no voice clone is provided.
/// Generated on first use via macOS `say` or a bundled fallback.
/// Stored in /tmp to avoid paths with spaces (torchaudio PosixPath bug).
fn default_prompt_audio() -> Result<PathBuf> {
    let path = PathBuf::from("/tmp/vox_voxtream_default_prompt.wav");
    if path.exists() {
        return Ok(path);
    }

    // Try generating with macOS `say`
    #[cfg(target_os = "macos")]
    {
        std::fs::create_dir_all(config::config_dir()).ok();
        let status = Command::new("/usr/bin/say")
            .arg("-v")
            .arg("Samantha")
            .arg("-o")
            .arg(&*path.to_string_lossy())
            .arg("--data-format=LEI16@16000")
            .arg("Hello, my name is Samantha. I am testing voice synthesis today.")
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .status();
        if let Ok(s) = status {
            if s.success() && path.exists() {
                return Ok(path);
            }
        }
    }

    anyhow::bail!(
        "VoXtream2 requires a prompt audio file (3-10s). Provide one via voice clone:\n\
         vox clone add myvoice --audio ~/voice.wav\n\
         vox -b voxtream -v myvoice \"text\""
    )
}

pub struct VoxtreamBackend;

/// Find the voxtream binary — check PATH first, then common venv locations.
fn find_voxtream() -> Option<PathBuf> {
    // Check PATH
    if let Ok(status) = Command::new("voxtream")
        .arg("--help")
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .status()
    {
        if status.success() {
            return Some(PathBuf::from("voxtream"));
        }
    }

    // Check common venv locations
    let candidates = [
        dirs::home_dir().map(|h| h.join(".local/venvs/voxtream/bin/voxtream")),
        dirs::home_dir().map(|h| h.join(".venvs/voxtream/bin/voxtream")),
        dirs::home_dir().map(|h| h.join("venvs/voxtream/bin/voxtream")),
    ];

    for candidate in candidates.into_iter().flatten() {
        if candidate.exists() {
            return Some(candidate);
        }
    }

    None
}

impl TtsBackend for VoxtreamBackend {
    fn name(&self) -> &str {
        "voxtream"
    }

    fn speak(&self, text: &str, opts: &SpeakOptions) -> Result<()> {
        let bin = find_voxtream().context(
            "voxtream not found. Install it:\n\
             python3.11 -m venv ~/.local/venvs/voxtream\n\
             ~/.local/venvs/voxtream/bin/pip install \"voxtream>=0.2\"\n\
             brew install espeak-ng",
        )?;

        let tmp = tempfile::NamedTempFile::new().context("failed to create temp file")?;
        let wav_path = tmp.path().with_extension("wav");
        let wav_str = wav_path.to_string_lossy().to_string();

        // Config files are stored in ~/.config/vox/voxtream/
        let config_dir = config::config_dir().join("voxtream");
        let generator_config = config_dir.join("generator.json");
        let rate_config = config_dir.join("speaking_rate.json");
        if !generator_config.exists() {
            anyhow::bail!(
                "VoXtream2 config not found at {}. Clone the repo configs:\n\
                 git clone --depth 1 https://github.com/herimor/voxtream.git /tmp/voxtream-repo\n\
                 mkdir -p ~/.config/vox/voxtream\n\
                 cp /tmp/voxtream-repo/configs/*.json ~/.config/vox/voxtream/",
                generator_config.display()
            );
        }

        let mut cmd = Command::new(&bin);
        cmd.arg("-t").arg(text);
        cmd.arg("-o").arg(&wav_str);
        cmd.arg("-c").arg(&generator_config);

        // Prompt audio is required — use ref_audio (clone) or generate a default
        let prompt_path = match opts.ref_audio {
            Some(ref path) => PathBuf::from(path),
            None => default_prompt_audio()?,
        };
        cmd.arg("-pa").arg(&prompt_path);

        // Always pass speaking rate config (voxtream requires it)
        cmd.arg("--spk-rate-config").arg(&rate_config);

        // Speaking rate (syllables per second)
        if let Some(rate) = opts.rate {
            cmd.arg("-fs");
            cmd.arg("--spk-rate").arg(format!("{}.0", rate));
        }

        let output = cmd
            .stdout(Stdio::null())
            .stderr(Stdio::piped())
            .output()
            .context("failed to execute voxtream")?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            anyhow::bail!("VoXtream2 TTS failed: {stderr}");
        }

        audio::play_wav_blocking(&wav_path)?;
        let _ = std::fs::remove_file(&wav_path);
        Ok(())
    }

    fn list_voices(&self) -> Result<Vec<String>> {
        // VoXtream2 is zero-shot — any audio prompt works as a "voice"
        Ok(vec![
            "(zero-shot: use --voice with a clone name, or provide any audio prompt)".into(),
        ])
    }

    fn is_available(&self) -> bool {
        find_voxtream().is_some()
    }
}
1 change: 1 addition & 0 deletions src/lib.rs
@@ -17,3 +17,4 @@
pub mod mcp;
pub mod pack;
#[cfg(target_os = "macos")]
pub mod stt;
pub mod tui;