diff --git a/Cargo.lock b/Cargo.lock index a3f53cb..4ef7bfd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -428,6 +428,12 @@ dependencies = [ "ug-metal", ] +[[package]] +name = "cassowary" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" + [[package]] name = "castaway" version = "0.2.4" @@ -543,6 +549,20 @@ dependencies = [ "memchr", ] +[[package]] +name = "compact_str" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "static_assertions", +] + [[package]] name = "compact_str" version = "0.9.0" @@ -567,7 +587,7 @@ dependencies = [ "encode_unicode", "libc", "once_cell", - "unicode-width", + "unicode-width 0.2.0", "windows-sys 0.59.0", ] @@ -580,7 +600,7 @@ dependencies = [ "encode_unicode", "libc", "once_cell", - "unicode-width", + "unicode-width 0.2.0", "windows-sys 0.61.2", ] @@ -688,6 +708,31 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags 2.10.0", + "crossterm_winapi", + "mio", + "parking_lot", + "rustix 0.38.44", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.4" @@ -721,8 +766,18 @@ version = "0.20.11" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core 0.23.0", + "darling_macro 0.23.0", ] [[package]] @@ -739,13 +794,37 @@ dependencies = [ "syn", ] +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + [[package]] name = "darling_macro" version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ - "darling_core", + "darling_core 0.20.11", + "quote", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core 0.23.0", "quote", "syn", ] @@ -780,7 +859,7 @@ version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ - "darling", + "darling 0.20.11", "proc-macro2", "quote", "syn", @@ -1000,6 +1079,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foldhash" version = "0.2.0" @@ -1429,6 +1514,17 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.1.5", +] + [[package]] name = "hashbrown" version = "0.16.1" @@ -1437,7 +1533,7 @@ checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.2.0", "serde", "serde_core", ] @@ -1734,7 +1830,7 @@ dependencies = [ "console 0.15.11", "number_prefix", "portable-atomic", - "unicode-width", + "unicode-width 0.2.0", "web-time", ] @@ -1746,11 +1842,33 @@ checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88" dependencies = [ "console 0.16.2", "portable-atomic", - "unicode-width", + "unicode-width 0.2.0", "unit-prefix", "web-time", ] +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "instability" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb2d60ef19920a3a9193c3e371f726ec1dafc045dac788d0fb3704272458971" +dependencies = [ + "darling 0.23.0", + "indoc", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -1909,6 +2027,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.11.0" @@ -1921,12 +2045,30 @@ version = 
"0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + [[package]] name = "mach2" version = "0.4.3" @@ -2021,6 +2163,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", + "log", "wasi", "windows-sys 0.61.2", ] @@ -2420,6 +2563,29 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "paste" version = "1.0.15" @@ -2650,6 +2816,27 @@ dependencies = [ "rand", ] +[[package]] +name = "ratatui" +version = "0.29.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" +dependencies = [ + "bitflags 2.10.0", + "cassowary", + "compact_str 0.8.1", + "crossterm", + "indoc", + "instability", + "itertools 0.13.0", + "lru", + "paste", + "strum", + "unicode-segmentation", + "unicode-truncate", + "unicode-width 0.2.0", +] + [[package]] name = "raw-cpuid" version = "11.6.0" @@ -2705,6 +2892,15 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.10.0", +] + [[package]] name = "redox_users" version = "0.5.2" @@ -2866,6 +3062,19 @@ dependencies = [ "transpose", ] +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags 2.10.0", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.1.3" @@ -2875,7 +3084,7 @@ dependencies = [ "bitflags 2.10.0", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.11.0", "windows-sys 0.61.2", ] @@ -2965,6 +3174,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "security-framework" version = "2.11.1" @@ -3073,6 +3288,37 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name 
= "signal-hook" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b75a19a7a740b25bc7944bdee6172368f988763b744e3d4dfe753f6b4ece40cc" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + [[package]] name = "simd-adler32" version = "0.3.8" @@ -3148,6 +3394,28 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "subtle" version = "2.6.1" @@ -3278,7 +3546,7 @@ dependencies = [ "fastrand", "getrandom 0.3.4", "once_cell", - "rustix", + "rustix 1.1.3", "windows-sys 0.61.2", ] @@ -3370,7 +3638,7 @@ checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223" dependencies = [ "ahash", "aho-corasick", - "compact_str", + "compact_str 0.9.0", "dary_heap", "derive_builder", "esaxx-rs", @@ -3666,11 +3934,28 @@ version = "1.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "unicode-truncate" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3644627a5af5fa321c95b9b235a72fd24cd29c648c2c379431e6628655627bf" +dependencies = [ + "itertools 0.13.0", + "unicode-segmentation", + "unicode-width 0.1.14", +] + [[package]] name = "unicode-width" -version = "0.2.2" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "unicode_categories" @@ -3769,10 +4054,12 @@ dependencies = [ "anyhow", "assert_cmd", "clap", + "crossterm", "dirs", "futures-util", "predicates", "qwen3-tts", + "ratatui", "reqwest", "rodio", "rusqlite", diff --git a/Cargo.toml b/Cargo.toml index 6a79c6b..43ca599 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,8 @@ serde_json = "1" tempfile = "3" rodio = "0.20" qwen3-tts = { git = "https://github.com/TrevorS/qwen3-tts-rs", features = ["hub"], default-features = false } +ratatui = "0.29" +crossterm = "0.28" [dev-dependencies] assert_cmd = "2" diff --git a/README.md b/README.md index 0073ad9..d042664 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,46 @@ # vox -Cross-platform TTS CLI with four backends and MCP server for AI assistants. +Cross-platform TTS CLI with five backends and MCP server for AI assistants. 
``` - vox - | - +--------+--------+--------+--------+ - | | | | - say qwen qwen-native kokoro - (macOS) (MLX/Python) (pure Rust) (pure Rust) - native Apple Silicon CPU/Metal CPU/GPU - /CUDA - | - rodio - (audio playback) + vox + | + +--------+--------+----+----+--------+-----------+ + | | | | | | + say qwen qwen-native kokoro voxtream (TUI) + (macOS) (MLX/Py) (Rust/candle) (ONNX) (zero-shot) vox setup + native Apple Si. CPU/Metal CPU/GPU CUDA/MPS + /CUDA + | + rodio (audio playback) ``` +## Backends + +| Backend | Engine | Voice cloning | Latency (cold) | Latency (warm) | GPU | Platform | +|---------|--------|:---:|---:|---:|:---:|----------| +| `say` | macOS native | No | **3s** | **3s** | No | macOS | +| `kokoro` | ONNX via Python | No | **10s** | **10s** | No | All | +| `qwen-native` | Candle (Rust) | Yes | **11m33s** | ~3s | Metal/CUDA | All | +| `voxtream` | PyTorch 0.5B | Yes | **68s** | ~8s | CUDA/MPS | All | +| `qwen` | MLX-Audio (Python) | Yes | ~15s | ~2s | Apple Neural | macOS | + +### Benchmark — single sentence (~50 chars) + +Real-world measurements. Cold start = first run (includes model loading). Warm = model cached on disk. + +| Backend | M2 Pro (CPU) | RTX 4070 Ti SUPER (CUDA) | Voice cloning | Quality | +|---------|-------------:|-------------------------:|:---:|---------| +| **`say`** | **3s** | macOS only | No | System voices | +| **`kokoro`** | **10s** | ~10s | No | Good | +| **`voxtream`** | **68s** / 8s warm | **44s** / **22s** warm | Yes (zero-shot) | Excellent | +| **`qwen-native`** | **11m33s** / 3s warm | ~30s / ~2s warm | Yes | Excellent | +| **`qwen`** | ~15s / 2s warm | macOS only | Yes | Excellent | + +> `voxtream` cold start includes model download (~500MB) on first run. Subsequent "warm" runs reuse cached model. +> `qwen-native` benefits massively from `--features metal` (macOS) or `--features cuda` (Linux). +> For lowest latency: `say` (macOS) or `kokoro` (all platforms). For best quality + cloning: `voxtream` on GPU. 
+ ## Install ```bash @@ -30,6 +55,17 @@ cargo install --path . --features metal # macOS Apple Silicon cargo install --path . --features cuda # Linux NVIDIA ``` +### VoXtream backend (optional) + +```bash +brew install espeak-ng # macOS (or apt install espeak-ng on Linux) +uv venv ~/.local/venvs/voxtream --python 3.11 +uv pip install --python ~/.local/venvs/voxtream/bin/python "voxtream>=0.2" +# Copy config files +git clone --depth 1 https://github.com/herimor/voxtream.git /tmp/voxtream-repo +mkdir -p ~/.config/vox/voxtream && cp /tmp/voxtream-repo/configs/*.json ~/.config/vox/voxtream/ +``` + | Platform | Default backend | GPU | |----------|----------------|-----| | macOS | `say` | `--features metal` | @@ -41,10 +77,34 @@ Linux requires `sudo apt install libasound2-dev`. ```bash vox "Hello, world." # Speak with default backend -vox -b kokoro -l fr "Bonjour" # Specific backend + language +vox -b voxtream "Zero-shot TTS." # VoXtream2 (fastest neural) +vox -b kokoro -l fr "Bonjour" # Kokoro with language echo "Piped text" | vox # Read from stdin vox --list-voices # List available voices +vox setup # Interactive TUI configuration +``` + +## Interactive setup (TUI) + +For humans — choose backend, voice, language, and style interactively: + +```bash +vox setup +``` + ``` +┌ Backend ──┐┌ Voice ─────┐┌ Lang ┐┌ Style ────┐┌ Config ──────┐ +│> say ││> Samantha ││> en ││> (default)││ Backend: say │ +│ kokoro ││ Thomas ││ fr ││ calm ││ Voice: ... │ +│ qwen-nat ││ Amelie ││ es ││ warm ││ Lang: en │ +│ voxtream ││ ││ de ││ cheerful ││ │ +│ qwen ││ ││ ja ││ ││ [T]est [S]ave│ +└───────────┘└────────────┘└──────┘└──────────┘└──────────────┘ +``` + +Navigate with arrow keys / hjkl, Tab to switch panel, T to test, S to save, Q to quit. + +AI agents use CLI flags instead: `vox -b voxtream -l fr "text"` ## AI assistant integration @@ -63,15 +123,6 @@ Running `vox init` again is safe — it skips files that are already configured.
**CLI mode is recommended** for AI coding agents. Benchmarks show CLI tools are [10-32x cheaper and 100% reliable vs 72% for MCP](https://mariozechner.at/posts/2025-08-15-mcp-vs-cli/) due to MCP's TCP timeout overhead and JSON schema cost per call. -With CLI mode, the agent calls vox directly via Bash — no server, no protocol overhead: - -```bash -# Agent just runs this after completing a task -vox "Fix applied and tests passing." -``` - -MCP mode remains useful for tools that don't have shell access (Cursor, VS Code extensions) or when you need structured tool discovery. - | Mode | Reliability | Token cost | Best for | |------|------------|------------|----------| | **CLI** (`vox init -m cli`) | 100% | Low (Bash call) | Claude Code, Codex, terminal agents | @@ -87,11 +138,13 @@ vox clone list vox clone remove patrick ``` +Works with `qwen`, `qwen-native`, and `voxtream` backends. VoXtream2 uses zero-shot cloning (3-10s audio prompt, no training needed). + ## Preferences ```bash vox config show -vox config set backend kokoro +vox config set backend voxtream vox config set lang fr vox config set voice Chelsie vox config set gender feminine @@ -118,13 +171,14 @@ vox hear -l fr # Speech-to-text only ## Data -All state is stored locally in `~/.config/vox/`: +All state is stored locally — no data sent to external servers (except `vox chat` which uses Claude API). 
``` -~/.config/vox/ - vox.db # SQLite: preferences, voice clones, usage logs - clones/ # Audio files for voice clones - packs/ # Installed sound packs +~/.config/vox/ # or ~/Library/Application Support/vox/ on macOS + vox.db # SQLite: preferences, voice clones, usage logs + clones/ # Audio files for voice clones + packs/ # Installed sound packs + voxtream/ # VoXtream2 config files ``` | Env var | Description | @@ -136,9 +190,9 @@ All state is stored locally in `~/.config/vox/`: | Document | Description | |----------|-------------| -| [Architecture](docs/ARCHITECTURE.md) | Architecture technique, backends, DB schema, protocole MCP, securite | -| [Features](docs/FEATURES.md) | Documentation fonctionnelle de toutes les commandes et fonctionnalites | -| [Guide](docs/GUIDE.md) | Guide utilisateur, installation, demarrage rapide, depannage | +| [Architecture](docs/ARCHITECTURE.md) | Technical architecture, backends, DB schema, MCP protocol, security | +| [Features](docs/FEATURES.md) | All commands and features documented | +| [Guide](docs/GUIDE.md) | Installation, quick start, troubleshooting | ## License diff --git a/src/backend/mod.rs b/src/backend/mod.rs index 04356ad..58e1172 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -1,7 +1,7 @@ //! TTS backend abstraction layer. //! //! Each backend implements `TtsBackend` and is selected at runtime via `get_backend()`. -//! Platform-gated: `say` and `qwen` are macOS-only; `kokoro` and `qwen-native` are cross-platform. +//! Platform-gated: `say` and `qwen` are macOS-only; `kokoro`, `qwen-native`, and `voxtream` are cross-platform. 
pub mod kokoro; #[cfg(target_os = "macos")] @@ -9,6 +9,7 @@ pub mod qwen; pub mod qwen_native; #[cfg(target_os = "macos")] pub mod say; +pub mod voxtream; use anyhow::Result; @@ -39,6 +40,7 @@ pub fn get_backend(name: &str) -> Result> { #[cfg(target_os = "macos")] "qwen" => Ok(Box::new(qwen::QwenBackend)), "qwen-native" => Ok(Box::new(qwen_native::QwenNativeBackend)), + "voxtream" => Ok(Box::new(voxtream::VoxtreamBackend)), #[cfg(not(target_os = "macos"))] "say" | "qwen" => { anyhow::bail!("Backend '{name}' is only available on macOS. Use 'qwen-native' instead.") diff --git a/src/backend/voxtream.rs b/src/backend/voxtream.rs new file mode 100644 index 0000000..239823a --- /dev/null +++ b/src/backend/voxtream.rs @@ -0,0 +1,163 @@ +//! VoXtream2 TTS backend — zero-shot streaming TTS with dynamic speaking rate control. +//! +//! 0.5B param model, 74ms first-packet latency, 4x real-time on consumer GPU. +//! Supports zero-shot voice cloning via audio prompt (3-10s). +//! Requires: `pip install "voxtream>=0.2"` and `espeak-ng`. + +use std::path::PathBuf; +use std::process::{Command, Stdio}; + +use anyhow::{Context, Result}; + +use super::{SpeakOptions, TtsBackend}; +use crate::audio; +use crate::config; + +/// Default prompt audio for voxtream when no voice clone is provided. +/// Generated on first use via macOS `say` or a bundled fallback. +/// Stored in /tmp to avoid paths with spaces (torchaudio PosixPath bug). +fn default_prompt_audio() -> Result { + let path = PathBuf::from("/tmp/vox_voxtream_default_prompt.wav"); + if path.exists() { + return Ok(path); + } + + // Try generating with macOS say + #[cfg(target_os = "macos")] + { + std::fs::create_dir_all(config::config_dir()).ok(); + let status = Command::new("/usr/bin/say") + .arg("-v") + .arg("Samantha") + .arg("-o") + .arg(&*path.to_string_lossy()) + .arg("--data-format=LEI16@16000") + .arg("Hello, my name is Samantha. 
I am testing voice synthesis today.") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status(); + if let Ok(s) = status { + if s.success() && path.exists() { + return Ok(path); + } + } + } + + anyhow::bail!( + "VoXtream2 requires a prompt audio file (3-10s). Provide one via voice clone:\n\ + vox clone add myvoice --audio ~/voice.wav\n\ + vox -b voxtream -v myvoice \"text\"" + ) +} + +pub struct VoxtreamBackend; + +/// Find the voxtream binary — check PATH first, then common venv locations. +fn find_voxtream() -> Option { + // Check PATH + if let Ok(status) = Command::new("voxtream") + .arg("--help") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + { + if status.success() { + return Some(PathBuf::from("voxtream")); + } + } + + // Check common venv locations + let candidates = [ + dirs::home_dir().map(|h| h.join(".local/venvs/voxtream/bin/voxtream")), + dirs::home_dir().map(|h| h.join(".venvs/voxtream/bin/voxtream")), + dirs::home_dir().map(|h| h.join("venvs/voxtream/bin/voxtream")), + ]; + + for candidate in candidates.into_iter().flatten() { + if candidate.exists() { + return Some(candidate); + } + } + + None +} + +impl TtsBackend for VoxtreamBackend { + fn name(&self) -> &str { + "voxtream" + } + + fn speak(&self, text: &str, opts: &SpeakOptions) -> Result<()> { + let bin = find_voxtream().context( + "voxtream not found. 
Install it:\n\ + python3.11 -m venv ~/.local/venvs/voxtream\n\ + ~/.local/venvs/voxtream/bin/pip install \"voxtream>=0.2\"\n\ + brew install espeak-ng", + )?; + + let tmp = tempfile::NamedTempFile::new().context("failed to create temp file")?; + let wav_path = tmp.path().with_extension("wav"); + let wav_str = wav_path.to_string_lossy().to_string(); + + // Config files are stored in ~/.config/vox/voxtream/ + let config_dir = config::config_dir().join("voxtream"); + let generator_config = config_dir.join("generator.json"); + let rate_config = config_dir.join("speaking_rate.json"); + if !generator_config.exists() { + anyhow::bail!( + "VoXtream2 config not found at {}. Clone the repo configs:\n\ + git clone --depth 1 https://github.com/herimor/voxtream.git /tmp/voxtream-repo\n\ + mkdir -p ~/.config/vox/voxtream\n\ + cp /tmp/voxtream-repo/configs/*.json ~/.config/vox/voxtream/", + generator_config.display() + ); + } + + let mut cmd = Command::new(&bin); + cmd.arg("-t").arg(text); + cmd.arg("-o").arg(&wav_str); + cmd.arg("-c").arg(&generator_config); + + // Prompt audio is required — use ref_audio (clone) or generate a default + let prompt_path = match opts.ref_audio { + Some(ref path) => PathBuf::from(path), + None => default_prompt_audio()?, + }; + cmd.arg("-pa").arg(&prompt_path); + + // Always pass speaking rate config (voxtream requires it) + cmd.arg("--spk-rate-config").arg(&rate_config); + + // Speaking rate (syllables per second) + if let Some(rate) = opts.rate { + cmd.arg("-fs"); + cmd.arg("--spk-rate").arg(format!("{}.0", rate)); + } + + let output = cmd + .stdout(Stdio::null()) + .stderr(Stdio::piped()) + .output() + .context("failed to execute voxtream")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("VoXtream2 TTS failed: {stderr}"); + } + + audio::play_wav_blocking(&wav_path)?; + let _ = std::fs::remove_file(&wav_path); + Ok(()) + } + + fn list_voices(&self) -> Result> { + // VoXtream2 is 
zero-shot — any audio prompt works as a "voice" + Ok(vec![ + "(zero-shot: use --voice with a clone name, or provide any audio prompt)".into(), + ]) + } + + fn is_available(&self) -> bool { + find_voxtream().is_some() + } +} diff --git a/src/lib.rs b/src/lib.rs index bb83e3b..3039ad6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,3 +17,4 @@ pub mod mcp; pub mod pack; #[cfg(target_os = "macos")] pub mod stt; +pub mod tui; diff --git a/src/main.rs b/src/main.rs index e30b426..d773e0d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,7 +5,7 @@ use clap::{Parser, Subcommand, ValueEnum}; use vox::backend::{self, SpeakOptions}; use vox::config::DEFAULT_BACKEND; -use vox::{clone, db, init, input, mcp, pack}; +use vox::{clone, db, init, input, mcp, pack, tui}; #[derive(Parser)] #[command(name = "vox", version, about = "Voice Command — read text aloud")] @@ -63,6 +63,8 @@ enum Commands { }, /// Show usage statistics Stats, + /// Interactive voice configuration (TUI for humans) + Setup, /// Set up AI assistant integration (Claude Code + Claude Desktop) Init { /// Integration mode: mcp, cli, skill, or all (default: mcp) @@ -198,6 +200,7 @@ fn main() -> Result<()> { Some(Commands::Clone { action }) => handle_clone(action), Some(Commands::Config { action }) => handle_config(action), Some(Commands::Stats) => handle_stats(), + Some(Commands::Setup) => tui::run(), Some(Commands::Init { mode }) => handle_init(mode), Some(Commands::Serve) => mcp::run_server(), Some(Commands::Pack { action }) => handle_pack(action), @@ -253,8 +256,8 @@ fn handle_speak(cli: Cli) -> Result<()> { { ref_audio = Some(vc.ref_audio); ref_text = vc.ref_text; - // Auto-switch to a qwen backend for voice clones (unless already on one) - if effective_backend != "qwen" && effective_backend != "qwen-native" { + // Auto-switch to a clone-capable backend (unless already on one) + if !["qwen", "qwen-native", "voxtream"].contains(&effective_backend.as_str()) { effective_backend = voice_clone_backend().to_string(); } 
voice = None; // don't pass clone name as --voice diff --git a/src/mcp.rs b/src/mcp.rs index 291c8aa..7936d04 100644 --- a/src/mcp.rs +++ b/src/mcp.rs @@ -196,7 +196,7 @@ fn tool_definitions() -> Value { }, "backend": { "type": "string", - "description": "TTS backend: kokoro (default), say (macOS), qwen (macOS, neural), qwen-native (cross-platform)" + "description": "TTS backend: kokoro, say (macOS), qwen (macOS), qwen-native, voxtream (fastest, zero-shot)" }, "style": { "type": "string", @@ -222,7 +222,7 @@ fn tool_definitions() -> Value { "properties": { "backend": { "type": "string", - "description": "TTS backend: kokoro, say, qwen, qwen-native (defaults to kokoro)" + "description": "TTS backend: kokoro, say, qwen, qwen-native, voxtream" } } } diff --git a/src/tui.rs b/src/tui.rs new file mode 100644 index 0000000..750a99f --- /dev/null +++ b/src/tui.rs @@ -0,0 +1,454 @@ +//! Interactive TUI for human users to configure vox. +//! +//! Launched via `vox setup`. Provides a menu to select backend, voice, language, +//! style, and test speech in real-time. AI agents use CLI flags instead. + +use std::io::{self, Stdout}; + +use anyhow::{Context, Result}; +use crossterm::ExecutableCommand; +use crossterm::event::{self, Event, KeyCode, KeyEventKind}; +use crossterm::terminal::{self, EnterAlternateScreen, LeaveAlternateScreen}; +use ratatui::prelude::*; +use ratatui::widgets::*; + +use crate::backend::{self, SpeakOptions}; +use crate::config; +use crate::db; + +/// All screens in the TUI. 
+#[derive(Clone, Copy, PartialEq)] +enum Screen { + Backend, + Voice, + Language, + Style, + Test, +} + +struct App { + screen: Screen, + backends: Vec<&'static str>, + backend_idx: usize, + voices: Vec, + voice_idx: usize, + languages: Vec<&'static str>, + lang_idx: usize, + styles: Vec<&'static str>, + style_idx: usize, + status: String, + should_quit: bool, +} + +impl App { + fn new() -> Result { + let conn = db::open()?; + let prefs = db::get_preferences(&conn)?; + + #[cfg(target_os = "macos")] + let backends = vec!["say", "kokoro", "qwen", "qwen-native", "voxtream"]; + #[cfg(not(target_os = "macos"))] + let backends = vec!["kokoro", "qwen-native", "voxtream"]; + + let current_backend = prefs.backend.as_deref().unwrap_or(config::DEFAULT_BACKEND); + let backend_idx = backends + .iter() + .position(|b| *b == current_backend) + .unwrap_or(0); + + let languages: Vec<&str> = config::SUPPORTED_LANGS.to_vec(); + let lang_idx = prefs + .lang + .as_deref() + .and_then(|l| languages.iter().position(|x| *x == l)) + .unwrap_or(0); + + let styles = vec![ + "(default)", + "calm", + "energetic", + "warm", + "authoritative", + "cheerful", + "serious", + ]; + let style_idx = prefs + .style + .as_deref() + .and_then(|s| styles.iter().position(|x| *x == s)) + .unwrap_or(0); + + let voices = Self::load_voices(backends[backend_idx]); + + let voice_idx = prefs + .voice + .as_deref() + .and_then(|v| voices.iter().position(|x| x == v)) + .unwrap_or(0); + + Ok(Self { + screen: Screen::Backend, + backends, + backend_idx, + voices, + voice_idx, + languages, + lang_idx, + styles, + style_idx, + status: "Arrow keys to navigate, Enter to select, Tab to switch section, T to test, S to save, Q to quit".into(), + should_quit: false, + }) + } + + fn load_voices(backend_name: &str) -> Vec { + backend::get_backend(backend_name) + .and_then(|b| b.list_voices()) + .unwrap_or_else(|_| vec!["(default)".into()]) + } + + fn selected_backend(&self) -> &str { + self.backends[self.backend_idx] + } + + fn 
selected_lang(&self) -> &str { + self.languages[self.lang_idx] + } + + fn selected_voice(&self) -> Option<&str> { + let v = self.voices.get(self.voice_idx).map(|s| s.as_str())?; + if v.starts_with('(') { None } else { Some(v) } + } + + fn selected_style(&self) -> Option<&str> { + let s = self.styles[self.style_idx]; + if s == "(default)" { None } else { Some(s) } + } + + fn current_list_len(&self) -> usize { + match self.screen { + Screen::Backend => self.backends.len(), + Screen::Voice => self.voices.len(), + Screen::Language => self.languages.len(), + Screen::Style => self.styles.len(), + Screen::Test => 2, // "Speak test" / "Back" + } + } + + fn current_idx(&self) -> usize { + match self.screen { + Screen::Backend => self.backend_idx, + Screen::Voice => self.voice_idx, + Screen::Language => self.lang_idx, + Screen::Style => self.style_idx, + Screen::Test => 0, + } + } + + fn set_idx(&mut self, idx: usize) { + match self.screen { + Screen::Backend => { + self.backend_idx = idx; + self.voices = Self::load_voices(self.backends[idx]); + self.voice_idx = 0; + } + Screen::Voice => self.voice_idx = idx, + Screen::Language => self.lang_idx = idx, + Screen::Style => self.style_idx = idx, + Screen::Test => {} + } + } + + fn move_up(&mut self) { + let idx = self.current_idx(); + if idx > 0 { + self.set_idx(idx - 1); + } + } + + fn move_down(&mut self) { + let idx = self.current_idx(); + let max = self.current_list_len(); + if idx + 1 < max { + self.set_idx(idx + 1); + } + } + + fn next_screen(&mut self) { + self.screen = match self.screen { + Screen::Backend => Screen::Voice, + Screen::Voice => Screen::Language, + Screen::Language => Screen::Style, + Screen::Style => Screen::Test, + Screen::Test => Screen::Backend, + }; + } + + fn prev_screen(&mut self) { + self.screen = match self.screen { + Screen::Backend => Screen::Test, + Screen::Voice => Screen::Backend, + Screen::Language => Screen::Voice, + Screen::Style => Screen::Language, + Screen::Test => Screen::Style, + }; + 
} + + fn test_speak(&mut self) { + self.status = format!("Speaking with {} ...", self.selected_backend()); + let opts = SpeakOptions { + voice: self.selected_voice().map(String::from), + lang: Some(self.selected_lang().to_string()), + style: self.selected_style().map(String::from), + ..Default::default() + }; + let text = match self.selected_lang() { + "fr" => "Bonjour, ceci est un test de synthese vocale.", + "es" => "Hola, esta es una prueba de sintesis de voz.", + "de" => "Hallo, dies ist ein Test der Sprachsynthese.", + "ja" => "こんにちは、これは音声合成のテストです。", + "zh" => "你好,这是语音合成测试。", + _ => "Hello, this is a voice synthesis test.", + }; + match backend::get_backend(self.selected_backend()) { + Ok(b) => match b.speak(text, &opts) { + Ok(()) => self.status = "Test complete.".into(), + Err(e) => self.status = format!("Error: {e}"), + }, + Err(e) => self.status = format!("Backend error: {e}"), + } + } + + fn save(&mut self) -> Result<()> { + let conn = db::open()?; + db::set_preference(&conn, "backend", self.selected_backend())?; + db::set_preference(&conn, "lang", self.selected_lang())?; + if let Some(v) = self.selected_voice() { + db::set_preference(&conn, "voice", v)?; + } + if let Some(s) = self.selected_style() { + db::set_preference(&conn, "style", s)?; + } + self.status = "Preferences saved.".into(); + Ok(()) + } +} + +fn render_list<'a>(title: &'a str, items: &[&str], selected: usize, active: bool) -> List<'a> { + let items: Vec = items + .iter() + .enumerate() + .map(|(i, item)| { + let marker = if i == selected { "> " } else { " " }; + let style = if i == selected && active { + Style::default() + .fg(Color::Yellow) + .add_modifier(Modifier::BOLD) + } else if i == selected { + Style::default().fg(Color::White) + } else { + Style::default().fg(Color::DarkGray) + }; + ListItem::new(format!("{marker}{item}")).style(style) + }) + .collect(); + + let border_style = if active { + Style::default().fg(Color::Cyan) + } else { + Style::default().fg(Color::DarkGray) + }; + 
+ List::new(items).block( + Block::bordered() + .title(format!(" {title} ")) + .border_style(border_style), + ) +} + +fn draw(frame: &mut Frame, app: &App) { + let outer = Layout::vertical([ + Constraint::Length(1), + Constraint::Min(0), + Constraint::Length(3), + ]) + .split(frame.area()); + + // Title + frame.render_widget( + Paragraph::new(" vox setup — interactive voice configuration").style( + Style::default() + .fg(Color::Cyan) + .add_modifier(Modifier::BOLD), + ), + outer[0], + ); + + // Main area: 5 columns + let cols = Layout::horizontal([ + Constraint::Percentage(20), + Constraint::Percentage(25), + Constraint::Percentage(15), + Constraint::Percentage(20), + Constraint::Percentage(20), + ]) + .split(outer[1]); + + // Backend list + let backend_items: Vec<&str> = app.backends.iter().copied().collect(); + frame.render_widget( + render_list( + "Backend", + &backend_items, + app.backend_idx, + app.screen == Screen::Backend, + ), + cols[0], + ); + + // Voice list + let voice_items: Vec<&str> = app.voices.iter().map(|s| s.as_str()).collect(); + frame.render_widget( + render_list( + "Voice", + &voice_items, + app.voice_idx, + app.screen == Screen::Voice, + ), + cols[1], + ); + + // Language list + frame.render_widget( + render_list( + "Language", + &app.languages, + app.lang_idx, + app.screen == Screen::Language, + ), + cols[2], + ); + + // Style list + frame.render_widget( + render_list( + "Style", + &app.styles, + app.style_idx, + app.screen == Screen::Style, + ), + cols[3], + ); + + // Summary + actions + let summary = vec![ + Line::from(vec![ + Span::styled("Backend: ", Style::default().fg(Color::DarkGray)), + Span::styled(app.selected_backend(), Style::default().fg(Color::White)), + ]), + Line::from(vec![ + Span::styled("Voice: ", Style::default().fg(Color::DarkGray)), + Span::styled( + app.selected_voice().unwrap_or("(default)"), + Style::default().fg(Color::White), + ), + ]), + Line::from(vec![ + Span::styled("Lang: ", 
Style::default().fg(Color::DarkGray)), + Span::styled(app.selected_lang(), Style::default().fg(Color::White)), + ]), + Line::from(vec![ + Span::styled("Style: ", Style::default().fg(Color::DarkGray)), + Span::styled( + app.selected_style().unwrap_or("(default)"), + Style::default().fg(Color::White), + ), + ]), + Line::from(""), + Line::from(Span::styled( + "[T] Test [S] Save [Q] Quit", + Style::default().fg(Color::Green), + )), + ]; + + let active_test = app.screen == Screen::Test; + let border_style = if active_test { + Style::default().fg(Color::Cyan) + } else { + Style::default().fg(Color::DarkGray) + }; + + frame.render_widget( + Paragraph::new(summary).block( + Block::bordered() + .title(" Config ") + .border_style(border_style), + ), + cols[4], + ); + + // Status bar + frame.render_widget( + Paragraph::new(app.status.as_str()).block( + Block::bordered() + .title(" Status ") + .border_style(Style::default().fg(Color::DarkGray)), + ), + outer[2], + ); +} + +pub fn run() -> Result<()> { + let mut app = App::new()?; + + terminal::enable_raw_mode().context("failed to enable raw mode")?; + io::stdout() + .execute(EnterAlternateScreen) + .context("failed to enter alternate screen")?; + + let backend = CrosstermBackend::new(io::stdout()); + let mut terminal = Terminal::new(backend).context("failed to create terminal")?; + + let result = run_loop(&mut terminal, &mut app); + + terminal::disable_raw_mode().ok(); + io::stdout().execute(LeaveAlternateScreen).ok(); + + result +} + +fn run_loop(terminal: &mut Terminal>, app: &mut App) -> Result<()> { + loop { + terminal.draw(|f| draw(f, app))?; + + if let Event::Key(key) = event::read()? 
{ + if key.kind != KeyEventKind::Press { + continue; + } + match key.code { + KeyCode::Char('q') | KeyCode::Esc => { + app.should_quit = true; + } + KeyCode::Up | KeyCode::Char('k') => app.move_up(), + KeyCode::Down | KeyCode::Char('j') => app.move_down(), + KeyCode::Tab | KeyCode::Right | KeyCode::Char('l') => app.next_screen(), + KeyCode::BackTab | KeyCode::Left | KeyCode::Char('h') => app.prev_screen(), + KeyCode::Char('t') | KeyCode::Enter if app.screen == Screen::Test => { + app.test_speak(); + } + KeyCode::Char('t') => app.test_speak(), + KeyCode::Char('s') => { + if let Err(e) = app.save() { + app.status = format!("Save error: {e}"); + } + } + KeyCode::Enter => app.next_screen(), + _ => {} + } + } + + if app.should_quit { + return Ok(()); + } + } +}