diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index ea4b0fce..8047ff60 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -22,5 +22,7 @@ jobs: run: xcodebuild -downloadComponent MetalToolchain - name: Build run: xcodebuild build-for-testing -scheme MLXAudio-Package -destination 'platform=macOS' MACOSX_DEPLOYMENT_TARGET=14.0 CODE_SIGNING_ALLOWED=NO + - name: Build consumer graph fixture + run: swift build --package-path Integration/ConsumerGraphFixture - name: Run tests run: xcodebuild test-without-building -scheme MLXAudio-Package -destination 'platform=macOS' -skip-testing:'MLXAudioTests/SmokeTests' -parallel-testing-enabled NO CODE_SIGNING_ALLOWED=NO diff --git a/Integration/ConsumerGraphFixture/Package.swift b/Integration/ConsumerGraphFixture/Package.swift new file mode 100644 index 00000000..79845c6e --- /dev/null +++ b/Integration/ConsumerGraphFixture/Package.swift @@ -0,0 +1,26 @@ +// swift-tools-version: 6.2 +import PackageDescription + +let package = Package( + name: "ConsumerGraphFixture", + platforms: [.macOS(.v14)], + products: [ + .executable(name: "ConsumerGraphFixture", targets: ["ConsumerGraphFixture"]), + ], + dependencies: [ + .package(path: "../.."), + .package(url: "https://github.com/huggingface/swift-transformers.git", from: "1.3.0"), + .package(url: "https://github.com/ml-explore/mlx-swift-lm.git", branch: "main"), + ], + targets: [ + .executableTarget( + name: "ConsumerGraphFixture", + dependencies: [ + .product(name: "MLXAudioTTS", package: "mlx-audio-swift"), + .product(name: "MLXLMCommon", package: "mlx-swift-lm"), + .product(name: "Hub", package: "swift-transformers"), + .product(name: "Tokenizers", package: "swift-transformers"), + ] + ), + ] +) diff --git a/Integration/ConsumerGraphFixture/README.md b/Integration/ConsumerGraphFixture/README.md new file mode 100644 index 00000000..ca6e7fa0 --- /dev/null +++ b/Integration/ConsumerGraphFixture/README.md @@ -0,0 +1,19 @@ +# Consumer 
Graph Fixture + +This fixture reproduces package-graph behavior that does not show up when building `mlx-audio-swift` by itself. + +It depends on: + +- the local package via `.package(path: "../..")` +- `swift-transformers` from `1.3.0` +- `mlx-swift-lm` from the `main` branch + +It intentionally depends on `MLXAudioTTS`, not just `MLXAudioCodecs`, because the current consumer-graph failure shows up while compiling the higher-level TTS target against the newer `swift-transformers` and `mlx-swift-lm` versions listed above. + +Build it locally with: + +```sh +swift build --package-path Integration/ConsumerGraphFixture +``` + +CI runs the same command as a regression check (the `Build consumer graph fixture` step in `.github/workflows/tests.yaml`). diff --git a/Integration/ConsumerGraphFixture/Sources/ConsumerGraphFixture/main.swift b/Integration/ConsumerGraphFixture/Sources/ConsumerGraphFixture/main.swift new file mode 100644 index 00000000..03e0fd94 --- /dev/null +++ b/Integration/ConsumerGraphFixture/Sources/ConsumerGraphFixture/main.swift @@ -0,0 +1,6 @@ +import Hub +import MLXAudioTTS +import MLXLMCommon +import Tokenizers + +print("ConsumerGraphFixture built successfully") diff --git a/Package.swift b/Package.swift index a24b72c6..73bb728f 100644 --- a/Package.swift +++ b/Package.swift @@ -90,6 +90,7 @@ let package = Package( .product(name: "MLXNN", package: "mlx-swift"), .product(name: "MLXLMCommon", package: "mlx-swift-lm"), .product(name: "HuggingFace", package: "swift-huggingface"), + .product(name: "Tokenizers", package: "swift-transformers"), ], path: "Sources/MLXAudioCodecs" ), diff --git a/Sources/MLXAudioTTS/Models/Chatterbox/ChatterboxModel.swift b/Sources/MLXAudioTTS/Models/Chatterbox/ChatterboxModel.swift index 9c753237..d74be61f 100644 --- a/Sources/MLXAudioTTS/Models/Chatterbox/ChatterboxModel.swift +++ b/Sources/MLXAudioTTS/Models/Chatterbox/ChatterboxModel.swift @@ -68,7 +68,7 @@ public final class ChatterboxModel: Module, SpeechGenerationModel, @unchecked Se // MARK: - State /// Text tokenizer loaded from tokenizer.json. 
- public var tokenizer: Tokenizer? + public var tokenizer: TTSModelTokenizer? /// S3TokenizerV2: converts audio → speech token IDs (loaded separately). public var s3Tokenizer: S3TokenizerV2? diff --git a/Sources/MLXAudioTTS/Models/Llama/LlamaTTS.swift b/Sources/MLXAudioTTS/Models/Llama/LlamaTTS.swift index 9d283cdb..535ecda7 100644 --- a/Sources/MLXAudioTTS/Models/Llama/LlamaTTS.swift +++ b/Sources/MLXAudioTTS/Models/Llama/LlamaTTS.swift @@ -354,7 +354,7 @@ private class LlamaTTSModelInner: Module { public class LlamaTTSModel: Module, KVCacheDimensionProvider, SpeechGenerationModel, @unchecked Sendable { public let vocabularySize: Int public let kvHeads: [Int] - public var tokenizer: Tokenizer? + public var tokenizer: TTSModelTokenizer? public var _snacModel: SNAC? private let model: LlamaTTSModelInner diff --git a/Sources/MLXAudioTTS/Models/Marvis/MarvisTTSModel.swift b/Sources/MLXAudioTTS/Models/Marvis/MarvisTTSModel.swift index c69954a4..906278a9 100644 --- a/Sources/MLXAudioTTS/Models/Marvis/MarvisTTSModel.swift +++ b/Sources/MLXAudioTTS/Models/Marvis/MarvisTTSModel.swift @@ -25,7 +25,7 @@ public final class MarvisTTSModel: Module { private let model: CSMModel private let _promptURLs: [URL]? - private let _textTokenizer: Tokenizer + private let _textTokenizer: TTSModelTokenizer private let _audio_tokenizer: MimiTokenizer private let _streamingDecoder: MimiStreamingDecoder @@ -33,7 +33,7 @@ public final class MarvisTTSModel: Module { config: CSMModelArgs, repoId: String, promptURLs: [URL]? = nil, - textTokenizer: Tokenizer, + textTokenizer: TTSModelTokenizer, audioTokenizer: MimiTokenizer ) { _ = repoId @@ -55,7 +55,7 @@ public final class MarvisTTSModel: Module { promptURLs: [URL]? 
= nil, progressHandler: @Sendable @escaping (Progress) -> Void ) async throws { - let textTokenizer = try await loadTokenizer(configuration: ModelConfiguration(id: repoId), hub: hub) + let textTokenizer = try await AutoTokenizer.from(pretrained: repoId, hubApi: hub) let codec = try await Mimi.fromPretrained(progressHandler: progressHandler) let audioTokenizer = MimiTokenizer(codec) self.init( diff --git a/Sources/MLXAudioTTS/Models/Qwen3/Qwen3.swift b/Sources/MLXAudioTTS/Models/Qwen3/Qwen3.swift index 1bbe98b6..110cfae8 100644 --- a/Sources/MLXAudioTTS/Models/Qwen3/Qwen3.swift +++ b/Sources/MLXAudioTTS/Models/Qwen3/Qwen3.swift @@ -323,7 +323,7 @@ public class Qwen3Model: Module, KVCacheDimensionProvider, SpeechGenerationModel public let vocabularySize: Int public let kvHeads: [Int] - public var tokenizer: Tokenizer? + public var tokenizer: TTSModelTokenizer? public var _snacModel: SNAC? private let model: Qwen3ModelInner diff --git a/Sources/MLXAudioTTS/Models/Qwen3TTS/Qwen3TTS.swift b/Sources/MLXAudioTTS/Models/Qwen3TTS/Qwen3TTS.swift index e5440ce5..e5bbff41 100644 --- a/Sources/MLXAudioTTS/Models/Qwen3TTS/Qwen3TTS.swift +++ b/Sources/MLXAudioTTS/Models/Qwen3TTS/Qwen3TTS.swift @@ -13,7 +13,7 @@ public final class Qwen3TTSModel: Module, SpeechGenerationModel, @unchecked Send let talker: Qwen3TTSTalkerForConditionalGeneration var speakerEncoder: Qwen3TTSSpeakerEncoder? var speechTokenizer: Qwen3TTSSpeechTokenizer? - var tokenizer: Tokenizer? + var tokenizer: TTSModelTokenizer? 
public var sampleRate: Int { config.sampleRate } diff --git a/Sources/MLXAudioTTS/Models/Soprano/Soprano.swift b/Sources/MLXAudioTTS/Models/Soprano/Soprano.swift index c44171a0..a670499f 100644 --- a/Sources/MLXAudioTTS/Models/Soprano/Soprano.swift +++ b/Sources/MLXAudioTTS/Models/Soprano/Soprano.swift @@ -184,7 +184,7 @@ private class SopranoModelInner: Module { public class SopranoModel: Module, KVCacheDimensionProvider, SpeechGenerationModel, @unchecked Sendable { public let vocabularySize: Int public let kvHeads: [Int] - public var tokenizer: Tokenizer? + public var tokenizer: TTSModelTokenizer? private let model: SopranoModelInner let configuration: SopranoConfiguration diff --git a/Sources/MLXAudioTTS/TokenizerTypes.swift b/Sources/MLXAudioTTS/TokenizerTypes.swift new file mode 100644 index 00000000..b3ee73a0 --- /dev/null +++ b/Sources/MLXAudioTTS/TokenizerTypes.swift @@ -0,0 +1,5 @@ +import Tokenizers + +// Both MLXLMCommon and swift-transformers expose a `Tokenizer` protocol. +// TTS model loading uses the swift-transformers tokenizer API explicitly. +public typealias TTSModelTokenizer = any Tokenizers.Tokenizer