ml-explore · atlascodesai · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/IntegrationTesting/IntegrationTestingTests/Gemma4AudioIntegrationTests.swift b/IntegrationTesting/IntegrationTestingTests/Gemma4AudioIntegrationTests.swift
@@ -0,0 +1,100 @@
+// Copyright © 2026 Apple Inc.
+//
+// Real end-to-end Gemma 4 audio inference. Downloads an audio-capable Gemma 4
+// VLM and asks it to transcribe real speech clips, exercising the full audio
+// path: AVAssetReader PCM -> mel feature extractor -> Conformer audio tower ->
+// begin/end-of-audio prompt splice -> text.
+//
+// Speech clips are committed under Tests/MLXLMTests/Resources/.
+//
+// Run:
+//   xcodebuild test -project IntegrationTesting.xcodeproj \
+//     -scheme IntegrationTesting -destination 'platform=macOS' \
+//     -only-testing:IntegrationTestingTests/Gemma4AudioIntegrationTests
+
+import Foundation
+import HuggingFace
+import IntegrationTestHelpers
+import MLXHuggingFace
+import MLXLMCommon
+import Testing
+import Tokenizers
+
+// Model provider shared by every test in this file.
+private let models = IntegrationTestModels(
+    downloader: #hubDownloader(), tokenizerLoader: #huggingFaceTokenizerLoader()
+)
+
+private let resources: String = {
+    // Climb IntegrationTestingTests/ -> IntegrationTesting/ -> repo root, then descend.
+    var directory = URL(fileURLWithPath: #filePath)
+    for _ in 0 ..< 3 { directory.deleteLastPathComponent() }
+    return directory.appendingPathComponent("Tests/MLXLMTests/Resources").path
+}()
+
+struct SpeechCase: CustomStringConvertible, Sendable {
+    let file: String  // fixture file name inside Tests/MLXLMTests/Resources
+    let expected: [String]  // lowercase keywords searched for in the lowercased answer
+    var description: String { return file }  // a case is identified by its clip name
+}
+
+private let speechCases = [  // fixtures fed to the parameterized transcription test
+    SpeechCase(
+        file: "gemma_speech_test.wav",
+        expected: ["quick", "brown", "fox", "lazy", "dog", "river"]),
+    SpeechCase(
+        file: "gemma_speech_long.wav",
+        expected: ["weather", "rain", "forecast", "afternoon", "breeze", "evening", "sky"]),
+]
+
+@Suite(.serialized)
+struct Gemma4AudioIntegrationTests {
+    private static let checkpoint = "mlx-community/gemma-4-e4b-it-4bit"  // model under test
+
+    /// Has `model` transcribe the fixture named by `clip` (temperature 0, maxTokens 120).
+    private func transcribe(model: String, clip: SpeechCase) async throws -> String {
+        let clipURL = URL(fileURLWithPath: "\(resources)/\(clip.file)")
+        let parameters = GenerateParameters(maxTokens: 120, temperature: 0)
+        let container = try await models.vlmContainer(for: ModelConfiguration(id: model))
+        let chat = ChatSession(container, generateParameters: parameters)
+        return try await chat.respond(
+            to: "Transcribe the speech in this audio clip.",
+            images: [], videos: [], audios: [.url(clipURL)])
+    }
+
+    /// Expects a `<pad>`-free answer that contains at least three of `clip.expected`.
+    private func assertRecovered(_ answer: String, _ clip: SpeechCase) {
+        let text = answer.lowercased()
+        #expect(!text.contains("<pad>"), "audio path regressed to a <pad> wall: \(answer)")
+        let hits = clip.expected.filter { text.contains($0) }
+        #expect(
+            hits.count >= 3,
+            "[\(clip.file)] did not recover the spoken words (matched \(hits) in: \(answer))")
+    }
+
+    @Test(arguments: speechCases)
+    func gemma4_e4b_transcribes(_ clip: SpeechCase) async throws {
+        let answer = try await transcribe(model: Self.checkpoint, clip: clip)
+        print("[e4b/\(clip.file)] \(answer)")
+        assertRecovered(answer, clip)
+    }
+
+    @Test func gemma4_e4b_perceivesRealSpeech() async throws {
+        let clip = SpeechCase(file: "gemma_audio_librispeech.wav", expected: [])
+        let answer = try await transcribe(model: Self.checkpoint, clip: clip)
+        print("[e4b/librispeech] \(answer)")
+        let text = answer.lowercased()
+        #expect(!text.contains("<pad>"), "audio path regressed to a <pad> wall")
+        let denials = ["not provided", "no audio", "haven't provided", "have not provided"]
+        let deniesAudio = denials.contains(where: { text.contains($0) })
+        #expect(!deniesAudio, "model claims no audio; audio not reaching the model: \(answer)")
+        // Perception bar, not an ASR-quality bar: one content-word hit is enough, since the
+        // 4-bit QAT checkpoint paraphrases real (non-TTS) speech loosely and verbatim quality
+        // is a model property; we only guard that audio reaches the tower and affects the answer.
+        let contentWords = ["middle", "class", "welcome", "mr", "mister", "gospel", "apostle"]
+        let hits = contentWords.filter { text.contains($0) }
+        #expect(
+            hits.count >= 1,
+            "did not perceive the real-speech content (matched \(hits) in: \(answer))")
+    }
+}
diff --git a/IntegrationTesting/IntegrationTestingTests/Gemma4VideoIntegrationTests.swift b/IntegrationTesting/IntegrationTestingTests/Gemma4VideoIntegrationTests.swift
@@ -0,0 +1,102 @@
+// Copyright © 2026 Apple Inc.
+//
+// Real end-to-end Gemma 4 VIDEO inference. Downloads gemma-4-e4b-it-4bit and
+// asks it to describe a real video clip, exercising the PR #256 video tower
+// end to end (frame sampling via MediaProcessing → vision tower → text).
+//
+// Run:
+//   xcodebuild test -project IntegrationTesting.xcodeproj \
+//     -scheme IntegrationTesting -destination 'platform=macOS' \
+//     -only-testing:IntegrationTestingTests/Gemma4VideoIntegrationTests
+
+import Foundation
+import HuggingFace
+import IntegrationTestHelpers
+import MLXHuggingFace
+import MLXLMCommon
+import Testing
+import Tokenizers
+
+// Model provider shared by both tests in this file.
+private let models = IntegrationTestModels(
+    downloader: #hubDownloader(), tokenizerLoader: #huggingFaceTokenizerLoader()
+)
+
+// Walk up from this source file (IntegrationTestingTests/ → IntegrationTesting/ → repo
+// root) into Tests/MLXLMTests/Resources, so the suite runs on any clone or CI checkout.
+private let videoResources = (0 ..< 3)
+    .reduce(URL(fileURLWithPath: #filePath)) { ancestor, _ in
+        ancestor.deletingLastPathComponent()
+    }
+    .appendingPathComponent("Tests/MLXLMTests/Resources")
+
+@Suite(.serialized)
+struct Gemma4VideoIntegrationTests {
+
+    // The repo already ships a small real clip for VLM tests.
+    private static let videoURL = videoResources.appendingPathComponent("1080p_30.mov")
+
+    // Big Buck Bunny (Blender Foundation, CC-BY-3.0) — a real animated outdoor
+    // scene. See Resources/FIXTURES_LICENSES.md.
+    private static let bbbURL = videoResources.appendingPathComponent("gemma_video_bbb.mp4")
+
+    /// Loads gemma-4-e4b-it-4bit and asks it to describe `video` in one or two sentences.
+    /// `maxTokens` is passed to `GenerateParameters`; temperature is fixed at 0. Returns
+    /// the answer as received and rethrows any loading or generation error.
+    private func describe(_ video: URL, maxTokens: Int) async throws -> String {
+        let container = try await models.vlmContainer(
+            for: ModelConfiguration(id: "mlx-community/gemma-4-e4b-it-4bit"))
+        let parameters = GenerateParameters(maxTokens: maxTokens, temperature: 0)
+        let session = ChatSession(container, generateParameters: parameters)
+        return try await session.respond(
+            to: "Describe what happens in this video in one or two sentences.",
+            images: [], videos: [.url(video)], audios: [])
+    }
+
+    /// Number of words in `text`, splitting on any whitespace (tabs and CRLF included)
+    /// rather than only " " and "\n", so such separators do not merge adjacent words.
+    private func wordCount(_ text: String) -> Int {
+        text.split(whereSeparator: \.isWhitespace).count
+    }
+
+    @Test func gemma4_e4b_describesVideo() async throws {
+        let answer = try await describe(Self.videoURL, maxTokens: 120)
+        print("[e4b/video] \(answer)")
+
+        let lower = answer.lowercased()
+        // Reject the degenerate <pad>/special-token wall failure mode.
+        #expect(
+            !lower.contains("<pad>"),
+            "description is a <pad> wall — video tower not producing usable embeddings")
+        // Must be a substantive natural-language description, not a token fragment.
+        #expect(
+            wordCount(answer) >= 8,
+            "description too short to be a real video understanding: \(answer)")
+        // The clip is a sequence of solid colour blocks; a correct description
+        // should reference colour/blocks/frames. Require at least one such cue.
+        let visualCues = [
+            "color", "colour", "block", "frame", "screen", "background",
+            "blue", "green", "yellow", "magenta", "red",
+        ]
+        #expect(
+            visualCues.contains(where: { lower.contains($0) }),
+            "description lacks any visual cue from the clip: \(answer)")
+    }
+
+    @Test func gemma4_e4b_describesVideo_bbb() async throws {
+        let answer = try await describe(Self.bbbURL, maxTokens: 150)
+        print("[e4b/video-bbb] \(answer)")
+
+        let lower = answer.lowercased()
+        #expect(!lower.contains("<pad>"), "BBB description is a <pad> wall")
+        #expect(wordCount(answer) >= 8, "BBB description too short: \(answer)")
+        // Animated outdoor nature scene (rabbit/animal, grass/trees, sky, cartoon).
+        let cues = [
+            "rabbit", "bunny", "animal", "creature", "animat", "cartoon", "character",
+            "grass", "tree", "forest", "field", "nature", "sky", "green", "outdoor",
+        ]
+        #expect(
+            cues.contains(where: { lower.contains($0) }),
+            "BBB description lacks any scene cue: \(answer)")
+    }
+}
diff --git a/Libraries/MLXLMCommon/LanguageModel.swift b/Libraries/MLXLMCommon/LanguageModel.swift
@@ -126,10 +126,17 @@ public struct LMInput {
 
         public let samples: MLXArray
 
+        /// Optional padding mask over the time axis of `samples`, where `true`
+        /// marks padding positions (frames the audio encoder should ignore)
+        /// and `false` marks valid audio. `nil` means all positions are valid.
+        public let mask: MLXArray?
+
         public init(
-            samples: MLXArray
+            samples: MLXArray,
+            mask: MLXArray? = nil  // default keeps init(samples:) call sites valid
         ) {
             self.samples = samples
+            self.mask = mask
         }
     }
 

diff --git a/Libraries/MLXLMCommon/UserInput.swift b/Libraries/MLXLMCommon/UserInput.swift
@@ -159,10 +159,21 @@ public struct UserInput {
         public var minPixels: Int?
         public var maxPixels: Int?
 
-        public init(resize: CGSize? = nil, minPixels: Int? = nil, maxPixels: Int? = nil) {
+        /// Optional per-call cap on the number of frames sampled from each
+        /// video, for processors that support it. When set, the effective cap
+        /// is `min(model configured maximum, videoMaxFrames)`; when `nil` the
+        /// model configuration is used. Memory-constrained callers (e.g. apps
+        /// near a per-process memory limit) can lower this per request.
+        public var videoMaxFrames: Int?
+
+        public init(
+            resize: CGSize? = nil, minPixels: Int? = nil, maxPixels: Int? = nil,
+            videoMaxFrames: Int? = nil  // nil: fall back to the model-configured frame cap
+        ) {
             self.resize = resize
             self.minPixels = minPixels
             self.maxPixels = maxPixels
+            self.videoMaxFrames = videoMaxFrames
         }
     }