Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright © 2026 Apple Inc.
//
// Real end-to-end Gemma 4 audio inference. Downloads an audio-capable Gemma 4
// VLM and asks it to transcribe real speech clips, exercising the full audio
// path: AVAssetReader PCM -> mel feature extractor -> Conformer audio tower ->
// begin/end-of-audio prompt splice -> text.
//
// Speech clips are committed under Tests/MLXLMTests/Resources/.
//
// Run:
// xcodebuild test -project IntegrationTesting.xcodeproj \
// -scheme IntegrationTesting -destination 'platform=macOS' \
// -only-testing:IntegrationTestingTests/Gemma4AudioIntegrationTests

import Foundation
import HuggingFace
import IntegrationTestHelpers
import MLXHuggingFace
import MLXLMCommon
import Testing
import Tokenizers

/// File-wide model loader used by the tests below, configured with the
/// hub downloader and Hugging Face tokenizer loader produced by the
/// corresponding macros.
private let models: IntegrationTestModels = .init(
    downloader: #hubDownloader(),
    tokenizerLoader: #huggingFaceTokenizerLoader()
)

/// Filesystem path of `Tests/MLXLMTests/Resources`, resolved relative to this
/// source file: the file name and two enclosing directories are stripped from
/// `#filePath` before the resources sub-path is appended.
private let resources: String = {
    var location = URL(fileURLWithPath: #filePath)
    for _ in 0 ..< 3 {
        location.deleteLastPathComponent()
    }
    location.appendPathComponent("Tests/MLXLMTests/Resources")
    return location.path
}()

/// One audio fixture together with the words a transcription of it is
/// checked against.
struct SpeechCase: Sendable {
    /// File name of the clip inside the test resources directory.
    let file: String
    /// Lower-case words looked for in the model's answer.
    let expected: [String]
}

extension SpeechCase: CustomStringConvertible {
    /// The fixture's file name, used as the display name of the case.
    var description: String {
        return file
    }
}

/// Fixtures fed to the parameterized transcription test, each paired with
/// words expected to appear in the transcription.
private let speechCases: [SpeechCase] = [
    SpeechCase(
        file: "gemma_speech_test.wav",
        expected: ["quick", "brown", "fox", "lazy", "dog", "river"]
    ),
    SpeechCase(
        file: "gemma_speech_long.wav",
        expected: ["weather", "rain", "forecast", "afternoon", "breeze", "evening", "sky"]
    ),
]

/// End-to-end audio checks against the Gemma 4 e4b 4-bit checkpoint.
///
/// Tests in this suite run one at a time (`.serialized`).
@Suite(.serialized)
struct Gemma4AudioIntegrationTests {

    /// Model id shared by every test in this suite.
    private static let modelID = "mlx-community/gemma-4-e4b-it-4bit"

    /// Sends one audio fixture to `model` with a transcription prompt and
    /// returns the model's reply.
    ///
    /// - Parameters:
    ///   - model: Model id passed to `ModelConfiguration(id:)`.
    ///   - clip: Fixture whose `file` is resolved inside `resources`.
    /// - Returns: The text returned by `ChatSession.respond`.
    /// - Throws: When the fixture file does not exist, or when loading the
    ///   model or generating the response fails.
    private func transcribe(model: String, clip: SpeechCase) async throws -> String {
        let url = URL(fileURLWithPath: resources).appendingPathComponent(clip.file)
        // Check the fixture before loading the model so a missing file fails
        // immediately with a message that names the path.
        try #require(
            FileManager.default.fileExists(atPath: url.path),
            "missing audio fixture: \(url.path)")

        let container = try await models.vlmContainer(for: ModelConfiguration(id: model))
        let session = ChatSession(
            container, generateParameters: GenerateParameters(maxTokens: 120, temperature: 0))
        return try await session.respond(
            to: "Transcribe the speech in this audio clip.",
            images: [], videos: [], audios: [.url(url)])
    }

    /// Expects `answer` to contain no `<pad>` token and at least three of the
    /// clip's expected words (case-insensitive substring match).
    private func assertRecovered(_ answer: String, _ clip: SpeechCase) {
        let lower = answer.lowercased()
        #expect(!lower.contains("<pad>"), "audio path regressed to a <pad> wall: \(answer)")
        let hits = clip.expected.filter { lower.contains($0) }
        #expect(
            hits.count >= 3,
            "[\(clip.file)] did not recover the spoken words (matched \(hits) in: \(answer))")
    }

    /// Transcribes each entry of `speechCases` and checks the expected words
    /// are recovered.
    @Test(arguments: speechCases)
    func gemma4_e4b_transcribes(_ clip: SpeechCase) async throws {
        let answer = try await transcribe(model: Self.modelID, clip: clip)
        print("[e4b/\(clip.file)] \(answer)")
        assertRecovered(answer, clip)
    }

    /// Checks that a real (non-TTS) speech clip visibly influences the answer.
    @Test func gemma4_e4b_perceivesRealSpeech() async throws {
        let clip = SpeechCase(file: "gemma_audio_librispeech.wav", expected: [])
        let answer = try await transcribe(model: Self.modelID, clip: clip)
        print("[e4b/librispeech] \(answer)")
        let lower = answer.lowercased()
        #expect(!lower.contains("<pad>"), "audio path regressed to a <pad> wall")

        // Phrases that indicate the model believes it received no audio.
        let noAudioPhrases = ["not provided", "no audio", "haven't provided", "have not provided"]
        #expect(
            !noAudioPhrases.contains(where: { lower.contains($0) }),
            "model claims no audio; audio not reaching the model: \(answer)")

        // One content-word hit is the perception bar, not an ASR-quality bar:
        // the 4-bit QAT checkpoint paraphrases real (non-TTS) speech loosely,
        // and verbatim transcription quality is a model property rather than an
        // integration requirement — this test only guards that real audio
        // reaches the tower and influences the answer.
        let contentWords = ["middle", "class", "welcome", "mr", "mister", "gospel", "apostle"]
        let hits = contentWords.filter { lower.contains($0) }
        #expect(
            hits.count >= 1,
            "did not perceive the real-speech content (matched \(hits) in: \(answer))")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Copyright © 2026 Apple Inc.
//
// Real end-to-end Gemma 4 VIDEO inference. Downloads gemma-4-e4b-it-4bit and
// asks it to describe a real video clip, exercising the PR #256 video tower
// end to end (frame sampling via MediaProcessing → vision tower → text).
//
// Run:
// xcodebuild test -project IntegrationTesting.xcodeproj \
// -scheme IntegrationTesting -destination 'platform=macOS' \
// -only-testing:IntegrationTestingTests/Gemma4VideoIntegrationTests

import Foundation
import HuggingFace
import IntegrationTestHelpers
import MLXHuggingFace
import MLXLMCommon
import Testing
import Tokenizers

/// File-wide model loader used by the video tests below, configured with the
/// hub downloader and Hugging Face tokenizer loader produced by the
/// corresponding macros.
private let models: IntegrationTestModels = .init(
    downloader: #hubDownloader(),
    tokenizerLoader: #huggingFaceTokenizerLoader()
)

// Locate Tests/MLXLMTests/Resources relative to this source file so the suite
// works from any clone / CI checkout rather than one developer's machine.
private let videoResources: URL = {
    var location = URL(fileURLWithPath: #filePath)
    // Strip the file name (-> IntegrationTestingTests/), then
    // IntegrationTestingTests/ (-> IntegrationTesting/), then
    // IntegrationTesting/ (-> repo root).
    for _ in 0 ..< 3 {
        location.deleteLastPathComponent()
    }
    return location.appendingPathComponent("Tests/MLXLMTests/Resources")
}()

/// End-to-end video description checks against the Gemma 4 e4b 4-bit
/// checkpoint.
///
/// Tests in this suite run one at a time (`.serialized`).
@Suite(.serialized)
struct Gemma4VideoIntegrationTests {

    /// Model id shared by every test in this suite.
    private static let modelID = "mlx-community/gemma-4-e4b-it-4bit"

    /// Prompt sent with every clip.
    private static let prompt = "Describe what happens in this video in one or two sentences."

    // The repo already ships a small real clip for VLM tests.
    private static let videoURL = videoResources.appendingPathComponent("1080p_30.mov")

    // Big Buck Bunny (Blender Foundation, CC-BY-3.0) — a real animated outdoor
    // scene. See Resources/FIXTURES_LICENSES.md.
    private static let bbbURL = videoResources.appendingPathComponent("gemma_video_bbb.mp4")

    /// Asks the model to describe the clip at `url` and returns its reply.
    ///
    /// - Parameters:
    ///   - url: Location of the video fixture.
    ///   - maxTokens: Generation cap passed to `GenerateParameters`.
    /// - Returns: The text returned by `ChatSession.respond`.
    /// - Throws: When the fixture file does not exist, or when loading the
    ///   model or generating the response fails.
    private func describe(videoAt url: URL, maxTokens: Int) async throws -> String {
        // Check the fixture before loading the model so a missing file fails
        // immediately with a message that names the path.
        try #require(
            FileManager.default.fileExists(atPath: url.path),
            "missing video fixture: \(url.path)")

        let container = try await models.vlmContainer(
            for: ModelConfiguration(id: Self.modelID))
        let session = ChatSession(
            container,
            generateParameters: GenerateParameters(maxTokens: maxTokens, temperature: 0))
        return try await session.respond(
            to: Self.prompt,
            images: [], videos: [.url(url)], audios: [])
    }

    /// Number of whitespace-separated words in `text`.
    ///
    /// Splits on any whitespace character, so tabs and CRLF line endings
    /// (a single `Character` in Swift) separate words just like spaces and
    /// `"\n"` do.
    private func wordCount(_ text: String) -> Int {
        text.split(whereSeparator: \.isWhitespace).count
    }

    @Test func gemma4_e4b_describesVideo() async throws {
        let answer = try await describe(videoAt: Self.videoURL, maxTokens: 120)

        print("[e4b/video] \(answer)")

        let lower = answer.lowercased()
        // Reject the degenerate <pad>/special-token wall failure mode.
        #expect(!lower.contains("<pad>"), "description is a <pad> wall — video tower not producing usable embeddings")
        // Must be a substantive natural-language description, not a token fragment.
        #expect(
            wordCount(answer) >= 8,
            "description too short to be a real video understanding: \(answer)")
        // The clip is a sequence of solid colour blocks; a correct description
        // should reference colour/blocks/frames. Require at least one such cue.
        let visualCues = [
            "color", "colour", "block", "frame", "screen", "background",
            "blue", "green", "yellow", "magenta", "red",
        ]
        #expect(
            visualCues.contains(where: { lower.contains($0) }),
            "description lacks any visual cue from the clip: \(answer)"
        )
    }

    @Test func gemma4_e4b_describesVideo_bbb() async throws {
        let answer = try await describe(videoAt: Self.bbbURL, maxTokens: 150)

        print("[e4b/video-bbb] \(answer)")
        let lower = answer.lowercased()
        #expect(!lower.contains("<pad>"), "BBB description is a <pad> wall")
        #expect(
            wordCount(answer) >= 8,
            "BBB description too short: \(answer)")
        // Animated outdoor nature scene (rabbit/animal, grass/trees, sky, cartoon).
        let cues = [
            "rabbit", "bunny", "animal", "creature", "animat", "cartoon", "character",
            "grass", "tree", "forest", "field", "nature", "sky", "green", "outdoor",
        ]
        #expect(
            cues.contains(where: { lower.contains($0) }),
            "BBB description lacks any scene cue: \(answer)")
    }
}
9 changes: 8 additions & 1 deletion Libraries/MLXLMCommon/LanguageModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,17 @@ public struct LMInput {

public let samples: MLXArray

/// Optional padding mask over the time axis of `samples`, where `true`
/// marks padding positions (frames the audio encoder should ignore)
/// and `false` marks valid audio. `nil` means all positions are valid.
public let mask: MLXArray?

public init(
samples: MLXArray
samples: MLXArray,
mask: MLXArray? = nil
) {
self.samples = samples
self.mask = mask
}
}

Expand Down
13 changes: 12 additions & 1 deletion Libraries/MLXLMCommon/UserInput.swift
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,21 @@ public struct UserInput {
public var minPixels: Int?
public var maxPixels: Int?

public init(resize: CGSize? = nil, minPixels: Int? = nil, maxPixels: Int? = nil) {
/// Optional per-call cap on the number of frames sampled from each
/// video, for processors that support it. When set, the effective cap
/// is `min(model configured maximum, videoMaxFrames)`; when `nil` the
/// model configuration is used. Memory-constrained callers (e.g. apps
/// near a per-process memory limit) can lower this per request.
public var videoMaxFrames: Int?

public init(
resize: CGSize? = nil, minPixels: Int? = nil, maxPixels: Int? = nil,
videoMaxFrames: Int? = nil
) {
self.resize = resize
self.minPixels = minPixels
self.maxPixels = maxPixels
self.videoMaxFrames = videoMaxFrames
}
}

Expand Down
Loading