Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Libraries/MLXLMCommon/Evaluate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1227,9 +1227,18 @@ public struct MTPTokenIterator: TokenIteratorProtocol {

let mtpResult = model.callMTP(verifyInput.tokens[.newAxis], cache: cache, mtpCaches: mtpCaches)
guard !mtpResult.isEmpty else { return }

let mainLogits = mtpResult[0]

// Flush the Metal command buffer immediately after the verification forward pass.
// On hybrid SSM/attention models (e.g. Qwen35), the recurrent SSM layers accumulate
// unevaluated graph nodes across rounds. Without an explicit sync here, the Metal
// command buffer grows until it triggers the GPU watchdog.
//
// Force evaluation of only the main logits needed for verification/sampling, so that
// speculative MTP head logits, which may be discarded on rejection, are not evaluated eagerly.
eval(mainLogits)

Comment thread
solderzzc marked this conversation as resolved.
let mainTokens: MLXArray
var mainProcessedLogits = [MLXArray]()
if var verifyProcessor = processor {
Expand Down
Loading