fix: handle the custom end-of-turn tokens of Gemma 4 and Llama 3 natively as stop sequences to prevent padding loops

solderzzc · solderzzc · commit 5fc3e13b6275 · 2026-04-13T13:18:19.000-07:00
diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift
@@ -1022,7 +1022,7 @@ func handleChatCompletion(
     let temperature = chatReq.temperature.map(Float.init) ?? config.temp
     let topP = chatReq.topP.map(Float.init) ?? config.topP
     let repeatPenalty = chatReq.repetitionPenalty.map(Float.init) ?? config.repeatPenalty
-    let stopSequences = chatReq.stop ?? []
+    let stopSequences = (chatReq.stop ?? []) + ["<end_of_turn>", "<|im_end|>", "<|eot_id|>", "<turn|>", "<|tool_response|>"]
     let includeUsage = chatReq.streamOptions?.includeUsage ?? false
 
     // Log extra sampling params if provided (accepted for API compat, not all are used)
diff --git a/mlx-swift-lm b/mlx-swift-lm
@@ -1 +1 @@
-Subproject commit bc9c956677f714fcf9391e9419a2c47268333e3f
+Subproject commit 112c45e45ee9c8ca9e6696835cbd83f14008f99d