Skip to content

Commit f394801

Browse files
committed
feat: real-time token streaming to stdout + fflush
Both streaming and non-streaming handlers now: - Print 'srv generate: prompt=Xt | ' on first token - Print each token text immediately with no newline + fflush(stdout) - Print newline then full response JSON when done - fflush(stdout) after response JSON Output visible in server log while generation is in progress.
1 parent 6ec2297 commit f394801

1 file changed

Lines changed: 21 additions & 1 deletion

File tree

Sources/mlx-server/Server.swift

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -970,6 +970,7 @@ func handleChatStreaming(
970970
var completionTokenCount = 0
971971
var fullText = ""
972972
var stopped = false
973+
var firstToken = true
973974
for await generation in stream {
974975
if stopped { break }
975976
switch generation {
@@ -980,6 +981,13 @@ func handleChatStreaming(
980981
if completionTokenCount % 8 == 0 {
981982
try? await Task.sleep(for: .microseconds(50))
982983
}
984+
// Real-time stdout: print token text as it arrives (no newline)
985+
if firstToken {
986+
print("srv generate: prompt=\(promptTokenCount)t | ", terminator: "")
987+
firstToken = false
988+
}
989+
print(text, terminator: "")
990+
fflush(stdout)
983991
// ── Stop sequence check ──
984992
if let (trimmedText, _) = checkStopSequences(fullText, stopSequences: stopSequences) {
985993
let emittedSoFar = fullText.count - text.count
@@ -1017,7 +1025,8 @@ func handleChatStreaming(
10171025
}
10181026
cont.yield("data: [DONE]\n\n")
10191027
cont.finish()
1020-
// llama-server style: log full response JSON on one line
1028+
// llama-server style: print newline then full response JSON
1029+
print("") // end the real-time token stream line
10211030
let dur = Date().timeIntervalSince(genStart)
10221031
let tokPerSec = dur > 0 ? Double(completionTokenCount) / dur : 0
10231032
let logResp: [String: Any] = [
@@ -1036,6 +1045,7 @@ func handleChatStreaming(
10361045
if let logData = try? JSONSerialization.data(withJSONObject: logResp),
10371046
let logStr = String(data: logData, encoding: .utf8) {
10381047
print("srv log_server_r: response: \(logStr)")
1048+
fflush(stdout)
10391049
}
10401050
}
10411051
}
@@ -1069,6 +1079,7 @@ func handleChatNonStreaming(
10691079
var collectedToolCalls: [ToolCallResponse] = []
10701080
var tcIndex = 0
10711081
var generationStopReason: GenerateStopReason = .stop
1082+
var firstToken = true
10721083
for await generation in stream {
10731084
switch generation {
10741085
case .chunk(let text, _):
@@ -1078,6 +1089,13 @@ func handleChatNonStreaming(
10781089
if completionTokenCount % 8 == 0 {
10791090
try? await Task.sleep(for: .microseconds(50))
10801091
}
1092+
// Real-time stdout: print token text as it arrives (no newline)
1093+
if firstToken {
1094+
print("srv generate: prompt=\(promptTokenCount)t | ", terminator: "")
1095+
firstToken = false
1096+
}
1097+
print(text, terminator: "")
1098+
fflush(stdout)
10811099
case .toolCall(let tc):
10821100
let argsJson = serializeToolCallArgs(tc.function.arguments)
10831101
collectedToolCalls.append(ToolCallResponse(
@@ -1090,6 +1108,7 @@ func handleChatNonStreaming(
10901108
generationStopReason = info.stopReason
10911109
}
10921110
}
1111+
print("") // end the real-time token stream line
10931112
let duration = Date().timeIntervalSince(genStart)
10941113
await stats.requestFinished(tokens: completionTokenCount, duration: duration)
10951114
await semaphore.signal()
@@ -1145,6 +1164,7 @@ func handleChatNonStreaming(
11451164
// llama-server style: log full response JSON on one line
11461165
if let responseStr = String(data: encoded, encoding: .utf8) {
11471166
print("srv log_server_r: response: \(responseStr)")
1167+
fflush(stdout)
11481168
}
11491169
return Response(
11501170
status: .ok,

0 commit comments

Comments
 (0)