@@ -970,6 +970,7 @@ func handleChatStreaming(
970970 var completionTokenCount = 0
971971 var fullText = " "
972972 var stopped = false
973+ var firstToken = true
973974 for await generation in stream {
974975 if stopped { break }
975976 switch generation {
@@ -980,6 +981,13 @@ func handleChatStreaming(
980981 if completionTokenCount % 8 == 0 {
981982 try ? await Task . sleep ( for: . microseconds( 50 ) )
982983 }
984+ // Real-time stdout: print token text as it arrives (no newline)
985+ if firstToken {
986+ print ( " srv generate: prompt= \( promptTokenCount) t | " , terminator: " " )
987+ firstToken = false
988+ }
989+ print ( text, terminator: " " )
990+ fflush ( stdout)
983991 // ── Stop sequence check ──
984992 if let ( trimmedText, _) = checkStopSequences ( fullText, stopSequences: stopSequences) {
985993 let emittedSoFar = fullText. count - text. count
@@ -1017,7 +1025,8 @@ func handleChatStreaming(
10171025 }
10181026 cont. yield ( " data: [DONE] \n \n " )
10191027 cont. finish ( )
1020- // llama-server style: log full response JSON on one line
1028+ // llama-server style: print newline then full response JSON
1029+ print ( " " ) // end the real-time token stream line
10211030 let dur = Date ( ) . timeIntervalSince ( genStart)
10221031 let tokPerSec = dur > 0 ? Double ( completionTokenCount) / dur : 0
10231032 let logResp : [ String : Any ] = [
@@ -1036,6 +1045,7 @@ func handleChatStreaming(
10361045 if let logData = try ? JSONSerialization . data ( withJSONObject: logResp) ,
10371046 let logStr = String ( data: logData, encoding: . utf8) {
10381047 print ( " srv log_server_r: response: \( logStr) " )
1048+ fflush ( stdout)
10391049 }
10401050 }
10411051 }
@@ -1069,6 +1079,7 @@ func handleChatNonStreaming(
10691079 var collectedToolCalls : [ ToolCallResponse ] = [ ]
10701080 var tcIndex = 0
10711081 var generationStopReason : GenerateStopReason = . stop
1082+ var firstToken = true
10721083 for await generation in stream {
10731084 switch generation {
10741085 case . chunk( let text, _) :
@@ -1078,6 +1089,13 @@ func handleChatNonStreaming(
10781089 if completionTokenCount % 8 == 0 {
10791090 try ? await Task . sleep ( for: . microseconds( 50 ) )
10801091 }
1092+ // Real-time stdout: print token text as it arrives (no newline)
1093+ if firstToken {
1094+ print ( " srv generate: prompt= \( promptTokenCount) t | " , terminator: " " )
1095+ firstToken = false
1096+ }
1097+ print ( text, terminator: " " )
1098+ fflush ( stdout)
10811099 case . toolCall( let tc) :
10821100 let argsJson = serializeToolCallArgs ( tc. function. arguments)
10831101 collectedToolCalls. append ( ToolCallResponse (
@@ -1090,6 +1108,7 @@ func handleChatNonStreaming(
10901108 generationStopReason = info. stopReason
10911109 }
10921110 }
1111+ print ( " " ) // end the real-time token stream line
10931112 let duration = Date ( ) . timeIntervalSince ( genStart)
10941113 await stats. requestFinished ( tokens: completionTokenCount, duration: duration)
10951114 await semaphore. signal ( )
@@ -1145,6 +1164,7 @@ func handleChatNonStreaming(
11451164 // llama-server style: log full response JSON on one line
11461165 if let responseStr = String ( data: encoded, encoding: . utf8) {
11471166 print ( " srv log_server_r: response: \( responseStr) " )
1167+ fflush ( stdout)
11481168 }
11491169 return Response (
11501170 status: . ok,
0 commit comments