Skip to content

Commit 083d04d

Browse files
Fix chunk loss in the long streaming response with native response field (#8881)
* Fix chunk loss in the long streaming response * minor validation * comment * do not hit buffer yield block when chunk_message is empty * fix condition check --------- Co-authored-by: chenmoneygithub <[email protected]>
1 parent 7f822be commit 083d04d

File tree

3 files changed

+33
-15
lines changed

3 files changed

+33
-15
lines changed

dspy/streaming/streamify.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,11 @@ async def async_streamer(*args, **kwargs):
185185
else:
186186
# We are receiving a chunk from the LM's response stream, delegate it to the listeners to
187187
# determine if we should yield a value to the user.
188-
output = None
189188
for listener in predict_id_to_listener[value.predict_id]:
190-
# There should be at most one listener that provides a return value.
191-
output = listener.receive(value) or output
192-
if output:
193-
yield output
189+
# In some special cases such as Citation API, it is possible that multiple listeners
190+
# return values at the same time due to the chunk buffer of the listener.
191+
if output := listener.receive(value):
192+
yield output
194193
elif isinstance(value, StatusMessage):
195194
yield value
196195
elif isinstance(value, Prediction):

dspy/streaming/streaming_listener.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def receive(self, chunk: ModelResponseStream):
159159
self.field_start_queue = []
160160
return
161161

162-
if self.stream_start:
162+
if self.stream_start and chunk_message:
163163
# The stream is started, we keep returning the token until we see the start of the next field.
164164
token = None
165165
self.field_end_queue.put(chunk_message)
@@ -168,6 +168,7 @@ def receive(self, chunk: ModelResponseStream):
168168
# i.e., "[[ ## {next_field_name} ## ]]" for ChatAdapter to identify the end of the current field.
169169
# In most cases 10 tokens are enough to cover the end_identifier for all adapters.
170170
token = self.field_end_queue.get()
171+
171172
concat_message = "".join(self.field_end_queue.queue).strip()
172173
if re.search(end_identifier, concat_message):
173174
# The next field is identified, we can end the stream and flush out all tokens in the buffer.

tests/streaming/test_streaming.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -949,29 +949,39 @@ def forward(self, documents, question, **kwargs):
949949

950950
async def citation_stream(*args, **kwargs):
951951
# Stream chunks with citation data in provider_specific_fields
952+
# To verify the realistic scenario with more than 10 chunks in the stream, include more than 10 chunks before the citation.
952953
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="[[ ##"))])
953954
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" answer"))])
954955
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" ## ]]\n\n"))])
955-
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="Water"))])
956-
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" boils"))])
957-
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" at"))])
958-
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" 100°C"))])
959-
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="."))])
960-
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="\n\n"))])
961-
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content='[{"type": "char_location", "cited_text": "Water boils at 100°C", "document_index": 0, "document_title": "Physics Facts", "start_char_index": 0, "end_char_index": 19}]'))])
956+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="A"))])
957+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="c"))])
958+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="c"))])
959+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="o"))])
960+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="r"))])
961+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="d"))])
962+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="i"))])
963+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="n"))])
964+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="g"))])
965+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" to "))])
966+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="the references,"))])
962967
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(
963968
content="",
964969
provider_specific_fields={
965970
"citation": {
966971
"type": "char_location",
967-
"cited_text": "Water boils at 100°C",
972+
"cited_text": "water boils at 100°C",
968973
"document_index": 0,
969974
"document_title": "Physics Facts",
970975
"start_char_index": 0,
971976
"end_char_index": 19
972977
}
973978
}
974979
))])
980+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" water"))])
981+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" boils"))])
982+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" at"))])
983+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" 100°C"))])
984+
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="."))])
975985
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="\n\n"))])
976986
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="[[ ##"))])
977987
yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" completed"))])
@@ -982,6 +992,7 @@ async def citation_stream(*args, **kwargs):
982992
program = dspy.streamify(
983993
MyProgram(),
984994
stream_listeners=[
995+
dspy.streaming.StreamListener(signature_field_name="answer"),
985996
dspy.streaming.StreamListener(signature_field_name="citations"),
986997
],
987998
)
@@ -992,10 +1003,13 @@ async def citation_stream(*args, **kwargs):
9921003
with dspy.context(lm=dspy.LM("anthropic/claude-3-5-sonnet-20241022", cache=False), adapter=dspy.ChatAdapter(native_response_types=[Citations])):
9931004
output = program(documents=docs, question="What temperature does water boil?")
9941005
citation_chunks = []
1006+
answer_chunks = []
9951007
final_prediction = None
9961008
async for value in output:
9971009
if isinstance(value, dspy.streaming.StreamResponse) and value.signature_field_name == "citations":
9981010
citation_chunks.append(value)
1011+
elif isinstance(value, dspy.streaming.StreamResponse) and value.signature_field_name == "answer":
1012+
answer_chunks.append(value.chunk)
9991013
elif isinstance(value, dspy.Prediction):
10001014
final_prediction = value
10011015

@@ -1004,10 +1018,14 @@ async def citation_stream(*args, **kwargs):
10041018
citation_chunk = citation_chunks[0]
10051019
assert isinstance(citation_chunk.chunk, Citations)
10061020
assert len(citation_chunk.chunk) == 1
1007-
assert citation_chunk.chunk[0].cited_text == "Water boils at 100°C"
1021+
assert citation_chunk.chunk[0].cited_text == "water boils at 100°C"
10081022
assert citation_chunk.chunk[0].document_title == "Physics Facts"
10091023

1024+
# Verify the answer chunks are correct
1025+
assert "".join(answer_chunks) == "According to the references, water boils at 100°C."
1026+
10101027
# Test that prediction contains the expected fields
10111028
assert final_prediction is not None
10121029
assert hasattr(final_prediction, "answer")
10131030
assert hasattr(final_prediction, "citations")
1031+
assert final_prediction.answer == "According to the references, water boils at 100°C."

0 commit comments

Comments
 (0)