[Misc] Update reasoning with stream example to use OpenAI library #14077

Open · wants to merge 2 commits into main · showing changes from 1 commit
@@ -19,73 +19,50 @@
where you want to display chat completions to the user as they are generated
by the model.

-Here we do not use the OpenAI Python client library, because it does not support
-`reasoning_content` fields in the response.
+Remember to check whether `content` and `reasoning_content` exist on each
+`ChatCompletionChunk` delta; either field may be absent, and accessing a
+missing field raises an error.
"""

-import json
-
-import requests
+from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

-models = requests.get(
-    f"{openai_api_base}/models",
-    headers={"Authorization": f"Bearer {openai_api_key}"},
-).json()
-model = models["data"][0]["id"]
+client = OpenAI(
Contributor:

Could we fall back to a requests-based method if `ChoiceDelta` is missing in the Python package?

#14070 (comment)
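A minimal sketch of what such a fallback gate might look like; the detection via Pydantic v2's `model_fields` is an assumption for illustration, not something proposed in this PR:

```python
# Hypothetical fallback gate: check whether the installed openai package
# declares reasoning_content on ChoiceDelta. Stock openai-python does not
# declare it, so this would route to the requests-based path from the old
# version of this example.
from openai.types.chat.chat_completion_chunk import ChoiceDelta

if "reasoning_content" in ChoiceDelta.model_fields:
    print("use the OpenAI client directly")
else:
    print("fall back to requests-based streaming")
```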

@liuyanyi (Contributor, Author) · Mar 3, 2025:

I'm not quite sure about the fallback. I think that comment means `reasoning_content` is not defined in `ChoiceDelta`, but the openai-python package can still handle it correctly.

[screenshot]

In this PR, `hasattr` is used to check whether `content` or `reasoning_content` exists.

Contributor:

I'm curious: what would happen if users were working with an older version of the OpenAI package? Would it cause any compatibility issues or unexpected behavior?

@liuyanyi (Contributor, Author):

I double-checked an old version (openai-python==1.0.0, since the OpenAI client does not exist before 1.0), and everything works fine.
The OpenAI Python package supports extra fields in its Pydantic models:

https://github.com/openai/openai-python/blob/64af9e8f06be4bfe02e0e5e9cb0aa7889a5db6d7/src/openai/_models.py#L86-L90

The key point for preventing errors is to use `hasattr` to check for `reasoning_content`, rather than accessing `chunk.choices[0].delta.reasoning_content` directly.
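A small sketch of that guard as a standalone helper (hypothetical; `getattr` with a default is equivalent to the `hasattr` check used in this PR):

```python
def extract_delta_fields(chunk):
    """Return (reasoning_content, content) from a streamed chunk's delta.

    getattr with a default never raises, unlike direct attribute access,
    so this tolerates openai versions where reasoning_content is not a
    declared field on ChoiceDelta.
    """
    delta = chunk.choices[0].delta
    return (getattr(delta, "reasoning_content", None),
            getattr(delta, "content", None))
```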

Contributor:
LGTM.

+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)

-# Streaming chat completions
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+models = client.models.list()
+model = models.data[0].id

-response = requests.post(
-    f"{openai_api_base}/chat/completions",
-    headers={"Authorization": f"Bearer {openai_api_key}"},
-    json={
-        "model": model,
-        "messages": messages,
-        "stream": True,
-    },
-)
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+stream = client.chat.completions.create(model=model,
+                                        messages=messages,
+                                        stream=True)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
-# Make the streaming request
-if response.status_code == 200:
-    # Process the streaming response
-    for line in response.iter_lines():
-        if line:  # Filter out keep-alive new lines
-            # Decode the line and parse the JSON
-            decoded_line = line.decode("utf-8")
-            if decoded_line.startswith("data:"):
-                data = decoded_line[5:].strip()  # Remove "data:" prefix
-                if data == "[DONE]":  # End of stream
-                    print("\nclient: Stream completed.")
-                    break
-                try:
-                    # Parse the JSON data
-                    chunk = json.loads(data)
-                    reasoning_content = chunk["choices"][0]["delta"].get(
-                        "reasoning_content", "")
-                    content = chunk["choices"][0]["delta"].get("content", "")
-
-                    if reasoning_content:
-                        if not printed_reasoning_content:
-                            printed_reasoning_content = True
-                            print("reasoning_content:", end="", flush=True)
-                        print(reasoning_content, end="", flush=True)
-                    elif content:
-                        if not printed_content:
-                            printed_content = True
-                            print("\ncontent:", end="", flush=True)
-                        # Extract and print the content
-                        print(content, end="", flush=True)
-                except json.JSONDecodeError:
-                    print("Error decoding JSON:", decoded_line)
-else:
-    print(f"Error: {response.status_code} - {response.text}")
+for chunk in stream:
+    reasoning_content = None
+    content = None
+    # Check whether this delta carries reasoning_content or content
+    if hasattr(chunk.choices[0].delta, "reasoning_content"):
+        reasoning_content = chunk.choices[0].delta.reasoning_content
+    elif hasattr(chunk.choices[0].delta, "content"):
+        content = chunk.choices[0].delta.content
+
+    if reasoning_content:
+        if not printed_reasoning_content:
+            printed_reasoning_content = True
+            print("reasoning_content:", end="", flush=True)
+        print(reasoning_content, end="", flush=True)
+    elif content:
+        if not printed_content:
+            printed_content = True
+            print("\ncontent:", end="", flush=True)
+        # Print the content as it arrives
+        print(content, end="", flush=True)
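For reference: the streamed `reasoning_content` field only appears when the vLLM server is launched with a reasoning parser enabled, e.g. something like `vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --enable-reasoning --reasoning-parser deepseek_r1` (flag names may differ between vLLM versions).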