[Doc] Update reasoning with stream example to use OpenAI library (vllm-project#14077)

Signed-off-by: liuyanyi <[email protected]>
liuyanyi authored Mar 6, 2025
1 parent fa82b93 commit 0ddc991
Showing 2 changed files with 82 additions and 57 deletions.
50 changes: 49 additions & 1 deletion docs/source/features/reasoning_outputs.md
@@ -78,7 +78,55 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
}
```

Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does allow extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:

```python
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False

for chunk in stream:
reasoning_content = None
content = None
    # Check whether the delta carries reasoning_content or regular content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content

if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
```

Remember to check whether `reasoning_content` exists in the response before accessing it. You can also check out the complete [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
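
If you prefer not to branch on `hasattr`, `getattr` with a `None` default is an equivalent defensive pattern. Below is a minimal sketch, assuming the same vLLM server at `http://localhost:8000/v1` as above:

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
model = client.models.list().data[0].id

stream = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    # getattr with a default returns None instead of raising AttributeError
    # when the server does not include the field in this chunk.
    reasoning_content = getattr(delta, "reasoning_content", None)
    content = getattr(delta, "content", None)
    if reasoning_content:
        print(reasoning_content, end="", flush=True)
    elif content:
        print(content, end="", flush=True)
```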

## Structured output

examples/online_serving/openai_chat_completion_with_reasoning_streaming.py

@@ -19,73 +19,50 @@
where you want to display chat completions to the user as they are generated
by the model.
Here we do not use the OpenAI Python client library, because it does not support
`reasoning_content` fields in the response.
Remember to check whether `content` and `reasoning_content` exist in `ChatCompletionChunk`;
`content` may not exist, leading to errors if you try to access it.
"""

import json

import requests
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

models = requests.get(
f"{openai_api_base}/models",
headers={
"Authorization": f"Bearer {openai_api_key}"
},
).json()
model = models["data"][0]["id"]
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

# Streaming chat completions
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
models = client.models.list()
model = models.data[0].id

response = requests.post(
f"{openai_api_base}/chat/completions",
headers={"Authorization": f"Bearer {openai_api_key}"},
json={
"model": model,
"messages": messages,
"stream": True
},
)
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)

print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
# Make the streaming request
if response.status_code == 200:
# Process the streaming response
for line in response.iter_lines():
if line: # Filter out keep-alive new lines
# Decode the line and parse the JSON
decoded_line = line.decode("utf-8")
if decoded_line.startswith("data:"):
data = decoded_line[5:].strip() # Remove "data:" prefix
if data == "[DONE]": # End of stream
print("\nclient: Stream completed.")
break
try:
# Parse the JSON data
chunk = json.loads(data)
reasoning_content = chunk["choices"][0]["delta"].get(
"reasoning_content", "")
content = chunk["choices"][0]["delta"].get("content", "")

if reasoning_content:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
except json.JSONDecodeError:
print("Error decoding JSON:", decoded_line)
else:
print(f"Error: {response.status_code} - {response.text}")
for chunk in stream:
reasoning_content = None
content = None
    # Check whether the delta carries reasoning_content or regular content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content

if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
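
For completeness, the same defensive access applies to non-streaming requests, where (as described earlier in this document) vLLM attaches `reasoning_content` to the message rather than the delta. A minimal sketch under the same assumptions (vLLM server at `http://localhost:8000/v1` with a reasoning parser enabled):

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
model = client.models.list().data[0].id

response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "9.11 and 9.8, which is greater?"}],
)

message = response.choices[0].message
# reasoning_content is an extra attribute on the message, so guard the access.
reasoning_content = getattr(message, "reasoning_content", None)
print("reasoning_content:", reasoning_content)
print("content:", message.content)
```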
