diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md
index e2c154d31baec..e5c03793f7552 100644
--- a/docs/source/features/reasoning_outputs.md
+++ b/docs/source/features/reasoning_outputs.md
@@ -78,7 +78,55 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
 }
 ```
 
-Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does accept extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
+
+```python
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+stream = client.chat.completions.create(model=model,
+                                        messages=messages,
+                                        stream=True)
+
+print("client: Start streaming chat completions...")
+printed_reasoning_content = False
+printed_content = False
+
+for chunk in stream:
+    reasoning_content = None
+    content = None
+    # Check whether the delta contains reasoning_content or content
+    if hasattr(chunk.choices[0].delta, "reasoning_content"):
+        reasoning_content = chunk.choices[0].delta.reasoning_content
+    elif hasattr(chunk.choices[0].delta, "content"):
+        content = chunk.choices[0].delta.content
+
+    if reasoning_content is not None:
+        if not printed_reasoning_content:
+            printed_reasoning_content = True
+            print("reasoning_content:", end="", flush=True)
+        print(reasoning_content, end="", flush=True)
+    elif content is not None:
+        if not printed_content:
+            printed_content = True
+            print("\ncontent:", end="", flush=True)
+        # Extract and print the content
+        print(content, end="", flush=True)
+```
+
+Remember to check whether `reasoning_content` exists in the response before accessing it. You can also check out the full [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
 
 ## Structured output
 
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index 489bfcd5ec2a2..fe4332576d438 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -19,73 +19,50 @@
 where you want to display chat completions to the user as they are generated
 by the model.
 
-Here we do not use the OpenAI Python client library, because it does not support
-`reasoning_content` fields in the response.
+Remember to check whether `content` and `reasoning_content` exist in the
+`ChatCompletionChunk`; `content` may be missing, and accessing it would fail.
 """
 
-import json
-
-import requests
+from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -models = requests.get( - f"{openai_api_base}/models", - headers={ - "Authorization": f"Bearer {openai_api_key}" - }, -).json() -model = models["data"][0]["id"] +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) -# Streaming chat completions -messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +models = client.models.list() +model = models.data[0].id -response = requests.post( - f"{openai_api_base}/chat/completions", - headers={"Authorization": f"Bearer {openai_api_key}"}, - json={ - "model": model, - "messages": messages, - "stream": True - }, -) +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) print("client: Start streaming chat completions...") printed_reasoning_content = False printed_content = False -# Make the streaming request -if response.status_code == 200: - # Process the streaming response - for line in response.iter_lines(): - if line: # Filter out keep-alive new lines - # Decode the line and parse the JSON - decoded_line = line.decode("utf-8") - if decoded_line.startswith("data:"): - data = decoded_line[5:].strip() # Remove "data:" prefix - if data == "[DONE]": # End of stream - print("\nclient: Stream completed.") - break - try: - # Parse the JSON data - chunk = json.loads(data) - reasoning_content = chunk["choices"][0]["delta"].get( - "reasoning_content", "") - content = chunk["choices"][0]["delta"].get("content", "") - if reasoning_content: - if not printed_reasoning_content: - printed_reasoning_content = True - print("reasoning_content:", end="", flush=True) - print(reasoning_content, end="", flush=True) - elif content: - if not printed_content: - printed_content = True - print("\ncontent:", end="", flush=True) - # Extract and print the content - print(content, end="", flush=True) - except json.JSONDecodeError: - print("Error decoding JSON:", decoded_line) -else: - print(f"Error: {response.status_code} - {response.text}") +for chunk in stream: + reasoning_content = None + content = None + # Check the content is reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + print(content, end="", flush=True)