From 45cf5ccce897f5c86c812d3c729f2b9ac8e144b0 Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:26:48 +0200
Subject: [PATCH 1/8] Update openai_api_server.py

Allows logprobs for chat completions
---
 fastchat/serve/openai_api_server.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py
index a6ffee96b..cccc3f2b2 100644
--- a/fastchat/serve/openai_api_server.py
+++ b/fastchat/serve/openai_api_server.py
@@ -430,6 +430,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
         presence_penalty=request.presence_penalty,
         frequency_penalty=request.frequency_penalty,
         max_tokens=request.max_tokens,
+        logprobs=request.logprobs,
         echo=False,
         stop=request.stop,
     )
@@ -471,6 +472,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
         choices.append(
             ChatCompletionResponseChoice(
                 index=i,
+                logprobs=create_openai_logprobs(content.get("logprobs", None)),
                 message=ChatMessage(role="assistant", content=content["text"]),
                 finish_reason=content.get("finish_reason", "stop"),
             )

From 9af28691dcaabddcf2ee8457a6c9ed02da6e7e1d Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:27:44 +0200
Subject: [PATCH 2/8] Update openai_api_protocol.py

Allows logprobs for chat completions
---
 fastchat/protocol/openai_api_protocol.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py
index bb50a5ef0..eed5cbf21 100644
--- a/fastchat/protocol/openai_api_protocol.py
+++ b/fastchat/protocol/openai_api_protocol.py
@@ -67,6 +67,7 @@ class ChatCompletionRequest(BaseModel):
     top_k: Optional[int] = -1
     n: Optional[int] = 1
     max_tokens: Optional[int] = None
+    logprobs: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0

From 8672f4f0d6606b6fefa0dbf5bb3000d6fd6ecbee Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:30:44 +0200
Subject: [PATCH 3/8] Update openai_api.md

Explain how to display logprobs
---
 docs/openai_api.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/docs/openai_api.md b/docs/openai_api.md
index 089b500ff..fde33ecaa 100644
--- a/docs/openai_api.md
+++ b/docs/openai_api.md
@@ -63,6 +63,24 @@ completion = openai.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
+### Logprobs
+
+Logprobs are supported with the OpenAI API:
+
+```python
+
+# create a chat completion, requesting logprobs
+completion = openai.chat.completions.create(
+    model=model,
+    messages=[{"role": "user", "content": "Hello! What is your name?"}],
+    logprobs=1
+)
+# print the logprobs of the completion
+print(completion.choices[0].logprobs)
+```
+
+### Streaming
+
 Streaming is also supported. See [test_openai_api.py](../tests/test_openai_api.py). If your api server is behind a proxy you'll need to turn off buffering, you can do so in Nginx by setting `proxy_buffering off;` in the location block for the proxy.
 
 ### cURL

From d39084622ae6221f6e334fa770b418e5016c04ce Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:37:39 +0200
Subject: [PATCH 4/8] Explain logprobs use through the OpenAI API

---
 docs/openai_api.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/openai_api.md b/docs/openai_api.md
index fde33ecaa..42b381a2d 100644
--- a/docs/openai_api.md
+++ b/docs/openai_api.md
@@ -164,7 +164,7 @@ export FASTCHAT_WORKER_API_EMBEDDING_BATCH_SIZE=1
 ## Todos
 Some features to be implemented:
 
-- [ ] Support more parameters like `logprobs`, `logit_bias`, `user`, `presence_penalty` and `frequency_penalty`
+- [ ] Support more parameters like `logit_bias`, `user`, `presence_penalty` and `frequency_penalty`
 - [ ] Model details (permissions, owner and create time)
 - [ ] Edits API
 - [ ] Rate Limitation Settings

From 089857a5b6b9f62178332d66628e1021a914821d Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Fri, 28 Jun 2024 08:56:50 +0200
Subject: [PATCH 5/8] Update openai protocol to include logprobs

---
 fastchat/protocol/openai_api_protocol.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py
index eed5cbf21..f575926fd 100644
--- a/fastchat/protocol/openai_api_protocol.py
+++ b/fastchat/protocol/openai_api_protocol.py
@@ -64,10 +64,10 @@ class ChatCompletionRequest(BaseModel):
     ]
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
+    logprobs: Optional[int] = None
     top_k: Optional[int] = -1
     n: Optional[int] = 1
     max_tokens: Optional[int] = None
-    logprobs: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0
@@ -82,6 +82,7 @@ class ChatMessage(BaseModel):
 
 class ChatCompletionResponseChoice(BaseModel):
     index: int
+    logprobs: Optional[LogProbs] = None
     message: ChatMessage
     finish_reason: Optional[Literal["stop", "length"]] = None
 

From 7b676482830b486f621b0c67f0cccff31c0db565 Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Fri, 28 Jun 2024 08:57:32 +0200
Subject: [PATCH 6/8] Update openai_api_server.
to include logprobs --- fastchat/serve/openai_api_server.py | 45 +++++++++++++++-------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index cccc3f2b2..be246111d 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -22,7 +22,10 @@ from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer import httpx -from pydantic_settings import BaseSettings +try: + from pydantic.v1 import BaseSettings +except ImportError: + from pydantic import BaseSettings import shortuuid import tiktoken import uvicorn @@ -130,7 +133,7 @@ async def check_api_key( def create_error_response(code: int, message: str) -> JSONResponse: return JSONResponse( - ErrorResponse(message=message, code=code).model_dump(), status_code=400 + ErrorResponse(message=message, code=code).dict(), status_code=400 ) @@ -275,7 +278,7 @@ async def get_gen_params( frequency_penalty: Optional[float], max_tokens: Optional[int], echo: Optional[bool], - logprobs: Optional[int] = None, + logprobs: Optional[int], stop: Optional[Union[str, List[str]]], best_of: Optional[int] = None, use_beam_search: Optional[bool] = None, @@ -316,9 +319,7 @@ async def get_gen_params( if item["type"] == "text" ] - # TODO(chris): This only applies to LLaVA model. Implement an image_token string in the conv template. - text = "\n" * len(image_list) - text += "\n".join(text_list) + text = "\n".join(text_list) conv.append_message(conv.roles[0], (text, image_list)) else: conv.append_message(conv.roles[0], message["content"]) @@ -430,8 +431,8 @@ async def create_chat_completion(request: ChatCompletionRequest): presence_penalty=request.presence_penalty, frequency_penalty=request.frequency_penalty, max_tokens=request.max_tokens, - logprobs=request.logprobs, echo=False, + logprobs=request.logprobs, stop=request.stop, ) @@ -464,6 +465,7 @@ async def create_chat_completion(request: ChatCompletionRequest): return create_error_response(ErrorCode.INTERNAL_ERROR, str(e)) usage = UsageInfo() for i, content in enumerate(all_tasks): + print(content) if isinstance(content, str): content = json.loads(content) @@ -472,14 +474,14 @@ async def create_chat_completion(request: ChatCompletionRequest): choices.append( ChatCompletionResponseChoice( index=i, - logprobs=create_openai_logprobs(content.get("logprobs", None)), message=ChatMessage(role="assistant", content=content["text"]), + logprobs=create_openai_logprobs(content.get("logprobs", None)), finish_reason=content.get("finish_reason", "stop"), ) ) if "usage" in content: - task_usage = UsageInfo.model_validate(content["usage"]) - for usage_key, usage_value in task_usage.model_dump().items(): + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) @@ -504,7 +506,7 @@ async def chat_completion_stream_generator( chunk = ChatCompletionStreamResponse( id=id, choices=[choice_data], model=model_name ) - yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" previous_text = "" async for content in generate_completion_stream(gen_params, worker_addr): @@ -534,10 +536,10 @@ async def chat_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield 
f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.model_dump_json(exclude_none=True)}\n\n" + yield f"data: {finish_chunk.json(exclude_none=True, ensure_ascii=False)}\n\n" yield "data: [DONE]\n\n" @@ -601,6 +603,7 @@ async def create_completion(request: CompletionRequest): choices = [] usage = UsageInfo() for i, content in enumerate(all_tasks): + if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) choices.append( @@ -611,12 +614,12 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.model_validate(content["usage"]) - for usage_key, usage_value in task_usage.model_dump().items(): + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return CompletionResponse( - model=request.model, choices=choices, usage=UsageInfo.model_validate(usage) + model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) ) @@ -672,10 +675,10 @@ async def generate_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {finish_chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" yield "data: [DONE]\n\n" @@ -750,7 +753,7 @@ async def create_embeddings(request: EmbeddingsRequest, model_name: str = None): total_tokens=token_num, completion_tokens=None, ), - ).model_dump(exclude_none=True) + ).dict(exclude_none=True) async def get_embedding(payload: Dict[str, Any]): @@ -867,8 +870,8 @@ async def create_chat_completion(request: APIChatCompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.model_validate(content["usage"]) - for usage_key, usage_value in task_usage.model_dump().items(): + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) From ad2073c8ce114fc511f168cf3fda39c2e8703938 Mon Sep 17 00:00:00 2001 From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com> Date: Fri, 28 Jun 2024 09:08:27 +0200 Subject: [PATCH 7/8] Update openai_api_server to include logprobs --- fastchat/serve/openai_api_server.py | 39 +++++++++++++---------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index be246111d..0df7e8e74 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -22,10 +22,7 @@ from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer import httpx -try: - from pydantic.v1 import BaseSettings -except ImportError: - from pydantic import BaseSettings +from 
pydantic_settings import BaseSettings import shortuuid import tiktoken import uvicorn @@ -133,7 +130,7 @@ async def check_api_key( def create_error_response(code: int, message: str) -> JSONResponse: return JSONResponse( - ErrorResponse(message=message, code=code).dict(), status_code=400 + ErrorResponse(message=message, code=code).model_dump(), status_code=400 ) @@ -319,7 +316,9 @@ async def get_gen_params( if item["type"] == "text" ] - text = "\n".join(text_list) + # TODO(chris): This only applies to LLaVA model. Implement an image_token string in the conv template. + text = "\n" * len(image_list) + text += "\n".join(text_list) conv.append_message(conv.roles[0], (text, image_list)) else: conv.append_message(conv.roles[0], message["content"]) @@ -465,7 +464,6 @@ async def create_chat_completion(request: ChatCompletionRequest): return create_error_response(ErrorCode.INTERNAL_ERROR, str(e)) usage = UsageInfo() for i, content in enumerate(all_tasks): - print(content) if isinstance(content, str): content = json.loads(content) @@ -480,8 +478,8 @@ async def create_chat_completion(request: ChatCompletionRequest): ) ) if "usage" in content: - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): + task_usage = UsageInfo.model_validate(content["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) @@ -506,7 +504,7 @@ async def chat_completion_stream_generator( chunk = ChatCompletionStreamResponse( id=id, choices=[choice_data], model=model_name ) - yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" previous_text = "" async for content in generate_completion_stream(gen_params, worker_addr): @@ -536,10 +534,10 @@ async def chat_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". 
for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.json(exclude_none=True, ensure_ascii=False)}\n\n" + yield f"data: {finish_chunk.model_dump_json(exclude_none=True)}\n\n" yield "data: [DONE]\n\n" @@ -603,7 +601,6 @@ async def create_completion(request: CompletionRequest): choices = [] usage = UsageInfo() for i, content in enumerate(all_tasks): - if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) choices.append( @@ -614,12 +611,12 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): + task_usage = UsageInfo.model_validate(content["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return CompletionResponse( - model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) + model=request.model, choices=choices, usage=UsageInfo.model_validate(usage) ) @@ -675,10 +672,10 @@ async def generate_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {finish_chunk.model_dump_json(exclude_unset=True)}\n\n" yield "data: [DONE]\n\n" @@ -753,7 +750,7 @@ async def create_embeddings(request: EmbeddingsRequest, model_name: str = None): total_tokens=token_num, completion_tokens=None, ), - ).dict(exclude_none=True) + ).model_dump(exclude_none=True) async def get_embedding(payload: Dict[str, Any]): @@ -870,8 +867,8 @@ async def create_chat_completion(request: APIChatCompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): + task_usage = UsageInfo.model_validate(content["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) From 30066674ba82e43c536d9d072e32bbd416de5c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bent=C3=A9geac?= Date: Fri, 28 Jun 2024 14:31:02 +0200 Subject: [PATCH 8/8] fix: file formatting --- fastchat/protocol/openai_api_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py index f575926fd..d6cfb0d80 100644 --- a/fastchat/protocol/openai_api_protocol.py +++ b/fastchat/protocol/openai_api_protocol.py @@ -64,7 +64,7 @@ class ChatCompletionRequest(BaseModel): ] temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 - logprobs: Optional[int] = None + logprobs: Optional[int] = None top_k: Optional[int] = -1 n: Optional[int] = 1 max_tokens: Optional[int] = None