From 45cf5ccce897f5c86c812d3c729f2b9ac8e144b0 Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:26:48 +0200
Subject: [PATCH 1/8] Update openai_api_server.py

Allows logprobs for chat completions
---
 fastchat/serve/openai_api_server.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py
index a6ffee96b..cccc3f2b2 100644
--- a/fastchat/serve/openai_api_server.py
+++ b/fastchat/serve/openai_api_server.py
@@ -430,6 +430,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
         presence_penalty=request.presence_penalty,
         frequency_penalty=request.frequency_penalty,
         max_tokens=request.max_tokens,
+        logprobs=request.logprobs,
         echo=False,
         stop=request.stop,
     )
@@ -471,6 +472,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
         choices.append(
             ChatCompletionResponseChoice(
                 index=i,
+                logprobs=create_openai_logprobs(content.get("logprobs", None)),
                 message=ChatMessage(role="assistant", content=content["text"]),
                 finish_reason=content.get("finish_reason", "stop"),
             )

From 9af28691dcaabddcf2ee8457a6c9ed02da6e7e1d Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:27:44 +0200
Subject: [PATCH 2/8] Update openai_api_protocol.py

Allows logprobs for chat completions
---
 fastchat/protocol/openai_api_protocol.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py
index bb50a5ef0..eed5cbf21 100644
--- a/fastchat/protocol/openai_api_protocol.py
+++ b/fastchat/protocol/openai_api_protocol.py
@@ -67,6 +67,7 @@ class ChatCompletionRequest(BaseModel):
     top_k: Optional[int] = -1
     n: Optional[int] = 1
     max_tokens: Optional[int] = None
+    logprobs: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0

From 8672f4f0d6606b6fefa0dbf5bb3000d6fd6ecbee Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:30:44 +0200
Subject: [PATCH 3/8] Update openai_api.md

Explain how to display logprobs
---
 docs/openai_api.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/docs/openai_api.md b/docs/openai_api.md
index 089b500ff..fde33ecaa 100644
--- a/docs/openai_api.md
+++ b/docs/openai_api.md
@@ -63,6 +63,24 @@ completion = openai.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
+### Logprobs
+
+Logprobs are supported with the OpenAI API:
+
+```python
+
+# create a chat completion, requesting logprobs
+completion = openai.chat.completions.create(
+    model=model,
+    messages=[{"role": "user", "content": "Hello! What is your name?"}],
+    logprobs=1
+)
+# print the logprobs of the completion
+print(completion.choices[0].logprobs)
+```
+
+### Streaming
+
 Streaming is also supported. See [test_openai_api.py](../tests/test_openai_api.py). If your api server is behind a proxy you'll need to turn off buffering, you can do so in Nginx by setting `proxy_buffering off;` in the location block for the proxy.
 
 ### cURL

From d39084622ae6221f6e334fa770b418e5016c04ce Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 18:37:39 +0200
Subject: [PATCH 4/8] Explain logprobs use through the OpenAI API

---
 docs/openai_api.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/openai_api.md b/docs/openai_api.md
index fde33ecaa..42b381a2d 100644
--- a/docs/openai_api.md
+++ b/docs/openai_api.md
@@ -164,7 +164,7 @@ export FASTCHAT_WORKER_API_EMBEDDING_BATCH_SIZE=1
 ## Todos
 Some features to be implemented:
 
-- [ ] Support more parameters like `logprobs`, `logit_bias`, `user`, `presence_penalty` and `frequency_penalty`
+- [ ] Support more parameters like `logit_bias`, `user`, `presence_penalty` and `frequency_penalty`
 - [ ] Model details (permissions, owner and create time)
 - [ ] Edits API
 - [ ] Rate Limitation Settings

From 089857a5b6b9f62178332d66628e1021a914821d Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Fri, 28 Jun 2024 08:56:50 +0200
Subject: [PATCH 5/8] Update openai protocol to include logprobs

---
 fastchat/protocol/openai_api_protocol.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py
index eed5cbf21..f575926fd 100644
--- a/fastchat/protocol/openai_api_protocol.py
+++ b/fastchat/protocol/openai_api_protocol.py
@@ -64,10 +64,10 @@ class ChatCompletionRequest(BaseModel):
     ]
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
+    logprobs: Optional[int] = None
     top_k: Optional[int] = -1
     n: Optional[int] = 1
     max_tokens: Optional[int] = None
-    logprobs: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: Optional[bool] = False
     presence_penalty: Optional[float] = 0.0
@@ -82,6 +82,7 @@ class ChatMessage(BaseModel):
 
 class ChatCompletionResponseChoice(BaseModel):
     index: int
+    logprobs: Optional[LogProbs] = None
     message: ChatMessage
     finish_reason: Optional[Literal["stop", "length"]] = None
 

From 7b676482830b486f621b0c67f0cccff31c0db565 Mon Sep 17 00:00:00 2001
From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com>
Date: Fri, 28 Jun 2024 08:57:32 +0200
Subject: [PATCH 6/8] Update openai_api_server.
to include logprobs --- fastchat/serve/openai_api_server.py | 45 +++++++++++++++-------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index cccc3f2b2..be246111d 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -22,7 +22,10 @@ from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer import httpx -from pydantic_settings import BaseSettings +try: + from pydantic.v1 import BaseSettings +except ImportError: + from pydantic import BaseSettings import shortuuid import tiktoken import uvicorn @@ -130,7 +133,7 @@ async def check_api_key( def create_error_response(code: int, message: str) -> JSONResponse: return JSONResponse( - ErrorResponse(message=message, code=code).model_dump(), status_code=400 + ErrorResponse(message=message, code=code).dict(), status_code=400 ) @@ -275,7 +278,7 @@ async def get_gen_params( frequency_penalty: Optional[float], max_tokens: Optional[int], echo: Optional[bool], - logprobs: Optional[int] = None, + logprobs: Optional[int], stop: Optional[Union[str, List[str]]], best_of: Optional[int] = None, use_beam_search: Optional[bool] = None, @@ -316,9 +319,7 @@ async def get_gen_params( if item["type"] == "text" ] - # TODO(chris): This only applies to LLaVA model. Implement an image_token string in the conv template. - text = "\n" * len(image_list) - text += "\n".join(text_list) + text = "\n".join(text_list) conv.append_message(conv.roles[0], (text, image_list)) else: conv.append_message(conv.roles[0], message["content"]) @@ -430,8 +431,8 @@ async def create_chat_completion(request: ChatCompletionRequest): presence_penalty=request.presence_penalty, frequency_penalty=request.frequency_penalty, max_tokens=request.max_tokens, - logprobs=request.logprobs, echo=False, + logprobs=request.logprobs, stop=request.stop, ) @@ -464,6 +465,7 @@ async def create_chat_completion(request: ChatCompletionRequest): return create_error_response(ErrorCode.INTERNAL_ERROR, str(e)) usage = UsageInfo() for i, content in enumerate(all_tasks): + print(content) if isinstance(content, str): content = json.loads(content) @@ -472,14 +474,14 @@ async def create_chat_completion(request: ChatCompletionRequest): choices.append( ChatCompletionResponseChoice( index=i, - logprobs=create_openai_logprobs(content.get("logprobs", None)), message=ChatMessage(role="assistant", content=content["text"]), + logprobs=create_openai_logprobs(content.get("logprobs", None)), finish_reason=content.get("finish_reason", "stop"), ) ) if "usage" in content: - task_usage = UsageInfo.model_validate(content["usage"]) - for usage_key, usage_value in task_usage.model_dump().items(): + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) @@ -504,7 +506,7 @@ async def chat_completion_stream_generator( chunk = ChatCompletionStreamResponse( id=id, choices=[choice_data], model=model_name ) - yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" previous_text = "" async for content in generate_completion_stream(gen_params, worker_addr): @@ -534,10 +536,10 @@ async def chat_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield 
f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.model_dump_json(exclude_none=True)}\n\n" + yield f"data: {finish_chunk.json(exclude_none=True, ensure_ascii=False)}\n\n" yield "data: [DONE]\n\n" @@ -601,6 +603,7 @@ async def create_completion(request: CompletionRequest): choices = [] usage = UsageInfo() for i, content in enumerate(all_tasks): + if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) choices.append( @@ -611,12 +614,12 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.model_validate(content["usage"]) - for usage_key, usage_value in task_usage.model_dump().items(): + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return CompletionResponse( - model=request.model, choices=choices, usage=UsageInfo.model_validate(usage) + model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) ) @@ -672,10 +675,10 @@ async def generate_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.model_dump_json(exclude_unset=True)}\n\n" + yield f"data: {finish_chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" yield "data: [DONE]\n\n" @@ -750,7 +753,7 @@ async def create_embeddings(request: EmbeddingsRequest, model_name: str = None): total_tokens=token_num, completion_tokens=None, ), - ).model_dump(exclude_none=True) + ).dict(exclude_none=True) async def get_embedding(payload: Dict[str, Any]): @@ -867,8 +870,8 @@ async def create_chat_completion(request: APIChatCompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.model_validate(content["usage"]) - for usage_key, usage_value in task_usage.model_dump().items(): + task_usage = UsageInfo.parse_obj(content["usage"]) + for usage_key, usage_value in task_usage.dict().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) From ad2073c8ce114fc511f168cf3fda39c2e8703938 Mon Sep 17 00:00:00 2001 From: BastienLeGuellec <54061033+BastienLeGuellec@users.noreply.github.com> Date: Fri, 28 Jun 2024 09:08:27 +0200 Subject: [PATCH 7/8] Update openai_api_server to include logprobs --- fastchat/serve/openai_api_server.py | 39 +++++++++++++---------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fastchat/serve/openai_api_server.py b/fastchat/serve/openai_api_server.py index be246111d..0df7e8e74 100644 --- a/fastchat/serve/openai_api_server.py +++ b/fastchat/serve/openai_api_server.py @@ -22,10 +22,7 @@ from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer import httpx -try: - from pydantic.v1 import BaseSettings -except ImportError: - from pydantic import BaseSettings +from 
pydantic_settings import BaseSettings import shortuuid import tiktoken import uvicorn @@ -133,7 +130,7 @@ async def check_api_key( def create_error_response(code: int, message: str) -> JSONResponse: return JSONResponse( - ErrorResponse(message=message, code=code).dict(), status_code=400 + ErrorResponse(message=message, code=code).model_dump(), status_code=400 ) @@ -319,7 +316,9 @@ async def get_gen_params( if item["type"] == "text" ] - text = "\n".join(text_list) + # TODO(chris): This only applies to LLaVA model. Implement an image_token string in the conv template. + text = "\n" * len(image_list) + text += "\n".join(text_list) conv.append_message(conv.roles[0], (text, image_list)) else: conv.append_message(conv.roles[0], message["content"]) @@ -465,7 +464,6 @@ async def create_chat_completion(request: ChatCompletionRequest): return create_error_response(ErrorCode.INTERNAL_ERROR, str(e)) usage = UsageInfo() for i, content in enumerate(all_tasks): - print(content) if isinstance(content, str): content = json.loads(content) @@ -480,8 +478,8 @@ async def create_chat_completion(request: ChatCompletionRequest): ) ) if "usage" in content: - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): + task_usage = UsageInfo.model_validate(content["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) @@ -506,7 +504,7 @@ async def chat_completion_stream_generator( chunk = ChatCompletionStreamResponse( id=id, choices=[choice_data], model=model_name ) - yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" previous_text = "" async for content in generate_completion_stream(gen_params, worker_addr): @@ -536,10 +534,10 @@ async def chat_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". 
for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.json(exclude_none=True, ensure_ascii=False)}\n\n" + yield f"data: {finish_chunk.model_dump_json(exclude_none=True)}\n\n" yield "data: [DONE]\n\n" @@ -603,7 +601,6 @@ async def create_completion(request: CompletionRequest): choices = [] usage = UsageInfo() for i, content in enumerate(all_tasks): - if content["error_code"] != 0: return create_error_response(content["error_code"], content["text"]) choices.append( @@ -614,12 +611,12 @@ async def create_completion(request: CompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): + task_usage = UsageInfo.model_validate(content["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return CompletionResponse( - model=request.model, choices=choices, usage=UsageInfo.parse_obj(usage) + model=request.model, choices=choices, usage=UsageInfo.model_validate(usage) ) @@ -675,10 +672,10 @@ async def generate_completion_stream_generator( if content.get("finish_reason", None) is not None: finish_stream_events.append(chunk) continue - yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" # There is not "content" field in the last delta message, so exclude_none to exclude field "content". for finish_chunk in finish_stream_events: - yield f"data: {finish_chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield f"data: {finish_chunk.model_dump_json(exclude_unset=True)}\n\n" yield "data: [DONE]\n\n" @@ -753,7 +750,7 @@ async def create_embeddings(request: EmbeddingsRequest, model_name: str = None): total_tokens=token_num, completion_tokens=None, ), - ).dict(exclude_none=True) + ).model_dump(exclude_none=True) async def get_embedding(payload: Dict[str, Any]): @@ -870,8 +867,8 @@ async def create_chat_completion(request: APIChatCompletionRequest): finish_reason=content.get("finish_reason", "stop"), ) ) - task_usage = UsageInfo.parse_obj(content["usage"]) - for usage_key, usage_value in task_usage.dict().items(): + task_usage = UsageInfo.model_validate(content["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) From 30066674ba82e43c536d9d072e32bbd416de5c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bent=C3=A9geac?= Date: Fri, 28 Jun 2024 14:31:02 +0200 Subject: [PATCH 8/8] fix: file formatting --- fastchat/protocol/openai_api_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/protocol/openai_api_protocol.py b/fastchat/protocol/openai_api_protocol.py index f575926fd..d6cfb0d80 100644 --- a/fastchat/protocol/openai_api_protocol.py +++ b/fastchat/protocol/openai_api_protocol.py @@ -64,7 +64,7 @@ class ChatCompletionRequest(BaseModel): ] temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 - logprobs: Optional[int] = None + logprobs: Optional[int] = None top_k: Optional[int] = -1 n: Optional[int] = 1 max_tokens: Optional[int] = None