From 90fa384117ab1827872d8661498dbeb121505ef3 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Wed, 25 Dec 2024 22:15:11 +0800
Subject: [PATCH] update docstring

---
 lmdeploy/serve/proxy/proxy.py | 43 +++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/lmdeploy/serve/proxy/proxy.py b/lmdeploy/serve/proxy/proxy.py
index 76a7130f7..392ede326 100644
--- a/lmdeploy/serve/proxy/proxy.py
+++ b/lmdeploy/serve/proxy/proxy.py
@@ -422,28 +422,50 @@ async def chat_completions_v1(request: ChatCompletionRequest,
 
     The request should be a JSON object with the following fields:
     - model: model name. Available from /v1/models.
-    - messages: string prompt or chat history in OpenAI format. A example
-        for chat history is `[{"role": "user", "content":"knock knock"}]`.
+    - messages: string prompt or chat history in OpenAI format. Chat history
+        example: `[{"role": "user", "content": "hi"}]`.
     - temperature (float): to modulate the next token probability
     - top_p (float): If set to float < 1, only the smallest set of most
         probable tokens with probabilities that add up to top_p or higher
         are kept for generation.
     - n (int): How many chat completion choices to generate for each input
-        message. Only support one here.
+        message. **Only support one here**.
     - stream: whether to stream the results or not. Default to false.
-    - max_tokens (int): output token nums
+    - max_tokens (int | None): output token nums. Default to None.
     - repetition_penalty (float): The parameter for repetition penalty.
         1.0 means no penalty
     - stop (str | List[str] | None): To stop generating further
         tokens. Only accept stop words that's encoded to one token idex.
+    - response_format (Dict | None): Only pytorch backend support formatting
+        response. Examples: `{"type": "json_schema", "json_schema": {"name":
+        "test","schema": {"properties": {"name": {"type": "string"}},
+        "required": ["name"], "type": "object"}}}`
+        or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`
+    - logit_bias (Dict): Bias to logits. Only supported in pytorch engine.
+    - tools (List): A list of tools the model may call. Currently, only
+        internlm2 functions are supported as a tool. Use this to specify a
+        list of functions for which the model can generate JSON inputs.
+    - tool_choice (str | object): Controls which (if any) tool is called by
+        the model. `none` means the model will not call any tool and instead
+        generates a message. Specifying a particular tool via {"type":
+        "function", "function": {"name": "my_function"}} forces the model to
+        call that tool. `auto` or `required` will put all the tools information
+        to the model.
 
     Additional arguments supported by LMDeploy:
+    - top_k (int): The number of the highest probability vocabulary
+        tokens to keep for top-k-filtering
     - ignore_eos (bool): indicator for ignoring eos
-    - session_id (int): if not specified, will set random value
+    - skip_special_tokens (bool): Whether or not to remove special tokens
+        in the decoding. Default to be True.
+    - min_new_tokens (int): To generate at least numbers of tokens.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, comparably
+        selective as setting `top_p` in the 0.99-0.8 range (use the
+        opposite of normal `top_p` values)
 
     Currently we do not support the following features:
-    - function_call (Users should implement this by themselves)
-    - logit_bias (not supported yet)
     - presence_penalty (replaced with repetition_penalty)
     - frequency_penalty (replaced with repetition_penalty)
     """
@@ -481,13 +503,13 @@ async def completions_v1(request: CompletionRequest,
     - model (str): model name. Available from /v1/models.
     - prompt (str): the input prompt.
     - suffix (str): The suffix that comes after a completion of inserted text.
-    - max_tokens (int): output token nums
+    - max_tokens (int): output token nums. Default to 16.
     - temperature (float): to modulate the next token probability
     - top_p (float): If set to float < 1, only the smallest set of most
         probable tokens with probabilities that add up to top_p or higher
         are kept for generation.
     - n (int): How many chat completion choices to generate for each input
-        message. Only support one here.
+        message. **Only support one here**.
     - stream: whether to stream the results or not. Default to false.
     - repetition_penalty (float): The parameter for repetition penalty.
         1.0 means no penalty
@@ -497,7 +519,8 @@ async def completions_v1(request: CompletionRequest,
 
     Additional arguments supported by LMDeploy:
     - ignore_eos (bool): indicator for ignoring eos
-    - session_id (int): if not specified, will set random value
+    - skip_special_tokens (bool): Whether or not to remove special tokens
+        in the decoding. Default to be True.
     - top_k (int): The number of the highest probability vocabulary
         tokens to keep for top-k-filtering
 
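Not part of the patch: a minimal client sketch for exercising the fields documented above against the proxy's OpenAI-compatible endpoints. The base URL and all field values are illustrative assumptions; the served model name is discovered from /v1/models, as the docstring states.

```python
# Illustrative only -- the proxy address and field values are assumptions,
# not part of this patch. Only fields listed in the updated docstring are used.
import requests

base_url = 'http://0.0.0.0:8000'  # assumed proxy address

# Pick the first served model from the OpenAI-compatible /v1/models listing.
model_id = requests.get(f'{base_url}/v1/models').json()['data'][0]['id']

payload = {
    'model': model_id,
    'messages': [{'role': 'user', 'content': 'hi'}],  # OpenAI-format history
    'temperature': 0.7,
    'top_p': 0.9,
    'max_tokens': 64,             # chat endpoint defaults to None per the patch
    'stream': False,
    # LMDeploy-specific extras documented in the updated docstring:
    'skip_special_tokens': True,
    'min_p': 0.05,                # typical range is 0.01-0.2
}
resp = requests.post(f'{base_url}/v1/chat/completions', json=payload)
print(resp.json()['choices'][0]['message']['content'])
```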