From 57cf99b9ac390d265a9d91aa6272f67baa5ccb45 Mon Sep 17 00:00:00 2001 From: fade_away <1028552010@qq.com> Date: Thu, 7 Sep 2023 17:42:02 +0800 Subject: [PATCH 01/43] bug-fix: always use stream mode to enable persistent batching (#346) Co-authored-by: sleepwalker --- lmdeploy/serve/openai/api_server.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 932372bde1..4ec2d58636 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -278,7 +278,7 @@ async def generate(request: GenerateRequest, raw_request: Request = None): generation = VariableInterface.async_engine.generate( request.prompt, request.instance_id, - stream_response=request.stream, + stream_response=True, # always use stream to enable batching sequence_start=request.sequence_start, sequence_end=request.sequence_end, request_output_len=request.request_output_len, @@ -303,12 +303,14 @@ async def stream_results() -> AsyncGenerator[bytes, None]: return StreamingResponse(stream_results()) else: ret = {} + text = '' + tokens = 0 + finish_reason = None async for out in generation: - ret = { - 'text': out.response, - 'tokens': out.generate_token_len, - 'finish_reason': out.finish_reason - } + text += out.response + tokens += out.generate_token_len + finish_reason = out.finish_reason + ret = {'text': text, 'tokens': tokens, 'finish_reason': finish_reason} return JSONResponse(ret) From 71ade772a4a54d9828fe70d398e900fa5d48f6cc Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Thu, 7 Sep 2023 18:38:32 +0800 Subject: [PATCH 02/43] [Fix] Set max dynamic smem size for decoder MHA to support context length > 8k (#377) * Fix crash when context window size is large by setting max dynamic smem size * fix linting --- .../decoder_masked_multihead_attention_128.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu index 329bcd6484..370594a274 100644 --- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu +++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu @@ -28,16 +28,18 @@ #define MMHA_LAUNCH_KERNEL( \ T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, HAS_BEAMS, QUANT_POLICY, stream) \ + auto func = &mmha::masked_multihead_attention_kernel; \ size_t smem_sz = mmha::smem_size_in_bytes(params, THDS_PER_VALUE, THDS_PER_BLOCK); \ dim3 grid(params.num_heads, params.batch_size); \ - mmha::masked_multihead_attention_kernel<<>>(params) + cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_sz); \ + func<<>>(params) //////////////////////////////////////////////////////////////////////////////////////////////////// From ce21a31873f642bb64c0bb350625f5f8ed7e25f8 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 7 Sep 2023 18:38:55 +0800 Subject: [PATCH 03/43] fix exceed session len core dump for chat and generate (#366) --- lmdeploy/serve/async_engine.py | 2 +- lmdeploy/turbomind/chat.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index dd03fd640d..40f87ac0ea 100644 --- a/lmdeploy/serve/async_engine.py +++ 
b/lmdeploy/serve/async_engine.py @@ -112,7 +112,7 @@ async def generate( prompt = self.model.messages2prompt(messages, sequence_start) input_ids = self.tokenizer.encode(prompt) finish_reason = 'stop' if stop else None - if not sequence_end and self.steps[str(session_id)] + len( + if self.steps[str(session_id)] + len( input_ids) >= self.tm_model.session_len: finish_reason = 'length' yield GenOut('', self.steps[str(session_id)], len(input_ids), 0, diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index 63014e205d..68692a840d 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -74,12 +74,12 @@ def main(model_path, seed = random.getrandbits(64) else: print(f'session {session_id}') - if step >= tm_model.session_len: + prompt = model.get_prompt(prompt, nth_round == 1) + input_ids = tokenizer.encode(prompt) + if step + len(input_ids) >= tm_model.session_len: print('WARNING: exceed session max length.' ' Please end the session.') continue - prompt = model.get_prompt(prompt, nth_round == 1) - input_ids = tokenizer.encode(prompt) print(f'{prompt} ', end='', flush=True) response_size = 0 for outputs in generator.stream_infer( From 55764e0b33d8b9298f68b77484bab3832696c010 Mon Sep 17 00:00:00 2001 From: WRH <12756472+wangruohui@users.noreply.github.com> Date: Fri, 8 Sep 2023 17:03:02 +0800 Subject: [PATCH 04/43] Support baichuan2-chat chat template (#378) * support baichuan2-chat * update args from generation config * update deploy.py * update readme * tested with tp * step-1 when last id is eos * add news --------- Co-authored-by: chenxin --- README.md | 12 ++++--- README_zh-CN.md | 12 ++++--- lmdeploy/model.py | 50 ++++++++++++++++++++++++++++++ lmdeploy/serve/turbomind/deploy.py | 5 +++ lmdeploy/turbomind/turbomind.py | 12 +++++-- 5 files changed, 79 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 820b26df9d..3ed91d925c 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ ______________________________________________________________________ ## News 🎉 +- \[2023/09\] TurboMind supports Baichuan2-7B - \[2023/08\] TurboMind supports flash-attention2. - \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling - \[2023/08\] TurboMind supports Windows (tp=1) @@ -55,11 +56,12 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by > **Note**
> W4A16 inference requires Nvidia GPU with Ampere architecture or above. -| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 | -| :------: | :-------------: | :--: | :-----: | :---: | :--: | -| Llama | Yes | Yes | Yes | Yes | No | -| Llama2 | Yes | Yes | Yes | Yes | No | -| InternLM | Yes | Yes | Yes | Yes | No | +| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 | +| :-------: | :-------------: | :--: | :-----: | :---: | :--: | +| Llama | Yes | Yes | Yes | Yes | No | +| Llama2 | Yes | Yes | Yes | Yes | No | +| InternLM | Yes | Yes | Yes | Yes | No | +| Baichuan2 | Yes | Yes | No | No | No | ### Pytorch diff --git a/README_zh-CN.md b/README_zh-CN.md index 9e9649f7d1..1e3b101d3b 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -20,6 +20,7 @@ ______________________________________________________________________ ## 更新 🎉 +- \[2023/09\] TurboMind 支持 Baichuan2-7B - \[2023/08\] TurboMind 支持 flash-attention2 - \[2023/08\] TurboMind 支持 Qwen-7B,动态NTK-RoPE缩放,动态logN缩放 - \[2023/08\] TurboMind 支持 Windows (tp=1) @@ -56,11 +57,12 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht > **Note**
> W4A16 推理需要 Ampere 及以上架构的 Nvidia GPU -| 模型 | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 | -| :------: | :------: | :--: | :-----: | :---: | :--: | -| Llama | Yes | Yes | Yes | Yes | No | -| Llama2 | Yes | Yes | Yes | Yes | No | -| InternLM | Yes | Yes | Yes | Yes | No | +| 模型 | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 | +| :-------: | :------: | :--: | :-----: | :---: | :--: | +| Llama | Yes | Yes | Yes | Yes | No | +| Llama2 | Yes | Yes | Yes | Yes | No | +| InternLM | Yes | Yes | Yes | Yes | No | +| Baichuan2 | Yes | Yes | No | No | No | ### Pytorch diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 8fa03b7df3..b3706a59d3 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -227,6 +227,56 @@ def __init__(self, repetition_penalty=1.1, **kwargs): self.repetition_penalty = repetition_penalty +@MODELS.register_module(name='baichuan2-7b-chat') +class Baichuan2_7BChat(BaseModel): + + def __init__(self, + temperature=0.3, + top_k=5, + top_p=0.85, + repetition_penalty=1.05, + **kwargs): + super().__init__(temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + **kwargs) + self.user_token = '' # id = 195 + self.assistant_token = '' # id = 196 + + def get_prompt(self, prompt, sequence_start=True): + """Return the prompt that is concatenated with other elements in the + chat template. + + Args: + prompt (str): user's input prompt + sequence_start (bool): indicator for the first round chat of a + session sequence + Returns: + str: the concatenated prompt + """ + return f'{self.user_token}{prompt}{self.assistant_token}' + + def messages2prompt(self, messages, sequence_start=True): + """Return the prompt that is concatenated with other elements in the + chat template. + + Args: + messages (str | List): user's input prompt + Returns: + str: the concatenated prompt + """ + if isinstance(messages, str): + return self.get_prompt(messages, sequence_start) + system, users, assistants = self._translate_messages(messages) + ret = '' + for user, assistant in zip(users, assistants): + ret += f'{self.user_token}{user}{self.assistant_token}' + if assistant: + ret += f'{assistant}' + return ret + + @MODELS.register_module(name='puyu') class Puyu(BaseModel): """Chat template of puyu model.This is only for internal usage in Shanghai diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py index 1aa88bb19e..516afd7934 100644 --- a/lmdeploy/serve/turbomind/deploy.py +++ b/lmdeploy/serve/turbomind/deploy.py @@ -525,6 +525,11 @@ def get_tensor_transposed(name: str): for ft, hf in other: model_params[ft] = get_tensor(hf) + if model_name == 'baichuan2-7b-chat': + # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507 + model_params['output.weight'] = torch.nn.functional.normalize( + model_params['output.weight']) + return export(model_name, num_layer, norm_eps, kv_head_num, model_params, tokenizer_path, triton_models_path, tp) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 807bd55c82..c39110b713 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -340,8 +340,16 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): output_ids, seq_start, sequence_length) ] sequence_length -= seq_start.to(sequence_length.device) - yield [(output, l.item()) - for output, l in zip(output_ids, sequence_length)] + + outputs = [] + for output, len_ in zip(output_ids, sequence_length): + output, len_ = output, len_.item() + if output[-1].item() == self.eos_id: + 
outputs.append((output[:-1], len_ - 1)) + else: + outputs.append((output, len_)) + + yield outputs if finish: for t in self.threads: From 65c662f95b8128844f82df9a3d1c7f11bbf3a622 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 11 Sep 2023 13:09:50 +0800 Subject: [PATCH 05/43] Support codellama (#359) * tmp * add demo for codellama inference * update * update * update * update codellama.md * export rope_theta * update * update doc * fix client.py * define SamplingParam * rollback 'end' * rotary_emb_base to rotary_embedding_base * change to baichuan2-7b --- README.md | 16 +- README_zh-CN.md | 16 +- docs/en/supported_models/codellama.md | 112 ++++++++++ docs/zh_cn/supported_models/codellama.md | 114 ++++++++++ lmdeploy/model.py | 149 +++++++++++-- lmdeploy/serve/client.py | 28 ++- lmdeploy/serve/turbomind/chatbot.py | 3 +- lmdeploy/serve/turbomind/deploy.py | 31 ++- lmdeploy/turbomind/chat.py | 83 +++++-- lmdeploy/turbomind/tokenizer.py | 6 +- requirements.txt | 2 +- .../decoder_masked_multihead_attention.h | 1 + ...er_masked_multihead_attention_template.cuh | 13 +- .../kernels/unfused_attention_kernels.cu | 11 +- .../kernels/unfused_attention_kernels.h | 1 + .../llama/LlamaContextAttentionLayer.cc | 1 + .../llama/LlamaDecoderSelfAttentionLayer.cc | 3 + src/turbomind/models/llama/llama_params.h | 9 +- .../triton_backend/llama/LlamaTritonModel.cc | 1 + tests/test_lmdeploy/test_model.py | 205 ++++++++++++++++++ 20 files changed, 720 insertions(+), 85 deletions(-) create mode 100644 docs/en/supported_models/codellama.md create mode 100644 docs/zh_cn/supported_models/codellama.md create mode 100644 tests/test_lmdeploy/test_model.py diff --git a/README.md b/README.md index 3ed91d925c..42bcbee8b7 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ ______________________________________________________________________ ## News 🎉 +- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide - \[2023/09\] TurboMind supports Baichuan2-7B - \[2023/08\] TurboMind supports flash-attention2. - \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling @@ -56,12 +57,15 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by > **Note**
> W4A16 inference requires Nvidia GPU with Ampere architecture or above. -| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 | -| :-------: | :-------------: | :--: | :-----: | :---: | :--: | -| Llama | Yes | Yes | Yes | Yes | No | -| Llama2 | Yes | Yes | Yes | Yes | No | -| InternLM | Yes | Yes | Yes | Yes | No | -| Baichuan2 | Yes | Yes | No | No | No | +| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 | +| :----------: | :-------------: | :--: | :-----: | :---: | :--: | +| Llama | Yes | Yes | Yes | Yes | No | +| Llama2 | Yes | Yes | Yes | Yes | No | +| InternLM | Yes | Yes | Yes | Yes | No | +| QWen-7B | Yes | Yes | Yes | No | No | +| Baichuan-7B | Yes | Yes | Yes | Yes | No | +| Baichuan2-7B | Yes | Yes | No | No | No | +| Code Llama | Yes | Yes | No | No | No | ### Pytorch diff --git a/README_zh-CN.md b/README_zh-CN.md index 1e3b101d3b..35cae96eb8 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -20,6 +20,7 @@ ______________________________________________________________________ ## 更新 🎉 +- \[2023/09\] TurboMind 支持 Code Llama 所有功能:代码续写、填空、对话、Python专项。点击[这里](./docs/zh_cn/supported_models/codellama.md)阅读部署方法 - \[2023/09\] TurboMind 支持 Baichuan2-7B - \[2023/08\] TurboMind 支持 flash-attention2 - \[2023/08\] TurboMind 支持 Qwen-7B,动态NTK-RoPE缩放,动态logN缩放 @@ -57,12 +58,15 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht > **Note**
> W4A16 推理需要 Ampere 及以上架构的 Nvidia GPU -| 模型 | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 | -| :-------: | :------: | :--: | :-----: | :---: | :--: | -| Llama | Yes | Yes | Yes | Yes | No | -| Llama2 | Yes | Yes | Yes | Yes | No | -| InternLM | Yes | Yes | Yes | Yes | No | -| Baichuan2 | Yes | Yes | No | No | No | +| 模型 | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 | +| :----------: | :------: | :--: | :-----: | :---: | :--: | +| Llama | Yes | Yes | Yes | Yes | No | +| Llama2 | Yes | Yes | Yes | Yes | No | +| InternLM | Yes | Yes | Yes | Yes | No | +| QWen-7B | Yes | Yes | Yes | No | No | +| Baichuan-7B | Yes | Yes | Yes | Yes | No | +| Baichuan2-7B | Yes | Yes | No | No | No | +| Code Llama | Yes | Yes | No | No | No | ### Pytorch diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md new file mode 100644 index 0000000000..1b51402056 --- /dev/null +++ b/docs/en/supported_models/codellama.md @@ -0,0 +1,112 @@ +# codellama + +## Introduction + +[codellama](https://github.com/facebookresearch/codellama) features enhanced coding capabilities. It can generate code and natural language about code, from both code and natural language prompts (e.g., “Write me a function that outputs the fibonacci sequence”). It can also be used for code completion and debugging. It supports many of the most popular programming languages used today, including Python, C++, Java, PHP, Typescript (Javascript), C#, Bash and more. + +There are three sizes (7b, 13b, 34b) as well as three flavours (base model, Python fine-tuned, and instruction tuned) released on [HuggingFace](https://huggingface.co/codellama). + +| Base Model | Python | Instruct | +| ------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | +| [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [codellama/CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) | +| [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [codellama/CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) | +| [codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | [codellama/CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) | [codellama/CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) | + +The correspondence between the model and capabilities is: + +| models | code completion | infilling | instructions / chat | python specialist | +| ---------- | --------------- | ----------------- | ------------------- | ----------------- | +| Base Model | Y | Y(7B,13B), N(34B) | N | N | +| Python | Y | N | N | Y | +| Instruct | Y | Y(7B,13B), N(34B) | Y | N | + +## Inference + +Based on the above table, download the model that meets your requirements. 
Execute the following command to interact with the model in the console: + +```shell +# install lmdeploy +python3 -m pip install lmdeploy + +# convert weight layout +python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model +``` + +Then, you can communicate with codellama in consolo by following instructions in next sections + +**Note**: + +- minimum requirement of `transformers` is **v4.33.0** +- lmdeploy supports copying code blocks to the console. But you have to press enter, input "!!" and press enter again to end the prompt. The way to input prompt for other supported models keeps unchanged, i.e., double pressing enter. + +### Completion + +```shell +python3 -m lmdeploy.turbomind.chat ./workspace --cap completion +``` + +### Infilling + +```shell +python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling +``` + +The input code is supposed to have a special placeholder ``. For example, + +``` +def remove_non_ascii(s: str) -> str: + """ + return result +``` + +And the generated code piece by `turbomind.chat` is the one to be filled in `` + +### Chat + +``` +python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python" +``` + +`--sys-instruct` instruction can be changed to other coding languages as long as codellama supports it + +### Python specialist + +``` +python3 -m lmdeploy.turbomind.chat ./workspace --cap python +``` + +Python fine-tuned model is highly recommended when 'python specialist' capability is required. + +## Quantization + +TBD + +## Serving + +**LMDeploy server only supports `chat` capabllity**. The res ones are going to be supported soon. + +Launch inference server by: + +```shell +# --instance_num: number of instances to performance inference, which can be viewed as max requests concurrency +# --tp: the number of GPUs used in tensor parallelism +python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +``` + +Then, you can communicate with it by command line, + +```shell +# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +python -m lmdeploy.serve.openai.api_client restful_api_url +``` + +or through webui after launching gradio, + +```shell +# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +# server_ip and server_port here are for gradio ui +# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True +python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +``` + +Regarding the detailed information of RESTful API, you can refer to [restful_api.md](../restful_api.md). 
diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md new file mode 100644 index 0000000000..ca9029a527 --- /dev/null +++ b/docs/zh_cn/supported_models/codellama.md @@ -0,0 +1,114 @@ +# Code Llama + +## 模型介绍 + +[codellama](https://github.com/facebookresearch/codellama) 支持很多种编程语言,包括 Python, C++, Java, PHP, Typescript (Javascript), C#, Bash 等等。具备代码续写、代码填空、对话、python专项等 4 种能力。 + +它在 [HuggingFace](https://huggingface.co/codellama) 上发布了基座模型,Python模型和指令微调模型: + +| 基座模型 | Python微调模型 | 指令模型 | +| ------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | +| [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [codellama/CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) | +| [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [codellama/CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) | +| [codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | [codellama/CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) | [codellama/CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) | + +模型和能力的对应关系为: + +| 模型 | 代码续写 | 代码填空 | 对话 | Python专项 | +| -------------- | -------- | ----------------- | ---- | ---------- | +| 基座模型 | Y | Y(7B,13B), N(34B) | N | N | +| Python微调模型 | Y | N | N | Y | +| 指令微调模型 | Y | Y(7B,13B), N(34B) | Y | N | + +## 推理 + +根据上述的模型和能力关系表,下载感兴趣的模型。执行如下的命令,把模型权重转成 turbomind 要求的格式: + +```shell +# 安装 lmdeploy +python3 -m pip install lmdeploy + +# 转模型格式 +python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model +``` + +接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。 + +**注意**: + +- **transformers最低要求 v4.33.0** +- `lmdeploy.turbomind.chat` 支持把代码块拷贝到控制台,**结束输出的方式为回车,再输入"!!",再回车**。其他非 codellama 模型,仍然是两次回车结束输入。 + +### 代码续写 + +```shell +python3 -m lmdeploy.turbomind.chat ./workspace --cap completion +``` + +### 代码填空 + +```shell +python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling +``` + +输入的代码块中要包含 ``,比如: + +``` +def remove_non_ascii(s: str) -> str: + """ + return result +``` + +`turbomind.chat` 输出的代码即是要填到 `` 中的内容 + +### 对话 + +``` +python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python" +``` + +可以把 `--sys-instruct` 的指令换成 codellama 支持的其他变成语言。 + +### Python 专项 + +``` +python3 -m lmdeploy.turbomind.chat ./workspace --cap python +``` + +建议这里部署 Python 微调模型 + +## 量化 + +TBD + +## 服务 + +**目前,server 支持的是对话功能**,其余功能后续再加上。 + +启动 sever 的方式是: + +```shell +# --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数 +# --tp: 在 tensor parallel时,使用的GPU数量 +python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +``` + +打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。 + +你可以用命令行,在控制台与 server 通信: + +```shell +# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 +python -m lmdeploy.serve.openai.api_client restful_api_url +``` + +或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流: + +```shell +# restful_api_url 就是 api_server 产生的,比如 
http://localhost:23333 +# server_ip 和 server_port 是用来提供 gradio ui 访问服务的 +# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True +python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +``` + +关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。 diff --git a/lmdeploy/model.py b/lmdeploy/model.py index b3706a59d3..bf89e39063 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import dataclasses from abc import abstractmethod from typing import List @@ -7,7 +8,17 @@ MODELS = Registry('model', locations=['lmdeploy.model']) +@dataclasses.dataclass +class SamplingParam: + top_p: float = 0.8 + top_k: float = None + temperature: float = 0.8 + repetition_penalty: float = 1.0 + + +@MODELS.register_module(name='internlm') @MODELS.register_module(name='llama') +@MODELS.register_module(name='base') class BaseModel: """Base model.""" @@ -17,15 +28,16 @@ def __init__(self, top_k=None, temperature=0.8, repetition_penalty=1.0, + capability='chat', **kwargs): self.session_len = session_len self.top_p = top_p self.top_k = top_k self.temperature = temperature self.repetition_penalty = repetition_penalty + self.capability = capability - @staticmethod - def get_prompt(prompt, sequence_start=True): + def get_prompt(self, prompt, sequence_start=True): """Return the prompt that is concatenated with other elements in the chat template. @@ -36,7 +48,14 @@ def get_prompt(prompt, sequence_start=True): Returns: str: the concatenated prompt """ - return prompt + if self.capability == 'completion': + return prompt + else: + return self.decorate_prompt(prompt, sequence_start) + + @abstractmethod + def decorate_prompt(self, prompt, sequence_start): + pass @staticmethod def _translate_messages(messages: List): @@ -87,6 +106,13 @@ def stop_words(self): """Return the stop-words' token ids.""" return None + @property + def sampling_param(self): + return SamplingParam(top_p=self.top_p, + top_k=self.top_k, + temperature=self.temperature, + repetition_penalty=self.repetition_penalty) + @MODELS.register_module(name='vicuna') class Vicuna(BaseModel): @@ -103,7 +129,7 @@ def __init__( self.user = user self.assistant = assistant - def get_prompt(self, prompt, sequence_start=True): + def decorate_prompt(self, prompt, sequence_start=True): """Return the prompt that is concatenated with other elements in the chat template. @@ -114,6 +140,8 @@ def get_prompt(self, prompt, sequence_start=True): Returns: str: the concatenated prompt """ + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' if sequence_start: return f'{self.system} {self.user}: {prompt} {self.assistant}: ' else: @@ -141,13 +169,6 @@ def messages2prompt(self, messages, sequence_start=True): return ret -@MODELS.register_module(name='internlm') -class InternLM(BaseModel): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - @MODELS.register_module(name='internlm-chat-7b') class InternLMChat7B(BaseModel): """Chat template of InternLM model.""" @@ -166,7 +187,7 @@ def __init__(self, self.eoa = eoa self.assistant = assistant - def get_prompt(self, prompt, sequence_start=True): + def decorate_prompt(self, prompt, sequence_start=True): """Return the prompt that is concatenated with other elements in the chat template. 
@@ -177,6 +198,8 @@ def get_prompt(self, prompt, sequence_start=True): Returns: str: the concatenated prompt """ + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' if sequence_start: return f'{self.user}:{prompt}{self.eoh}\n' \ f'{self.assistant}:' @@ -227,8 +250,8 @@ def __init__(self, repetition_penalty=1.1, **kwargs): self.repetition_penalty = repetition_penalty -@MODELS.register_module(name='baichuan2-7b-chat') -class Baichuan2_7BChat(BaseModel): +@MODELS.register_module(name='baichuan2-7b') +class Baichuan2_7B(BaseModel): def __init__(self, temperature=0.3, @@ -244,7 +267,7 @@ def __init__(self, self.user_token = '' # id = 195 self.assistant_token = '' # id = 196 - def get_prompt(self, prompt, sequence_start=True): + def decorate_prompt(self, prompt, sequence_start=True): """Return the prompt that is concatenated with other elements in the chat template. @@ -255,6 +278,8 @@ def get_prompt(self, prompt, sequence_start=True): Returns: str: the concatenated prompt """ + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' return f'{self.user_token}{prompt}{self.assistant_token}' def messages2prompt(self, messages, sequence_start=True): @@ -283,22 +308,24 @@ class Puyu(BaseModel): AI Laboratory.""" def __init__(self, - meta_instruction='', + system='', user='<|Human|>: ', eoh='', eosys='', assistant='<|Assistant|>: ', - system='<|System|>: ', + system_role='<|System|>: ', **kwargs): super().__init__(**kwargs) - self.meta_instruction = meta_instruction + self.meta_instruction = system self.user = user self.eoh = eoh self.eosys = eosys self.assistant = assistant - self.system = system + self.system = system_role - def get_prompt(self, prompt, sequence_start=True): + def decorate_prompt(self, prompt, sequence_start=True): + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' if sequence_start: return f'{self.system}{self.meta_instruction}{self.eosys}\n' \ f'{self.user}{prompt}{self.eoh}\n' \ @@ -345,7 +372,7 @@ def __init__( e_inst='[/INST]', b_sys='<>\n', e_sys='\n<>\n\n', - default_sys_prompt="""\ + system="""\ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", # noqa: E501 @@ -356,10 +383,10 @@ def __init__( self.e_inst = e_inst self.b_sys = b_sys self.e_sys = e_sys - self.default_sys_prompt = default_sys_prompt + self.default_sys_prompt = system self.session_len = session_len - def get_prompt(self, prompt, sequence_start=True): + def decorate_prompt(self, prompt, sequence_start=True): """Return the prompt that is concatenated with other elements in the chat template. 
@@ -370,6 +397,8 @@ def get_prompt(self, prompt, sequence_start=True): Returns: str: the concatenated prompt """ + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' if sequence_start: return f'{self.b_inst} ' \ f'{self.b_sys} {self.default_sys_prompt} {self.e_sys}' \ @@ -424,7 +453,9 @@ def __init__(self, self.im_end = im_end self.system = system - def get_prompt(self, prompt, sequence_start=True): + def decorate_prompt(self, prompt, sequence_start=True): + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' if sequence_start: return f'{self.im_start}system\n{self.system}{self.im_end}' \ f'\n{self.im_start}user\n{prompt}{self.im_end}' \ @@ -462,6 +493,76 @@ def stop_words(self): return [151645] # <|im_end|> +@MODELS.register_module(name='codellama') +class CodeLlama(Llama2): + + def __init__(self, + system='', + session_len=4096, + suffix_first=False, + **kwargs): + super().__init__(**kwargs) + caps = ['completion', 'infilling', 'chat', 'python'] + assert self.capability in caps, \ + f'{self.capability} is not supported. ' \ + f'The supported capabilities are: {caps}' + self.default_sys_prompt = system + self.session_len = session_len + self.suffix_first = suffix_first + + # The following sampling parameters refers to https://github.com/facebookresearch/codellama # noqa: E501 + if self.capability == 'completion' or self.capability == 'python': + self.top_p = kwargs.get('top_p', 0.9) + self.temperature = kwargs.get('temperature', 0.2) + if self.capability == 'chat': + self.top_p = kwargs.get('top_p', 0.95) + self.temperature = kwargs.get('temperature', 0.2) + elif self.capability == 'infilling': + self.top_p = kwargs.get('top_p', 0.9) + self.temperature = kwargs.get('temperature', 0.0) + + def decorate_prompt(self, prompt, sequence_start=True): + if self.capability == 'infilling': + return self._infill_prompt(prompt) + elif self.capability == 'chat': + return self._get_prompt(prompt, sequence_start) + else: # python speicalist + return prompt + + def _infill_prompt(self, prompt): + prefix, suffix = prompt.split('') + if self.suffix_first: + # format as "
 {suf}  {pre}"
+            prompt = f'
 {suffix}  {prefix}'
+        else:
+            # format as "
 {pre} {suf} "
+            prompt = f'
 {prefix} {suffix} '
+        return prompt
+
+    def _get_prompt(self, prompt, sequence_start):
+        prompt = prompt.strip()
+        if sequence_start:
+            return f'{self.b_inst} ' \
+                   f'{self.b_sys}{self.default_sys_prompt}{self.e_sys}' \
+                   f'{prompt} {self.e_inst}'
+
+        return f'{self.b_inst} {prompt} {self.e_inst}'
+
+    @property
+    def stop_words(self):
+        if self.capability == 'infilling':
+            # EOT ID
+            return [32010]
+        else:
+            return None
+
+    def messages2prompt(self, messages, sequence_start=True):
+        assert self.capability == 'chat', \
+            f'codellama messages2prompt only supports chat mode ' \
+            f'but got {self.capability} mode'
+        return super().messages2prompt(messages, sequence_start)
+
+
 def main(model_name: str = 'test'):
     assert model_name in MODELS.module_dict.keys(), \
         f"'{model_name}' is not supported. " \
diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py
index 1d22d4ba38..283e96e299 100644
--- a/lmdeploy/serve/client.py
+++ b/lmdeploy/serve/client.py
@@ -6,16 +6,23 @@
 from lmdeploy.serve.turbomind.chatbot import Chatbot
 
 
-def input_prompt():
-    """Input a prompt in the console interface."""
-    print('\ndouble enter to end input >>> ', end='')
-    sentinel = ''  # ends when this string is seen
+def input_prompt(model_name):
+    """Input a prompt in the consolo interface."""
+    if model_name == 'codellama':
+        print('\nenter !! to end the input >>>\n', end='')
+        sentinel = '!!'
+    else:
+        print('\ndouble enter to end input >>> ', end='')
+        sentinel = ''  # ends when this string is seen
     return '\n'.join(iter(input, sentinel))
 
 
 def main(tritonserver_addr: str,
          session_id: int = 1,
-         stream_output: bool = True):
+         cap: str = 'chat',
+         sys_instruct: str = None,
+         stream_output: bool = True,
+         **kwargs):
     """An example to communicate with inference server through the command line
     interface.
 
@@ -23,15 +30,22 @@ def main(tritonserver_addr: str,
         tritonserver_addr (str): the address in format "ip:port" of
           triton inference server
         session_id (int): the identical id of a session
+        cap (str): the capability of a model. For example, codellama has
+            the ability among ['completion', 'infilling', 'chat', 'python']
+        sys_instruct (str): the content of 'system' role, which is used by
+            conversational model
         stream_output (bool): indicator for streaming output or not
+        **kwargs (dict): other arguments for initializing model's chat template
     """
     log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
+    kwargs.update(capability=cap, system=sys_instruct)
     chatbot = Chatbot(tritonserver_addr,
                       log_level=log_level,
-                      display=stream_output)
+                      display=stream_output,
+                      **kwargs)
     nth_round = 1
     while True:
-        prompt = input_prompt()
+        prompt = input_prompt(chatbot.model_name)
         if prompt == 'exit':
             exit(0)
         elif prompt == 'end':
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index 1212d34598..eb532e2602 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -149,6 +149,7 @@ def stream_infer(self,
         self._session.status = 1
         self._session.request_id = request_id
         self._session.response = ''
+        self.cfg.update(**kwargs)
 
         self._session.prompt = self._get_prompt(prompt, sequence_start)
         for status, res, tokens in self._stream_infer(self._session,
@@ -507,7 +508,7 @@ def _stream_producer(tritonserver_addr, session, que, cfg, input_ids,
                 server
             session (Session): an instance of a session
             que (multiprocessing.Queue): response queue
-            cfg:
+            cfg (dict): parameters for sampling
             input_ids (numpy.ndarray): token ids of input prompt
             input_lengths (numpy.ndarray): length of input_ids
             request_output_len (int): the max number of tokens to be generated
diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py
index 516afd7934..1c2b1becc3 100644
--- a/lmdeploy/serve/turbomind/deploy.py
+++ b/lmdeploy/serve/turbomind/deploy.py
@@ -122,6 +122,7 @@ def export(model_name: str,
            max_position_embeddings: int = 0,
            use_dynamic_ntk: int = 0,
            use_logn_attn: int = 0,
+           rope_theta: float = 10000.0,
            tokenizer_info=tokenizer_info_sp):
     """Export deploying information to a config file.
 
@@ -213,6 +214,7 @@ def save_bin(param: torch.Tensor, name):
         vocab_size=_vocab_size,
         num_layer=num_layer,
         rotary_embedding=size_per_head,
+        rope_theta=rope_theta,
         inter_size=inter_size,
         norm_eps=norm_eps,
         attn_bias=int(attn_bias),
@@ -233,7 +235,8 @@ def save_bin(param: torch.Tensor, name):
         # extra attention params
         max_position_embeddings=max_position_embeddings,
         use_dynamic_ntk=int(use_dynamic_ntk),
-        use_logn_attn=int(use_logn_attn)))
+        use_logn_attn=int(use_logn_attn),
+    ))
 
     config = configparser.ConfigParser()
     for section, key_values in cfg.items():
@@ -415,6 +418,10 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
             model_arg = json.load(f)
             num_layer = model_arg['num_hidden_layers']
             norm_eps = model_arg['rms_norm_eps']
+            rope_theta = float(model_arg.get('rope_theta', 10000.0))
+            max_position_embeddings = int(
+                model_arg.get('max_position_embeddings', 0))
+            repo_scaling = bool(model_arg.get('rope_scaling', False))
             if 'num_key_value_heads' in model_arg:
                 kv_head_num = model_arg['num_key_value_heads']
             else:
@@ -525,13 +532,23 @@ def get_tensor_transposed(name: str):
     for ft, hf in other:
         model_params[ft] = get_tensor(hf)
 
-    if model_name == 'baichuan2-7b-chat':
+    if model_name == 'baichuan2-7b':
+        # https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/modeling_baichuan.py#L507
         # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
         model_params['output.weight'] = torch.nn.functional.normalize(
             model_params['output.weight'])
 
-    return export(model_name, num_layer, norm_eps, kv_head_num, model_params,
-                  tokenizer_path, triton_models_path, tp)
+    return export(model_name,
+                  num_layer,
+                  norm_eps,
+                  kv_head_num,
+                  model_params,
+                  tokenizer_path,
+                  triton_models_path,
+                  tp,
+                  max_position_embeddings=max_position_embeddings,
+                  use_dynamic_ntk=repo_scaling,
+                  rope_theta=rope_theta)
 
 
 def deploy_awq(model_name: str, model_path: str, tokenizer_path: str,
@@ -574,6 +591,7 @@ def deploy_awq(model_name: str, model_path: str, tokenizer_path: str,
             model_arg = json.load(f)
             num_layer = model_arg['num_hidden_layers']
             norm_eps = model_arg['rms_norm_eps']
+            rope_theta = float(model_arg.get('rope_theta', 10000.0))
             if 'num_key_value_heads' in model_arg:
                 kv_head_num = model_arg['num_key_value_heads']
             else:
@@ -761,7 +779,8 @@ def tp_m_s4(x: torch.Tensor, tp: int):
                   triton_models_path,
                   tp,
                   weight_type='int4',
-                  group_size=group_size)
+                  group_size=group_size,
+                  rope_theta=rope_theta)
 
 
 def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str,
@@ -802,6 +821,7 @@ def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str,
             config = json.load(f)
             num_layer = config['num_hidden_layers']
             norm_eps = config['layer_norm_epsilon']
+            rope_theta = float(config.get('rotary_emb_base', 10000.0))
             if 'num_key_value_heads' in config:
                 kv_head_num = config['num_key_value_heads']
             else:
@@ -889,6 +909,7 @@ def get_tensor(name, trans=True):
                   max_position_embeddings=seq_length,
                   use_dynamic_ntk=use_dynamic_ntk,
                   use_logn_attn=use_logn_attn,
+                  rope_theta=rope_theta,
                   tokenizer_info=tokenizer_info_qwen)
 
 
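
[Editor's note] The exporter changes above thread `rope_theta` (plus the existing NTK/logn flags) from the HuggingFace `config.json` into the turbomind config that `LlamaTritonModel.cc` reads back later in this patch. Purely to illustrate that hand-off, the relevant slice of the generated ini file could look like the sketch below; the key names are the ones used in this diff, while the path and values are hypothetical.

```python
import configparser

# Illustrative only: a subset of the [llama] keys that export() emits and that
# LlamaTritonModel reads via reader.GetFloat/GetInteger. Real configs contain
# many more entries; the values below are made up.
cfg = configparser.ConfigParser()
cfg['llama'] = {
    'rotary_embedding': '128',           # size_per_head
    'rope_theta': '1000000.0',           # e.g. Code Llama ships a 1e6 base
    'max_position_embeddings': '16384',
    'use_dynamic_ntk': '0',
    'use_logn_attn': '0',
}
with open('config.ini', 'w') as f:
    cfg.write(f)
```
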
diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py
index 68692a840d..d617b19835 100644
--- a/lmdeploy/turbomind/chat.py
+++ b/lmdeploy/turbomind/chat.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import dataclasses
 import os
 import os.path as osp
 import random
@@ -12,10 +13,26 @@
 os.environ['TM_LOG_LEVEL'] = 'ERROR'
 
 
-def input_prompt():
+@dataclasses.dataclass
+class GenParam:
+    top_p: float
+    top_k: float
+    temperature: float
+    repetition_penalty: float
+    sequence_start: bool = False
+    sequence_end: bool = False
+    step: int = 0
+    request_output_len: int = 512
+
+
+def input_prompt(model_name):
     """Input a prompt in the consolo interface."""
-    print('\ndouble enter to end input >>> ', end='')
-    sentinel = ''  # ends when this string is seen
+    if model_name == 'codellama':
+        print('\nenter !! to end the input >>>\n', end='')
+        sentinel = '!!'
+    else:
+        print('\ndouble enter to end input >>> ', end='')
+        sentinel = ''  # ends when this string is seen
     return '\n'.join(iter(input, sentinel))
 
 
@@ -29,20 +46,50 @@ def valid_str(string, coding='utf-8'):
     return ret
 
 
+def get_gen_param(cap,
+                  sampling_param,
+                  nth_round,
+                  step,
+                  request_output_len=512,
+                  **kwargs):
+    """return parameters used by token generation."""
+    gen_param = GenParam(**dataclasses.asdict(sampling_param),
+                         request_output_len=request_output_len)
+    # Fix me later. turbomind.py doesn't support None top_k
+    if gen_param.top_k is None:
+        gen_param.top_k = 40
+
+    if cap == 'chat':
+        gen_param.sequence_start = (nth_round == 1)
+        gen_param.sequence_end = False
+        gen_param.step = step
+    else:
+        gen_param.sequence_start = True
+        gen_param.sequence_end = True
+        gen_param.step = 0
+    return gen_param
+
+
 def main(model_path,
          session_id: int = 1,
-         repetition_penalty: float = 1.0,
+         cap: str = 'chat',
+         sys_instruct: str = None,
          tp=1,
-         stream_output=True):
+         stream_output=True,
+         **kwargs):
     """An example to perform model inference through the command line
     interface.
 
     Args:
         model_path (str): the path of the deployed model
         session_id (int): the identical id of a session
-        repetition_penalty (float): parameter to penalize repetition
+        cap (str): the capability of a model. For example, codellama has
+            the ability among ['completion', 'infilling', 'chat', 'python']
+        sys_instruct (str): the content of 'system' role, which is used by
+            conversational model
         tp (int): GPU number used in tensor parallelism
         stream_output (bool): indicator for streaming output or not
+        **kwargs (dict): other arguments for initializing model's chat template
     """
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
     tokenizer = Tokenizer(tokenizer_model_path)
@@ -53,10 +100,13 @@ def main(model_path,
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)()
+    model = MODELS.get(model_name)(capability=cap, **kwargs) \
+        if sys_instruct is None else MODELS.get(model_name)(
+            capability=cap, system=sys_instruct, **kwargs)
 
+    print(f'session {session_id}')
     while True:
-        prompt = input_prompt()
+        prompt = input_prompt(model_name)
         if prompt == 'exit':
             exit(0)
         elif prompt == 'end':
@@ -73,28 +123,23 @@ def main(model_path,
             step = 0
             seed = random.getrandbits(64)
         else:
-            print(f'session {session_id}')
-            prompt = model.get_prompt(prompt, nth_round == 1)
+            prompt = model.get_prompt(prompt, nth_round == 1)
             input_ids = tokenizer.encode(prompt)
             if step + len(input_ids) >= tm_model.session_len:
                 print('WARNING: exceed session max length.'
                       ' Please end the session.')
                 continue
+
+            gen_param = get_gen_param(cap, model.sampling_param, nth_round,
+                                      step, **kwargs)
+
             print(f'{prompt} ', end='', flush=True)
             response_size = 0
             for outputs in generator.stream_infer(
                     session_id=session_id,
                     input_ids=[input_ids],
                     stream_output=stream_output,
-                    request_output_len=512,
-                    sequence_start=(nth_round == 1),
-                    sequence_end=False,
-                    step=step,
-                    stop=False,
-                    top_k=40,
-                    top_p=0.8,
-                    temperature=0.8,
-                    repetition_penalty=repetition_penalty,
+                    **dataclasses.asdict(gen_param),
                     ignore_eos=False,
                     random_seed=seed if nth_round == 1 else None):
                 res, tokens = outputs[0]
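
[Editor's note] For reference, this is roughly how the new `get_gen_param` helper behaves for the two kinds of capability. A sketch only; importing `lmdeploy.turbomind.chat` pulls in the compiled turbomind runtime.

```python
from lmdeploy.model import MODELS
from lmdeploy.turbomind.chat import get_gen_param

chat = MODELS.get('codellama')(capability='chat')
p = get_gen_param('chat', chat.sampling_param, nth_round=2, step=42)
# -> sequence_start=False, sequence_end=False, step=42:
#    interactive chat keeps the session cache and continues from `step`.

comp = MODELS.get('codellama')(capability='completion')
p = get_gen_param('completion', comp.sampling_param, nth_round=2, step=42)
# -> sequence_start=True, sequence_end=True, step=0:
#    completion/infilling/python requests are stateless one-shot sessions.
```
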
diff --git a/lmdeploy/turbomind/tokenizer.py b/lmdeploy/turbomind/tokenizer.py
index bb7f95e9ef..98db9c2b61 100644
--- a/lmdeploy/turbomind/tokenizer.py
+++ b/lmdeploy/turbomind/tokenizer.py
@@ -111,7 +111,8 @@ class HuggingFaceTokenizer:
     """
 
     def __init__(self, model_dir: str):
-        from transformers import AutoTokenizer, LlamaTokenizerFast
+        from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
+                                  LlamaTokenizerFast)
         model_file = osp.join(model_dir, 'tokenizer.model')
         backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
         model_file_exists = osp.exists(model_file)
@@ -120,7 +121,8 @@ def __init__(self, model_dir: str):
                   'It may take long time to initialize the tokenizer.')
         self.model = AutoTokenizer.from_pretrained(model_dir,
                                                    trust_remote_code=True)
-        self.need_padding = isinstance(self.model, LlamaTokenizerFast)
+        self.need_padding = isinstance(self.model, LlamaTokenizerFast) \
+            or isinstance(self.model, CodeLlamaTokenizerFast)
         self._no_prefix_space_tokens = None
         # save tokenizer.json to reuse
         if not osp.exists(backend_tokenizer_file) and model_file_exists:
diff --git a/requirements.txt b/requirements.txt
index c0cd48396c..861623c040 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,6 @@ setuptools
 shortuuid
 tiktoken
 torch
-transformers
+transformers>=4.33.0
 tritonclient[all]
 uvicorn
diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention.h b/src/turbomind/kernels/decoder_masked_multihead_attention.h
index dba396bf40..b44332090f 100644
--- a/src/turbomind/kernels/decoder_masked_multihead_attention.h
+++ b/src/turbomind/kernels/decoder_masked_multihead_attention.h
@@ -121,6 +121,7 @@ struct Multihead_attention_params: public Multihead_attention_params_base {
     int        max_position_embeddings    = 0;
     bool       use_dynamic_ntk            = false;
     bool       use_logn_attn              = false;
+    float      rotary_embedding_base      = 10000.0f;
 };
 
 template
diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
index 6b9101abb0..c2b6039d67 100644
--- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
+++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
@@ -1378,19 +1378,20 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params
     q = add(q, q_bias);
     k = add(k, k_bias);
 
-    float rotary_emb_base = 10000.f;
+    float rotary_embedding_base = params.rotary_embedding_base;
     if (params.use_dynamic_ntk) {
         // +1 because of `length_per_sample == context_length - 1`
-        rotary_emb_base = rotary_embedding_get_base(params.length_per_sample[bi] + 1,
-                                                    params.max_position_embeddings,
-                                                    params.rotary_embedding_dim,
-                                                    rotary_emb_base);
+        rotary_embedding_base = rotary_embedding_get_base(params.length_per_sample[bi] + 1,
+                                                          params.max_position_embeddings,
+                                                          params.rotary_embedding_dim,
+                                                          rotary_embedding_base);
     }
 
     // Padded len
     const int padd_len = (params.total_padding_tokens == nullptr) ? 0 : params.total_padding_tokens[bi];
     if (params.rotary_embedding_dim > 0) {
-        apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, rotary_emb_base, params.timestep - padd_len);
+        apply_rotary_embedding(
+            q, k, tidx, params.rotary_embedding_dim, rotary_embedding_base, params.timestep - padd_len);
     }
 
     if (params.use_logn_attn) {
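
[Editor's note] The only functional change in these attention kernels is that the rotary base is no longer hard-coded to 10000.f but taken from `params.rotary_embedding_base` (i.e. `rope_theta`). As a reminder of what that parameter controls, here is a plain NumPy reference of rotary position embedding with a configurable base; it is the textbook half-split formulation, not the kernel's exact interleaved layout.

```python
import numpy as np

def rope(x: np.ndarray, timestep: int, rotary_dim: int, base: float = 10000.0):
    """Rotate the first `rotary_dim` channels of a query/key vector by position.

    `base` plays the role of rope_theta / rotary_embedding_base in this patch:
    a larger base stretches the rotation wavelengths, which is how Code Llama
    (base=1e6) supports a longer usable context.
    """
    half = rotary_dim // 2
    inv_freq = base ** (-2.0 * np.arange(half) / rotary_dim)   # (half,)
    cos = np.cos(timestep * inv_freq)
    sin = np.sin(timestep * inv_freq)
    x = x.astype(np.float64).copy()
    x1, x2 = x[:half].copy(), x[half:rotary_dim].copy()
    x[:half] = x1 * cos - x2 * sin
    x[half:rotary_dim] = x1 * sin + x2 * cos
    return x

q = np.random.randn(128)
q_rot = rope(q, timestep=4096, rotary_dim=128, base=1e6)
```
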
diff --git a/src/turbomind/kernels/unfused_attention_kernels.cu b/src/turbomind/kernels/unfused_attention_kernels.cu
index 536175ccf8..b2450c8675 100644
--- a/src/turbomind/kernels/unfused_attention_kernels.cu
+++ b/src/turbomind/kernels/unfused_attention_kernels.cu
@@ -863,6 +863,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
                                                    int        kv_head_num,
                                                    int        size_per_head,
                                                    int        rotary_embedding_dim,
+                                                   float      rotary_embedding_base,
                                                    int        max_position_embeddings,
                                                    bool       use_dynamic_ntk,
                                                    bool       use_logn_attn)
@@ -931,14 +932,13 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
     const int context_len = history_len + input_length[batch_idx];
     const int timestep    = history_len + seq_idx;
 
-    float rotary_emb_base = 10000.f;
     if (use_dynamic_ntk) {
-        rotary_emb_base = mmha::rotary_embedding_get_base(
-            context_len, max_position_embeddings, rotary_embedding_dim, rotary_emb_base);
+        rotary_embedding_base = mmha::rotary_embedding_get_base(
+            context_len, max_position_embeddings, rotary_embedding_dim, rotary_embedding_base);
     }
 
     // TODO: unused computation on k if GQA is used
-    mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rotary_emb_base, timestep);
+    mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rotary_embedding_base, timestep);
 
     if (use_logn_attn) {
         // +1 to convert to context length at the timestep
@@ -990,6 +990,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
                                                                                              kv_head_num,              \
                                                                                              size_per_head,            \
                                                                                              rotary_embedding_dim,     \
+                                                                                             rotary_embedding_base,    \
                                                                                              max_position_embeddings,  \
                                                                                              use_dynamic_ntk,          \
                                                                                              use_logn_attn);
@@ -1010,6 +1011,7 @@ void invokeAddFusedQKVBiasTranspose(T*           q_buf,
                                     const int    kv_head_num,
                                     const int    size_per_head,
                                     const int    rotary_embedding_dim,
+                                    float        rotary_embedding_base,
                                     int          max_position_embeddings,
                                     bool         use_dynamic_ntk,
                                     bool         use_logn_attn,
@@ -1039,6 +1041,7 @@ void invokeAddFusedQKVBiasTranspose(T*           q_buf,
                                                  const int    kv_head_num,                                             \
                                                  const int    size_per_head,                                           \
                                                  const int    rotary_embedding_dim,                                    \
+                                                 float        rotary_embedding_base,                                   \
                                                  int          max_position_embeddings,                                 \
                                                  bool         use_dynamic_ntk,                                         \
                                                  bool         use_logn_attn,                                           \
diff --git a/src/turbomind/kernels/unfused_attention_kernels.h b/src/turbomind/kernels/unfused_attention_kernels.h
index 50069fc33a..b5c37b5d48 100644
--- a/src/turbomind/kernels/unfused_attention_kernels.h
+++ b/src/turbomind/kernels/unfused_attention_kernels.h
@@ -79,6 +79,7 @@ void invokeAddFusedQKVBiasTranspose(T*           q_buf,
                                     const int    kv_head_num,
                                     const int    size_per_head,
                                     const int    rotary_embedding_dim,
+                                    float        rotary_embedding_base,
                                     int          max_position_embeddings,
                                     bool         use_dynamic_ntk,
                                     bool         use_logn_attn,
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
index 66bcf7570f..e8f77e1c74 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -175,6 +175,7 @@ inline void LlamaContextAttentionLayer::forward(TensorMap*
                                    local_kv_head_num_,
                                    size_per_head_,
                                    params_.rotray_embedding_dim,
+                                   params_.rotary_embedding_base,
                                    params_.max_position_embeddings,
                                    params_.use_dynamic_ntk,
                                    params_.use_logn_attn,
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index eec9a7fbd4..3caaf59068 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -61,6 +61,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T*     qkv_buf,
                                                       const int    kv_head_num,
                                                       const int    size_per_head,
                                                       const int    rotary_embedding_dim,
+                                                      const float  rotary_embedding_base,
                                                       const int    max_position_embeddings,
                                                       const bool   use_dynamic_ntk,
                                                       const bool   use_logn_attn,
@@ -129,6 +130,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T*     qkv_buf,
 
     params.hidden_size_per_head    = size_per_head;
     params.rotary_embedding_dim    = rotary_embedding_dim;
+    params.rotary_embedding_base         = rotary_embedding_base;
     params.max_position_embeddings = max_position_embeddings;
     params.use_dynamic_ntk         = use_dynamic_ntk;
     params.use_logn_attn           = use_logn_attn;
@@ -261,6 +263,7 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap*                     o
         local_kv_head_num_,
         size_per_head_,
         params_.rotray_embedding_dim,
+        params_.rotary_embedding_base,
         params_.max_position_embeddings,
         params_.use_dynamic_ntk,
         params_.use_logn_attn,
diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h
index a2387e44e0..8f8c96837b 100644
--- a/src/turbomind/models/llama/llama_params.h
+++ b/src/turbomind/models/llama/llama_params.h
@@ -5,10 +5,11 @@
 namespace turbomind {
 
 struct LlamaAttentionParams {
-    int  rotray_embedding_dim;
-    int  max_position_embeddings;
-    bool use_dynamic_ntk;
-    bool use_logn_attn;
+    int   rotray_embedding_dim;
+    float rotary_embedding_base;
+    int   max_position_embeddings;
+    bool  use_dynamic_ntk;
+    bool  use_logn_attn;
 };
 
 }  // namespace turbomind
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 169d6cbdba..456f5f41c4 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -137,6 +137,7 @@ LlamaTritonModel::LlamaTritonModel(size_t      tensor_para_size,
     group_size_            = reader.GetInteger("llama", "group_size", 0);
 
     attn_params_.rotray_embedding_dim    = reader.GetInteger("llama", "rotary_embedding");
+    attn_params_.rotary_embedding_base   = reader.GetFloat("llama", "rope_theta", 10000.0f);
     attn_params_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0);
     attn_params_.use_dynamic_ntk         = reader.GetInteger("llama", "use_dynamic_ntk", 0);
     attn_params_.use_logn_attn           = reader.GetInteger("llama", "use_logn_attn", 0);
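
For reference, the key added above is read from the `[llama]` section of the converted model's `config.ini`. A short illustrative sketch (Python `configparser` standing in for the C++ `INIReader` call above; the concrete values are made up):

```python
# Illustrative only: the [llama] section keys mirror the INIReader calls above;
# the concrete values are made up.
import configparser

cfg = configparser.ConfigParser()
cfg.read_string("""
[llama]
rotary_embedding = 128
rope_theta = 1000000.0
max_position_embeddings = 4096
""")

# GetFloat("llama", "rope_theta", 10000.0f) corresponds to:
rope_theta = cfg.getfloat('llama', 'rope_theta', fallback=10000.0)
print(rope_theta)  # 1000000.0
```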
diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
new file mode 100644
index 0000000000..83487f1f03
--- /dev/null
+++ b/tests/test_lmdeploy/test_model.py
@@ -0,0 +1,205 @@
+import pytest
+
+from lmdeploy.model import MODELS, SamplingParam
+
+
+def test_base_model():
+    model = MODELS.get('llama')()
+    assert model is not None
+    assert model.capability == 'chat'
+    assert model.get_prompt('test') is None
+    assert model.stop_words is None
+
+    model = MODELS.get('internlm')(capability='completion')
+    assert model.capability == 'completion'
+    assert model.get_prompt('hi') == 'hi'
+    assert model.messages2prompt('test') == 'test'
+
+
+def test_vicuna():
+    prompt = 'hello, can u introduce yourself'
+    model = MODELS.get('vicuna')(capability='completion')
+    assert model.get_prompt(prompt, sequence_start=True) == prompt
+    assert model.get_prompt(prompt, sequence_start=False) == prompt
+    assert model.stop_words is None
+    assert model.system is not None
+
+    model = MODELS.get('vicuna')(capability='chat',
+                                 system='Provide answers in Python')
+    assert model.get_prompt(prompt, sequence_start=True) != prompt
+    assert model.get_prompt(prompt, sequence_start=False) != prompt
+    assert model.system == 'Provide answers in Python'
+
+    model = MODELS.get('vicuna')(capability='voice')
+    _prompt = None
+    with pytest.raises(AssertionError):
+        _prompt = model.get_prompt(prompt, sequence_start=True)
+    assert _prompt is None
+
+
+def test_internlm_chat():
+    prompt = 'hello, can u introduce yourself'
+    model = MODELS.get('internlm-chat-7b')(capability='completion')
+    assert model.get_prompt(prompt, sequence_start=True) == prompt
+    assert model.get_prompt(prompt, sequence_start=False) == prompt
+    assert model.stop_words is not None
+    assert model.system == ''
+    assert model.session_len == 2048
+
+    model = MODELS.get('internlm-chat-7b')(capability='chat',
+                                           system='Provide answers in Python')
+    assert model.get_prompt(prompt, sequence_start=True) != prompt
+    assert model.get_prompt(prompt, sequence_start=False) != prompt
+    assert model.system == 'Provide answers in Python'
+
+    model = MODELS.get('internlm-chat-7b')(capability='voice')
+    _prompt = None
+    with pytest.raises(AssertionError):
+        _prompt = model.get_prompt(prompt, sequence_start=True)
+    assert _prompt is None
+
+    model = MODELS.get('internlm-chat-7b-8k')()
+    assert model.session_len == 8192
+
+
+def test_baichuan():
+    prompt = 'hello, can u introduce yourself'
+    model = MODELS.get('baichuan-7b')(capability='completion')
+    assert model.get_prompt(prompt, sequence_start=True) == prompt
+    assert model.get_prompt(prompt, sequence_start=False) == prompt
+    assert model.stop_words is None
+    assert model.repetition_penalty == 1.1
+
+    model = MODELS.get('baichuan-7b')(capability='chat')
+    _prompt = model.get_prompt(prompt, sequence_start=True)
+    assert _prompt is None
+
+
+def test_llama2():
+    prompt = 'hello, can u introduce yourself'
+    model = MODELS.get('llama2')(capability='completion')
+    assert model.get_prompt(prompt, sequence_start=True) == prompt
+    assert model.get_prompt(prompt, sequence_start=False) == prompt
+    assert model.stop_words is None
+    assert model.default_sys_prompt is not None
+
+    model = MODELS.get('llama2')(capability='chat',
+                                 system='Provide answers in Python')
+    assert model.get_prompt(prompt, sequence_start=True) != prompt
+    assert model.get_prompt(prompt, sequence_start=False) != prompt
+    assert model.default_sys_prompt == 'Provide answers in Python'
+
+    model = MODELS.get('llama2')(capability='voice')
+    _prompt = None
+    with pytest.raises(AssertionError):
+        _prompt = model.get_prompt(prompt, sequence_start=True)
+    assert _prompt is None
+
+
+def test_qwen():
+    prompt = 'hello, can u introduce yourself'
+    model = MODELS.get('qwen-7b')(capability='completion')
+    assert model.get_prompt(prompt, sequence_start=True) == prompt
+    assert model.get_prompt(prompt, sequence_start=False) == prompt
+    assert model.stop_words is not None
+
+    model = MODELS.get('qwen-7b')(capability='chat')
+    assert model.get_prompt(prompt, sequence_start=True) != prompt
+    assert model.get_prompt(prompt, sequence_start=False) != prompt
+
+    model = MODELS.get('qwen-7b')(capability='voice')
+    _prompt = None
+    with pytest.raises(AssertionError):
+        _prompt = model.get_prompt(prompt, sequence_start=True)
+    assert _prompt is None
+
+
+def test_codellama_completion():
+    model = MODELS.get('codellama')(capability='completion')
+    prompt = """\
+import socket
+
+def ping_exponential_backoff(host: str):"""
+    assert model.get_prompt(prompt) == prompt
+    assert model.get_prompt(prompt, sequence_start=False) == prompt
+    assert model.stop_words is None
+
+
+def test_codellama_infilling():
+    model = MODELS.get('codellama')(capability='infilling')
+    prompt = '''def remove_non_ascii(s: str) -> str:
+    """ 
+    return result
+'''
+    _prompt = model.get_prompt(prompt)
+    assert _prompt.find('<FILL>') == -1
+    assert model.stop_words == [32010]
+
+    model = MODELS.get('codellama')(capability='infilling', suffix_first=True)
+    _prompt = model.get_prompt(prompt)
+    assert _prompt.find('<FILL>') == -1
+
+
+def test_codellama_chat():
+    model = MODELS.get('codellama')(capability='chat',
+                                    system='Provide answers in Python')
+    prompt = 'Write a function that computes the set of sums of all contiguous sublists of a given list.'  # noqa: E501
+    _prompt = model.get_prompt(prompt, sequence_start=True)
+    assert _prompt.find('Provide answers in Python') != -1
+
+    _prompt = model.get_prompt(prompt, sequence_start=False)
+    assert _prompt.find('Provide answers in Python') == -1
+    assert model.stop_words is None
+
+
+def test_codellama_python_specialist():
+    model = MODELS.get('codellama')(capability='python')
+    prompt = """
+    def remove_non_ascii(s: str) -> str:
+"""
+    assert model.get_prompt(prompt, sequence_start=True) == prompt
+    assert model.get_prompt(prompt, sequence_start=False) == prompt
+    assert model.stop_words is None
+
+
+def test_codellama_others():
+    model = None
+    with pytest.raises(AssertionError):
+        model = MODELS.get('codellama')(capability='java')
+    assert model is None
+
+
+def test_sampling_param():
+    model = MODELS.get('llama')()
+    default_sampling_param = SamplingParam()
+    assert model.sampling_param == default_sampling_param
+
+    model = MODELS.get('llama')(top_p=0.1, top_k=10)
+    assert model.sampling_param.top_p == 0.1 and \
+        model.sampling_param.top_k == 10
+    assert model.sampling_param.temperature == 0.8 and \
+        model.sampling_param.repetition_penalty == 1.0
+
+    model = MODELS.get('codellama')(capability='completion')
+    assert model.sampling_param.top_p == 0.9 and \
+        model.sampling_param.top_k is None and \
+        model.sampling_param.temperature == 0.2 and \
+        model.sampling_param.repetition_penalty == 1.0
+
+    model = MODELS.get('codellama')(capability='chat')
+    assert model.sampling_param.top_p == 0.95 and \
+        model.sampling_param.top_k is None and \
+        model.sampling_param.temperature == 0.2 and \
+        model.sampling_param.repetition_penalty == 1.0
+
+    model = MODELS.get('codellama')(capability='infilling')
+    assert model.sampling_param.top_p == 0.9 and \
+        model.sampling_param.top_k is None and \
+        model.sampling_param.temperature == 0.0 and \
+        model.sampling_param.repetition_penalty == 1.0
+
+    model = MODELS.get('codellama')(capability='python')
+    assert model.sampling_param.top_p == 0.9 and \
+        model.sampling_param.top_k is None and \
+        model.sampling_param.temperature == 0.2 and \
+        model.sampling_param.repetition_penalty == 1.0

From cfec5bede59ebff0a62129abb5e1437b851635c5 Mon Sep 17 00:00:00 2001
From: liukuikun <24622904+Harold-lkk@users.noreply.github.com>
Date: Mon, 11 Sep 2023 16:18:38 +0800
Subject: [PATCH 06/43] [Fix] Update puyu model (#399)

---
 lmdeploy/model.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)
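
To make the new prompt layout easier to read outside the diff context, here is a minimal, self-contained sketch of the concatenation scheme the patch below switches to. The marker strings are illustrative placeholders only; the real defaults in this patch are empty strings that are meant to be overridden when the template is constructed.

```python
# A minimal sketch of the concatenation scheme; the marker strings are
# placeholders, not the real special tokens (the patch defaults them to '').
class PuyuLikeTemplate:

    def __init__(self, meta_instruction='', system='<|System|>:', eosys='\n',
                 user='<|Human|>:', eoh='\n', assistant='<|Assistant|>:',
                 eoa='</s>'):
        self.meta_instruction = meta_instruction
        self.system = system
        self.eosys = eosys
        self.user = user
        self.eoh = eoh
        self.assistant = assistant
        self.eoa = eoa

    def messages2prompt(self, users, assistants):
        # system block first, then user/assistant turns; a completed turn is
        # closed with eoa, the trailing open turn ends at the assistant marker
        ret = f'{self.system}{self.meta_instruction}{self.eosys}'
        for user, assistant in zip(users, assistants):
            if assistant:
                ret += (f'{self.user}{user}{self.eoh}'
                        f'{self.assistant}{assistant}{self.eoa}')
            else:
                ret += f'{self.user}{user}{self.eoh}{self.assistant}'
        return ret


print(PuyuLikeTemplate(meta_instruction='Be helpful.').messages2prompt(
    ['hi', 'and you?'], ['hello', None]))
```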

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index bf89e39063..fd724a2a19 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -308,30 +308,32 @@ class Puyu(BaseModel):
     AI Laboratory."""
 
     def __init__(self,
+                 meta_instruction='',
                  system='',
-                 user='<|Human|>: ',
-                 eoh='',
                  eosys='',
-                 assistant='<|Assistant|>: ',
-                 system_role='<|System|>: ',
+                 user='',
+                 eoh='',
+                 assistant='',
+                 eoa='',
                  **kwargs):
         super().__init__(**kwargs)
-        self.meta_instruction = system
+        self.meta_instruction = meta_instruction
+        self.system = system
         self.user = user
-        self.eoh = eoh
-        self.eosys = eosys
         self.assistant = assistant
-        self.system = system_role
+        self.eosys = eosys
+        self.eoh = eoh
+        self.eoa = eoa
 
     def decorate_prompt(self, prompt, sequence_start=True):
         assert self.capability == 'chat', \
             f'{type(self).__name__} has no capability of {self.capability}'
         if sequence_start:
-            return f'{self.system}{self.meta_instruction}{self.eosys}\n' \
-                   f'{self.user}{prompt}{self.eoh}\n' \
+            return f'{self.system}{self.meta_instruction}{self.eosys}' \
+                   f'{self.user}{prompt}{self.eoh}' \
                    f'{self.assistant}'
         else:
-            return f'\n{self.user}{prompt}{self.eoh}\n{self.assistant}'
+            return f'{self.eoa}{self.user}{prompt}{self.eoh}{self.assistant}'
 
     def messages2prompt(self, messages, sequence_start=True):
         """Return the prompt that is concatenated with other elements in the
@@ -350,10 +352,10 @@ def messages2prompt(self, messages, sequence_start=True):
         ret = f'{system}{self.meta_instruction}{self.eosys}'
         for user, assistant in zip(users, assistants):
             if assistant:
-                ret += f'\n{self.user}{user}{self.eoh}\n{self.assistant}' \
-                       f'{assistant}'
+                ret += f'{self.user}{user}{self.eoh}{self.assistant}' \
+                       f'{assistant}{self.eoa}'
             else:
-                ret += f'\n{self.user}{user}{self.eoh}\n{self.assistant}'
+                ret += f'{self.user}{user}{self.eoh}{self.assistant}'
         return ret
 
     @property

From 450757b2bdf3b124cfa14950e4f8ed6f6d15550c Mon Sep 17 00:00:00 2001
From: Lyu Han 
Date: Mon, 11 Sep 2023 21:56:58 +0800
Subject: [PATCH 07/43] bump version to v0.0.8 (#401)

---
 lmdeploy/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index 175e5af091..d397dc7003 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple
 
-__version__ = '0.0.7'
+__version__ = '0.0.8'
 short_version = __version__
 
 

From e37915e598a250b0262bb7f957163194c5601fb3 Mon Sep 17 00:00:00 2001
From: RunningLeon 
Date: Tue, 12 Sep 2023 14:16:34 +0800
Subject: [PATCH 08/43] Fix disk space limit for building docker image (#404)

This reverts commit 7368b88692ecca3f5b39f92a8cc41cf21e3fd71e.
---
 .github/workflows/docker.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index dcfb19af38..9d6d62306b 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -26,7 +26,13 @@ jobs:
         uses: actions/checkout@v3
       - name: Check disk space
         run: |
+          df -h
+          ls /opt/hostedtoolcache
           rm -rf ${GITHUB_WORKSPACE}/.git
+          rm -rf  /opt/hostedtoolcache/go
+          rm -rf  /opt/hostedtoolcache/node
+          rm -rf  /opt/hostedtoolcache/Ruby
+          rm -rf  /opt/hostedtoolcache/CodeQL
           cat /proc/cpuinfo  | grep -ic proc
           free
           df -h

From 2537c5ed1a2d363b258ba164d3ba5dcc29c44b56 Mon Sep 17 00:00:00 2001
From: Chen Xin 
Date: Wed, 13 Sep 2023 11:18:29 +0800
Subject: [PATCH 09/43] more general pypi ci (#412)

---
 builder/manywheel/build_wheel.sh      | 5 +----
 builder/manywheel/entrypoint_build.sh | 3 ---
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/builder/manywheel/build_wheel.sh b/builder/manywheel/build_wheel.sh
index f8d2b72860..9b1cf3d7e0 100755
--- a/builder/manywheel/build_wheel.sh
+++ b/builder/manywheel/build_wheel.sh
@@ -6,8 +6,6 @@ PLAT_NAME="$2"
 DOCKER_TAG="$3"
 OUTPUT_DIR="$4"
 
-GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
-GIT_BRANCH=${GIT_BRANCH:-main}
 DOCKER_IMAGE="openmmlab/lmdeploy-builder:${DOCKER_TAG}"
 export USERID=$(id -u)
 export GROUPID=$(id -g)
@@ -20,8 +18,7 @@ docker run --rm -it \
     --env PLAT_NAME="${PLAT_NAME}" \
     --env USERID="${USERID}" \
     --env GROUPID="${GROUPID}" \
-    --env GIT_BRANCH="${GIT_BRANCH}" \
-    --env GIT_REMOTE="${GIT_REMOTE}" \
+    --volume "$(pwd)/../../:/lmdeploy" \
     --volume "$(pwd)/${OUTPUT_DIR}:/lmdeploy_build" \
     --volume "$(pwd)/entrypoint_build.sh:/entrypoint_build.sh" \
     --entrypoint /entrypoint_build.sh \
diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
index 60746fd9b0..abb90562a2 100755
--- a/builder/manywheel/entrypoint_build.sh
+++ b/builder/manywheel/entrypoint_build.sh
@@ -7,12 +7,9 @@ export USERID=${USERID}
 export GROUPID=${GROUPID}
 export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p')
 
-export GIT_REMOTE=${GIT_REMOTE:-https://github.com/InternLM/lmdeploy}
-export GIT_BRANCH=${GIT_BRANCH:-main}
 source /opt/conda/bin/activate
 conda activate $PYTHON_VERSION
 
-git clone -b ${GIT_BRANCH} ${GIT_REMOTE}
 cd lmdeploy
 mkdir build && cd build
 bash ../generate.sh

From 64c39dd8b207daedacf38f36c68deeab93bf073a Mon Sep 17 00:00:00 2001
From: WRH <12756472+wangruohui@users.noreply.github.com>
Date: Wed, 13 Sep 2023 14:33:20 +0800
Subject: [PATCH 10/43] fix output[-1] when output is empty (#405)

---
 lmdeploy/turbomind/turbomind.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
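
The one-line change below guards the EOS-trimming step against an empty output. A plain-Python sketch of the same guard, using lists in place of tensors:

```python
# Plain-Python sketch of the guard added below; lists stand in for tensors.
def trim_trailing_eos(output, length, eos_id):
    """Drop a trailing EOS token, but never index into an empty output."""
    if len(output) > 0 and output[-1] == eos_id:
        return output[:-1], length - 1
    return output, length


assert trim_trailing_eos([], 0, 2) == ([], 0)            # empty output: no crash
assert trim_trailing_eos([5, 7, 2], 3, 2) == ([5, 7], 2)
```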

diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index c39110b713..45760a309a 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -344,7 +344,7 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
             outputs = []
             for output, len_ in zip(output_ids, sequence_length):
                 output, len_ = output, len_.item()
-                if output[-1].item() == self.eos_id:
+                if len(output) > 0 and output[-1].item() == self.eos_id:
                     outputs.append((output[:-1], len_ - 1))
                 else:
                     outputs.append((output, len_))

From ec034c15023ca0412a91aeddd8aad164e155b695 Mon Sep 17 00:00:00 2001
From: nlp-pang <30644796+pangsg@users.noreply.github.com>
Date: Thu, 14 Sep 2023 11:42:17 +0800
Subject: [PATCH 11/43] Fix build.md (#411)

* fix the build step

---
 docs/en/build.md    | 1 +
 docs/zh_cn/build.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/en/build.md b/docs/en/build.md
index 33cadb996b..7ee53ac90c 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -18,4 +18,5 @@
   ```shell
   mkdir build && cd build
   sh ../generate.sh
+  make -j$(nproc) && make install
   ```
diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md
index 07ed5d7f79..d97bab7196 100644
--- a/docs/zh_cn/build.md
+++ b/docs/zh_cn/build.md
@@ -18,4 +18,5 @@
   ```shell
   mkdir build && cd build
   sh ../generate.sh
+  make -j$(nproc) && make install
   ```

From 2dec28aeafd159879b4f437de25c10f7d7139679 Mon Sep 17 00:00:00 2001
From: Chen Xin 
Date: Thu, 14 Sep 2023 20:55:57 +0800
Subject: [PATCH 12/43] Fix memory leak (#415)

---
 .../layers/sampling_layers/BaseSamplingLayer.cc  | 16 ++++++++--------
 .../llama/LlamaTritonModelInstance.cc            | 10 +++-------
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/src/turbomind/layers/sampling_layers/BaseSamplingLayer.cc b/src/turbomind/layers/sampling_layers/BaseSamplingLayer.cc
index 15f063c654..c8c36f65da 100644
--- a/src/turbomind/layers/sampling_layers/BaseSamplingLayer.cc
+++ b/src/turbomind/layers/sampling_layers/BaseSamplingLayer.cc
@@ -45,10 +45,10 @@ void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
         reinterpret_cast<bool*>(allocator_->reMalloc(skip_decode_buf_, sizeof(bool) * batch_size, false));
 
     // host buffers.
-    temperature_        = new float[batch_size];
-    repetition_penalty_ = new float[batch_size];
-    min_lengths_        = new int[batch_size];
-    skip_decode_        = new bool[batch_size];
+    temperature_        = (float*)std::realloc((void*)temperature_, batch_size * sizeof(float));
+    repetition_penalty_ = (float*)std::realloc((void*)repetition_penalty_, batch_size * sizeof(float));
+    min_lengths_        = (int*)std::realloc((void*)min_lengths_, batch_size * sizeof(int));
+    skip_decode_        = (bool*)std::realloc((void*)skip_decode_, batch_size * sizeof(bool));
 
     is_allocate_buffer_ = true;
 }
@@ -65,10 +65,10 @@ void BaseSamplingLayer<T>::freeBuffer()
         allocator_->free((void**)(&min_lengths_buf_));
         allocator_->free((void**)(&runtime_logits_buf_));
         allocator_->free((void**)(&skip_decode_buf_));
-        delete[] temperature_;
-        delete[] repetition_penalty_;
-        delete[] min_lengths_;
-        delete[] skip_decode_;
+        std::free(temperature_);
+        std::free(repetition_penalty_);
+        std::free(min_lengths_);
+        std::free(skip_decode_);
         is_allocate_buffer_ = false;
     }
 }
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc
index 025b066433..102b324b8e 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc
@@ -61,8 +61,8 @@ std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert
 
     const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
     const size_t input_data_len     = input_tensors->at("input_ids").shape[1];
-    // freed in forward()
-    h_total_output_lengths_ = reinterpret_cast<uint32_t*>(malloc(request_batch_size * sizeof(uint32_t)));
+    h_total_output_lengths_ =
+        (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t));
 
     std::unordered_map<std::string, ft::Tensor> ft_input_tensors = std::unordered_map<std::string, ft::Tensor>{
         {"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)},
@@ -251,11 +251,6 @@ LlamaTritonModelInstance::forward(std::shared_ptr::freeBuffer()
     allocator_->free((void**)(&d_sequence_lengths_));
     allocator_->free((void**)(&d_output_log_probs_));
     allocator_->free((void**)(&d_cum_log_probs_));
+    std::free(h_total_output_lengths_);
 }
 
 template struct LlamaTritonModelInstance;

From d44a8bfea49cda9b74960e7cfc61f16ae0e59808 Mon Sep 17 00:00:00 2001
From: Chen Xin 
Date: Mon, 18 Sep 2023 20:03:44 +0800
Subject: [PATCH 13/43] Reduce gil switching (#407)

* reduce gil switching

* ffi lock func

* remove unused

* remove unused

* remove unused
---
 src/turbomind/models/llama/LlamaBatch.cc         | 11 +++++++++--
 src/turbomind/models/llama/LlamaV2.h             |  9 +++++++++
 src/turbomind/python/bind.cpp                    | 16 ++++++++++++++--
 .../triton_backend/llama/LlamaTritonModel.cc     |  1 +
 .../triton_backend/llama/LlamaTritonModel.h      |  7 +++++++
 5 files changed, 40 insertions(+), 4 deletions(-)
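
The C++ changes below register an `ffi_lock_` callback (backed by `PyGILState_Ensure`/`PyGILState_Release` in `bind.cpp`) and take it once around the whole loop of stream callbacks in `LlamaBatch::finish`, instead of paying the acquire/release cost per callback. A rough plain-Python analogy of the before/after locking pattern, not the actual binding code:

```python
# Rough analogy in plain Python (not the pybind11 code): holding the lock once
# per batch of callbacks instead of once per callback is the whole point of
# the ffi_lock_ hook added below.
from threading import Lock

gil_like_lock = Lock()  # stands in for PyGILState_Ensure / PyGILState_Release


def run_callbacks_per_call(callbacks, outputs):
    for cb, out in zip(callbacks, outputs):
        with gil_like_lock:      # acquire/release for every single callback
            cb(out)


def run_callbacks_batched(callbacks, outputs):
    with gil_like_lock:          # acquire once, release once (as finish() now does)
        for cb, out in zip(callbacks, outputs):
            cb(out)
```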

diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc
index 83db7ad65d..995f15b710 100644
--- a/src/turbomind/models/llama/LlamaBatch.cc
+++ b/src/turbomind/models/llama/LlamaBatch.cc
@@ -899,8 +899,9 @@ void LlamaBatch<T>::outputContextLogits(T*                      context_decoder_
 
     if (context_logits_buf_ == nullptr) {
         NcclGuard guard(llama_->tensor_para_, stream_, true);
-        context_logits_buf_ = (float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
-        const auto tp       = llama_->tensor_para_.world_size_;
+        context_logits_buf_ =
+            (float*)allocator_->malloc(sizeof(float) * llama_->vocab_size_padded_ * max_context_token_num_);
+        const auto tp = llama_->tensor_para_.world_size_;
         if (tp > 1) {
             FT_CHECK(llama_->vocab_size_padded_ % tp == 0);
             const auto local_vocab_size = llama_->vocab_size_padded_ / tp;
@@ -938,12 +939,18 @@ void LlamaBatch<T>::finish()
 
     check_cuda_error(cudaStreamSynchronize(stream_));
 
+    if (rank_ == 0 && llama_->ffi_lock_) {
+        llama_->ffi_lock_(1);
+    }
     for (int i = 0; i < batch_size_; ++i) {
         FT_CHECK(requests_[i] != nullptr);
         if (requests_[i]->stream_cb && rank_ == 0) {
             requests_[i]->stream_cb(&requests_[i]->outputs[rank_].get());
         }
     }
+    if (rank_ == 0 && llama_->ffi_lock_) {
+        llama_->ffi_lock_(0);
+    }
 
     if (debug_ && rank_ == 0) {
         std::stringstream ss;
diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h
index ed13aa40f4..c52a02db0c 100644
--- a/src/turbomind/models/llama/LlamaV2.h
+++ b/src/turbomind/models/llama/LlamaV2.h
@@ -34,6 +34,8 @@
 #include "src/turbomind/utils/nccl_utils.h"
 #include 
 
+using ffi_api_lock_ctrl_t = std::function<void(int)>;
+
 namespace turbomind {
 
 template<typename T>
@@ -91,6 +93,11 @@ class LlamaV2 {
         return vocab_size_;
     }
 
+    void setFfiLock(ffi_api_lock_ctrl_t func)
+    {
+        ffi_lock_ = func;
+    }
+
 private:
     friend class Batch;
 
@@ -188,6 +195,8 @@ class LlamaV2 {
     std::shared_ptr<SharedState> shared_state_;
 
     std::thread internal_thread_;
+
+    ffi_api_lock_ctrl_t ffi_lock_ = nullptr;
 };
 
 }  // namespace turbomind
diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp
index 592b2b30e6..b55ed040af 100644
--- a/src/turbomind/python/bind.cpp
+++ b/src/turbomind/python/bind.cpp
@@ -344,13 +344,25 @@ PYBIND11_MODULE(_turbomind, m)
                size_t      pipeline_para_size,
                int         enable_custom_all_reduce,
               std::string data_type) -> std::shared_ptr<AbstractTransformerModel> {
+                auto gil_control = [state = PyGILState_STATE{}](int op) mutable {
+                    if (op) {
+                        state = PyGILState_Ensure();
+                    }
+                    else {
+                        PyGILState_Release(state);
+                    }
+                };
                 if (data_type == "half" || data_type == "fp16" || data_type == "int4") {
-                    return std::make_shared<LlamaTritonModel<half>>(
+                    auto model = std::make_shared<LlamaTritonModel<half>>(
                         tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir);
+                    model->setFfiLock(gil_control);
+                    return model;
                 }
                 else {
-                    return std::make_shared<LlamaTritonModel<float>>(
+                    auto model = std::make_shared<LlamaTritonModel<float>>(
                         tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir);
+                    model->setFfiLock(gil_control);
+                    return model;
                 }
             },
             "model_dir"_a,
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 456f5f41c4..57d5c9be5b 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -276,6 +276,7 @@ LlamaTritonModel<T>::createModelInstance(int
         instance = shared_instances_[device_id].lock();
         if (!instance) {
             instance = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm);
+            instance->llm->setFfiLock(ffi_lock_);
             shared_instances_[device_id] = instance;
         }
     }
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h
index f3a3a327a9..332000ce62 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h
@@ -63,6 +63,11 @@ struct LlamaTritonModel: public AbstractTransformerModel {
 
     void handleMissingParams();
 
+    void setFfiLock(ffi_api_lock_ctrl_t func)
+    {
+        ffi_lock_ = func;
+    }
+
     std::string toString() override;
     int         getTensorParaSize() override;
     int         getPipelineParaSize() override;
@@ -112,4 +117,6 @@ struct LlamaTritonModel: public AbstractTransformerModel {
 
     std::string model_name_;
     std::string model_dir_;
+
+    ffi_api_lock_ctrl_t ffi_lock_ = nullptr;
 };

From 3a7880a866435e61f29ed700d8bb5af7ab44d90c Mon Sep 17 00:00:00 2001
From: AllentDan <41138331+AllentDan@users.noreply.github.com>
Date: Mon, 18 Sep 2023 20:04:56 +0800
Subject: [PATCH 14/43] Fix token count bug (#416)

* fix token count bug

* fix error response
---
 lmdeploy/serve/openai/api_server.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)
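
The handler below always consumes the engine as a stream, so the non-streaming path has to stitch the chunks back together: text arrives incrementally and is concatenated, while `generate_token_len` is already cumulative and must be taken from the last chunk rather than summed. A condensed, runnable sketch of that accumulation, assuming chunk objects with the same three attributes:

```python
import asyncio
from types import SimpleNamespace


async def fake_generation():
    # stand-in for the engine's stream: cumulative token counts, incremental text
    for text, n, reason in [('Hel', 2, None), ('lo', 4, None), ('!', 5, 'stop')]:
        yield SimpleNamespace(response=text, generate_token_len=n, finish_reason=reason)


async def collect(generation):
    text, tokens, finish_reason = '', 0, None
    async for out in generation:
        text += out.response             # incremental pieces are concatenated
        tokens = out.generate_token_len  # cumulative count: keep the latest, do not sum
        finish_reason = out.finish_reason
    return {'text': text, 'tokens': tokens, 'finish_reason': finish_reason}


print(asyncio.run(collect(fake_generation())))
# {'text': 'Hello!', 'tokens': 5, 'finish_reason': 'stop'}
```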

diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 4ec2d58636..28096af9b6 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -57,9 +57,10 @@ def create_error_response(status: HTTPStatus, message: str):
         status (HTTPStatus): HTTP status codes and reason phrases
         message (str): error message
     """
-    return JSONResponse(ErrorResponse(message=message,
-                                      type='invalid_request_error').dict(),
-                        status_code=status.value)
+    return JSONResponse(
+        ErrorResponse(message=message,
+                      type='invalid_request_error',
+                      code=status.value).dict())
 
 
 async def check_request(request) -> Optional[JSONResponse]:
@@ -117,7 +118,7 @@ async def chat_completions_v1(request: ChatCompletionRequest,
     result_generator = VariableInterface.async_engine.generate_openai(
         request.messages,
         instance_id,
-        request.stream,
+        True,  # always use stream to enable batching
         request.renew_session,
         request_output_len=request.max_tokens if request.max_tokens else 512,
         stop=request.stop,
@@ -130,7 +131,7 @@ async def abort_request() -> None:
         async for _ in VariableInterface.async_engine.generate_openai(
                 request.messages,
                 instance_id,
-                request.stream,
+                True,
                 request.renew_session,
                 stop=True):
             pass
@@ -188,6 +189,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
 
     # Non-streaming response
     final_res = None
+    text = ''
     async for res in result_generator:
         if await raw_request.is_disconnected():
             # Abort the request if the client disconnects.
@@ -195,11 +197,12 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
             return create_error_response(HTTPStatus.BAD_REQUEST,
                                          'Client disconnected')
         final_res = res
+        text += res.response
     assert final_res is not None
     choices = []
     choice_data = ChatCompletionResponseChoice(
         index=0,
-        message=ChatMessage(role='assistant', content=final_res.response),
+        message=ChatMessage(role='assistant', content=text),
         finish_reason=final_res.finish_reason,
     )
     choices.append(choice_data)
@@ -308,7 +311,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
         finish_reason = None
         async for out in generation:
             text += out.response
-            tokens += out.generate_token_len
+            tokens = out.generate_token_len
             finish_reason = out.finish_reason
         ret = {'text': text, 'tokens': tokens, 'finish_reason': finish_reason}
         return JSONResponse(ret)

From abe9f7bd99c07d3c29f206952b564c1de998d0da Mon Sep 17 00:00:00 2001
From: "q.yao" 
Date: Mon, 18 Sep 2023 20:38:38 +0800
Subject: [PATCH 15/43] [Fix] Support actual seqlen in flash-attention2 (#418)

* support actual seqlen

* fix lint

* update variable types

* lint

* update type

* fix lint

---------
---
 ...er_masked_multihead_attention_template.cuh |  4 +-
 .../llama/LlamaContextAttentionLayer.cc       |  8 +++-
 .../models/llama/LlamaContextAttentionLayer.h |  1 +
 .../llama/LlamaDecoderSelfAttentionLayer.cc   |  2 +-
 src/turbomind/models/llama/LlamaV2.cc         |  5 ++-
 src/turbomind/models/llama/LlamaWeight.cc     |  6 ++-
 .../llama/flash_attention2/block_info.h       | 12 ++++--
 .../models/llama/flash_attention2/flash.h     | 10 +++--
 .../llama/flash_attention2/flash_api.cpp      |  3 ++
 .../llama_flash_attention_kernel.cu           | 10 ++---
 src/turbomind/models/llama/llama_kernels.h    | 22 ++++++-----
 .../unittests/test_context_attention_layer.cu | 37 ++++++++++++-------
 12 files changed, 75 insertions(+), 45 deletions(-)
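
The changes below thread per-sequence `actual_seqlen_q`/`actual_seqlen_k` arrays into the flash-attention kernel, so key lengths no longer have to be inferred from `cu_seqlens` differences over the packed buffers or from a padded maximum. A small NumPy sketch of the two representations (the numbers are made up; `history` stands for tokens already sitting in the KV cache):

```python
# Made-up numbers; history stands for tokens already sitting in the KV cache.
import numpy as np

q_lens = np.array([3, 5, 2])                              # new query tokens per sequence
cu_seqlens_q = np.concatenate([[0], np.cumsum(q_lens)])   # packed-buffer offsets
actual_seqlen_k = q_lens + np.array([4, 0, 7])            # full keys each query attends to

print(cu_seqlens_q)     # [ 0  3  8 10]
print(actual_seqlen_k)  # [7 5 9]
```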

diff --git a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
index c2b6039d67..a0f7490e00 100644
--- a/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
+++ b/src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh
@@ -1422,8 +1422,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params
             // Trigger the stores to global memory.
             if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) {
 
-                int offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + tlength_circ * Dh
-                             + co * QK_ELTS_IN_16B + ci;
+                size_t offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh
+                                + tlength_circ * Dh + co * QK_ELTS_IN_16B + ci;
 
                 if (!QUANT_POLICY) {
                     *reinterpret_cast(¶ms.k_cache_per_sample[bi][offset]) =
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
index e8f77e1c74..881582acea 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.cc
@@ -215,6 +215,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
                                 layer_offset,
                                 attention_mask,
                                 cu_seqlens,
+                                input_tensors->at("context_lengths").getPtr<int>(),
                                 batch_size,
                                 max_q_len,
                                 max_k_len,
@@ -258,6 +259,7 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T**    key_cache_ptr
                                                             size_t cache_layer_offset,
                                                             T*     attention_mask,
                                                             int*   cu_seqlens,
+                                                            int*   context_lengths,
                                                             int    batch_size,
                                                             int    max_q_len,
                                                             int    max_k_len,
@@ -274,13 +276,13 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T**    key_cache_ptr
                     int(size_per_head_),
                     int(max_seq_len * size_per_head_),
                     false,
-                    int(cache_layer_offset),
+                    cache_layer_offset,
                     key_cache_ptrs};
     Layout layout_v{int(local_head_num_ * max_seq_len * size_per_head_),
                     int(size_per_head_),
                     int(max_seq_len * size_per_head_),
                     false,
-                    int(cache_layer_offset),
+                    cache_layer_offset,
                     val_cache_ptrs};
     Layout layout_o{
         int(local_head_num_ * max_q_len * size_per_head_),
@@ -298,6 +300,8 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T**    key_cache_ptr
                                              qk_buf_float_,
                                              cu_seqlens,
                                              nullptr,
+                                             nullptr,
+                                             context_lengths,
                                              group_size,
                                              layout_q,
                                              layout_k,
diff --git a/src/turbomind/models/llama/LlamaContextAttentionLayer.h b/src/turbomind/models/llama/LlamaContextAttentionLayer.h
index 235b575b8e..f79eaa4ef2 100644
--- a/src/turbomind/models/llama/LlamaContextAttentionLayer.h
+++ b/src/turbomind/models/llama/LlamaContextAttentionLayer.h
@@ -72,6 +72,7 @@ class LlamaContextAttentionLayer {
                                  size_t cache_layer_offset,
                                  T*     attention_mask,
                                  int*   cu_seqlens,
+                                 int*   context_lengths,
                                  int    batch_size,
                                  int    max_q_len,
                                  int    max_k_len,
diff --git a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
index 3caaf59068..103b32e88f 100644
--- a/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
+++ b/src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.cc
@@ -130,7 +130,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T*     qkv_buf,
 
     params.hidden_size_per_head    = size_per_head;
     params.rotary_embedding_dim    = rotary_embedding_dim;
-    params.rotary_embedding_base         = rotary_embedding_base;
+    params.rotary_embedding_base   = rotary_embedding_base;
     params.max_position_embeddings = max_position_embeddings;
     params.use_dynamic_ntk         = use_dynamic_ntk;
     params.use_logn_attn           = use_logn_attn;
diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc
index beaf3c3f6d..9c48e4f818 100644
--- a/src/turbomind/models/llama/LlamaV2.cc
+++ b/src/turbomind/models/llama/LlamaV2.cc
@@ -93,7 +93,8 @@ LlamaV2<T>::LlamaV2(size_t                       head_num,
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     TM_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
 
-    vocab_size_padded_ = (vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;
+    vocab_size_padded_ =
+        (vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;
 
     size_t elem_bits = 0;
     if (quant_policy & QuantPolicy::kCacheKVInt8) {
@@ -171,7 +172,7 @@ void LlamaV2<T>::initialize(const LlamaAttentionParams& attn_params,
 
     dynamic_decode_layer_ = new DynamicDecodeLayer<float>(vocab_size_,
                                                           vocab_size_padded_,
-                                                          0,            // end_id, deprecated
+                                                          0,  // end_id, deprecated
                                                           stream_,
                                                           cublas_wrapper_,
                                                           allocator_,
diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc
index 511cbe5bbf..80e561442a 100644
--- a/src/turbomind/models/llama/LlamaWeight.cc
+++ b/src/turbomind/models/llama/LlamaWeight.cc
@@ -95,8 +95,10 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
 
     loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);
 
-    loadWeightFromBin(
-        (T*)post_decoder_embedding_kernel, {hidden_units_ * vocab_size_padded_}, dir_path + "output.weight", model_file_type);
+    loadWeightFromBin((T*)post_decoder_embedding_kernel,
+                      {hidden_units_ * vocab_size_padded_},
+                      dir_path + "output.weight",
+                      model_file_type);
 
     for (unsigned layer = 0; layer < num_layer_; ++layer) {
         decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
diff --git a/src/turbomind/models/llama/flash_attention2/block_info.h b/src/turbomind/models/llama/flash_attention2/block_info.h
index 310d1f22bf..38b6aa2583 100644
--- a/src/turbomind/models/llama/flash_attention2/block_info.h
+++ b/src/turbomind/models/llama/flash_attention2/block_info.h
@@ -15,10 +15,14 @@ struct BlockInfo {
     __device__ BlockInfo(const Params& params, const int bidb):
         sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb]),
         sum_s_k(!Varlen || params.cu_seqlens_k == nullptr ? -1 : params.cu_seqlens_k[bidb]),
-        actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q :
-                                                                    params.cu_seqlens_q[bidb + 1] - sum_s_q),
-        actual_seqlen_k(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k :
-                                                                    params.cu_seqlens_k[bidb + 1] - sum_s_k)
+        actual_seqlen_q(params.actual_seqlen_q == nullptr ?
+                            (!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q :
+                                                                         params.cu_seqlens_q[bidb + 1] - sum_s_q) :
+                            params.actual_seqlen_q[bidb]),
+        actual_seqlen_k(params.actual_seqlen_k == nullptr ?
+                            (!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k :
+                                                                         params.cu_seqlens_k[bidb + 1] - sum_s_k) :
+                            params.actual_seqlen_k[bidb])
     {
     }
 
diff --git a/src/turbomind/models/llama/flash_attention2/flash.h b/src/turbomind/models/llama/flash_attention2/flash.h
index 576cbc8d9c..8a5a7c5794 100644
--- a/src/turbomind/models/llama/flash_attention2/flash.h
+++ b/src/turbomind/models/llama/flash_attention2/flash.h
@@ -16,7 +16,7 @@ constexpr int D_DIM     = 2;
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 struct Qkv_params {
-    using index_t = uint32_t;
+    using index_t = size_t;
     // The QKV matrices.
     void* __restrict__ q_ptr;
     void* __restrict__ k_ptr;
@@ -25,8 +25,8 @@ struct Qkv_params {
     // batched ptr inputs.
     void** __restrict__ k_batched_ptr = nullptr;
     void** __restrict__ v_batched_ptr = nullptr;
-    int k_batched_offset              = 0;
-    int v_batched_offset              = 0;
+    size_t k_batched_offset           = 0;
+    size_t v_batched_offset           = 0;
 
     // The stride between rows of the Q, K and V matrices.
     index_t q_batch_stride;
@@ -72,6 +72,10 @@ struct Flash_fwd_params: public Qkv_params {
     int* __restrict__ cu_seqlens_q;
     int* __restrict__ cu_seqlens_k;
 
+    // array of length b with actual length of each sequence
+    int* __restrict__ actual_seqlen_q;
+    int* __restrict__ actual_seqlen_k;
+
     void* __restrict__ blockmask;
 
     bool is_bf16;
diff --git a/src/turbomind/models/llama/flash_attention2/flash_api.cpp b/src/turbomind/models/llama/flash_attention2/flash_api.cpp
index e2f12c7233..55bc92c1ff 100644
--- a/src/turbomind/models/llama/flash_attention2/flash_api.cpp
+++ b/src/turbomind/models/llama/flash_attention2/flash_api.cpp
@@ -121,6 +121,9 @@ class FlashAttentionOpImpl::impl {
         fwd_params.cu_seqlens_q = params.cu_seqlens_q;
         fwd_params.cu_seqlens_k = params.cu_seqlens_k;
 
+        fwd_params.actual_seqlen_q = params.actual_seqlen_q;
+        fwd_params.actual_seqlen_k = params.actual_seqlen_k;
+
         fwd_params.blockmask = reinterpret_cast(params.mask);
 
         fwd_params.is_bf16   = false;
diff --git a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
index 29035421c1..4fae69bd08 100644
--- a/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
+++ b/src/turbomind/models/llama/fused_multi_head_attention/llama_flash_attention_kernel.cu
@@ -70,10 +70,10 @@ struct LlamaAttentionKernel:
         scalar_t** v_batch_seqs_ptr = nullptr;
         output_t** o_batch_seqs_ptr = nullptr;
 
-        int q_batch_seqs_offset = 0;
-        int k_batch_seqs_offset = 0;
-        int v_batch_seqs_offset = 0;
-        int o_batch_seqs_offset = 0;
+        size_t q_batch_seqs_offset = 0;
+        size_t k_batch_seqs_offset = 0;
+        size_t v_batch_seqs_offset = 0;
+        size_t o_batch_seqs_offset = 0;
 
         int32_t group_size = 1;
 
@@ -81,7 +81,7 @@ struct LlamaAttentionKernel:
 
         template<typename ptr_t>
         CUTLASS_DEVICE void
-        update_batched_ptr(ptr_t& data_ptr, ptr_t* batch_seq_ptr, int batch_seq_offset, int batch_id, int strideB)
+        update_batched_ptr(ptr_t& data_ptr, ptr_t* batch_seq_ptr, size_t batch_seq_offset, int batch_id, int strideB)
         {
             if (batch_seq_ptr != nullptr)
                 data_ptr = batch_seq_ptr[batch_id] + batch_seq_offset;
diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h
index 6bd4644f0d..06cb24e042 100644
--- a/src/turbomind/models/llama/llama_kernels.h
+++ b/src/turbomind/models/llama/llama_kernels.h
@@ -80,12 +80,12 @@ void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
 
 template<typename T>
 struct BaseAttentionLayout {
-    int  stride_batch;
-    int  stride_seq;
-    int  stride_head;
-    bool use_seqlens       = false;
-    int  batch_seqs_offset = 0;
-    T**  batch_seqs        = nullptr;
+    int    stride_batch;
+    int    stride_seq;
+    int    stride_head;
+    bool   use_seqlens       = false;
+    size_t batch_seqs_offset = 0;
+    T**    batch_seqs        = nullptr;
 };
 
 template<typename T>
@@ -95,10 +95,12 @@ struct BaseAttentionParams {
     T*                     key;
     T*                     val;
     T*                     mask;
-    float*                 out_accum    = nullptr;
-    int*                   cu_seqlens_q = nullptr;
-    int*                   cu_seqlens_k = nullptr;
-    size_t                 group_size   = 1;
+    float*                 out_accum       = nullptr;
+    int*                   cu_seqlens_q    = nullptr;
+    int*                   cu_seqlens_k    = nullptr;
+    int*                   actual_seqlen_q = nullptr;
+    int*                   actual_seqlen_k = nullptr;
+    size_t                 group_size      = 1;
     BaseAttentionLayout<T> layout_q;
     BaseAttentionLayout<T> layout_k;
     BaseAttentionLayout<T> layout_v;
diff --git a/tests/csrc/unittests/test_context_attention_layer.cu b/tests/csrc/unittests/test_context_attention_layer.cu
index 948cd88a68..87693de34d 100644
--- a/tests/csrc/unittests/test_context_attention_layer.cu
+++ b/tests/csrc/unittests/test_context_attention_layer.cu
@@ -278,6 +278,8 @@ int main(int argc, const char* argv[])
     // auto* input_lengths  = (int*)allocator.malloc(sizeof(int) * batch_size, false);
     thrust::device_vector<int> input_lengths(batch_size);
     thrust::host_vector<int>   input_lengths_host(batch_size);
+    thrust::device_vector<int> kv_lengths(batch_size);
+    thrust::host_vector<int>   kv_lengths_host(batch_size);
 
     cudaRandomUniform(query_ptr, batch_size * num_heads * seq_len * size_per_head);
     cudaRandomUniform(key_ptr, batch_size * num_heads * key_len * size_per_head);
@@ -285,13 +287,12 @@ int main(int argc, const char* argv[])
     cudaRandomUniform(mask_ptr, batch_size * seq_len * key_len);
 
     // create random length for batch
-    std::uniform_int_distribution<int> dist{seq_len / 2, seq_len};
-    auto                               gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
-    std::generate(begin(input_lengths_host), end(input_lengths_host), gen);
-    // for(int batch_id=0;batch_id<batch_size;batch_id++){
+    {
+        std::uniform_int_distribution<int> dist{seq_len / 2, seq_len};
+        auto                               gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
+        std::generate(begin(input_lengths_host), end(input_lengths_host), gen);
+        thrust::copy(input_lengths_host.begin(), input_lengths_host.end(), input_lengths.begin());
+    }
     size_t  h_token_num = 0;
     size_t* h_pinned_token_num;
     auto    input_lengths_ptr = thrust::raw_pointer_cast(input_lengths.data());
@@ -306,10 +307,16 @@ int main(int argc, const char* argv[])
                                        stream);
     cudaFreeHost((void*)h_pinned_token_num);
 
-    int* k_lens = (int*)allocator.malloc(batch_size * sizeof(int));
-    deviceFill(k_lens, batch_size, key_len, stream);
+    {
+        std::uniform_int_distribution<int> dist{seq_len, key_len};
+        auto                               gen = [&dist, &mersenne_engine]() { return dist(mersenne_engine); };
+        std::generate(begin(kv_lengths_host), end(kv_lengths_host), gen);
+        thrust::copy(kv_lengths_host.begin(), kv_lengths_host.end(), kv_lengths.begin());
+    }
+    auto kv_lengths_ptr = thrust::raw_pointer_cast(kv_lengths.data());
+    // deviceFill(kv_lengths_ptr, batch_size, key_len, stream);
 
-    invokeCreateCausalMasks(mask_ptr, input_lengths_ptr, k_lens, seq_len, key_len, batch_size, stream);
+    invokeCreateCausalMasks(mask_ptr, input_lengths_ptr, kv_lengths_ptr, seq_len, key_len, batch_size, stream);
     // deviceFill(mask_ptr, batch_size*key_len*seq_len, scalar_t(1), stream);
 
     // compute gt
@@ -356,6 +363,8 @@ int main(int argc, const char* argv[])
                                              accum_buf_ptr,
                                              cu_seqlens_ptr,
                                              nullptr,
+                                             nullptr,
+                                             kv_lengths_ptr,
                                              1,
                                              layout_q,
                                              layout_k,
@@ -367,10 +376,10 @@ int main(int argc, const char* argv[])
     int num_rows = 8;
     // printf("query:\n");
     // printMatrix(query_ptr, num_rows, 8, size_per_head, true);
-    printf("expect:\n");
-    printMatrix(expect_out_ptr, num_rows, 8, size_per_head, true);
-    printf("actual:\n");
-    printMatrix(actual_out_ptr, num_rows, 8, size_per_head, true);
+    // printf("expect:\n");
+    // printMatrix(expect_out_ptr, num_rows, 8, size_per_head, true);
+    // printf("actual:\n");
+    // printMatrix(actual_out_ptr, num_rows, 8, size_per_head, true);
     checkResult(
         "all close:", actual_out_ptr, expect_out_ptr, batch_size * num_heads * seq_len * size_per_head, true, true);
 

From dfa67e8c06e497c1b88b8b9899a5b19a4fa564e4 Mon Sep 17 00:00:00 2001
From: AllentDan <41138331+AllentDan@users.noreply.github.com>
Date: Mon, 18 Sep 2023 21:53:59 +0800
Subject: [PATCH 16/43] Profile token generation with more settings (#364)

* better profiler

* wait for releasing mem

* remove fire

* remove support for multiple model benchmark

* comments

* output more details

* correct tp
---
 benchmark/README.md             |   8 +-
 benchmark/profile_generation.py | 195 ++++++++++++++++++++++++++++++--
 2 files changed, 193 insertions(+), 10 deletions(-)
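
The reworked CLI pairs `--prompt-tokens` and `--completion-tokens` one-to-one and sweeps every `--concurrency` value over those pairs, mirroring the nested loops in `main()` below. A tiny sketch of the resulting benchmark grid, using the example values from the README change below:

```python
# The grid produced by the new arguments: each concurrency level is run
# against every zipped (prompt, completion) pair.
concurrency = [1, 8]
prompt_tokens = [0, 512]
completion_tokens = [2048, 512]

cases = [(batch, p, c)
         for batch in concurrency
         for p, c in zip(prompt_tokens, completion_tokens)]
print(cases)  # [(1, 0, 2048), (1, 512, 512), (8, 0, 2048), (8, 512, 512)]
```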

diff --git a/benchmark/README.md b/benchmark/README.md
index bc047a3512..4fb08db4c6 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -23,10 +23,14 @@ python profile_throughput.py \
 
 `profile_generation.py` perform benchmark with dummy data.
 
+```shell
+pip install nvidia-ml-py
+```
+
 ```bash
 python profile_generation.py \
- /path/to/your/model \
- --concurrency 8 --input_seqlen 0 --output_seqlen 2048
+ --model-path /path/to/your/model \
+ --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
 ```
 
 ## profile serving
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
index 214e07d45d..ecfd0d3e4f 100644
--- a/benchmark/profile_generation.py
+++ b/benchmark/profile_generation.py
@@ -1,12 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
 # import multiprocessing as mp
+import argparse
+import csv
+import logging
+import os
 import os.path as osp
 import time
+from dataclasses import dataclass
 from queue import Queue
 from threading import Thread
 from typing import List
 
-import fire
 import numpy as np
+from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
+                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
+                    nvmlDeviceGetPowerState, nvmlDeviceGetTemperature,
+                    nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
+from tqdm import tqdm
 
 from lmdeploy.turbomind import Tokenizer, TurboMind
 
@@ -77,12 +87,12 @@ def _infer(model, session_id):
     print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')
 
 
-def main(model_path: str,
-         concurrency: int = 1,
-         input_seqlen: int = 0,
-         output_seqlen: int = 512,
-         test_round: int = 10,
-         tp: int = 1):
+def profile_throughput(model_path: str,
+                       concurrency: int = 1,
+                       input_seqlen: int = 0,
+                       output_seqlen: int = 512,
+                       test_round: int = 10,
+                       tp: int = 1):
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
     tokenizer = Tokenizer(tokenizer_model_path)
     tm_model = TurboMind(model_path=model_path, tp=tp)
@@ -141,7 +151,176 @@ def main(model_path: str,
           f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
           f'{token_latency_ave:.2f}s\n'
           f'throughput: {throughput:.2f} token/s\n{"-" * 50}')
+    return tm_model.model_name, throughput, tm_model.gpu_count
+
+
+class MemoryMonitor:
+    from multiprocessing import Manager
+    max_mem = Manager().Value('f', 0)  # GB
+    device_count = Manager().Value('f', 0)
+
+    @staticmethod
+    def nvidia_info():
+        # pip install nvidia-ml-py
+        nvidia_dict = {
+            'state': True,
+            'nvidia_version': '',
+            'nvidia_count': 0,
+            'gpus': []
+        }
+        try:
+            nvmlInit()
+            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
+            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
+            for i in range(nvidia_dict['nvidia_count']):
+                handle = nvmlDeviceGetHandleByIndex(i)
+                memory_info = nvmlDeviceGetMemoryInfo(handle)
+                gpu = {
+                    'gpu_name': nvmlDeviceGetName(handle),
+                    'total': memory_info.total,
+                    'free': memory_info.free,
+                    'used': memory_info.used,
+                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
+                    'powerStatus': nvmlDeviceGetPowerState(handle)
+                }
+                nvidia_dict['gpus'].append(gpu)
+        except NVMLError as _:  # noqa
+            nvidia_dict['state'] = False
+        except Exception as _:  # noqa
+            nvidia_dict['state'] = False
+        finally:
+            try:
+                nvmlShutdown()
+            except:  # noqa
+                pass
+        return nvidia_dict
+
+    @classmethod
+    def mem_monitor(cls):
+        info = cls.nvidia_info()
+        max_mem = 0
+        mem_start = 0
+        cls.device_count.value = len(info['gpus'])
+        for used_total in info['gpus']:
+            mem_start += used_total['used']
+        while True:
+            info = cls.nvidia_info()
+            used = 0
+            for used_total in info['gpus']:
+                used += used_total['used']
+            if used > max_mem:
+                max_mem = used
+                cls.max_mem.value = (max_mem - mem_start) / (1 << 30)
+
+    @classmethod
+    def start(cls):
+        cls._running = True
+        from multiprocessing import Process
+        cls.proc = Process(target=cls.mem_monitor)
+        cls.proc.start()
+
+    @classmethod
+    def terminate(cls) -> float:
+        """Terminate the subprocess and return maximum memory."""
+        cls.proc.kill()
+        return cls.max_mem.value
+
+
+@dataclass
+class ProfileResult:
+    model_name: str
+    batch: int
+    prompt_tokens: int
+    completion_tokens: int
+    throughput_per_proc: float
+    throughput_per_node: float
+    mem_per_proc: float
+    mem_per_gpu: float
+    mem_per_node: float
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Regression Test')
+    parser.add_argument('--model-path',
+                        type=str,
+                        help='benchmark test model path')
+    parser.add_argument('--concurrency',
+                        nargs='+',
+                        type=int,
+                        help='how many requests launched concurrently',
+                        default=[1, 8, 16, 32])
+    parser.add_argument(
+        '--prompt-tokens',
+        nargs='+',
+        type=int,
+        help='how many input prompt tokens. One-to-one '
+        'correspondence with completion-tokens',
+        default=[64, 512, 512, 1024])
+    parser.add_argument('--completion-tokens',
+                        nargs='+',
+                        type=int,
+                        help='how many tokens to be generated. One-to-one '
+                        'correspondence with prompt-tokens',
+                        default=[512, 512, 1024, 1024])
+    parser.add_argument('--tp', type=int, help='Tensor parallel', default=1)
+    parser.add_argument('--dst-csv',
+                        type=str,
+                        help='Where to save the result.',
+                        default='profile_generation.csv')
+    parser.add_argument('--log-level',
+                        help='set log level',
+                        default='INFO',
+                        choices=list(logging._nameToLevel.keys()))
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    os.environ['TM_LOG_LEVEL'] = args.log_level
+    results: List[ProfileResult] = []
+    for batch in tqdm(args.concurrency):
+        for prompt_tokens, completion_tokens in tqdm(
+                zip(args.prompt_tokens, args.completion_tokens)):
+            MemoryMonitor.start()
+            from functools import partial
+            from multiprocessing import Pool
+            profile_target = partial(profile_throughput,
+                                     concurrency=batch,
+                                     input_seqlen=prompt_tokens,
+                                     output_seqlen=completion_tokens,
+                                     tp=args.tp)
+            output = Pool(1).map(profile_target, (args.model_path, ))
+            model_name, throughput_per_proc, tp = output[0]
+            time.sleep(5)  # wait a while for releasing GPU mem
+            memory = MemoryMonitor.terminate()
+            device_count = MemoryMonitor.device_count.value
+            results.append(
+                ProfileResult(model_name=model_name,
+                              batch=batch,
+                              prompt_tokens=prompt_tokens,
+                              completion_tokens=completion_tokens,
+                              throughput_per_proc=throughput_per_proc,
+                              throughput_per_node=throughput_per_proc / tp *
+                              device_count,
+                              mem_per_proc=memory,
+                              mem_per_gpu=memory / tp,
+                              mem_per_node=memory / tp * device_count))
+    with open(args.dst_csv, 'w') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow([
+            'batch', 'prompt_tokens', 'completion_tokens',
+            'throughput_per_proc(token/s)', 'throughput_per_node(token/s)',
+            'mem_per_proc(GB)', 'mem_per_gpu(GB)', 'mem_per_node(GB)'
+        ])
+        for re in results:
+            writer.writerow([
+                re.batch, re.prompt_tokens, re.completion_tokens,
+                f'{re.throughput_per_proc:.2f}',
+                f'{re.throughput_per_node:.2f}', f'{re.mem_per_proc:.2f}',
+                f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}'
+            ])
 
 
 if __name__ == '__main__':
-    fire.Fire(main)
+    main()

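Before moving on to the next patch, here is a minimal, self-contained sketch of the measurement pattern the new profiling code relies on: a child process polls NVML (through the same `nvidia-ml-py` functions imported above) while the parent process runs the workload, and the peak used-memory delta is shared back through a `multiprocessing.Manager` value. The helper names (`_poll_gpu_memory`, the placeholder workload) are illustrative and not part of the patch.

```python
# Sketch of the MemoryMonitor pattern introduced above (illustrative only).
import time
from multiprocessing import Manager, Process

from pynvml import (nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo, nvmlInit, nvmlShutdown)


def _poll_gpu_memory(peak_gb):
    """Poll NVML and record the peak used-memory delta in GB."""
    nvmlInit()
    handles = [nvmlDeviceGetHandleByIndex(i)
               for i in range(nvmlDeviceGetCount())]
    baseline = sum(nvmlDeviceGetMemoryInfo(h).used for h in handles)
    try:
        while True:
            used = sum(nvmlDeviceGetMemoryInfo(h).used for h in handles)
            peak_gb.value = max(peak_gb.value, (used - baseline) / (1 << 30))
            time.sleep(0.1)
    finally:
        nvmlShutdown()


if __name__ == '__main__':
    peak_gb = Manager().Value('f', 0.0)
    monitor = Process(target=_poll_gpu_memory, args=(peak_gb, ))
    monitor.start()
    time.sleep(2)  # the benchmark workload would run here
    monitor.kill()
    print(f'peak GPU memory delta: {peak_gb.value:.2f} GB')
```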
From 19ff47dfb48ef53d6a7485687e2a3a63cfe611de Mon Sep 17 00:00:00 2001
From: RunningLeon 
Date: Tue, 19 Sep 2023 10:47:40 +0800
Subject: [PATCH 17/43] rename readthedocs config file (#429)

---
 .readthedocs.yml => .readthedocs.yaml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename .readthedocs.yml => .readthedocs.yaml (100%)

diff --git a/.readthedocs.yml b/.readthedocs.yaml
similarity index 100%
rename from .readthedocs.yml
rename to .readthedocs.yaml

From df7955de37d1505aee7145fa471dc94458d98666 Mon Sep 17 00:00:00 2001
From: Lyu Han 
Date: Wed, 20 Sep 2023 16:09:06 +0800
Subject: [PATCH 18/43] Support InternLM 20B (#440)

* better profiler

* wait for releasing mem

* remove fire

* remove support for multiple model benchmark

* comments

* support actual seqlen

* change chat template

* update

* fix ut

* int->size_t

* output more details

* correct tp

* rollback

* update

* update readme

* add 'internlm-chat' as the default tag for internlm chat models

* rollback tokenizer

---------

Co-authored-by: AllentDan 
Co-authored-by: grimoire 
---
 README.md                         | 14 ++++++++------
 README_zh-CN.md                   | 14 ++++++++------
 lmdeploy/model.py                 | 25 +++++++++++++++++++++----
 tests/test_lmdeploy/test_model.py |  4 ++--
 4 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 42bcbee8b7..3b1f3697f0 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ ______________________________________________________________________
 
 ## News 🎉
 
+- \[2023/09\] TurboMind supports InternLM-20B
 - \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
 - \[2023/09\] TurboMind supports Baichuan2-7B
 - \[2023/08\] TurboMind supports flash-attention2.
@@ -61,7 +62,8 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
 | :----------: | :-------------: | :--: | :-----: | :---: | :--: |
 |    Llama     |       Yes       | Yes  |   Yes   |  Yes  |  No  |
 |    Llama2    |       Yes       | Yes  |   Yes   |  Yes  |  No  |
-|   InternLM   |       Yes       | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-7B  |       Yes       | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-20B |       Yes       | Yes  |   Yes   |  Yes  |  No  |
 |   QWen-7B    |       Yes       | Yes  |   Yes   |  No   |  No  |
 | Baichuan-7B  |       Yes       | Yes  |   Yes   |  Yes  |  No  |
 | Baichuan2-7B |       Yes       | Yes  |   No    |  No   |  No  |
@@ -69,11 +71,11 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
 
 ### Pytorch
 
-|  Models  | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
-| :------: | :-------------: | :--: | :-----: | :---: | :--: |
-|  Llama   |       Yes       | Yes  |   No    |  No   |  No  |
-|  Llama2  |       Yes       | Yes  |   No    |  No   |  No  |
-| InternLM |       Yes       | Yes  |   No    |  No   |  No  |
+|   Models    | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
+| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
+|    Llama    |       Yes       | Yes  |   No    |  No   |  No  |
+|   Llama2    |       Yes       | Yes  |   No    |  No   |  No  |
+| InternLM-7B |       Yes       | Yes  |   No    |  No   |  No  |
 
 ## Performance
 
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 35cae96eb8..323c0654f2 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -20,6 +20,7 @@ ______________________________________________________________________
 
 ## 更新 🎉
 
+- \[2023/09\] TurboMind 支持 InternLM-20B 模型
 - \[2023/09\] TurboMind 支持 Code Llama 所有功能:代码续写、填空、对话、Python专项。点击[这里](./docs/zh_cn/supported_models/codellama.md)阅读部署方法
 - \[2023/09\] TurboMind 支持 Baichuan2-7B
 - \[2023/08\] TurboMind 支持 flash-attention2
@@ -62,7 +63,8 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
 | :----------: | :------: | :--: | :-----: | :---: | :--: |
 |    Llama     |   Yes    | Yes  |   Yes   |  Yes  |  No  |
 |    Llama2    |   Yes    | Yes  |   Yes   |  Yes  |  No  |
-|   InternLM   |   Yes    | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-7B  |   Yes    | Yes  |   Yes   |  Yes  |  No  |
+| InternLM-20B |   Yes    | Yes  |   Yes   |  Yes  |  No  |
 |   QWen-7B    |   Yes    | Yes  |   Yes   |  No   |  No  |
 | Baichuan-7B  |   Yes    | Yes  |   Yes   |  Yes  |  No  |
 | Baichuan2-7B |   Yes    | Yes  |   No    |  No   |  No  |
@@ -70,11 +72,11 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
 
 ### Pytorch
 
-|   模型   | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 |
-| :------: | :------: | :--: | :-----: | :---: | :--: |
-|  Llama   |   Yes    | Yes  |   No    |  No   |  No  |
-|  Llama2  |   Yes    | Yes  |   No    |  No   |  No  |
-| InternLM |   Yes    | Yes  |   No    |  No   |  No  |
+|    模型     | 模型并行 | FP16 | KV INT8 | W4A16 | W8A8 |
+| :---------: | :------: | :--: | :-----: | :---: | :--: |
+|    Llama    |   Yes    | Yes  |   No    |  No   |  No  |
+|   Llama2    |   Yes    | Yes  |   No    |  No   |  No  |
+| InternLM-7B |   Yes    | Yes  |   No    |  No   |  No  |
 
 ## 性能
 
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index fd724a2a19..da472428d2 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -55,7 +55,7 @@ def get_prompt(self, prompt, sequence_start=True):
 
     @abstractmethod
     def decorate_prompt(self, prompt, sequence_start):
-        pass
+        return prompt
 
     @staticmethod
     def _translate_messages(messages: List):
@@ -169,6 +169,7 @@ def messages2prompt(self, messages, sequence_start=True):
         return ret
 
 
+@MODELS.register_module(name='internlm-chat')
 @MODELS.register_module(name='internlm-chat-7b')
 class InternLMChat7B(BaseModel):
     """Chat template of InternLM model."""
@@ -176,7 +177,7 @@ class InternLMChat7B(BaseModel):
     def __init__(self,
                  system='',
                  user='<|User|>',
-                 eoh='<eoh>',
+                 eoh='',
                  eoa='<eoa>',
                  assistant='<|Bot|>',
                  **kwargs):
@@ -223,7 +224,7 @@ def messages2prompt(self, messages, sequence_start=True):
         for user, assistant in zip(users, assistants):
             if assistant:
                 ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \
-                       f'{assistant}{self.eoa}'
+                       f'{assistant}{self.eoa}\n'
             else:
                 ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:'
         return ret
@@ -231,19 +232,33 @@ def messages2prompt(self, messages, sequence_start=True):
     @property
     def stop_words(self):
         """Return the stop-words' token ids."""
-        return [103027, 103028]
+        return [103028]
 
 
+@MODELS.register_module(name='internlm-chat-20b')
 @MODELS.register_module(name='internlm-chat-7b-8k')
 class InternLMChat7B8K(InternLMChat7B):
+    """Chat template and generation parameters of InternLM-Chat-7B-8K and
+    InternLM-Chat-20B models."""
 
     def __init__(self, session_len=8192, **kwargs):
         super(InternLMChat7B8K, self).__init__(**kwargs)
         self.session_len = session_len
 
 
+@MODELS.register_module(name='internlm-20b')
+class InternLMBaseModel20B(BaseModel):
+    """Generation parameters of InternLM-20B-Base model."""
+
+    def __init__(self, session_len=4096, capability='completion', **kwargs):
+        super().__init__(session_len=session_len,
+                         capability=capability,
+                         **kwargs)
+
+
 @MODELS.register_module(name='baichuan-7b')
 class Baichuan7B(BaseModel):
+    """Generation parameters of Baichuan-7B base model."""
 
     def __init__(self, repetition_penalty=1.1, **kwargs):
         super().__init__(**kwargs)
@@ -252,6 +267,8 @@ def __init__(self, repetition_penalty=1.1, **kwargs):
 
 @MODELS.register_module(name='baichuan2-7b')
 class Baichuan2_7B(BaseModel):
+    """Chat template and generation parameters of Baichuan2-7B-Base and
+    Baichuan2-7B-Chat models."""
 
     def __init__(self,
                  temperature=0.3,
diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
index 83487f1f03..dcf04d5c28 100644
--- a/tests/test_lmdeploy/test_model.py
+++ b/tests/test_lmdeploy/test_model.py
@@ -7,7 +7,7 @@ def test_base_model():
     model = MODELS.get('llama')()
     assert model is not None
     assert model.capability == 'chat'
-    assert model.get_prompt('test') is None
+    assert model.get_prompt('test') == 'test'
     assert model.stop_words is None
 
     model = MODELS.get('internlm')(capability='completion')
@@ -72,7 +72,7 @@ def test_baichuan():
 
     model = MODELS.get('baichuan-7b')(capability='chat')
     _prompt = model.get_prompt(prompt, sequence_start=True)
-    assert _prompt is None
+    assert _prompt == prompt
 
 
 def test_llama2():

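As a quick illustration of how the chat-template registry touched by the patch above is consumed, the following sketch looks up a template by name and renders prompts with it. It assumes the OpenAI-style message dicts used elsewhere in the repository and is not code from the patch itself.

```python
# Illustrative usage of the chat-template registry (not part of the patch).
from lmdeploy.model import MODELS

# 'internlm-chat-20b' now resolves to the InternLMChat7B8K template,
# so it inherits the 8k session length.
model = MODELS.get('internlm-chat-20b')()
print(model.session_len)

# Decorate a single-turn prompt.
print(model.get_prompt('Write hello world in Python', sequence_start=True))

# Render a whole chat history (OpenAI-style messages assumed).
messages = [{'role': 'user', 'content': 'Write hello world in Python'}]
print(model.messages2prompt(messages, sequence_start=True))
```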
From 0be9e7ab6fe9a066cfb0a09d0e0c8d2e28435e58 Mon Sep 17 00:00:00 2001
From: Lyu Han 
Date: Wed, 20 Sep 2023 16:09:47 +0800
Subject: [PATCH 19/43] bump version to v0.0.9 (#428)

---
 lmdeploy/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index d397dc7003..519acfc0d8 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple
 
-__version__ = '0.0.8'
+__version__ = '0.0.9'
 short_version = __version__
 
 

From 719450017dcae9c58d42366f9b34fabe0c3d3230 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine 
Date: Mon, 25 Sep 2023 12:14:44 +0900
Subject: [PATCH 20/43] Fix typo in README.md (#462)

quantilized -> quantized
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3b1f3697f0..3b59a65a0c 100644
--- a/README.md
+++ b/README.md
@@ -234,7 +234,7 @@ LMDeploy uses [AWQ](https://arxiv.org/abs/2306.00978) algorithm for model weight
 [Click here](./docs/en/kv_int8.md) to view the usage method, implementation formula, and test results for kv int8.
 
 > **Warning**
-> runtime Tensor Parallel for quantilized model is not available. Please setup `--tp` on `deploy` to enable static TP. +> runtime Tensor Parallel for quantized model is not available. Please setup `--tp` on `deploy` to enable static TP. ## Contributing From e980377a206fa104cf1495d6d73af3ca2303f626 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 25 Sep 2023 20:30:42 +0800 Subject: [PATCH 21/43] Fix side effect brought by supporting codellama: `sequence_start` is always true when calling `model.get_prompt` (#466) --- lmdeploy/turbomind/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index d617b19835..4648b7921f 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -123,7 +123,7 @@ def main(model_path, step = 0 seed = random.getrandbits(64) else: - prompt = model.get_prompt(prompt, nth_round) + prompt = model.get_prompt(prompt, nth_round == 1) input_ids = tokenizer.encode(prompt) if step + len(input_ids) >= tm_model.session_len: print('WARNING: exceed session max length.' From ce9e07562bceea2741083dca4813cd6d30b5ec4b Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 25 Sep 2023 21:15:09 +0800 Subject: [PATCH 22/43] Miss meta instruction of internlm-chat model (#470) --- lmdeploy/model.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index da472428d2..ce22694519 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -174,15 +174,21 @@ def messages2prompt(self, messages, sequence_start=True): class InternLMChat7B(BaseModel): """Chat template of InternLM model.""" - def __init__(self, - system='', - user='<|User|>', - eoh='', - eoa='', - assistant='<|Bot|>', - **kwargs): + def __init__( + self, + system='<|System|>', + meta_instruction="""You are an AI assistant whose name is InternLM (书生·浦语). +- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. +- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
+""", # noqa: E501 + user='<|User|>', + eoh='', + eoa='', + assistant='<|Bot|>', + **kwargs): super().__init__(**kwargs) self.system = system + self.meta_instruction = meta_instruction self.user = user self.eoh = eoh self.eoa = eoa @@ -202,7 +208,8 @@ def decorate_prompt(self, prompt, sequence_start=True): assert self.capability == 'chat', \ f'{type(self).__name__} has no capability of {self.capability}' if sequence_start: - return f'{self.user}:{prompt}{self.eoh}\n' \ + return f'{self.system}:{self.meta_instruction}\n' \ + f'{self.user}:{prompt}{self.eoh}\n' \ f'{self.assistant}:' else: return f'\n{self.user}:{prompt}{self.eoh}\n' \ From 0cc667e1fd1c994b262453b1a15076203ac834ef Mon Sep 17 00:00:00 2001 From: akhoroshev Date: Tue, 26 Sep 2023 04:41:17 +0300 Subject: [PATCH 23/43] [feature] Graceful termination of background threads in LlamaV2 (#458) * cuda allocator fix * graceful termination * lint and compilation fix --- src/turbomind/models/llama/LlamaV2.cc | 15 +++++++++++-- src/turbomind/models/llama/LlamaV2.h | 3 +++ src/turbomind/models/llama/Request.h | 15 ++++++++++++- src/turbomind/utils/allocator.h | 31 +++++++++++++++++---------- 4 files changed, 50 insertions(+), 14 deletions(-) diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 9c48e4f818..8a1de364b0 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -126,6 +126,7 @@ LlamaV2::LlamaV2(size_t head_num, template LlamaV2::~LlamaV2() { + shared_state_->request_queue.close(); internal_thread_.join(); delete decoder_; @@ -448,12 +449,24 @@ void LlamaV2::internalThreadEntry(int device_id) request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty); + // request queue was closed + // and there are no unprocessed requests in the queue + if (is_empty && infer_requests.empty() && stop_requests.empty()) { + // rank 0 sets flag + shared_state_->should_stop = true; + } + batch_.verifyRequests(stop_requests, infer_requests); } // wait while rank-0 is dequeueing shared_state_->barrier->wait(); + // exit if job is done + if (shared_state_->should_stop) { + return; + } + bool modified = false; if (!(batch_.finishedCount() == 0 && stop_requests.empty() && infer_requests.empty())) { @@ -486,8 +499,6 @@ void LlamaV2::internalThreadEntry(int device_id) batch_.finish(); } } - - FT_CHECK(0); } template diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index c52a02db0c..40633b0a22 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -46,6 +46,9 @@ class LlamaV2 { std::vector> stop_requests; RequestQueue request_queue; std::shared_ptr barrier; + + // rank 0 sets flag to true if there are no more tasks in the request_queue + bool should_stop = false; }; ~LlamaV2(); diff --git a/src/turbomind/models/llama/Request.h b/src/turbomind/models/llama/Request.h index cb2d1858a3..0bccf84a57 100644 --- a/src/turbomind/models/llama/Request.h +++ b/src/turbomind/models/llama/Request.h @@ -44,6 +44,11 @@ class RequestQueue { futures.reserve(requests.size()); { std::lock_guard lock(mutex_); + + if (closed_) { + throw std::runtime_error("Queue is closed"); + } + for (auto& r : requests) { futures.push_back(r->signal.get_future()); if (r->stop_flag) { @@ -65,7 +70,7 @@ class RequestQueue { { std::unique_lock lock(mutex_); if (blocking) { - cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()); }); + cv_.wait(lock, [this] { return !(stop_queue_.empty() && 
infer_queue_.empty() && closed_ == false); }); } stop_requests.clear(); @@ -81,11 +86,19 @@ class RequestQueue { } } + void close() + { + std::lock_guard lock(mutex_); + closed_ = true; + cv_.notify_all(); + } + private: std::queue> stop_queue_; std::queue> infer_queue_; std::mutex mutex_; std::condition_variable cv_; + bool closed_ = false; }; } // namespace turbomind diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h index a87efcd73b..1ba191d211 100644 --- a/src/turbomind/utils/allocator.h +++ b/src/turbomind/utils/allocator.h @@ -125,9 +125,15 @@ class Allocator; template<> class Allocator: public IAllocator { private: - const int device_id_; - cudaStream_t stream_ = 0; // initialize as default stream - std::unordered_map* pointer_mapping_; + enum class MemoryType + { + HOST, + DEVICE + }; + + const int device_id_; + cudaStream_t stream_ = 0; // initialize as default stream + std::unordered_map>* pointer_mapping_; bool isExist(void* address) const { @@ -136,10 +142,10 @@ class Allocator: public IAllocator { ReallocType isReMalloc(void* address, size_t size) const { FT_CHECK(isExist(address)); - if (pointer_mapping_->at(address) < size) { + if (pointer_mapping_->at(address).first < size) { return ReallocType::INCREASE; } - else if (pointer_mapping_->at(address) == size) { + else if (pointer_mapping_->at(address).first == size) { return ReallocType::REUSE; } else { @@ -151,7 +157,7 @@ class Allocator: public IAllocator { Allocator(int device_id): device_id_(device_id) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - pointer_mapping_ = new std::unordered_map(); + pointer_mapping_ = new std::unordered_map>(); #if defined(CUDA_MEMORY_POOL_DISABLED) TM_LOG_WARNING( "Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free." @@ -188,7 +194,9 @@ class Allocator: public IAllocator { { TM_LOG_DEBUG(__PRETTY_FUNCTION__); while (!pointer_mapping_->empty()) { - free((void**)(&pointer_mapping_->begin()->first)); + auto ptr = pointer_mapping_->begin()->first; + auto size_and_type = pointer_mapping_->begin()->second; + free(&ptr, size_and_type.second == MemoryType::HOST); } delete pointer_mapping_; } @@ -229,18 +237,19 @@ class Allocator: public IAllocator { check_cuda_error(getSetDevice(o_device)); TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size); - pointer_mapping_->insert({getAddress(ptr), size}); + pointer_mapping_->insert({getAddress(ptr), {size, is_host ? 
MemoryType::HOST : MemoryType::DEVICE}}); return ptr; } - void free(void** ptr, bool is_host = false) const + void free(void** ptr, bool _ = false) const { TM_LOG_DEBUG(__PRETTY_FUNCTION__); void* address = getAddress(*ptr); if (*ptr != nullptr) { int o_device = 0; if (pointer_mapping_->count(address)) { + const auto is_host = pointer_mapping_->at(address).second == MemoryType::HOST; TM_LOG_DEBUG("Free buffer %p", address); check_cuda_error(getSetDevice(device_id_, &o_device)); if (is_host) { @@ -361,7 +370,7 @@ class Allocator: public IAllocator { { while (!pointer_mapping_->empty()) { void* ptr = pointer_mapping_->begin()->second.flat().data(); - free((void**)(&ptr)); + free(&ptr); } pointer_mapping_->clear(); delete pointer_mapping_; @@ -454,7 +463,7 @@ class Allocator: public IAllocator { TM_LOG_DEBUG(__PRETTY_FUNCTION__); while (!pointer_mapping_->empty()) { void* ptr = pointer_mapping_->begin()->second.data_ptr(); - free((void**)(&ptr)); + free(&ptr); } pointer_mapping_->clear(); delete pointer_mapping_; From 327deaee4122b3ff7780e36d0e481c5997dbe1fa Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Tue, 26 Sep 2023 10:04:14 +0800 Subject: [PATCH 24/43] expose stop words and filter eoa (#352) * expose stop words * support string * fix * remove eoa from chatbot * remove eoa of turbomind * fix ut * suffix wheel and fix InternLM no system bug --- lmdeploy/model.py | 43 ++++++++++------------------- lmdeploy/serve/turbomind/chatbot.py | 16 +++++++++-- lmdeploy/turbomind/turbomind.py | 18 +++++++++--- lmdeploy/utils.py | 20 +++++++++++++- tests/test_lmdeploy/test_model.py | 2 +- 5 files changed, 62 insertions(+), 37 deletions(-) diff --git a/lmdeploy/model.py b/lmdeploy/model.py index ce22694519..fbc6736f9a 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -29,12 +29,14 @@ def __init__(self, temperature=0.8, repetition_penalty=1.0, capability='chat', + stop_words=None, **kwargs): self.session_len = session_len self.top_p = top_p self.top_k = top_k self.temperature = temperature self.repetition_penalty = repetition_penalty + self.stop_words = stop_words self.capability = capability def get_prompt(self, prompt, sequence_start=True): @@ -101,11 +103,6 @@ def messages2prompt(self, messages, sequence_start=True): return self.get_prompt(messages) # chat history processing in derived classes - @property - def stop_words(self): - """Return the stop-words' token ids.""" - return None - @property def sampling_param(self): return SamplingParam(top_p=self.top_p, @@ -185,6 +182,7 @@ def __init__( eoh='', eoa='', assistant='<|Bot|>', + stop_words=[''], **kwargs): super().__init__(**kwargs) self.system = system @@ -193,6 +191,7 @@ def __init__( self.eoh = eoh self.eoa = eoa self.assistant = assistant + self.stop_words = stop_words def decorate_prompt(self, prompt, sequence_start=True): """Return the prompt that is concatenated with other elements in the @@ -227,7 +226,8 @@ def messages2prompt(self, messages, sequence_start=True): if isinstance(messages, str): return self.get_prompt(messages, sequence_start) system, users, assistants = self._translate_messages(messages) - ret = '' + system = self.meta_instruction if not system else system + ret = f'{self.system}:{system}\n' for user, assistant in zip(users, assistants): if assistant: ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \ @@ -236,11 +236,6 @@ def messages2prompt(self, messages, sequence_start=True): ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' return ret - @property - def 
stop_words(self): - """Return the stop-words' token ids.""" - return [103028] - @MODELS.register_module(name='internlm-chat-20b') @MODELS.register_module(name='internlm-chat-7b-8k') @@ -339,12 +334,14 @@ def __init__(self, eoh='', assistant='', eoa='', + stop_words=None, **kwargs): super().__init__(**kwargs) self.meta_instruction = meta_instruction self.system = system self.user = user self.assistant = assistant + self.stop_words = stop_words self.eosys = eosys self.eoh = eoh self.eoa = eoa @@ -382,11 +379,6 @@ def messages2prompt(self, messages, sequence_start=True): ret += f'{self.user}{user}{self.eoh}{self.assistant}' return ret - @property - def stop_words(self): - """Return the stop-words' token ids.""" - return [45623] - @MODELS.register_module(name='llama2') class Llama2(BaseModel): @@ -468,6 +460,7 @@ def __init__(self, im_start='<|im_start|>', im_end='<|im_end|>', system='You are a helpful assistant.', + stop_words=['<|im_end|>'], **kwargs): super().__init__(**kwargs) self.session_len = session_len @@ -478,6 +471,7 @@ def __init__(self, self.im_start = im_start self.im_end = im_end self.system = system + self.stop_words = stop_words def decorate_prompt(self, prompt, sequence_start=True): assert self.capability == 'chat', \ @@ -513,11 +507,6 @@ def messages2prompt(self, messages, sequence_start=True): f'\n{self.im_start}assistant\n' return ret - @property - def stop_words(self): - """Return the stop-words' token ids.""" - return [151645] # <|im_end|> - @MODELS.register_module(name='codellama') class CodeLlama(Llama2): @@ -526,6 +515,7 @@ def __init__(self, system='', session_len=4096, suffix_first=False, + stop_words=None, **kwargs): super().__init__(**kwargs) caps = ['completion', 'infilling', 'chat', 'python'] @@ -535,6 +525,7 @@ def __init__(self, self.default_sys_prompt = system self.session_len = session_len self.suffix_first = suffix_first + self.stop_words = stop_words # The following sampling parameters refers to https://github.com/facebookresearch/codellama # noqa: E501 if self.capability == 'completion' or self.capability == 'python': @@ -546,6 +537,8 @@ def __init__(self, elif self.capability == 'infilling': self.top_p = kwargs.get('top_p', 0.9) self.temperature = kwargs.get('temperature', 0.0) + if self.stop_words is None: + self.stop_words = [''] def decorate_prompt(self, prompt, sequence_start=True): if self.capability == 'infilling': @@ -574,14 +567,6 @@ def _get_prompt(self, prompt, sequence_start): return f'{self.b_inst} {prompt} {self.e_inst}' - @property - def stop_words(self): - if self.capability == 'infilling': - # EOT ID - return [32010] - else: - return None - def messages2prompt(self, messages, sequence_start=True): assert self.capability == 'chat', \ f'codellama message2prompt only supports chat mode ' \ diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py index eb532e2602..cc12fcff3b 100644 --- a/lmdeploy/serve/turbomind/chatbot.py +++ b/lmdeploy/serve/turbomind/chatbot.py @@ -18,6 +18,7 @@ from lmdeploy.model import MODELS from lmdeploy.serve.turbomind.utils import (Postprocessor, Preprocessor, prepare_tensor) +from lmdeploy.utils import filter_suffix @dataclass @@ -157,6 +158,8 @@ def stream_infer(self, request_output_len, sequence_start, sequence_end): + if status == StatusCode.TRITON_STREAM_END: # remove stop_words + res = filter_suffix(res, self.model.stop_words) if status.value < 0: break else: @@ -346,6 +349,8 @@ def infer(self, sequence_end): if status.value < 0: break + if status == StatusCode.TRITON_STREAM_END: 
# remove stop_words + res = filter_suffix(res, self.model.stop_words) if status.value == 0: self._session.histories = \ self._session.histories + self._session.prompt + \ @@ -386,16 +391,23 @@ def _get_eos(self): token_ids, _ = self.preprocess('') return token_ids[0][0] - def _stop_words(self, stop_words: List[int]): + def _stop_words(self, stop_words: List[str]): """return stop-words' token ids.""" if stop_words is None: return None assert isinstance(stop_words, List) and \ - all(isinstance(elem, int) for elem in stop_words), \ + all(isinstance(elem, str) for elem in stop_words), \ f'stop_words must be a list but got {type(stop_words)}' # each id in stop_words represents a stop word # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for # detailed explanation about turbomind's stop_words + stop_words = [ + int(self.preprocess(stop_word)[0][0][-1]) + for stop_word in stop_words + ] + assert isinstance(stop_words, List) and \ + all(isinstance(elem, int) for elem in stop_words), \ + 'invalid stop_words' stop_word_offsets = range(1, len(stop_words) + 1) stop_words = np.array([[stop_words, stop_word_offsets]]).astype(np.int32) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 45760a309a..f8a7444546 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -14,6 +14,7 @@ import lmdeploy from lmdeploy.model import MODELS +from lmdeploy.turbomind import Tokenizer from lmdeploy.utils import get_logger # TODO: find another way import _turbomind @@ -22,14 +23,16 @@ import _turbomind as _tm # noqa: E402 -def _stop_words(stop_words: List[int]): +def _stop_words(stop_words: List[str], tokenizer: Tokenizer): """return list of stop-words to numpy.ndarray.""" if stop_words is None: return None assert isinstance(stop_words, List) and \ - all(isinstance(elem, int) for elem in stop_words), \ + all(isinstance(elem, str) for elem in stop_words), \ f'stop_words must be a list but got {type(stop_words)}' - + stop_words = [tokenizer.encode(stop_word)[-1] for stop_word in stop_words] + assert isinstance(stop_words, List) and all( + isinstance(elem, int) for elem in stop_words), 'invalid stop_words' # each id in stop_words represents a stop word # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for # detailed explanation about fastertransformer's stop_words @@ -106,7 +109,10 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1): self.model_name = parser.get(section_name, 'model_name') data_type = parser.get(section_name, 'weight_type') model = MODELS.get(self.model_name)() - self.stop_words = _stop_words(model.stop_words) + tokenizer_model_path = osp.join(model_path, 'triton_models', + 'tokenizer') + tokenizer = Tokenizer(tokenizer_model_path) + self.stop_words = _stop_words(model.stop_words, tokenizer) # params self.node_id = node_id @@ -162,6 +168,8 @@ def __init__(self, tm_model, cuda_stream_id=0): self.gpu_count = tm_model.gpu_count self.stop_words = tm_model.stop_words + self.stop_tokens = [] if self.stop_words is None else \ + self.stop_words.flatten().tolist() self.eos_id = tm_model.eos_id self.session_len = tm_model.session_len @@ -346,6 +354,8 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): output, len_ = output, len_.item() if len(output) > 0 and output[-1].item() == self.eos_id: outputs.append((output[:-1], len_ - 1)) + elif len(output) > 0 and output[-1].item() in self.stop_tokens: + outputs.append((output[:-1], len_)) else: outputs.append((output, len_)) diff --git a/lmdeploy/utils.py 
b/lmdeploy/utils.py index 7b6d51a01a..e284f50075 100644 --- a/lmdeploy/utils.py +++ b/lmdeploy/utils.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging -from typing import Optional +from typing import List, Optional logger_initialized = {} @@ -77,3 +77,21 @@ def get_logger(name: str, logger_initialized[name] = True return logger + + +def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str: + """Filter response with suffixes. + + Args: + response (str): generated response by LLMs. + suffixes (str): a list of suffixes to be deleted. + + Return: + str: a clean response. + """ + if suffixes is None: + return response + for item in suffixes: + if response.endswith(item): + response = response[:len(response) - len(item)] + return response diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py index dcf04d5c28..d07e1f1f73 100644 --- a/tests/test_lmdeploy/test_model.py +++ b/tests/test_lmdeploy/test_model.py @@ -133,7 +133,7 @@ def test_codellama_infilling(): ''' _prompt = model.get_prompt(prompt) assert _prompt.find('') == -1 - assert model.stop_words == [32010] + assert model.stop_words == [''] model = MODELS.get('codellama')(capability='infilling', suffix_first=True) _prompt = model.get_prompt(prompt) From a54e3e0937a9c96165c73b5846453860f9d21469 Mon Sep 17 00:00:00 2001 From: akhoroshev Date: Tue, 26 Sep 2023 07:45:02 +0300 Subject: [PATCH 25/43] fix race condition (#460) --- src/turbomind/models/llama/LlamaBatch.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 995f15b710..5d8d7d0411 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -30,6 +30,9 @@ void LlamaBatch::verifyRequests(std::vector>& stop_r auto invalidate = [](const char* type, std::shared_ptr& req, int ec) { TM_LOG_WARNING("[verifyRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec); + // We don't need a barrier there because + // this lambda is called only for new requests + // which are visible only for rank = 0 thread. req->signal.set_value(ec); req.reset(); }; @@ -139,6 +142,12 @@ void LlamaBatch::handleStopRequests(const std::vector(), 0, sizeof(int), stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); } + + // When the signal is set threads from LlamaV2::forward can exit + // and free inputs/outputs tensors. + // Therefore we need to make sure that no threads from LlamaV2::internalThreadEntry + // are accessing the tensors. + llama_->shared_state_->barrier->wait(); if (rank_ == 0) { r->signal.set_value(ec); } @@ -1112,6 +1121,11 @@ void LlamaBatch::finishRequest(int index, bool force_end) llama_->kv_cache_mgr_->update(cached_seq_[index], stream_); } + // When the signal is set threads from LlamaV2::forward can exit + // and free inputs/outputs tensors. + // Therefore we need to make sure that no threads from LlamaV2::internalThreadEntry + // are accessing the tensors. 
+ llama_->shared_state_->barrier->wait(); if (rank_ == 0) { requests_[index]->signal.set_value(0); } From 22cd7d15151d9f360a6662e88056c6193c923707 Mon Sep 17 00:00:00 2001 From: aisensiy Date: Tue, 26 Sep 2023 14:34:40 +0800 Subject: [PATCH 26/43] Fix compatibility issues with Pydantic 2 (#465) --- lmdeploy/serve/openai/api_server.py | 6 +++--- lmdeploy/serve/openai/protocol.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 28096af9b6..e1af990a5e 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -60,7 +60,7 @@ def create_error_response(status: HTTPStatus, message: str): return JSONResponse( ErrorResponse(message=message, type='invalid_request_error', - code=status.value).dict()) + code=status.value).model_dump()) async def check_request(request) -> Optional[JSONResponse]: @@ -152,7 +152,7 @@ def create_stream_response_json( model=model_name, choices=[choice_data], ) - response_json = response.json(ensure_ascii=False) + response_json = response.model_dump_json() return response_json @@ -167,7 +167,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: chunk = ChatCompletionStreamResponse(id=request_id, choices=[choice_data], model=model_name) - data = chunk.json(exclude_unset=True, ensure_ascii=False) + data = chunk.model_dump_json(exclude_unset=True) yield f'data: {data}\n\n' async for res in result_generator: diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 8d5b387572..8f2919a1a5 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -84,7 +84,7 @@ class ChatCompletionResponseChoice(BaseModel): """Chat completion response choices.""" index: int message: ChatMessage - finish_reason: Optional[Literal['stop', 'length']] + finish_reason: Optional[Literal['stop', 'length']] = None class ChatCompletionResponse(BaseModel): @@ -107,7 +107,7 @@ class ChatCompletionResponseStreamChoice(BaseModel): """Chat completion response stream choice.""" index: int delta: DeltaMessage - finish_reason: Optional[Literal['stop', 'length']] + finish_reason: Optional[Literal['stop', 'length']] = None class ChatCompletionStreamResponse(BaseModel): @@ -142,7 +142,7 @@ class CompletionResponseChoice(BaseModel): index: int text: str logprobs: Optional[int] = None - finish_reason: Optional[Literal['stop', 'length']] + finish_reason: Optional[Literal['stop', 'length']] = None class CompletionResponse(BaseModel): From 97dcdff7ab0dffdbcdc89294f9819739f98dccce Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Tue, 26 Sep 2023 15:26:59 +0800 Subject: [PATCH 27/43] fix benchmark serving cannot use Qwen tokenizer (#443) * fix benchmark serving cannot use Qwen tokenizer * update benchmark readme --- benchmark/README.md | 16 +++++++++++++++- benchmark/profile_restful_api.py | 23 +++++------------------ benchmark/profile_serving.py | 24 +++++------------------- 3 files changed, 25 insertions(+), 38 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 4fb08db4c6..b5573ae2b8 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -42,7 +42,21 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r python profile_serving.py \ ${TritonServerAddress} \ - /path/to/tokenizer \ + /path/to/tokenizer \ # ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer. 
+ ShareGPT_V3_unfiltered_cleaned_split.json \ + --concurrency 64 +``` + +## profile restful api + +`profile_restful_api.py` is used to do benchmark on api server. + +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +python profile_restful_api.py \ + ${ServerAddress} \ + /path/to/tokenizer \ # ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer. ShareGPT_V3_unfiltered_cleaned_split.json \ --concurrency 64 ``` diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index 5c06664741..ff1db7b4b5 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -1,6 +1,5 @@ import json import multiprocessing as mp -import os import random import time from typing import Iterable, List @@ -8,8 +7,8 @@ import fire import numpy as np import requests -from sentencepiece import SentencePieceProcessor +from lmdeploy.turbomind.tokenizer import Tokenizer from lmdeploy.utils import get_logger @@ -45,20 +44,6 @@ def get_streaming_response(prompt: str, yield output, tokens -class Tokenizer: - - def __init__(self, model_path: str): - # reload tokenizer - assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) - - def encode(self, prompts: List): - prompts_token_ids = self.sp_model.Encode(prompts, - add_bos=False, - add_eos=False) - return [len(token_ids) for token_ids in prompts_token_ids] - - def infer(server_addr: str, session_id: int, req_queue: mp.Queue, res_que: mp.Queue): stats = [] @@ -132,8 +117,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int, start = time.perf_counter() tokenizer = Tokenizer(tokenizer_path) - prompts_token_lens = tokenizer.encode(prompts) - completions_token_lens = tokenizer.encode(completions) + prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts] + completions_token_lens = [ + len(tokenizer.encode(prompt)) for prompt in completions + ] print(f'elapsed time for tokenization: ' f'{round(time.perf_counter() - start, 2)} s') diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py index 8973352bc1..c60e0799dc 100644 --- a/benchmark/profile_serving.py +++ b/benchmark/profile_serving.py @@ -1,30 +1,14 @@ import json import logging import multiprocessing as mp -import os import random import time -from typing import List import fire import numpy as np -from sentencepiece import SentencePieceProcessor from lmdeploy.serve.turbomind.chatbot import Chatbot - - -class Tokenizer: - - def __init__(self, model_path: str): - # reload tokenizer - assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) - - def encode(self, prompts: List): - prompts_token_ids = self.sp_model.Encode(prompts, - add_bos=False, - add_eos=False) - return [len(token_ids) for token_ids in prompts_token_ids] +from lmdeploy.turbomind.tokenizer import Tokenizer def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue): @@ -103,8 +87,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int, start = time.perf_counter() tokenizer = Tokenizer(tokenizer_path) - prompts_token_lens = tokenizer.encode(prompts) - completions_token_lens = tokenizer.encode(completions) + prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts] + completions_token_lens = [ + len(tokenizer.encode(prompt)) for prompt in completions + ] print(f'elapsed 
time for tokenization: ' f'{round(time.perf_counter() - start, 2)} s') From 5d87c20fad2b816e885aeb3d4e3e2f2b368bf909 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 26 Sep 2023 20:36:00 +0800 Subject: [PATCH 28/43] Fix memory leak (#488) * Fix memory leak * modern c++ --- src/turbomind/models/llama/LlamaWeight.cc | 4 ++++ src/turbomind/triton_backend/llama/LlamaTritonModel.cc | 6 +++--- .../triton_backend/llama/LlamaTritonModelInstance.h | 4 ++-- src/turbomind/triton_backend/transformer_triton_backend.hpp | 1 + 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 80e561442a..e1287f471b 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -72,6 +72,10 @@ LlamaWeight::~LlamaWeight() pre_decoder_embedding_table = nullptr; post_decoder_embedding_kernel = nullptr; + + for (auto& p : decoder_layer_weights) { + delete p; + } } template diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index 57d5c9be5b..e670753701 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -249,13 +249,13 @@ std::unique_ptr> LlamaTritonModel::createSh cuda_device_prop_ptr.get()); return std::make_unique>( - LlamaTritonSharedModelInstance{std::move(llama), - shared_weights_[device_id], - std::move(allocator), + LlamaTritonSharedModelInstance{std::move(allocator), std::move(cublas_algo_map), std::move(cublas_wrapper_mutex), std::move(cublas_wrapper), std::move(cuda_device_prop_ptr), + shared_weights_[device_id], + std::move(llama), session_len_}); } diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h index 1713d96bef..4dff6eb24c 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h @@ -29,13 +29,13 @@ namespace ft = turbomind; template struct LlamaTritonSharedModelInstance { - std::unique_ptr> llm; - std::shared_ptr> llm_weight; std::unique_ptr> allocator; std::unique_ptr cublas_algo_map; std::unique_ptr cublas_wrapper_mutex; std::unique_ptr cublas_wrapper; std::unique_ptr cuda_device_prop_ptr; + std::shared_ptr> llm_weight; + std::unique_ptr> llm; const int session_len; }; diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp index 4026048e31..8f1f88f5a6 100644 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ b/src/turbomind/triton_backend/transformer_triton_backend.hpp @@ -271,6 +271,7 @@ struct AbstractTransformerModel; struct AbstractTransformerModelInstance; struct AbstractTransformerModelInstance { + virtual ~AbstractTransformerModelInstance() = default; virtual std::shared_ptr> forward(std::shared_ptr> input_tensors) = 0; From b58a9dffb12bdc36ee9b3f251185179602468de9 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 26 Sep 2023 20:51:27 +0800 Subject: [PATCH 29/43] bump version to v0.0.10 (#474) --- lmdeploy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/version.py b/lmdeploy/version.py index 519acfc0d8..0a68f5e0c8 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Tuple -__version__ = '0.0.9' +__version__ = '0.0.10' short_version = __version__ From 026841447cd2e143b6d5d2fa9621b3bd1d975e25 Mon Sep 17 00:00:00 2001 From: aisensiy Date: Mon, 9 Oct 2023 10:59:22 +0800 Subject: [PATCH 30/43] Support CORS for openai api server (#481) * Support CORS for openai api server * Remove unnecessary var * Add CORS support follow the same style with vllm --- lmdeploy/serve/openai/api_server.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index e1af990a5e..647c36609c 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -3,11 +3,12 @@ import os import time from http import HTTPStatus -from typing import AsyncGenerator, Optional +from typing import AsyncGenerator, List, Optional import fire import uvicorn from fastapi import BackgroundTasks, FastAPI, Request +from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse from lmdeploy.serve.async_engine import AsyncEngine @@ -321,7 +322,11 @@ def main(model_path: str, server_name: str = 'localhost', server_port: int = 23333, instance_num: int = 32, - tp: int = 1): + tp: int = 1, + allow_origins: List[str] = ['*'], + allow_credentials: bool = True, + allow_methods: List[str] = ['*'], + allow_headers: List[str] = ['*']): """An example to perform model inference through the command line interface. @@ -331,7 +336,20 @@ def main(model_path: str, server_port (int): server port instance_num (int): number of instances of turbomind model tp (int): tensor parallel + allow_origins (List[str]): a list of allowed origins for CORS + allow_credentials (bool): whether to allow credentials for CORS + allow_methods (List[str]): a list of allowed HTTP methods for CORS + allow_headers (List[str]): a list of allowed HTTP headers for CORS """ + if allow_origins: + app.add_middleware( + CORSMiddleware, + allow_origins=allow_origins, + allow_credentials=allow_credentials, + allow_methods=allow_methods, + allow_headers=allow_headers, + ) + VariableInterface.async_engine = AsyncEngine(model_path=model_path, instance_num=instance_num, tp=tp) From 19fea86c3a0c287c1345610ce683750b3472010d Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Mon, 9 Oct 2023 11:04:52 +0800 Subject: [PATCH 31/43] Change `shared_instance` type from `weakptr` to `shared_ptr` (#507) * change shared_instances_ from weakptr to sharedptr * update --- src/turbomind/triton_backend/llama/LlamaTritonModel.cc | 4 ++-- src/turbomind/triton_backend/llama/LlamaTritonModel.h | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index e670753701..8a7674a2ab 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -273,7 +273,7 @@ LlamaTritonModel::createModelInstance(int std::shared_ptr> instance; { std::lock_guard lock(shared_mutexes_[device_id]); - instance = shared_instances_[device_id].lock(); + instance = shared_instances_[device_id]; if (!instance) { instance = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm); instance->llm->setFfiLock(ffi_lock_); @@ -347,7 +347,7 @@ LlamaTritonModel::createNcclParams(const int node_id, const int device_id_sta // create nccl group when there are non-occupied devices for (int i = 0; i < 
device_count; ++i) { std::lock_guard lock(shared_mutexes_[i]); - if (shared_instances_[i].expired()) { + if (shared_instances_[i] == nullptr) { need_nccl_params = true; break; } diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 332000ce62..b7d8f439ca 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -108,9 +108,8 @@ struct LlamaTritonModel: public AbstractTransformerModel { std::shared_ptr::SharedState> shared_state_; - // weak_ptr is used so that the instances get released when all strong references are gone - std::vector>> shared_instances_; - std::deque shared_mutexes_; // is locking really needed? + std::vector>> shared_instances_; + std::deque shared_mutexes_; // is locking really needed? bool is_fp16_; int enable_custom_all_reduce_ = 0; From fbd9770a9b7c5bc2eb2dbf66b43e1c54ef165185 Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 10 Oct 2023 01:56:52 +0800 Subject: [PATCH 32/43] set the default value of being 0 (#532) --- lmdeploy/turbomind/turbomind.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index f8a7444546..51f0582fdf 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -229,7 +229,7 @@ def stream_infer(self, request_output_len: int = 512, sequence_start: bool = True, sequence_end: bool = False, - step=1, + step=0, stop=False, top_p=0.8, top_k=40, From 759e1ddf63f5963dc10f6d5e75a64f3a0f2a0395 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 11 Oct 2023 10:22:19 +0800 Subject: [PATCH 33/43] make IPv6 compatible, safe run for coroutine interrupting (#487) * make IPv6 compatible, safe run for coroutine interrupting * instance_id -> session_id and fix api_client.py * update doc * remove useless faq * safe ip mapping * update app.py * remove print * update doc --- benchmark/profile_restful_api.py | 6 +- docs/en/restful_api.md | 13 ++-- docs/zh_cn/restful_api.md | 13 ++-- lmdeploy/serve/async_engine.py | 94 +++++++++++++++++------------ lmdeploy/serve/gradio/app.py | 23 +++---- lmdeploy/serve/openai/api_client.py | 16 +++-- lmdeploy/serve/openai/api_server.py | 70 ++++++++++----------- lmdeploy/serve/openai/protocol.py | 9 ++- 8 files changed, 137 insertions(+), 107 deletions(-) diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index ff1db7b4b5..ed922bfd7a 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -14,7 +14,7 @@ def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int, stream: bool = True, sequence_start: bool = True, @@ -24,7 +24,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -36,7 +36,7 @@ def get_streaming_response(prompt: str, stream=stream) for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b'\0'): + delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data['text'] diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md index c5a4a0de07..cb70e26375 100644 --- a/docs/en/restful_api.md +++ b/docs/en/restful_api.md @@ -22,7 +22,7 @@ from typing import 
Iterable, List def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int, stream: bool = True, sequence_start: bool = True, @@ -32,7 +32,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -41,7 +41,7 @@ def get_streaming_response(prompt: str, response = requests.post( api_url, headers=headers, json=pload, stream=stream) for chunk in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b'\0'): + chunk_size=8192, decode_unicode=False, delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data['text'] @@ -91,7 +91,7 @@ curl http://{server_ip}:{server_port}/generate \ -H "Content-Type: application/json" \ -d '{ "prompt": "Hello! How are you?", - "instance_id": 1, + "session_id": 1, "sequence_start": true, "sequence_end": true }' @@ -146,11 +146,10 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True 2. When OOM appeared at the server side, please reduce the number of `instance_num` when lanching the service. -3. When the request with the same `instance_id` to `generate` got a empty return value and a negative `tokens`, please consider setting `sequence_start=false` for the second question and the same for the afterwards. +3. When the request with the same `session_id` to `generate` got a empty return value and a negative `tokens`, please consider setting `sequence_start=false` for the second question and the same for the afterwards. 4. Requests were previously being handled sequentially rather than concurrently. To resolve this issue, - - kindly provide unique instance_id values when calling the `generate` API or else your requests may be associated with client IP addresses - - additionally, setting `stream=true` enables processing multiple requests simultaneously + - kindly provide unique session_id values when calling the `generate` API or else your requests may be associated with client IP addresses 5. Both `generate` api and `v1/chat/completions` upport engaging in multiple rounds of conversation, where input `prompt` or `messages` consists of either single strings or entire chat histories.These inputs are interpreted using multi-turn dialogue modes. 
However, ff you want to turn the mode of and manage the chat history in clients, please the parameter `sequence_end: true` when utilizing the `generate` function, or specify `renew_session: true` when making use of `v1/chat/completions` diff --git a/docs/zh_cn/restful_api.md b/docs/zh_cn/restful_api.md index ab35ead124..2b56fa0f26 100644 --- a/docs/zh_cn/restful_api.md +++ b/docs/zh_cn/restful_api.md @@ -24,7 +24,7 @@ from typing import Iterable, List def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int, stream: bool = True, sequence_start: bool = True, @@ -34,7 +34,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -43,7 +43,7 @@ def get_streaming_response(prompt: str, response = requests.post( api_url, headers=headers, json=pload, stream=stream) for chunk in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b'\0'): + chunk_size=8192, decode_unicode=False, delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data['text'] @@ -93,7 +93,7 @@ curl http://{server_ip}:{server_port}/generate \ -H "Content-Type: application/json" \ -d '{ "prompt": "Hello! How are you?", - "instance_id": 1, + "session_id": 1, "sequence_start": true, "sequence_end": true }' @@ -148,12 +148,11 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True 2. 当服务端显存 OOM 时,可以适当减小启动服务时的 `instance_num` 个数 -3. 当同一个 `instance_id` 的请求给 `generate` 函数后,出现返回空字符串和负值的 `tokens`,应该是第二次问话没有设置 `sequence_start=false` +3. 当同一个 `session_id` 的请求给 `generate` 函数后,出现返回空字符串和负值的 `tokens`,应该是第二次问话没有设置 `sequence_start=false` 4. 如果感觉请求不是并发地被处理,而是一个一个地处理,请设置好以下参数: - - 不同的 instance_id 传入 `generate` api。否则,我们将自动绑定会话 id 为请求端的 ip 地址编号。 - - 设置 `stream=true` 使模型在前向传播时可以允许其他请求进入被处理 + - 不同的 session_id 传入 `generate` api。否则,我们将自动绑定会话 id 为请求端的 ip 地址编号。 5. 
`generate` api 和 `v1/chat/completions` 均支持多轮对话。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单词提问,也可以是一段对话历史。 两个 api 都是默认开启多伦对话的,如果你想关闭这个功能,然后在客户端管理会话记录,请设置 `sequence_end: true` 传入 `generate`,或者设置 diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 40f87ac0ea..e2c4b36840 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -47,14 +47,31 @@ def __init__(self, model_path, instance_num=32, tp=1) -> None: self.starts = [None] * instance_num self.steps = {} + def stop_session(self, session_id: int): + instance_id = session_id % self.instance_num + input_ids = self.tokenizer.encode('') + for outputs in self.generators[instance_id].stream_infer( + session_id, + input_ids, + request_output_len=0, + sequence_start=False, + sequence_end=False, + stop=True): + pass + self.available[instance_id] = True + @contextmanager - def safe_run(self, instance_id: int, stop: bool = False): + def safe_run(self, instance_id: int, session_id: Optional[int] = None): self.available[instance_id] = False - yield + try: + yield + except (Exception, asyncio.CancelledError) as e: # noqa + self.stop_session(session_id) self.available[instance_id] = True - async def get_embeddings(self, prompt): - prompt = self.model.get_prompt(prompt) + async def get_embeddings(self, prompt, do_prerpocess=False): + if do_prerpocess: + prompt = self.model.get_prompt(prompt) input_ids = self.tokenizer.encode(prompt) return input_ids @@ -68,7 +85,7 @@ async def get_generator(self, instance_id: int, stop: bool = False): async def generate( self, messages, - instance_id, + session_id, stream_response=True, sequence_start=True, sequence_end=False, @@ -85,7 +102,7 @@ async def generate( Args: messages (str | List): chat history or prompt - instance_id (int): actually request host ip + session_id (int): the session id stream_response (bool): whether return responses streamingly request_output_len (int): output token nums sequence_start (bool): indicator for starting a sequence @@ -102,8 +119,7 @@ async def generate( 1.0 means no penalty ignore_eos (bool): indicator for ignoring eos """ - session_id = instance_id - instance_id %= self.instance_num + instance_id = session_id % self.instance_num if str(session_id) not in self.steps: self.steps[str(session_id)] = 0 if step != 0: @@ -119,7 +135,7 @@ async def generate( finish_reason) else: generator = await self.get_generator(instance_id, stop) - with self.safe_run(instance_id): + with self.safe_run(instance_id, session_id): response_size = 0 async for outputs in generator.async_stream_infer( session_id=session_id, @@ -188,14 +204,14 @@ async def generate_openai( instance_id %= self.instance_num sequence_start = False generator = await self.get_generator(instance_id) - self.available[instance_id] = False if renew_session: # renew a session empty_input_ids = self.tokenizer.encode('') for outputs in generator.stream_infer(session_id=session_id, input_ids=[empty_input_ids], request_output_len=0, sequence_start=False, - sequence_end=True): + sequence_end=True, + stop=True): pass self.steps[str(session_id)] = 0 if str(session_id) not in self.steps: @@ -212,31 +228,31 @@ async def generate_openai( yield GenOut('', self.steps[str(session_id)], len(input_ids), 0, finish_reason) else: - response_size = 0 - async for outputs in generator.async_stream_infer( - session_id=session_id, - input_ids=[input_ids], - stream_output=stream_response, - request_output_len=request_output_len, - sequence_start=(sequence_start), - sequence_end=False, - 
step=self.steps[str(session_id)], - stop=stop, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - ignore_eos=ignore_eos, - random_seed=seed if sequence_start else None): - res, tokens = outputs[0] - # decode res - response = self.tokenizer.decode(res.tolist(), - offset=response_size) - # response, history token len, input token len, gen token len - yield GenOut(response, self.steps[str(session_id)], - len(input_ids), tokens, finish_reason) - response_size = tokens - - # update step - self.steps[str(session_id)] += len(input_ids) + tokens - self.available[instance_id] = True + with self.safe_run(instance_id, session_id): + response_size = 0 + async for outputs in generator.async_stream_infer( + session_id=session_id, + input_ids=[input_ids], + stream_output=stream_response, + request_output_len=request_output_len, + sequence_start=(sequence_start), + sequence_end=False, + step=self.steps[str(session_id)], + stop=stop, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + ignore_eos=ignore_eos, + random_seed=seed if sequence_start else None): + res, tokens = outputs[0] + # decode res + response = self.tokenizer.decode(res.tolist(), + offset=response_size) + # response, history len, input len, generation len + yield GenOut(response, self.steps[str(session_id)], + len(input_ids), tokens, finish_reason) + response_size = tokens + + # update step + self.steps[str(session_id)] += len(input_ids) + tokens diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py index 954a5bcd32..71db7a2749 100644 --- a/lmdeploy/serve/gradio/app.py +++ b/lmdeploy/serve/gradio/app.py @@ -12,6 +12,7 @@ from lmdeploy.serve.gradio.css import CSS from lmdeploy.serve.openai.api_client import (get_model_list, get_streaming_response) +from lmdeploy.serve.openai.api_server import ip2id from lmdeploy.serve.turbomind.chatbot import Chatbot THEME = gr.themes.Soft( @@ -37,7 +38,7 @@ def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot, instruction = state_chatbot[-1][0] session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) bot_response = llama_chatbot.stream_infer( session_id, instruction, f'{session_id}-{len(state_chatbot)}') @@ -166,7 +167,7 @@ def chat_stream_restful( """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) bot_summarized_response = '' state_chatbot = state_chatbot + [(instruction, None)] @@ -176,7 +177,7 @@ def chat_stream_restful( for response, tokens, finish_reason in get_streaming_response( instruction, f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=512, sequence_start=(len(state_chatbot) == 1), sequence_end=False): @@ -212,12 +213,12 @@ def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State, session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session for response, tokens, finish_reason in get_streaming_response( '', f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=0, sequence_start=False, sequence_end=True): 
@@ -241,11 +242,11 @@ def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button, """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session for out in get_streaming_response('', f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=0, sequence_start=False, sequence_end=False, @@ -259,7 +260,7 @@ def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button, messages.append(dict(role='assistant', content=qa[1])) for out in get_streaming_response(messages, f'{InterFace.restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=0, sequence_start=True, sequence_end=False): @@ -346,7 +347,7 @@ async def chat_stream_local( """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) bot_summarized_response = '' state_chatbot = state_chatbot + [(instruction, None)] @@ -391,7 +392,7 @@ async def reset_local_func(instruction_txtbox: gr.Textbox, session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session async for out in InterFace.async_engine.generate('', session_id, @@ -419,7 +420,7 @@ async def cancel_local_func(state_chatbot: gr.State, cancel_btn: gr.Button, """ session_id = threading.current_thread().ident if request is not None: - session_id = int(request.kwargs['client']['host'].replace('.', '')) + session_id = ip2id(request.kwargs['client']['host']) # end the session async for out in InterFace.async_engine.generate('', session_id, diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py index 449b8a294a..a8718331be 100644 --- a/lmdeploy/serve/openai/api_client.py +++ b/lmdeploy/serve/openai/api_client.py @@ -17,7 +17,7 @@ def get_model_list(api_url: str): def get_streaming_response(prompt: str, api_url: str, - instance_id: int, + session_id: int, request_output_len: int = 512, stream: bool = True, sequence_start: bool = True, @@ -28,7 +28,7 @@ def get_streaming_response(prompt: str, pload = { 'prompt': prompt, 'stream': stream, - 'instance_id': instance_id, + 'session_id': session_id, 'request_output_len': request_output_len, 'sequence_start': sequence_start, 'sequence_end': sequence_end, @@ -41,7 +41,7 @@ def get_streaming_response(prompt: str, stream=stream) for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b'\0'): + delimiter=b'\n'): if chunk: data = json.loads(chunk.decode('utf-8')) output = data.pop('text', '') @@ -62,12 +62,20 @@ def main(restful_api_url: str, session_id: int = 0): while True: prompt = input_prompt() if prompt == 'exit': + for output, tokens, finish_reason in get_streaming_response( + '', + f'{restful_api_url}/generate', + session_id=session_id, + request_output_len=0, + sequence_start=(nth_round == 1), + sequence_end=True): + pass exit(0) else: for output, tokens, finish_reason in get_streaming_response( prompt, f'{restful_api_url}/generate', - instance_id=session_id, + session_id=session_id, request_output_len=512, sequence_start=(nth_round == 1), sequence_end=False): diff --git a/lmdeploy/serve/openai/api_server.py 
b/lmdeploy/serve/openai/api_server.py index 647c36609c..94271c4b9b 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -import json import os import time from http import HTTPStatus @@ -7,7 +6,7 @@ import fire import uvicorn -from fastapi import BackgroundTasks, FastAPI, Request +from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse @@ -16,8 +15,8 @@ ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingsRequest, - EmbeddingsResponse, ErrorResponse, GenerateRequest, ModelCard, ModelList, - ModelPermission, UsageInfo) + EmbeddingsResponse, ErrorResponse, GenerateRequest, GenerateResponse, + ModelCard, ModelList, ModelPermission, UsageInfo) os.environ['TM_LOG_LEVEL'] = 'ERROR' @@ -73,6 +72,16 @@ async def check_request(request) -> Optional[JSONResponse]: return ret +def ip2id(host_ip: str): + """Convert host ip address to session id.""" + if '.' in host_ip: # IPv4 + return int(host_ip.replace('.', '')[-8:]) + if ':' in host_ip: # IPv6 + return int(host_ip.replace(':', '')[-8:], 16) + print('Warning, could not get session id from ip, set it 0') + return 0 + + @app.post('/v1/chat/completions') async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Request = None): @@ -106,19 +115,18 @@ async def chat_completions_v1(request: ChatCompletionRequest, - presence_penalty (replaced with repetition_penalty) - frequency_penalty (replaced with repetition_penalty) """ - instance_id = int(raw_request.client.host.replace('.', '')) - + session_id = ip2id(raw_request.client.host) error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret model_name = request.model - request_id = str(instance_id) + request_id = str(session_id) created_time = int(time.time()) result_generator = VariableInterface.async_engine.generate_openai( request.messages, - instance_id, + session_id, True, # always use stream to enable batching request.renew_session, request_output_len=request.max_tokens if request.max_tokens else 512, @@ -128,15 +136,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, repetition_penalty=request.repetition_penalty, ignore_eos=request.ignore_eos) - async def abort_request() -> None: - async for _ in VariableInterface.async_engine.generate_openai( - request.messages, - instance_id, - True, - request.renew_session, - stop=True): - pass - def create_stream_response_json( index: int, text: str, @@ -181,12 +180,8 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: # Streaming response if request.stream: - background_tasks = BackgroundTasks() - # Abort the request if the client disconnects. - background_tasks.add_task(abort_request) return StreamingResponse(completion_stream_generator(), - media_type='text/event-stream', - background=background_tasks) + media_type='text/event-stream') # Non-streaming response final_res = None @@ -194,7 +189,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: async for res in result_generator: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. 
- await abort_request() + VariableInterface.async_engine.stop_session(session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') final_res = res @@ -257,7 +252,7 @@ async def generate(request: GenerateRequest, raw_request: Request = None): The request should be a JSON object with the following fields: - prompt: the prompt to use for the generation. - - instance_id: determine which instance will be called. If not specified + - session_id: determine which instance will be called. If not specified with a value other than -1, using host ip directly. - sequence_start (bool): indicator for starting a sequence. - sequence_end (bool): indicator for ending a sequence @@ -275,13 +270,13 @@ async def generate(request: GenerateRequest, raw_request: Request = None): 1.0 means no penalty - ignore_eos (bool): indicator for ignoring eos """ - if request.instance_id == -1: - instance_id = int(raw_request.client.host.replace('.', '')) - request.instance_id = instance_id + if request.session_id == -1: + session_id = ip2id(raw_request.client.host) + request.session_id = session_id generation = VariableInterface.async_engine.generate( request.prompt, - request.instance_id, + request.session_id, stream_response=True, # always use stream to enable batching sequence_start=request.sequence_start, sequence_end=request.sequence_end, @@ -296,21 +291,26 @@ async def generate(request: GenerateRequest, raw_request: Request = None): # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: async for out in generation: - ret = { - 'text': out.response, - 'tokens': out.generate_token_len, - 'finish_reason': out.finish_reason - } - yield (json.dumps(ret) + '\0').encode('utf-8') + chunk = GenerateResponse(text=out.response, + tokens=out.generate_token_len, + finish_reason=out.finish_reason) + data = chunk.model_dump_json() + yield f'{data}\n' if request.stream: - return StreamingResponse(stream_results()) + return StreamingResponse(stream_results(), + media_type='text/event-stream') else: ret = {} text = '' tokens = 0 finish_reason = None async for out in generation: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. 
+ VariableInterface.async_engine.stop_session(session_id) + return create_error_response(HTTPStatus.BAD_REQUEST, + 'Client disconnected') text += out.response tokens = out.generate_token_len finish_reason = out.finish_reason diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 8f2919a1a5..b4eeadff74 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -190,7 +190,7 @@ class EmbeddingsResponse(BaseModel): class GenerateRequest(BaseModel): """Generate request.""" prompt: Union[str, List[Dict[str, str]]] - instance_id: int = -1 + session_id: int = -1 sequence_start: bool = True sequence_end: bool = False stream: bool = False @@ -201,3 +201,10 @@ class GenerateRequest(BaseModel): temperature: float = 0.8 repetition_penalty: float = 1.0 ignore_eos: bool = False + + +class GenerateResponse(BaseModel): + """Generate response.""" + text: str + tokens: int + finish_reason: Optional[Literal['stop', 'length']] = None From 169d088adb755a237a64bf70973374768ea1fc50 Mon Sep 17 00:00:00 2001 From: Shahrukh Khan Date: Wed, 11 Oct 2023 08:25:44 +0200 Subject: [PATCH 34/43] Fix typo in `docs/en/pytorch.md` (#539) --- docs/en/pytorch.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/pytorch.md b/docs/en/pytorch.md index 4bc3110735..e3662ab373 100644 --- a/docs/en/pytorch.md +++ b/docs/en/pytorch.md @@ -17,7 +17,7 @@ python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL ```shell python -m lmdeploy.pytorch.chat \ $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ - --temperature 0 --max-histroy 0 + --temperature 0 --max-history 0 ``` **Example 3**: Accelerate with deepspeed inference From 0d2a151ec81344e81fd345f3e53edd65ff856d5b Mon Sep 17 00:00:00 2001 From: akhoroshev Date: Wed, 11 Oct 2023 09:32:52 +0300 Subject: [PATCH 35/43] [bug] fix mismatched shape for decoder output tensor (#517) --- src/turbomind/models/llama/LlamaV2.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turbomind/models/llama/LlamaV2.cc b/src/turbomind/models/llama/LlamaV2.cc index 8a1de364b0..8768e7fd05 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -256,7 +256,7 @@ void LlamaV2::contextDecode(T* deocder_output, }; std::unordered_map decoder_output_tensors{ - {"decoder_output", {MEMORY_GPU, dtype, {bsz, max_input_len, hidden_units_}, context_decoder_output_buf}}, + {"decoder_output", {MEMORY_GPU, dtype, {token_num, hidden_units_}, context_decoder_output_buf}}, {"key_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, k_cache_ptr}}, {"value_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, v_cache_ptr}}, {"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, deocder_output}}}; From 27e1247793c97b5c8fb49572851a7fa77149beaa Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 12 Oct 2023 11:47:52 +0800 Subject: [PATCH 36/43] update huggingface internlm-chat-7b model url (#546) --- README.md | 2 +- README_zh-CN.md | 2 +- docs/en/kv_int8.md | 4 ++-- docs/zh_cn/kv_int8.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3b59a65a0c..b1cb8f420f 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ pip install lmdeploy # Make sure you have git-lfs installed (https://git-lfs.com) git lfs install -git clone https://huggingface.co/internlm/internlm-chat-7b /path/to/internlm-chat-7b +git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internlm-chat-7b # if you want to clone without 
large files – just their pointers # prepend your git clone with the following env var: diff --git a/README_zh-CN.md b/README_zh-CN.md index 323c0654f2..63bdb105ac 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -110,7 +110,7 @@ pip install lmdeploy # Make sure you have git-lfs installed (https://git-lfs.com) git lfs install -git clone https://huggingface.co/internlm/internlm-chat-7b /path/to/internlm-chat-7b +git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internlm-chat-7b # if you want to clone without large files – just their pointers # prepend your git clone with the following env var: diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md index bbda6a239f..1f5f5aa125 100644 --- a/docs/en/kv_int8.md +++ b/docs/en/kv_int8.md @@ -69,7 +69,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace ## GPU Memory Test -The test object is the [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) model. +The test object is the [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b-v1_1) model. Testing method: 1. Use `deploy.py` to convert the model, modify the maximum concurrency in the `workspace` configuration; adjust the number of requests in `llama_config.ini`. @@ -93,7 +93,7 @@ As can be seen, the fp16 version requires 1030MB of GPU memory for each concurre ## Accuracy Test -The test object is the [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) command model. +The test object is the [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b-v1_1) command model. Below is the result of PTQ quantization of `kCacheKVInt8` method with only 128 randomly selected data from the c4 dataset. The accuracy was tested using [opencompass](https://github.com/InternLM/opencompass) before and after quantization. diff --git a/docs/zh_cn/kv_int8.md b/docs/zh_cn/kv_int8.md index e527b2be4a..3e006c6135 100644 --- a/docs/zh_cn/kv_int8.md +++ b/docs/zh_cn/kv_int8.md @@ -69,7 +69,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace ## 显存测试 -测试对象为 [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) 模型。 +测试对象为 [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b-v1_1) 模型。 测试方法: 1. 使用 `deploy.py` 转换模型,修改 `workspace` 配置中的最大并发数;调整 `llama_config.ini` 中的请求数 @@ -93,7 +93,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace ## 精度测试 -测试对象为 [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) 指令模型。 +测试对象为 [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b-v1_1) 指令模型。 以下是 `kCacheKVInt8` 方法仅从 c4 数据集,随机选择 128 条数据 PTQ 量化。量化前后均使用 [opencompass](https://github.com/InternLM/opencompass) 测试精度。 From b21239a8a3a5a33f3aed98cec7f22e73c0cb9ac9 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Thu, 12 Oct 2023 20:21:00 +0800 Subject: [PATCH 37/43] support deploy qwen-14b-chat (#482) * support deploy qwen-14b-chat * update README * load safetensors first --- README.md | 2 ++ README_zh-CN.md | 2 ++ lmdeploy/model.py | 1 + lmdeploy/serve/turbomind/deploy.py | 48 ++++++++++++++++++++---------- 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b1cb8f420f..a2de4d6ac0 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ ______________________________________________________________________ ## News 🎉 +- \[2023/09\] TurboMind supports Qwen-14B - \[2023/09\] TurboMind supports InternLM-20B - \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. 
Click [here](./docs/en/supported_models/codellama.md) for deployment guide - \[2023/09\] TurboMind supports Baichuan2-7B @@ -65,6 +66,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by | InternLM-7B | Yes | Yes | Yes | Yes | No | | InternLM-20B | Yes | Yes | Yes | Yes | No | | QWen-7B | Yes | Yes | Yes | No | No | +| QWen-14B | Yes | Yes | Yes | No | No | | Baichuan-7B | Yes | Yes | Yes | Yes | No | | Baichuan2-7B | Yes | Yes | No | No | No | | Code Llama | Yes | Yes | No | No | No | diff --git a/README_zh-CN.md b/README_zh-CN.md index 63bdb105ac..09c66c2826 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -20,6 +20,7 @@ ______________________________________________________________________ ## 更新 🎉 +- \[2023/09\] TurboMind 支持 Qwen-14B - \[2023/09\] TurboMind 支持 InternLM-20B 模型 - \[2023/09\] TurboMind 支持 Code Llama 所有功能:代码续写、填空、对话、Python专项。点击[这里](./docs/zh_cn/supported_models/codellama.md)阅读部署方法 - \[2023/09\] TurboMind 支持 Baichuan2-7B @@ -66,6 +67,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht | InternLM-7B | Yes | Yes | Yes | Yes | No | | InternLM-20B | Yes | Yes | Yes | Yes | No | | QWen-7B | Yes | Yes | Yes | No | No | +| QWen-14B | Yes | Yes | Yes | No | No | | Baichuan-7B | Yes | Yes | Yes | Yes | No | | Baichuan2-7B | Yes | Yes | No | No | No | | Code Llama | Yes | Yes | No | No | No | diff --git a/lmdeploy/model.py b/lmdeploy/model.py index fbc6736f9a..3bfc59aefa 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -448,6 +448,7 @@ def messages2prompt(self, messages, sequence_start=True): return ret +@MODELS.register_module(name='qwen-14b') @MODELS.register_module(name='qwen-7b') class Qwen7BChat(BaseModel): """Chat template for Qwen-7B-Chat.""" diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py index 1c2b1becc3..81129623ef 100644 --- a/lmdeploy/serve/turbomind/deploy.py +++ b/lmdeploy/serve/turbomind/deploy.py @@ -11,6 +11,7 @@ import fire import safetensors import torch +from safetensors.torch import load_file from sentencepiece import SentencePieceProcessor import lmdeploy @@ -108,6 +109,35 @@ def tokenizer_info_qwen(model_dir: str): return n_words, bos_id, eos_id +def load_checkpoint(model_path): + """Load checkpoint files into torch format. 
+ + Args: + model_path (str): the checkpoint folder + Returns: + Dict[str, torch.Tensor]: weight in torch format + """ + suffixes = ['.safetensors', '.bin'] + for suffix in suffixes: + files = [ + file for file in os.listdir(model_path) if file.endswith(suffix) + ] + if len(files) > 0: + break + + assert len(files) > 0, f'could not find checkpoints in {model_path}' + files = sorted(files) + print(files) + params = {} + for file in files: + if file.endswith('.bin'): + tmp = torch.load(osp.join(model_path, file), map_location='cpu') + else: + tmp = load_file(osp.join(model_path, file)) + params.update(tmp) + return params + + def export(model_name: str, num_layer: int, norm_eps: float, @@ -437,14 +467,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, _qweight = 'weight' _suffixes = [_qweight, 'bias'] - _files = [file for file in os.listdir(model_path) if file.endswith('.bin')] - _files = sorted(_files) - print(_files) - - _params = {} - for _file in _files: - _tmp = torch.load(osp.join(model_path, _file), map_location='cpu') - _params.update(_tmp) + _params = load_checkpoint(model_path) def get_tensor(name): """return tensor according its name.""" @@ -837,14 +860,7 @@ def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str, # convert weights from hf to turbomind model_params = {} - _files = [file for file in os.listdir(model_path) if file.endswith('.bin')] - _files = sorted(_files) - print(_files) - - _params = {} - for _file in _files: - _tmp = torch.load(osp.join(model_path, _file), map_location='cpu') - _params.update(_tmp) + _params = load_checkpoint(model_path) def get_tensor(name, trans=True): """return a transposed tensor according its name.""" From 6904053f1b40842a214a4704863c12ecc3957430 Mon Sep 17 00:00:00 2001 From: YiiSh Date: Fri, 13 Oct 2023 11:46:29 +0800 Subject: [PATCH 38/43] Fix typing of openai protocol. 
(#554) --- lmdeploy/serve/openai/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index b4eeadff74..756af1a4ca 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -28,7 +28,7 @@ class ModelPermission(BaseModel): allow_fine_tuning: bool = False organization: str = '*' group: Optional[str] = None - is_blocking: str = False + is_blocking: bool = False class ModelCard(BaseModel): From 77a268128ab0658d6ff0d5d80ee7f8b6e8e75923 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Fri, 13 Oct 2023 14:34:11 +0800 Subject: [PATCH 39/43] Add tp hint for deployment (#555) * add tp hint for deploy * fix lint * assert tp in turbomind * fix lint --- lmdeploy/serve/turbomind/deploy.py | 4 +++- lmdeploy/turbomind/turbomind.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py index 81129623ef..992f2f57df 100644 --- a/lmdeploy/serve/turbomind/deploy.py +++ b/lmdeploy/serve/turbomind/deploy.py @@ -972,7 +972,7 @@ def main(model_name: str, META's llama format, and 'hf' means huggingface format tokenizer_path (str): the path of tokenizer model dst_path (str): the destination path that saves outputs - tp (int): the number of GPUs used for tensor parallelism + tp (int): the number of GPUs used for tensor parallelism, should be 2^n quant_path (str): path of the quantized model, which can be None group_size (int): a parameter used in AWQ to quantize fp16 weights to 4 bits @@ -981,6 +981,8 @@ def main(model_name: str, f"'{model_name}' is not supported. " \ f'The supported models are: {MODELS.module_dict.keys()}' + assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n' + if model_format is None: model_format = 'qwen' if model_name == 'qwen-7b' else 'hf' diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 51f0582fdf..2c0f8924ba 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -86,6 +86,7 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1): node_num = 1 # read meta from model path + assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n' self.gpu_count = tp self.session_len = 2048 data_type = 'fp16' From 0b861c4879bfc511666421384709c58cc8450ea5 Mon Sep 17 00:00:00 2001 From: del-zhenwu Date: Fri, 13 Oct 2023 17:08:34 +0800 Subject: [PATCH 40/43] [doc] Update benchmark command in w4a16.md (#500) * [doc] Update benchmark command in w4a16.md * Update w4a16.md * Update w4a16.md add pip install nvidia-ml-py * [doc] Update w4a16.md * fix lint error Signed-off-by: del-zhenwu * [doc] update model_path & prompt_tokens Signed-off-by: del-zhenwu --------- Signed-off-by: del-zhenwu --- docs/en/w4a16.md | 8 ++++++-- docs/zh_cn/w4a16.md | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/en/w4a16.md b/docs/en/w4a16.md index 57d59f3ae6..fedffdc9d0 100644 --- a/docs/en/w4a16.md +++ b/docs/en/w4a16.md @@ -62,10 +62,14 @@ Memory (GB) comparison results between 4-bit and 16-bit model with context size | Llama-2-7B-chat | 15.1 | 6.3 | 16.2 | 7.5 | | Llama-2-13B-chat | OOM | 10.3 | OOM | 12.0 | +``` +pip install nvidia-ml-py +``` + ```shell python benchmark/profile_generation.py \ - ./workspace \ - --concurrency 1 --input_seqlen 1 --output_seqlen 512 + --model-path ./workspace \ + --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512 ``` ## 4-bit Weight Quantization diff --git 
a/docs/zh_cn/w4a16.md b/docs/zh_cn/w4a16.md index a67304b6d7..d28cb716dd 100644 --- a/docs/zh_cn/w4a16.md +++ b/docs/zh_cn/w4a16.md @@ -60,10 +60,14 @@ python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----serv | Llama-2-7B-chat | 15.1 | 6.3 | 16.2 | 7.5 | | Llama-2-13B-chat | OOM | 10.3 | OOM | 12.0 | +``` +pip install nvidia-ml-py +``` + ```shell python benchmark/profile_generation.py \ - ./workspace \ - --concurrency 1 --input_seqlen 1 --output_seqlen 512 + --model-path ./workspace \ + --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512 ``` ## 4bit 权重量化 From f4422fabfa3cb661b205d20bb6bb8875dc488b7c Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Mon, 16 Oct 2023 11:05:07 +0800 Subject: [PATCH 41/43] free runner disk (#552) * free runner disk * limit cpu * docker.yml * keep swap * keep swap --- .github/workflows/docker.yml | 25 ++++++++++++------------ .github/workflows/linux-x64-gpu.yml | 30 ++++++++++++++++++++++------- .github/workflows/pypi.yml | 12 ++++++++++++ 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9d6d62306b..dc51108078 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -24,19 +24,18 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v3 - - name: Check disk space - run: | - df -h - ls /opt/hostedtoolcache - rm -rf ${GITHUB_WORKSPACE}/.git - rm -rf /opt/hostedtoolcache/go - rm -rf /opt/hostedtoolcache/node - rm -rf /opt/hostedtoolcache/Ruby - rm -rf /opt/hostedtoolcache/CodeQL - cat /proc/cpuinfo | grep -ic proc - free - df -h - df . -h + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false - name: Get docker info run: | docker info diff --git a/.github/workflows/linux-x64-gpu.yml b/.github/workflows/linux-x64-gpu.yml index d2d2dbae7e..d940408ce7 100644 --- a/.github/workflows/linux-x64-gpu.yml +++ b/.github/workflows/linux-x64-gpu.yml @@ -27,14 +27,30 @@ permissions: jobs: cuda-118: runs-on: ubuntu-latest - container: openmmlab/lmdeploy-builder:cuda11.8 steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false - name: Checkout repository uses: actions/checkout@v3 - name: Build - run: | - source /opt/conda/bin/activate - conda activate py38 - mkdir build && cd build - bash ../generate.sh - make -j$(nproc) && make install + uses: addnab/docker-run-action@v3 + with: + image: openmmlab/lmdeploy-builder:cuda11.8 + options: -v ${{ github.workspace }}:/work --cpus=1.8 + run: | + cd /work + source /opt/conda/bin/activate + conda activate py38 + mkdir build && cd build + bash ../generate.sh + make -j$(nproc) && make install diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index d5def5ce2b..7c56e08f7d 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -21,6 +21,18 @@ jobs: DOCKER_TAG: cuda11.8 
OUTPUT_FOLDER: cuda11.8_dist steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false - name: Checkout repository uses: actions/checkout@v3 - name: Build From c261b49d5a4db2cdd26738e3df209e5a9068a479 Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Mon, 16 Oct 2023 13:57:35 +0800 Subject: [PATCH 42/43] Move `tokenizer.py` to the folder of lmdeploy (#543) * move tokenizer * remove Tokenizer in init * update deploy.py --- benchmark/profile_generation.py | 3 ++- benchmark/profile_restful_api.py | 2 +- benchmark/profile_serving.py | 2 +- benchmark/profile_throughput.py | 3 ++- lmdeploy/serve/async_engine.py | 2 +- lmdeploy/serve/turbomind/deploy.py | 8 ++++---- lmdeploy/{turbomind => }/tokenizer.py | 0 lmdeploy/turbomind/__init__.py | 3 +-- lmdeploy/turbomind/chat.py | 2 +- lmdeploy/turbomind/decode.py | 2 +- lmdeploy/turbomind/turbomind.py | 2 +- tests/python/test_tokenizer.py | 15 --------------- 12 files changed, 15 insertions(+), 29 deletions(-) rename lmdeploy/{turbomind => }/tokenizer.py (100%) delete mode 100644 tests/python/test_tokenizer.py diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index ecfd0d3e4f..50e4be008f 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -18,7 +18,8 @@ nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion) from tqdm import tqdm -from lmdeploy.turbomind import Tokenizer, TurboMind +from lmdeploy.tokenizer import Tokenizer +from lmdeploy.turbomind import TurboMind def infer(model, session_id: int, input_ids: str, output_seqlen: int, diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index ed922bfd7a..d1f6ebf80e 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -8,7 +8,7 @@ import numpy as np import requests -from lmdeploy.turbomind.tokenizer import Tokenizer +from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py index c60e0799dc..4580757eeb 100644 --- a/benchmark/profile_serving.py +++ b/benchmark/profile_serving.py @@ -8,7 +8,7 @@ import numpy as np from lmdeploy.serve.turbomind.chatbot import Chatbot -from lmdeploy.turbomind.tokenizer import Tokenizer +from lmdeploy.tokenizer import Tokenizer def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue): diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index d8100113c5..9d92b31fa3 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -8,7 +8,8 @@ import fire -from lmdeploy.turbomind import Tokenizer, TurboMind +from lmdeploy.tokenizer import Tokenizer +from lmdeploy.turbomind import TurboMind def sample_requests( diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index e2c4b36840..9588b00da1 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -30,7 +30,7 @@ class AsyncEngine: def __init__(self, model_path, instance_num=32, tp=1) -> None: from lmdeploy import turbomind as tm - from lmdeploy.turbomind.tokenizer import Tokenizer + from lmdeploy.tokenizer import Tokenizer tokenizer_model_path = osp.join(model_path, 
'triton_models', 'tokenizer') tokenizer = Tokenizer(tokenizer_model_path) diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py index 992f2f57df..cc8db88f5c 100644 --- a/lmdeploy/serve/turbomind/deploy.py +++ b/lmdeploy/serve/turbomind/deploy.py @@ -306,7 +306,7 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str, shutil.copy(tokenizer_path, osp.join(triton_models_path, 'tokenizer/tokenizer.model')) with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'turbomind/tokenizer.py'), + shutil.copy(osp.join(root_path, 'tokenizer.py'), osp.join(triton_models_path, 'tokenizer')) else: print(f'tokenizer model {tokenizer_path} does not exist') @@ -435,7 +435,7 @@ def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, shutil.copy(json_path, osp.join(triton_models_path, 'tokenizer', _file)) with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'turbomind/tokenizer.py'), + shutil.copy(osp.join(root_path, 'tokenizer.py'), osp.join(triton_models_path, 'tokenizer')) else: print(f'tokenizer model {tokenizer_path} does not exist') @@ -601,7 +601,7 @@ def deploy_awq(model_name: str, model_path: str, tokenizer_path: str, shutil.copy(json_path, osp.join(triton_models_path, 'tokenizer', _file)) with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'turbomind/tokenizer.py'), + shutil.copy(osp.join(root_path, 'tokenizer.py'), osp.join(triton_models_path, 'tokenizer')) else: print(f'tokenizer model {tokenizer_path} does not exist') @@ -831,7 +831,7 @@ def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str, shutil.copy(json_path, osp.join(triton_models_path, 'tokenizer', _file)) with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'turbomind/tokenizer.py'), + shutil.copy(osp.join(root_path, 'tokenizer.py'), osp.join(triton_models_path, 'tokenizer')) else: print(f'tokenizer model {tokenizer_path} does not exist') diff --git a/lmdeploy/turbomind/tokenizer.py b/lmdeploy/tokenizer.py similarity index 100% rename from lmdeploy/turbomind/tokenizer.py rename to lmdeploy/tokenizer.py diff --git a/lmdeploy/turbomind/__init__.py b/lmdeploy/turbomind/__init__.py index 02fb288f89..b2df77014c 100644 --- a/lmdeploy/turbomind/__init__.py +++ b/lmdeploy/turbomind/__init__.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .tokenizer import Tokenizer from .turbomind import TurboMind -__all__ = ['Tokenizer', 'TurboMind'] +__all__ = ['TurboMind'] diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index 4648b7921f..de31a5daa7 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -8,7 +8,7 @@ from lmdeploy import turbomind as tm from lmdeploy.model import MODELS -from lmdeploy.turbomind.tokenizer import Tokenizer +from lmdeploy.tokenizer import Tokenizer os.environ['TM_LOG_LEVEL'] = 'ERROR' diff --git a/lmdeploy/turbomind/decode.py b/lmdeploy/turbomind/decode.py index 32dd40ca2f..daef35298c 100644 --- a/lmdeploy/turbomind/decode.py +++ b/lmdeploy/turbomind/decode.py @@ -6,7 +6,7 @@ import torch from lmdeploy import turbomind as tm -from lmdeploy.turbomind.tokenizer import Tokenizer +from lmdeploy.tokenizer import Tokenizer os.environ['TM_LOG_LEVEL'] = 'ERROR' diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 2c0f8924ba..b63f5dafe3 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -14,7 +14,7 @@ import lmdeploy from lmdeploy.model import MODELS -from lmdeploy.turbomind import Tokenizer +from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger # TODO: find another way import _turbomind diff --git a/tests/python/test_tokenizer.py b/tests/python/test_tokenizer.py deleted file mode 100644 index 411ca41562..0000000000 --- a/tests/python/test_tokenizer.py +++ /dev/null @@ -1,15 +0,0 @@ -from lmdeploy.turbomind.tokenizer import Tokenizer - - -def main(): - tokenizer = Tokenizer('huggyllama/llama-7b') - - prompts = ['cest la vie', '上帝已死'] - for prompt in prompts: - tokens = tokenizer.encode(prompt) - output = tokenizer.decode(tokens) - print(output) - - -if __name__ == '__main__': - main() From bb3cce9a93d6c1d2a6f504afedec988e014587bf Mon Sep 17 00:00:00 2001 From: Lyu Han Date: Tue, 17 Oct 2023 14:16:31 +0800 Subject: [PATCH 43/43] bump version to v0.0.11 (#567) --- lmdeploy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/version.py b/lmdeploy/version.py index 0a68f5e0c8..417dc76768 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple -__version__ = '0.0.10' +__version__ = '0.0.11' short_version = __version__
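With the API changes in this series, a client addresses the `/generate` route by `session_id` and reads newline-delimited JSON chunks whose fields follow `GenerateResponse` (`text`, `tokens`, `finish_reason`). Below is a minimal client sketch mirroring `get_streaming_response` in `lmdeploy/serve/openai/api_client.py`; the server address and prompt are placeholder values, assuming an `lmdeploy.serve.openai.api_server` instance is already running there.

```python
# Minimal sketch of a /generate client; the URL and prompt are placeholders,
# assuming api_server is already serving at that address.
import json

import requests

api_url = 'http://0.0.0.0:23333/generate'
pload = {
    'prompt': 'Hello! How are you?',
    'session_id': 1,           # keep the same id for every turn of a session
    'sequence_start': True,    # True only on the first turn
    'sequence_end': False,
    'stream': True,
    'request_output_len': 512,
}
response = requests.post(api_url, json=pload, stream=True)
for chunk in response.iter_lines(chunk_size=8192,
                                 decode_unicode=False,
                                 delimiter=b'\n'):
    if chunk:
        data = json.loads(chunk.decode('utf-8'))
        # GenerateResponse fields: text, tokens, finish_reason
        print(data['text'], data['tokens'], data['finish_reason'])
```

Reusing one `session_id` per conversation, with `sequence_start=True` only on the first request, lets the server keep the cached history for that session; sending `sequence_end: true`, as described in the FAQ above, ends the session so the history can be managed on the client side instead.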